From 5bbcf8eb531e1c2d90756fa454953f62ff1bec73 Mon Sep 17 00:00:00 2001 From: Jeevan Pillay <169354619+jeevanpillay@users.noreply.github.com> Date: Mon, 20 Apr 2026 14:02:06 +1000 Subject: [PATCH 01/30] add foundation/spec compiler eval scaffolding --- .gitignore | 13 +- README.md | 16 ++ package-lock.json | 188 ++++++++++++++++ package.json | 15 ++ scripts/run-baml-eval.mjs | 203 ++++++++++++++++++ skills/foundation-creator/SKILL.md | 41 ++++ .../foundation-creator/baml_src/clients.baml | 10 + .../foundation_compiler/common_types.baml | 20 ++ .../compiler_functions.baml | 99 +++++++++ .../foundation_compiler/eval_runner.baml | 63 ++++++ .../foundation_compiler/eval_types.baml | 25 +++ .../foundation_compiler/foundation_types.baml | 31 +++ .../baml_src/generators.baml | 6 + skills/foundation-creator/evals/evals.json | 26 +++ .../fixtures/vercel/expected_criteria.md | 20 ++ .../evals/fixtures/vercel/raw_notes.md | 111 ++++++++++ skills/spec-creator/baml_src/clients.baml | 10 + skills/spec-creator/baml_src/generators.baml | 6 + .../baml_src/spec_compiler/common_types.baml | 20 ++ .../spec_compiler/compiler_functions.baml | 78 +++++++ .../baml_src/spec_compiler/eval_runner.baml | 67 ++++++ .../baml_src/spec_compiler/eval_types.baml | 26 +++ .../baml_src/spec_compiler/spec_types.baml | 37 ++++ skills/spec-creator/evals/evals.json | 19 ++ .../fixtures/vercel_mcp/expected_criteria.md | 16 ++ .../evals/fixtures/vercel_mcp/raw_notes.md | 62 ++++++ 26 files changed, 1222 insertions(+), 6 deletions(-) create mode 100644 package-lock.json create mode 100644 package.json create mode 100644 scripts/run-baml-eval.mjs create mode 100644 skills/foundation-creator/SKILL.md create mode 100644 skills/foundation-creator/baml_src/clients.baml create mode 100644 skills/foundation-creator/baml_src/foundation_compiler/common_types.baml create mode 100644 skills/foundation-creator/baml_src/foundation_compiler/compiler_functions.baml create mode 100644 
skills/foundation-creator/baml_src/foundation_compiler/eval_runner.baml create mode 100644 skills/foundation-creator/baml_src/foundation_compiler/eval_types.baml create mode 100644 skills/foundation-creator/baml_src/foundation_compiler/foundation_types.baml create mode 100644 skills/foundation-creator/baml_src/generators.baml create mode 100644 skills/foundation-creator/evals/evals.json create mode 100644 skills/foundation-creator/evals/fixtures/vercel/expected_criteria.md create mode 100644 skills/foundation-creator/evals/fixtures/vercel/raw_notes.md create mode 100644 skills/spec-creator/baml_src/clients.baml create mode 100644 skills/spec-creator/baml_src/generators.baml create mode 100644 skills/spec-creator/baml_src/spec_compiler/common_types.baml create mode 100644 skills/spec-creator/baml_src/spec_compiler/compiler_functions.baml create mode 100644 skills/spec-creator/baml_src/spec_compiler/eval_runner.baml create mode 100644 skills/spec-creator/baml_src/spec_compiler/eval_types.baml create mode 100644 skills/spec-creator/baml_src/spec_compiler/spec_types.baml create mode 100644 skills/spec-creator/evals/fixtures/vercel_mcp/expected_criteria.md create mode 100644 skills/spec-creator/evals/fixtures/vercel_mcp/raw_notes.md diff --git a/.gitignore b/.gitignore index 92c1e04..a7f1d20 100644 --- a/.gitignore +++ b/.gitignore @@ -1,6 +1,7 @@ -.DS_Store -__pycache__/ -*.py[cod] -.venv/ -.idea/ -.vscode/ +node_modules/ +skills/foundation-creator/baml_client/ +skills/foundation-creator/baml_client_dist/ +skills/spec-creator/baml_client/ +skills/spec-creator/baml_client_dist/ +skills/foundation-creator/evals/runs/ +skills/spec-creator/evals/runs/ diff --git a/README.md b/README.md index 1c9a33a..7af46e7 100644 --- a/README.md +++ b/README.md @@ -6,6 +6,7 @@ Agent skills published by Lightfast. Compatible with [Claude Code](https://docs. 
| Skill | Purpose | |---|---| +| [`foundation-creator`](skills/foundation-creator/) | Draft a top-level foundation document for a product or company primitive: thesis, mission, boundaries, actor model, surfaces, strategic bets, and open questions. | | [`spec-creator`](skills/spec-creator/) | Write and update a top-level `SPEC.md` service specification following a strict template and language guide. | ## Install @@ -13,11 +14,26 @@ Agent skills published by Lightfast. Compatible with [Claude Code](https://docs. Each skill is a subdirectory under `skills/`. To install one into a project: ```bash +npx skills add lightfastai/skills --skill foundation-creator npx skills add lightfastai/skills --skill spec-creator ``` Or copy the directory directly into `.claude/skills/` in your project. +## Local evals + +This repo now includes BAML-backed fixture evals for `foundation-creator` and +`spec-creator`. + +```bash +npm install +OPENAI_API_KEY=... npm run eval:foundation -- create-foundation-from-vercel-source-packet +OPENAI_API_KEY=... npm run eval:spec -- create-from-vercel-mcp-source-packet +``` + +Each run writes packet, brief, candidate document, and evaluation report +artifacts under `skills/<skill>/evals/runs/`.
+ ## License MIT diff --git a/package-lock.json b/package-lock.json new file mode 100644 index 0000000..7889375 --- /dev/null +++ b/package-lock.json @@ -0,0 +1,188 @@ +{ + "name": "@lightfastai/skills", + "lockfileVersion": 3, + "requires": true, + "packages": { + "": { + "name": "@lightfastai/skills", + "dependencies": { + "@boundaryml/baml": "0.221.0", + "typescript": "5.9.3" + } + }, + "node_modules/@boundaryml/baml": { + "version": "0.221.0", + "resolved": "https://registry.npmjs.org/@boundaryml/baml/-/baml-0.221.0.tgz", + "integrity": "sha512-pPOp2JVsG4Wa/tMLnJv/rxil5jsuVDgxnA0xO0h4lKy7t/fKCXOVvO+nzpOZ4byLTP/Ow+8pVvoKRKvx1J/Hsw==", + "license": "MIT", + "dependencies": { + "@scarf/scarf": "^1.3.0" + }, + "bin": { + "baml": "cli.js", + "baml-cli": "cli.js" + }, + "engines": { + "node": ">= 10" + }, + "optionalDependencies": { + "@boundaryml/baml-darwin-arm64": "0.221.0", + "@boundaryml/baml-darwin-x64": "0.221.0", + "@boundaryml/baml-linux-arm64-gnu": "0.221.0", + "@boundaryml/baml-linux-arm64-musl": "0.221.0", + "@boundaryml/baml-linux-x64-gnu": "0.221.0", + "@boundaryml/baml-linux-x64-musl": "0.221.0", + "@boundaryml/baml-win32-arm64-msvc": "0.221.0", + "@boundaryml/baml-win32-x64-msvc": "0.221.0" + } + }, + "node_modules/@boundaryml/baml-darwin-arm64": { + "version": "0.221.0", + "resolved": "https://registry.npmjs.org/@boundaryml/baml-darwin-arm64/-/baml-darwin-arm64-0.221.0.tgz", + "integrity": "sha512-GxqdjVUodyKtgKX/CIDGZyz5lXS0d0iFnV2x7thMQM9ziMrOPcWd3qwflOLYdgDo6Hy9yMULrqtMPkCrmbwEHQ==", + "cpu": [ + "arm64" + ], + "license": "MIT", + "optional": true, + "os": [ + "darwin" + ], + "engines": { + "node": ">= 10" + } + }, + "node_modules/@boundaryml/baml-darwin-x64": { + "version": "0.221.0", + "resolved": "https://registry.npmjs.org/@boundaryml/baml-darwin-x64/-/baml-darwin-x64-0.221.0.tgz", + "integrity": "sha512-wG3jsgOIr8C+09j0AFZY4F8EHvd1gKoKw6+HR1Oi+cw4pijklCk2LI0AIwMPzgG12BAxWV6jEIONMORmspesFQ==", + "cpu": [ + "x64" + ], + "license": "MIT", + 
"optional": true, + "os": [ + "darwin" + ], + "engines": { + "node": ">= 10" + } + }, + "node_modules/@boundaryml/baml-linux-arm64-gnu": { + "version": "0.221.0", + "resolved": "https://registry.npmjs.org/@boundaryml/baml-linux-arm64-gnu/-/baml-linux-arm64-gnu-0.221.0.tgz", + "integrity": "sha512-Xy1M3muUV2B/4f8dVUpX/IN2CI1m4hGtw31V+kQdFYsy3Hvo58qjijtlkKNYZOjqWBqVlgPMFhTvv8N0cD4N/w==", + "cpu": [ + "arm64" + ], + "license": "MIT", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">= 10" + } + }, + "node_modules/@boundaryml/baml-linux-arm64-musl": { + "version": "0.221.0", + "resolved": "https://registry.npmjs.org/@boundaryml/baml-linux-arm64-musl/-/baml-linux-arm64-musl-0.221.0.tgz", + "integrity": "sha512-6RIkHCViXQEsn6Ts5Uk9c6SDgokkXGO4GkoHpoNnKluTJtuB/B2nUOv2O147GFDqtspFDL2jk5d+oiYibfMn0g==", + "cpu": [ + "arm64" + ], + "license": "MIT", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">= 10" + } + }, + "node_modules/@boundaryml/baml-linux-x64-gnu": { + "version": "0.221.0", + "resolved": "https://registry.npmjs.org/@boundaryml/baml-linux-x64-gnu/-/baml-linux-x64-gnu-0.221.0.tgz", + "integrity": "sha512-YoOz6N6E37UE4ULRCe24P/Ov2pNxjvI4R+I6Bwhkqdt5HOGsJrf2uJUSC+XxKZpkPqlbo1gGZPoCB0lcyeSkeA==", + "cpu": [ + "x64" + ], + "license": "MIT", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">= 10" + } + }, + "node_modules/@boundaryml/baml-linux-x64-musl": { + "version": "0.221.0", + "resolved": "https://registry.npmjs.org/@boundaryml/baml-linux-x64-musl/-/baml-linux-x64-musl-0.221.0.tgz", + "integrity": "sha512-gY67VRXrixgTenDtDzVSMo0GjLbeofGtCZuArfiDgCglfJ5/KGBSgwzqrrTuyUVLGK902NmCaYA5OrPSXezSzg==", + "cpu": [ + "x64" + ], + "license": "MIT", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">= 10" + } + }, + "node_modules/@boundaryml/baml-win32-arm64-msvc": { + "version": "0.221.0", + "resolved": 
"https://registry.npmjs.org/@boundaryml/baml-win32-arm64-msvc/-/baml-win32-arm64-msvc-0.221.0.tgz", + "integrity": "sha512-pTHPv6GVlW7nLVszgm7P7+PdQ97JJ8xnRp3/TeP/ya5z08wKi0ejOInLzElMyZVTB+XY707qGlM9CreJnDH3vg==", + "cpu": [ + "arm64" + ], + "license": "MIT", + "optional": true, + "os": [ + "win32" + ], + "engines": { + "node": ">= 10" + } + }, + "node_modules/@boundaryml/baml-win32-x64-msvc": { + "version": "0.221.0", + "resolved": "https://registry.npmjs.org/@boundaryml/baml-win32-x64-msvc/-/baml-win32-x64-msvc-0.221.0.tgz", + "integrity": "sha512-XP3CxwsYxOZAOzkWqZd2Dg8iNpDOMrbA/Bz3nqI7oX/wL+ZMkHJwjWQwxIVL+sg2rp+TceV+21UPb6LTmt+qJw==", + "cpu": [ + "x64" + ], + "license": "MIT", + "optional": true, + "os": [ + "win32" + ], + "engines": { + "node": ">= 10" + } + }, + "node_modules/@scarf/scarf": { + "version": "1.4.0", + "resolved": "https://registry.npmjs.org/@scarf/scarf/-/scarf-1.4.0.tgz", + "integrity": "sha512-xxeapPiUXdZAE3che6f3xogoJPeZgig6omHEy1rIY5WVsB3H2BHNnZH+gHG6x91SCWyQCzWGsuL2Hh3ClO5/qQ==", + "hasInstallScript": true, + "license": "Apache-2.0" + }, + "node_modules/typescript": { + "version": "5.9.3", + "resolved": "https://registry.npmjs.org/typescript/-/typescript-5.9.3.tgz", + "integrity": "sha512-jl1vZzPDinLr9eUt3J/t7V6FgNEw9QjvBPdysz9KfQDD41fQrC2Y4vKQdiaUpFT4bXlb1RHhLpp8wtm6M5TgSw==", + "license": "Apache-2.0", + "bin": { + "tsc": "bin/tsc", + "tsserver": "bin/tsserver" + }, + "engines": { + "node": ">=14.17" + } + } + } +} diff --git a/package.json b/package.json new file mode 100644 index 0000000..0ebcd73 --- /dev/null +++ b/package.json @@ -0,0 +1,15 @@ +{ + "name": "@lightfastai/skills", + "private": true, + "type": "module", + "scripts": { + "baml:generate:foundation": "npx baml-cli generate --from ./skills/foundation-creator/baml_src", + "baml:generate:spec": "npx baml-cli generate --from ./skills/spec-creator/baml_src", + "eval:foundation": "node ./scripts/run-baml-eval.mjs foundation-creator", + "eval:spec": "node 
./scripts/run-baml-eval.mjs spec-creator" + }, + "dependencies": { + "@boundaryml/baml": "0.221.0", + "typescript": "5.9.3" + } +} diff --git a/scripts/run-baml-eval.mjs b/scripts/run-baml-eval.mjs new file mode 100644 index 0000000..af18cec --- /dev/null +++ b/scripts/run-baml-eval.mjs @@ -0,0 +1,203 @@ +import { mkdir, readFile, rm, writeFile } from "node:fs/promises"; +import path from "node:path"; +import { fileURLToPath, pathToFileURL } from "node:url"; +import { spawn } from "node:child_process"; + +const __filename = fileURLToPath(import.meta.url); +const __dirname = path.dirname(__filename); +const repoRoot = path.resolve(__dirname, ".."); + +function fail(message) { + console.error(message); + process.exit(1); +} + +function runCommand(command, args, cwd) { + return new Promise((resolve, reject) => { + const child = spawn(command, args, { + cwd, + stdio: "inherit", + env: process.env, + shell: false, + }); + + child.on("error", reject); + child.on("exit", (code) => { + if (code === 0) { + resolve(); + } else { + reject(new Error(`${command} ${args.join(" ")} exited with code ${code}`)); + } + }); + }); +} + +async function loadJson(filePath) { + return JSON.parse(await readFile(filePath, "utf8")); +} + +async function loadText(filePath) { + return readFile(filePath, "utf8"); +} + +function getEvalBySelector(evals, selector) { + if (!selector) { + if (evals.length === 1) { + return evals[0]; + } + fail("Multiple evals exist. 
Pass an eval id or name."); + } + + const numeric = Number(selector); + if (!Number.isNaN(numeric)) { + const byId = evals.find((entry) => entry.id === numeric); + if (byId) { + return byId; + } + } + + const byName = evals.find((entry) => entry.eval_name === selector); + if (byName) { + return byName; + } + + fail(`Eval '${selector}' not found.`); +} + +async function generateClient(skillRoot) { + const bamlSrc = path.join(skillRoot, "baml_src"); + await runCommand("npx", ["baml-cli", "generate", "--from", bamlSrc], repoRoot); +} + +async function importGeneratedClient(skillRoot) { + const clientPath = path.join(skillRoot, "baml_client_dist", "index.js"); + return import(pathToFileURL(clientPath).href); +} + +async function buildPacket(evalEntry, evalsDir, packetType) { + const packetFiles = evalEntry.packet_files ?? {}; + const rawNotesPath = packetFiles.raw_notes + ? path.join(evalsDir, packetFiles.raw_notes) + : null; + const expectedCriteriaPath = packetFiles.expected_criteria + ? path.join(evalsDir, packetFiles.expected_criteria) + : null; + const existingSpecPath = packetFiles.existing_spec + ? path.join(evalsDir, packetFiles.existing_spec) + : null; + + const packet = { + packet_name: evalEntry.eval_name, + task_prompt: evalEntry.prompt, + raw_notes: rawNotesPath ? await loadText(rawNotesPath) : "", + expected_criteria: expectedCriteriaPath ? await loadText(expectedCriteriaPath) : "", + }; + + if (packetType === "SpecEvalPacket") { + packet.existing_spec = existingSpecPath ? 
await loadText(existingSpecPath) : null; + } + + return packet; +} + +async function ensureFreshClient(skillRoot) { + const clientDir = path.join(skillRoot, "baml_client"); + const distDir = path.join(skillRoot, "baml_client_dist"); + const tsconfigPath = path.join(skillRoot, ".tmp-baml-client-tsconfig.json"); + await rm(clientDir, { recursive: true, force: true }); + await rm(distDir, { recursive: true, force: true }); + await generateClient(skillRoot); + await writeFile( + tsconfigPath, + JSON.stringify( + { + compilerOptions: { + module: "NodeNext", + moduleResolution: "NodeNext", + target: "ES2022", + declaration: false, + sourceMap: false, + skipLibCheck: true, + outDir: distDir, + rootDir: clientDir, + }, + include: [path.join(clientDir, "*.ts")], + }, + null, + 2, + ), + "utf8", + ); + try { + await runCommand("npx", ["tsc", "--project", tsconfigPath], repoRoot); + } finally { + await rm(tsconfigPath, { force: true }); + } +} + +async function writeRunArtifacts(runDir, artifacts) { + await mkdir(runDir, { recursive: true }); + for (const [name, value] of Object.entries(artifacts)) { + const filePath = path.join(runDir, name); + const content = typeof value === "string" ? 
value : JSON.stringify(value, null, 2); + await writeFile(filePath, content, "utf8"); + } +} + +async function main() { + const skillName = process.argv[2]; + const selector = process.argv[3]; + + if (!skillName) { + fail("Usage: node ./scripts/run-baml-eval.mjs <skill> [eval-id-or-name]"); + } + + const skillRoot = path.join(repoRoot, "skills", skillName); + const evalsDir = path.join(skillRoot, "evals"); + const manifestPath = path.join(evalsDir, "evals.json"); + const manifest = await loadJson(manifestPath); + const evalEntry = getEvalBySelector(manifest.evals, selector); + const runner = manifest.runner_contract; + + if (!runner || runner.type !== "baml_pipeline") { + fail(`Skill '${skillName}' does not declare a supported runner_contract.`); + } + + if (!process.env.OPENAI_API_KEY) { + fail("OPENAI_API_KEY is required to execute BAML evals with the current client configuration."); + } + + await ensureFreshClient(skillRoot); + const generated = await importGeneratedClient(skillRoot); + const { b } = generated; + + const packet = await buildPacket(evalEntry, evalsDir, runner.packet_type); + const compileFn = b[runner.compile_brief_function]; + const renderFn = b[runner.render_document_function]; + const evaluateFn = b[runner.evaluate_document_function]; + + if (!compileFn || !renderFn || !evaluateFn) { + fail(`Generated client is missing one or more runner functions for '${skillName}'.`); + } + + const brief = await compileFn(packet); + const candidateDocument = await renderFn(brief); + const report = await evaluateFn(packet, candidateDocument); + + const timestamp = new Date().toISOString().replace(/[:.]/g, "-"); + const runDir = path.join(skillRoot, "evals", "runs", `${timestamp}-${evalEntry.eval_name}`); + await writeRunArtifacts(runDir, { + "packet.json": packet, + "brief.json": brief, + "candidate.md": candidateDocument, + "report.json": report, + }); + + console.log(`Run complete: ${runDir}`); + console.log(`Overall status: ${report.overall_status}`); +}
+main().catch((error) => { + console.error(error instanceof Error ? error.message : String(error)); + process.exit(1); +}); diff --git a/skills/foundation-creator/SKILL.md b/skills/foundation-creator/SKILL.md new file mode 100644 index 0000000..e6dbb90 --- /dev/null +++ b/skills/foundation-creator/SKILL.md @@ -0,0 +1,41 @@ +--- +name: foundation-creator +description: > + Use this skill when the user wants to write, draft, update, or revise a + top-level foundation document for a company, product, or new primitive. + This document captures thesis, mission, boundaries, actor model, surfaces, + strategic bets, and open questions without collapsing ambiguity into + implementation decisions. Applies when the user is still defining what the + system is, what it is not, and what long-term direction it implies. Does + NOT apply to concrete service specifications in `SPEC.md`, implementation + plans, PRDs, or roadmap execution docs. +--- + +# Foundation Creator + +Writes and updates a single top-level foundation document for an early-stage +product or company primitive. The resulting document is strategic and +behavioral, not implementation-level. It should preserve uncertainty where +decisions are not yet mature. + +## Core behavior + +- Start from thesis and boundaries, not components. +- Prefer explicit open questions over invented certainty. +- Separate durable beliefs from speculative bets. +- Avoid implementation detail unless the user explicitly wants it. +- Escalate to `spec-creator` only when a subsystem is concrete enough to + deserve a `SPEC.md`. + +## Current compiler surface + +This skill includes typed BAML contracts under `baml_src/foundation_compiler/` +for: + +- extracting atomic claims from messy notes +- compiling a stable foundation kernel +- critiquing ambiguity, contradiction, and implementation leakage +- compiling a brief suitable for downstream document rendering + +The BAML layer is schema-first. 
Prompt wording and document templates can +evolve without changing the core interfaces. diff --git a/skills/foundation-creator/baml_src/clients.baml b/skills/foundation-creator/baml_src/clients.baml new file mode 100644 index 0000000..cd662f1 --- /dev/null +++ b/skills/foundation-creator/baml_src/clients.baml @@ -0,0 +1,10 @@ +client EvalModel { + provider "openai-responses" + options { + api_key env.OPENAI_API_KEY + model "gpt-5-mini" + reasoning { + effort "medium" + } + } +} diff --git a/skills/foundation-creator/baml_src/foundation_compiler/common_types.baml b/skills/foundation-creator/baml_src/foundation_compiler/common_types.baml new file mode 100644 index 0000000..cb35cb6 --- /dev/null +++ b/skills/foundation-creator/baml_src/foundation_compiler/common_types.baml @@ -0,0 +1,20 @@ +enum ClaimKind { + Fact + Thesis + Boundary + Constraint + OpenQuestion +} + +enum Confidence { + High + Medium + Low +} + +class Claim { + statement string @assert(nonempty_statement, {{ this|length > 0 }}) + kind ClaimKind + confidence Confidence + sources string[] @assert(has_source, {{ this|length > 0 }}) +} diff --git a/skills/foundation-creator/baml_src/foundation_compiler/compiler_functions.baml b/skills/foundation-creator/baml_src/foundation_compiler/compiler_functions.baml new file mode 100644 index 0000000..0b2b6c0 --- /dev/null +++ b/skills/foundation-creator/baml_src/foundation_compiler/compiler_functions.baml @@ -0,0 +1,99 @@ +function ExtractClaims(raw_notes: string) -> Claim[] { + client EvalModel + prompt #" + Extract atomic claims from the raw notes below. + + Rules: + - Preserve the original intent. + - Split compound statements into separate claims when useful. + - Use `Fact` only for concrete statements in the notes. + - Use `Thesis` for directional beliefs or long-term positions. + - Use `Boundary` for explicit scope limits or exclusions. + - Use `Constraint` for hard operating requirements. + - Use `OpenQuestion` when the note is unresolved. 
+ - Keep `sources` short and human-readable. + + Raw notes: + {{ raw_notes }} + + {{ ctx.output_format }} + "# +} + +function BuildFoundationKernel(claims: Claim[]) -> FoundationKernel { + client EvalModel + prompt #" + Build a stable foundation kernel from the extracted claims below. + + Rules: + - Preserve ambiguity when the claims do not support a hard decision. + - Capture durable thesis-level information. + - Do not invent implementation detail. + - Use empty lists when a category is not yet supported by the claims. + + Claims: + {{ claims|format(type="yaml") }} + + {{ ctx.output_format }} + "# +} + +function CritiqueFoundationKernel(kernel: FoundationKernel) -> FoundationCritique { + client EvalModel + prompt #" + Critique the foundation kernel below. + + Rules: + - Flag contradictions across thesis, boundaries, and bets. + - Flag vague claims that should be sharpened before document rendering. + - Flag implementation leakage. + - Flag missing boundaries that create strategic confusion. + - Ask only high-leverage clarification questions. + + Kernel: + {{ kernel|format(type="yaml") }} + + {{ ctx.output_format }} + "# +} + +function CompileFoundationBrief( + kernel: FoundationKernel, + critique: FoundationCritique +) -> FoundationBrief { + client EvalModel + prompt #" + Compile a concise foundation brief from the kernel and critique below. + + Rules: + - Optimize for document rendering, not implementation planning. + - Preserve unresolved questions explicitly. + - Exclude critique items that were already resolved by the kernel. + - Keep the brief compact and high-signal. + + Kernel: + {{ kernel|format(type="yaml") }} + + Critique: + {{ critique|format(type="yaml") }} + + {{ ctx.output_format }} + "# +} + +function RenderFoundationCreatorPrompt(brief: FoundationBrief) -> string { + client EvalModel + prompt #" + Render a prompt for the `foundation-creator` skill using the brief below. + + Rules: + - Ask for a top-level foundation document. 
+ - Preserve strategic ambiguity where the brief leaves open questions. + - Avoid implementation detail. + - Emphasize thesis, boundaries, actor model, surfaces, and strategic bets. + - Make the prompt directly usable by an agent. + + Brief: + {{ brief|format(type="yaml") }} + "# +} diff --git a/skills/foundation-creator/baml_src/foundation_compiler/eval_runner.baml b/skills/foundation-creator/baml_src/foundation_compiler/eval_runner.baml new file mode 100644 index 0000000..72df02a --- /dev/null +++ b/skills/foundation-creator/baml_src/foundation_compiler/eval_runner.baml @@ -0,0 +1,63 @@ +function CompileFoundationBriefFromPacket(packet: FoundationEvalPacket) -> FoundationBrief { + client EvalModel + prompt #" + Compile a foundation brief from the evaluation packet below. + + Task prompt: + {{ packet.task_prompt }} + + Raw notes: + {{ packet.raw_notes }} + + Expected criteria: + {{ packet.expected_criteria }} + + Rules: + - Produce a concise, durable foundation brief. + - Preserve ambiguity where the notes do not settle the framing. + - Treat expected criteria as evaluation guidance, not as license to invent. + - Avoid implementation detail. + + {{ ctx.output_format }} + "# +} + +function RenderFoundationDocumentDraft(brief: FoundationBrief) -> string { + client EvalModel + prompt #" + Draft a top-level foundation document from the brief below. + + Rules: + - Write a durable strategic document, not a `SPEC.md`. + - Start from thesis and boundaries, not architecture. + - Preserve unresolved questions explicitly. + - Avoid implementation detail. + + Brief: + {{ brief|format(type="yaml") }} + "# +} + +function EvaluateFoundationDocument( + packet: FoundationEvalPacket, + candidate_document: string +) -> EvalReport { + client EvalModel + prompt #" + Evaluate the candidate foundation document against the evaluation packet. 
+ + Packet: + {{ packet|format(type="yaml") }} + + Candidate document: + {{ candidate_document }} + + Rules: + - Grade against the expected criteria explicitly. + - Reward preservation of uncertainty when the source packet is genuinely mixed. + - Penalize invented certainty, invented capabilities, or implementation leakage. + - Use `Pass`, `Partial`, or `Fail` for each criterion. + + {{ ctx.output_format }} + "# +} diff --git a/skills/foundation-creator/baml_src/foundation_compiler/eval_types.baml b/skills/foundation-creator/baml_src/foundation_compiler/eval_types.baml new file mode 100644 index 0000000..94058d1 --- /dev/null +++ b/skills/foundation-creator/baml_src/foundation_compiler/eval_types.baml @@ -0,0 +1,25 @@ +enum EvalStatus { + Pass + Partial + Fail +} + +class EvalCheck { + criterion string @assert(nonempty_criterion, {{ this|length > 0 }}) + status EvalStatus + rationale string @assert(nonempty_rationale, {{ this|length > 0 }}) +} + +class EvalReport { + overall_status EvalStatus + summary string @assert(nonempty_summary, {{ this|length > 0 }}) + checks EvalCheck[] + open_issues string[] +} + +class FoundationEvalPacket { + packet_name string @assert(nonempty_packet_name, {{ this|length > 0 }}) + task_prompt string @assert(nonempty_task_prompt, {{ this|length > 0 }}) + raw_notes string @assert(nonempty_raw_notes, {{ this|length > 0 }}) + expected_criteria string @assert(nonempty_expected_criteria, {{ this|length > 0 }}) +} diff --git a/skills/foundation-creator/baml_src/foundation_compiler/foundation_types.baml b/skills/foundation-creator/baml_src/foundation_compiler/foundation_types.baml new file mode 100644 index 0000000..16ffa0f --- /dev/null +++ b/skills/foundation-creator/baml_src/foundation_compiler/foundation_types.baml @@ -0,0 +1,31 @@ +class FoundationKernel { + name string @assert(nonempty_name, {{ this|length > 0 }}) + mission string? + primitive string? 
+ theses string[] + problems string[] + actors string[] + surfaces string[] + boundaries string[] + strategic_bets string[] + open_questions string[] +} + +class FoundationCritique { + contradictions string[] + vague_claims string[] + implementation_leaks string[] + missing_boundaries string[] + leverage_questions string[] +} + +class FoundationBrief { + title string @assert(nonempty_title, {{ this|length > 0 }}) + summary string @assert(nonempty_summary, {{ this|length > 0 }}) + core_theses string[] + boundaries string[] + actor_model string[] + surfaces string[] + strategic_bets string[] + unresolved_questions string[] +} diff --git a/skills/foundation-creator/baml_src/generators.baml b/skills/foundation-creator/baml_src/generators.baml new file mode 100644 index 0000000..248fe62 --- /dev/null +++ b/skills/foundation-creator/baml_src/generators.baml @@ -0,0 +1,6 @@ +generator target { + output_type "typescript" + output_dir "../" + module_format "esm" + version "0.221.0" +} diff --git a/skills/foundation-creator/evals/evals.json b/skills/foundation-creator/evals/evals.json new file mode 100644 index 0000000..d0eae13 --- /dev/null +++ b/skills/foundation-creator/evals/evals.json @@ -0,0 +1,26 @@ +{ + "skill_name": "foundation-creator", + "runner_contract": { + "type": "baml_pipeline", + "packet_type": "FoundationEvalPacket", + "compile_brief_function": "CompileFoundationBriefFromPacket", + "render_document_function": "RenderFoundationDocumentDraft", + "evaluate_document_function": "EvaluateFoundationDocument" + }, + "evals": [ + { + "id": 0, + "eval_name": "create-foundation-from-vercel-source-packet", + "prompt": "Use the source packet in `fixtures/vercel/raw_notes.md` to draft a top-level foundation document for Vercel. Preserve ambiguity where the positioning is in transition. 
Do not produce a `SPEC.md`, implementation plan, or architecture diagram.", + "expected_output": "A top-level foundation document that frames Vercel as a developer cloud/platform company, captures the current tension between `Frontend Cloud` and `AI Cloud`, identifies core surfaces such as deployment workflow, collaboration, security, AI infrastructure, and platform-building, clarifies that Vercel is not just static hosting or general-purpose IaaS, and preserves open questions or strategic bets instead of inventing certainty.", + "expected_file": "fixtures/vercel/expected_criteria.md", + "packet_files": { + "raw_notes": "fixtures/vercel/raw_notes.md", + "expected_criteria": "fixtures/vercel/expected_criteria.md" + }, + "files": [ + "fixtures/vercel/raw_notes.md" + ] + } + ] +} diff --git a/skills/foundation-creator/evals/fixtures/vercel/expected_criteria.md b/skills/foundation-creator/evals/fixtures/vercel/expected_criteria.md new file mode 100644 index 0000000..f6ed893 --- /dev/null +++ b/skills/foundation-creator/evals/fixtures/vercel/expected_criteria.md @@ -0,0 +1,20 @@ +# Expected Criteria + +- The output should identify Vercel as a developer platform or cloud for + shipping web and AI products, not merely a static hosting company. +- The output should preserve the current positioning tension between + `Frontend Cloud` and `AI Cloud` instead of pretending only one framing + exists. +- The output should identify multiple durable surfaces: + deployment workflow, collaboration, security, AI infrastructure, and + platform-building support. +- The output should describe at least one actor model that includes developers + or teams, and it should ideally also recognize AI tools or platform builders + as meaningful actors. +- The output should set clear boundaries: + not a general-purpose IaaS provider, not just a frontend framework, and not + just an AI tooling brand. 
+- The output should preserve at least one open question or strategic bet + instead of forcing a final conclusion about Vercel's ultimate primitive. +- The output should not invent internal organizational structure, financial + claims, or product lines that are not present in the packet. diff --git a/skills/foundation-creator/evals/fixtures/vercel/raw_notes.md b/skills/foundation-creator/evals/fixtures/vercel/raw_notes.md new file mode 100644 index 0000000..f7b6ea9 --- /dev/null +++ b/skills/foundation-creator/evals/fixtures/vercel/raw_notes.md @@ -0,0 +1,111 @@ +# Vercel Source Packet + +Assembled on April 20, 2026 from official Vercel sources. + +This packet is intentionally paraphrased. It is meant to test whether +`foundation-creator` can turn a modern company/product platform into a durable +foundation document without collapsing brand transition into fake certainty. + +## Source 1 + +- URL: [https://vercel.com/about](https://vercel.com/about) +- Accessed: April 20, 2026 +- The About page says Vercel "enables the world to ship the best products." +- It describes the `Frontend Cloud` as the developer experience and + infrastructure to build, scale, and secure a faster, more personalized web. +- Its brand values emphasize products that are easy, universal, and + accessible. + +## Source 2 + +- URL: [https://vercel.com/docs](https://vercel.com/docs) +- Last updated: January 30, 2026 +- The docs index now describes Vercel as the `AI Cloud`, a unified platform + for building, deploying, and scaling AI-powered applications. +- The docs say Vercel can ship web apps, agentic workloads, and "everything in + between." +- Git-connected deployment is still central: connect a repository and deploy + on every push, with automatic preview environments before production. +- Build surfaces listed in the docs include `Next.js`, `Functions`, `Routing + Middleware`, `Incremental Static Regeneration`, `Image Optimization`, + environment management, and feature flags. 
+- AI surfaces listed in the docs include `v0`, `AI SDK`, `AI Gateway`, + `Agents`, `MCP Servers`, `Agent Resources`, `Sandbox`, and claim + deployments. +- Collaboration surfaces listed in the docs include `Toolbar`, `Comments`, and + `Draft mode`. +- Security surfaces listed in the docs include `Deployment Protection`, + `RBAC`, `Configurable WAF`, `Bot Management`, and `BotID`. + +## Source 3 + +- URL: [https://vercel.com/docs/getting-started-with-vercel](https://vercel.com/docs/getting-started-with-vercel) +- Last updated: September 24, 2025 +- Vercel is described as a platform for developers that provides tools, + workflows, and infrastructure to build and deploy web apps faster without + needing additional configuration. +- The getting started guide says Vercel supports popular frontend frameworks + out of the box. +- It also says the infrastructure is globally distributed. +- During development, Vercel provides preview and production environments and + comments for real-time collaboration. +- The docs repeatedly support both dashboard and CLI workflows. + +## Source 4 + +- URL: [https://vercel.com/blog/introducing-vercel-mcp-connect-vercel-to-your-ai-tools](https://vercel.com/blog/introducing-vercel-mcp-connect-vercel-to-your-ai-tools) +- Published: August 6, 2025 +- Vercel launched an official MCP server in public beta. +- The launch framing says AI tools lacked secure, structured access to + infrastructure like Vercel. +- The launch post describes Vercel MCP as a secure, OAuth-compliant interface + that lets AI clients interact with Vercel projects. +- The launch capabilities include searching docs, retrieving deployment logs, + fetching teams, and fetching projects. +- The launch post explicitly frames the initial server as read-only and + approved-client only. +- The launch post also says Vercel wants to be a place where developers ship + their own MCP servers. 
+ +## Source 5 + +- URL: [https://vercel.com/docs/agent-resources/vercel-mcp](https://vercel.com/docs/agent-resources/vercel-mcp) +- Last updated: January 30, 2026 +- The product docs describe Vercel MCP as Vercel's official remote MCP with + OAuth at `https://mcp.vercel.com`. +- The docs say it lets AI tools search docs, manage projects and deployments, + and analyze deployment logs. +- Supported AI clients listed in the docs include Claude, ChatGPT, Codex CLI, + Cursor, VS Code with Copilot, Devin, Raycast, Goose, Windsurf, and Gemini + tools. +- The docs emphasize allowlisted clients, OAuth, and official endpoint + verification as security controls. +- There is a likely product transition to capture: the August 6, 2025 launch + post frames the service as read-only, while the January 30, 2026 product + docs frame it as broader project and deployment management. + +## Source 6 + +- URL: [https://vercel.com/platforms/docs](https://vercel.com/platforms/docs) +- Accessed: April 20, 2026 +- Vercel for Platforms supports two patterns: `Multi-Tenant` and + `Multi-Project`. +- The docs say `Multi-Tenant` is for one application structure with + tenant-specific content and branding. +- The docs say `Multi-Project` is for unique codebases, per-customer + environments, and AI coding platforms where complete isolation is required. +- This suggests Vercel is not only a deployment product for a single app team; + it is also a substrate for other platforms. + +## Tensions and questions the evaluator should preserve + +- Vercel still publicly uses `Frontend Cloud` language on the About page while + the docs index now centers `AI Cloud`. +- The same company appears to span deployment infrastructure, collaboration, + security, AI application tooling, and platform-building primitives. +- It should be treated as more than hosting, but not flattened into generic + cloud infrastructure. 
+- A good foundation document should preserve whether the core primitive is + "ship products fast", "deploy web and AI apps", "developer cloud", or + "infrastructure for the product surface of the internet" if the sources do + not settle it cleanly. diff --git a/skills/spec-creator/baml_src/clients.baml b/skills/spec-creator/baml_src/clients.baml new file mode 100644 index 0000000..cd662f1 --- /dev/null +++ b/skills/spec-creator/baml_src/clients.baml @@ -0,0 +1,10 @@ +client EvalModel { + provider "openai-responses" + options { + api_key env.OPENAI_API_KEY + model "gpt-5-mini" + reasoning { + effort "medium" + } + } +} diff --git a/skills/spec-creator/baml_src/generators.baml b/skills/spec-creator/baml_src/generators.baml new file mode 100644 index 0000000..248fe62 --- /dev/null +++ b/skills/spec-creator/baml_src/generators.baml @@ -0,0 +1,6 @@ +generator target { + output_type "typescript" + output_dir "../" + module_format "esm" + version "0.221.0" +} diff --git a/skills/spec-creator/baml_src/spec_compiler/common_types.baml b/skills/spec-creator/baml_src/spec_compiler/common_types.baml new file mode 100644 index 0000000..cb35cb6 --- /dev/null +++ b/skills/spec-creator/baml_src/spec_compiler/common_types.baml @@ -0,0 +1,20 @@ +enum ClaimKind { + Fact + Thesis + Boundary + Constraint + OpenQuestion +} + +enum Confidence { + High + Medium + Low +} + +class Claim { + statement string @assert(nonempty_statement, {{ this|length > 0 }}) + kind ClaimKind + confidence Confidence + sources string[] @assert(has_source, {{ this|length > 0 }}) +} diff --git a/skills/spec-creator/baml_src/spec_compiler/compiler_functions.baml b/skills/spec-creator/baml_src/spec_compiler/compiler_functions.baml new file mode 100644 index 0000000..5fbe8b0 --- /dev/null +++ b/skills/spec-creator/baml_src/spec_compiler/compiler_functions.baml @@ -0,0 +1,78 @@ +function ExtractClaims(raw_notes: string) -> Claim[] { + client EvalModel + prompt #" + Extract atomic claims from the raw notes below. 
+ + Rules: + - Preserve the original intent. + - Split compound statements into separate claims when useful. + - Use `Fact` only for concrete statements in the notes. + - Use `Thesis` for directional beliefs that still affect scope. + - Use `Boundary` for explicit scope limits or exclusions. + - Use `Constraint` for hard operating requirements. + - Use `OpenQuestion` when the note is unresolved. + - Keep `sources` short and human-readable. + + Raw notes: + {{ raw_notes }} + + {{ ctx.output_format }} + "# +} + +function CompileSpecBrief(raw_notes: string, existing_spec: string?) -> SpecBrief { + client EvalModel + prompt #" + Compile a spec brief from the inputs below. + + Rules: + - Produce a behavioral brief suitable for `spec-creator`. + - Prefer service-level behavior over product doctrine. + - Preserve unresolved questions explicitly. + - Do not invent implementation detail. + - If `existing_spec` is present, preserve compatible intent and only sharpen gaps. + + Raw notes: + {{ raw_notes }} + + Existing spec: + {{ existing_spec }} + + {{ ctx.output_format }} + "# +} + +function CritiqueSpecBrief(brief: SpecBrief) -> SpecCritique { + client EvalModel + prompt #" + Critique the spec brief below. + + Rules: + - Flag contradictions across goals, non-goals, and boundaries. + - Flag implementation leakage. + - Flag ambiguous terms that weaken a behavioral specification. + - Flag missing sections implied by the current brief. + + Brief: + {{ brief|format(type="yaml") }} + + {{ ctx.output_format }} + "# +} + +function RenderSpecCreatorPrompt(brief: SpecBrief) -> string { + client EvalModel + prompt #" + Render a prompt for the `spec-creator` skill using the brief below. + + Rules: + - Ask for a top-level `SPEC.md`. + - Keep the output language-agnostic and behavioral. + - Preserve unresolved questions instead of inventing decisions. + - Emphasize problem statement, goals, non-goals, components, dependencies, and entities. 
+ - Make the prompt directly usable by an agent. + + Brief: + {{ brief|format(type="yaml") }} + "# +} diff --git a/skills/spec-creator/baml_src/spec_compiler/eval_runner.baml b/skills/spec-creator/baml_src/spec_compiler/eval_runner.baml new file mode 100644 index 0000000..068a32a --- /dev/null +++ b/skills/spec-creator/baml_src/spec_compiler/eval_runner.baml @@ -0,0 +1,67 @@ +function CompileSpecBriefFromPacket(packet: SpecEvalPacket) -> SpecBrief { + client EvalModel + prompt #" + Compile a spec brief from the evaluation packet below. + + Task prompt: + {{ packet.task_prompt }} + + Raw notes: + {{ packet.raw_notes }} + + Existing spec: + {{ packet.existing_spec }} + + Expected criteria: + {{ packet.expected_criteria }} + + Rules: + - Produce a behavioral service brief suitable for `spec-creator`. + - Treat expected criteria as evaluation guidance, not as license to invent. + - Preserve unresolved questions explicitly. + - Avoid implementation detail. + + {{ ctx.output_format }} + "# +} + +function RenderSpecDocumentDraft(brief: SpecBrief) -> string { + client EvalModel + prompt #" + Draft a `SPEC.md` from the brief below. + + Rules: + - Keep the document behavioral and language-agnostic. + - Use problem statement, goals, non-goals, boundaries, components, + dependencies, and entities. + - Preserve uncertainty where the source packet is in transition. + - Avoid implementation detail. + + Brief: + {{ brief|format(type="yaml") }} + "# +} + +function EvaluateSpecDocument( + packet: SpecEvalPacket, + candidate_document: string +) -> EvalReport { + client EvalModel + prompt #" + Evaluate the candidate `SPEC.md` against the evaluation packet. + + Packet: + {{ packet|format(type="yaml") }} + + Candidate document: + {{ candidate_document }} + + Rules: + - Grade against the expected criteria explicitly. + - Penalize invented capabilities, invented certainty, or implementation leakage. + - Reward correct scope boundaries and careful handling of transition states. 
+ - Use `Pass`, `Partial`, or `Fail` for each criterion. + + {{ ctx.output_format }} + "# +} diff --git a/skills/spec-creator/baml_src/spec_compiler/eval_types.baml b/skills/spec-creator/baml_src/spec_compiler/eval_types.baml new file mode 100644 index 0000000..d3576c9 --- /dev/null +++ b/skills/spec-creator/baml_src/spec_compiler/eval_types.baml @@ -0,0 +1,26 @@ +enum EvalStatus { + Pass + Partial + Fail +} + +class EvalCheck { + criterion string @assert(nonempty_criterion, {{ this|length > 0 }}) + status EvalStatus + rationale string @assert(nonempty_rationale, {{ this|length > 0 }}) +} + +class EvalReport { + overall_status EvalStatus + summary string @assert(nonempty_summary, {{ this|length > 0 }}) + checks EvalCheck[] + open_issues string[] +} + +class SpecEvalPacket { + packet_name string @assert(nonempty_packet_name, {{ this|length > 0 }}) + task_prompt string @assert(nonempty_task_prompt, {{ this|length > 0 }}) + raw_notes string @assert(nonempty_raw_notes, {{ this|length > 0 }}) + expected_criteria string @assert(nonempty_expected_criteria, {{ this|length > 0 }}) + existing_spec string? 
+} diff --git a/skills/spec-creator/baml_src/spec_compiler/spec_types.baml b/skills/spec-creator/baml_src/spec_compiler/spec_types.baml new file mode 100644 index 0000000..3d116b5 --- /dev/null +++ b/skills/spec-creator/baml_src/spec_compiler/spec_types.baml @@ -0,0 +1,37 @@ +class ComponentBrief { + name string @assert(nonempty_name, {{ this|length > 0 }}) + responsibility string @assert(nonempty_responsibility, {{ this|length > 0 }}) +} + +class FieldBrief { + name string @assert(nonempty_name, {{ this|length > 0 }}) + type_expression string @assert(nonempty_type_expression, {{ this|length > 0 }}) + description string @assert(nonempty_description, {{ this|length > 0 }}) + required bool +} + +class EntityBrief { + name string @assert(nonempty_name, {{ this|length > 0 }}) + description string @assert(nonempty_description, {{ this|length > 0 }}) + fields FieldBrief[] +} + +class SpecBrief { + service_name string @assert(nonempty_service_name, {{ this|length > 0 }}) + purpose string @assert(nonempty_purpose, {{ this|length > 0 }}) + operational_problems string[] + goals string[] + non_goals string[] + important_boundaries string[] + components ComponentBrief[] + external_dependencies string[] + entities EntityBrief[] + unresolved_questions string[] +} + +class SpecCritique { + contradictions string[] + implementation_leaks string[] + ambiguous_terms string[] + missing_sections string[] +} diff --git a/skills/spec-creator/evals/evals.json b/skills/spec-creator/evals/evals.json index 8e81200..8c9e5ad 100644 --- a/skills/spec-creator/evals/evals.json +++ b/skills/spec-creator/evals/evals.json @@ -1,5 +1,12 @@ { "skill_name": "spec-creator", + "runner_contract": { + "type": "baml_pipeline", + "packet_type": "SpecEvalPacket", + "compile_brief_function": "CompileSpecBriefFromPacket", + "render_document_function": "RenderSpecDocumentDraft", + "evaluate_document_function": "EvaluateSpecDocument" + }, "evals": [ { "id": 0, @@ -21,6 +28,18 @@ "prompt": "We have an existing 
SPEC.md at the repo root for Log Shipper. Please update it: add a non-goal stating that cross-region log replication is out of scope, and add a new main component called `Offset Store` that persists per-file read offsets. Keep everything else as-is.", "expected_output": "SPEC.md with the existing sections preserved verbatim except: (1) a new bullet added under Non-Goals covering cross-region replication, phrased as a gerund/noun phrase per the language guide; (2) a new numbered component `Offset Store` added under 3.1 with a verb-led description. Section numbering remains 1, 2, 3; component numbering extends to 4. No first-person pronouns introduced.", "files": ["fixtures/existing_spec.md"] + }, + { + "id": 3, + "eval_name": "create-from-vercel-mcp-source-packet", + "prompt": "Use the source packet in `fixtures/vercel_mcp/raw_notes.md` to write a `SPEC.md` for a service called `Vercel MCP`. Treat it as a long-running remote MCP service, not as a company-level foundation document. Preserve timeline-specific ambiguity where the source packet is in transition, and do not invent write capabilities that the notes do not justify.", + "expected_output": "A `SPEC.md` that frames `Vercel MCP` as an OAuth-protected remote MCP service for AI tools, includes a Problem Statement about secure structured access to Vercel docs, projects, deployments, and logs, sets clear goals and non-goals, identifies boundaries around approved clients and official endpoint usage, captures the current beta/read-only tension without pretending the service already has unconstrained write access, and stays behavioral rather than implementation-level.", + "expected_file": "fixtures/vercel_mcp/expected_criteria.md", + "packet_files": { + "raw_notes": "fixtures/vercel_mcp/raw_notes.md", + "expected_criteria": "fixtures/vercel_mcp/expected_criteria.md" + }, + "files": ["fixtures/vercel_mcp/raw_notes.md"] } ] } diff --git a/skills/spec-creator/evals/fixtures/vercel_mcp/expected_criteria.md 
b/skills/spec-creator/evals/fixtures/vercel_mcp/expected_criteria.md new file mode 100644 index 0000000..d297751 --- /dev/null +++ b/skills/spec-creator/evals/fixtures/vercel_mcp/expected_criteria.md @@ -0,0 +1,16 @@ +# Expected Criteria + +- The output should frame `Vercel MCP` as a remote MCP service for AI tools + interacting with Vercel resources. +- The purpose should mention secure access to Vercel docs, projects, + deployments, or logs through an OAuth-protected MCP endpoint. +- The problem statement should capture the need for structured AI access to + Vercel context from tools or development environments. +- The output should include boundaries around official endpoint usage, + approved clients, and security-sensitive access patterns. +- The output should preserve the transition between the August 6, 2025 + read-only launch framing and the January 30, 2026 broader management framing. +- The output should avoid claiming unconstrained write behavior unless it is + explicitly scoped or qualified. +- The output should stay behavioral and language-agnostic, not implementation + specific. diff --git a/skills/spec-creator/evals/fixtures/vercel_mcp/raw_notes.md b/skills/spec-creator/evals/fixtures/vercel_mcp/raw_notes.md new file mode 100644 index 0000000..ad3be92 --- /dev/null +++ b/skills/spec-creator/evals/fixtures/vercel_mcp/raw_notes.md @@ -0,0 +1,62 @@ +# Vercel MCP Source Packet + +Assembled on April 20, 2026 from official Vercel sources. + +This packet is intentionally narrow. It exists to test whether `spec-creator` +can produce a concrete service specification from a modern product surface +without inventing capabilities that are still in transition. + +## Source 1 + +- URL: [https://vercel.com/blog/introducing-vercel-mcp-connect-vercel-to-your-ai-tools](https://vercel.com/blog/introducing-vercel-mcp-connect-vercel-to-your-ai-tools) +- Published: August 6, 2025 +- Vercel introduced an official MCP server in public beta. 
+- The launch post describes the service as a secure, OAuth-compliant interface + that lets AI clients interact with Vercel projects. +- The launch motivation is that AI tools need secure, structured access to + infrastructure like Vercel from inside development environments and AI + assistants. +- The launch capabilities include searching docs, retrieving deployment logs, + fetching teams, and fetching projects. +- The launch post says the initial service is read-only. +- The launch post also says only approved clients are allowed, and OAuth + consent is shown on every connection. +- Official endpoint in the launch post: `https://mcp.vercel.com`. + +## Source 2 + +- URL: [https://vercel.com/docs/agent-resources/vercel-mcp](https://vercel.com/docs/agent-resources/vercel-mcp) +- Last updated: January 30, 2026 +- The product docs describe Vercel MCP as Vercel's official remote MCP with + OAuth. +- The docs say it lets AI tools search docs, manage projects and deployments, + and analyze deployment logs. +- The docs list many supported clients including Claude tools, ChatGPT, Codex + CLI, Cursor, VS Code with Copilot, Devin, Raycast, Goose, Windsurf, and + Gemini tools. +- The docs emphasize endpoint verification, OAuth, and approved-client + restrictions as security controls. +- The docs position the service as part of an agent workflow around live + Vercel context and project operations. + +## Source 3 + +- URL: [https://vercel.com/docs](https://vercel.com/docs) +- Last updated: January 30, 2026 +- The docs index places Vercel MCP inside a broader AI infrastructure surface + alongside `Agents`, `MCP Servers`, `Agent Resources`, `Sandbox`, `AI SDK`, + and `AI Gateway`. +- This suggests Vercel MCP is not an isolated side experiment; it is part of a + broader AI-tooling platform direction. + +## Important boundaries and tensions + +- The service is specifically about Vercel context and operations for AI + clients, not a generic MCP hosting platform. 
+- The endpoint is official and singular in the notes: + `https://mcp.vercel.com`. +- August 6, 2025 launch framing is explicitly read-only. +- January 30, 2026 docs language suggests broader project/deployment + management. +- A good spec should preserve that transition carefully. It should not invent + arbitrary mutation powers if the source packet does not settle them. From 068687836acd5fb05c67f0a4962110aa01b1525f Mon Sep 17 00:00:00 2001 From: Jeevan Pillay <169354619+jeevanpillay@users.noreply.github.com> Date: Mon, 20 Apr 2026 15:26:15 +1000 Subject: [PATCH 02/30] Migrate evals to AI Gateway and Bun --- .gitignore | 23 ++- README.md | 16 +- bun.lock | 59 ++++++ package-lock.json | 188 ------------------ package.json | 13 +- scripts/run-baml-eval.mjs | 24 ++- .../foundation-creator/baml_src/clients.baml | 5 +- .../compiler_functions.baml | 5 + .../foundation_compiler/eval_runner.baml | 3 + skills/spec-creator/baml_src/clients.baml | 5 +- .../spec_compiler/compiler_functions.baml | 4 + .../baml_src/spec_compiler/eval_runner.baml | 3 + 12 files changed, 133 insertions(+), 215 deletions(-) create mode 100644 bun.lock delete mode 100644 package-lock.json diff --git a/.gitignore b/.gitignore index a7f1d20..f399c82 100644 --- a/.gitignore +++ b/.gitignore @@ -1,7 +1,18 @@ node_modules/ -skills/foundation-creator/baml_client/ -skills/foundation-creator/baml_client_dist/ -skills/spec-creator/baml_client/ -skills/spec-creator/baml_client_dist/ -skills/foundation-creator/evals/runs/ -skills/spec-creator/evals/runs/ + +# Generated BAML clients +skills/**/baml_client/ +skills/**/baml_client_dist/ + +# Local eval outputs +skills/**/evals/runs/ + +# Local environment files +.env +.env.local +.env.*.local + +# Local OS and tooling noise +.DS_Store +*.log +.tmp-baml-client-tsconfig.json diff --git a/README.md b/README.md index 7af46e7..4a628c3 100644 --- a/README.md +++ b/README.md @@ -26,14 +26,24 @@ This repo now includes BAML-backed fixture evals for `foundation-creator` 
and `spec-creator`. ```bash -npm install -OPENAI_API_KEY=... npm run eval:foundation -- create-foundation-from-vercel-source-packet -OPENAI_API_KEY=... npm run eval:spec -- create-from-vercel-mcp-source-packet +bun install +bun run eval:foundation -- create-foundation-from-vercel-source-packet +bun run eval:spec -- create-from-vercel-mcp-source-packet ``` Each run writes packet, brief, candidate document, and evaluation report artifacts under `skills//evals/runs/`. +`bun run eval:*` loads `.env` automatically through `dotenv-cli`, so +`AI_GATEWAY_API_KEY` can live in the repo-local `.env` without manual +`source` steps. + +For other local commands that should inherit `.env`, use: + +```bash +bun run with-env -- bun run ./scripts/run-baml-eval.mjs foundation-creator create-foundation-from-vercel-source-packet +``` + ## License MIT diff --git a/bun.lock b/bun.lock new file mode 100644 index 0000000..0a5ac2e --- /dev/null +++ b/bun.lock @@ -0,0 +1,59 @@ +{ + "lockfileVersion": 1, + "configVersion": 0, + "workspaces": { + "": { + "name": "@lightfastai/skills", + "dependencies": { + "@boundaryml/baml": "0.221.0", + "typescript": "5.9.3", + }, + "devDependencies": { + "dotenv-cli": "^8.0.0", + }, + }, + }, + "packages": { + "@boundaryml/baml": ["@boundaryml/baml@0.221.0", "", { "dependencies": { "@scarf/scarf": "^1.3.0" }, "optionalDependencies": { "@boundaryml/baml-darwin-arm64": "0.221.0", "@boundaryml/baml-darwin-x64": "0.221.0", "@boundaryml/baml-linux-arm64-gnu": "0.221.0", "@boundaryml/baml-linux-arm64-musl": "0.221.0", "@boundaryml/baml-linux-x64-gnu": "0.221.0", "@boundaryml/baml-linux-x64-musl": "0.221.0", "@boundaryml/baml-win32-arm64-msvc": "0.221.0", "@boundaryml/baml-win32-x64-msvc": "0.221.0" }, "bin": { "baml": "cli.js", "baml-cli": "cli.js" } }, "sha512-pPOp2JVsG4Wa/tMLnJv/rxil5jsuVDgxnA0xO0h4lKy7t/fKCXOVvO+nzpOZ4byLTP/Ow+8pVvoKRKvx1J/Hsw=="], + + "@boundaryml/baml-darwin-arm64": ["@boundaryml/baml-darwin-arm64@0.221.0", "", { "os": "darwin", "cpu": 
"arm64" }, "sha512-GxqdjVUodyKtgKX/CIDGZyz5lXS0d0iFnV2x7thMQM9ziMrOPcWd3qwflOLYdgDo6Hy9yMULrqtMPkCrmbwEHQ=="], + + "@boundaryml/baml-darwin-x64": ["@boundaryml/baml-darwin-x64@0.221.0", "", { "os": "darwin", "cpu": "x64" }, "sha512-wG3jsgOIr8C+09j0AFZY4F8EHvd1gKoKw6+HR1Oi+cw4pijklCk2LI0AIwMPzgG12BAxWV6jEIONMORmspesFQ=="], + + "@boundaryml/baml-linux-arm64-gnu": ["@boundaryml/baml-linux-arm64-gnu@0.221.0", "", { "os": "linux", "cpu": "arm64" }, "sha512-Xy1M3muUV2B/4f8dVUpX/IN2CI1m4hGtw31V+kQdFYsy3Hvo58qjijtlkKNYZOjqWBqVlgPMFhTvv8N0cD4N/w=="], + + "@boundaryml/baml-linux-arm64-musl": ["@boundaryml/baml-linux-arm64-musl@0.221.0", "", { "os": "linux", "cpu": "arm64" }, "sha512-6RIkHCViXQEsn6Ts5Uk9c6SDgokkXGO4GkoHpoNnKluTJtuB/B2nUOv2O147GFDqtspFDL2jk5d+oiYibfMn0g=="], + + "@boundaryml/baml-linux-x64-gnu": ["@boundaryml/baml-linux-x64-gnu@0.221.0", "", { "os": "linux", "cpu": "x64" }, "sha512-YoOz6N6E37UE4ULRCe24P/Ov2pNxjvI4R+I6Bwhkqdt5HOGsJrf2uJUSC+XxKZpkPqlbo1gGZPoCB0lcyeSkeA=="], + + "@boundaryml/baml-linux-x64-musl": ["@boundaryml/baml-linux-x64-musl@0.221.0", "", { "os": "linux", "cpu": "x64" }, "sha512-gY67VRXrixgTenDtDzVSMo0GjLbeofGtCZuArfiDgCglfJ5/KGBSgwzqrrTuyUVLGK902NmCaYA5OrPSXezSzg=="], + + "@boundaryml/baml-win32-arm64-msvc": ["@boundaryml/baml-win32-arm64-msvc@0.221.0", "", { "os": "win32", "cpu": "arm64" }, "sha512-pTHPv6GVlW7nLVszgm7P7+PdQ97JJ8xnRp3/TeP/ya5z08wKi0ejOInLzElMyZVTB+XY707qGlM9CreJnDH3vg=="], + + "@boundaryml/baml-win32-x64-msvc": ["@boundaryml/baml-win32-x64-msvc@0.221.0", "", { "os": "win32", "cpu": "x64" }, "sha512-XP3CxwsYxOZAOzkWqZd2Dg8iNpDOMrbA/Bz3nqI7oX/wL+ZMkHJwjWQwxIVL+sg2rp+TceV+21UPb6LTmt+qJw=="], + + "@scarf/scarf": ["@scarf/scarf@1.4.0", "", {}, "sha512-xxeapPiUXdZAE3che6f3xogoJPeZgig6omHEy1rIY5WVsB3H2BHNnZH+gHG6x91SCWyQCzWGsuL2Hh3ClO5/qQ=="], + + "cross-spawn": ["cross-spawn@7.0.6", "", { "dependencies": { "path-key": "^3.1.0", "shebang-command": "^2.0.0", "which": "^2.0.1" } }, 
"sha512-uV2QOWP2nWzsy2aMp8aRibhi9dlzF5Hgh5SHaB9OiTGEyDTiJJyx0uy51QXdyWbtAHNua4XJzUKca3OzKUd3vA=="], + + "dotenv": ["dotenv@16.6.1", "", {}, "sha512-uBq4egWHTcTt33a72vpSG0z3HnPuIl6NqYcTrKEg2azoEyl2hpW0zqlxysq2pK9HlDIHyHyakeYaYnSAwd8bow=="], + + "dotenv-cli": ["dotenv-cli@8.0.0", "", { "dependencies": { "cross-spawn": "^7.0.6", "dotenv": "^16.3.0", "dotenv-expand": "^10.0.0", "minimist": "^1.2.6" }, "bin": { "dotenv": "cli.js" } }, "sha512-aLqYbK7xKOiTMIRf1lDPbI+Y+Ip/wo5k3eyp6ePysVaSqbyxjyK3dK35BTxG+rmd7djf5q2UPs4noPNH+cj0Qw=="], + + "dotenv-expand": ["dotenv-expand@10.0.0", "", {}, "sha512-GopVGCpVS1UKH75VKHGuQFqS1Gusej0z4FyQkPdwjil2gNIv+LNsqBlboOzpJFZKVT95GkCyWJbBSdFEFUWI2A=="], + + "isexe": ["isexe@2.0.0", "", {}, "sha512-RHxMLp9lnKHGHRng9QFhRCMbYAcVpn69smSGcq3f36xjgVVWThj4qqLbTLlq7Ssj8B+fIQ1EuCEGI2lKsyQeIw=="], + + "minimist": ["minimist@1.2.8", "", {}, "sha512-2yyAR8qBkN3YuheJanUpWC5U3bb5osDywNB8RzDVlDwDHbocAJveqqj1u8+SVD7jkWT4yvsHCpWqqWqAxb0zCA=="], + + "path-key": ["path-key@3.1.1", "", {}, "sha512-ojmeN0qd+y0jszEtoY48r0Peq5dwMEkIlCOu6Q5f41lfkswXuKtYrhgoTpLnyIcHm24Uhqx+5Tqm2InSwLhE6Q=="], + + "shebang-command": ["shebang-command@2.0.0", "", { "dependencies": { "shebang-regex": "^3.0.0" } }, "sha512-kHxr2zZpYtdmrN1qDjrrX/Z1rR1kG8Dx+gkpK1G4eXmvXswmcE1hTWBWYUzlraYw1/yZp6YuDY77YtvbN0dmDA=="], + + "shebang-regex": ["shebang-regex@3.0.0", "", {}, "sha512-7++dFhtcx3353uBaq8DDR4NuxBetBzC7ZQOhmTQInHEd6bSrXdiEyzCvG07Z44UYdLShWUyXt5M/yhz8ekcb1A=="], + + "typescript": ["typescript@5.9.3", "", { "bin": { "tsc": "bin/tsc", "tsserver": "bin/tsserver" } }, "sha512-jl1vZzPDinLr9eUt3J/t7V6FgNEw9QjvBPdysz9KfQDD41fQrC2Y4vKQdiaUpFT4bXlb1RHhLpp8wtm6M5TgSw=="], + + "which": ["which@2.0.2", "", { "dependencies": { "isexe": "^2.0.0" }, "bin": { "node-which": "./bin/node-which" } }, "sha512-BLI3Tl1TW3Pvl70l3yq3Y64i+awpwXqsGBYWkkqMtnbXgrMD+yj7rhW0kuEDxzJaYXGjEW5ogapKNMEKNMjibA=="], + } +} diff --git a/package-lock.json b/package-lock.json deleted file mode 100644 index 7889375..0000000 
--- a/package-lock.json +++ /dev/null @@ -1,188 +0,0 @@ -{ - "name": "@lightfastai/skills", - "lockfileVersion": 3, - "requires": true, - "packages": { - "": { - "name": "@lightfastai/skills", - "dependencies": { - "@boundaryml/baml": "0.221.0", - "typescript": "5.9.3" - } - }, - "node_modules/@boundaryml/baml": { - "version": "0.221.0", - "resolved": "https://registry.npmjs.org/@boundaryml/baml/-/baml-0.221.0.tgz", - "integrity": "sha512-pPOp2JVsG4Wa/tMLnJv/rxil5jsuVDgxnA0xO0h4lKy7t/fKCXOVvO+nzpOZ4byLTP/Ow+8pVvoKRKvx1J/Hsw==", - "license": "MIT", - "dependencies": { - "@scarf/scarf": "^1.3.0" - }, - "bin": { - "baml": "cli.js", - "baml-cli": "cli.js" - }, - "engines": { - "node": ">= 10" - }, - "optionalDependencies": { - "@boundaryml/baml-darwin-arm64": "0.221.0", - "@boundaryml/baml-darwin-x64": "0.221.0", - "@boundaryml/baml-linux-arm64-gnu": "0.221.0", - "@boundaryml/baml-linux-arm64-musl": "0.221.0", - "@boundaryml/baml-linux-x64-gnu": "0.221.0", - "@boundaryml/baml-linux-x64-musl": "0.221.0", - "@boundaryml/baml-win32-arm64-msvc": "0.221.0", - "@boundaryml/baml-win32-x64-msvc": "0.221.0" - } - }, - "node_modules/@boundaryml/baml-darwin-arm64": { - "version": "0.221.0", - "resolved": "https://registry.npmjs.org/@boundaryml/baml-darwin-arm64/-/baml-darwin-arm64-0.221.0.tgz", - "integrity": "sha512-GxqdjVUodyKtgKX/CIDGZyz5lXS0d0iFnV2x7thMQM9ziMrOPcWd3qwflOLYdgDo6Hy9yMULrqtMPkCrmbwEHQ==", - "cpu": [ - "arm64" - ], - "license": "MIT", - "optional": true, - "os": [ - "darwin" - ], - "engines": { - "node": ">= 10" - } - }, - "node_modules/@boundaryml/baml-darwin-x64": { - "version": "0.221.0", - "resolved": "https://registry.npmjs.org/@boundaryml/baml-darwin-x64/-/baml-darwin-x64-0.221.0.tgz", - "integrity": "sha512-wG3jsgOIr8C+09j0AFZY4F8EHvd1gKoKw6+HR1Oi+cw4pijklCk2LI0AIwMPzgG12BAxWV6jEIONMORmspesFQ==", - "cpu": [ - "x64" - ], - "license": "MIT", - "optional": true, - "os": [ - "darwin" - ], - "engines": { - "node": ">= 10" - } - }, - 
"node_modules/@boundaryml/baml-linux-arm64-gnu": { - "version": "0.221.0", - "resolved": "https://registry.npmjs.org/@boundaryml/baml-linux-arm64-gnu/-/baml-linux-arm64-gnu-0.221.0.tgz", - "integrity": "sha512-Xy1M3muUV2B/4f8dVUpX/IN2CI1m4hGtw31V+kQdFYsy3Hvo58qjijtlkKNYZOjqWBqVlgPMFhTvv8N0cD4N/w==", - "cpu": [ - "arm64" - ], - "license": "MIT", - "optional": true, - "os": [ - "linux" - ], - "engines": { - "node": ">= 10" - } - }, - "node_modules/@boundaryml/baml-linux-arm64-musl": { - "version": "0.221.0", - "resolved": "https://registry.npmjs.org/@boundaryml/baml-linux-arm64-musl/-/baml-linux-arm64-musl-0.221.0.tgz", - "integrity": "sha512-6RIkHCViXQEsn6Ts5Uk9c6SDgokkXGO4GkoHpoNnKluTJtuB/B2nUOv2O147GFDqtspFDL2jk5d+oiYibfMn0g==", - "cpu": [ - "arm64" - ], - "license": "MIT", - "optional": true, - "os": [ - "linux" - ], - "engines": { - "node": ">= 10" - } - }, - "node_modules/@boundaryml/baml-linux-x64-gnu": { - "version": "0.221.0", - "resolved": "https://registry.npmjs.org/@boundaryml/baml-linux-x64-gnu/-/baml-linux-x64-gnu-0.221.0.tgz", - "integrity": "sha512-YoOz6N6E37UE4ULRCe24P/Ov2pNxjvI4R+I6Bwhkqdt5HOGsJrf2uJUSC+XxKZpkPqlbo1gGZPoCB0lcyeSkeA==", - "cpu": [ - "x64" - ], - "license": "MIT", - "optional": true, - "os": [ - "linux" - ], - "engines": { - "node": ">= 10" - } - }, - "node_modules/@boundaryml/baml-linux-x64-musl": { - "version": "0.221.0", - "resolved": "https://registry.npmjs.org/@boundaryml/baml-linux-x64-musl/-/baml-linux-x64-musl-0.221.0.tgz", - "integrity": "sha512-gY67VRXrixgTenDtDzVSMo0GjLbeofGtCZuArfiDgCglfJ5/KGBSgwzqrrTuyUVLGK902NmCaYA5OrPSXezSzg==", - "cpu": [ - "x64" - ], - "license": "MIT", - "optional": true, - "os": [ - "linux" - ], - "engines": { - "node": ">= 10" - } - }, - "node_modules/@boundaryml/baml-win32-arm64-msvc": { - "version": "0.221.0", - "resolved": "https://registry.npmjs.org/@boundaryml/baml-win32-arm64-msvc/-/baml-win32-arm64-msvc-0.221.0.tgz", - "integrity": 
"sha512-pTHPv6GVlW7nLVszgm7P7+PdQ97JJ8xnRp3/TeP/ya5z08wKi0ejOInLzElMyZVTB+XY707qGlM9CreJnDH3vg==", - "cpu": [ - "arm64" - ], - "license": "MIT", - "optional": true, - "os": [ - "win32" - ], - "engines": { - "node": ">= 10" - } - }, - "node_modules/@boundaryml/baml-win32-x64-msvc": { - "version": "0.221.0", - "resolved": "https://registry.npmjs.org/@boundaryml/baml-win32-x64-msvc/-/baml-win32-x64-msvc-0.221.0.tgz", - "integrity": "sha512-XP3CxwsYxOZAOzkWqZd2Dg8iNpDOMrbA/Bz3nqI7oX/wL+ZMkHJwjWQwxIVL+sg2rp+TceV+21UPb6LTmt+qJw==", - "cpu": [ - "x64" - ], - "license": "MIT", - "optional": true, - "os": [ - "win32" - ], - "engines": { - "node": ">= 10" - } - }, - "node_modules/@scarf/scarf": { - "version": "1.4.0", - "resolved": "https://registry.npmjs.org/@scarf/scarf/-/scarf-1.4.0.tgz", - "integrity": "sha512-xxeapPiUXdZAE3che6f3xogoJPeZgig6omHEy1rIY5WVsB3H2BHNnZH+gHG6x91SCWyQCzWGsuL2Hh3ClO5/qQ==", - "hasInstallScript": true, - "license": "Apache-2.0" - }, - "node_modules/typescript": { - "version": "5.9.3", - "resolved": "https://registry.npmjs.org/typescript/-/typescript-5.9.3.tgz", - "integrity": "sha512-jl1vZzPDinLr9eUt3J/t7V6FgNEw9QjvBPdysz9KfQDD41fQrC2Y4vKQdiaUpFT4bXlb1RHhLpp8wtm6M5TgSw==", - "license": "Apache-2.0", - "bin": { - "tsc": "bin/tsc", - "tsserver": "bin/tsserver" - }, - "engines": { - "node": ">=14.17" - } - } - } -} diff --git a/package.json b/package.json index 0ebcd73..944dfe8 100644 --- a/package.json +++ b/package.json @@ -1,15 +1,20 @@ { "name": "@lightfastai/skills", "private": true, + "packageManager": "bun@1.3.9", "type": "module", "scripts": { - "baml:generate:foundation": "npx baml-cli generate --from ./skills/foundation-creator/baml_src", - "baml:generate:spec": "npx baml-cli generate --from ./skills/spec-creator/baml_src", - "eval:foundation": "node ./scripts/run-baml-eval.mjs foundation-creator", - "eval:spec": "node ./scripts/run-baml-eval.mjs spec-creator" + "with-env": "dotenv -e .env --", + "baml:generate:foundation": "bunx baml-cli 
generate --from ./skills/foundation-creator/baml_src", + "baml:generate:spec": "bunx baml-cli generate --from ./skills/spec-creator/baml_src", + "eval:foundation": "bun run with-env -- bun run ./scripts/run-baml-eval.mjs foundation-creator", + "eval:spec": "bun run with-env -- bun run ./scripts/run-baml-eval.mjs spec-creator" }, "dependencies": { "@boundaryml/baml": "0.221.0", "typescript": "5.9.3" + }, + "devDependencies": { + "dotenv-cli": "^8.0.0" } } diff --git a/scripts/run-baml-eval.mjs b/scripts/run-baml-eval.mjs index af18cec..039d4a6 100644 --- a/scripts/run-baml-eval.mjs +++ b/scripts/run-baml-eval.mjs @@ -163,8 +163,8 @@ async function main() { fail(`Skill '${skillName}' does not declare a supported runner_contract.`); } - if (!process.env.OPENAI_API_KEY) { - fail("OPENAI_API_KEY is required to execute BAML evals with the current client configuration."); + if (!process.env.AI_GATEWAY_API_KEY) { + fail("AI_GATEWAY_API_KEY is required to execute BAML evals."); } await ensureFreshClient(skillRoot); @@ -172,17 +172,21 @@ async function main() { const { b } = generated; const packet = await buildPacket(evalEntry, evalsDir, runner.packet_type); - const compileFn = b[runner.compile_brief_function]; - const renderFn = b[runner.render_document_function]; - const evaluateFn = b[runner.evaluate_document_function]; - - if (!compileFn || !renderFn || !evaluateFn) { + const compileFnName = runner.compile_brief_function; + const renderFnName = runner.render_document_function; + const evaluateFnName = runner.evaluate_document_function; + + if ( + typeof b[compileFnName] !== "function" || + typeof b[renderFnName] !== "function" || + typeof b[evaluateFnName] !== "function" + ) { fail(`Generated client is missing one or more runner functions for '${skillName}'.`); } - const brief = await compileFn(packet); - const candidateDocument = await renderFn(brief); - const report = await evaluateFn(packet, candidateDocument); + const brief = await b[compileFnName](packet); + const 
candidateDocument = await b[renderFnName](brief); + const report = await b[evaluateFnName](packet, candidateDocument); const timestamp = new Date().toISOString().replace(/[:.]/g, "-"); const runDir = path.join(skillRoot, "evals", "runs", `${timestamp}-${evalEntry.eval_name}`); diff --git a/skills/foundation-creator/baml_src/clients.baml b/skills/foundation-creator/baml_src/clients.baml index cd662f1..f37af5b 100644 --- a/skills/foundation-creator/baml_src/clients.baml +++ b/skills/foundation-creator/baml_src/clients.baml @@ -1,8 +1,9 @@ client EvalModel { provider "openai-responses" options { - api_key env.OPENAI_API_KEY - model "gpt-5-mini" + api_key env.AI_GATEWAY_API_KEY + base_url "https://ai-gateway.vercel.sh/v1" + model "openai/gpt-5-mini" reasoning { effort "medium" } diff --git a/skills/foundation-creator/baml_src/foundation_compiler/compiler_functions.baml b/skills/foundation-creator/baml_src/foundation_compiler/compiler_functions.baml index 0b2b6c0..e5c720e 100644 --- a/skills/foundation-creator/baml_src/foundation_compiler/compiler_functions.baml +++ b/skills/foundation-creator/baml_src/foundation_compiler/compiler_functions.baml @@ -1,6 +1,7 @@ function ExtractClaims(raw_notes: string) -> Claim[] { client EvalModel prompt #" + {{ _.role("user") }} Extract atomic claims from the raw notes below. Rules: @@ -23,6 +24,7 @@ function ExtractClaims(raw_notes: string) -> Claim[] { function BuildFoundationKernel(claims: Claim[]) -> FoundationKernel { client EvalModel prompt #" + {{ _.role("user") }} Build a stable foundation kernel from the extracted claims below. Rules: @@ -41,6 +43,7 @@ function BuildFoundationKernel(claims: Claim[]) -> FoundationKernel { function CritiqueFoundationKernel(kernel: FoundationKernel) -> FoundationCritique { client EvalModel prompt #" + {{ _.role("user") }} Critique the foundation kernel below. 
Rules: @@ -63,6 +66,7 @@ function CompileFoundationBrief( ) -> FoundationBrief { client EvalModel prompt #" + {{ _.role("user") }} Compile a concise foundation brief from the kernel and critique below. Rules: @@ -84,6 +88,7 @@ function CompileFoundationBrief( function RenderFoundationCreatorPrompt(brief: FoundationBrief) -> string { client EvalModel prompt #" + {{ _.role("user") }} Render a prompt for the `foundation-creator` skill using the brief below. Rules: diff --git a/skills/foundation-creator/baml_src/foundation_compiler/eval_runner.baml b/skills/foundation-creator/baml_src/foundation_compiler/eval_runner.baml index 72df02a..3193812 100644 --- a/skills/foundation-creator/baml_src/foundation_compiler/eval_runner.baml +++ b/skills/foundation-creator/baml_src/foundation_compiler/eval_runner.baml @@ -1,6 +1,7 @@ function CompileFoundationBriefFromPacket(packet: FoundationEvalPacket) -> FoundationBrief { client EvalModel prompt #" + {{ _.role("user") }} Compile a foundation brief from the evaluation packet below. Task prompt: @@ -25,6 +26,7 @@ function CompileFoundationBriefFromPacket(packet: FoundationEvalPacket) -> Found function RenderFoundationDocumentDraft(brief: FoundationBrief) -> string { client EvalModel prompt #" + {{ _.role("user") }} Draft a top-level foundation document from the brief below. Rules: @@ -44,6 +46,7 @@ function EvaluateFoundationDocument( ) -> EvalReport { client EvalModel prompt #" + {{ _.role("user") }} Evaluate the candidate foundation document against the evaluation packet. 
Packet: diff --git a/skills/spec-creator/baml_src/clients.baml b/skills/spec-creator/baml_src/clients.baml index cd662f1..f37af5b 100644 --- a/skills/spec-creator/baml_src/clients.baml +++ b/skills/spec-creator/baml_src/clients.baml @@ -1,8 +1,9 @@ client EvalModel { provider "openai-responses" options { - api_key env.OPENAI_API_KEY - model "gpt-5-mini" + api_key env.AI_GATEWAY_API_KEY + base_url "https://ai-gateway.vercel.sh/v1" + model "openai/gpt-5-mini" reasoning { effort "medium" } diff --git a/skills/spec-creator/baml_src/spec_compiler/compiler_functions.baml b/skills/spec-creator/baml_src/spec_compiler/compiler_functions.baml index 5fbe8b0..83ba3a7 100644 --- a/skills/spec-creator/baml_src/spec_compiler/compiler_functions.baml +++ b/skills/spec-creator/baml_src/spec_compiler/compiler_functions.baml @@ -1,6 +1,7 @@ function ExtractClaims(raw_notes: string) -> Claim[] { client EvalModel prompt #" + {{ _.role("user") }} Extract atomic claims from the raw notes below. Rules: @@ -23,6 +24,7 @@ function ExtractClaims(raw_notes: string) -> Claim[] { function CompileSpecBrief(raw_notes: string, existing_spec: string?) -> SpecBrief { client EvalModel prompt #" + {{ _.role("user") }} Compile a spec brief from the inputs below. Rules: @@ -45,6 +47,7 @@ function CompileSpecBrief(raw_notes: string, existing_spec: string?) -> SpecBrie function CritiqueSpecBrief(brief: SpecBrief) -> SpecCritique { client EvalModel prompt #" + {{ _.role("user") }} Critique the spec brief below. Rules: @@ -63,6 +66,7 @@ function CritiqueSpecBrief(brief: SpecBrief) -> SpecCritique { function RenderSpecCreatorPrompt(brief: SpecBrief) -> string { client EvalModel prompt #" + {{ _.role("user") }} Render a prompt for the `spec-creator` skill using the brief below. 
Rules: diff --git a/skills/spec-creator/baml_src/spec_compiler/eval_runner.baml b/skills/spec-creator/baml_src/spec_compiler/eval_runner.baml index 068a32a..b94e59a 100644 --- a/skills/spec-creator/baml_src/spec_compiler/eval_runner.baml +++ b/skills/spec-creator/baml_src/spec_compiler/eval_runner.baml @@ -1,6 +1,7 @@ function CompileSpecBriefFromPacket(packet: SpecEvalPacket) -> SpecBrief { client EvalModel prompt #" + {{ _.role("user") }} Compile a spec brief from the evaluation packet below. Task prompt: @@ -28,6 +29,7 @@ function CompileSpecBriefFromPacket(packet: SpecEvalPacket) -> SpecBrief { function RenderSpecDocumentDraft(brief: SpecBrief) -> string { client EvalModel prompt #" + {{ _.role("user") }} Draft a `SPEC.md` from the brief below. Rules: @@ -48,6 +50,7 @@ function EvaluateSpecDocument( ) -> EvalReport { client EvalModel prompt #" + {{ _.role("user") }} Evaluate the candidate `SPEC.md` against the evaluation packet. Packet: From b19bf60f514b6a25e90b003fdd90ab1f1198eaa0 Mon Sep 17 00:00:00 2001 From: Jeevan Pillay <169354619+jeevanpillay@users.noreply.github.com> Date: Mon, 20 Apr 2026 15:46:41 +1000 Subject: [PATCH 03/30] Tighten foundation creator prompts and eval coverage --- skills/foundation-creator/SKILL.md | 55 +++++++- .../compiler_functions.baml | 18 +++ .../foundation_compiler/eval_runner.baml | 16 ++- .../foundation_compiler/foundation_types.baml | 1 + skills/foundation-creator/evals/evals.json | 14 +++ .../fixtures/cloudflare/expected_criteria.md | 21 ++++ .../evals/fixtures/cloudflare/raw_notes.md | 119 ++++++++++++++++++ .../foundation-creator/references/language.md | 63 ++++++++++ .../foundation-creator/references/template.md | 52 ++++++++ 9 files changed, 357 insertions(+), 2 deletions(-) create mode 100644 skills/foundation-creator/evals/fixtures/cloudflare/expected_criteria.md create mode 100644 skills/foundation-creator/evals/fixtures/cloudflare/raw_notes.md create mode 100644 skills/foundation-creator/references/language.md 
create mode 100644 skills/foundation-creator/references/template.md diff --git a/skills/foundation-creator/SKILL.md b/skills/foundation-creator/SKILL.md index e6dbb90..811bb59 100644 --- a/skills/foundation-creator/SKILL.md +++ b/skills/foundation-creator/SKILL.md @@ -18,6 +18,21 @@ product or company primitive. The resulting document is strategic and behavioral, not implementation-level. It should preserve uncertainty where decisions are not yet mature. +This skill is a source-bound documentarian, not a strategy consultant. Its job +is to synthesize what the available material already supports about the +primitive: what it is, what it is not, what durable surfaces exist, and what +remains unresolved. + +## Reference files + +Load on demand, not upfront. + +- `references/template.md` — the allowed section shape for a foundation + document. Read it when drafting a new foundation doc and when checking + whether the output stayed within scope. +- `references/language.md` — wording and restraint rules. Read it before + writing prose and again during the validation pass. + ## Core behavior - Start from thesis and boundaries, not components. @@ -27,6 +42,43 @@ decisions are not yet mature. - Escalate to `spec-creator` only when a subsystem is concrete enough to deserve a `SPEC.md`. +## Allowed content + +- What the primitive is. +- What the primitive is not. +- Durable thesis-level framing. +- Actor model and durable surfaces. +- Strategic bets only when they are clearly supported by the source material. + Frame them as observed directional bets, not recommendations. +- Open questions and unresolved tensions. + +## Forbidden drift + +- Do not invent monetization, revenue models, KPIs, or internal + organizational structure unless the source explicitly states them. +- Do not produce roadmap items, implementation plans, operating cadences, + pilot programs, or execution checklists unless the user explicitly asks. 
+- Do not turn open questions into decision agendas or recommended next steps. +- Do not fill gaps with plausible-sounding business language. Prefer omission + or an explicit unresolved question. +- Do not collapse ambiguous positioning into a single confident frame when the + source material remains mixed. +- Do not assert market leadership, superiority, or competitive differentiation + unless the source explicitly makes that claim and it matters to the + foundation. + +## Validation focus + +Before finalizing, check for these failure modes: + +- unsupported inference +- consulting-style sections (`Success Signals`, `Decision Agenda`, + `Next Steps`, `Operating Guidance`, similar) +- implementation leakage +- business-model speculation +- metrics or operational milestones not present in the source +- missing explicit open questions where the notes remain unsettled + ## Current compiler surface This skill includes typed BAML contracts under `baml_src/foundation_compiler/` @@ -34,7 +86,8 @@ for: - extracting atomic claims from messy notes - compiling a stable foundation kernel -- critiquing ambiguity, contradiction, and implementation leakage +- critiquing ambiguity, contradiction, unsupported inference, and + implementation leakage - compiling a brief suitable for downstream document rendering The BAML layer is schema-first. Prompt wording and document templates can diff --git a/skills/foundation-creator/baml_src/foundation_compiler/compiler_functions.baml b/skills/foundation-creator/baml_src/foundation_compiler/compiler_functions.baml index e5c720e..b4a90d1 100644 --- a/skills/foundation-creator/baml_src/foundation_compiler/compiler_functions.baml +++ b/skills/foundation-creator/baml_src/foundation_compiler/compiler_functions.baml @@ -13,6 +13,8 @@ function ExtractClaims(raw_notes: string) -> Claim[] { - Use `Constraint` for hard operating requirements. - Use `OpenQuestion` when the note is unresolved. - Keep `sources` short and human-readable. 
+ - Do not infer business model, monetization, org structure, metrics, or roadmap items unless explicit in the notes. + - Prefer omission over speculation. Raw notes: {{ raw_notes }} @@ -32,6 +34,8 @@ function BuildFoundationKernel(claims: Claim[]) -> FoundationKernel { - Capture durable thesis-level information. - Do not invent implementation detail. - Use empty lists when a category is not yet supported by the claims. + - Do not infer monetization, pricing, KPIs, org structure, GTM strategy, partnership priorities, or execution plans unless explicitly supported. + - Prefer a shorter kernel over a speculative one. Claims: {{ claims|format(type="yaml") }} @@ -50,7 +54,9 @@ function CritiqueFoundationKernel(kernel: FoundationKernel) -> FoundationCritiqu - Flag contradictions across thesis, boundaries, and bets. - Flag vague claims that should be sharpened before document rendering. - Flag implementation leakage. + - Flag unsupported inferences that are plausible-sounding but not clearly supported by source material. - Flag missing boundaries that create strategic confusion. + - Flag consulting-style drift such as metrics, decision agendas, operating plans, or monetization claims. - Ask only high-leverage clarification questions. Kernel: @@ -74,6 +80,12 @@ function CompileFoundationBrief( - Preserve unresolved questions explicitly. - Exclude critique items that were already resolved by the kernel. - Keep the brief compact and high-signal. + - Remove unsupported inferences rather than softening them. + - Prefer omission over consultant-style expansion. + - Do not add business-model, KPI, org, roadmap, or operating-plan language unless source-backed. + - Treat `strategic_bets` as observed directional bets, not recommendations or settled future state. + - If a surface is visible but still in transition, preserve that qualification in the surrounding summary or questions. 
+ - Avoid market-leadership or competitive-superiority language unless directly supported by source material. Kernel: {{ kernel|format(type="yaml") }} @@ -96,6 +108,12 @@ function RenderFoundationCreatorPrompt(brief: FoundationBrief) -> string { - Preserve strategic ambiguity where the brief leaves open questions. - Avoid implementation detail. - Emphasize thesis, boundaries, actor model, surfaces, and strategic bets. + - State that the writer is a source-bound synthesizer, not a strategy consultant. + - Require exactly these sections unless the user asks otherwise: `What This Is`, `Core Thesis`, `Boundaries`, `Actor Model`, `Durable Surfaces`, `Strategic Bets`, `Open Questions`. + - Forbid extra sections like `Success Signals`, `Metrics`, `Decision Agenda`, `Next Steps`, `Operating Guidance`, or `Roadmap`. + - Require `Strategic Bets` to be phrased as observed directional bets rather than prescriptions. + - Require recently emerging or transitional surfaces to be qualified explicitly rather than flattened as fully settled. + - Forbid market-leadership or superiority claims unless they are explicit in the brief. - Make the prompt directly usable by an agent. Brief: diff --git a/skills/foundation-creator/baml_src/foundation_compiler/eval_runner.baml b/skills/foundation-creator/baml_src/foundation_compiler/eval_runner.baml index 3193812..7760d3d 100644 --- a/skills/foundation-creator/baml_src/foundation_compiler/eval_runner.baml +++ b/skills/foundation-creator/baml_src/foundation_compiler/eval_runner.baml @@ -18,6 +18,8 @@ function CompileFoundationBriefFromPacket(packet: FoundationEvalPacket) -> Found - Preserve ambiguity where the notes do not settle the framing. - Treat expected criteria as evaluation guidance, not as license to invent. - Avoid implementation detail. + - Do not infer monetization, metrics, org structure, GTM strategy, or operating plans unless the packet explicitly supports them. + - Prefer omission over speculation. 
{{ ctx.output_format }} "# @@ -30,10 +32,17 @@ function RenderFoundationDocumentDraft(brief: FoundationBrief) -> string { Draft a top-level foundation document from the brief below. Rules: - - Write a durable strategic document, not a `SPEC.md`. + - Write a durable foundation document, not a `SPEC.md`, strategy memo, roadmap, or operating plan. - Start from thesis and boundaries, not architecture. - Preserve unresolved questions explicitly. - Avoid implementation detail. + - Stay source-bound: do not invent monetization, KPIs, org structure, partnerships, operating guidance, or next-step plans. + - Prefer omission over plausible-sounding speculation. + - Use exactly these sections and no others: `What This Is`, `Core Thesis`, `Boundaries`, `Actor Model`, `Durable Surfaces`, `Strategic Bets`, `Open Questions`. + - If `Strategic Bets` is weakly supported, keep it short rather than expanding it. + - Phrase `Strategic Bets` as observed directional bets, not recommendations or settled future state. + - When a surface is visible but still evolving in the packet, qualify it explicitly as emerging, evolving, or unsettled. + - Do not use market-leadership or competitive-superiority language unless the packet explicitly supports it. Brief: {{ brief|format(type="yaml") }} @@ -59,6 +68,11 @@ function EvaluateFoundationDocument( - Grade against the expected criteria explicitly. - Reward preservation of uncertainty when the source packet is genuinely mixed. - Penalize invented certainty, invented capabilities, or implementation leakage. + - Penalize unsupported business-model, monetization, KPI, org, partnership, or operating-plan language. + - Penalize consulting-style sections such as `Success Signals`, `Metrics`, `Decision Agenda`, `Next Steps`, `Operating Guidance`, or similar drift. + - Penalize `Strategic Bets` phrased as recommendations or settled conclusions when the packet only supports directional evidence. 
+ - Penalize flattening transitional surfaces as fully settled if the packet presents them as evolving. + - Penalize market-leadership or competitive-superiority claims not explicitly supported by the packet. - Use `Pass`, `Partial`, or `Fail` for each criterion. {{ ctx.output_format }} diff --git a/skills/foundation-creator/baml_src/foundation_compiler/foundation_types.baml b/skills/foundation-creator/baml_src/foundation_compiler/foundation_types.baml index 16ffa0f..002c42d 100644 --- a/skills/foundation-creator/baml_src/foundation_compiler/foundation_types.baml +++ b/skills/foundation-creator/baml_src/foundation_compiler/foundation_types.baml @@ -15,6 +15,7 @@ class FoundationCritique { contradictions string[] vague_claims string[] implementation_leaks string[] + unsupported_inferences string[] missing_boundaries string[] leverage_questions string[] } diff --git a/skills/foundation-creator/evals/evals.json b/skills/foundation-creator/evals/evals.json index d0eae13..b44aefd 100644 --- a/skills/foundation-creator/evals/evals.json +++ b/skills/foundation-creator/evals/evals.json @@ -21,6 +21,20 @@ "files": [ "fixtures/vercel/raw_notes.md" ] + }, + { + "id": 1, + "eval_name": "create-foundation-from-cloudflare-source-packet", + "prompt": "Use the source packet in `fixtures/cloudflare/raw_notes.md` to draft a top-level foundation document for Cloudflare. Preserve the tension between the connectivity cloud, developer platform, and AI/agents platform framings. 
Do not produce a `SPEC.md`, implementation plan, or architecture diagram.", + "expected_output": "A top-level foundation document that frames Cloudflare as a unified platform spanning security/connectivity, developer infrastructure, and AI surfaces; preserves the tension between `connectivity cloud` and developer/AI platform identities; identifies durable surfaces like network/security control plane, developer runtime, AI infrastructure, and platform-building primitives; clarifies boundaries against generic hyperscaler or single-product framings; and preserves open questions rather than inventing certainty.", + "expected_file": "fixtures/cloudflare/expected_criteria.md", + "packet_files": { + "raw_notes": "fixtures/cloudflare/raw_notes.md", + "expected_criteria": "fixtures/cloudflare/expected_criteria.md" + }, + "files": [ + "fixtures/cloudflare/raw_notes.md" + ] } ] } diff --git a/skills/foundation-creator/evals/fixtures/cloudflare/expected_criteria.md b/skills/foundation-creator/evals/fixtures/cloudflare/expected_criteria.md new file mode 100644 index 0000000..e8e2601 --- /dev/null +++ b/skills/foundation-creator/evals/fixtures/cloudflare/expected_criteria.md @@ -0,0 +1,21 @@ +# Expected Criteria + +- The output should identify Cloudflare as more than CDN/perimeter security: + it should recognize a unified platform spanning connectivity, security, + development, and AI-related surfaces. +- The output should preserve the tension between the enterprise-scale + `connectivity cloud` framing and the `Developer Platform` / AI agent + platform framing instead of collapsing Cloudflare into only one of those. +- The output should identify multiple durable surfaces such as network/security + control plane, developer runtime/platform, AI infrastructure or agents, and + platform-building or multitenant support. 
+- The output should include an actor model that covers at least enterprise or + security/network teams and developers; it should ideally also recognize + platform builders or AI agents/tools as meaningful actors. +- The output should set clear boundaries: + not a generic hyperscaler/IaaS, not just a security product suite, and not + just an AI agent runtime. +- The output should preserve at least one open question or strategic bet about + how the company's center of gravity is evolving. +- The output should not invent internal org structure, revenue model, + financial claims, or unsupported product lines beyond the packet. diff --git a/skills/foundation-creator/evals/fixtures/cloudflare/raw_notes.md b/skills/foundation-creator/evals/fixtures/cloudflare/raw_notes.md new file mode 100644 index 0000000..1fd059e --- /dev/null +++ b/skills/foundation-creator/evals/fixtures/cloudflare/raw_notes.md @@ -0,0 +1,119 @@ +# Cloudflare Source Packet + +Assembled on April 20, 2026 from official Cloudflare sources. + +This packet is intentionally paraphrased. It is meant to test whether +`foundation-creator` can handle a company that spans enterprise security and +connectivity, developer infrastructure, and newer AI/agent surfaces without +flattening that breadth into a single overconfident label. + +## Source 1 + +- URL: [https://www.cloudflare.com/connectivity-cloud/](https://www.cloudflare.com/connectivity-cloud/) +- Accessed: April 20, 2026 +- Cloudflare describes itself through the `connectivity cloud`. +- The page says the platform is unified across security, connectivity, and + development. +- The page emphasizes one network, one control plane, global scale, + resilience, composable programmable services, and a simplified management + interface. +- The primary framing appears enterprise-oriented: reduce complexity, improve + security, increase performance, and accelerate digital projects. 
+ +## Source 2 + +- URL: [https://developers.cloudflare.com/](https://developers.cloudflare.com/) +- Accessed: April 20, 2026 +- The docs homepage says the `Cloudflare Developer Platform` provides a + serverless execution environment for building new applications or augmenting + existing ones without maintaining infrastructure. +- The same docs surface groups products into `Developer Products`, + `AI Products`, and `Cloudflare One` products. +- The page frames Cloudflare not only as a security/network vendor but also as + a place to build software directly. + +## Source 3 + +- URL: [https://developers.cloudflare.com/workers/](https://developers.cloudflare.com/workers/) +- Last updated: April 15, 2026 +- Workers is described as a serverless platform for building, deploying, and + scaling apps across Cloudflare's global network with no infrastructure to + manage. +- The Workers docs position the platform as full-stack, globally distributed, + and language-flexible. +- The product surface includes front-end applications, back-end applications, + serverless AI inference, background jobs, observability, and integrations + with storage and compute products like Durable Objects, D1, KV, Queues, + Workers AI, Workflows, Vectorize, and R2. + +## Source 4 + +- URL: [https://developers.cloudflare.com/ai/](https://developers.cloudflare.com/ai/) +- Last updated: April 16, 2026 +- Cloudflare AI is described as a unified platform for running AI models, + whether hosted on Cloudflare infrastructure via Workers AI or proxied + through AI Gateway to external providers. +- Related AI products include Workers AI, AI Gateway, Vectorize, Agents, + AI Search, AI Crawl Control, Browser Rendering, and Cloudflare Agent. +- This suggests Cloudflare now treats AI as a first-class product surface + within the platform. 
+ +## Source 5 + +- URL: [https://developers.cloudflare.com/agents/](https://developers.cloudflare.com/agents/) +- Last updated: April 14, 2026 +- The Agents docs say real agents need memory, scheduling, tool use, + coordination, and persistent state. +- The Agents SDK is built around Durable Objects and positions Cloudflare as a + place to run long-lived, stateful, globally distributed agents. +- The docs say agents can use and serve tools through MCP, schedule tasks, + coordinate workflows, browse the web, and connect to AI models including + Workers AI and external providers. +- This is a stronger framing than "AI inference only"; it pushes Cloudflare + toward an agent runtime platform. + +## Source 6 + +- URL: [https://developers.cloudflare.com/cloudflare-for-platforms/](https://developers.cloudflare.com/cloudflare-for-platforms/) +- Last updated: December 15, 2025 +- `Cloudflare for Platforms` says customers can offer Cloudflare's own + products and functionality to their own customers inside their own product. +- The page emphasizes custom domains/subdomains, isolation and multitenancy, + programmable routing/ingress/egress, storage and databases, and ability to + deploy millions of applications and domains. +- The docs explicitly mention deploying an AI vibe coding platform as a starter + use case. +- This suggests Cloudflare is not only a platform for direct customers; it is + also a substrate for other platforms. + +## Source 7 + +- URL: [https://www.cloudflare.com/press/press-releases/2025/cloudflare-accelerates-ai-agent-development-remote-mcp/](https://www.cloudflare.com/press/press-releases/2025/cloudflare-accelerates-ai-agent-development-remote-mcp/) +- Published: April 7, 2025 +- Cloudflare announced new offerings for AI agent development, including a + remote MCP server, durable Workflows, and Durable Objects free tier. +- The press release says Cloudflare's developer platform and global network are + the best place to build and deploy AI agents. 
+- The launch framing expands Cloudflare beyond web performance/security into an + opinionated platform for agent development. + +## Source 8 + +- URL: [https://developers.cloudflare.com/agents/model-context-protocol/mcp-servers-for-cloudflare/](https://developers.cloudflare.com/agents/model-context-protocol/mcp-servers-for-cloudflare/) +- Accessed: April 20, 2026 +- Cloudflare documents its own MCP servers, including product-specific servers + and a docs server. +- This suggests Cloudflare is not only supporting MCP as a standard for others + but actively using it across its own product/API surface. + +## Tensions and questions the evaluator should preserve + +- Cloudflare uses the enterprise-scale `connectivity cloud` framing while also + maintaining a distinct `Developer Platform` identity. +- The company spans security, networking, performance, developer runtime, + AI infrastructure, agents, and platform-building primitives. +- A good foundation document should avoid flattening Cloudflare into just a CDN, + just Zero Trust/security, or just a developer runtime. +- AI and agents appear increasingly central, but the packet does not fully + settle whether they are an extension of the connectivity cloud, a new primary + platform identity, or one major layer within a broader company primitive. diff --git a/skills/foundation-creator/references/language.md b/skills/foundation-creator/references/language.md new file mode 100644 index 0000000..d51ba80 --- /dev/null +++ b/skills/foundation-creator/references/language.md @@ -0,0 +1,63 @@ +# Foundation Language Guide + +How the foundation document should be worded. + +## 1. Role + +- The document is a source-bound synthesis. +- It names durable framing, boundaries, actors, surfaces, bets, and + unresolved questions. +- It does not act like a consultant memo, board brief, or operating plan. + +## 2. Voice and Tense + +- Present tense, active voice. +- Third person only. Never "we" or "you". 
+- Declarative statements over persuasive rhetoric. +- Prefer short, dense paragraphs and compact bullets. + +## 3. Restraint Rules + +- Prefer omission over invention. +- If a point is plausible but not supported, omit it or convert it into an + open question. +- If the source material is mixed, preserve the tension explicitly. +- Do not infer monetization, pricing, revenue, GTM segmentation, org design, + partnership priorities, KPIs, or timelines unless directly supported. +- Do not write recommendations, action items, or decision deadlines unless the + user explicitly asks for them. +- Do not assert market leadership, competitive superiority, or winner/loser + framing unless directly supported by the source packet. + +## 4. Allowed Section Behavior + +- `What This Is` explains the primitive at a durable level. +- `Core Thesis` contains only source-backed, thesis-level claims. +- `Boundaries` should be explicit and contrastive. +- `Actor Model` names meaningful actors without inventing internal roles. +- `Durable Surfaces` names persistent product or platform surfaces, not + implementation components. If a surface is source-visible but still in + transition, qualify it explicitly as emerging or evolving. +- `Strategic Bets` should be minimal and clearly grounded in repeated signals. + Phrase them as observed directional bets (`public materials suggest a bet + on...`, `the company appears to be betting on...`) rather than settled + declarations or recommendations. +- `Open Questions` should remain open rather than being quietly resolved in + prose elsewhere. + +## 5. Disallowed Drift + +- No `Success Signals`, KPI, or metrics section. +- No monetization strategy or revenue language. +- No `Decision Agenda`, `Next Steps`, `Operating Guidance`, or milestone plan. +- No implementation components, architecture diagrams, or system design. +- No internal org structure unless explicit in the source. 
+- No market-leadership or competitive-positioning claims unless explicit in the + source. + +## 6. Tone + +- Dense, calm, and specific. +- No hype language. +- No filler ("it is worth noting", "in order to", "clearly", "obviously"). +- No false certainty when the source packet is mixed. diff --git a/skills/foundation-creator/references/template.md b/skills/foundation-creator/references/template.md new file mode 100644 index 0000000..7fc4adf --- /dev/null +++ b/skills/foundation-creator/references/template.md @@ -0,0 +1,52 @@ +# {Primitive Name} Foundation + +Use only the sections below unless the user explicitly asks for more. + +## What This Is + +{One short paragraph describing the primitive or company/product foundation.} + +## Core Thesis + +- {Durable belief supported by the source material.} +- {Another source-backed thesis.} + +## Boundaries + +- {What this is not.} +- {Explicit limit of scope or category.} + +## Actor Model + +- {Primary actor and relationship to the primitive.} +- {Secondary actor when source-backed.} + +## Durable Surfaces + +- {Surface area that persists across implementations or product shifts.} +- {Another durable surface.} + +## Strategic Bets + +- {Only if clearly supported by the material.} +- {Use fewer bullets rather than speculative ones.} + +## Open Questions + +- {Unresolved tension or ambiguity the source does not settle.} +- {Another open question.} + +## Disallowed Sections + +Do not add sections like: + +- `Success Signals` +- `Metrics` +- `Decision Agenda` +- `Next Steps` +- `Operating Guidance` +- `Implementation Plan` +- `Roadmap` +- `Partnership Strategy` + +Those belong to downstream planning artifacts, not the foundation document. 
From 2679aeaee753b4b64a975b2f0f696e0aff21aa85 Mon Sep 17 00:00:00 2001 From: Jeevan Pillay <169354619+jeevanpillay@users.noreply.github.com> Date: Mon, 20 Apr 2026 15:56:28 +1000 Subject: [PATCH 04/30] Add deterministic checks and benchmarks to eval runner --- README.md | 13 + scripts/run-baml-eval.mjs | 542 ++++++++++++++++++++- skills/foundation-creator/evals/evals.json | 6 + skills/spec-creator/evals/evals.json | 6 + 4 files changed, 541 insertions(+), 26 deletions(-) diff --git a/README.md b/README.md index 4a628c3..08353d9 100644 --- a/README.md +++ b/README.md @@ -29,11 +29,24 @@ This repo now includes BAML-backed fixture evals for `foundation-creator` and bun install bun run eval:foundation -- create-foundation-from-vercel-source-packet bun run eval:spec -- create-from-vercel-mcp-source-packet +bun run with-env -- bun run ./scripts/run-baml-eval.mjs foundation-creator create-foundation-from-cloudflare-source-packet --trials 3 ``` Each run writes packet, brief, candidate document, and evaluation report artifacts under `skills//evals/runs/`. +The runner now also writes: + +- `deterministic_checks.json` — reference-driven checks derived from the skill's + `template.md` and `language.md` +- `timing.json` — per-stage local timing +- `summary.json` — per-trial LLM status + combined status +- `benchmark.json` — aggregated status counts and timing summaries across all + trials + +When `--trials N` is used, the run directory contains `trial-1/`, `trial-2/`, +... plus a top-level `benchmark.json`. + `bun run eval:*` loads `.env` automatically through `dotenv-cli`, so `AI_GATEWAY_API_KEY` can live in the repo-local `.env` without manual `source` steps. 
diff --git a/scripts/run-baml-eval.mjs b/scripts/run-baml-eval.mjs index 039d4a6..f64f4ab 100644 --- a/scripts/run-baml-eval.mjs +++ b/scripts/run-baml-eval.mjs @@ -12,6 +12,57 @@ function fail(message) { process.exit(1); } +const STATUS_RANK = { + Pass: 0, + Partial: 1, + Fail: 2, +}; + +function normalizeLine(line) { + return line.trim().replace(/\s+/g, " "); +} + +function normalizeHeading(line) { + return normalizeLine(line) + .replace(/^#+\s*/, "") + .replace(/\s+/g, " ") + .trim(); +} + +function summarizeNumeric(values) { + if (values.length === 0) { + return { + mean: 0, + min: 0, + max: 0, + }; + } + + const total = values.reduce((sum, value) => sum + value, 0); + return { + mean: Math.round(total / values.length), + min: Math.min(...values), + max: Math.max(...values), + }; +} + +function worstStatus(statuses) { + return statuses.reduce((worst, current) => { + if (!worst) { + return current; + } + return STATUS_RANK[current] > STATUS_RANK[worst] ? current : worst; + }, null); +} + +function hasPronounDrift(document) { + return /\b(we|our|ours|us|you|your|yours)\b/i.test(document); +} + +function hasUppercaseObligationKeyword(document) { + return /\b(MUST|SHOULD|MAY)\b/.test(document); +} + function runCommand(command, args, cwd) { return new Promise((resolve, reject) => { const child = spawn(command, args, { @@ -40,6 +91,36 @@ async function loadText(filePath) { return readFile(filePath, "utf8"); } +function parseArgs(argv) { + const positionals = []; + let trials = 1; + + for (let index = 0; index < argv.length; index += 1) { + const arg = argv[index]; + + if (arg === "--trials") { + const next = argv[index + 1]; + if (!next) { + fail("Missing value after --trials."); + } + trials = Number.parseInt(next, 10); + if (!Number.isInteger(trials) || trials < 1) { + fail("--trials must be a positive integer."); + } + index += 1; + continue; + } + + positionals.push(arg); + } + + return { + skillName: positionals[0], + selector: positionals[1], + trials, + }; +} 
+ function getEvalBySelector(evals, selector) { if (!selector) { if (evals.length === 1) { @@ -66,7 +147,7 @@ function getEvalBySelector(evals, selector) { async function generateClient(skillRoot) { const bamlSrc = path.join(skillRoot, "baml_src"); - await runCommand("npx", ["baml-cli", "generate", "--from", bamlSrc], repoRoot); + await runCommand("bunx", ["baml-cli", "generate", "--from", bamlSrc], repoRoot); } async function importGeneratedClient(skillRoot) { @@ -129,7 +210,7 @@ async function ensureFreshClient(skillRoot) { "utf8", ); try { - await runCommand("npx", ["tsc", "--project", tsconfigPath], repoRoot); + await runCommand("bunx", ["tsc", "--project", tsconfigPath], repoRoot); } finally { await rm(tsconfigPath, { force: true }); } @@ -144,33 +225,282 @@ async function writeRunArtifacts(runDir, artifacts) { } } -async function main() { - const skillName = process.argv[2]; - const selector = process.argv[3]; +function createCheck(id, passed, details) { + return { id, passed, details }; +} - if (!skillName) { - fail("Usage: node ./scripts/run-baml-eval.mjs [eval-id-or-name]"); +function extractFoundationTemplateSections(templateText) { + const sections = []; + const lines = templateText.split(/\r?\n/); + + for (const rawLine of lines) { + const line = normalizeLine(rawLine); + if (line === "## Disallowed Sections") { + break; + } + if (line.startsWith("## ")) { + sections.push(line.slice(3).trim()); + } } - const skillRoot = path.join(repoRoot, "skills", skillName); - const evalsDir = path.join(skillRoot, "evals"); - const manifestPath = path.join(evalsDir, "evals.json"); - const manifest = await loadJson(manifestPath); - const evalEntry = getEvalBySelector(manifest.evals, selector); - const runner = manifest.runner_contract; + return sections; +} - if (!runner || runner.type !== "baml_pipeline") { - fail(`Skill '${skillName}' does not declare a supported runner_contract.`); +function extractFoundationDisallowedHeadings(templateText) { + const disallowed = 
new Set(); + const lines = templateText.split(/\r?\n/); + let inDisallowedSection = false; + + for (const rawLine of lines) { + const line = normalizeLine(rawLine); + if (line === "## Disallowed Sections") { + inDisallowedSection = true; + continue; + } + if (!inDisallowedSection) { + continue; + } + const match = line.match(/^- `(.+)`$/); + if (match) { + disallowed.add(match[1]); + } } - if (!process.env.AI_GATEWAY_API_KEY) { - fail("AI_GATEWAY_API_KEY is required to execute BAML evals."); + return disallowed; +} + +function validateFoundationDocument(candidateDocument, templateText) { + const requiredSections = extractFoundationTemplateSections(templateText); + const disallowedHeadings = extractFoundationDisallowedHeadings(templateText); + const lines = candidateDocument.split(/\r?\n/); + const lineMap = new Map(); + + for (const [index, rawLine] of lines.entries()) { + const line = normalizeHeading(rawLine); + if (!line) { + continue; + } + if (!lineMap.has(line)) { + lineMap.set(line, []); + } + lineMap.get(line).push(index); } - await ensureFreshClient(skillRoot); - const generated = await importGeneratedClient(skillRoot); - const { b } = generated; + const missingSections = []; + const duplicateSections = []; + const positions = []; + + for (const section of requiredSections) { + const matches = lineMap.get(section) ?? []; + if (matches.length === 0) { + missingSections.push(section); + continue; + } + if (matches.length > 1) { + duplicateSections.push(section); + } + positions.push({ + section, + index: matches[0], + }); + } + + const orderIsCorrect = positions.every((entry, index) => { + if (index === 0) { + return true; + } + return entry.index > positions[index - 1].index; + }); + + const emptySections = []; + for (let index = 0; index < positions.length; index += 1) { + const current = positions[index]; + const next = positions[index + 1]; + const start = current.index + 1; + const end = next ? 
next.index : lines.length; + const sectionBody = lines.slice(start, end).join("\n").trim(); + if (!sectionBody) { + emptySections.push(current.section); + } + } + + const presentDisallowedSections = [...disallowedHeadings].filter((section) => + (lineMap.get(section) ?? []).length > 0, + ); + + return [ + createCheck( + "required_sections_present_once", + missingSections.length === 0 && duplicateSections.length === 0, + missingSections.length === 0 && duplicateSections.length === 0 + ? `All required sections from template are present exactly once: ${requiredSections.join(", ")}.` + : `Missing: ${missingSections.join(", ") || "none"}. Duplicate: ${duplicateSections.join(", ") || "none"}.`, + ), + createCheck( + "required_sections_in_template_order", + missingSections.length === 0 && orderIsCorrect, + missingSections.length > 0 + ? "Section order check skipped because one or more required sections are missing." + : orderIsCorrect + ? "Required sections follow the template order." + : "Required sections are present but not in template order.", + ), + createCheck( + "required_sections_nonempty", + emptySections.length === 0, + emptySections.length === 0 + ? "Every required section has non-empty content." + : `Empty sections: ${emptySections.join(", ")}.`, + ), + createCheck( + "no_disallowed_sections", + presentDisallowedSections.length === 0, + presentDisallowedSections.length === 0 + ? "No disallowed downstream-planning sections were detected." + : `Disallowed sections present: ${presentDisallowedSections.join(", ")}.`, + ), + createCheck( + "no_first_or_second_person", + !hasPronounDrift(candidateDocument), + !hasPronounDrift(candidateDocument) + ? "No obvious first-person or second-person pronouns detected." 
+ : "Detected first-person or second-person pronouns that violate the language guide.", + ), + ]; +} +function extractSpecMajorSections(templateText) { + const sections = []; + const lines = templateText.split(/\r?\n/); + + for (const rawLine of lines) { + const line = normalizeLine(rawLine); + const match = line.match(/^## \d+\.\s+(.+)$/); + if (match) { + sections.push(match[1]); + } + } + + return sections; +} + +function lineExists(lines, matcher) { + return lines.some((line) => matcher(normalizeHeading(line))); +} + +function validateSpecDocument(candidateDocument, templateText) { + const requiredSections = extractSpecMajorSections(templateText); + const lines = candidateDocument.split(/\r?\n/); + const missingSections = requiredSections.filter( + (section) => !lineExists(lines, (line) => line.toLowerCase() === section.toLowerCase()), + ); + + const hasPurpose = lineExists(lines, (line) => line === "Purpose"); + const hasProblemStatement = lineExists( + lines, + (line) => line.toLowerCase() === "problem statement", + ); + const hasNumberedComponents = /^\d+\.\s+`[^`]+`/m.test(candidateDocument); + const hasFieldFormatting = /- `[^`]+` \([^)]+\)/.test(candidateDocument); + + return [ + createCheck( + "core_sections_present", + missingSections.length === 0, + missingSections.length === 0 + ? `All major template sections are present: ${requiredSections.join(", ")}.` + : `Missing major sections: ${missingSections.join(", ")}.`, + ), + createCheck( + "purpose_heading_present", + hasPurpose, + hasPurpose + ? "Purpose heading is present." + : "Purpose heading is missing.", + ), + createCheck( + "problem_statement_present", + hasProblemStatement, + hasProblemStatement + ? "Problem Statement heading is present." + : "Problem Statement heading is missing.", + ), + createCheck( + "component_list_uses_numbering", + hasNumberedComponents, + hasNumberedComponents + ? "Detected numbered component entries in the spec." + : "Did not detect numbered component entries like `1. 
`Component Name``.", + ), + createCheck( + "domain_fields_use_template_shape", + hasFieldFormatting, + hasFieldFormatting + ? "Detected domain-field lines using the `` `field_name` (type) `` format." + : "Did not detect any domain-field lines using the template field format.", + ), + createCheck( + "no_first_or_second_person", + !hasPronounDrift(candidateDocument), + !hasPronounDrift(candidateDocument) + ? "No obvious first-person or second-person pronouns detected." + : "Detected first-person or second-person pronouns that violate the language guide.", + ), + createCheck( + "obligation_keywords_lowercase", + !hasUppercaseObligationKeyword(candidateDocument), + !hasUppercaseObligationKeyword(candidateDocument) + ? "No uppercase obligation keywords detected." + : "Detected uppercase MUST/SHOULD/MAY, which violates the language guide.", + ), + ]; +} + +async function runDeterministicChecks(skillRoot, validationContract, candidateDocument) { + if (!validationContract || validationContract.type !== "reference_document_checks") { + return { + enabled: false, + overall_pass: true, + checks: [], + }; + } + + const templatePath = path.join(skillRoot, validationContract.template_file); + const languagePath = path.join(skillRoot, validationContract.language_file); + const [templateText, languageText] = await Promise.all([ + loadText(templatePath), + loadText(languagePath), + ]); + + let checks; + switch (validationContract.validator) { + case "foundation-v1": + checks = validateFoundationDocument(candidateDocument, templateText, languageText); + break; + case "spec-v1": + checks = validateSpecDocument(candidateDocument, templateText, languageText); + break; + default: + fail(`Unknown validation contract '${validationContract.validator}'.`); + } + + return { + enabled: true, + validator: validationContract.validator, + overall_pass: checks.every((check) => check.passed), + checks, + }; +} + +async function runSingleTrial({ + evalEntry, + evalsDir, + generated, + runner, + 
skillRoot, + validationContract, +}) { + const { b } = generated; const packet = await buildPacket(evalEntry, evalsDir, runner.packet_type); const compileFnName = runner.compile_brief_function; const renderFnName = runner.render_document_function; @@ -181,24 +511,184 @@ async function main() { typeof b[renderFnName] !== "function" || typeof b[evaluateFnName] !== "function" ) { - fail(`Generated client is missing one or more runner functions for '${skillName}'.`); + fail(`Generated client is missing one or more runner functions for '${path.basename(skillRoot)}'.`); } + const timing = {}; + const startedAt = Date.now(); + + const compileStartedAt = Date.now(); const brief = await b[compileFnName](packet); + timing.compile_ms = Date.now() - compileStartedAt; + + const renderStartedAt = Date.now(); const candidateDocument = await b[renderFnName](brief); + timing.render_ms = Date.now() - renderStartedAt; + + const deterministicStartedAt = Date.now(); + const deterministic_checks = await runDeterministicChecks( + skillRoot, + validationContract, + candidateDocument, + ); + timing.deterministic_ms = Date.now() - deterministicStartedAt; + + const evaluateStartedAt = Date.now(); const report = await b[evaluateFnName](packet, candidateDocument); + timing.evaluate_ms = Date.now() - evaluateStartedAt; + timing.total_ms = Date.now() - startedAt; + + const combined_status = + deterministic_checks.enabled && !deterministic_checks.overall_pass + ? 
worstStatus([report.overall_status, "Fail"]) + : report.overall_status; + + return { + packet, + brief, + candidateDocument, + report, + deterministic_checks, + timing, + summary: { + llm_status: report.overall_status, + combined_status, + deterministic_pass: deterministic_checks.overall_pass, + }, + }; +} +function buildBenchmark(skillName, evalName, trials) { + const judgeStatuses = trials.map((trial) => trial.report.overall_status); + const combinedStatuses = trials.map((trial) => trial.summary.combined_status); + const deterministicPassCount = trials.filter( + (trial) => trial.deterministic_checks.overall_pass, + ).length; + + const checkStats = new Map(); + for (const trial of trials) { + for (const check of trial.deterministic_checks.checks) { + if (!checkStats.has(check.id)) { + checkStats.set(check.id, { + id: check.id, + passed: 0, + total: 0, + last_details: "", + }); + } + const stat = checkStats.get(check.id); + stat.total += 1; + if (check.passed) { + stat.passed += 1; + } + stat.last_details = check.details; + } + } + + return { + skill_name: skillName, + eval_name: evalName, + trial_count: trials.length, + judge_status_counts: { + Pass: judgeStatuses.filter((status) => status === "Pass").length, + Partial: judgeStatuses.filter((status) => status === "Partial").length, + Fail: judgeStatuses.filter((status) => status === "Fail").length, + }, + combined_status_counts: { + Pass: combinedStatuses.filter((status) => status === "Pass").length, + Partial: combinedStatuses.filter((status) => status === "Partial").length, + Fail: combinedStatuses.filter((status) => status === "Fail").length, + }, + benchmark_summary: { + llm_worst_status: worstStatus(judgeStatuses), + combined_worst_status: worstStatus(combinedStatuses), + deterministic_pass_rate: Number((deterministicPassCount / trials.length).toFixed(2)), + }, + timing_ms: { + compile: summarizeNumeric(trials.map((trial) => trial.timing.compile_ms)), + render: summarizeNumeric(trials.map((trial) => 
trial.timing.render_ms)), + deterministic: summarizeNumeric(trials.map((trial) => trial.timing.deterministic_ms)), + evaluate: summarizeNumeric(trials.map((trial) => trial.timing.evaluate_ms)), + total: summarizeNumeric(trials.map((trial) => trial.timing.total_ms)), + }, + deterministic_checks: [...checkStats.values()].map((stat) => ({ + id: stat.id, + pass_rate: Number((stat.passed / stat.total).toFixed(2)), + passed_trials: stat.passed, + total_trials: stat.total, + last_details: stat.last_details, + })), + }; +} + +async function main() { + const { skillName, selector, trials } = parseArgs(process.argv.slice(2)); + + if (!skillName) { + fail( + "Usage: bun run ./scripts/run-baml-eval.mjs [eval-id-or-name] [--trials N]", + ); + } + + const skillRoot = path.join(repoRoot, "skills", skillName); + const evalsDir = path.join(skillRoot, "evals"); + const manifestPath = path.join(evalsDir, "evals.json"); + const manifest = await loadJson(manifestPath); + const evalEntry = getEvalBySelector(manifest.evals, selector); + const runner = manifest.runner_contract; + const validationContract = manifest.validation_contract ?? 
null; + + if (!runner || runner.type !== "baml_pipeline") { + fail(`Skill '${skillName}' does not declare a supported runner_contract.`); + } + + if (!process.env.AI_GATEWAY_API_KEY) { + fail("AI_GATEWAY_API_KEY is required to execute BAML evals."); + } + + await ensureFreshClient(skillRoot); + const generated = await importGeneratedClient(skillRoot); const timestamp = new Date().toISOString().replace(/[:.]/g, "-"); const runDir = path.join(skillRoot, "evals", "runs", `${timestamp}-${evalEntry.eval_name}`); + const trialResults = []; + + for (let trialIndex = 0; trialIndex < trials; trialIndex += 1) { + const trialResult = await runSingleTrial({ + evalEntry, + evalsDir, + generated, + runner, + skillRoot, + validationContract, + }); + trialResults.push(trialResult); + + const trialArtifacts = { + "packet.json": trialResult.packet, + "brief.json": trialResult.brief, + "candidate.md": trialResult.candidateDocument, + "report.json": trialResult.report, + "deterministic_checks.json": trialResult.deterministic_checks, + "timing.json": trialResult.timing, + "summary.json": trialResult.summary, + }; + + if (trials === 1) { + await writeRunArtifacts(runDir, trialArtifacts); + } else { + await writeRunArtifacts(path.join(runDir, `trial-${trialIndex + 1}`), trialArtifacts); + } + } + + const benchmark = buildBenchmark(skillName, evalEntry.eval_name, trialResults); await writeRunArtifacts(runDir, { - "packet.json": packet, - "brief.json": brief, - "candidate.md": candidateDocument, - "report.json": report, + "benchmark.json": benchmark, }); console.log(`Run complete: ${runDir}`); - console.log(`Overall status: ${report.overall_status}`); + console.log(`Trials: ${trialResults.length}`); + console.log(`LLM worst status: ${benchmark.benchmark_summary.llm_worst_status}`); + console.log(`Combined worst status: ${benchmark.benchmark_summary.combined_worst_status}`); } main().catch((error) => { diff --git a/skills/foundation-creator/evals/evals.json 
b/skills/foundation-creator/evals/evals.json index b44aefd..1b5a1b8 100644 --- a/skills/foundation-creator/evals/evals.json +++ b/skills/foundation-creator/evals/evals.json @@ -7,6 +7,12 @@ "render_document_function": "RenderFoundationDocumentDraft", "evaluate_document_function": "EvaluateFoundationDocument" }, + "validation_contract": { + "type": "reference_document_checks", + "validator": "foundation-v1", + "template_file": "references/template.md", + "language_file": "references/language.md" + }, "evals": [ { "id": 0, diff --git a/skills/spec-creator/evals/evals.json b/skills/spec-creator/evals/evals.json index 8c9e5ad..cec004a 100644 --- a/skills/spec-creator/evals/evals.json +++ b/skills/spec-creator/evals/evals.json @@ -7,6 +7,12 @@ "render_document_function": "RenderSpecDocumentDraft", "evaluate_document_function": "EvaluateSpecDocument" }, + "validation_contract": { + "type": "reference_document_checks", + "validator": "spec-v1", + "template_file": "references/template.md", + "language_file": "references/language.md" + }, "evals": [ { "id": 0, From caea9721a1c9ee5c4f936c78a779839764986f7e Mon Sep 17 00:00:00 2001 From: Jeevan Pillay <169354619+jeevanpillay@users.noreply.github.com> Date: Tue, 21 Apr 2026 00:13:44 +1000 Subject: [PATCH 05/30] Tighten evals and expand foundation coverage --- README.md | 13 + evals/TAXONOMY.md | 82 ++++++ scripts/run-baml-eval.mjs | 277 ++++++++++++++++-- .../compiler_functions.baml | 15 +- .../foundation_compiler/eval_runner.baml | 23 +- skills/foundation-creator/evals/evals.json | 64 ++++ .../fixtures/harbor_care/expected_criteria.md | 24 ++ .../evals/fixtures/harbor_care/raw_notes.md | 69 +++++ .../expected_criteria.md | 28 ++ .../lightfast_founder_notes/raw_notes.md | 76 +++++ .../foundation-creator/references/language.md | 31 +- .../foundation-creator/references/template.md | 10 +- .../spec_compiler/compiler_functions.baml | 22 ++ .../baml_src/spec_compiler/eval_runner.baml | 37 ++- 
.../baml_src/spec_compiler/spec_types.baml | 2 + skills/spec-creator/evals/evals.json | 43 +++ 16 files changed, 767 insertions(+), 49 deletions(-) create mode 100644 evals/TAXONOMY.md create mode 100644 skills/foundation-creator/evals/fixtures/harbor_care/expected_criteria.md create mode 100644 skills/foundation-creator/evals/fixtures/harbor_care/raw_notes.md create mode 100644 skills/foundation-creator/evals/fixtures/lightfast_founder_notes/expected_criteria.md create mode 100644 skills/foundation-creator/evals/fixtures/lightfast_founder_notes/raw_notes.md diff --git a/README.md b/README.md index 08353d9..e38a7db 100644 --- a/README.md +++ b/README.md @@ -28,6 +28,7 @@ This repo now includes BAML-backed fixture evals for `foundation-creator` and ```bash bun install bun run eval:foundation -- create-foundation-from-vercel-source-packet +bun run eval:foundation -- create-foundation-from-lightfast-founder-notes bun run eval:spec -- create-from-vercel-mcp-source-packet bun run with-env -- bun run ./scripts/run-baml-eval.mjs foundation-creator create-foundation-from-cloudflare-source-packet --trials 3 ``` @@ -35,6 +36,13 @@ bun run with-env -- bun run ./scripts/run-baml-eval.mjs foundation-creator creat Each run writes packet, brief, candidate document, and evaluation report artifacts under `skills//evals/runs/`. 
+Current `foundation-creator` corpus includes: + +- `create-foundation-from-vercel-source-packet` +- `create-foundation-from-cloudflare-source-packet` +- `create-foundation-from-lightfast-founder-notes` +- `create-foundation-from-harbor-care-source-packet` + The runner now also writes: - `deterministic_checks.json` — reference-driven checks derived from the skill's @@ -44,6 +52,11 @@ The runner now also writes: - `benchmark.json` — aggregated status counts and timing summaries across all trials +Eval manifests also carry lightweight taxonomy metadata +(`scenario_type`, `input_shape`, `ambiguity_level`, `domain_profile`, +`primary_risks`) so benchmark runs can be grouped by failure mode. Shared +taxonomy guidance lives in [`evals/TAXONOMY.md`](evals/TAXONOMY.md). + When `--trials N` is used, the run directory contains `trial-1/`, `trial-2/`, ... plus a top-level `benchmark.json`. diff --git a/evals/TAXONOMY.md b/evals/TAXONOMY.md new file mode 100644 index 0000000..3f843e7 --- /dev/null +++ b/evals/TAXONOMY.md @@ -0,0 +1,82 @@ +# Eval Taxonomy + +This repo tracks eval coverage across a small shared taxonomy so new packets +expand the corpus intentionally instead of growing as one-off examples. + +## Manifest fields + +Each eval entry in `skills/*/evals/evals.json` should declare: + +- `scenario_type` +- `input_shape` +- `ambiguity_level` +- `domain_profile` +- `primary_risks` + +These fields are lightweight metadata. They do not change execution, but they +show up in `benchmark.json` so runs can be grouped by failure mode later. + +## Canonical scenario types + +- `clear_intent_prompt` + - Straightforward create-mode request with explicit scope and components. +- `unstructured_notes_prompt` + - Messy notes, but still mostly service-shaped and not deeply ambiguous. +- `source_packet_transition` + - Curated packet with real-world material that is internally mixed, evolving, + or timeline-sensitive. 
+- `update_existing_doc` + - Existing document is the dominant constraint; success depends on precise + in-place edits without broad drift. +- `founder_notes_ambiguity` + - Highly ambiguous notes where positioning, boundaries, and unresolved + questions matter more than completeness. +- `cross_domain_generalization` + - Non-default domain chosen to test whether the skill overfits to developer + infrastructure examples. + +## Supporting axes + +### `input_shape` + +- `direct_prompt` +- `notes_prompt` +- `source_packet` +- `existing_doc_update` + +### `ambiguity_level` + +- `low` +- `medium` +- `high` + +### `domain_profile` + +- `developer_infrastructure` +- `company_foundation` +- `non_developer_domain` + +### `primary_risks` + +Use a short list of the dominant failure modes for the eval. Current common +values: + +- `template_drift` +- `implementation_leakage` +- `invented_capabilities` +- `invented_certainty` +- `scope_bleed` +- `source_overfitting` +- `weak_boundaries` +- `update_regression` + +## Current expansion priority + +The next missing slices are: + +- `update_existing_doc` for `foundation-creator` once revise-in-place behavior + is defined +- baseline comparison runs (`current skill` vs `previous skill` / `no skill`) + when the local harness is ready to compare deltas directly +- optional Braintrust-style scorer/export integration if local JSON artifacts are + no longer sufficient for experiment tracking diff --git a/scripts/run-baml-eval.mjs b/scripts/run-baml-eval.mjs index f64f4ab..d1e7e52 100644 --- a/scripts/run-baml-eval.mjs +++ b/scripts/run-baml-eval.mjs @@ -29,6 +29,10 @@ function normalizeHeading(line) { .trim(); } +function stripLeadingSectionNumber(heading) { + return heading.replace(/^\d+(?:\.\d+)*\.?\s+/, "").trim(); +} + function summarizeNumeric(values) { if (values.length === 0) { return { @@ -63,6 +67,28 @@ function hasUppercaseObligationKeyword(document) { return /\b(MUST|SHOULD|MAY)\b/.test(document); } +function 
extractNonEmptyNormalizedLines(text) { + return text + .split(/\r?\n/) + .map((line) => normalizeLine(line)) + .filter((line) => line.length > 0); +} + +function linesAppearInOrder(needleLines, haystackLines) { + let needleIndex = 0; + + for (const line of haystackLines) { + if (needleIndex >= needleLines.length) { + break; + } + if (line === needleLines[needleIndex]) { + needleIndex += 1; + } + } + + return needleIndex === needleLines.length; +} + function runCommand(command, args, cwd) { return new Promise((resolve, reject) => { const child = spawn(command, args, { @@ -170,8 +196,10 @@ async function buildPacket(evalEntry, evalsDir, packetType) { const packet = { packet_name: evalEntry.eval_name, task_prompt: evalEntry.prompt, - raw_notes: rawNotesPath ? await loadText(rawNotesPath) : "", - expected_criteria: expectedCriteriaPath ? await loadText(expectedCriteriaPath) : "", + raw_notes: rawNotesPath ? await loadText(rawNotesPath) : evalEntry.prompt, + expected_criteria: expectedCriteriaPath + ? await loadText(expectedCriteriaPath) + : (evalEntry.expected_output ?? ""), }; if (packetType === "SpecEvalPacket") { @@ -269,17 +297,46 @@ function extractFoundationDisallowedHeadings(templateText) { return disallowed; } +function extractFoundationSectionBodies(candidateDocument) { + const lines = candidateDocument.split(/\r?\n/); + const sections = []; + + for (const [index, rawLine] of lines.entries()) { + const match = rawLine.match(/^\s*##\s+(.+?)\s*$/); + if (match) { + sections.push({ + title: normalizeLine(match[1]), + index, + }); + } + } + + const bodies = new Map(); + for (let index = 0; index < sections.length; index += 1) { + const current = sections[index]; + const next = sections[index + 1]; + const start = current.index + 1; + const end = next ? 
next.index : lines.length; + bodies.set(current.title, lines.slice(start, end).join("\n").trim()); + } + + return { + lines, + sections, + bodies, + }; +} + function validateFoundationDocument(candidateDocument, templateText) { const requiredSections = extractFoundationTemplateSections(templateText); const disallowedHeadings = extractFoundationDisallowedHeadings(templateText); - const lines = candidateDocument.split(/\r?\n/); + const { lines, sections, bodies } = extractFoundationSectionBodies(candidateDocument); + const titleLine = lines.find((line) => normalizeLine(line).length > 0) ?? ""; + const hasMarkdownTitle = /^\s*#\s+.+\s+Foundation\s*$/.test(titleLine); const lineMap = new Map(); - for (const [index, rawLine] of lines.entries()) { - const line = normalizeHeading(rawLine); - if (!line) { - continue; - } + for (const { title, index } of sections) { + const line = normalizeLine(title); if (!lineMap.has(line)) { lineMap.set(line, []); } @@ -315,10 +372,7 @@ function validateFoundationDocument(candidateDocument, templateText) { const emptySections = []; for (let index = 0; index < positions.length; index += 1) { const current = positions[index]; - const next = positions[index + 1]; - const start = current.index + 1; - const end = next ? next.index : lines.length; - const sectionBody = lines.slice(start, end).join("\n").trim(); + const sectionBody = bodies.get(current.section) ?? ""; if (!sectionBody) { emptySections.push(current.section); } @@ -327,8 +381,40 @@ function validateFoundationDocument(candidateDocument, templateText) { const presentDisallowedSections = [...disallowedHeadings].filter((section) => (lineMap.get(section) ?? []).length > 0, ); + const strategicBetsBody = bodies.get("Strategic Bets") ?? ""; + const openQuestionsBody = bodies.get("Open Questions") ?? 
""; + const openQuestionBullets = openQuestionsBody + .split(/\r?\n/) + .map((line) => normalizeLine(line)) + .filter((line) => line.startsWith("- ")); + const openQuestionsLookOpen = + openQuestionBullets.length > 0 && + openQuestionBullets.every((line) => line.endsWith("?")); + const strategicBetLines = strategicBetsBody + .split(/\r?\n/) + .map((line) => normalizeLine(line)) + .filter((line) => line.startsWith("- ")); + const strategicBetsBodyHasDirectionalPreamble = /\b(observed directional bets|public signals)\b/i.test( + strategicBetsBody, + ); + const hedgedStrategicBets = + strategicBetLines.length === 0 || + strategicBetLines.every((line) => + /^\-\s+Bet:/i.test(line) + ? strategicBetsBodyHasDirectionalPreamble + : /\b(appears?|suggests?|signals?|signaling|indicates?|indicating|directional bet|directional bets|observed bet|observed bets|a bet that|bet that|bet on)\b/i.test( + line, + ), + ); return [ + createCheck( + "title_heading_present", + hasMarkdownTitle, + hasMarkdownTitle + ? "Detected the required markdown title heading." + : "Missing required title heading like `# Foundation`.", + ), createCheck( "required_sections_present_once", missingSections.length === 0 && duplicateSections.length === 0, @@ -359,6 +445,20 @@ function validateFoundationDocument(candidateDocument, templateText) { ? "No disallowed downstream-planning sections were detected." : `Disallowed sections present: ${presentDisallowedSections.join(", ")}.`, ), + createCheck( + "strategic_bets_use_directional_language", + hedgedStrategicBets, + hedgedStrategicBets + ? "Strategic Bets are framed as directional bets or observed signals." + : "One or more Strategic Bets bullets read as settled conclusions instead of directional bets or observed signals.", + ), + createCheck( + "open_questions_remain_questions", + openQuestionsLookOpen, + openQuestionsLookOpen + ? "Open Questions are written as explicit unanswered questions." 
+ : "Open Questions should be bullet questions that remain open and usually end with `?`.", + ), createCheck( "no_first_or_second_person", !hasPronounDrift(candidateDocument), @@ -392,14 +492,20 @@ function validateSpecDocument(candidateDocument, templateText) { const requiredSections = extractSpecMajorSections(templateText); const lines = candidateDocument.split(/\r?\n/); const missingSections = requiredSections.filter( - (section) => !lineExists(lines, (line) => line.toLowerCase() === section.toLowerCase()), + (section) => + !lineExists( + lines, + (line) => stripLeadingSectionNumber(line).toLowerCase() === section.toLowerCase(), + ), ); - const hasPurpose = lineExists(lines, (line) => line === "Purpose"); - const hasProblemStatement = lineExists( - lines, - (line) => line.toLowerCase() === "problem statement", - ); + const hasStatusLine = lines.some((line) => /^\s*Status:\s+\S+/.test(line)); + const hasPurposeLine = lines.some((line) => /^\s*Purpose:\s+\S+/.test(line)); + const hasProblemStatement = lineExists(lines, (line) => line === "1. Problem Statement"); + const hasGoalSubsections = + lineExists(lines, (line) => line === "2.1 Goals") && + lineExists(lines, (line) => line === "2.2 Non-Goals"); + const hasImportantBoundaryBlock = /(^|\n)Important boundary:\s*\n/m.test(candidateDocument); const hasNumberedComponents = /^\d+\.\s+`[^`]+`/m.test(candidateDocument); const hasFieldFormatting = /- `[^`]+` \([^)]+\)/.test(candidateDocument); @@ -412,18 +518,39 @@ function validateSpecDocument(candidateDocument, templateText) { : `Missing major sections: ${missingSections.join(", ")}.`, ), createCheck( - "purpose_heading_present", - hasPurpose, - hasPurpose - ? "Purpose heading is present." - : "Purpose heading is missing.", + "status_line_present", + hasStatusLine, + hasStatusLine + ? "Status line is present." + : "Status line is missing or malformed.", + ), + createCheck( + "purpose_line_present", + hasPurposeLine, + hasPurposeLine + ? "Purpose line is present." 
+ : "Purpose line is missing or malformed.", ), createCheck( "problem_statement_present", hasProblemStatement, hasProblemStatement - ? "Problem Statement heading is present." - : "Problem Statement heading is missing.", + ? "Problem Statement major section is present." + : "Problem Statement major section is missing.", + ), + createCheck( + "goals_subsections_present", + hasGoalSubsections, + hasGoalSubsections + ? "Detected `2.1 Goals` and `2.2 Non-Goals` subsections." + : "Missing one or both of the required goals subsections: `2.1 Goals`, `2.2 Non-Goals`.", + ), + createCheck( + "important_boundary_block_present", + hasImportantBoundaryBlock, + hasImportantBoundaryBlock + ? "Detected an `Important boundary:` block inside the document." + : "Did not detect the required `Important boundary:` block.", ), createCheck( "component_list_uses_numbering", @@ -456,6 +583,55 @@ function validateSpecDocument(candidateDocument, templateText) { ]; } +function validateSpecUpdateDocument(candidateDocument, existingSpecText) { + const existingLines = extractNonEmptyNormalizedLines(existingSpecText); + const candidateLines = extractNonEmptyNormalizedLines(candidateDocument); + const preservesExistingContent = linesAppearInOrder(existingLines, candidateLines); + const hasOffsetStoreComponent = /^\s*4\.\s+`Offset Store`/m.test(candidateDocument); + const hasCrossRegionNonGoal = + /(^|\n)-\s+(Cross-region log replication\.|Replicating logs across regions\.)/mi.test( + candidateDocument, + ); + + return [ + createCheck( + "existing_content_preserved_in_order", + preservesExistingContent, + preservesExistingContent + ? "All non-empty lines from the existing spec appear in order in the candidate." + : "One or more non-empty lines from the existing spec were removed or reordered.", + ), + createCheck( + "offset_store_component_present", + hasOffsetStoreComponent, + hasOffsetStoreComponent + ? "Detected numbered component `4. `Offset Store``." + : "Did not detect numbered component `4. 
`Offset Store``.", + ), + createCheck( + "cross_region_nongoal_present", + hasCrossRegionNonGoal, + hasCrossRegionNonGoal + ? "Detected the requested cross-region replication non-goal." + : "Did not detect the requested cross-region replication non-goal.", + ), + createCheck( + "no_first_or_second_person", + !hasPronounDrift(candidateDocument), + !hasPronounDrift(candidateDocument) + ? "No obvious first-person or second-person pronouns detected." + : "Detected first-person or second-person pronouns that violate the language guide.", + ), + createCheck( + "obligation_keywords_lowercase", + !hasUppercaseObligationKeyword(candidateDocument), + !hasUppercaseObligationKeyword(candidateDocument) + ? "No uppercase obligation keywords detected." + : "Detected uppercase MUST/SHOULD/MAY, which violates the language guide.", + ), + ]; +} + async function runDeterministicChecks(skillRoot, validationContract, candidateDocument) { if (!validationContract || validationContract.type !== "reference_document_checks") { return { @@ -465,11 +641,19 @@ async function runDeterministicChecks(skillRoot, validationContract, candidateDo }; } - const templatePath = path.join(skillRoot, validationContract.template_file); - const languagePath = path.join(skillRoot, validationContract.language_file); - const [templateText, languageText] = await Promise.all([ - loadText(templatePath), - loadText(languagePath), + const templatePath = validationContract.template_file + ? path.join(skillRoot, validationContract.template_file) + : null; + const languagePath = validationContract.language_file + ? path.join(skillRoot, validationContract.language_file) + : null; + const existingSpecPath = validationContract.existing_spec_file + ? path.join(skillRoot, validationContract.existing_spec_file) + : null; + const [templateText, languageText, existingSpecText] = await Promise.all([ + validationContract.template_file ? loadText(templatePath) : Promise.resolve(""), + languagePath ? 
loadText(languagePath) : Promise.resolve(""), + existingSpecPath ? loadText(existingSpecPath) : Promise.resolve(""), ]); let checks; @@ -480,6 +664,9 @@ async function runDeterministicChecks(skillRoot, validationContract, candidateDo case "spec-v1": checks = validateSpecDocument(candidateDocument, templateText, languageText); break; + case "spec-update-v1": + checks = validateSpecUpdateDocument(candidateDocument, existingSpecText, languageText); + break; default: fail(`Unknown validation contract '${validationContract.validator}'.`); } @@ -521,6 +708,13 @@ async function runSingleTrial({ const brief = await b[compileFnName](packet); timing.compile_ms = Date.now() - compileStartedAt; + if (runner.packet_type === "SpecEvalPacket") { + brief.update_request = packet.task_prompt; + if (packet.existing_spec) { + brief.existing_spec = packet.existing_spec; + } + } + const renderStartedAt = Date.now(); const candidateDocument = await b[renderFnName](brief); timing.render_ms = Date.now() - renderStartedAt; @@ -621,6 +815,22 @@ function buildBenchmark(skillName, evalName, trials) { }; } +function extractEvalMetadata(evalEntry) { + const fields = [ + "scenario_type", + "input_shape", + "ambiguity_level", + "domain_profile", + "primary_risks", + ]; + + return Object.fromEntries( + fields + .filter((field) => evalEntry[field] !== undefined) + .map((field) => [field, evalEntry[field]]), + ); +} + async function main() { const { skillName, selector, trials } = parseArgs(process.argv.slice(2)); @@ -636,7 +846,7 @@ async function main() { const manifest = await loadJson(manifestPath); const evalEntry = getEvalBySelector(manifest.evals, selector); const runner = manifest.runner_contract; - const validationContract = manifest.validation_contract ?? null; + const validationContract = evalEntry.validation_contract ?? manifest.validation_contract ?? 
null; if (!runner || runner.type !== "baml_pipeline") { fail(`Skill '${skillName}' does not declare a supported runner_contract.`); @@ -680,7 +890,10 @@ async function main() { } } - const benchmark = buildBenchmark(skillName, evalEntry.eval_name, trialResults); + const benchmark = { + ...buildBenchmark(skillName, evalEntry.eval_name, trialResults), + eval_metadata: extractEvalMetadata(evalEntry), + }; await writeRunArtifacts(runDir, { "benchmark.json": benchmark, }); diff --git a/skills/foundation-creator/baml_src/foundation_compiler/compiler_functions.baml b/skills/foundation-creator/baml_src/foundation_compiler/compiler_functions.baml index b4a90d1..f793b6f 100644 --- a/skills/foundation-creator/baml_src/foundation_compiler/compiler_functions.baml +++ b/skills/foundation-creator/baml_src/foundation_compiler/compiler_functions.baml @@ -109,10 +109,21 @@ function RenderFoundationCreatorPrompt(brief: FoundationBrief) -> string { - Avoid implementation detail. - Emphasize thesis, boundaries, actor model, surfaces, and strategic bets. - State that the writer is a source-bound synthesizer, not a strategy consultant. - - Require exactly these sections unless the user asks otherwise: `What This Is`, `Core Thesis`, `Boundaries`, `Actor Model`, `Durable Surfaces`, `Strategic Bets`, `Open Questions`. + - Require markdown output with exactly this heading structure unless the user asks otherwise: + - `# Foundation` + - `## What This Is` + - `## Core Thesis` + - `## Boundaries` + - `## Actor Model` + - `## Durable Surfaces` + - `## Strategic Bets` + - `## Open Questions` - Forbid extra sections like `Success Signals`, `Metrics`, `Decision Agenda`, `Next Steps`, `Operating Guidance`, or `Roadmap`. - - Require `Strategic Bets` to be phrased as observed directional bets rather than prescriptions. + - Require `Strategic Bets` to be phrased as observed directional bets or public signals rather than prescriptions. 
+ - Require each `Strategic Bets` bullet to start with explicit hedge language such as `The notes suggest a bet on...`, `There are visible signals that...`, or `The company appears to be betting on...`. + - Forbid bare `Bet:` labels and categorical claims like `X is the wedge` or `Y is a defensible primitive`. - Require recently emerging or transitional surfaces to be qualified explicitly rather than flattened as fully settled. + - Require `Open Questions` bullets to remain actual unresolved questions rather than disguised conclusions. - Forbid market-leadership or superiority claims unless they are explicit in the brief. - Make the prompt directly usable by an agent. diff --git a/skills/foundation-creator/baml_src/foundation_compiler/eval_runner.baml b/skills/foundation-creator/baml_src/foundation_compiler/eval_runner.baml index 7760d3d..fb8fd23 100644 --- a/skills/foundation-creator/baml_src/foundation_compiler/eval_runner.baml +++ b/skills/foundation-creator/baml_src/foundation_compiler/eval_runner.baml @@ -19,6 +19,11 @@ function CompileFoundationBriefFromPacket(packet: FoundationEvalPacket) -> Found - Treat expected criteria as evaluation guidance, not as license to invent. - Avoid implementation detail. - Do not infer monetization, metrics, org structure, GTM strategy, or operating plans unless the packet explicitly supports them. + - If `strategic_bets` are included, phrase them as observed directional signals or evidence. + - Each `strategic_bets` item should start with explicit hedge language such as `The notes suggest a bet on...`, `There are visible signals that...`, or `The company appears to be betting on...`. + - Do not use bare `Bet:` labels or categorical claims like `X is the wedge` or `Y is a defensible primitive`. + - Keep `strategic_bets` short when evidence is weak. + - Keep unresolved questions as actual unresolved questions rather than quietly resolving them in the summary. - Prefer omission over speculation. 
{{ ctx.output_format }} @@ -38,10 +43,22 @@ function RenderFoundationDocumentDraft(brief: FoundationBrief) -> string { - Avoid implementation detail. - Stay source-bound: do not invent monetization, KPIs, org structure, partnerships, operating guidance, or next-step plans. - Prefer omission over plausible-sounding speculation. - - Use exactly these sections and no others: `What This Is`, `Core Thesis`, `Boundaries`, `Actor Model`, `Durable Surfaces`, `Strategic Bets`, `Open Questions`. + - Return markdown. + - Start with `# Foundation`. + - Use exactly these `##` sections and no others: + - `## What This Is` + - `## Core Thesis` + - `## Boundaries` + - `## Actor Model` + - `## Durable Surfaces` + - `## Strategic Bets` + - `## Open Questions` - If `Strategic Bets` is weakly supported, keep it short rather than expanding it. - - Phrase `Strategic Bets` as observed directional bets, not recommendations or settled future state. + - Phrase `Strategic Bets` as observed directional bets or public signals, not recommendations or settled future state. + - Each `Strategic Bets` bullet must start with explicit hedge language such as `The notes suggest a bet on...`, `There are visible signals that...`, or `The company appears to be betting on...`. + - Do not use bare `Bet:` labels or categorical statements like `X is the wedge` or `Y is a defensible primitive`. - When a surface is visible but still evolving in the packet, qualify it explicitly as emerging, evolving, or unsettled. + - Write `Open Questions` as actual unresolved questions, typically ending with `?`. - Do not use market-leadership or competitive-superiority language unless the packet explicitly supports it. Brief: @@ -70,8 +87,10 @@ function EvaluateFoundationDocument( - Penalize invented certainty, invented capabilities, or implementation leakage. - Penalize unsupported business-model, monetization, KPI, org, partnership, or operating-plan language. 
- Penalize consulting-style sections such as `Success Signals`, `Metrics`, `Decision Agenda`, `Next Steps`, `Operating Guidance`, or similar drift. + - Penalize missing markdown heading structure if the document drifts from the required template shape. - Penalize `Strategic Bets` phrased as recommendations or settled conclusions when the packet only supports directional evidence. - Penalize flattening transitional surfaces as fully settled if the packet presents them as evolving. + - Penalize `Open Questions` that silently resolve ambiguity instead of keeping it open. - Penalize market-leadership or competitive-superiority claims not explicitly supported by the packet. - Use `Pass`, `Partial`, or `Fail` for each criterion. diff --git a/skills/foundation-creator/evals/evals.json b/skills/foundation-creator/evals/evals.json index 1b5a1b8..d8c6e80 100644 --- a/skills/foundation-creator/evals/evals.json +++ b/skills/foundation-creator/evals/evals.json @@ -17,6 +17,15 @@ { "id": 0, "eval_name": "create-foundation-from-vercel-source-packet", + "scenario_type": "source_packet_transition", + "input_shape": "source_packet", + "ambiguity_level": "high", + "domain_profile": "company_foundation", + "primary_risks": [ + "invented_certainty", + "scope_bleed", + "weak_boundaries" + ], "prompt": "Use the source packet in `fixtures/vercel/raw_notes.md` to draft a top-level foundation document for Vercel. Preserve ambiguity where the positioning is in transition. 
Do not produce a `SPEC.md`, implementation plan, or architecture diagram.", "expected_output": "A top-level foundation document that frames Vercel as a developer cloud/platform company, captures the current tension between `Frontend Cloud` and `AI Cloud`, identifies core surfaces such as deployment workflow, collaboration, security, AI infrastructure, and platform-building, clarifies that Vercel is not just static hosting or general-purpose IaaS, and preserves open questions or strategic bets instead of inventing certainty.", "expected_file": "fixtures/vercel/expected_criteria.md", @@ -31,6 +40,15 @@ { "id": 1, "eval_name": "create-foundation-from-cloudflare-source-packet", + "scenario_type": "source_packet_transition", + "input_shape": "source_packet", + "ambiguity_level": "high", + "domain_profile": "company_foundation", + "primary_risks": [ + "invented_certainty", + "scope_bleed", + "weak_boundaries" + ], "prompt": "Use the source packet in `fixtures/cloudflare/raw_notes.md` to draft a top-level foundation document for Cloudflare. Preserve the tension between the connectivity cloud, developer platform, and AI/agents platform framings. 
Do not produce a `SPEC.md`, implementation plan, or architecture diagram.", "expected_output": "A top-level foundation document that frames Cloudflare as a unified platform spanning security/connectivity, developer infrastructure, and AI surfaces; preserves the tension between `connectivity cloud` and developer/AI platform identities; identifies durable surfaces like network/security control plane, developer runtime, AI infrastructure, and platform-building primitives; clarifies boundaries against generic hyperscaler or single-product framings; and preserves open questions rather than inventing certainty.", "expected_file": "fixtures/cloudflare/expected_criteria.md", @@ -41,6 +59,52 @@ "files": [ "fixtures/cloudflare/raw_notes.md" ] + }, + { + "id": 2, + "eval_name": "create-foundation-from-lightfast-founder-notes", + "scenario_type": "founder_notes_ambiguity", + "input_shape": "source_packet", + "ambiguity_level": "high", + "domain_profile": "company_foundation", + "primary_risks": [ + "invented_certainty", + "scope_bleed", + "weak_boundaries" + ], + "prompt": "Use the source packet in `fixtures/lightfast_founder_notes/raw_notes.md` to draft a top-level foundation document for Lightfast. Preserve unresolved framing tension. 
Do not produce a `SPEC.md`, implementation plan, org design, or business strategy memo.", + "expected_output": "A top-level foundation document that frames Lightfast as a durable artifact/constraint layer for agent work, preserves the tension between installable skills, compiler/eval/document substrate, and a broader operating layer for durable agent work, identifies surfaces like skill packages, document artifacts, typed contracts, evals, and possible discovery/distribution, clarifies boundaries against chat wrappers, no-code automation, agencies, and fully autonomous company-in-a-box framings, and keeps open questions unresolved instead of inventing certainty.", + "expected_file": "fixtures/lightfast_founder_notes/expected_criteria.md", + "packet_files": { + "raw_notes": "fixtures/lightfast_founder_notes/raw_notes.md", + "expected_criteria": "fixtures/lightfast_founder_notes/expected_criteria.md" + }, + "files": [ + "fixtures/lightfast_founder_notes/raw_notes.md" + ] + }, + { + "id": 3, + "eval_name": "create-foundation-from-harbor-care-source-packet", + "scenario_type": "cross_domain_generalization", + "input_shape": "source_packet", + "ambiguity_level": "high", + "domain_profile": "non_developer_domain", + "primary_risks": [ + "source_overfitting", + "invented_certainty", + "weak_boundaries" + ], + "prompt": "Use the source packet in `fixtures/harbor_care/raw_notes.md` to draft a top-level foundation document for Harbor Care. Treat it as a trust-heavy care navigation and coordination primitive, not a developer platform. 
Preserve unresolved framing tension and do not invent clinical authority, business strategy, or implementation detail.", + "expected_output": "A top-level foundation document that frames Harbor Care as a care navigation or coordination primitive, preserves the tension between shared longitudinal care picture, coordination operating layer, and benefits/logistics interpreter framings, identifies durable surfaces like intake, shared timeline, coordination, handoffs, and benefits interpretation, clarifies boundaries against telehealth, insurance, EHR, and clinician marketplace framings, and keeps open questions about automation, sponsor shape, and clinical boundary unresolved.", + "expected_file": "fixtures/harbor_care/expected_criteria.md", + "packet_files": { + "raw_notes": "fixtures/harbor_care/raw_notes.md", + "expected_criteria": "fixtures/harbor_care/expected_criteria.md" + }, + "files": [ + "fixtures/harbor_care/raw_notes.md" + ] } ] } diff --git a/skills/foundation-creator/evals/fixtures/harbor_care/expected_criteria.md b/skills/foundation-creator/evals/fixtures/harbor_care/expected_criteria.md new file mode 100644 index 0000000..e3ed403 --- /dev/null +++ b/skills/foundation-creator/evals/fixtures/harbor_care/expected_criteria.md @@ -0,0 +1,24 @@ +# Expected Criteria + +- The output should identify Harbor Care as a care navigation or coordination + primitive, not as telehealth, insurance, an EHR, or a clinician marketplace. +- The output should preserve at least some tension between the plausible + framings: + shared longitudinal care picture, + coordination operating layer, + or benefits/logistics interpreter. +- The output should identify multiple durable surfaces such as intake/context, + shared timeline or longitudinal record, coordination, handoff support, and + benefits/eligibility interpretation. 
+- The output should include an actor model that recognizes caregivers or + families, human advocates/coordinators, and at least one institutional actor + such as providers, payers, or employers. +- The output should set clear boundaries against clinical diagnosis, telehealth, + insurance, or generic marketplaces. +- The output should preserve open questions about business shape, automation vs + human-in-the-loop, and the clinical boundary instead of collapsing them into + certainty. +- The output should not invent medical claims, treatment outcomes, revenue + model, internal org structure, or unsupported platform surfaces not present in + the packet. + diff --git a/skills/foundation-creator/evals/fixtures/harbor_care/raw_notes.md b/skills/foundation-creator/evals/fixtures/harbor_care/raw_notes.md new file mode 100644 index 0000000..b08c67e --- /dev/null +++ b/skills/foundation-creator/evals/fixtures/harbor_care/raw_notes.md @@ -0,0 +1,69 @@ +# Harbor Care Source Packet + +Assembled on April 20, 2026 from synthetic product-positioning notes. + +This packet exists to test whether `foundation-creator` generalizes beyond +developer infrastructure and still writes a strong foundation document when the +domain is trust-heavy, operational, and human-centered. + +## Positioning notes + +- Harbor Care helps families and care teams navigate fragmented eldercare and + chronic-care coordination. +- The problem is not diagnosis. The problem is that care episodes span many + people, systems, approvals, and handoffs, and nobody has a stable operating + picture. +- Families keep becoming accidental project managers for health and home-care + logistics. +- Human care advocates still matter. Trust breaks if the product acts like a + fully autonomous care robot. +- Not telehealth. +- Not an insurer. +- Not an electronic health record. +- Not a marketplace for doctors or home aides. 
+- It might be best framed as a care navigation layer or coordination fabric, + but those are still slightly different framings. +- One framing is "shared longitudinal picture of a care journey." +- Another framing is "operating layer for care coordination across people and + institutions." +- Another framing is "benefits + logistics interpreter for families." +- The right center of gravity is not fully settled. + +## Durable surfaces that seem to recur + +- Intake and context gathering across patient, family, and care situation. +- Shared timeline of events, decisions, documents, and upcoming tasks. +- Coordination surface for family members, advocates, providers, and service + organizations. +- Benefits or eligibility interpretation support. +- Handoff support across hospital, clinic, home care, rehab, pharmacy, and + payer contexts. +- Longitudinal record of what happened, what is pending, and who owns which + next action. + +## Actor notes + +- Patients may be involved directly, but family caregivers are often the active + operators. +- Human care advocates or coordinators are central, not incidental. +- Provider offices and discharge planners interact with the system unevenly. +- Employers or payers might sponsor access in some versions of the business, + but that is not the core product truth. +- Regulated and trust-heavy settings mean the product cannot casually overclaim + clinical authority. + +## Strategic tension and open questions + +- The company probably wins on trust, continuity, and coordination quality more + than pure automation. +- There may be a bet on asynchronous coordination rather than forcing every + interaction into a live call center model. +- There may be a bet on keeping humans in the loop while using software and AI + to structure the work around them. +- Open question: + is Harbor Care primarily for families, for employer/payer-sponsored programs, + or for provider-linked coordination models? 
+- Open question: + how much intelligence should be automated vs escalated to human advocates? +- Open question: + where is the clean boundary between care coordination and clinical guidance? diff --git a/skills/foundation-creator/evals/fixtures/lightfast_founder_notes/expected_criteria.md b/skills/foundation-creator/evals/fixtures/lightfast_founder_notes/expected_criteria.md new file mode 100644 index 0000000..28cc166 --- /dev/null +++ b/skills/foundation-creator/evals/fixtures/lightfast_founder_notes/expected_criteria.md @@ -0,0 +1,28 @@ +# Expected Criteria + +- The output should identify Lightfast as a durable artifact or constraint layer + for agent work, not merely a chat wrapper, prompt library, or generic AI + assistant. +- The output should preserve the tension between at least two plausible + framings: + installable skills/distribution, + compiler/eval/document substrate, + or a broader operating layer for durable agent work. +- The output should identify multiple durable surfaces such as skill packages, + foundation/spec artifacts, typed contracts or compiler surfaces, evals, and + possibly discovery or distribution. +- The output should include an actor model that recognizes at least some mix of + founders/operators, builders or engineering teams, and other teams using + agents for repeatable work. +- The output should set clear boundaries: + not project management, + not generic no-code workflow automation, + not an agency, + and not a fully autonomous company-in-a-box. +- The output should preserve open questions around repo-native vs hosted, + authoring vs evaluation vs distribution, and coding-first vs broader + applicability instead of pretending those choices are settled. +- The output should not invent monetization, marketplace certainty, internal + org structure, financial claims, or execution plans that are not present in + the notes. 
+ diff --git a/skills/foundation-creator/evals/fixtures/lightfast_founder_notes/raw_notes.md b/skills/foundation-creator/evals/fixtures/lightfast_founder_notes/raw_notes.md new file mode 100644 index 0000000..ee51510 --- /dev/null +++ b/skills/foundation-creator/evals/fixtures/lightfast_founder_notes/raw_notes.md @@ -0,0 +1,76 @@ +# Lightfast Founder Notes Packet + +Assembled on April 20, 2026 from synthetic founder-style notes for Lightfast. + +This packet is intentionally messy, incomplete, and slightly contradictory. It +tests whether `foundation-creator` can preserve ambiguity and write a durable +foundation document without inventing resolution, monetization, or execution +plans. + +## Raw notes + +- Lightfast feels like it sits in the layer before code and after vague intent: + turn messy human direction into durable working artifacts. +- Skills are probably the first wedge, but the company does not obviously stop + at "prompt packs" or a "skills marketplace." +- Not another chat wrapper. +- Not an agency that manually delivers strategy decks or ops work. +- Not generic no-code workflow automation. +- Probably not a fully autonomous "company in a box" either. +- Strong belief: prompts vanish, transcripts vanish, and code is too late; + durable artifacts like foundation docs, specs, typed contracts, evals, and + reusable skill packages survive model churn better. +- The product should help rough notes become clearer artifacts: + messy notes -> foundation -> spec -> working automation. +- There is a real boundary here: the system should structure judgment, not + silently replace it. +- Human supervision matters. The product should not quietly decide company + strategy or product direction because a model sounded confident. +- Maybe the primitive is "turn ambiguity into stable constraints." +- Maybe the primitive is "operating system for durable agent work." +- That second framing might be too grand or premature. 
+- Feels repo-native and artifact-native, not just chat-native. +- There is something important about local-first work loops, versioned files, + and outputs that can be inspected and edited. +- Possible durable surfaces: + skills, compiler contracts, eval runners, document templates, reference + artifacts, maybe later a discovery/catalog layer. +- Maybe distribution ends up mattering as much as authoring: + not only creating skills, but making operational knowledge installable. +- Unsure whether the main buyer/user is: + solo builder, + founder/operator, + product/engineering team, + or internal ops team running agents. +- Coding is the easiest first wedge because eval loops are tighter, but the + company should probably not be framed as coding-only forever. +- It may eventually apply across coding, ops, research, support, GTM, and other + repeatable work with heavy ambiguity. +- Tension: + is this mainly better instructions for agents, + or a durable interface layer between humans, models, repositories, and + outputs? +- Tension: + local-first repo tooling + vs a hosted control plane, catalog, or distribution surface. +- Tension: + standardization that makes artifacts portable + vs customization that matches how each team actually works. +- Good boundary: + not project management, + not ticketing, + not a drag-and-drop workflow builder. +- Another boundary: + not a general AI assistant that answers everything. +- Another boundary: + not a substitute for direct ownership or decision-making. +- Open question: + when does a foundation document become a spec, and when does a spec become + code? +- Open question: + how much of the value is in authoring, how much in evaluation, and how much in + distribution? +- Open question: + should Lightfast remain repo-native first, or eventually become a hosted + system of record for agent work? 
+ diff --git a/skills/foundation-creator/references/language.md b/skills/foundation-creator/references/language.md index d51ba80..49a4370 100644 --- a/skills/foundation-creator/references/language.md +++ b/skills/foundation-creator/references/language.md @@ -16,7 +16,21 @@ How the foundation document should be worded. - Declarative statements over persuasive rhetoric. - Prefer short, dense paragraphs and compact bullets. -## 3. Restraint Rules +## 3. Structural Conventions + +- Use `#` for the document title: `# Foundation`. +- Use `##` for every major section in `references/template.md`. +- Keep the section order exactly as the template defines it. +- Use bullets for `Core Thesis`, `Boundaries`, `Actor Model`, `Durable Surfaces`, + `Strategic Bets`, and `Open Questions`. +- `Open Questions` bullets should be written as actual open questions and usually + end with `?`. +- `Strategic Bets` should be framed as observed directional signals or bets + rather than recommendations. Wording like `materials suggest a bet on...`, + `the company appears to be betting on...`, or `public signals indicate...` + is preferred. + +## 4. Restraint Rules - Prefer omission over invention. - If a point is plausible but not supported, omit it or convert it into an @@ -29,7 +43,7 @@ How the foundation document should be worded. - Do not assert market leadership, competitive superiority, or winner/loser framing unless directly supported by the source packet. -## 4. Allowed Section Behavior +## 5. Allowed Section Behavior - `What This Is` explains the primitive at a durable level. - `Core Thesis` contains only source-backed, thesis-level claims. @@ -39,13 +53,16 @@ How the foundation document should be worded. implementation components. If a surface is source-visible but still in transition, qualify it explicitly as emerging or evolving. - `Strategic Bets` should be minimal and clearly grounded in repeated signals. 
- Phrase them as observed directional bets (`public materials suggest a bet - on...`, `the company appears to be betting on...`) rather than settled - declarations or recommendations. + Each bullet should start with visible-evidence language such as `the notes + suggest a bet on...`, `public materials suggest...`, `there are visible + signals that...`, or `the company appears to be betting on...` rather than + settled declarations or recommendations. + Avoid naked labels like `Bet:` and avoid categorical claims like `X is the + wedge` or `Y is a defensible primitive`. - `Open Questions` should remain open rather than being quietly resolved in prose elsewhere. -## 5. Disallowed Drift +## 6. Disallowed Drift - No `Success Signals`, KPI, or metrics section. - No monetization strategy or revenue language. @@ -55,7 +72,7 @@ How the foundation document should be worded. - No market-leadership or competitive-positioning claims unless explicit in the source. -## 6. Tone +## 7. Tone - Dense, calm, and specific. - No hype language. diff --git a/skills/foundation-creator/references/template.md b/skills/foundation-creator/references/template.md index 7fc4adf..2a323ab 100644 --- a/skills/foundation-creator/references/template.md +++ b/skills/foundation-creator/references/template.md @@ -1,6 +1,8 @@ # {Primitive Name} Foundation Use only the sections below unless the user explicitly asks for more. +All headings in the final document must use markdown heading syntax exactly as +shown here. ## What This Is @@ -28,13 +30,13 @@ Use only the sections below unless the user explicitly asks for more. 
## Strategic Bets -- {Only if clearly supported by the material.} -- {Use fewer bullets rather than speculative ones.} +- {The notes suggest a bet on ...} +- {There are visible signals that ...} ## Open Questions -- {Unresolved tension or ambiguity the source does not settle.} -- {Another open question.} +- {Unresolved tension or ambiguity the source does not settle?} +- {Another open question?} ## Disallowed Sections diff --git a/skills/spec-creator/baml_src/spec_compiler/compiler_functions.baml b/skills/spec-creator/baml_src/spec_compiler/compiler_functions.baml index 83ba3a7..3339db7 100644 --- a/skills/spec-creator/baml_src/spec_compiler/compiler_functions.baml +++ b/skills/spec-creator/baml_src/spec_compiler/compiler_functions.baml @@ -73,6 +73,28 @@ function RenderSpecCreatorPrompt(brief: SpecBrief) -> string { - Ask for a top-level `SPEC.md`. - Keep the output language-agnostic and behavioral. - Preserve unresolved questions instead of inventing decisions. + - If the brief includes `existing_spec`, treat the task as update mode and preserve the existing document verbatim except for the requested edits. + - Require the canonical `spec-creator` section shape: + - `# Specification` + - `Status: Draft v1 (language-agnostic)` + - `Purpose: ` + - `## 1. Problem Statement` + - `## 2. Goals and Non-Goals` + - `### 2.1 Goals` + - `### 2.2 Non-Goals` + - `## 3. System Overview` + - `### 3.1 Main Components` + - `### 3.2 External Dependencies` + - `## 4. Core Domain Model` + - `### 4.1 Entities` + - Require `## 1. Problem Statement` to contain: + - one opening paragraph + - a line like `The service solves operational problems:` + - a bullet list of explicit operational problems + - Require `Important boundary:` as a labeled block inside the Problem Statement. + - Require numbered components and template-shaped field lines: `- `field_name` (type, constraints)`. 
+ - Require external dependencies to stay at the system/service level rather than naming low-level API methods or SDK calls. + - In update mode, do not add new top-level sections or template-completion material unless the request explicitly asks for them. - Emphasize problem statement, goals, non-goals, components, dependencies, and entities. - Make the prompt directly usable by an agent. diff --git a/skills/spec-creator/baml_src/spec_compiler/eval_runner.baml b/skills/spec-creator/baml_src/spec_compiler/eval_runner.baml index b94e59a..d093142 100644 --- a/skills/spec-creator/baml_src/spec_compiler/eval_runner.baml +++ b/skills/spec-creator/baml_src/spec_compiler/eval_runner.baml @@ -33,11 +33,43 @@ function RenderSpecDocumentDraft(brief: SpecBrief) -> string { Draft a `SPEC.md` from the brief below. Rules: + - Output valid markdown. - Keep the document behavioral and language-agnostic. - - Use problem statement, goals, non-goals, boundaries, components, - dependencies, and entities. + - If `existing_spec` is present, treat this as update mode and let the update-mode rules override the create-mode template-completion rules below. + - Follow the `spec-creator` template shape directly. Do not rename the core headings or collapse them into ad hoc labels. + - Use exactly this opening shape: + - `# Specification` + - `Status: Draft v1 (language-agnostic)` + - `Purpose: ` + - Include these sections in order: + - `## 1. Problem Statement` + - `## 2. Goals and Non-Goals` + - `### 2.1 Goals` + - `### 2.2 Non-Goals` + - `## 3. System Overview` + - `### 3.1 Main Components` + - `### 3.2 External Dependencies` + - `## 4. Core Domain Model` + - `### 4.1 Entities` + - In `## 1. Problem Statement`, write: + - one opening paragraph describing the service at a high level + - a line like `The service solves operational problems:` + - a bullet list of concrete operational problems + - Keep `Important boundary:` as a labeled block inside `## 1. Problem Statement`. 
Do not turn it into a standalone section. + - Use a numbered list for `### 3.1 Main Components` in the form `1. `Component Name`` followed by indented responsibility bullets. + - For entities, use `#### 4.1.x EntityName`, then `Fields:`, then field lines in the form `- `field_name` (type, constraints)` with indented semantic bullets and optional `Default: `value`` lines. + - Use logical field types (`string`, `integer`, `boolean`, `timestamp`, `list of strings`, `map`, `string or null`), not implementation types. + - List external dependencies as systems or services, not low-level API method names, SDK calls, classes, or internal implementation choices. - Preserve uncertainty where the source packet is in transition. - Avoid implementation detail. + - Optional extra sections are allowed only when the brief materially needs them; if used, place them after `## 4. Core Domain Model`. + - Do not add end markers, appendix-only filler, or operational/program-management sections unless the brief explicitly requires them. + - If `existing_spec` is present, treat this as update mode: + - start from `existing_spec` + - preserve all unchanged lines verbatim + - apply only the edits implied by `update_request` + - do not rewrite existing paragraphs, bullets, or component descriptions unless the request requires it + - do not add new top-level sections, domain model content, dependencies, or other template-completion material unless `update_request` explicitly asks for them Brief: {{ brief|format(type="yaml") }} @@ -63,6 +95,7 @@ function EvaluateSpecDocument( - Grade against the expected criteria explicitly. - Penalize invented capabilities, invented certainty, or implementation leakage. - Reward correct scope boundaries and careful handling of transition states. + - Penalize missing core template sections, malformed field formatting, or moving `Important boundary:` out of the Problem Statement block. - Use `Pass`, `Partial`, or `Fail` for each criterion. 
{{ ctx.output_format }} diff --git a/skills/spec-creator/baml_src/spec_compiler/spec_types.baml b/skills/spec-creator/baml_src/spec_compiler/spec_types.baml index 3d116b5..2e03a1e 100644 --- a/skills/spec-creator/baml_src/spec_compiler/spec_types.baml +++ b/skills/spec-creator/baml_src/spec_compiler/spec_types.baml @@ -27,6 +27,8 @@ class SpecBrief { external_dependencies string[] entities EntityBrief[] unresolved_questions string[] + update_request string? + existing_spec string? } class SpecCritique { diff --git a/skills/spec-creator/evals/evals.json b/skills/spec-creator/evals/evals.json index cec004a..93083be 100644 --- a/skills/spec-creator/evals/evals.json +++ b/skills/spec-creator/evals/evals.json @@ -17,6 +17,14 @@ { "id": 0, "eval_name": "create-from-clear-intent", + "scenario_type": "clear_intent_prompt", + "input_shape": "direct_prompt", + "ambiguity_level": "low", + "domain_profile": "developer_infrastructure", + "primary_risks": [ + "template_drift", + "implementation_leakage" + ], "prompt": "Can you write a SPEC.md for a service we're building called Glacier Tier Manager? It watches S3 buckets and moves old objects to Glacier Deep Archive after policy-defined age thresholds. Polling, not event-driven — we want deterministic cadence. Main components: a Bucket Poller that lists objects on a schedule, a Policy Evaluator that decides which objects are eligible, and a Tiering Executor that issues the storage-class change. External deps: S3 API and per-bucket lifecycle policy files. Not a general-purpose cost optimizer — it only does tiering.", "expected_output": "A SPEC.md at repo root covering Purpose, Problem Statement (with 3 bullets + 'Important boundary' block), Goals + Non-Goals (non-goal should reflect 'not a cost optimizer'), System Overview with the three named components, External Dependencies, and a Core Domain Model. 
Voice is third-person, fields use the `name` (type) format, obligation keywords are lowercase.", "files": [] @@ -24,6 +32,14 @@ { "id": 1, "eval_name": "create-from-unstructured-notes", + "scenario_type": "unstructured_notes_prompt", + "input_shape": "notes_prompt", + "ambiguity_level": "medium", + "domain_profile": "developer_infrastructure", + "primary_risks": [ + "template_drift", + "weak_boundaries" + ], "prompt": "I have rough notes on a service I want formalized into a SPEC.md. Here's what I have:\n\n- name: Hookshot. it's a webhook retry daemon.\n- retries failed outbound webhooks with exponential backoff\n- keeps state per endpoint (success rate, last attempt)\n- has a per-URL circuit breaker so one broken target doesn't starve others\n- retry schedules configurable per endpoint\n- NOT a general-purpose message queue. teams shouldn't push arbitrary jobs through it\n- reads webhook delivery records from a postgres table called webhook_deliveries\n- marks them delivered or dead-lettered\n\nTurn this into a proper spec.", "expected_output": "A SPEC.md that reorganizes the notes into the template structure: Purpose (one sentence about webhook retry), Problem Statement with bullets, 'Important boundary' + Non-Goals covering the 'not a queue' constraint, Main Components (retry scheduler, circuit breaker, state tracker or equivalent), External Dependencies (postgres), Domain Model with an entity for the delivery record. Notes-style phrasings are rewritten in spec voice.", "files": [] @@ -31,13 +47,40 @@ { "id": 2, "eval_name": "update-add-nongoal-and-component", + "scenario_type": "update_existing_doc", + "input_shape": "existing_doc_update", + "ambiguity_level": "low", + "domain_profile": "developer_infrastructure", + "primary_risks": [ + "update_regression", + "template_drift" + ], "prompt": "We have an existing SPEC.md at the repo root for Log Shipper. 
Please update it: add a non-goal stating that cross-region log replication is out of scope, and add a new main component called `Offset Store` that persists per-file read offsets. Keep everything else as-is.", "expected_output": "SPEC.md with the existing sections preserved verbatim except: (1) a new bullet added under Non-Goals covering cross-region replication, phrased as a gerund/noun phrase per the language guide; (2) a new numbered component `Offset Store` added under 3.1 with a verb-led description. Section numbering remains 1, 2, 3; component numbering extends to 4. No first-person pronouns introduced.", + "validation_contract": { + "type": "reference_document_checks", + "validator": "spec-update-v1", + "language_file": "references/language.md", + "existing_spec_file": "evals/fixtures/existing_spec.md" + }, + "packet_files": { + "existing_spec": "fixtures/existing_spec.md" + }, "files": ["fixtures/existing_spec.md"] }, { "id": 3, "eval_name": "create-from-vercel-mcp-source-packet", + "scenario_type": "source_packet_transition", + "input_shape": "source_packet", + "ambiguity_level": "high", + "domain_profile": "developer_infrastructure", + "primary_risks": [ + "invented_capabilities", + "invented_certainty", + "template_drift", + "implementation_leakage" + ], "prompt": "Use the source packet in `fixtures/vercel_mcp/raw_notes.md` to write a `SPEC.md` for a service called `Vercel MCP`. Treat it as a long-running remote MCP service, not as a company-level foundation document. 
Preserve timeline-specific ambiguity where the source packet is in transition, and do not invent write capabilities that the notes do not justify.", "expected_output": "A `SPEC.md` that frames `Vercel MCP` as an OAuth-protected remote MCP service for AI tools, includes a Problem Statement about secure structured access to Vercel docs, projects, deployments, and logs, sets clear goals and non-goals, identifies boundaries around approved clients and official endpoint usage, captures the current beta/read-only tension without pretending the service already has unconstrained write access, and stays behavioral rather than implementation-level.", "expected_file": "fixtures/vercel_mcp/expected_criteria.md", From 65d821ffac690d945251bf55168845b4f40b9b57 Mon Sep 17 00:00:00 2001 From: Jeevan Pillay <169354619+jeevanpillay@users.noreply.github.com> Date: Tue, 21 Apr 2026 00:28:31 +1000 Subject: [PATCH 06/30] Add foundation update-mode eval coverage --- README.md | 2 + evals/TAXONOMY.md | 4 +- scripts/run-baml-eval.mjs | 105 ++++++++++++++++-- skills/foundation-creator/SKILL.md | 49 ++++++++ .../compiler_functions.baml | 2 + .../foundation_compiler/eval_runner.baml | 16 +++ .../foundation_compiler/eval_types.baml | 1 + .../foundation_compiler/foundation_types.baml | 2 + skills/foundation-creator/evals/evals.json | 53 +++++++++ .../evals/fixtures/existing_foundation.md | 56 ++++++++++ .../expected_criteria.md | 13 +++ .../foundation-creator/references/language.md | 2 + 12 files changed, 293 insertions(+), 12 deletions(-) create mode 100644 skills/foundation-creator/evals/fixtures/existing_foundation.md create mode 100644 skills/foundation-creator/evals/fixtures/lightfast_foundation_update/expected_criteria.md diff --git a/README.md b/README.md index e38a7db..9381b4c 100644 --- a/README.md +++ b/README.md @@ -29,6 +29,7 @@ This repo now includes BAML-backed fixture evals for `foundation-creator` and bun install bun run eval:foundation -- 
create-foundation-from-vercel-source-packet bun run eval:foundation -- create-foundation-from-lightfast-founder-notes +bun run eval:foundation -- update-lightfast-foundation-boundary-surface-question bun run eval:spec -- create-from-vercel-mcp-source-packet bun run with-env -- bun run ./scripts/run-baml-eval.mjs foundation-creator create-foundation-from-cloudflare-source-packet --trials 3 ``` @@ -42,6 +43,7 @@ Current `foundation-creator` corpus includes: - `create-foundation-from-cloudflare-source-packet` - `create-foundation-from-lightfast-founder-notes` - `create-foundation-from-harbor-care-source-packet` +- `update-lightfast-foundation-boundary-surface-question` The runner now also writes: diff --git a/evals/TAXONOMY.md b/evals/TAXONOMY.md index 3f843e7..7cb33b3 100644 --- a/evals/TAXONOMY.md +++ b/evals/TAXONOMY.md @@ -74,8 +74,8 @@ values: The next missing slices are: -- `update_existing_doc` for `foundation-creator` once revise-in-place behavior - is defined +- a second `update_existing_doc` packet for `foundation-creator` that requires + replacing or tightening existing language rather than additive edits only - baseline comparison runs (`current skill` vs `previous skill` / `no skill`) when the local harness is ready to compare deltas directly - optional Braintrust-style scorer/export integration if local JSON artifacts are diff --git a/scripts/run-baml-eval.mjs b/scripts/run-baml-eval.mjs index d1e7e52..4f01658 100644 --- a/scripts/run-baml-eval.mjs +++ b/scripts/run-baml-eval.mjs @@ -192,6 +192,9 @@ async function buildPacket(evalEntry, evalsDir, packetType) { const existingSpecPath = packetFiles.existing_spec ? path.join(evalsDir, packetFiles.existing_spec) : null; + const existingFoundationPath = packetFiles.existing_foundation + ? 
path.join(evalsDir, packetFiles.existing_foundation) + : null; const packet = { packet_name: evalEntry.eval_name, @@ -206,6 +209,12 @@ async function buildPacket(evalEntry, evalsDir, packetType) { packet.existing_spec = existingSpecPath ? await loadText(existingSpecPath) : null; } + if (packetType === "FoundationEvalPacket") { + packet.existing_foundation = existingFoundationPath + ? await loadText(existingFoundationPath) + : null; + } + return packet; } @@ -327,6 +336,34 @@ function extractFoundationSectionBodies(candidateDocument) { }; } +function extractMarkdownBullets(sectionBody) { + const bullets = []; + let current = null; + + for (const rawLine of sectionBody.split(/\r?\n/)) { + if (/^\s*-\s+/.test(rawLine)) { + if (current) { + bullets.push(current); + } + current = normalizeLine(rawLine.replace(/^\s*-\s+/, "")); + continue; + } + + const line = normalizeLine(rawLine); + if (!current || line.length === 0) { + continue; + } + + current = `${current} ${line}`.trim(); + } + + if (current) { + bullets.push(current); + } + + return bullets; +} + function validateFoundationDocument(candidateDocument, templateText) { const requiredSections = extractFoundationTemplateSections(templateText); const disallowedHeadings = extractFoundationDisallowedHeadings(templateText); @@ -383,24 +420,18 @@ function validateFoundationDocument(candidateDocument, templateText) { ); const strategicBetsBody = bodies.get("Strategic Bets") ?? ""; const openQuestionsBody = bodies.get("Open Questions") ?? 
""; - const openQuestionBullets = openQuestionsBody - .split(/\r?\n/) - .map((line) => normalizeLine(line)) - .filter((line) => line.startsWith("- ")); + const openQuestionBullets = extractMarkdownBullets(openQuestionsBody); const openQuestionsLookOpen = openQuestionBullets.length > 0 && openQuestionBullets.every((line) => line.endsWith("?")); - const strategicBetLines = strategicBetsBody - .split(/\r?\n/) - .map((line) => normalizeLine(line)) - .filter((line) => line.startsWith("- ")); + const strategicBetLines = extractMarkdownBullets(strategicBetsBody); const strategicBetsBodyHasDirectionalPreamble = /\b(observed directional bets|public signals)\b/i.test( strategicBetsBody, ); const hedgedStrategicBets = strategicBetLines.length === 0 || strategicBetLines.every((line) => - /^\-\s+Bet:/i.test(line) + /^Bet:/i.test(line) ? strategicBetsBodyHasDirectionalPreamble : /\b(appears?|suggests?|signals?|signaling|indicates?|indicating|directional bet|directional bets|observed bet|observed bets|a bet that|bet that|bet on)\b/i.test( line, @@ -469,6 +500,41 @@ function validateFoundationDocument(candidateDocument, templateText) { ]; } +function validateFoundationUpdateDocument( + candidateDocument, + existingFoundationText, + templateText, + validationContract, +) { + const baseChecks = validateFoundationDocument(candidateDocument, templateText); + const existingLines = extractNonEmptyNormalizedLines(existingFoundationText); + const candidateLines = extractNonEmptyNormalizedLines(candidateDocument); + const normalizedCandidate = normalizeLine(candidateDocument); + const preservesExistingContent = linesAppearInOrder(existingLines, candidateLines); + const requiredPatternChecks = (validationContract.required_patterns ?? []).map((patternCheck) => { + const expression = new RegExp(patternCheck.pattern, patternCheck.flags ?? "i"); + const passed = expression.test(normalizedCandidate); + + return createCheck( + patternCheck.id, + passed, + passed ? 
patternCheck.details_pass : patternCheck.details_fail, + ); + }); + + return [ + ...baseChecks, + createCheck( + "existing_content_preserved_in_order", + preservesExistingContent, + preservesExistingContent + ? "All non-empty lines from the existing foundation appear in order in the candidate." + : "One or more non-empty lines from the existing foundation were removed or reordered.", + ), + ...requiredPatternChecks, + ]; +} + function extractSpecMajorSections(templateText) { const sections = []; const lines = templateText.split(/\r?\n/); @@ -650,10 +716,14 @@ async function runDeterministicChecks(skillRoot, validationContract, candidateDo const existingSpecPath = validationContract.existing_spec_file ? path.join(skillRoot, validationContract.existing_spec_file) : null; - const [templateText, languageText, existingSpecText] = await Promise.all([ + const existingFoundationPath = validationContract.existing_foundation_file + ? path.join(skillRoot, validationContract.existing_foundation_file) + : null; + const [templateText, languageText, existingSpecText, existingFoundationText] = await Promise.all([ validationContract.template_file ? loadText(templatePath) : Promise.resolve(""), languagePath ? loadText(languagePath) : Promise.resolve(""), existingSpecPath ? loadText(existingSpecPath) : Promise.resolve(""), + existingFoundationPath ? 
loadText(existingFoundationPath) : Promise.resolve(""), ]); let checks; @@ -661,6 +731,14 @@ async function runDeterministicChecks(skillRoot, validationContract, candidateDo case "foundation-v1": checks = validateFoundationDocument(candidateDocument, templateText, languageText); break; + case "foundation-update-v1": + checks = validateFoundationUpdateDocument( + candidateDocument, + existingFoundationText, + templateText, + validationContract, + ); + break; case "spec-v1": checks = validateSpecDocument(candidateDocument, templateText, languageText); break; @@ -715,6 +793,13 @@ async function runSingleTrial({ } } + if (runner.packet_type === "FoundationEvalPacket") { + brief.update_request = packet.task_prompt; + if (packet.existing_foundation) { + brief.existing_foundation = packet.existing_foundation; + } + } + const renderStartedAt = Date.now(); const candidateDocument = await b[renderFnName](brief); timing.render_ms = Date.now() - renderStartedAt; diff --git a/skills/foundation-creator/SKILL.md b/skills/foundation-creator/SKILL.md index 811bb59..88f7bad 100644 --- a/skills/foundation-creator/SKILL.md +++ b/skills/foundation-creator/SKILL.md @@ -39,9 +39,57 @@ Load on demand, not upfront. - Prefer explicit open questions over invented certainty. - Separate durable beliefs from speculative bets. - Avoid implementation detail unless the user explicitly wants it. +- When updating an existing foundation document, preserve section order and + untouched wording unless the request explicitly requires a rewrite. - Escalate to `spec-creator` only when a subsystem is concrete enough to deserve a `SPEC.md`. +## Decide: create or update + +Before writing: + +1. If the user provides an existing foundation document or asks to revise one, + use **update mode**. +2. Otherwise, use **create mode**. + +Do not treat a revise-in-place request as a greenfield rewrite. 
The distinction +matters because foundation documents often preserve useful ambiguity that +should not be flattened during an edit. + +## Create mode + +Gather only what is still missing from the notes: + +- what the primitive or company is +- what it is not +- durable thesis-level beliefs +- meaningful actors +- durable surfaces +- unresolved tensions or open questions + +Then draft the document using `references/template.md` and validate it against +`references/language.md`. + +## Update mode + +1. Read the existing foundation document fully. +2. Scope the requested change narrowly. +3. Edit in place: + - preserve the existing heading structure + - keep unchanged lines and bullets verbatim where possible + - add or revise only what the request requires + - do not add new sections, planning material, or stronger certainty unless + the request explicitly justifies it +4. Re-read and validate the whole document, not just the changed section. + +Typical update shapes: + +- add a sharper boundary +- add or remove a durable surface +- clarify a strategic bet without turning it into a recommendation +- add an open question that the earlier draft omitted +- tighten an overconfident sentence back into source-bound language + ## Allowed content - What the primitive is. 
@@ -78,6 +126,7 @@ Before finalizing, check for these failure modes: - business-model speculation - metrics or operational milestones not present in the source - missing explicit open questions where the notes remain unsettled +- update drift that rewrites unchanged material or quietly resolves ambiguity ## Current compiler surface diff --git a/skills/foundation-creator/baml_src/foundation_compiler/compiler_functions.baml b/skills/foundation-creator/baml_src/foundation_compiler/compiler_functions.baml index f793b6f..6a9f9a4 100644 --- a/skills/foundation-creator/baml_src/foundation_compiler/compiler_functions.baml +++ b/skills/foundation-creator/baml_src/foundation_compiler/compiler_functions.baml @@ -109,6 +109,7 @@ function RenderFoundationCreatorPrompt(brief: FoundationBrief) -> string { - Avoid implementation detail. - Emphasize thesis, boundaries, actor model, surfaces, and strategic bets. - State that the writer is a source-bound synthesizer, not a strategy consultant. + - If the brief includes `existing_foundation`, treat the task as update mode and preserve the existing document verbatim except for the requested edits. - Require markdown output with exactly this heading structure unless the user asks otherwise: - `# Foundation` - `## What This Is` @@ -125,6 +126,7 @@ function RenderFoundationCreatorPrompt(brief: FoundationBrief) -> string { - Require recently emerging or transitional surfaces to be qualified explicitly rather than flattened as fully settled. - Require `Open Questions` bullets to remain actual unresolved questions rather than disguised conclusions. - Forbid market-leadership or superiority claims unless they are explicit in the brief. + - In update mode, do not add new sections or broad new framing unless the request explicitly asks for them. - Make the prompt directly usable by an agent. 
Brief: diff --git a/skills/foundation-creator/baml_src/foundation_compiler/eval_runner.baml b/skills/foundation-creator/baml_src/foundation_compiler/eval_runner.baml index fb8fd23..edaa130 100644 --- a/skills/foundation-creator/baml_src/foundation_compiler/eval_runner.baml +++ b/skills/foundation-creator/baml_src/foundation_compiler/eval_runner.baml @@ -10,6 +10,9 @@ function CompileFoundationBriefFromPacket(packet: FoundationEvalPacket) -> Found Raw notes: {{ packet.raw_notes }} + Existing foundation: + {{ packet.existing_foundation }} + Expected criteria: {{ packet.expected_criteria }} @@ -18,6 +21,11 @@ function CompileFoundationBriefFromPacket(packet: FoundationEvalPacket) -> Found - Preserve ambiguity where the notes do not settle the framing. - Treat expected criteria as evaluation guidance, not as license to invent. - Avoid implementation detail. + - If `existing_foundation` is present, treat this as update mode: + - preserve the existing section structure + - carry forward unchanged framing and language where possible + - extract a narrow `update_request` rather than rewriting the entire document + - do not resolve existing ambiguity unless the task prompt explicitly asks for it - Do not infer monetization, metrics, org structure, GTM strategy, or operating plans unless the packet explicitly supports them. - If `strategic_bets` are included, phrase them as observed directional signals or evidence. - Each `strategic_bets` item should start with explicit hedge language such as `The notes suggest a bet on...`, `There are visible signals that...`, or `The company appears to be betting on...`. @@ -41,6 +49,7 @@ function RenderFoundationDocumentDraft(brief: FoundationBrief) -> string { - Start from thesis and boundaries, not architecture. - Preserve unresolved questions explicitly. - Avoid implementation detail. 
+ - If `existing_foundation` is present, treat this as update mode and let the update-mode rules override the create-mode template-completion rules below. - Stay source-bound: do not invent monetization, KPIs, org structure, partnerships, operating guidance, or next-step plans. - Prefer omission over plausible-sounding speculation. - Return markdown. @@ -60,6 +69,12 @@ function RenderFoundationDocumentDraft(brief: FoundationBrief) -> string { - When a surface is visible but still evolving in the packet, qualify it explicitly as emerging, evolving, or unsettled. - Write `Open Questions` as actual unresolved questions, typically ending with `?`. - Do not use market-leadership or competitive-superiority language unless the packet explicitly supports it. + - If `existing_foundation` is present, treat this as update mode: + - start from `existing_foundation` + - preserve all unchanged lines verbatim + - apply only the edits implied by `update_request` + - do not rewrite existing paragraphs or bullets unless the request requires it + - do not add new sections or broad new framing unless `update_request` explicitly asks for them Brief: {{ brief|format(type="yaml") }} @@ -85,6 +100,7 @@ function EvaluateFoundationDocument( - Grade against the expected criteria explicitly. - Reward preservation of uncertainty when the source packet is genuinely mixed. - Penalize invented certainty, invented capabilities, or implementation leakage. + - If `existing_foundation` is present, penalize rewriting unchanged content or adding broad new material outside the requested edit. - Penalize unsupported business-model, monetization, KPI, org, partnership, or operating-plan language. - Penalize consulting-style sections such as `Success Signals`, `Metrics`, `Decision Agenda`, `Next Steps`, `Operating Guidance`, or similar drift. - Penalize missing markdown heading structure if the document drifts from the required template shape. 
diff --git a/skills/foundation-creator/baml_src/foundation_compiler/eval_types.baml b/skills/foundation-creator/baml_src/foundation_compiler/eval_types.baml index 94058d1..b8ef525 100644 --- a/skills/foundation-creator/baml_src/foundation_compiler/eval_types.baml +++ b/skills/foundation-creator/baml_src/foundation_compiler/eval_types.baml @@ -22,4 +22,5 @@ class FoundationEvalPacket { task_prompt string @assert(nonempty_task_prompt, {{ this|length > 0 }}) raw_notes string @assert(nonempty_raw_notes, {{ this|length > 0 }}) expected_criteria string @assert(nonempty_expected_criteria, {{ this|length > 0 }}) + existing_foundation string? } diff --git a/skills/foundation-creator/baml_src/foundation_compiler/foundation_types.baml b/skills/foundation-creator/baml_src/foundation_compiler/foundation_types.baml index 002c42d..2d7e916 100644 --- a/skills/foundation-creator/baml_src/foundation_compiler/foundation_types.baml +++ b/skills/foundation-creator/baml_src/foundation_compiler/foundation_types.baml @@ -29,4 +29,6 @@ class FoundationBrief { surfaces string[] strategic_bets string[] unresolved_questions string[] + update_request string? + existing_foundation string? } diff --git a/skills/foundation-creator/evals/evals.json b/skills/foundation-creator/evals/evals.json index d8c6e80..198fdbe 100644 --- a/skills/foundation-creator/evals/evals.json +++ b/skills/foundation-creator/evals/evals.json @@ -105,6 +105,59 @@ "files": [ "fixtures/harbor_care/raw_notes.md" ] + }, + { + "id": 4, + "eval_name": "update-lightfast-foundation-boundary-surface-question", + "scenario_type": "update_existing_doc", + "input_shape": "existing_doc_update", + "ambiguity_level": "low", + "domain_profile": "company_foundation", + "primary_risks": [ + "update_regression", + "template_drift", + "invented_certainty" + ], + "prompt": "We have an existing Lightfast foundation document. 
Please update it in place: add a boundary clarifying that Lightfast is not a hosted control plane or system of record by default, add a durable surface bullet covering versioned evaluation artifacts and run histories, and add an open question about whether repo-native usage remains the center of gravity or shifts toward a hosted system of record. Keep everything else unchanged.", + "expected_output": "The existing Lightfast foundation document is preserved verbatim except for three requested additions: (1) a new boundary clarifying that Lightfast is not a hosted control plane or system of record by default; (2) a new durable surface bullet about versioned evaluation artifacts and run histories; and (3) a new open question about repo-native versus hosted center of gravity. No new sections, planning drift, or first-person language are introduced.", + "expected_file": "fixtures/lightfast_foundation_update/expected_criteria.md", + "validation_contract": { + "type": "reference_document_checks", + "validator": "foundation-update-v1", + "template_file": "references/template.md", + "language_file": "references/language.md", + "existing_foundation_file": "evals/fixtures/existing_foundation.md", + "required_patterns": [ + { + "id": "hosted_control_plane_boundary_present", + "pattern": "not a hosted control plane or system of record", + "flags": "i", + "details_pass": "Detected the requested hosted-control-plane/system-of-record boundary.", + "details_fail": "Did not detect the requested hosted-control-plane/system-of-record boundary." + }, + { + "id": "evaluation_artifacts_surface_present", + "pattern": "versioned evaluation artifacts and run histor", + "flags": "i", + "details_pass": "Detected the requested durable-surface bullet about evaluation artifacts and run histories.", + "details_fail": "Did not detect the requested durable-surface bullet about evaluation artifacts and run histories." 
+ }, + { + "id": "repo_native_vs_hosted_question_present", + "pattern": "repo-native usage remain[s]? the center of gravity.*hosted system of record", + "flags": "i", + "details_pass": "Detected the requested open question about repo-native versus hosted center of gravity.", + "details_fail": "Did not detect the requested open question about repo-native versus hosted center of gravity." + } + ] + }, + "packet_files": { + "expected_criteria": "fixtures/lightfast_foundation_update/expected_criteria.md", + "existing_foundation": "fixtures/existing_foundation.md" + }, + "files": [ + "fixtures/existing_foundation.md" + ] } ] } diff --git a/skills/foundation-creator/evals/fixtures/existing_foundation.md b/skills/foundation-creator/evals/fixtures/existing_foundation.md new file mode 100644 index 0000000..e1bd1b1 --- /dev/null +++ b/skills/foundation-creator/evals/fixtures/existing_foundation.md @@ -0,0 +1,56 @@ +# Lightfast Foundation + +## What This Is +Lightfast is a durable-artifact layer for agent work. It turns messy direction +into inspectable, versioned artifacts that structure human judgment across +authoring, review, and execution. + +## Core Thesis +- Durable artifacts such as foundation documents, specs, typed contracts, and + reusable skill packages survive model churn better than prompts or + transcripts. +- The core primitive is the translation of ambiguity into stable constraints + that humans and agents can inspect and reuse. +- Repo-native and artifact-native workflows are a strong default because + outputs can be versioned, reviewed, and edited directly. +- Coding workflows are a plausible first wedge, but the product may extend to + other repeatable work with heavy ambiguity. + +## Boundaries +- Not a chat wrapper, prompt library, or general AI assistant. +- Not an agency delivering bespoke strategy or operational work. +- Not project management, ticketing, or a drag-and-drop workflow builder. 
+- Not a fully autonomous company-in-a-box that replaces human ownership. + +## Actor Model +- Solo builders use reusable skill packages and local-first artifact workflows. +- Founders and operators use foundation documents and specs to align intent + while preserving decision ownership. +- Product, engineering, and operations teams use typed contracts and repeatable + artifacts to review and distribute working knowledge. + +## Durable Surfaces +- Foundation documents and templates that capture durable intent and scope. +- Specs that turn thesis-level framing into reviewable behavioral documents. +- Reusable skill packages that make operational knowledge installable. +- Typed contracts that make expectations explicit across humans, agents, and + code. +- Repo-native files and reference artifacts that support versioned local + editing and review. + +## Strategic Bets +- The notes suggest a bet on skills and coding workflows as an initial wedge + because evaluation loops are tighter there. +- There are visible signals that repo-native artifacts are a preferred + stability and inspection boundary. +- The company appears to be betting that formal constraints and reusable + artifacts will outlast raw prompts and transcripts. + +## Open Questions +- Is the main primitive better described as durable instructions for agents or + as a broader interface layer between humans, models, repositories, and + outputs? +- How much of the long-term value comes from authoring artifacts versus + distributing them? +- Will the company remain coding-first, or extend meaningfully into research, + operations, support, and GTM work? 
diff --git a/skills/foundation-creator/evals/fixtures/lightfast_foundation_update/expected_criteria.md b/skills/foundation-creator/evals/fixtures/lightfast_foundation_update/expected_criteria.md new file mode 100644 index 0000000..d167d07 --- /dev/null +++ b/skills/foundation-creator/evals/fixtures/lightfast_foundation_update/expected_criteria.md @@ -0,0 +1,13 @@ +# Expected Criteria + +- The output should preserve the existing Lightfast foundation document's title, + section order, and unchanged wording instead of rewriting it from scratch. +- The output should add one new `Boundaries` bullet clarifying that Lightfast + is not a hosted control plane or system of record by default. +- The output should add one new `Durable Surfaces` bullet covering versioned + evaluation artifacts and run histories. +- The output should add one new `Open Questions` bullet asking whether + repo-native usage remains the center of gravity or shifts toward a hosted + system of record. +- The output should not add new sections, planning language, monetization + claims, or stronger certainty than the request supports. diff --git a/skills/foundation-creator/references/language.md b/skills/foundation-creator/references/language.md index 49a4370..60decb0 100644 --- a/skills/foundation-creator/references/language.md +++ b/skills/foundation-creator/references/language.md @@ -21,6 +21,8 @@ How the foundation document should be worded. - Use `#` for the document title: `# Foundation`. - Use `##` for every major section in `references/template.md`. - Keep the section order exactly as the template defines it. +- In update mode, preserve the existing section order and keep unchanged wording + intact unless the requested edit requires a local rewrite. - Use bullets for `Core Thesis`, `Boundaries`, `Actor Model`, `Durable Surfaces`, `Strategic Bets`, and `Open Questions`. 
- `Open Questions` bullets should be written as actual open questions and usually From a5e56f3ce2002bd0da2bc7f4c6fd76fbb9ece637 Mon Sep 17 00:00:00 2001 From: Jeevan Pillay <169354619+jeevanpillay@users.noreply.github.com> Date: Tue, 21 Apr 2026 12:35:30 +1000 Subject: [PATCH 07/30] Add replacement-heavy foundation update eval --- README.md | 2 + evals/TAXONOMY.md | 2 - scripts/run-baml-eval.mjs | 112 +++++++++++++--- .../compiler_functions.baml | 2 + .../foundation_compiler/eval_runner.baml | 4 +- skills/foundation-creator/evals/evals.json | 120 ++++++++++++++++++ .../existing_foundation_overconfident.md | 47 +++++++ .../expected_criteria.md | 21 +++ 8 files changed, 291 insertions(+), 19 deletions(-) create mode 100644 skills/foundation-creator/evals/fixtures/existing_foundation_overconfident.md create mode 100644 skills/foundation-creator/evals/fixtures/lightfast_foundation_tighten/expected_criteria.md diff --git a/README.md b/README.md index 9381b4c..b0c6ecc 100644 --- a/README.md +++ b/README.md @@ -30,6 +30,7 @@ bun install bun run eval:foundation -- create-foundation-from-vercel-source-packet bun run eval:foundation -- create-foundation-from-lightfast-founder-notes bun run eval:foundation -- update-lightfast-foundation-boundary-surface-question +bun run eval:foundation -- update-lightfast-foundation-tighten-overreach bun run eval:spec -- create-from-vercel-mcp-source-packet bun run with-env -- bun run ./scripts/run-baml-eval.mjs foundation-creator create-foundation-from-cloudflare-source-packet --trials 3 ``` @@ -44,6 +45,7 @@ Current `foundation-creator` corpus includes: - `create-foundation-from-lightfast-founder-notes` - `create-foundation-from-harbor-care-source-packet` - `update-lightfast-foundation-boundary-surface-question` +- `update-lightfast-foundation-tighten-overreach` The runner now also writes: diff --git a/evals/TAXONOMY.md b/evals/TAXONOMY.md index 7cb33b3..b7fecdc 100644 --- a/evals/TAXONOMY.md +++ b/evals/TAXONOMY.md @@ -74,8 +74,6 @@ 
values: The next missing slices are: -- a second `update_existing_doc` packet for `foundation-creator` that requires - replacing or tightening existing language rather than additive edits only - baseline comparison runs (`current skill` vs `previous skill` / `no skill`) when the local harness is ready to compare deltas directly - optional Braintrust-style scorer/export integration if local JSON artifacts are diff --git a/scripts/run-baml-eval.mjs b/scripts/run-baml-eval.mjs index 4f01658..4bd5b1d 100644 --- a/scripts/run-baml-eval.mjs +++ b/scripts/run-baml-eval.mjs @@ -266,6 +266,38 @@ function createCheck(id, passed, details) { return { id, passed, details }; } +function compilePatternSpec(patternSpec) { + if (typeof patternSpec === "string") { + return new RegExp(patternSpec, "i"); + } + + return new RegExp(patternSpec.pattern, patternSpec.flags ?? "i"); +} + +function filterLinesByPatternSpecs(lines, patternSpecs = []) { + if (patternSpecs.length === 0) { + return lines; + } + + return lines.filter((line) => + !patternSpecs.some((patternSpec) => compilePatternSpec(patternSpec).test(line)), + ); +} + +function createPatternChecks(normalizedCandidate, patternChecks = [], expectedPresence = true) { + return patternChecks.map((patternCheck) => { + const expression = compilePatternSpec(patternCheck); + const matched = expression.test(normalizedCandidate); + const passed = expectedPresence ? matched : !matched; + + return createCheck( + patternCheck.id, + passed, + passed ? 
patternCheck.details_pass : patternCheck.details_fail, + ); + }); +} + function extractFoundationTemplateSections(templateText) { const sections = []; const lines = templateText.split(/\r?\n/); @@ -364,6 +396,49 @@ function extractMarkdownBullets(sectionBody) { return bullets; } +function extractNormalizedMarkdownBlocks(text) { + const blocks = []; + let current = null; + + function flush() { + if (current && normalizeLine(current).length > 0) { + blocks.push(normalizeLine(current)); + } + current = null; + } + + for (const rawLine of text.split(/\r?\n/)) { + const trimmed = rawLine.trim(); + + if (trimmed.length === 0) { + flush(); + continue; + } + + if (/^\s*#+\s+/.test(rawLine)) { + flush(); + blocks.push(normalizeLine(rawLine)); + continue; + } + + if (/^\s*-\s+/.test(rawLine)) { + flush(); + current = rawLine.replace(/^\s*-\s+/, "- "); + continue; + } + + if (current) { + current = `${current} ${trimmed}`.trim(); + continue; + } + + current = trimmed; + } + + flush(); + return blocks; +} + function validateFoundationDocument(candidateDocument, templateText) { const requiredSections = extractFoundationTemplateSections(templateText); const disallowedHeadings = extractFoundationDisallowedHeadings(templateText); @@ -433,7 +508,7 @@ function validateFoundationDocument(candidateDocument, templateText) { strategicBetLines.every((line) => /^Bet:/i.test(line) ? 
strategicBetsBodyHasDirectionalPreamble - : /\b(appears?|suggests?|signals?|signaling|indicates?|indicating|directional bet|directional bets|observed bet|observed bets|a bet that|bet that|bet on)\b/i.test( + : /\b(appears?|suggests?|signals?|signaling|indicates?|indicating|indications?|directional bet|directional bets|observed bet|observed bets|a bet that|bet that|bet on)\b/i.test( line, ), ); @@ -507,20 +582,24 @@ function validateFoundationUpdateDocument( validationContract, ) { const baseChecks = validateFoundationDocument(candidateDocument, templateText); - const existingLines = extractNonEmptyNormalizedLines(existingFoundationText); - const candidateLines = extractNonEmptyNormalizedLines(candidateDocument); + const existingBlocks = extractNormalizedMarkdownBlocks(existingFoundationText); + const candidateBlocks = extractNormalizedMarkdownBlocks(candidateDocument); const normalizedCandidate = normalizeLine(candidateDocument); - const preservesExistingContent = linesAppearInOrder(existingLines, candidateLines); - const requiredPatternChecks = (validationContract.required_patterns ?? []).map((patternCheck) => { - const expression = new RegExp(patternCheck.pattern, patternCheck.flags ?? "i"); - const passed = expression.test(normalizedCandidate); - - return createCheck( - patternCheck.id, - passed, - passed ? patternCheck.details_pass : patternCheck.details_fail, - ); - }); + const preservedExistingBlocks = filterLinesByPatternSpecs( + existingBlocks, + validationContract.allowed_removed_patterns ?? [], + ); + const preservesExistingContent = linesAppearInOrder(preservedExistingBlocks, candidateBlocks); + const requiredPatternChecks = createPatternChecks( + normalizedCandidate, + validationContract.required_patterns ?? [], + true, + ); + const forbiddenPatternChecks = createPatternChecks( + normalizedCandidate, + validationContract.forbidden_patterns ?? 
[], + false, + ); return [ ...baseChecks, @@ -528,10 +607,11 @@ function validateFoundationUpdateDocument( "existing_content_preserved_in_order", preservesExistingContent, preservesExistingContent - ? "All non-empty lines from the existing foundation appear in order in the candidate." - : "One or more non-empty lines from the existing foundation were removed or reordered.", + ? "All existing markdown blocks appear in order in the candidate, except blocks explicitly marked as replaceable." + : "One or more existing markdown blocks were removed or reordered outside the explicitly replaceable blocks.", ), ...requiredPatternChecks, + ...forbiddenPatternChecks, ]; } diff --git a/skills/foundation-creator/baml_src/foundation_compiler/compiler_functions.baml b/skills/foundation-creator/baml_src/foundation_compiler/compiler_functions.baml index 6a9f9a4..2a1d340 100644 --- a/skills/foundation-creator/baml_src/foundation_compiler/compiler_functions.baml +++ b/skills/foundation-creator/baml_src/foundation_compiler/compiler_functions.baml @@ -110,6 +110,7 @@ function RenderFoundationCreatorPrompt(brief: FoundationBrief) -> string { - Emphasize thesis, boundaries, actor model, surfaces, and strategic bets. - State that the writer is a source-bound synthesizer, not a strategy consultant. - If the brief includes `existing_foundation`, treat the task as update mode and preserve the existing document verbatim except for the requested edits. + - In update mode, tell the writer to copy every unchanged bullet and question verbatim instead of paraphrasing it. - Require markdown output with exactly this heading structure unless the user asks otherwise: - `# Foundation` - `## What This Is` @@ -127,6 +128,7 @@ function RenderFoundationCreatorPrompt(brief: FoundationBrief) -> string { - Require `Open Questions` bullets to remain actual unresolved questions rather than disguised conclusions. - Forbid market-leadership or superiority claims unless they are explicit in the brief. 
- In update mode, do not add new sections or broad new framing unless the request explicitly asks for them. + - In update mode, if a boundary, actor, surface, strategic bet, or open question is not part of the requested edit, preserve its wording exactly. - Make the prompt directly usable by an agent. Brief: diff --git a/skills/foundation-creator/baml_src/foundation_compiler/eval_runner.baml b/skills/foundation-creator/baml_src/foundation_compiler/eval_runner.baml index edaa130..3407e0d 100644 --- a/skills/foundation-creator/baml_src/foundation_compiler/eval_runner.baml +++ b/skills/foundation-creator/baml_src/foundation_compiler/eval_runner.baml @@ -23,9 +23,10 @@ function CompileFoundationBriefFromPacket(packet: FoundationEvalPacket) -> Found - Avoid implementation detail. - If `existing_foundation` is present, treat this as update mode: - preserve the existing section structure - - carry forward unchanged framing and language where possible + - carry forward unchanged framing and language verbatim whenever possible - extract a narrow `update_request` rather than rewriting the entire document - do not resolve existing ambiguity unless the task prompt explicitly asks for it + - if a boundary, actor, surface, strategic bet, or open question is not explicitly targeted by the task prompt, keep its wording unchanged in the brief - Do not infer monetization, metrics, org structure, GTM strategy, or operating plans unless the packet explicitly supports them. - If `strategic_bets` are included, phrase them as observed directional signals or evidence. - Each `strategic_bets` item should start with explicit hedge language such as `The notes suggest a bet on...`, `There are visible signals that...`, or `The company appears to be betting on...`. 
@@ -72,6 +73,7 @@ function RenderFoundationDocumentDraft(brief: FoundationBrief) -> string { - If `existing_foundation` is present, treat this as update mode: - start from `existing_foundation` - preserve all unchanged lines verbatim + - preserve unchanged bullet wording and unchanged open questions exactly as written - apply only the edits implied by `update_request` - do not rewrite existing paragraphs or bullets unless the request requires it - do not add new sections or broad new framing unless `update_request` explicitly asks for them diff --git a/skills/foundation-creator/evals/evals.json b/skills/foundation-creator/evals/evals.json index 198fdbe..1c7935c 100644 --- a/skills/foundation-creator/evals/evals.json +++ b/skills/foundation-creator/evals/evals.json @@ -158,6 +158,126 @@ "files": [ "fixtures/existing_foundation.md" ] + }, + { + "id": 5, + "eval_name": "update-lightfast-foundation-tighten-overreach", + "scenario_type": "update_existing_doc", + "input_shape": "existing_doc_update", + "ambiguity_level": "medium", + "domain_profile": "company_foundation", + "primary_risks": [ + "update_regression", + "invented_certainty", + "weak_boundaries" + ], + "prompt": "We have an older Lightfast foundation draft that overstates a few conclusions. 
Update it in place: replace the `operating system for durable agent work` framing with narrower source-bound language about a durable artifact or constraint layer for agent work; replace the thesis line about `replacing ambiguous human direction with machine-executable constraints` so it instead frames Lightfast as structuring human judgment through stable constraints; remove settled claims that repo-native is the clear long-term center of gravity or that hosted control planes matter less, and replace them with an explicit open question about repo-native usage versus a hosted control plane, catalog, or distribution surface; rewrite the wedge language so coding workflows are the easiest first wedge because eval loops are tighter, without implying coding is the permanent center of gravity; and soften any remaining categorical `Strategic Bets` language into directional, source-bound signals. Keep the existing heading structure and preserve everything else unless one of these local rewrites requires adjacent wording changes.", + "expected_output": "The older Lightfast foundation document is updated in place: the overreaching `operating system` framing is narrowed, the primitive is reframed around structuring human judgment rather than replacing it, settled repo-native superiority claims are removed and restored as an open tension, the wedge language now says coding workflows are the easiest first wedge because eval loops are tighter without implying permanent scope, and any remaining categorical strategic-bet language is softened into directional evidence. 
Unchanged content stays in place and no new planning or strategy sections are added.", + "expected_file": "fixtures/lightfast_foundation_tighten/expected_criteria.md", + "validation_contract": { + "type": "reference_document_checks", + "validator": "foundation-update-v1", + "template_file": "references/template.md", + "language_file": "references/language.md", + "existing_foundation_file": "evals/fixtures/existing_foundation_overconfident.md", + "allowed_removed_patterns": [ + { + "pattern": "^Lightfast is an operating system for durable agent work\\.$", + "flags": "i" + }, + { + "pattern": "^- The core primitive is replacing ambiguous human direction with machine-executable constraints\\.$", + "flags": "i" + }, + { + "pattern": "^- Repo-native workflows are the clear long-term center of gravity\\.$", + "flags": "i" + }, + { + "pattern": "^- Coding is the product center of gravity, not just the initial wedge\\.$", + "flags": "i" + }, + { + "pattern": "^- Skills are the wedge and will remain the product center of gravity\\.$", + "flags": "i" + }, + { + "pattern": "^- Repo-native distribution matters more than any hosted control plane\\.$", + "flags": "i" + }, + { + "pattern": "^- Formal constraints and reusable artifacts will outlast raw prompts and transcripts\\.$", + "flags": "i" + } + ], + "required_patterns": [ + { + "id": "narrower_artifact_layer_framing_present", + "pattern": "source-bound.*durable artifact and constraint layer|source-bound.*durable artifacts? and constraints?|durable artifact and constraint layer.*agent(?:-enabled)? work", + "flags": "i", + "details_pass": "Detected the narrower durable-artifact/constraint-layer framing in `What This Is`.", + "details_fail": "Did not detect the requested narrower durable-artifact/constraint-layer framing." + }, + { + "id": "human_judgment_thesis_present", + "pattern": "structur(?:e|ing) human judgment.*stable(?:,? [a-z-]+)? constraints?|stable(?:,? [a-z-]+)? 
constraints?.*structur(?:e|ing) human judgment", + "flags": "i", + "details_pass": "Detected the requested thesis language about structuring human judgment through stable constraints.", + "details_fail": "Did not detect the requested thesis language about structuring human judgment through stable constraints." + }, + { + "id": "coding_first_wedge_is_qualified", + "pattern": "coding workflows.*easiest (?:first|initial) wedge because eval(?:uation)? loops are tighter", + "flags": "i", + "details_pass": "Detected the requested qualified coding-first wedge language.", + "details_fail": "Did not detect the requested qualified coding-first wedge language." + }, + { + "id": "repo_native_vs_hosted_tension_restored", + "pattern": "repo-native.*hosted (?:control plane|catalog|distribution surface)", + "flags": "i", + "details_pass": "Detected the requested repo-native versus hosted tension.", + "details_fail": "Did not detect the requested repo-native versus hosted tension." + } + ], + "forbidden_patterns": [ + { + "id": "operating_system_overreach_removed", + "pattern": "operating system for durable agent work", + "flags": "i", + "details_pass": "Did not detect the overreaching `operating system` framing.", + "details_fail": "Still detected the overreaching `operating system` framing." + }, + { + "id": "machine_executable_constraint_line_removed", + "pattern": "replacing ambiguous human direction with machine-executable constraints", + "flags": "i", + "details_pass": "Did not detect the old thesis language about replacing ambiguous human direction.", + "details_fail": "Still detected the old thesis language about replacing ambiguous human direction." 
+ }, + { + "id": "repo_native_winner_claim_removed", + "pattern": "repo-native workflows are the clear long-term center of gravity|repo-native distribution matters more than any hosted control plane", + "flags": "i", + "details_pass": "Did not detect the old settled repo-native winner claims.", + "details_fail": "Still detected a settled repo-native winner claim that should have been removed." + }, + { + "id": "permanent_wedge_claim_removed", + "pattern": "coding is the product center of gravity, not just the initial wedge|skills are the wedge and will remain the product center of gravity", + "flags": "i", + "details_pass": "Did not detect the old permanent-center-of-gravity wedge claims.", + "details_fail": "Still detected a permanent-center-of-gravity wedge claim that should have been removed." + } + ] + }, + "packet_files": { + "expected_criteria": "fixtures/lightfast_foundation_tighten/expected_criteria.md", + "existing_foundation": "fixtures/existing_foundation_overconfident.md" + }, + "files": [ + "fixtures/existing_foundation_overconfident.md" + ] } ] } diff --git a/skills/foundation-creator/evals/fixtures/existing_foundation_overconfident.md b/skills/foundation-creator/evals/fixtures/existing_foundation_overconfident.md new file mode 100644 index 0000000..8ad7f78 --- /dev/null +++ b/skills/foundation-creator/evals/fixtures/existing_foundation_overconfident.md @@ -0,0 +1,47 @@ +# Lightfast Foundation + +## What This Is +Lightfast is an operating system for durable agent work. + +## Core Thesis +- Durable artifacts such as foundation documents, specs, typed contracts, and + reusable skill packages survive model churn better than prompts or + transcripts. +- The core primitive is replacing ambiguous human direction with + machine-executable constraints. +- Repo-native workflows are the clear long-term center of gravity. +- Coding is the product center of gravity, not just the initial wedge. 
+ +## Boundaries +- Not a chat wrapper, prompt library, or general AI assistant. +- Not an agency delivering bespoke strategy or operational work. +- Not project management, ticketing, or a drag-and-drop workflow builder. +- Not a fully autonomous company-in-a-box that replaces human ownership. + +## Actor Model +- Solo builders use reusable skill packages and local-first artifact workflows. +- Founders and operators use foundation documents and specs to align intent + while preserving decision ownership. +- Product, engineering, and operations teams use typed contracts and repeatable + artifacts to review and distribute working knowledge. + +## Durable Surfaces +- Foundation documents and templates that capture durable intent and scope. +- Specs that turn thesis-level framing into reviewable behavioral documents. +- Reusable skill packages that make operational knowledge installable. +- Typed contracts that make expectations explicit across humans, agents, and + code. +- Repo-native files and reference artifacts that support versioned local + editing and review. + +## Strategic Bets +- Skills are the wedge and will remain the product center of gravity. +- Repo-native distribution matters more than any hosted control plane. +- Formal constraints and reusable artifacts will outlast raw prompts and + transcripts. + +## Open Questions +- How much of the long-term value comes from authoring artifacts versus + distributing them? +- Will the company extend meaningfully beyond coding into operations, support, + and GTM work? 
diff --git a/skills/foundation-creator/evals/fixtures/lightfast_foundation_tighten/expected_criteria.md b/skills/foundation-creator/evals/fixtures/lightfast_foundation_tighten/expected_criteria.md new file mode 100644 index 0000000..a5ebabb --- /dev/null +++ b/skills/foundation-creator/evals/fixtures/lightfast_foundation_tighten/expected_criteria.md @@ -0,0 +1,21 @@ +# Expected Criteria + +- The update should keep the existing Lightfast foundation structure and retain + unchanged lines in place. +- The `What This Is` framing should be narrowed from `operating system for + durable agent work` to a more source-bound durable artifact or constraint + layer framing. +- The thesis should no longer say Lightfast replaces ambiguous human direction + with machine-executable constraints; it should instead say the system + structures human judgment through stable constraints or inspectable artifacts. +- The document should no longer claim repo-native workflows or distribution are + already the clear long-term winner. That tension should be restored as an + open question about repo-native usage versus a hosted control plane, catalog, + or distribution surface. +- The wedge language should be tightened so coding workflows are framed as the + easiest first wedge because eval loops are tighter, without implying coding + is the permanent center of gravity. +- Remaining categorical `Strategic Bets` language should be softened into + directional, source-bound signals rather than settled conclusions. +- The update should not introduce new sections, roadmap content, or invented + certainty beyond the requested rewrites. 
From c7710303d58acb56aa57fb080865a527cfaa3d3a Mon Sep 17 00:00:00 2001 From: Jeevan Pillay <169354619+jeevanpillay@users.noreply.github.com> Date: Tue, 21 Apr 2026 13:50:15 +1000 Subject: [PATCH 08/30] Add baseline comparison eval harness --- README.md | 15 + evals/TAXONOMY.md | 10 +- scripts/run-baml-eval.mjs | 497 ++++++++++++++++-- .../foundation_compiler/eval_runner.baml | 90 ++++ skills/foundation-creator/evals/evals.json | 2 +- 5 files changed, 562 insertions(+), 52 deletions(-) create mode 100644 skills/foundation-creator/eval_profiles/no-skill/baml_src/foundation_compiler/eval_runner.baml diff --git a/README.md b/README.md index b0c6ecc..d1208ad 100644 --- a/README.md +++ b/README.md @@ -33,6 +33,7 @@ bun run eval:foundation -- update-lightfast-foundation-boundary-surface-question bun run eval:foundation -- update-lightfast-foundation-tighten-overreach bun run eval:spec -- create-from-vercel-mcp-source-packet bun run with-env -- bun run ./scripts/run-baml-eval.mjs foundation-creator create-foundation-from-cloudflare-source-packet --trials 3 +bun run with-env -- bun run ./scripts/run-baml-eval.mjs foundation-creator update-lightfast-foundation-tighten-overreach --compare previous,profile:no-skill ``` Each run writes packet, brief, candidate document, and evaluation report @@ -56,6 +57,20 @@ The runner now also writes: - `benchmark.json` — aggregated status counts and timing summaries across all trials +When `--compare` is used, the run directory also includes: + +- `comparison.json` — head-to-head summary across variants, all judged by the + current skill's evaluator +- `variants/