From 5bbcf8eb531e1c2d90756fa454953f62ff1bec73 Mon Sep 17 00:00:00 2001 From: Jeevan Pillay <169354619+jeevanpillay@users.noreply.github.com> Date: Mon, 20 Apr 2026 14:02:06 +1000 Subject: [PATCH 01/30] add foundation/spec compiler eval scaffolding --- .gitignore | 13 +- README.md | 16 ++ package-lock.json | 188 ++++++++++++++++ package.json | 15 ++ scripts/run-baml-eval.mjs | 203 ++++++++++++++++++ skills/foundation-creator/SKILL.md | 41 ++++ .../foundation-creator/baml_src/clients.baml | 10 + .../foundation_compiler/common_types.baml | 20 ++ .../compiler_functions.baml | 99 +++++++++ .../foundation_compiler/eval_runner.baml | 63 ++++++ .../foundation_compiler/eval_types.baml | 25 +++ .../foundation_compiler/foundation_types.baml | 31 +++ .../baml_src/generators.baml | 6 + skills/foundation-creator/evals/evals.json | 26 +++ .../fixtures/vercel/expected_criteria.md | 20 ++ .../evals/fixtures/vercel/raw_notes.md | 111 ++++++++++ skills/spec-creator/baml_src/clients.baml | 10 + skills/spec-creator/baml_src/generators.baml | 6 + .../baml_src/spec_compiler/common_types.baml | 20 ++ .../spec_compiler/compiler_functions.baml | 78 +++++++ .../baml_src/spec_compiler/eval_runner.baml | 67 ++++++ .../baml_src/spec_compiler/eval_types.baml | 26 +++ .../baml_src/spec_compiler/spec_types.baml | 37 ++++ skills/spec-creator/evals/evals.json | 19 ++ .../fixtures/vercel_mcp/expected_criteria.md | 16 ++ .../evals/fixtures/vercel_mcp/raw_notes.md | 62 ++++++ 26 files changed, 1222 insertions(+), 6 deletions(-) create mode 100644 package-lock.json create mode 100644 package.json create mode 100644 scripts/run-baml-eval.mjs create mode 100644 skills/foundation-creator/SKILL.md create mode 100644 skills/foundation-creator/baml_src/clients.baml create mode 100644 skills/foundation-creator/baml_src/foundation_compiler/common_types.baml create mode 100644 skills/foundation-creator/baml_src/foundation_compiler/compiler_functions.baml create mode 100644 
skills/foundation-creator/baml_src/foundation_compiler/eval_runner.baml create mode 100644 skills/foundation-creator/baml_src/foundation_compiler/eval_types.baml create mode 100644 skills/foundation-creator/baml_src/foundation_compiler/foundation_types.baml create mode 100644 skills/foundation-creator/baml_src/generators.baml create mode 100644 skills/foundation-creator/evals/evals.json create mode 100644 skills/foundation-creator/evals/fixtures/vercel/expected_criteria.md create mode 100644 skills/foundation-creator/evals/fixtures/vercel/raw_notes.md create mode 100644 skills/spec-creator/baml_src/clients.baml create mode 100644 skills/spec-creator/baml_src/generators.baml create mode 100644 skills/spec-creator/baml_src/spec_compiler/common_types.baml create mode 100644 skills/spec-creator/baml_src/spec_compiler/compiler_functions.baml create mode 100644 skills/spec-creator/baml_src/spec_compiler/eval_runner.baml create mode 100644 skills/spec-creator/baml_src/spec_compiler/eval_types.baml create mode 100644 skills/spec-creator/baml_src/spec_compiler/spec_types.baml create mode 100644 skills/spec-creator/evals/fixtures/vercel_mcp/expected_criteria.md create mode 100644 skills/spec-creator/evals/fixtures/vercel_mcp/raw_notes.md diff --git a/.gitignore b/.gitignore index 92c1e04..a7f1d20 100644 --- a/.gitignore +++ b/.gitignore @@ -1,6 +1,7 @@ -.DS_Store -__pycache__/ -*.py[cod] -.venv/ -.idea/ -.vscode/ +node_modules/ +skills/foundation-creator/baml_client/ +skills/foundation-creator/baml_client_dist/ +skills/spec-creator/baml_client/ +skills/spec-creator/baml_client_dist/ +skills/foundation-creator/evals/runs/ +skills/spec-creator/evals/runs/ diff --git a/README.md b/README.md index 1c9a33a..7af46e7 100644 --- a/README.md +++ b/README.md @@ -6,6 +6,7 @@ Agent skills published by Lightfast. Compatible with [Claude Code](https://docs. 
| Skill | Purpose | |---|---| +| [`foundation-creator`](skills/foundation-creator/) | Draft a top-level foundation document for a product or company primitive: thesis, mission, boundaries, actor model, surfaces, strategic bets, and open questions. | | [`spec-creator`](skills/spec-creator/) | Write and update a top-level `SPEC.md` service specification following a strict template and language guide. | ## Install @@ -13,11 +14,26 @@ Agent skills published by Lightfast. Compatible with [Claude Code](https://docs. Each skill is a subdirectory under `skills/`. To install one into a project: ```bash +npx skills add lightfastai/skills --skill foundation-creator npx skills add lightfastai/skills --skill spec-creator ``` Or copy the directory directly into `.claude/skills/` in your project. +## Local evals + +This repo now includes BAML-backed fixture evals for `foundation-creator` and +`spec-creator`. + +```bash +npm install +OPENAI_API_KEY=... npm run eval:foundation -- create-foundation-from-vercel-source-packet +OPENAI_API_KEY=... npm run eval:spec -- create-from-vercel-mcp-source-packet +``` + +Each run writes packet, brief, candidate document, and evaluation report +artifacts under `skills/<skill>/evals/runs/`.
+ ## License MIT diff --git a/package-lock.json b/package-lock.json new file mode 100644 index 0000000..7889375 --- /dev/null +++ b/package-lock.json @@ -0,0 +1,188 @@ +{ + "name": "@lightfastai/skills", + "lockfileVersion": 3, + "requires": true, + "packages": { + "": { + "name": "@lightfastai/skills", + "dependencies": { + "@boundaryml/baml": "0.221.0", + "typescript": "5.9.3" + } + }, + "node_modules/@boundaryml/baml": { + "version": "0.221.0", + "resolved": "https://registry.npmjs.org/@boundaryml/baml/-/baml-0.221.0.tgz", + "integrity": "sha512-pPOp2JVsG4Wa/tMLnJv/rxil5jsuVDgxnA0xO0h4lKy7t/fKCXOVvO+nzpOZ4byLTP/Ow+8pVvoKRKvx1J/Hsw==", + "license": "MIT", + "dependencies": { + "@scarf/scarf": "^1.3.0" + }, + "bin": { + "baml": "cli.js", + "baml-cli": "cli.js" + }, + "engines": { + "node": ">= 10" + }, + "optionalDependencies": { + "@boundaryml/baml-darwin-arm64": "0.221.0", + "@boundaryml/baml-darwin-x64": "0.221.0", + "@boundaryml/baml-linux-arm64-gnu": "0.221.0", + "@boundaryml/baml-linux-arm64-musl": "0.221.0", + "@boundaryml/baml-linux-x64-gnu": "0.221.0", + "@boundaryml/baml-linux-x64-musl": "0.221.0", + "@boundaryml/baml-win32-arm64-msvc": "0.221.0", + "@boundaryml/baml-win32-x64-msvc": "0.221.0" + } + }, + "node_modules/@boundaryml/baml-darwin-arm64": { + "version": "0.221.0", + "resolved": "https://registry.npmjs.org/@boundaryml/baml-darwin-arm64/-/baml-darwin-arm64-0.221.0.tgz", + "integrity": "sha512-GxqdjVUodyKtgKX/CIDGZyz5lXS0d0iFnV2x7thMQM9ziMrOPcWd3qwflOLYdgDo6Hy9yMULrqtMPkCrmbwEHQ==", + "cpu": [ + "arm64" + ], + "license": "MIT", + "optional": true, + "os": [ + "darwin" + ], + "engines": { + "node": ">= 10" + } + }, + "node_modules/@boundaryml/baml-darwin-x64": { + "version": "0.221.0", + "resolved": "https://registry.npmjs.org/@boundaryml/baml-darwin-x64/-/baml-darwin-x64-0.221.0.tgz", + "integrity": "sha512-wG3jsgOIr8C+09j0AFZY4F8EHvd1gKoKw6+HR1Oi+cw4pijklCk2LI0AIwMPzgG12BAxWV6jEIONMORmspesFQ==", + "cpu": [ + "x64" + ], + "license": "MIT", + 
"optional": true, + "os": [ + "darwin" + ], + "engines": { + "node": ">= 10" + } + }, + "node_modules/@boundaryml/baml-linux-arm64-gnu": { + "version": "0.221.0", + "resolved": "https://registry.npmjs.org/@boundaryml/baml-linux-arm64-gnu/-/baml-linux-arm64-gnu-0.221.0.tgz", + "integrity": "sha512-Xy1M3muUV2B/4f8dVUpX/IN2CI1m4hGtw31V+kQdFYsy3Hvo58qjijtlkKNYZOjqWBqVlgPMFhTvv8N0cD4N/w==", + "cpu": [ + "arm64" + ], + "license": "MIT", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">= 10" + } + }, + "node_modules/@boundaryml/baml-linux-arm64-musl": { + "version": "0.221.0", + "resolved": "https://registry.npmjs.org/@boundaryml/baml-linux-arm64-musl/-/baml-linux-arm64-musl-0.221.0.tgz", + "integrity": "sha512-6RIkHCViXQEsn6Ts5Uk9c6SDgokkXGO4GkoHpoNnKluTJtuB/B2nUOv2O147GFDqtspFDL2jk5d+oiYibfMn0g==", + "cpu": [ + "arm64" + ], + "license": "MIT", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">= 10" + } + }, + "node_modules/@boundaryml/baml-linux-x64-gnu": { + "version": "0.221.0", + "resolved": "https://registry.npmjs.org/@boundaryml/baml-linux-x64-gnu/-/baml-linux-x64-gnu-0.221.0.tgz", + "integrity": "sha512-YoOz6N6E37UE4ULRCe24P/Ov2pNxjvI4R+I6Bwhkqdt5HOGsJrf2uJUSC+XxKZpkPqlbo1gGZPoCB0lcyeSkeA==", + "cpu": [ + "x64" + ], + "license": "MIT", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">= 10" + } + }, + "node_modules/@boundaryml/baml-linux-x64-musl": { + "version": "0.221.0", + "resolved": "https://registry.npmjs.org/@boundaryml/baml-linux-x64-musl/-/baml-linux-x64-musl-0.221.0.tgz", + "integrity": "sha512-gY67VRXrixgTenDtDzVSMo0GjLbeofGtCZuArfiDgCglfJ5/KGBSgwzqrrTuyUVLGK902NmCaYA5OrPSXezSzg==", + "cpu": [ + "x64" + ], + "license": "MIT", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">= 10" + } + }, + "node_modules/@boundaryml/baml-win32-arm64-msvc": { + "version": "0.221.0", + "resolved": 
"https://registry.npmjs.org/@boundaryml/baml-win32-arm64-msvc/-/baml-win32-arm64-msvc-0.221.0.tgz", + "integrity": "sha512-pTHPv6GVlW7nLVszgm7P7+PdQ97JJ8xnRp3/TeP/ya5z08wKi0ejOInLzElMyZVTB+XY707qGlM9CreJnDH3vg==", + "cpu": [ + "arm64" + ], + "license": "MIT", + "optional": true, + "os": [ + "win32" + ], + "engines": { + "node": ">= 10" + } + }, + "node_modules/@boundaryml/baml-win32-x64-msvc": { + "version": "0.221.0", + "resolved": "https://registry.npmjs.org/@boundaryml/baml-win32-x64-msvc/-/baml-win32-x64-msvc-0.221.0.tgz", + "integrity": "sha512-XP3CxwsYxOZAOzkWqZd2Dg8iNpDOMrbA/Bz3nqI7oX/wL+ZMkHJwjWQwxIVL+sg2rp+TceV+21UPb6LTmt+qJw==", + "cpu": [ + "x64" + ], + "license": "MIT", + "optional": true, + "os": [ + "win32" + ], + "engines": { + "node": ">= 10" + } + }, + "node_modules/@scarf/scarf": { + "version": "1.4.0", + "resolved": "https://registry.npmjs.org/@scarf/scarf/-/scarf-1.4.0.tgz", + "integrity": "sha512-xxeapPiUXdZAE3che6f3xogoJPeZgig6omHEy1rIY5WVsB3H2BHNnZH+gHG6x91SCWyQCzWGsuL2Hh3ClO5/qQ==", + "hasInstallScript": true, + "license": "Apache-2.0" + }, + "node_modules/typescript": { + "version": "5.9.3", + "resolved": "https://registry.npmjs.org/typescript/-/typescript-5.9.3.tgz", + "integrity": "sha512-jl1vZzPDinLr9eUt3J/t7V6FgNEw9QjvBPdysz9KfQDD41fQrC2Y4vKQdiaUpFT4bXlb1RHhLpp8wtm6M5TgSw==", + "license": "Apache-2.0", + "bin": { + "tsc": "bin/tsc", + "tsserver": "bin/tsserver" + }, + "engines": { + "node": ">=14.17" + } + } + } +} diff --git a/package.json b/package.json new file mode 100644 index 0000000..0ebcd73 --- /dev/null +++ b/package.json @@ -0,0 +1,15 @@ +{ + "name": "@lightfastai/skills", + "private": true, + "type": "module", + "scripts": { + "baml:generate:foundation": "npx baml-cli generate --from ./skills/foundation-creator/baml_src", + "baml:generate:spec": "npx baml-cli generate --from ./skills/spec-creator/baml_src", + "eval:foundation": "node ./scripts/run-baml-eval.mjs foundation-creator", + "eval:spec": "node 
./scripts/run-baml-eval.mjs spec-creator" + }, + "dependencies": { + "@boundaryml/baml": "0.221.0", + "typescript": "5.9.3" + } +} diff --git a/scripts/run-baml-eval.mjs b/scripts/run-baml-eval.mjs new file mode 100644 index 0000000..af18cec --- /dev/null +++ b/scripts/run-baml-eval.mjs @@ -0,0 +1,203 @@ +import { mkdir, readFile, rm, writeFile } from "node:fs/promises"; +import path from "node:path"; +import { fileURLToPath, pathToFileURL } from "node:url"; +import { spawn } from "node:child_process"; + +const __filename = fileURLToPath(import.meta.url); +const __dirname = path.dirname(__filename); +const repoRoot = path.resolve(__dirname, ".."); + +function fail(message) { + console.error(message); + process.exit(1); +} + +function runCommand(command, args, cwd) { + return new Promise((resolve, reject) => { + const child = spawn(command, args, { + cwd, + stdio: "inherit", + env: process.env, + shell: false, + }); + + child.on("error", reject); + child.on("exit", (code) => { + if (code === 0) { + resolve(); + } else { + reject(new Error(`${command} ${args.join(" ")} exited with code ${code}`)); + } + }); + }); +} + +async function loadJson(filePath) { + return JSON.parse(await readFile(filePath, "utf8")); +} + +async function loadText(filePath) { + return readFile(filePath, "utf8"); +} + +function getEvalBySelector(evals, selector) { + if (!selector) { + if (evals.length === 1) { + return evals[0]; + } + fail("Multiple evals exist. 
Pass an eval id or name."); + } + + const numeric = Number(selector); + if (!Number.isNaN(numeric)) { + const byId = evals.find((entry) => entry.id === numeric); + if (byId) { + return byId; + } + } + + const byName = evals.find((entry) => entry.eval_name === selector); + if (byName) { + return byName; + } + + fail(`Eval '${selector}' not found.`); +} + +async function generateClient(skillRoot) { + const bamlSrc = path.join(skillRoot, "baml_src"); + await runCommand("npx", ["baml-cli", "generate", "--from", bamlSrc], repoRoot); +} + +async function importGeneratedClient(skillRoot) { + const clientPath = path.join(skillRoot, "baml_client_dist", "index.js"); + return import(pathToFileURL(clientPath).href); +} + +async function buildPacket(evalEntry, evalsDir, packetType) { + const packetFiles = evalEntry.packet_files ?? {}; + const rawNotesPath = packetFiles.raw_notes + ? path.join(evalsDir, packetFiles.raw_notes) + : null; + const expectedCriteriaPath = packetFiles.expected_criteria + ? path.join(evalsDir, packetFiles.expected_criteria) + : null; + const existingSpecPath = packetFiles.existing_spec + ? path.join(evalsDir, packetFiles.existing_spec) + : null; + + const packet = { + packet_name: evalEntry.eval_name, + task_prompt: evalEntry.prompt, + raw_notes: rawNotesPath ? await loadText(rawNotesPath) : "", + expected_criteria: expectedCriteriaPath ? await loadText(expectedCriteriaPath) : "", + }; + + if (packetType === "SpecEvalPacket") { + packet.existing_spec = existingSpecPath ? 
await loadText(existingSpecPath) : null; + } + + return packet; +} + +async function ensureFreshClient(skillRoot) { + const clientDir = path.join(skillRoot, "baml_client"); + const distDir = path.join(skillRoot, "baml_client_dist"); + const tsconfigPath = path.join(skillRoot, ".tmp-baml-client-tsconfig.json"); + await rm(clientDir, { recursive: true, force: true }); + await rm(distDir, { recursive: true, force: true }); + await generateClient(skillRoot); + await writeFile( + tsconfigPath, + JSON.stringify( + { + compilerOptions: { + module: "NodeNext", + moduleResolution: "NodeNext", + target: "ES2022", + declaration: false, + sourceMap: false, + skipLibCheck: true, + outDir: distDir, + rootDir: clientDir, + }, + include: [path.join(clientDir, "*.ts")], + }, + null, + 2, + ), + "utf8", + ); + try { + await runCommand("npx", ["tsc", "--project", tsconfigPath], repoRoot); + } finally { + await rm(tsconfigPath, { force: true }); + } +} + +async function writeRunArtifacts(runDir, artifacts) { + await mkdir(runDir, { recursive: true }); + for (const [name, value] of Object.entries(artifacts)) { + const filePath = path.join(runDir, name); + const content = typeof value === "string" ? 
value : JSON.stringify(value, null, 2); + await writeFile(filePath, content, "utf8"); + } +} + +async function main() { + const skillName = process.argv[2]; + const selector = process.argv[3]; + + if (!skillName) { + fail("Usage: node ./scripts/run-baml-eval.mjs <skill> [eval-id-or-name]"); + } + + const skillRoot = path.join(repoRoot, "skills", skillName); + const evalsDir = path.join(skillRoot, "evals"); + const manifestPath = path.join(evalsDir, "evals.json"); + const manifest = await loadJson(manifestPath); + const evalEntry = getEvalBySelector(manifest.evals, selector); + const runner = manifest.runner_contract; + + if (!runner || runner.type !== "baml_pipeline") { + fail(`Skill '${skillName}' does not declare a supported runner_contract.`); + } + + if (!process.env.OPENAI_API_KEY) { + fail("OPENAI_API_KEY is required to execute BAML evals with the current client configuration."); + } + + await ensureFreshClient(skillRoot); + const generated = await importGeneratedClient(skillRoot); + const { b } = generated; + + const packet = await buildPacket(evalEntry, evalsDir, runner.packet_type); + const compileFn = b[runner.compile_brief_function]; + const renderFn = b[runner.render_document_function]; + const evaluateFn = b[runner.evaluate_document_function]; + + if (!compileFn || !renderFn || !evaluateFn) { + fail(`Generated client is missing one or more runner functions for '${skillName}'.`); + } + + const brief = await compileFn(packet); + const candidateDocument = await renderFn(brief); + const report = await evaluateFn(packet, candidateDocument); + + const timestamp = new Date().toISOString().replace(/[:.]/g, "-"); + const runDir = path.join(skillRoot, "evals", "runs", `${timestamp}-${evalEntry.eval_name}`); + await writeRunArtifacts(runDir, { + "packet.json": packet, + "brief.json": brief, + "candidate.md": candidateDocument, + "report.json": report, + }); + + console.log(`Run complete: ${runDir}`); + console.log(`Overall status: ${report.overall_status}`); +}
+main().catch((error) => { + console.error(error instanceof Error ? error.message : String(error)); + process.exit(1); +}); diff --git a/skills/foundation-creator/SKILL.md b/skills/foundation-creator/SKILL.md new file mode 100644 index 0000000..e6dbb90 --- /dev/null +++ b/skills/foundation-creator/SKILL.md @@ -0,0 +1,41 @@ +--- +name: foundation-creator +description: > + Use this skill when the user wants to write, draft, update, or revise a + top-level foundation document for a company, product, or new primitive. + This document captures thesis, mission, boundaries, actor model, surfaces, + strategic bets, and open questions without collapsing ambiguity into + implementation decisions. Applies when the user is still defining what the + system is, what it is not, and what long-term direction it implies. Does + NOT apply to concrete service specifications in `SPEC.md`, implementation + plans, PRDs, or roadmap execution docs. +--- + +# Foundation Creator + +Writes and updates a single top-level foundation document for an early-stage +product or company primitive. The resulting document is strategic and +behavioral, not implementation-level. It should preserve uncertainty where +decisions are not yet mature. + +## Core behavior + +- Start from thesis and boundaries, not components. +- Prefer explicit open questions over invented certainty. +- Separate durable beliefs from speculative bets. +- Avoid implementation detail unless the user explicitly wants it. +- Escalate to `spec-creator` only when a subsystem is concrete enough to + deserve a `SPEC.md`. + +## Current compiler surface + +This skill includes typed BAML contracts under `baml_src/foundation_compiler/` +for: + +- extracting atomic claims from messy notes +- compiling a stable foundation kernel +- critiquing ambiguity, contradiction, and implementation leakage +- compiling a brief suitable for downstream document rendering + +The BAML layer is schema-first. 
Prompt wording and document templates can +evolve without changing the core interfaces. diff --git a/skills/foundation-creator/baml_src/clients.baml b/skills/foundation-creator/baml_src/clients.baml new file mode 100644 index 0000000..cd662f1 --- /dev/null +++ b/skills/foundation-creator/baml_src/clients.baml @@ -0,0 +1,10 @@ +client EvalModel { + provider "openai-responses" + options { + api_key env.OPENAI_API_KEY + model "gpt-5-mini" + reasoning { + effort "medium" + } + } +} diff --git a/skills/foundation-creator/baml_src/foundation_compiler/common_types.baml b/skills/foundation-creator/baml_src/foundation_compiler/common_types.baml new file mode 100644 index 0000000..cb35cb6 --- /dev/null +++ b/skills/foundation-creator/baml_src/foundation_compiler/common_types.baml @@ -0,0 +1,20 @@ +enum ClaimKind { + Fact + Thesis + Boundary + Constraint + OpenQuestion +} + +enum Confidence { + High + Medium + Low +} + +class Claim { + statement string @assert(nonempty_statement, {{ this|length > 0 }}) + kind ClaimKind + confidence Confidence + sources string[] @assert(has_source, {{ this|length > 0 }}) +} diff --git a/skills/foundation-creator/baml_src/foundation_compiler/compiler_functions.baml b/skills/foundation-creator/baml_src/foundation_compiler/compiler_functions.baml new file mode 100644 index 0000000..0b2b6c0 --- /dev/null +++ b/skills/foundation-creator/baml_src/foundation_compiler/compiler_functions.baml @@ -0,0 +1,99 @@ +function ExtractClaims(raw_notes: string) -> Claim[] { + client EvalModel + prompt #" + Extract atomic claims from the raw notes below. + + Rules: + - Preserve the original intent. + - Split compound statements into separate claims when useful. + - Use `Fact` only for concrete statements in the notes. + - Use `Thesis` for directional beliefs or long-term positions. + - Use `Boundary` for explicit scope limits or exclusions. + - Use `Constraint` for hard operating requirements. + - Use `OpenQuestion` when the note is unresolved. 
+ - Keep `sources` short and human-readable. + + Raw notes: + {{ raw_notes }} + + {{ ctx.output_format }} + "# +} + +function BuildFoundationKernel(claims: Claim[]) -> FoundationKernel { + client EvalModel + prompt #" + Build a stable foundation kernel from the extracted claims below. + + Rules: + - Preserve ambiguity when the claims do not support a hard decision. + - Capture durable thesis-level information. + - Do not invent implementation detail. + - Use empty lists when a category is not yet supported by the claims. + + Claims: + {{ claims|format(type="yaml") }} + + {{ ctx.output_format }} + "# +} + +function CritiqueFoundationKernel(kernel: FoundationKernel) -> FoundationCritique { + client EvalModel + prompt #" + Critique the foundation kernel below. + + Rules: + - Flag contradictions across thesis, boundaries, and bets. + - Flag vague claims that should be sharpened before document rendering. + - Flag implementation leakage. + - Flag missing boundaries that create strategic confusion. + - Ask only high-leverage clarification questions. + + Kernel: + {{ kernel|format(type="yaml") }} + + {{ ctx.output_format }} + "# +} + +function CompileFoundationBrief( + kernel: FoundationKernel, + critique: FoundationCritique +) -> FoundationBrief { + client EvalModel + prompt #" + Compile a concise foundation brief from the kernel and critique below. + + Rules: + - Optimize for document rendering, not implementation planning. + - Preserve unresolved questions explicitly. + - Exclude critique items that were already resolved by the kernel. + - Keep the brief compact and high-signal. + + Kernel: + {{ kernel|format(type="yaml") }} + + Critique: + {{ critique|format(type="yaml") }} + + {{ ctx.output_format }} + "# +} + +function RenderFoundationCreatorPrompt(brief: FoundationBrief) -> string { + client EvalModel + prompt #" + Render a prompt for the `foundation-creator` skill using the brief below. + + Rules: + - Ask for a top-level foundation document. 
+ - Preserve strategic ambiguity where the brief leaves open questions. + - Avoid implementation detail. + - Emphasize thesis, boundaries, actor model, surfaces, and strategic bets. + - Make the prompt directly usable by an agent. + + Brief: + {{ brief|format(type="yaml") }} + "# +} diff --git a/skills/foundation-creator/baml_src/foundation_compiler/eval_runner.baml b/skills/foundation-creator/baml_src/foundation_compiler/eval_runner.baml new file mode 100644 index 0000000..72df02a --- /dev/null +++ b/skills/foundation-creator/baml_src/foundation_compiler/eval_runner.baml @@ -0,0 +1,63 @@ +function CompileFoundationBriefFromPacket(packet: FoundationEvalPacket) -> FoundationBrief { + client EvalModel + prompt #" + Compile a foundation brief from the evaluation packet below. + + Task prompt: + {{ packet.task_prompt }} + + Raw notes: + {{ packet.raw_notes }} + + Expected criteria: + {{ packet.expected_criteria }} + + Rules: + - Produce a concise, durable foundation brief. + - Preserve ambiguity where the notes do not settle the framing. + - Treat expected criteria as evaluation guidance, not as license to invent. + - Avoid implementation detail. + + {{ ctx.output_format }} + "# +} + +function RenderFoundationDocumentDraft(brief: FoundationBrief) -> string { + client EvalModel + prompt #" + Draft a top-level foundation document from the brief below. + + Rules: + - Write a durable strategic document, not a `SPEC.md`. + - Start from thesis and boundaries, not architecture. + - Preserve unresolved questions explicitly. + - Avoid implementation detail. + + Brief: + {{ brief|format(type="yaml") }} + "# +} + +function EvaluateFoundationDocument( + packet: FoundationEvalPacket, + candidate_document: string +) -> EvalReport { + client EvalModel + prompt #" + Evaluate the candidate foundation document against the evaluation packet. 
+ + Packet: + {{ packet|format(type="yaml") }} + + Candidate document: + {{ candidate_document }} + + Rules: + - Grade against the expected criteria explicitly. + - Reward preservation of uncertainty when the source packet is genuinely mixed. + - Penalize invented certainty, invented capabilities, or implementation leakage. + - Use `Pass`, `Partial`, or `Fail` for each criterion. + + {{ ctx.output_format }} + "# +} diff --git a/skills/foundation-creator/baml_src/foundation_compiler/eval_types.baml b/skills/foundation-creator/baml_src/foundation_compiler/eval_types.baml new file mode 100644 index 0000000..94058d1 --- /dev/null +++ b/skills/foundation-creator/baml_src/foundation_compiler/eval_types.baml @@ -0,0 +1,25 @@ +enum EvalStatus { + Pass + Partial + Fail +} + +class EvalCheck { + criterion string @assert(nonempty_criterion, {{ this|length > 0 }}) + status EvalStatus + rationale string @assert(nonempty_rationale, {{ this|length > 0 }}) +} + +class EvalReport { + overall_status EvalStatus + summary string @assert(nonempty_summary, {{ this|length > 0 }}) + checks EvalCheck[] + open_issues string[] +} + +class FoundationEvalPacket { + packet_name string @assert(nonempty_packet_name, {{ this|length > 0 }}) + task_prompt string @assert(nonempty_task_prompt, {{ this|length > 0 }}) + raw_notes string @assert(nonempty_raw_notes, {{ this|length > 0 }}) + expected_criteria string @assert(nonempty_expected_criteria, {{ this|length > 0 }}) +} diff --git a/skills/foundation-creator/baml_src/foundation_compiler/foundation_types.baml b/skills/foundation-creator/baml_src/foundation_compiler/foundation_types.baml new file mode 100644 index 0000000..16ffa0f --- /dev/null +++ b/skills/foundation-creator/baml_src/foundation_compiler/foundation_types.baml @@ -0,0 +1,31 @@ +class FoundationKernel { + name string @assert(nonempty_name, {{ this|length > 0 }}) + mission string? + primitive string? 
+ theses string[] + problems string[] + actors string[] + surfaces string[] + boundaries string[] + strategic_bets string[] + open_questions string[] +} + +class FoundationCritique { + contradictions string[] + vague_claims string[] + implementation_leaks string[] + missing_boundaries string[] + leverage_questions string[] +} + +class FoundationBrief { + title string @assert(nonempty_title, {{ this|length > 0 }}) + summary string @assert(nonempty_summary, {{ this|length > 0 }}) + core_theses string[] + boundaries string[] + actor_model string[] + surfaces string[] + strategic_bets string[] + unresolved_questions string[] +} diff --git a/skills/foundation-creator/baml_src/generators.baml b/skills/foundation-creator/baml_src/generators.baml new file mode 100644 index 0000000..248fe62 --- /dev/null +++ b/skills/foundation-creator/baml_src/generators.baml @@ -0,0 +1,6 @@ +generator target { + output_type "typescript" + output_dir "../" + module_format "esm" + version "0.221.0" +} diff --git a/skills/foundation-creator/evals/evals.json b/skills/foundation-creator/evals/evals.json new file mode 100644 index 0000000..d0eae13 --- /dev/null +++ b/skills/foundation-creator/evals/evals.json @@ -0,0 +1,26 @@ +{ + "skill_name": "foundation-creator", + "runner_contract": { + "type": "baml_pipeline", + "packet_type": "FoundationEvalPacket", + "compile_brief_function": "CompileFoundationBriefFromPacket", + "render_document_function": "RenderFoundationDocumentDraft", + "evaluate_document_function": "EvaluateFoundationDocument" + }, + "evals": [ + { + "id": 0, + "eval_name": "create-foundation-from-vercel-source-packet", + "prompt": "Use the source packet in `fixtures/vercel/raw_notes.md` to draft a top-level foundation document for Vercel. Preserve ambiguity where the positioning is in transition. 
Do not produce a `SPEC.md`, implementation plan, or architecture diagram.", + "expected_output": "A top-level foundation document that frames Vercel as a developer cloud/platform company, captures the current tension between `Frontend Cloud` and `AI Cloud`, identifies core surfaces such as deployment workflow, collaboration, security, AI infrastructure, and platform-building, clarifies that Vercel is not just static hosting or general-purpose IaaS, and preserves open questions or strategic bets instead of inventing certainty.", + "expected_file": "fixtures/vercel/expected_criteria.md", + "packet_files": { + "raw_notes": "fixtures/vercel/raw_notes.md", + "expected_criteria": "fixtures/vercel/expected_criteria.md" + }, + "files": [ + "fixtures/vercel/raw_notes.md" + ] + } + ] +} diff --git a/skills/foundation-creator/evals/fixtures/vercel/expected_criteria.md b/skills/foundation-creator/evals/fixtures/vercel/expected_criteria.md new file mode 100644 index 0000000..f6ed893 --- /dev/null +++ b/skills/foundation-creator/evals/fixtures/vercel/expected_criteria.md @@ -0,0 +1,20 @@ +# Expected Criteria + +- The output should identify Vercel as a developer platform or cloud for + shipping web and AI products, not merely a static hosting company. +- The output should preserve the current positioning tension between + `Frontend Cloud` and `AI Cloud` instead of pretending only one framing + exists. +- The output should identify multiple durable surfaces: + deployment workflow, collaboration, security, AI infrastructure, and + platform-building support. +- The output should describe at least one actor model that includes developers + or teams, and it should ideally also recognize AI tools or platform builders + as meaningful actors. +- The output should set clear boundaries: + not a general-purpose IaaS provider, not just a frontend framework, and not + just an AI tooling brand. 
+- The output should preserve at least one open question or strategic bet + instead of forcing a final conclusion about Vercel's ultimate primitive. +- The output should not invent internal organizational structure, financial + claims, or product lines that are not present in the packet. diff --git a/skills/foundation-creator/evals/fixtures/vercel/raw_notes.md b/skills/foundation-creator/evals/fixtures/vercel/raw_notes.md new file mode 100644 index 0000000..f7b6ea9 --- /dev/null +++ b/skills/foundation-creator/evals/fixtures/vercel/raw_notes.md @@ -0,0 +1,111 @@ +# Vercel Source Packet + +Assembled on April 20, 2026 from official Vercel sources. + +This packet is intentionally paraphrased. It is meant to test whether +`foundation-creator` can turn a modern company/product platform into a durable +foundation document without collapsing brand transition into fake certainty. + +## Source 1 + +- URL: [https://vercel.com/about](https://vercel.com/about) +- Accessed: April 20, 2026 +- The About page says Vercel "enables the world to ship the best products." +- It describes the `Frontend Cloud` as the developer experience and + infrastructure to build, scale, and secure a faster, more personalized web. +- Its brand values emphasize products that are easy, universal, and + accessible. + +## Source 2 + +- URL: [https://vercel.com/docs](https://vercel.com/docs) +- Last updated: January 30, 2026 +- The docs index now describes Vercel as the `AI Cloud`, a unified platform + for building, deploying, and scaling AI-powered applications. +- The docs say Vercel can ship web apps, agentic workloads, and "everything in + between." +- Git-connected deployment is still central: connect a repository and deploy + on every push, with automatic preview environments before production. +- Build surfaces listed in the docs include `Next.js`, `Functions`, `Routing + Middleware`, `Incremental Static Regeneration`, `Image Optimization`, + environment management, and feature flags. 
+- AI surfaces listed in the docs include `v0`, `AI SDK`, `AI Gateway`, + `Agents`, `MCP Servers`, `Agent Resources`, `Sandbox`, and claim + deployments. +- Collaboration surfaces listed in the docs include `Toolbar`, `Comments`, and + `Draft mode`. +- Security surfaces listed in the docs include `Deployment Protection`, + `RBAC`, `Configurable WAF`, `Bot Management`, and `BotID`. + +## Source 3 + +- URL: [https://vercel.com/docs/getting-started-with-vercel](https://vercel.com/docs/getting-started-with-vercel) +- Last updated: September 24, 2025 +- Vercel is described as a platform for developers that provides tools, + workflows, and infrastructure to build and deploy web apps faster without + needing additional configuration. +- The getting started guide says Vercel supports popular frontend frameworks + out of the box. +- It also says the infrastructure is globally distributed. +- During development, Vercel provides preview and production environments and + comments for real-time collaboration. +- The docs repeatedly support both dashboard and CLI workflows. + +## Source 4 + +- URL: [https://vercel.com/blog/introducing-vercel-mcp-connect-vercel-to-your-ai-tools](https://vercel.com/blog/introducing-vercel-mcp-connect-vercel-to-your-ai-tools) +- Published: August 6, 2025 +- Vercel launched an official MCP server in public beta. +- The launch framing says AI tools lacked secure, structured access to + infrastructure like Vercel. +- The launch post describes Vercel MCP as a secure, OAuth-compliant interface + that lets AI clients interact with Vercel projects. +- The launch capabilities include searching docs, retrieving deployment logs, + fetching teams, and fetching projects. +- The launch post explicitly frames the initial server as read-only and + approved-client only. +- The launch post also says Vercel wants to be a place where developers ship + their own MCP servers. 
+ +## Source 5 + +- URL: [https://vercel.com/docs/agent-resources/vercel-mcp](https://vercel.com/docs/agent-resources/vercel-mcp) +- Last updated: January 30, 2026 +- The product docs describe Vercel MCP as Vercel's official remote MCP with + OAuth at `https://mcp.vercel.com`. +- The docs say it lets AI tools search docs, manage projects and deployments, + and analyze deployment logs. +- Supported AI clients listed in the docs include Claude, ChatGPT, Codex CLI, + Cursor, VS Code with Copilot, Devin, Raycast, Goose, Windsurf, and Gemini + tools. +- The docs emphasize allowlisted clients, OAuth, and official endpoint + verification as security controls. +- There is a likely product transition to capture: the August 6, 2025 launch + post frames the service as read-only, while the January 30, 2026 product + docs frame it as broader project and deployment management. + +## Source 6 + +- URL: [https://vercel.com/platforms/docs](https://vercel.com/platforms/docs) +- Accessed: April 20, 2026 +- Vercel for Platforms supports two patterns: `Multi-Tenant` and + `Multi-Project`. +- The docs say `Multi-Tenant` is for one application structure with + tenant-specific content and branding. +- The docs say `Multi-Project` is for unique codebases, per-customer + environments, and AI coding platforms where complete isolation is required. +- This suggests Vercel is not only a deployment product for a single app team; + it is also a substrate for other platforms. + +## Tensions and questions the evaluator should preserve + +- Vercel still publicly uses `Frontend Cloud` language on the About page while + the docs index now centers `AI Cloud`. +- The same company appears to span deployment infrastructure, collaboration, + security, AI application tooling, and platform-building primitives. +- It should be treated as more than hosting, but not flattened into generic + cloud infrastructure. 
+- A good foundation document should preserve whether the core primitive is + "ship products fast", "deploy web and AI apps", "developer cloud", or + "infrastructure for the product surface of the internet" if the sources do + not settle it cleanly. diff --git a/skills/spec-creator/baml_src/clients.baml b/skills/spec-creator/baml_src/clients.baml new file mode 100644 index 0000000..cd662f1 --- /dev/null +++ b/skills/spec-creator/baml_src/clients.baml @@ -0,0 +1,10 @@ +client EvalModel { + provider "openai-responses" + options { + api_key env.OPENAI_API_KEY + model "gpt-5-mini" + reasoning { + effort "medium" + } + } +} diff --git a/skills/spec-creator/baml_src/generators.baml b/skills/spec-creator/baml_src/generators.baml new file mode 100644 index 0000000..248fe62 --- /dev/null +++ b/skills/spec-creator/baml_src/generators.baml @@ -0,0 +1,6 @@ +generator target { + output_type "typescript" + output_dir "../" + module_format "esm" + version "0.221.0" +} diff --git a/skills/spec-creator/baml_src/spec_compiler/common_types.baml b/skills/spec-creator/baml_src/spec_compiler/common_types.baml new file mode 100644 index 0000000..cb35cb6 --- /dev/null +++ b/skills/spec-creator/baml_src/spec_compiler/common_types.baml @@ -0,0 +1,20 @@ +enum ClaimKind { + Fact + Thesis + Boundary + Constraint + OpenQuestion +} + +enum Confidence { + High + Medium + Low +} + +class Claim { + statement string @assert(nonempty_statement, {{ this|length > 0 }}) + kind ClaimKind + confidence Confidence + sources string[] @assert(has_source, {{ this|length > 0 }}) +} diff --git a/skills/spec-creator/baml_src/spec_compiler/compiler_functions.baml b/skills/spec-creator/baml_src/spec_compiler/compiler_functions.baml new file mode 100644 index 0000000..5fbe8b0 --- /dev/null +++ b/skills/spec-creator/baml_src/spec_compiler/compiler_functions.baml @@ -0,0 +1,78 @@ +function ExtractClaims(raw_notes: string) -> Claim[] { + client EvalModel + prompt #" + Extract atomic claims from the raw notes below. 
+ + Rules: + - Preserve the original intent. + - Split compound statements into separate claims when useful. + - Use `Fact` only for concrete statements in the notes. + - Use `Thesis` for directional beliefs that still affect scope. + - Use `Boundary` for explicit scope limits or exclusions. + - Use `Constraint` for hard operating requirements. + - Use `OpenQuestion` when the note is unresolved. + - Keep `sources` short and human-readable. + + Raw notes: + {{ raw_notes }} + + {{ ctx.output_format }} + "# +} + +function CompileSpecBrief(raw_notes: string, existing_spec: string?) -> SpecBrief { + client EvalModel + prompt #" + Compile a spec brief from the inputs below. + + Rules: + - Produce a behavioral brief suitable for `spec-creator`. + - Prefer service-level behavior over product doctrine. + - Preserve unresolved questions explicitly. + - Do not invent implementation detail. + - If `existing_spec` is present, preserve compatible intent and only sharpen gaps. + + Raw notes: + {{ raw_notes }} + + Existing spec: + {{ existing_spec }} + + {{ ctx.output_format }} + "# +} + +function CritiqueSpecBrief(brief: SpecBrief) -> SpecCritique { + client EvalModel + prompt #" + Critique the spec brief below. + + Rules: + - Flag contradictions across goals, non-goals, and boundaries. + - Flag implementation leakage. + - Flag ambiguous terms that weaken a behavioral specification. + - Flag missing sections implied by the current brief. + + Brief: + {{ brief|format(type="yaml") }} + + {{ ctx.output_format }} + "# +} + +function RenderSpecCreatorPrompt(brief: SpecBrief) -> string { + client EvalModel + prompt #" + Render a prompt for the `spec-creator` skill using the brief below. + + Rules: + - Ask for a top-level `SPEC.md`. + - Keep the output language-agnostic and behavioral. + - Preserve unresolved questions instead of inventing decisions. + - Emphasize problem statement, goals, non-goals, components, dependencies, and entities. 
+ - Make the prompt directly usable by an agent. + + Brief: + {{ brief|format(type="yaml") }} + "# +} diff --git a/skills/spec-creator/baml_src/spec_compiler/eval_runner.baml b/skills/spec-creator/baml_src/spec_compiler/eval_runner.baml new file mode 100644 index 0000000..068a32a --- /dev/null +++ b/skills/spec-creator/baml_src/spec_compiler/eval_runner.baml @@ -0,0 +1,67 @@ +function CompileSpecBriefFromPacket(packet: SpecEvalPacket) -> SpecBrief { + client EvalModel + prompt #" + Compile a spec brief from the evaluation packet below. + + Task prompt: + {{ packet.task_prompt }} + + Raw notes: + {{ packet.raw_notes }} + + Existing spec: + {{ packet.existing_spec }} + + Expected criteria: + {{ packet.expected_criteria }} + + Rules: + - Produce a behavioral service brief suitable for `spec-creator`. + - Treat expected criteria as evaluation guidance, not as license to invent. + - Preserve unresolved questions explicitly. + - Avoid implementation detail. + + {{ ctx.output_format }} + "# +} + +function RenderSpecDocumentDraft(brief: SpecBrief) -> string { + client EvalModel + prompt #" + Draft a `SPEC.md` from the brief below. + + Rules: + - Keep the document behavioral and language-agnostic. + - Use problem statement, goals, non-goals, boundaries, components, + dependencies, and entities. + - Preserve uncertainty where the source packet is in transition. + - Avoid implementation detail. + + Brief: + {{ brief|format(type="yaml") }} + "# +} + +function EvaluateSpecDocument( + packet: SpecEvalPacket, + candidate_document: string +) -> EvalReport { + client EvalModel + prompt #" + Evaluate the candidate `SPEC.md` against the evaluation packet. + + Packet: + {{ packet|format(type="yaml") }} + + Candidate document: + {{ candidate_document }} + + Rules: + - Grade against the expected criteria explicitly. + - Penalize invented capabilities, invented certainty, or implementation leakage. + - Reward correct scope boundaries and careful handling of transition states. 
+ - Use `Pass`, `Partial`, or `Fail` for each criterion. + + {{ ctx.output_format }} + "# +} diff --git a/skills/spec-creator/baml_src/spec_compiler/eval_types.baml b/skills/spec-creator/baml_src/spec_compiler/eval_types.baml new file mode 100644 index 0000000..d3576c9 --- /dev/null +++ b/skills/spec-creator/baml_src/spec_compiler/eval_types.baml @@ -0,0 +1,26 @@ +enum EvalStatus { + Pass + Partial + Fail +} + +class EvalCheck { + criterion string @assert(nonempty_criterion, {{ this|length > 0 }}) + status EvalStatus + rationale string @assert(nonempty_rationale, {{ this|length > 0 }}) +} + +class EvalReport { + overall_status EvalStatus + summary string @assert(nonempty_summary, {{ this|length > 0 }}) + checks EvalCheck[] + open_issues string[] +} + +class SpecEvalPacket { + packet_name string @assert(nonempty_packet_name, {{ this|length > 0 }}) + task_prompt string @assert(nonempty_task_prompt, {{ this|length > 0 }}) + raw_notes string @assert(nonempty_raw_notes, {{ this|length > 0 }}) + expected_criteria string @assert(nonempty_expected_criteria, {{ this|length > 0 }}) + existing_spec string? 
+} diff --git a/skills/spec-creator/baml_src/spec_compiler/spec_types.baml b/skills/spec-creator/baml_src/spec_compiler/spec_types.baml new file mode 100644 index 0000000..3d116b5 --- /dev/null +++ b/skills/spec-creator/baml_src/spec_compiler/spec_types.baml @@ -0,0 +1,37 @@ +class ComponentBrief { + name string @assert(nonempty_name, {{ this|length > 0 }}) + responsibility string @assert(nonempty_responsibility, {{ this|length > 0 }}) +} + +class FieldBrief { + name string @assert(nonempty_name, {{ this|length > 0 }}) + type_expression string @assert(nonempty_type_expression, {{ this|length > 0 }}) + description string @assert(nonempty_description, {{ this|length > 0 }}) + required bool +} + +class EntityBrief { + name string @assert(nonempty_name, {{ this|length > 0 }}) + description string @assert(nonempty_description, {{ this|length > 0 }}) + fields FieldBrief[] +} + +class SpecBrief { + service_name string @assert(nonempty_service_name, {{ this|length > 0 }}) + purpose string @assert(nonempty_purpose, {{ this|length > 0 }}) + operational_problems string[] + goals string[] + non_goals string[] + important_boundaries string[] + components ComponentBrief[] + external_dependencies string[] + entities EntityBrief[] + unresolved_questions string[] +} + +class SpecCritique { + contradictions string[] + implementation_leaks string[] + ambiguous_terms string[] + missing_sections string[] +} diff --git a/skills/spec-creator/evals/evals.json b/skills/spec-creator/evals/evals.json index 8e81200..8c9e5ad 100644 --- a/skills/spec-creator/evals/evals.json +++ b/skills/spec-creator/evals/evals.json @@ -1,5 +1,12 @@ { "skill_name": "spec-creator", + "runner_contract": { + "type": "baml_pipeline", + "packet_type": "SpecEvalPacket", + "compile_brief_function": "CompileSpecBriefFromPacket", + "render_document_function": "RenderSpecDocumentDraft", + "evaluate_document_function": "EvaluateSpecDocument" + }, "evals": [ { "id": 0, @@ -21,6 +28,18 @@ "prompt": "We have an existing 
SPEC.md at the repo root for Log Shipper. Please update it: add a non-goal stating that cross-region log replication is out of scope, and add a new main component called `Offset Store` that persists per-file read offsets. Keep everything else as-is.", "expected_output": "SPEC.md with the existing sections preserved verbatim except: (1) a new bullet added under Non-Goals covering cross-region replication, phrased as a gerund/noun phrase per the language guide; (2) a new numbered component `Offset Store` added under 3.1 with a verb-led description. Section numbering remains 1, 2, 3; component numbering extends to 4. No first-person pronouns introduced.", "files": ["fixtures/existing_spec.md"] + }, + { + "id": 3, + "eval_name": "create-from-vercel-mcp-source-packet", + "prompt": "Use the source packet in `fixtures/vercel_mcp/raw_notes.md` to write a `SPEC.md` for a service called `Vercel MCP`. Treat it as a long-running remote MCP service, not as a company-level foundation document. Preserve timeline-specific ambiguity where the source packet is in transition, and do not invent write capabilities that the notes do not justify.", + "expected_output": "A `SPEC.md` that frames `Vercel MCP` as an OAuth-protected remote MCP service for AI tools, includes a Problem Statement about secure structured access to Vercel docs, projects, deployments, and logs, sets clear goals and non-goals, identifies boundaries around approved clients and official endpoint usage, captures the current beta/read-only tension without pretending the service already has unconstrained write access, and stays behavioral rather than implementation-level.", + "expected_file": "fixtures/vercel_mcp/expected_criteria.md", + "packet_files": { + "raw_notes": "fixtures/vercel_mcp/raw_notes.md", + "expected_criteria": "fixtures/vercel_mcp/expected_criteria.md" + }, + "files": ["fixtures/vercel_mcp/raw_notes.md"] } ] } diff --git a/skills/spec-creator/evals/fixtures/vercel_mcp/expected_criteria.md 
b/skills/spec-creator/evals/fixtures/vercel_mcp/expected_criteria.md new file mode 100644 index 0000000..d297751 --- /dev/null +++ b/skills/spec-creator/evals/fixtures/vercel_mcp/expected_criteria.md @@ -0,0 +1,16 @@ +# Expected Criteria + +- The output should frame `Vercel MCP` as a remote MCP service for AI tools + interacting with Vercel resources. +- The purpose should mention secure access to Vercel docs, projects, + deployments, or logs through an OAuth-protected MCP endpoint. +- The problem statement should capture the need for structured AI access to + Vercel context from tools or development environments. +- The output should include boundaries around official endpoint usage, + approved clients, and security-sensitive access patterns. +- The output should preserve the transition between the August 6, 2025 + read-only launch framing and the January 30, 2026 broader management framing. +- The output should avoid claiming unconstrained write behavior unless it is + explicitly scoped or qualified. +- The output should stay behavioral and language-agnostic, not implementation + specific. diff --git a/skills/spec-creator/evals/fixtures/vercel_mcp/raw_notes.md b/skills/spec-creator/evals/fixtures/vercel_mcp/raw_notes.md new file mode 100644 index 0000000..ad3be92 --- /dev/null +++ b/skills/spec-creator/evals/fixtures/vercel_mcp/raw_notes.md @@ -0,0 +1,62 @@ +# Vercel MCP Source Packet + +Assembled on April 20, 2026 from official Vercel sources. + +This packet is intentionally narrow. It exists to test whether `spec-creator` +can produce a concrete service specification from a modern product surface +without inventing capabilities that are still in transition. + +## Source 1 + +- URL: [https://vercel.com/blog/introducing-vercel-mcp-connect-vercel-to-your-ai-tools](https://vercel.com/blog/introducing-vercel-mcp-connect-vercel-to-your-ai-tools) +- Published: August 6, 2025 +- Vercel introduced an official MCP server in public beta. 
+- The launch post describes the service as a secure, OAuth-compliant interface + that lets AI clients interact with Vercel projects. +- The launch motivation is that AI tools need secure, structured access to + infrastructure like Vercel from inside development environments and AI + assistants. +- The launch capabilities include searching docs, retrieving deployment logs, + fetching teams, and fetching projects. +- The launch post says the initial service is read-only. +- The launch post also says only approved clients are allowed, and OAuth + consent is shown on every connection. +- Official endpoint in the launch post: `https://mcp.vercel.com`. + +## Source 2 + +- URL: [https://vercel.com/docs/agent-resources/vercel-mcp](https://vercel.com/docs/agent-resources/vercel-mcp) +- Last updated: January 30, 2026 +- The product docs describe Vercel MCP as Vercel's official remote MCP with + OAuth. +- The docs say it lets AI tools search docs, manage projects and deployments, + and analyze deployment logs. +- The docs list many supported clients including Claude tools, ChatGPT, Codex + CLI, Cursor, VS Code with Copilot, Devin, Raycast, Goose, Windsurf, and + Gemini tools. +- The docs emphasize endpoint verification, OAuth, and approved-client + restrictions as security controls. +- The docs position the service as part of an agent workflow around live + Vercel context and project operations. + +## Source 3 + +- URL: [https://vercel.com/docs](https://vercel.com/docs) +- Last updated: January 30, 2026 +- The docs index places Vercel MCP inside a broader AI infrastructure surface + alongside `Agents`, `MCP Servers`, `Agent Resources`, `Sandbox`, `AI SDK`, + and `AI Gateway`. +- This suggests Vercel MCP is not an isolated side experiment; it is part of a + broader AI-tooling platform direction. + +## Important boundaries and tensions + +- The service is specifically about Vercel context and operations for AI + clients, not a generic MCP hosting platform. 
+- The endpoint is official and singular in the notes: + `https://mcp.vercel.com`. +- August 6, 2025 launch framing is explicitly read-only. +- January 30, 2026 docs language suggests broader project/deployment + management. +- A good spec should preserve that transition carefully. It should not invent + arbitrary mutation powers if the source packet does not settle them. From 068687836acd5fb05c67f0a4962110aa01b1525f Mon Sep 17 00:00:00 2001 From: Jeevan Pillay <169354619+jeevanpillay@users.noreply.github.com> Date: Mon, 20 Apr 2026 15:26:15 +1000 Subject: [PATCH 02/30] Migrate evals to AI Gateway and Bun --- .gitignore | 23 ++- README.md | 16 +- bun.lock | 59 ++++++ package-lock.json | 188 ------------------ package.json | 13 +- scripts/run-baml-eval.mjs | 24 ++- .../foundation-creator/baml_src/clients.baml | 5 +- .../compiler_functions.baml | 5 + .../foundation_compiler/eval_runner.baml | 3 + skills/spec-creator/baml_src/clients.baml | 5 +- .../spec_compiler/compiler_functions.baml | 4 + .../baml_src/spec_compiler/eval_runner.baml | 3 + 12 files changed, 133 insertions(+), 215 deletions(-) create mode 100644 bun.lock delete mode 100644 package-lock.json diff --git a/.gitignore b/.gitignore index a7f1d20..f399c82 100644 --- a/.gitignore +++ b/.gitignore @@ -1,7 +1,18 @@ node_modules/ -skills/foundation-creator/baml_client/ -skills/foundation-creator/baml_client_dist/ -skills/spec-creator/baml_client/ -skills/spec-creator/baml_client_dist/ -skills/foundation-creator/evals/runs/ -skills/spec-creator/evals/runs/ + +# Generated BAML clients +skills/**/baml_client/ +skills/**/baml_client_dist/ + +# Local eval outputs +skills/**/evals/runs/ + +# Local environment files +.env +.env.local +.env.*.local + +# Local OS and tooling noise +.DS_Store +*.log +.tmp-baml-client-tsconfig.json diff --git a/README.md b/README.md index 7af46e7..4a628c3 100644 --- a/README.md +++ b/README.md @@ -26,14 +26,24 @@ This repo now includes BAML-backed fixture evals for `foundation-creator` 
and `spec-creator`. ```bash -npm install -OPENAI_API_KEY=... npm run eval:foundation -- create-foundation-from-vercel-source-packet -OPENAI_API_KEY=... npm run eval:spec -- create-from-vercel-mcp-source-packet +bun install +bun run eval:foundation -- create-foundation-from-vercel-source-packet +bun run eval:spec -- create-from-vercel-mcp-source-packet ``` Each run writes packet, brief, candidate document, and evaluation report artifacts under `skills//evals/runs/`. +`bun run eval:*` loads `.env` automatically through `dotenv-cli`, so +`AI_GATEWAY_API_KEY` can live in the repo-local `.env` without manual +`source` steps. + +For other local commands that should inherit `.env`, use: + +```bash +bun run with-env -- bun run ./scripts/run-baml-eval.mjs foundation-creator create-foundation-from-vercel-source-packet +``` + ## License MIT diff --git a/bun.lock b/bun.lock new file mode 100644 index 0000000..0a5ac2e --- /dev/null +++ b/bun.lock @@ -0,0 +1,59 @@ +{ + "lockfileVersion": 1, + "configVersion": 0, + "workspaces": { + "": { + "name": "@lightfastai/skills", + "dependencies": { + "@boundaryml/baml": "0.221.0", + "typescript": "5.9.3", + }, + "devDependencies": { + "dotenv-cli": "^8.0.0", + }, + }, + }, + "packages": { + "@boundaryml/baml": ["@boundaryml/baml@0.221.0", "", { "dependencies": { "@scarf/scarf": "^1.3.0" }, "optionalDependencies": { "@boundaryml/baml-darwin-arm64": "0.221.0", "@boundaryml/baml-darwin-x64": "0.221.0", "@boundaryml/baml-linux-arm64-gnu": "0.221.0", "@boundaryml/baml-linux-arm64-musl": "0.221.0", "@boundaryml/baml-linux-x64-gnu": "0.221.0", "@boundaryml/baml-linux-x64-musl": "0.221.0", "@boundaryml/baml-win32-arm64-msvc": "0.221.0", "@boundaryml/baml-win32-x64-msvc": "0.221.0" }, "bin": { "baml": "cli.js", "baml-cli": "cli.js" } }, "sha512-pPOp2JVsG4Wa/tMLnJv/rxil5jsuVDgxnA0xO0h4lKy7t/fKCXOVvO+nzpOZ4byLTP/Ow+8pVvoKRKvx1J/Hsw=="], + + "@boundaryml/baml-darwin-arm64": ["@boundaryml/baml-darwin-arm64@0.221.0", "", { "os": "darwin", "cpu": 
"arm64" }, "sha512-GxqdjVUodyKtgKX/CIDGZyz5lXS0d0iFnV2x7thMQM9ziMrOPcWd3qwflOLYdgDo6Hy9yMULrqtMPkCrmbwEHQ=="], + + "@boundaryml/baml-darwin-x64": ["@boundaryml/baml-darwin-x64@0.221.0", "", { "os": "darwin", "cpu": "x64" }, "sha512-wG3jsgOIr8C+09j0AFZY4F8EHvd1gKoKw6+HR1Oi+cw4pijklCk2LI0AIwMPzgG12BAxWV6jEIONMORmspesFQ=="], + + "@boundaryml/baml-linux-arm64-gnu": ["@boundaryml/baml-linux-arm64-gnu@0.221.0", "", { "os": "linux", "cpu": "arm64" }, "sha512-Xy1M3muUV2B/4f8dVUpX/IN2CI1m4hGtw31V+kQdFYsy3Hvo58qjijtlkKNYZOjqWBqVlgPMFhTvv8N0cD4N/w=="], + + "@boundaryml/baml-linux-arm64-musl": ["@boundaryml/baml-linux-arm64-musl@0.221.0", "", { "os": "linux", "cpu": "arm64" }, "sha512-6RIkHCViXQEsn6Ts5Uk9c6SDgokkXGO4GkoHpoNnKluTJtuB/B2nUOv2O147GFDqtspFDL2jk5d+oiYibfMn0g=="], + + "@boundaryml/baml-linux-x64-gnu": ["@boundaryml/baml-linux-x64-gnu@0.221.0", "", { "os": "linux", "cpu": "x64" }, "sha512-YoOz6N6E37UE4ULRCe24P/Ov2pNxjvI4R+I6Bwhkqdt5HOGsJrf2uJUSC+XxKZpkPqlbo1gGZPoCB0lcyeSkeA=="], + + "@boundaryml/baml-linux-x64-musl": ["@boundaryml/baml-linux-x64-musl@0.221.0", "", { "os": "linux", "cpu": "x64" }, "sha512-gY67VRXrixgTenDtDzVSMo0GjLbeofGtCZuArfiDgCglfJ5/KGBSgwzqrrTuyUVLGK902NmCaYA5OrPSXezSzg=="], + + "@boundaryml/baml-win32-arm64-msvc": ["@boundaryml/baml-win32-arm64-msvc@0.221.0", "", { "os": "win32", "cpu": "arm64" }, "sha512-pTHPv6GVlW7nLVszgm7P7+PdQ97JJ8xnRp3/TeP/ya5z08wKi0ejOInLzElMyZVTB+XY707qGlM9CreJnDH3vg=="], + + "@boundaryml/baml-win32-x64-msvc": ["@boundaryml/baml-win32-x64-msvc@0.221.0", "", { "os": "win32", "cpu": "x64" }, "sha512-XP3CxwsYxOZAOzkWqZd2Dg8iNpDOMrbA/Bz3nqI7oX/wL+ZMkHJwjWQwxIVL+sg2rp+TceV+21UPb6LTmt+qJw=="], + + "@scarf/scarf": ["@scarf/scarf@1.4.0", "", {}, "sha512-xxeapPiUXdZAE3che6f3xogoJPeZgig6omHEy1rIY5WVsB3H2BHNnZH+gHG6x91SCWyQCzWGsuL2Hh3ClO5/qQ=="], + + "cross-spawn": ["cross-spawn@7.0.6", "", { "dependencies": { "path-key": "^3.1.0", "shebang-command": "^2.0.0", "which": "^2.0.1" } }, 
"sha512-uV2QOWP2nWzsy2aMp8aRibhi9dlzF5Hgh5SHaB9OiTGEyDTiJJyx0uy51QXdyWbtAHNua4XJzUKca3OzKUd3vA=="], + + "dotenv": ["dotenv@16.6.1", "", {}, "sha512-uBq4egWHTcTt33a72vpSG0z3HnPuIl6NqYcTrKEg2azoEyl2hpW0zqlxysq2pK9HlDIHyHyakeYaYnSAwd8bow=="], + + "dotenv-cli": ["dotenv-cli@8.0.0", "", { "dependencies": { "cross-spawn": "^7.0.6", "dotenv": "^16.3.0", "dotenv-expand": "^10.0.0", "minimist": "^1.2.6" }, "bin": { "dotenv": "cli.js" } }, "sha512-aLqYbK7xKOiTMIRf1lDPbI+Y+Ip/wo5k3eyp6ePysVaSqbyxjyK3dK35BTxG+rmd7djf5q2UPs4noPNH+cj0Qw=="], + + "dotenv-expand": ["dotenv-expand@10.0.0", "", {}, "sha512-GopVGCpVS1UKH75VKHGuQFqS1Gusej0z4FyQkPdwjil2gNIv+LNsqBlboOzpJFZKVT95GkCyWJbBSdFEFUWI2A=="], + + "isexe": ["isexe@2.0.0", "", {}, "sha512-RHxMLp9lnKHGHRng9QFhRCMbYAcVpn69smSGcq3f36xjgVVWThj4qqLbTLlq7Ssj8B+fIQ1EuCEGI2lKsyQeIw=="], + + "minimist": ["minimist@1.2.8", "", {}, "sha512-2yyAR8qBkN3YuheJanUpWC5U3bb5osDywNB8RzDVlDwDHbocAJveqqj1u8+SVD7jkWT4yvsHCpWqqWqAxb0zCA=="], + + "path-key": ["path-key@3.1.1", "", {}, "sha512-ojmeN0qd+y0jszEtoY48r0Peq5dwMEkIlCOu6Q5f41lfkswXuKtYrhgoTpLnyIcHm24Uhqx+5Tqm2InSwLhE6Q=="], + + "shebang-command": ["shebang-command@2.0.0", "", { "dependencies": { "shebang-regex": "^3.0.0" } }, "sha512-kHxr2zZpYtdmrN1qDjrrX/Z1rR1kG8Dx+gkpK1G4eXmvXswmcE1hTWBWYUzlraYw1/yZp6YuDY77YtvbN0dmDA=="], + + "shebang-regex": ["shebang-regex@3.0.0", "", {}, "sha512-7++dFhtcx3353uBaq8DDR4NuxBetBzC7ZQOhmTQInHEd6bSrXdiEyzCvG07Z44UYdLShWUyXt5M/yhz8ekcb1A=="], + + "typescript": ["typescript@5.9.3", "", { "bin": { "tsc": "bin/tsc", "tsserver": "bin/tsserver" } }, "sha512-jl1vZzPDinLr9eUt3J/t7V6FgNEw9QjvBPdysz9KfQDD41fQrC2Y4vKQdiaUpFT4bXlb1RHhLpp8wtm6M5TgSw=="], + + "which": ["which@2.0.2", "", { "dependencies": { "isexe": "^2.0.0" }, "bin": { "node-which": "./bin/node-which" } }, "sha512-BLI3Tl1TW3Pvl70l3yq3Y64i+awpwXqsGBYWkkqMtnbXgrMD+yj7rhW0kuEDxzJaYXGjEW5ogapKNMEKNMjibA=="], + } +} diff --git a/package-lock.json b/package-lock.json deleted file mode 100644 index 7889375..0000000 
--- a/package-lock.json +++ /dev/null @@ -1,188 +0,0 @@ -{ - "name": "@lightfastai/skills", - "lockfileVersion": 3, - "requires": true, - "packages": { - "": { - "name": "@lightfastai/skills", - "dependencies": { - "@boundaryml/baml": "0.221.0", - "typescript": "5.9.3" - } - }, - "node_modules/@boundaryml/baml": { - "version": "0.221.0", - "resolved": "https://registry.npmjs.org/@boundaryml/baml/-/baml-0.221.0.tgz", - "integrity": "sha512-pPOp2JVsG4Wa/tMLnJv/rxil5jsuVDgxnA0xO0h4lKy7t/fKCXOVvO+nzpOZ4byLTP/Ow+8pVvoKRKvx1J/Hsw==", - "license": "MIT", - "dependencies": { - "@scarf/scarf": "^1.3.0" - }, - "bin": { - "baml": "cli.js", - "baml-cli": "cli.js" - }, - "engines": { - "node": ">= 10" - }, - "optionalDependencies": { - "@boundaryml/baml-darwin-arm64": "0.221.0", - "@boundaryml/baml-darwin-x64": "0.221.0", - "@boundaryml/baml-linux-arm64-gnu": "0.221.0", - "@boundaryml/baml-linux-arm64-musl": "0.221.0", - "@boundaryml/baml-linux-x64-gnu": "0.221.0", - "@boundaryml/baml-linux-x64-musl": "0.221.0", - "@boundaryml/baml-win32-arm64-msvc": "0.221.0", - "@boundaryml/baml-win32-x64-msvc": "0.221.0" - } - }, - "node_modules/@boundaryml/baml-darwin-arm64": { - "version": "0.221.0", - "resolved": "https://registry.npmjs.org/@boundaryml/baml-darwin-arm64/-/baml-darwin-arm64-0.221.0.tgz", - "integrity": "sha512-GxqdjVUodyKtgKX/CIDGZyz5lXS0d0iFnV2x7thMQM9ziMrOPcWd3qwflOLYdgDo6Hy9yMULrqtMPkCrmbwEHQ==", - "cpu": [ - "arm64" - ], - "license": "MIT", - "optional": true, - "os": [ - "darwin" - ], - "engines": { - "node": ">= 10" - } - }, - "node_modules/@boundaryml/baml-darwin-x64": { - "version": "0.221.0", - "resolved": "https://registry.npmjs.org/@boundaryml/baml-darwin-x64/-/baml-darwin-x64-0.221.0.tgz", - "integrity": "sha512-wG3jsgOIr8C+09j0AFZY4F8EHvd1gKoKw6+HR1Oi+cw4pijklCk2LI0AIwMPzgG12BAxWV6jEIONMORmspesFQ==", - "cpu": [ - "x64" - ], - "license": "MIT", - "optional": true, - "os": [ - "darwin" - ], - "engines": { - "node": ">= 10" - } - }, - 
"node_modules/@boundaryml/baml-linux-arm64-gnu": { - "version": "0.221.0", - "resolved": "https://registry.npmjs.org/@boundaryml/baml-linux-arm64-gnu/-/baml-linux-arm64-gnu-0.221.0.tgz", - "integrity": "sha512-Xy1M3muUV2B/4f8dVUpX/IN2CI1m4hGtw31V+kQdFYsy3Hvo58qjijtlkKNYZOjqWBqVlgPMFhTvv8N0cD4N/w==", - "cpu": [ - "arm64" - ], - "license": "MIT", - "optional": true, - "os": [ - "linux" - ], - "engines": { - "node": ">= 10" - } - }, - "node_modules/@boundaryml/baml-linux-arm64-musl": { - "version": "0.221.0", - "resolved": "https://registry.npmjs.org/@boundaryml/baml-linux-arm64-musl/-/baml-linux-arm64-musl-0.221.0.tgz", - "integrity": "sha512-6RIkHCViXQEsn6Ts5Uk9c6SDgokkXGO4GkoHpoNnKluTJtuB/B2nUOv2O147GFDqtspFDL2jk5d+oiYibfMn0g==", - "cpu": [ - "arm64" - ], - "license": "MIT", - "optional": true, - "os": [ - "linux" - ], - "engines": { - "node": ">= 10" - } - }, - "node_modules/@boundaryml/baml-linux-x64-gnu": { - "version": "0.221.0", - "resolved": "https://registry.npmjs.org/@boundaryml/baml-linux-x64-gnu/-/baml-linux-x64-gnu-0.221.0.tgz", - "integrity": "sha512-YoOz6N6E37UE4ULRCe24P/Ov2pNxjvI4R+I6Bwhkqdt5HOGsJrf2uJUSC+XxKZpkPqlbo1gGZPoCB0lcyeSkeA==", - "cpu": [ - "x64" - ], - "license": "MIT", - "optional": true, - "os": [ - "linux" - ], - "engines": { - "node": ">= 10" - } - }, - "node_modules/@boundaryml/baml-linux-x64-musl": { - "version": "0.221.0", - "resolved": "https://registry.npmjs.org/@boundaryml/baml-linux-x64-musl/-/baml-linux-x64-musl-0.221.0.tgz", - "integrity": "sha512-gY67VRXrixgTenDtDzVSMo0GjLbeofGtCZuArfiDgCglfJ5/KGBSgwzqrrTuyUVLGK902NmCaYA5OrPSXezSzg==", - "cpu": [ - "x64" - ], - "license": "MIT", - "optional": true, - "os": [ - "linux" - ], - "engines": { - "node": ">= 10" - } - }, - "node_modules/@boundaryml/baml-win32-arm64-msvc": { - "version": "0.221.0", - "resolved": "https://registry.npmjs.org/@boundaryml/baml-win32-arm64-msvc/-/baml-win32-arm64-msvc-0.221.0.tgz", - "integrity": 
"sha512-pTHPv6GVlW7nLVszgm7P7+PdQ97JJ8xnRp3/TeP/ya5z08wKi0ejOInLzElMyZVTB+XY707qGlM9CreJnDH3vg==", - "cpu": [ - "arm64" - ], - "license": "MIT", - "optional": true, - "os": [ - "win32" - ], - "engines": { - "node": ">= 10" - } - }, - "node_modules/@boundaryml/baml-win32-x64-msvc": { - "version": "0.221.0", - "resolved": "https://registry.npmjs.org/@boundaryml/baml-win32-x64-msvc/-/baml-win32-x64-msvc-0.221.0.tgz", - "integrity": "sha512-XP3CxwsYxOZAOzkWqZd2Dg8iNpDOMrbA/Bz3nqI7oX/wL+ZMkHJwjWQwxIVL+sg2rp+TceV+21UPb6LTmt+qJw==", - "cpu": [ - "x64" - ], - "license": "MIT", - "optional": true, - "os": [ - "win32" - ], - "engines": { - "node": ">= 10" - } - }, - "node_modules/@scarf/scarf": { - "version": "1.4.0", - "resolved": "https://registry.npmjs.org/@scarf/scarf/-/scarf-1.4.0.tgz", - "integrity": "sha512-xxeapPiUXdZAE3che6f3xogoJPeZgig6omHEy1rIY5WVsB3H2BHNnZH+gHG6x91SCWyQCzWGsuL2Hh3ClO5/qQ==", - "hasInstallScript": true, - "license": "Apache-2.0" - }, - "node_modules/typescript": { - "version": "5.9.3", - "resolved": "https://registry.npmjs.org/typescript/-/typescript-5.9.3.tgz", - "integrity": "sha512-jl1vZzPDinLr9eUt3J/t7V6FgNEw9QjvBPdysz9KfQDD41fQrC2Y4vKQdiaUpFT4bXlb1RHhLpp8wtm6M5TgSw==", - "license": "Apache-2.0", - "bin": { - "tsc": "bin/tsc", - "tsserver": "bin/tsserver" - }, - "engines": { - "node": ">=14.17" - } - } - } -} diff --git a/package.json b/package.json index 0ebcd73..944dfe8 100644 --- a/package.json +++ b/package.json @@ -1,15 +1,20 @@ { "name": "@lightfastai/skills", "private": true, + "packageManager": "bun@1.3.9", "type": "module", "scripts": { - "baml:generate:foundation": "npx baml-cli generate --from ./skills/foundation-creator/baml_src", - "baml:generate:spec": "npx baml-cli generate --from ./skills/spec-creator/baml_src", - "eval:foundation": "node ./scripts/run-baml-eval.mjs foundation-creator", - "eval:spec": "node ./scripts/run-baml-eval.mjs spec-creator" + "with-env": "dotenv -e .env --", + "baml:generate:foundation": "bunx baml-cli 
generate --from ./skills/foundation-creator/baml_src", + "baml:generate:spec": "bunx baml-cli generate --from ./skills/spec-creator/baml_src", + "eval:foundation": "bun run with-env -- bun run ./scripts/run-baml-eval.mjs foundation-creator", + "eval:spec": "bun run with-env -- bun run ./scripts/run-baml-eval.mjs spec-creator" }, "dependencies": { "@boundaryml/baml": "0.221.0", "typescript": "5.9.3" + }, + "devDependencies": { + "dotenv-cli": "^8.0.0" } } diff --git a/scripts/run-baml-eval.mjs b/scripts/run-baml-eval.mjs index af18cec..039d4a6 100644 --- a/scripts/run-baml-eval.mjs +++ b/scripts/run-baml-eval.mjs @@ -163,8 +163,8 @@ async function main() { fail(`Skill '${skillName}' does not declare a supported runner_contract.`); } - if (!process.env.OPENAI_API_KEY) { - fail("OPENAI_API_KEY is required to execute BAML evals with the current client configuration."); + if (!process.env.AI_GATEWAY_API_KEY) { + fail("AI_GATEWAY_API_KEY is required to execute BAML evals."); } await ensureFreshClient(skillRoot); @@ -172,17 +172,21 @@ async function main() { const { b } = generated; const packet = await buildPacket(evalEntry, evalsDir, runner.packet_type); - const compileFn = b[runner.compile_brief_function]; - const renderFn = b[runner.render_document_function]; - const evaluateFn = b[runner.evaluate_document_function]; - - if (!compileFn || !renderFn || !evaluateFn) { + const compileFnName = runner.compile_brief_function; + const renderFnName = runner.render_document_function; + const evaluateFnName = runner.evaluate_document_function; + + if ( + typeof b[compileFnName] !== "function" || + typeof b[renderFnName] !== "function" || + typeof b[evaluateFnName] !== "function" + ) { fail(`Generated client is missing one or more runner functions for '${skillName}'.`); } - const brief = await compileFn(packet); - const candidateDocument = await renderFn(brief); - const report = await evaluateFn(packet, candidateDocument); + const brief = await b[compileFnName](packet); + const 
candidateDocument = await b[renderFnName](brief); + const report = await b[evaluateFnName](packet, candidateDocument); const timestamp = new Date().toISOString().replace(/[:.]/g, "-"); const runDir = path.join(skillRoot, "evals", "runs", `${timestamp}-${evalEntry.eval_name}`); diff --git a/skills/foundation-creator/baml_src/clients.baml b/skills/foundation-creator/baml_src/clients.baml index cd662f1..f37af5b 100644 --- a/skills/foundation-creator/baml_src/clients.baml +++ b/skills/foundation-creator/baml_src/clients.baml @@ -1,8 +1,9 @@ client EvalModel { provider "openai-responses" options { - api_key env.OPENAI_API_KEY - model "gpt-5-mini" + api_key env.AI_GATEWAY_API_KEY + base_url "https://ai-gateway.vercel.sh/v1" + model "openai/gpt-5-mini" reasoning { effort "medium" } diff --git a/skills/foundation-creator/baml_src/foundation_compiler/compiler_functions.baml b/skills/foundation-creator/baml_src/foundation_compiler/compiler_functions.baml index 0b2b6c0..e5c720e 100644 --- a/skills/foundation-creator/baml_src/foundation_compiler/compiler_functions.baml +++ b/skills/foundation-creator/baml_src/foundation_compiler/compiler_functions.baml @@ -1,6 +1,7 @@ function ExtractClaims(raw_notes: string) -> Claim[] { client EvalModel prompt #" + {{ _.role("user") }} Extract atomic claims from the raw notes below. Rules: @@ -23,6 +24,7 @@ function ExtractClaims(raw_notes: string) -> Claim[] { function BuildFoundationKernel(claims: Claim[]) -> FoundationKernel { client EvalModel prompt #" + {{ _.role("user") }} Build a stable foundation kernel from the extracted claims below. Rules: @@ -41,6 +43,7 @@ function BuildFoundationKernel(claims: Claim[]) -> FoundationKernel { function CritiqueFoundationKernel(kernel: FoundationKernel) -> FoundationCritique { client EvalModel prompt #" + {{ _.role("user") }} Critique the foundation kernel below. 
Rules: @@ -63,6 +66,7 @@ function CompileFoundationBrief( ) -> FoundationBrief { client EvalModel prompt #" + {{ _.role("user") }} Compile a concise foundation brief from the kernel and critique below. Rules: @@ -84,6 +88,7 @@ function CompileFoundationBrief( function RenderFoundationCreatorPrompt(brief: FoundationBrief) -> string { client EvalModel prompt #" + {{ _.role("user") }} Render a prompt for the `foundation-creator` skill using the brief below. Rules: diff --git a/skills/foundation-creator/baml_src/foundation_compiler/eval_runner.baml b/skills/foundation-creator/baml_src/foundation_compiler/eval_runner.baml index 72df02a..3193812 100644 --- a/skills/foundation-creator/baml_src/foundation_compiler/eval_runner.baml +++ b/skills/foundation-creator/baml_src/foundation_compiler/eval_runner.baml @@ -1,6 +1,7 @@ function CompileFoundationBriefFromPacket(packet: FoundationEvalPacket) -> FoundationBrief { client EvalModel prompt #" + {{ _.role("user") }} Compile a foundation brief from the evaluation packet below. Task prompt: @@ -25,6 +26,7 @@ function CompileFoundationBriefFromPacket(packet: FoundationEvalPacket) -> Found function RenderFoundationDocumentDraft(brief: FoundationBrief) -> string { client EvalModel prompt #" + {{ _.role("user") }} Draft a top-level foundation document from the brief below. Rules: @@ -44,6 +46,7 @@ function EvaluateFoundationDocument( ) -> EvalReport { client EvalModel prompt #" + {{ _.role("user") }} Evaluate the candidate foundation document against the evaluation packet. 
Packet: diff --git a/skills/spec-creator/baml_src/clients.baml b/skills/spec-creator/baml_src/clients.baml index cd662f1..f37af5b 100644 --- a/skills/spec-creator/baml_src/clients.baml +++ b/skills/spec-creator/baml_src/clients.baml @@ -1,8 +1,9 @@ client EvalModel { provider "openai-responses" options { - api_key env.OPENAI_API_KEY - model "gpt-5-mini" + api_key env.AI_GATEWAY_API_KEY + base_url "https://ai-gateway.vercel.sh/v1" + model "openai/gpt-5-mini" reasoning { effort "medium" } diff --git a/skills/spec-creator/baml_src/spec_compiler/compiler_functions.baml b/skills/spec-creator/baml_src/spec_compiler/compiler_functions.baml index 5fbe8b0..83ba3a7 100644 --- a/skills/spec-creator/baml_src/spec_compiler/compiler_functions.baml +++ b/skills/spec-creator/baml_src/spec_compiler/compiler_functions.baml @@ -1,6 +1,7 @@ function ExtractClaims(raw_notes: string) -> Claim[] { client EvalModel prompt #" + {{ _.role("user") }} Extract atomic claims from the raw notes below. Rules: @@ -23,6 +24,7 @@ function ExtractClaims(raw_notes: string) -> Claim[] { function CompileSpecBrief(raw_notes: string, existing_spec: string?) -> SpecBrief { client EvalModel prompt #" + {{ _.role("user") }} Compile a spec brief from the inputs below. Rules: @@ -45,6 +47,7 @@ function CompileSpecBrief(raw_notes: string, existing_spec: string?) -> SpecBrie function CritiqueSpecBrief(brief: SpecBrief) -> SpecCritique { client EvalModel prompt #" + {{ _.role("user") }} Critique the spec brief below. Rules: @@ -63,6 +66,7 @@ function CritiqueSpecBrief(brief: SpecBrief) -> SpecCritique { function RenderSpecCreatorPrompt(brief: SpecBrief) -> string { client EvalModel prompt #" + {{ _.role("user") }} Render a prompt for the `spec-creator` skill using the brief below. 
Rules: diff --git a/skills/spec-creator/baml_src/spec_compiler/eval_runner.baml b/skills/spec-creator/baml_src/spec_compiler/eval_runner.baml index 068a32a..b94e59a 100644 --- a/skills/spec-creator/baml_src/spec_compiler/eval_runner.baml +++ b/skills/spec-creator/baml_src/spec_compiler/eval_runner.baml @@ -1,6 +1,7 @@ function CompileSpecBriefFromPacket(packet: SpecEvalPacket) -> SpecBrief { client EvalModel prompt #" + {{ _.role("user") }} Compile a spec brief from the evaluation packet below. Task prompt: @@ -28,6 +29,7 @@ function CompileSpecBriefFromPacket(packet: SpecEvalPacket) -> SpecBrief { function RenderSpecDocumentDraft(brief: SpecBrief) -> string { client EvalModel prompt #" + {{ _.role("user") }} Draft a `SPEC.md` from the brief below. Rules: @@ -48,6 +50,7 @@ function EvaluateSpecDocument( ) -> EvalReport { client EvalModel prompt #" + {{ _.role("user") }} Evaluate the candidate `SPEC.md` against the evaluation packet. Packet: From b19bf60f514b6a25e90b003fdd90ab1f1198eaa0 Mon Sep 17 00:00:00 2001 From: Jeevan Pillay <169354619+jeevanpillay@users.noreply.github.com> Date: Mon, 20 Apr 2026 15:46:41 +1000 Subject: [PATCH 03/30] Tighten foundation creator prompts and eval coverage --- skills/foundation-creator/SKILL.md | 55 +++++++- .../compiler_functions.baml | 18 +++ .../foundation_compiler/eval_runner.baml | 16 ++- .../foundation_compiler/foundation_types.baml | 1 + skills/foundation-creator/evals/evals.json | 14 +++ .../fixtures/cloudflare/expected_criteria.md | 21 ++++ .../evals/fixtures/cloudflare/raw_notes.md | 119 ++++++++++++++++++ .../foundation-creator/references/language.md | 63 ++++++++++ .../foundation-creator/references/template.md | 52 ++++++++ 9 files changed, 357 insertions(+), 2 deletions(-) create mode 100644 skills/foundation-creator/evals/fixtures/cloudflare/expected_criteria.md create mode 100644 skills/foundation-creator/evals/fixtures/cloudflare/raw_notes.md create mode 100644 skills/foundation-creator/references/language.md 
create mode 100644 skills/foundation-creator/references/template.md diff --git a/skills/foundation-creator/SKILL.md b/skills/foundation-creator/SKILL.md index e6dbb90..811bb59 100644 --- a/skills/foundation-creator/SKILL.md +++ b/skills/foundation-creator/SKILL.md @@ -18,6 +18,21 @@ product or company primitive. The resulting document is strategic and behavioral, not implementation-level. It should preserve uncertainty where decisions are not yet mature. +This skill is a source-bound documentarian, not a strategy consultant. Its job +is to synthesize what the available material already supports about the +primitive: what it is, what it is not, what durable surfaces exist, and what +remains unresolved. + +## Reference files + +Load on demand, not upfront. + +- `references/template.md` — the allowed section shape for a foundation + document. Read it when drafting a new foundation doc and when checking + whether the output stayed within scope. +- `references/language.md` — wording and restraint rules. Read it before + writing prose and again during the validation pass. + ## Core behavior - Start from thesis and boundaries, not components. @@ -27,6 +42,43 @@ decisions are not yet mature. - Escalate to `spec-creator` only when a subsystem is concrete enough to deserve a `SPEC.md`. +## Allowed content + +- What the primitive is. +- What the primitive is not. +- Durable thesis-level framing. +- Actor model and durable surfaces. +- Strategic bets only when they are clearly supported by the source material. + Frame them as observed directional bets, not recommendations. +- Open questions and unresolved tensions. + +## Forbidden drift + +- Do not invent monetization, revenue models, KPIs, or internal + organizational structure unless the source explicitly states them. +- Do not produce roadmap items, implementation plans, operating cadences, + pilot programs, or execution checklists unless the user explicitly asks. 
+- Do not turn open questions into decision agendas or recommended next steps. +- Do not fill gaps with plausible-sounding business language. Prefer omission + or an explicit unresolved question. +- Do not collapse ambiguous positioning into a single confident frame when the + source material remains mixed. +- Do not assert market leadership, superiority, or competitive differentiation + unless the source explicitly makes that claim and it matters to the + foundation. + +## Validation focus + +Before finalizing, check for these failure modes: + +- unsupported inference +- consulting-style sections (`Success Signals`, `Decision Agenda`, + `Next Steps`, `Operating Guidance`, similar) +- implementation leakage +- business-model speculation +- metrics or operational milestones not present in the source +- missing explicit open questions where the notes remain unsettled + ## Current compiler surface This skill includes typed BAML contracts under `baml_src/foundation_compiler/` @@ -34,7 +86,8 @@ for: - extracting atomic claims from messy notes - compiling a stable foundation kernel -- critiquing ambiguity, contradiction, and implementation leakage +- critiquing ambiguity, contradiction, unsupported inference, and + implementation leakage - compiling a brief suitable for downstream document rendering The BAML layer is schema-first. Prompt wording and document templates can diff --git a/skills/foundation-creator/baml_src/foundation_compiler/compiler_functions.baml b/skills/foundation-creator/baml_src/foundation_compiler/compiler_functions.baml index e5c720e..b4a90d1 100644 --- a/skills/foundation-creator/baml_src/foundation_compiler/compiler_functions.baml +++ b/skills/foundation-creator/baml_src/foundation_compiler/compiler_functions.baml @@ -13,6 +13,8 @@ function ExtractClaims(raw_notes: string) -> Claim[] { - Use `Constraint` for hard operating requirements. - Use `OpenQuestion` when the note is unresolved. - Keep `sources` short and human-readable. 
+ - Do not infer business model, monetization, org structure, metrics, or roadmap items unless explicit in the notes. + - Prefer omission over speculation. Raw notes: {{ raw_notes }} @@ -32,6 +34,8 @@ function BuildFoundationKernel(claims: Claim[]) -> FoundationKernel { - Capture durable thesis-level information. - Do not invent implementation detail. - Use empty lists when a category is not yet supported by the claims. + - Do not infer monetization, pricing, KPIs, org structure, GTM strategy, partnership priorities, or execution plans unless explicitly supported. + - Prefer a shorter kernel over a speculative one. Claims: {{ claims|format(type="yaml") }} @@ -50,7 +54,9 @@ function CritiqueFoundationKernel(kernel: FoundationKernel) -> FoundationCritiqu - Flag contradictions across thesis, boundaries, and bets. - Flag vague claims that should be sharpened before document rendering. - Flag implementation leakage. + - Flag unsupported inferences that are plausible-sounding but not clearly supported by source material. - Flag missing boundaries that create strategic confusion. + - Flag consulting-style drift such as metrics, decision agendas, operating plans, or monetization claims. - Ask only high-leverage clarification questions. Kernel: @@ -74,6 +80,12 @@ function CompileFoundationBrief( - Preserve unresolved questions explicitly. - Exclude critique items that were already resolved by the kernel. - Keep the brief compact and high-signal. + - Remove unsupported inferences rather than softening them. + - Prefer omission over consultant-style expansion. + - Do not add business-model, KPI, org, roadmap, or operating-plan language unless source-backed. + - Treat `strategic_bets` as observed directional bets, not recommendations or settled future state. + - If a surface is visible but still in transition, preserve that qualification in the surrounding summary or questions. 
+ - Avoid market-leadership or competitive-superiority language unless directly supported by source material. Kernel: {{ kernel|format(type="yaml") }} @@ -96,6 +108,12 @@ function RenderFoundationCreatorPrompt(brief: FoundationBrief) -> string { - Preserve strategic ambiguity where the brief leaves open questions. - Avoid implementation detail. - Emphasize thesis, boundaries, actor model, surfaces, and strategic bets. + - State that the writer is a source-bound synthesizer, not a strategy consultant. + - Require exactly these sections unless the user asks otherwise: `What This Is`, `Core Thesis`, `Boundaries`, `Actor Model`, `Durable Surfaces`, `Strategic Bets`, `Open Questions`. + - Forbid extra sections like `Success Signals`, `Metrics`, `Decision Agenda`, `Next Steps`, `Operating Guidance`, or `Roadmap`. + - Require `Strategic Bets` to be phrased as observed directional bets rather than prescriptions. + - Require recently emerging or transitional surfaces to be qualified explicitly rather than flattened as fully settled. + - Forbid market-leadership or superiority claims unless they are explicit in the brief. - Make the prompt directly usable by an agent. Brief: diff --git a/skills/foundation-creator/baml_src/foundation_compiler/eval_runner.baml b/skills/foundation-creator/baml_src/foundation_compiler/eval_runner.baml index 3193812..7760d3d 100644 --- a/skills/foundation-creator/baml_src/foundation_compiler/eval_runner.baml +++ b/skills/foundation-creator/baml_src/foundation_compiler/eval_runner.baml @@ -18,6 +18,8 @@ function CompileFoundationBriefFromPacket(packet: FoundationEvalPacket) -> Found - Preserve ambiguity where the notes do not settle the framing. - Treat expected criteria as evaluation guidance, not as license to invent. - Avoid implementation detail. + - Do not infer monetization, metrics, org structure, GTM strategy, or operating plans unless the packet explicitly supports them. + - Prefer omission over speculation. 
{{ ctx.output_format }} "# @@ -30,10 +32,17 @@ function RenderFoundationDocumentDraft(brief: FoundationBrief) -> string { Draft a top-level foundation document from the brief below. Rules: - - Write a durable strategic document, not a `SPEC.md`. + - Write a durable foundation document, not a `SPEC.md`, strategy memo, roadmap, or operating plan. - Start from thesis and boundaries, not architecture. - Preserve unresolved questions explicitly. - Avoid implementation detail. + - Stay source-bound: do not invent monetization, KPIs, org structure, partnerships, operating guidance, or next-step plans. + - Prefer omission over plausible-sounding speculation. + - Use exactly these sections and no others: `What This Is`, `Core Thesis`, `Boundaries`, `Actor Model`, `Durable Surfaces`, `Strategic Bets`, `Open Questions`. + - If `Strategic Bets` is weakly supported, keep it short rather than expanding it. + - Phrase `Strategic Bets` as observed directional bets, not recommendations or settled future state. + - When a surface is visible but still evolving in the packet, qualify it explicitly as emerging, evolving, or unsettled. + - Do not use market-leadership or competitive-superiority language unless the packet explicitly supports it. Brief: {{ brief|format(type="yaml") }} @@ -59,6 +68,11 @@ function EvaluateFoundationDocument( - Grade against the expected criteria explicitly. - Reward preservation of uncertainty when the source packet is genuinely mixed. - Penalize invented certainty, invented capabilities, or implementation leakage. + - Penalize unsupported business-model, monetization, KPI, org, partnership, or operating-plan language. + - Penalize consulting-style sections such as `Success Signals`, `Metrics`, `Decision Agenda`, `Next Steps`, `Operating Guidance`, or similar drift. + - Penalize `Strategic Bets` phrased as recommendations or settled conclusions when the packet only supports directional evidence. 
+ - Penalize flattening transitional surfaces as fully settled if the packet presents them as evolving. + - Penalize market-leadership or competitive-superiority claims not explicitly supported by the packet. - Use `Pass`, `Partial`, or `Fail` for each criterion. {{ ctx.output_format }} diff --git a/skills/foundation-creator/baml_src/foundation_compiler/foundation_types.baml b/skills/foundation-creator/baml_src/foundation_compiler/foundation_types.baml index 16ffa0f..002c42d 100644 --- a/skills/foundation-creator/baml_src/foundation_compiler/foundation_types.baml +++ b/skills/foundation-creator/baml_src/foundation_compiler/foundation_types.baml @@ -15,6 +15,7 @@ class FoundationCritique { contradictions string[] vague_claims string[] implementation_leaks string[] + unsupported_inferences string[] missing_boundaries string[] leverage_questions string[] } diff --git a/skills/foundation-creator/evals/evals.json b/skills/foundation-creator/evals/evals.json index d0eae13..b44aefd 100644 --- a/skills/foundation-creator/evals/evals.json +++ b/skills/foundation-creator/evals/evals.json @@ -21,6 +21,20 @@ "files": [ "fixtures/vercel/raw_notes.md" ] + }, + { + "id": 1, + "eval_name": "create-foundation-from-cloudflare-source-packet", + "prompt": "Use the source packet in `fixtures/cloudflare/raw_notes.md` to draft a top-level foundation document for Cloudflare. Preserve the tension between the connectivity cloud, developer platform, and AI/agents platform framings. 
Do not produce a `SPEC.md`, implementation plan, or architecture diagram.", + "expected_output": "A top-level foundation document that frames Cloudflare as a unified platform spanning security/connectivity, developer infrastructure, and AI surfaces; preserves the tension between `connectivity cloud` and developer/AI platform identities; identifies durable surfaces like network/security control plane, developer runtime, AI infrastructure, and platform-building primitives; clarifies boundaries against generic hyperscaler or single-product framings; and preserves open questions rather than inventing certainty.", + "expected_file": "fixtures/cloudflare/expected_criteria.md", + "packet_files": { + "raw_notes": "fixtures/cloudflare/raw_notes.md", + "expected_criteria": "fixtures/cloudflare/expected_criteria.md" + }, + "files": [ + "fixtures/cloudflare/raw_notes.md" + ] } ] } diff --git a/skills/foundation-creator/evals/fixtures/cloudflare/expected_criteria.md b/skills/foundation-creator/evals/fixtures/cloudflare/expected_criteria.md new file mode 100644 index 0000000..e8e2601 --- /dev/null +++ b/skills/foundation-creator/evals/fixtures/cloudflare/expected_criteria.md @@ -0,0 +1,21 @@ +# Expected Criteria + +- The output should identify Cloudflare as more than CDN/perimeter security: + it should recognize a unified platform spanning connectivity, security, + development, and AI-related surfaces. +- The output should preserve the tension between the enterprise-scale + `connectivity cloud` framing and the `Developer Platform` / AI agent + platform framing instead of collapsing Cloudflare into only one of those. +- The output should identify multiple durable surfaces such as network/security + control plane, developer runtime/platform, AI infrastructure or agents, and + platform-building or multitenant support. 
+- The output should include an actor model that covers at least enterprise or + security/network teams and developers; it should ideally also recognize + platform builders or AI agents/tools as meaningful actors. +- The output should set clear boundaries: + not a generic hyperscaler/IaaS, not just a security product suite, and not + just an AI agent runtime. +- The output should preserve at least one open question or strategic bet about + how the company's center of gravity is evolving. +- The output should not invent internal org structure, revenue model, + financial claims, or unsupported product lines beyond the packet. diff --git a/skills/foundation-creator/evals/fixtures/cloudflare/raw_notes.md b/skills/foundation-creator/evals/fixtures/cloudflare/raw_notes.md new file mode 100644 index 0000000..1fd059e --- /dev/null +++ b/skills/foundation-creator/evals/fixtures/cloudflare/raw_notes.md @@ -0,0 +1,119 @@ +# Cloudflare Source Packet + +Assembled on April 20, 2026 from official Cloudflare sources. + +This packet is intentionally paraphrased. It is meant to test whether +`foundation-creator` can handle a company that spans enterprise security and +connectivity, developer infrastructure, and newer AI/agent surfaces without +flattening that breadth into a single overconfident label. + +## Source 1 + +- URL: [https://www.cloudflare.com/connectivity-cloud/](https://www.cloudflare.com/connectivity-cloud/) +- Accessed: April 20, 2026 +- Cloudflare describes itself through the `connectivity cloud`. +- The page says the platform is unified across security, connectivity, and + development. +- The page emphasizes one network, one control plane, global scale, + resilience, composable programmable services, and a simplified management + interface. +- The primary framing appears enterprise-oriented: reduce complexity, improve + security, increase performance, and accelerate digital projects. 
+ +## Source 2 + +- URL: [https://developers.cloudflare.com/](https://developers.cloudflare.com/) +- Accessed: April 20, 2026 +- The docs homepage says the `Cloudflare Developer Platform` provides a + serverless execution environment for building new applications or augmenting + existing ones without maintaining infrastructure. +- The same docs surface groups products into `Developer Products`, + `AI Products`, and `Cloudflare One` products. +- The page frames Cloudflare not only as a security/network vendor but also as + a place to build software directly. + +## Source 3 + +- URL: [https://developers.cloudflare.com/workers/](https://developers.cloudflare.com/workers/) +- Last updated: April 15, 2026 +- Workers is described as a serverless platform for building, deploying, and + scaling apps across Cloudflare's global network with no infrastructure to + manage. +- The Workers docs position the platform as full-stack, globally distributed, + and language-flexible. +- The product surface includes front-end applications, back-end applications, + serverless AI inference, background jobs, observability, and integrations + with storage and compute products like Durable Objects, D1, KV, Queues, + Workers AI, Workflows, Vectorize, and R2. + +## Source 4 + +- URL: [https://developers.cloudflare.com/ai/](https://developers.cloudflare.com/ai/) +- Last updated: April 16, 2026 +- Cloudflare AI is described as a unified platform for running AI models, + whether hosted on Cloudflare infrastructure via Workers AI or proxied + through AI Gateway to external providers. +- Related AI products include Workers AI, AI Gateway, Vectorize, Agents, + AI Search, AI Crawl Control, Browser Rendering, and Cloudflare Agent. +- This suggests Cloudflare now treats AI as a first-class product surface + within the platform. 
+ +## Source 5 + +- URL: [https://developers.cloudflare.com/agents/](https://developers.cloudflare.com/agents/) +- Last updated: April 14, 2026 +- The Agents docs say real agents need memory, scheduling, tool use, + coordination, and persistent state. +- The Agents SDK is built around Durable Objects and positions Cloudflare as a + place to run long-lived, stateful, globally distributed agents. +- The docs say agents can use and serve tools through MCP, schedule tasks, + coordinate workflows, browse the web, and connect to AI models including + Workers AI and external providers. +- This is a stronger framing than "AI inference only"; it pushes Cloudflare + toward an agent runtime platform. + +## Source 6 + +- URL: [https://developers.cloudflare.com/cloudflare-for-platforms/](https://developers.cloudflare.com/cloudflare-for-platforms/) +- Last updated: December 15, 2025 +- `Cloudflare for Platforms` says customers can offer Cloudflare's own + products and functionality to their own customers inside their own product. +- The page emphasizes custom domains/subdomains, isolation and multitenancy, + programmable routing/ingress/egress, storage and databases, and ability to + deploy millions of applications and domains. +- The docs explicitly mention deploying an AI vibe coding platform as a starter + use case. +- This suggests Cloudflare is not only a platform for direct customers; it is + also a substrate for other platforms. + +## Source 7 + +- URL: [https://www.cloudflare.com/press/press-releases/2025/cloudflare-accelerates-ai-agent-development-remote-mcp/](https://www.cloudflare.com/press/press-releases/2025/cloudflare-accelerates-ai-agent-development-remote-mcp/) +- Published: April 7, 2025 +- Cloudflare announced new offerings for AI agent development, including a + remote MCP server, durable Workflows, and Durable Objects free tier. +- The press release says Cloudflare's developer platform and global network are + the best place to build and deploy AI agents. 
+- The launch framing expands Cloudflare beyond web performance/security into an + opinionated platform for agent development. + +## Source 8 + +- URL: [https://developers.cloudflare.com/agents/model-context-protocol/mcp-servers-for-cloudflare/](https://developers.cloudflare.com/agents/model-context-protocol/mcp-servers-for-cloudflare/) +- Accessed: April 20, 2026 +- Cloudflare documents its own MCP servers, including product-specific servers + and a docs server. +- This suggests Cloudflare is not only supporting MCP as a standard for others + but actively using it across its own product/API surface. + +## Tensions and questions the evaluator should preserve + +- Cloudflare uses the enterprise-scale `connectivity cloud` framing while also + maintaining a distinct `Developer Platform` identity. +- The company spans security, networking, performance, developer runtime, + AI infrastructure, agents, and platform-building primitives. +- A good foundation document should avoid flattening Cloudflare into just a CDN, + just Zero Trust/security, or just a developer runtime. +- AI and agents appear increasingly central, but the packet does not fully + settle whether they are an extension of the connectivity cloud, a new primary + platform identity, or one major layer within a broader company primitive. diff --git a/skills/foundation-creator/references/language.md b/skills/foundation-creator/references/language.md new file mode 100644 index 0000000..d51ba80 --- /dev/null +++ b/skills/foundation-creator/references/language.md @@ -0,0 +1,63 @@ +# Foundation Language Guide + +How the foundation document should be worded. + +## 1. Role + +- The document is a source-bound synthesis. +- It names durable framing, boundaries, actors, surfaces, bets, and + unresolved questions. +- It does not act like a consultant memo, board brief, or operating plan. + +## 2. Voice and Tense + +- Present tense, active voice. +- Third person only. Never "we" or "you". 
+- Declarative statements over persuasive rhetoric. +- Prefer short, dense paragraphs and compact bullets. + +## 3. Restraint Rules + +- Prefer omission over invention. +- If a point is plausible but not supported, omit it or convert it into an + open question. +- If the source material is mixed, preserve the tension explicitly. +- Do not infer monetization, pricing, revenue, GTM segmentation, org design, + partnership priorities, KPIs, or timelines unless directly supported. +- Do not write recommendations, action items, or decision deadlines unless the + user explicitly asks for them. +- Do not assert market leadership, competitive superiority, or winner/loser + framing unless directly supported by the source packet. + +## 4. Allowed Section Behavior + +- `What This Is` explains the primitive at a durable level. +- `Core Thesis` contains only source-backed, thesis-level claims. +- `Boundaries` should be explicit and contrastive. +- `Actor Model` names meaningful actors without inventing internal roles. +- `Durable Surfaces` names persistent product or platform surfaces, not + implementation components. If a surface is source-visible but still in + transition, qualify it explicitly as emerging or evolving. +- `Strategic Bets` should be minimal and clearly grounded in repeated signals. + Phrase them as observed directional bets (`public materials suggest a bet + on...`, `the company appears to be betting on...`) rather than settled + declarations or recommendations. +- `Open Questions` should remain open rather than being quietly resolved in + prose elsewhere. + +## 5. Disallowed Drift + +- No `Success Signals`, KPI, or metrics section. +- No monetization strategy or revenue language. +- No `Decision Agenda`, `Next Steps`, `Operating Guidance`, or milestone plan. +- No implementation components, architecture diagrams, or system design. +- No internal org structure unless explicit in the source. 
+- No market-leadership or competitive-positioning claims unless explicit in the + source. + +## 6. Tone + +- Dense, calm, and specific. +- No hype language. +- No filler ("it is worth noting", "in order to", "clearly", "obviously"). +- No false certainty when the source packet is mixed. diff --git a/skills/foundation-creator/references/template.md b/skills/foundation-creator/references/template.md new file mode 100644 index 0000000..7fc4adf --- /dev/null +++ b/skills/foundation-creator/references/template.md @@ -0,0 +1,52 @@ +# {Primitive Name} Foundation + +Use only the sections below unless the user explicitly asks for more. + +## What This Is + +{One short paragraph describing the primitive or company/product foundation.} + +## Core Thesis + +- {Durable belief supported by the source material.} +- {Another source-backed thesis.} + +## Boundaries + +- {What this is not.} +- {Explicit limit of scope or category.} + +## Actor Model + +- {Primary actor and relationship to the primitive.} +- {Secondary actor when source-backed.} + +## Durable Surfaces + +- {Surface area that persists across implementations or product shifts.} +- {Another durable surface.} + +## Strategic Bets + +- {Only if clearly supported by the material.} +- {Use fewer bullets rather than speculative ones.} + +## Open Questions + +- {Unresolved tension or ambiguity the source does not settle.} +- {Another open question.} + +## Disallowed Sections + +Do not add sections like: + +- `Success Signals` +- `Metrics` +- `Decision Agenda` +- `Next Steps` +- `Operating Guidance` +- `Implementation Plan` +- `Roadmap` +- `Partnership Strategy` + +Those belong to downstream planning artifacts, not the foundation document. 
From 2679aeaee753b4b64a975b2f0f696e0aff21aa85 Mon Sep 17 00:00:00 2001 From: Jeevan Pillay <169354619+jeevanpillay@users.noreply.github.com> Date: Mon, 20 Apr 2026 15:56:28 +1000 Subject: [PATCH 04/30] Add deterministic checks and benchmarks to eval runner --- README.md | 13 + scripts/run-baml-eval.mjs | 542 ++++++++++++++++++++- skills/foundation-creator/evals/evals.json | 6 + skills/spec-creator/evals/evals.json | 6 + 4 files changed, 541 insertions(+), 26 deletions(-) diff --git a/README.md b/README.md index 4a628c3..08353d9 100644 --- a/README.md +++ b/README.md @@ -29,11 +29,24 @@ This repo now includes BAML-backed fixture evals for `foundation-creator` and bun install bun run eval:foundation -- create-foundation-from-vercel-source-packet bun run eval:spec -- create-from-vercel-mcp-source-packet +bun run with-env -- bun run ./scripts/run-baml-eval.mjs foundation-creator create-foundation-from-cloudflare-source-packet --trials 3 ``` Each run writes packet, brief, candidate document, and evaluation report artifacts under `skills//evals/runs/`. +The runner now also writes: + +- `deterministic_checks.json` — reference-driven checks derived from the skill's + `template.md` and `language.md` +- `timing.json` — per-stage local timing +- `summary.json` — per-trial LLM status + combined status +- `benchmark.json` — aggregated status counts and timing summaries across all + trials + +When `--trials N` is used, the run directory contains `trial-1/`, `trial-2/`, +... plus a top-level `benchmark.json`. + `bun run eval:*` loads `.env` automatically through `dotenv-cli`, so `AI_GATEWAY_API_KEY` can live in the repo-local `.env` without manual `source` steps. 
diff --git a/scripts/run-baml-eval.mjs b/scripts/run-baml-eval.mjs index 039d4a6..f64f4ab 100644 --- a/scripts/run-baml-eval.mjs +++ b/scripts/run-baml-eval.mjs @@ -12,6 +12,57 @@ function fail(message) { process.exit(1); } +const STATUS_RANK = { + Pass: 0, + Partial: 1, + Fail: 2, +}; + +function normalizeLine(line) { + return line.trim().replace(/\s+/g, " "); +} + +function normalizeHeading(line) { + return normalizeLine(line) + .replace(/^#+\s*/, "") + .replace(/\s+/g, " ") + .trim(); +} + +function summarizeNumeric(values) { + if (values.length === 0) { + return { + mean: 0, + min: 0, + max: 0, + }; + } + + const total = values.reduce((sum, value) => sum + value, 0); + return { + mean: Math.round(total / values.length), + min: Math.min(...values), + max: Math.max(...values), + }; +} + +function worstStatus(statuses) { + return statuses.reduce((worst, current) => { + if (!worst) { + return current; + } + return STATUS_RANK[current] > STATUS_RANK[worst] ? current : worst; + }, null); +} + +function hasPronounDrift(document) { + return /\b(we|our|ours|us|you|your|yours)\b/i.test(document); +} + +function hasUppercaseObligationKeyword(document) { + return /\b(MUST|SHOULD|MAY)\b/.test(document); +} + function runCommand(command, args, cwd) { return new Promise((resolve, reject) => { const child = spawn(command, args, { @@ -40,6 +91,36 @@ async function loadText(filePath) { return readFile(filePath, "utf8"); } +function parseArgs(argv) { + const positionals = []; + let trials = 1; + + for (let index = 0; index < argv.length; index += 1) { + const arg = argv[index]; + + if (arg === "--trials") { + const next = argv[index + 1]; + if (!next) { + fail("Missing value after --trials."); + } + trials = Number.parseInt(next, 10); + if (!Number.isInteger(trials) || trials < 1) { + fail("--trials must be a positive integer."); + } + index += 1; + continue; + } + + positionals.push(arg); + } + + return { + skillName: positionals[0], + selector: positionals[1], + trials, + }; +} 
+ function getEvalBySelector(evals, selector) { if (!selector) { if (evals.length === 1) { @@ -66,7 +147,7 @@ function getEvalBySelector(evals, selector) { async function generateClient(skillRoot) { const bamlSrc = path.join(skillRoot, "baml_src"); - await runCommand("npx", ["baml-cli", "generate", "--from", bamlSrc], repoRoot); + await runCommand("bunx", ["baml-cli", "generate", "--from", bamlSrc], repoRoot); } async function importGeneratedClient(skillRoot) { @@ -129,7 +210,7 @@ async function ensureFreshClient(skillRoot) { "utf8", ); try { - await runCommand("npx", ["tsc", "--project", tsconfigPath], repoRoot); + await runCommand("bunx", ["tsc", "--project", tsconfigPath], repoRoot); } finally { await rm(tsconfigPath, { force: true }); } @@ -144,33 +225,282 @@ async function writeRunArtifacts(runDir, artifacts) { } } -async function main() { - const skillName = process.argv[2]; - const selector = process.argv[3]; +function createCheck(id, passed, details) { + return { id, passed, details }; +} - if (!skillName) { - fail("Usage: node ./scripts/run-baml-eval.mjs [eval-id-or-name]"); +function extractFoundationTemplateSections(templateText) { + const sections = []; + const lines = templateText.split(/\r?\n/); + + for (const rawLine of lines) { + const line = normalizeLine(rawLine); + if (line === "## Disallowed Sections") { + break; + } + if (line.startsWith("## ")) { + sections.push(line.slice(3).trim()); + } } - const skillRoot = path.join(repoRoot, "skills", skillName); - const evalsDir = path.join(skillRoot, "evals"); - const manifestPath = path.join(evalsDir, "evals.json"); - const manifest = await loadJson(manifestPath); - const evalEntry = getEvalBySelector(manifest.evals, selector); - const runner = manifest.runner_contract; + return sections; +} - if (!runner || runner.type !== "baml_pipeline") { - fail(`Skill '${skillName}' does not declare a supported runner_contract.`); +function extractFoundationDisallowedHeadings(templateText) { + const disallowed = 
new Set(); + const lines = templateText.split(/\r?\n/); + let inDisallowedSection = false; + + for (const rawLine of lines) { + const line = normalizeLine(rawLine); + if (line === "## Disallowed Sections") { + inDisallowedSection = true; + continue; + } + if (!inDisallowedSection) { + continue; + } + const match = line.match(/^- `(.+)`$/); + if (match) { + disallowed.add(match[1]); + } } - if (!process.env.AI_GATEWAY_API_KEY) { - fail("AI_GATEWAY_API_KEY is required to execute BAML evals."); + return disallowed; +} + +function validateFoundationDocument(candidateDocument, templateText) { + const requiredSections = extractFoundationTemplateSections(templateText); + const disallowedHeadings = extractFoundationDisallowedHeadings(templateText); + const lines = candidateDocument.split(/\r?\n/); + const lineMap = new Map(); + + for (const [index, rawLine] of lines.entries()) { + const line = normalizeHeading(rawLine); + if (!line) { + continue; + } + if (!lineMap.has(line)) { + lineMap.set(line, []); + } + lineMap.get(line).push(index); } - await ensureFreshClient(skillRoot); - const generated = await importGeneratedClient(skillRoot); - const { b } = generated; + const missingSections = []; + const duplicateSections = []; + const positions = []; + + for (const section of requiredSections) { + const matches = lineMap.get(section) ?? []; + if (matches.length === 0) { + missingSections.push(section); + continue; + } + if (matches.length > 1) { + duplicateSections.push(section); + } + positions.push({ + section, + index: matches[0], + }); + } + + const orderIsCorrect = positions.every((entry, index) => { + if (index === 0) { + return true; + } + return entry.index > positions[index - 1].index; + }); + + const emptySections = []; + for (let index = 0; index < positions.length; index += 1) { + const current = positions[index]; + const next = positions[index + 1]; + const start = current.index + 1; + const end = next ? 
next.index : lines.length; + const sectionBody = lines.slice(start, end).join("\n").trim(); + if (!sectionBody) { + emptySections.push(current.section); + } + } + + const presentDisallowedSections = [...disallowedHeadings].filter((section) => + (lineMap.get(section) ?? []).length > 0, + ); + + return [ + createCheck( + "required_sections_present_once", + missingSections.length === 0 && duplicateSections.length === 0, + missingSections.length === 0 && duplicateSections.length === 0 + ? `All required sections from template are present exactly once: ${requiredSections.join(", ")}.` + : `Missing: ${missingSections.join(", ") || "none"}. Duplicate: ${duplicateSections.join(", ") || "none"}.`, + ), + createCheck( + "required_sections_in_template_order", + missingSections.length === 0 && orderIsCorrect, + missingSections.length > 0 + ? "Section order check skipped because one or more required sections are missing." + : orderIsCorrect + ? "Required sections follow the template order." + : "Required sections are present but not in template order.", + ), + createCheck( + "required_sections_nonempty", + emptySections.length === 0, + emptySections.length === 0 + ? "Every required section has non-empty content." + : `Empty sections: ${emptySections.join(", ")}.`, + ), + createCheck( + "no_disallowed_sections", + presentDisallowedSections.length === 0, + presentDisallowedSections.length === 0 + ? "No disallowed downstream-planning sections were detected." + : `Disallowed sections present: ${presentDisallowedSections.join(", ")}.`, + ), + createCheck( + "no_first_or_second_person", + !hasPronounDrift(candidateDocument), + !hasPronounDrift(candidateDocument) + ? "No obvious first-person or second-person pronouns detected." 
+ : "Detected first-person or second-person pronouns that violate the language guide.", + ), + ]; +} +function extractSpecMajorSections(templateText) { + const sections = []; + const lines = templateText.split(/\r?\n/); + + for (const rawLine of lines) { + const line = normalizeLine(rawLine); + const match = line.match(/^## \d+\.\s+(.+)$/); + if (match) { + sections.push(match[1]); + } + } + + return sections; +} + +function lineExists(lines, matcher) { + return lines.some((line) => matcher(normalizeHeading(line))); +} + +function validateSpecDocument(candidateDocument, templateText) { + const requiredSections = extractSpecMajorSections(templateText); + const lines = candidateDocument.split(/\r?\n/); + const missingSections = requiredSections.filter( + (section) => !lineExists(lines, (line) => line.toLowerCase() === section.toLowerCase()), + ); + + const hasPurpose = lineExists(lines, (line) => line === "Purpose"); + const hasProblemStatement = lineExists( + lines, + (line) => line.toLowerCase() === "problem statement", + ); + const hasNumberedComponents = /^\d+\.\s+`[^`]+`/m.test(candidateDocument); + const hasFieldFormatting = /- `[^`]+` \([^)]+\)/.test(candidateDocument); + + return [ + createCheck( + "core_sections_present", + missingSections.length === 0, + missingSections.length === 0 + ? `All major template sections are present: ${requiredSections.join(", ")}.` + : `Missing major sections: ${missingSections.join(", ")}.`, + ), + createCheck( + "purpose_heading_present", + hasPurpose, + hasPurpose + ? "Purpose heading is present." + : "Purpose heading is missing.", + ), + createCheck( + "problem_statement_present", + hasProblemStatement, + hasProblemStatement + ? "Problem Statement heading is present." + : "Problem Statement heading is missing.", + ), + createCheck( + "component_list_uses_numbering", + hasNumberedComponents, + hasNumberedComponents + ? "Detected numbered component entries in the spec." + : "Did not detect numbered component entries like `1. 
`Component Name``.", + ), + createCheck( + "domain_fields_use_template_shape", + hasFieldFormatting, + hasFieldFormatting + ? "Detected domain-field lines using the `` `field_name` (type) `` format." + : "Did not detect any domain-field lines using the template field format.", + ), + createCheck( + "no_first_or_second_person", + !hasPronounDrift(candidateDocument), + !hasPronounDrift(candidateDocument) + ? "No obvious first-person or second-person pronouns detected." + : "Detected first-person or second-person pronouns that violate the language guide.", + ), + createCheck( + "obligation_keywords_lowercase", + !hasUppercaseObligationKeyword(candidateDocument), + !hasUppercaseObligationKeyword(candidateDocument) + ? "No uppercase obligation keywords detected." + : "Detected uppercase MUST/SHOULD/MAY, which violates the language guide.", + ), + ]; +} + +async function runDeterministicChecks(skillRoot, validationContract, candidateDocument) { + if (!validationContract || validationContract.type !== "reference_document_checks") { + return { + enabled: false, + overall_pass: true, + checks: [], + }; + } + + const templatePath = path.join(skillRoot, validationContract.template_file); + const languagePath = path.join(skillRoot, validationContract.language_file); + const [templateText, languageText] = await Promise.all([ + loadText(templatePath), + loadText(languagePath), + ]); + + let checks; + switch (validationContract.validator) { + case "foundation-v1": + checks = validateFoundationDocument(candidateDocument, templateText, languageText); + break; + case "spec-v1": + checks = validateSpecDocument(candidateDocument, templateText, languageText); + break; + default: + fail(`Unknown validation contract '${validationContract.validator}'.`); + } + + return { + enabled: true, + validator: validationContract.validator, + overall_pass: checks.every((check) => check.passed), + checks, + }; +} + +async function runSingleTrial({ + evalEntry, + evalsDir, + generated, + runner, + 
skillRoot, + validationContract, +}) { + const { b } = generated; const packet = await buildPacket(evalEntry, evalsDir, runner.packet_type); const compileFnName = runner.compile_brief_function; const renderFnName = runner.render_document_function; @@ -181,24 +511,184 @@ async function main() { typeof b[renderFnName] !== "function" || typeof b[evaluateFnName] !== "function" ) { - fail(`Generated client is missing one or more runner functions for '${skillName}'.`); + fail(`Generated client is missing one or more runner functions for '${path.basename(skillRoot)}'.`); } + const timing = {}; + const startedAt = Date.now(); + + const compileStartedAt = Date.now(); const brief = await b[compileFnName](packet); + timing.compile_ms = Date.now() - compileStartedAt; + + const renderStartedAt = Date.now(); const candidateDocument = await b[renderFnName](brief); + timing.render_ms = Date.now() - renderStartedAt; + + const deterministicStartedAt = Date.now(); + const deterministic_checks = await runDeterministicChecks( + skillRoot, + validationContract, + candidateDocument, + ); + timing.deterministic_ms = Date.now() - deterministicStartedAt; + + const evaluateStartedAt = Date.now(); const report = await b[evaluateFnName](packet, candidateDocument); + timing.evaluate_ms = Date.now() - evaluateStartedAt; + timing.total_ms = Date.now() - startedAt; + + const combined_status = + deterministic_checks.enabled && !deterministic_checks.overall_pass + ? 
worstStatus([report.overall_status, "Fail"]) + : report.overall_status; + + return { + packet, + brief, + candidateDocument, + report, + deterministic_checks, + timing, + summary: { + llm_status: report.overall_status, + combined_status, + deterministic_pass: deterministic_checks.overall_pass, + }, + }; +} +function buildBenchmark(skillName, evalName, trials) { + const judgeStatuses = trials.map((trial) => trial.report.overall_status); + const combinedStatuses = trials.map((trial) => trial.summary.combined_status); + const deterministicPassCount = trials.filter( + (trial) => trial.deterministic_checks.overall_pass, + ).length; + + const checkStats = new Map(); + for (const trial of trials) { + for (const check of trial.deterministic_checks.checks) { + if (!checkStats.has(check.id)) { + checkStats.set(check.id, { + id: check.id, + passed: 0, + total: 0, + last_details: "", + }); + } + const stat = checkStats.get(check.id); + stat.total += 1; + if (check.passed) { + stat.passed += 1; + } + stat.last_details = check.details; + } + } + + return { + skill_name: skillName, + eval_name: evalName, + trial_count: trials.length, + judge_status_counts: { + Pass: judgeStatuses.filter((status) => status === "Pass").length, + Partial: judgeStatuses.filter((status) => status === "Partial").length, + Fail: judgeStatuses.filter((status) => status === "Fail").length, + }, + combined_status_counts: { + Pass: combinedStatuses.filter((status) => status === "Pass").length, + Partial: combinedStatuses.filter((status) => status === "Partial").length, + Fail: combinedStatuses.filter((status) => status === "Fail").length, + }, + benchmark_summary: { + llm_worst_status: worstStatus(judgeStatuses), + combined_worst_status: worstStatus(combinedStatuses), + deterministic_pass_rate: Number((deterministicPassCount / trials.length).toFixed(2)), + }, + timing_ms: { + compile: summarizeNumeric(trials.map((trial) => trial.timing.compile_ms)), + render: summarizeNumeric(trials.map((trial) => 
trial.timing.render_ms)), + deterministic: summarizeNumeric(trials.map((trial) => trial.timing.deterministic_ms)), + evaluate: summarizeNumeric(trials.map((trial) => trial.timing.evaluate_ms)), + total: summarizeNumeric(trials.map((trial) => trial.timing.total_ms)), + }, + deterministic_checks: [...checkStats.values()].map((stat) => ({ + id: stat.id, + pass_rate: Number((stat.passed / stat.total).toFixed(2)), + passed_trials: stat.passed, + total_trials: stat.total, + last_details: stat.last_details, + })), + }; +} + +async function main() { + const { skillName, selector, trials } = parseArgs(process.argv.slice(2)); + + if (!skillName) { + fail( + "Usage: bun run ./scripts/run-baml-eval.mjs [eval-id-or-name] [--trials N]", + ); + } + + const skillRoot = path.join(repoRoot, "skills", skillName); + const evalsDir = path.join(skillRoot, "evals"); + const manifestPath = path.join(evalsDir, "evals.json"); + const manifest = await loadJson(manifestPath); + const evalEntry = getEvalBySelector(manifest.evals, selector); + const runner = manifest.runner_contract; + const validationContract = manifest.validation_contract ?? 
null; + + if (!runner || runner.type !== "baml_pipeline") { + fail(`Skill '${skillName}' does not declare a supported runner_contract.`); + } + + if (!process.env.AI_GATEWAY_API_KEY) { + fail("AI_GATEWAY_API_KEY is required to execute BAML evals."); + } + + await ensureFreshClient(skillRoot); + const generated = await importGeneratedClient(skillRoot); const timestamp = new Date().toISOString().replace(/[:.]/g, "-"); const runDir = path.join(skillRoot, "evals", "runs", `${timestamp}-${evalEntry.eval_name}`); + const trialResults = []; + + for (let trialIndex = 0; trialIndex < trials; trialIndex += 1) { + const trialResult = await runSingleTrial({ + evalEntry, + evalsDir, + generated, + runner, + skillRoot, + validationContract, + }); + trialResults.push(trialResult); + + const trialArtifacts = { + "packet.json": trialResult.packet, + "brief.json": trialResult.brief, + "candidate.md": trialResult.candidateDocument, + "report.json": trialResult.report, + "deterministic_checks.json": trialResult.deterministic_checks, + "timing.json": trialResult.timing, + "summary.json": trialResult.summary, + }; + + if (trials === 1) { + await writeRunArtifacts(runDir, trialArtifacts); + } else { + await writeRunArtifacts(path.join(runDir, `trial-${trialIndex + 1}`), trialArtifacts); + } + } + + const benchmark = buildBenchmark(skillName, evalEntry.eval_name, trialResults); await writeRunArtifacts(runDir, { - "packet.json": packet, - "brief.json": brief, - "candidate.md": candidateDocument, - "report.json": report, + "benchmark.json": benchmark, }); console.log(`Run complete: ${runDir}`); - console.log(`Overall status: ${report.overall_status}`); + console.log(`Trials: ${trialResults.length}`); + console.log(`LLM worst status: ${benchmark.benchmark_summary.llm_worst_status}`); + console.log(`Combined worst status: ${benchmark.benchmark_summary.combined_worst_status}`); } main().catch((error) => { diff --git a/skills/foundation-creator/evals/evals.json 
b/skills/foundation-creator/evals/evals.json index b44aefd..1b5a1b8 100644 --- a/skills/foundation-creator/evals/evals.json +++ b/skills/foundation-creator/evals/evals.json @@ -7,6 +7,12 @@ "render_document_function": "RenderFoundationDocumentDraft", "evaluate_document_function": "EvaluateFoundationDocument" }, + "validation_contract": { + "type": "reference_document_checks", + "validator": "foundation-v1", + "template_file": "references/template.md", + "language_file": "references/language.md" + }, "evals": [ { "id": 0, diff --git a/skills/spec-creator/evals/evals.json b/skills/spec-creator/evals/evals.json index 8c9e5ad..cec004a 100644 --- a/skills/spec-creator/evals/evals.json +++ b/skills/spec-creator/evals/evals.json @@ -7,6 +7,12 @@ "render_document_function": "RenderSpecDocumentDraft", "evaluate_document_function": "EvaluateSpecDocument" }, + "validation_contract": { + "type": "reference_document_checks", + "validator": "spec-v1", + "template_file": "references/template.md", + "language_file": "references/language.md" + }, "evals": [ { "id": 0, From caea9721a1c9ee5c4f936c78a779839764986f7e Mon Sep 17 00:00:00 2001 From: Jeevan Pillay <169354619+jeevanpillay@users.noreply.github.com> Date: Tue, 21 Apr 2026 00:13:44 +1000 Subject: [PATCH 05/30] Tighten evals and expand foundation coverage --- README.md | 13 + evals/TAXONOMY.md | 82 ++++++ scripts/run-baml-eval.mjs | 277 ++++++++++++++++-- .../compiler_functions.baml | 15 +- .../foundation_compiler/eval_runner.baml | 23 +- skills/foundation-creator/evals/evals.json | 64 ++++ .../fixtures/harbor_care/expected_criteria.md | 24 ++ .../evals/fixtures/harbor_care/raw_notes.md | 69 +++++ .../expected_criteria.md | 28 ++ .../lightfast_founder_notes/raw_notes.md | 76 +++++ .../foundation-creator/references/language.md | 31 +- .../foundation-creator/references/template.md | 10 +- .../spec_compiler/compiler_functions.baml | 22 ++ .../baml_src/spec_compiler/eval_runner.baml | 37 ++- 
.../baml_src/spec_compiler/spec_types.baml | 2 + skills/spec-creator/evals/evals.json | 43 +++ 16 files changed, 767 insertions(+), 49 deletions(-) create mode 100644 evals/TAXONOMY.md create mode 100644 skills/foundation-creator/evals/fixtures/harbor_care/expected_criteria.md create mode 100644 skills/foundation-creator/evals/fixtures/harbor_care/raw_notes.md create mode 100644 skills/foundation-creator/evals/fixtures/lightfast_founder_notes/expected_criteria.md create mode 100644 skills/foundation-creator/evals/fixtures/lightfast_founder_notes/raw_notes.md diff --git a/README.md b/README.md index 08353d9..e38a7db 100644 --- a/README.md +++ b/README.md @@ -28,6 +28,7 @@ This repo now includes BAML-backed fixture evals for `foundation-creator` and ```bash bun install bun run eval:foundation -- create-foundation-from-vercel-source-packet +bun run eval:foundation -- create-foundation-from-lightfast-founder-notes bun run eval:spec -- create-from-vercel-mcp-source-packet bun run with-env -- bun run ./scripts/run-baml-eval.mjs foundation-creator create-foundation-from-cloudflare-source-packet --trials 3 ``` @@ -35,6 +36,13 @@ bun run with-env -- bun run ./scripts/run-baml-eval.mjs foundation-creator creat Each run writes packet, brief, candidate document, and evaluation report artifacts under `skills//evals/runs/`. 
+Current `foundation-creator` corpus includes: + +- `create-foundation-from-vercel-source-packet` +- `create-foundation-from-cloudflare-source-packet` +- `create-foundation-from-lightfast-founder-notes` +- `create-foundation-from-harbor-care-source-packet` + The runner now also writes: - `deterministic_checks.json` — reference-driven checks derived from the skill's @@ -44,6 +52,11 @@ The runner now also writes: - `benchmark.json` — aggregated status counts and timing summaries across all trials +Eval manifests also carry lightweight taxonomy metadata +(`scenario_type`, `input_shape`, `ambiguity_level`, `domain_profile`, +`primary_risks`) so benchmark runs can be grouped by failure mode. Shared +taxonomy guidance lives in [`evals/TAXONOMY.md`](evals/TAXONOMY.md). + When `--trials N` is used, the run directory contains `trial-1/`, `trial-2/`, ... plus a top-level `benchmark.json`. diff --git a/evals/TAXONOMY.md b/evals/TAXONOMY.md new file mode 100644 index 0000000..3f843e7 --- /dev/null +++ b/evals/TAXONOMY.md @@ -0,0 +1,82 @@ +# Eval Taxonomy + +This repo tracks eval coverage across a small shared taxonomy so new packets +expand the corpus intentionally instead of growing as one-off examples. + +## Manifest fields + +Each eval entry in `skills/*/evals/evals.json` should declare: + +- `scenario_type` +- `input_shape` +- `ambiguity_level` +- `domain_profile` +- `primary_risks` + +These fields are lightweight metadata. They do not change execution, but they +show up in `benchmark.json` so runs can be grouped by failure mode later. + +## Canonical scenario types + +- `clear_intent_prompt` + - Straightforward create-mode request with explicit scope and components. +- `unstructured_notes_prompt` + - Messy notes, but still mostly service-shaped and not deeply ambiguous. +- `source_packet_transition` + - Curated packet with real-world material that is internally mixed, evolving, + or timeline-sensitive. 
+- `update_existing_doc` + - Existing document is the dominant constraint; success depends on precise + in-place edits without broad drift. +- `founder_notes_ambiguity` + - Highly ambiguous notes where positioning, boundaries, and unresolved + questions matter more than completeness. +- `cross_domain_generalization` + - Non-default domain chosen to test whether the skill overfits to developer + infrastructure examples. + +## Supporting axes + +### `input_shape` + +- `direct_prompt` +- `notes_prompt` +- `source_packet` +- `existing_doc_update` + +### `ambiguity_level` + +- `low` +- `medium` +- `high` + +### `domain_profile` + +- `developer_infrastructure` +- `company_foundation` +- `non_developer_domain` + +### `primary_risks` + +Use a short list of the dominant failure modes for the eval. Current common +values: + +- `template_drift` +- `implementation_leakage` +- `invented_capabilities` +- `invented_certainty` +- `scope_bleed` +- `source_overfitting` +- `weak_boundaries` +- `update_regression` + +## Current expansion priority + +The next missing slices are: + +- `update_existing_doc` for `foundation-creator` once revise-in-place behavior + is defined +- baseline comparison runs (`current skill` vs `previous skill` / `no skill`) + when the local harness is ready to compare deltas directly +- optional Braintrust-style scorer/export integration if local JSON artifacts are + no longer sufficient for experiment tracking diff --git a/scripts/run-baml-eval.mjs b/scripts/run-baml-eval.mjs index f64f4ab..d1e7e52 100644 --- a/scripts/run-baml-eval.mjs +++ b/scripts/run-baml-eval.mjs @@ -29,6 +29,10 @@ function normalizeHeading(line) { .trim(); } +function stripLeadingSectionNumber(heading) { + return heading.replace(/^\d+(?:\.\d+)*\.?\s+/, "").trim(); +} + function summarizeNumeric(values) { if (values.length === 0) { return { @@ -63,6 +67,28 @@ function hasUppercaseObligationKeyword(document) { return /\b(MUST|SHOULD|MAY)\b/.test(document); } +function 
extractNonEmptyNormalizedLines(text) { + return text + .split(/\r?\n/) + .map((line) => normalizeLine(line)) + .filter((line) => line.length > 0); +} + +function linesAppearInOrder(needleLines, haystackLines) { + let needleIndex = 0; + + for (const line of haystackLines) { + if (needleIndex >= needleLines.length) { + break; + } + if (line === needleLines[needleIndex]) { + needleIndex += 1; + } + } + + return needleIndex === needleLines.length; +} + function runCommand(command, args, cwd) { return new Promise((resolve, reject) => { const child = spawn(command, args, { @@ -170,8 +196,10 @@ async function buildPacket(evalEntry, evalsDir, packetType) { const packet = { packet_name: evalEntry.eval_name, task_prompt: evalEntry.prompt, - raw_notes: rawNotesPath ? await loadText(rawNotesPath) : "", - expected_criteria: expectedCriteriaPath ? await loadText(expectedCriteriaPath) : "", + raw_notes: rawNotesPath ? await loadText(rawNotesPath) : evalEntry.prompt, + expected_criteria: expectedCriteriaPath + ? await loadText(expectedCriteriaPath) + : (evalEntry.expected_output ?? ""), }; if (packetType === "SpecEvalPacket") { @@ -269,17 +297,46 @@ function extractFoundationDisallowedHeadings(templateText) { return disallowed; } +function extractFoundationSectionBodies(candidateDocument) { + const lines = candidateDocument.split(/\r?\n/); + const sections = []; + + for (const [index, rawLine] of lines.entries()) { + const match = rawLine.match(/^\s*##\s+(.+?)\s*$/); + if (match) { + sections.push({ + title: normalizeLine(match[1]), + index, + }); + } + } + + const bodies = new Map(); + for (let index = 0; index < sections.length; index += 1) { + const current = sections[index]; + const next = sections[index + 1]; + const start = current.index + 1; + const end = next ? 
next.index : lines.length; + bodies.set(current.title, lines.slice(start, end).join("\n").trim()); + } + + return { + lines, + sections, + bodies, + }; +} + function validateFoundationDocument(candidateDocument, templateText) { const requiredSections = extractFoundationTemplateSections(templateText); const disallowedHeadings = extractFoundationDisallowedHeadings(templateText); - const lines = candidateDocument.split(/\r?\n/); + const { lines, sections, bodies } = extractFoundationSectionBodies(candidateDocument); + const titleLine = lines.find((line) => normalizeLine(line).length > 0) ?? ""; + const hasMarkdownTitle = /^\s*#\s+.+\s+Foundation\s*$/.test(titleLine); const lineMap = new Map(); - for (const [index, rawLine] of lines.entries()) { - const line = normalizeHeading(rawLine); - if (!line) { - continue; - } + for (const { title, index } of sections) { + const line = normalizeLine(title); if (!lineMap.has(line)) { lineMap.set(line, []); } @@ -315,10 +372,7 @@ function validateFoundationDocument(candidateDocument, templateText) { const emptySections = []; for (let index = 0; index < positions.length; index += 1) { const current = positions[index]; - const next = positions[index + 1]; - const start = current.index + 1; - const end = next ? next.index : lines.length; - const sectionBody = lines.slice(start, end).join("\n").trim(); + const sectionBody = bodies.get(current.section) ?? ""; if (!sectionBody) { emptySections.push(current.section); } @@ -327,8 +381,40 @@ function validateFoundationDocument(candidateDocument, templateText) { const presentDisallowedSections = [...disallowedHeadings].filter((section) => (lineMap.get(section) ?? []).length > 0, ); + const strategicBetsBody = bodies.get("Strategic Bets") ?? ""; + const openQuestionsBody = bodies.get("Open Questions") ?? 
""; + const openQuestionBullets = openQuestionsBody + .split(/\r?\n/) + .map((line) => normalizeLine(line)) + .filter((line) => line.startsWith("- ")); + const openQuestionsLookOpen = + openQuestionBullets.length > 0 && + openQuestionBullets.every((line) => line.endsWith("?")); + const strategicBetLines = strategicBetsBody + .split(/\r?\n/) + .map((line) => normalizeLine(line)) + .filter((line) => line.startsWith("- ")); + const strategicBetsBodyHasDirectionalPreamble = /\b(observed directional bets|public signals)\b/i.test( + strategicBetsBody, + ); + const hedgedStrategicBets = + strategicBetLines.length === 0 || + strategicBetLines.every((line) => + /^\-\s+Bet:/i.test(line) + ? strategicBetsBodyHasDirectionalPreamble + : /\b(appears?|suggests?|signals?|signaling|indicates?|indicating|directional bet|directional bets|observed bet|observed bets|a bet that|bet that|bet on)\b/i.test( + line, + ), + ); return [ + createCheck( + "title_heading_present", + hasMarkdownTitle, + hasMarkdownTitle + ? "Detected the required markdown title heading." + : "Missing required title heading like `# Foundation`.", + ), createCheck( "required_sections_present_once", missingSections.length === 0 && duplicateSections.length === 0, @@ -359,6 +445,20 @@ function validateFoundationDocument(candidateDocument, templateText) { ? "No disallowed downstream-planning sections were detected." : `Disallowed sections present: ${presentDisallowedSections.join(", ")}.`, ), + createCheck( + "strategic_bets_use_directional_language", + hedgedStrategicBets, + hedgedStrategicBets + ? "Strategic Bets are framed as directional bets or observed signals." + : "One or more Strategic Bets bullets read as settled conclusions instead of directional bets or observed signals.", + ), + createCheck( + "open_questions_remain_questions", + openQuestionsLookOpen, + openQuestionsLookOpen + ? "Open Questions are written as explicit unanswered questions." 
+ : "Open Questions should be bullet questions that remain open and usually end with `?`.", + ), createCheck( "no_first_or_second_person", !hasPronounDrift(candidateDocument), @@ -392,14 +492,20 @@ function validateSpecDocument(candidateDocument, templateText) { const requiredSections = extractSpecMajorSections(templateText); const lines = candidateDocument.split(/\r?\n/); const missingSections = requiredSections.filter( - (section) => !lineExists(lines, (line) => line.toLowerCase() === section.toLowerCase()), + (section) => + !lineExists( + lines, + (line) => stripLeadingSectionNumber(line).toLowerCase() === section.toLowerCase(), + ), ); - const hasPurpose = lineExists(lines, (line) => line === "Purpose"); - const hasProblemStatement = lineExists( - lines, - (line) => line.toLowerCase() === "problem statement", - ); + const hasStatusLine = lines.some((line) => /^\s*Status:\s+\S+/.test(line)); + const hasPurposeLine = lines.some((line) => /^\s*Purpose:\s+\S+/.test(line)); + const hasProblemStatement = lineExists(lines, (line) => line === "1. Problem Statement"); + const hasGoalSubsections = + lineExists(lines, (line) => line === "2.1 Goals") && + lineExists(lines, (line) => line === "2.2 Non-Goals"); + const hasImportantBoundaryBlock = /(^|\n)Important boundary:\s*\n/m.test(candidateDocument); const hasNumberedComponents = /^\d+\.\s+`[^`]+`/m.test(candidateDocument); const hasFieldFormatting = /- `[^`]+` \([^)]+\)/.test(candidateDocument); @@ -412,18 +518,39 @@ function validateSpecDocument(candidateDocument, templateText) { : `Missing major sections: ${missingSections.join(", ")}.`, ), createCheck( - "purpose_heading_present", - hasPurpose, - hasPurpose - ? "Purpose heading is present." - : "Purpose heading is missing.", + "status_line_present", + hasStatusLine, + hasStatusLine + ? "Status line is present." + : "Status line is missing or malformed.", + ), + createCheck( + "purpose_line_present", + hasPurposeLine, + hasPurposeLine + ? "Purpose line is present." 
+ : "Purpose line is missing or malformed.", ), createCheck( "problem_statement_present", hasProblemStatement, hasProblemStatement - ? "Problem Statement heading is present." - : "Problem Statement heading is missing.", + ? "Problem Statement major section is present." + : "Problem Statement major section is missing.", + ), + createCheck( + "goals_subsections_present", + hasGoalSubsections, + hasGoalSubsections + ? "Detected `2.1 Goals` and `2.2 Non-Goals` subsections." + : "Missing one or both of the required goals subsections: `2.1 Goals`, `2.2 Non-Goals`.", + ), + createCheck( + "important_boundary_block_present", + hasImportantBoundaryBlock, + hasImportantBoundaryBlock + ? "Detected an `Important boundary:` block inside the document." + : "Did not detect the required `Important boundary:` block.", ), createCheck( "component_list_uses_numbering", @@ -456,6 +583,55 @@ function validateSpecDocument(candidateDocument, templateText) { ]; } +function validateSpecUpdateDocument(candidateDocument, existingSpecText) { + const existingLines = extractNonEmptyNormalizedLines(existingSpecText); + const candidateLines = extractNonEmptyNormalizedLines(candidateDocument); + const preservesExistingContent = linesAppearInOrder(existingLines, candidateLines); + const hasOffsetStoreComponent = /^\s*4\.\s+`Offset Store`/m.test(candidateDocument); + const hasCrossRegionNonGoal = + /(^|\n)-\s+(Cross-region log replication\.|Replicating logs across regions\.)/mi.test( + candidateDocument, + ); + + return [ + createCheck( + "existing_content_preserved_in_order", + preservesExistingContent, + preservesExistingContent + ? "All non-empty lines from the existing spec appear in order in the candidate." + : "One or more non-empty lines from the existing spec were removed or reordered.", + ), + createCheck( + "offset_store_component_present", + hasOffsetStoreComponent, + hasOffsetStoreComponent + ? "Detected numbered component `4. `Offset Store``." + : "Did not detect numbered component `4. 
`Offset Store``.", + ), + createCheck( + "cross_region_nongoal_present", + hasCrossRegionNonGoal, + hasCrossRegionNonGoal + ? "Detected the requested cross-region replication non-goal." + : "Did not detect the requested cross-region replication non-goal.", + ), + createCheck( + "no_first_or_second_person", + !hasPronounDrift(candidateDocument), + !hasPronounDrift(candidateDocument) + ? "No obvious first-person or second-person pronouns detected." + : "Detected first-person or second-person pronouns that violate the language guide.", + ), + createCheck( + "obligation_keywords_lowercase", + !hasUppercaseObligationKeyword(candidateDocument), + !hasUppercaseObligationKeyword(candidateDocument) + ? "No uppercase obligation keywords detected." + : "Detected uppercase MUST/SHOULD/MAY, which violates the language guide.", + ), + ]; +} + async function runDeterministicChecks(skillRoot, validationContract, candidateDocument) { if (!validationContract || validationContract.type !== "reference_document_checks") { return { @@ -465,11 +641,19 @@ async function runDeterministicChecks(skillRoot, validationContract, candidateDo }; } - const templatePath = path.join(skillRoot, validationContract.template_file); - const languagePath = path.join(skillRoot, validationContract.language_file); - const [templateText, languageText] = await Promise.all([ - loadText(templatePath), - loadText(languagePath), + const templatePath = validationContract.template_file + ? path.join(skillRoot, validationContract.template_file) + : null; + const languagePath = validationContract.language_file + ? path.join(skillRoot, validationContract.language_file) + : null; + const existingSpecPath = validationContract.existing_spec_file + ? path.join(skillRoot, validationContract.existing_spec_file) + : null; + const [templateText, languageText, existingSpecText] = await Promise.all([ + validationContract.template_file ? loadText(templatePath) : Promise.resolve(""), + languagePath ? 
loadText(languagePath) : Promise.resolve(""), + existingSpecPath ? loadText(existingSpecPath) : Promise.resolve(""), ]); let checks; @@ -480,6 +664,9 @@ async function runDeterministicChecks(skillRoot, validationContract, candidateDo case "spec-v1": checks = validateSpecDocument(candidateDocument, templateText, languageText); break; + case "spec-update-v1": + checks = validateSpecUpdateDocument(candidateDocument, existingSpecText, languageText); + break; default: fail(`Unknown validation contract '${validationContract.validator}'.`); } @@ -521,6 +708,13 @@ async function runSingleTrial({ const brief = await b[compileFnName](packet); timing.compile_ms = Date.now() - compileStartedAt; + if (runner.packet_type === "SpecEvalPacket") { + brief.update_request = packet.task_prompt; + if (packet.existing_spec) { + brief.existing_spec = packet.existing_spec; + } + } + const renderStartedAt = Date.now(); const candidateDocument = await b[renderFnName](brief); timing.render_ms = Date.now() - renderStartedAt; @@ -621,6 +815,22 @@ function buildBenchmark(skillName, evalName, trials) { }; } +function extractEvalMetadata(evalEntry) { + const fields = [ + "scenario_type", + "input_shape", + "ambiguity_level", + "domain_profile", + "primary_risks", + ]; + + return Object.fromEntries( + fields + .filter((field) => evalEntry[field] !== undefined) + .map((field) => [field, evalEntry[field]]), + ); +} + async function main() { const { skillName, selector, trials } = parseArgs(process.argv.slice(2)); @@ -636,7 +846,7 @@ async function main() { const manifest = await loadJson(manifestPath); const evalEntry = getEvalBySelector(manifest.evals, selector); const runner = manifest.runner_contract; - const validationContract = manifest.validation_contract ?? null; + const validationContract = evalEntry.validation_contract ?? manifest.validation_contract ?? 
null; if (!runner || runner.type !== "baml_pipeline") { fail(`Skill '${skillName}' does not declare a supported runner_contract.`); @@ -680,7 +890,10 @@ async function main() { } } - const benchmark = buildBenchmark(skillName, evalEntry.eval_name, trialResults); + const benchmark = { + ...buildBenchmark(skillName, evalEntry.eval_name, trialResults), + eval_metadata: extractEvalMetadata(evalEntry), + }; await writeRunArtifacts(runDir, { "benchmark.json": benchmark, }); diff --git a/skills/foundation-creator/baml_src/foundation_compiler/compiler_functions.baml b/skills/foundation-creator/baml_src/foundation_compiler/compiler_functions.baml index b4a90d1..f793b6f 100644 --- a/skills/foundation-creator/baml_src/foundation_compiler/compiler_functions.baml +++ b/skills/foundation-creator/baml_src/foundation_compiler/compiler_functions.baml @@ -109,10 +109,21 @@ function RenderFoundationCreatorPrompt(brief: FoundationBrief) -> string { - Avoid implementation detail. - Emphasize thesis, boundaries, actor model, surfaces, and strategic bets. - State that the writer is a source-bound synthesizer, not a strategy consultant. - - Require exactly these sections unless the user asks otherwise: `What This Is`, `Core Thesis`, `Boundaries`, `Actor Model`, `Durable Surfaces`, `Strategic Bets`, `Open Questions`. + - Require markdown output with exactly this heading structure unless the user asks otherwise: + - `# Foundation` + - `## What This Is` + - `## Core Thesis` + - `## Boundaries` + - `## Actor Model` + - `## Durable Surfaces` + - `## Strategic Bets` + - `## Open Questions` - Forbid extra sections like `Success Signals`, `Metrics`, `Decision Agenda`, `Next Steps`, `Operating Guidance`, or `Roadmap`. - - Require `Strategic Bets` to be phrased as observed directional bets rather than prescriptions. + - Require `Strategic Bets` to be phrased as observed directional bets or public signals rather than prescriptions. 
+ - Require each `Strategic Bets` bullet to start with explicit hedge language such as `The notes suggest a bet on...`, `There are visible signals that...`, or `The company appears to be betting on...`. + - Forbid bare `Bet:` labels and categorical claims like `X is the wedge` or `Y is a defensible primitive`. - Require recently emerging or transitional surfaces to be qualified explicitly rather than flattened as fully settled. + - Require `Open Questions` bullets to remain actual unresolved questions rather than disguised conclusions. - Forbid market-leadership or superiority claims unless they are explicit in the brief. - Make the prompt directly usable by an agent. diff --git a/skills/foundation-creator/baml_src/foundation_compiler/eval_runner.baml b/skills/foundation-creator/baml_src/foundation_compiler/eval_runner.baml index 7760d3d..fb8fd23 100644 --- a/skills/foundation-creator/baml_src/foundation_compiler/eval_runner.baml +++ b/skills/foundation-creator/baml_src/foundation_compiler/eval_runner.baml @@ -19,6 +19,11 @@ function CompileFoundationBriefFromPacket(packet: FoundationEvalPacket) -> Found - Treat expected criteria as evaluation guidance, not as license to invent. - Avoid implementation detail. - Do not infer monetization, metrics, org structure, GTM strategy, or operating plans unless the packet explicitly supports them. + - If `strategic_bets` are included, phrase them as observed directional signals or evidence. + - Each `strategic_bets` item should start with explicit hedge language such as `The notes suggest a bet on...`, `There are visible signals that...`, or `The company appears to be betting on...`. + - Do not use bare `Bet:` labels or categorical claims like `X is the wedge` or `Y is a defensible primitive`. + - Keep `strategic_bets` short when evidence is weak. + - Keep unresolved questions as actual unresolved questions rather than quietly resolving them in the summary. - Prefer omission over speculation. 
{{ ctx.output_format }} @@ -38,10 +43,22 @@ function RenderFoundationDocumentDraft(brief: FoundationBrief) -> string { - Avoid implementation detail. - Stay source-bound: do not invent monetization, KPIs, org structure, partnerships, operating guidance, or next-step plans. - Prefer omission over plausible-sounding speculation. - - Use exactly these sections and no others: `What This Is`, `Core Thesis`, `Boundaries`, `Actor Model`, `Durable Surfaces`, `Strategic Bets`, `Open Questions`. + - Return markdown. + - Start with `# Foundation`. + - Use exactly these `##` sections and no others: + - `## What This Is` + - `## Core Thesis` + - `## Boundaries` + - `## Actor Model` + - `## Durable Surfaces` + - `## Strategic Bets` + - `## Open Questions` - If `Strategic Bets` is weakly supported, keep it short rather than expanding it. - - Phrase `Strategic Bets` as observed directional bets, not recommendations or settled future state. + - Phrase `Strategic Bets` as observed directional bets or public signals, not recommendations or settled future state. + - Each `Strategic Bets` bullet must start with explicit hedge language such as `The notes suggest a bet on...`, `There are visible signals that...`, or `The company appears to be betting on...`. + - Do not use bare `Bet:` labels or categorical statements like `X is the wedge` or `Y is a defensible primitive`. - When a surface is visible but still evolving in the packet, qualify it explicitly as emerging, evolving, or unsettled. + - Write `Open Questions` as actual unresolved questions, typically ending with `?`. - Do not use market-leadership or competitive-superiority language unless the packet explicitly supports it. Brief: @@ -70,8 +87,10 @@ function EvaluateFoundationDocument( - Penalize invented certainty, invented capabilities, or implementation leakage. - Penalize unsupported business-model, monetization, KPI, org, partnership, or operating-plan language. 
- Penalize consulting-style sections such as `Success Signals`, `Metrics`, `Decision Agenda`, `Next Steps`, `Operating Guidance`, or similar drift. + - Penalize missing markdown heading structure if the document drifts from the required template shape. - Penalize `Strategic Bets` phrased as recommendations or settled conclusions when the packet only supports directional evidence. - Penalize flattening transitional surfaces as fully settled if the packet presents them as evolving. + - Penalize `Open Questions` that silently resolve ambiguity instead of keeping it open. - Penalize market-leadership or competitive-superiority claims not explicitly supported by the packet. - Use `Pass`, `Partial`, or `Fail` for each criterion. diff --git a/skills/foundation-creator/evals/evals.json b/skills/foundation-creator/evals/evals.json index 1b5a1b8..d8c6e80 100644 --- a/skills/foundation-creator/evals/evals.json +++ b/skills/foundation-creator/evals/evals.json @@ -17,6 +17,15 @@ { "id": 0, "eval_name": "create-foundation-from-vercel-source-packet", + "scenario_type": "source_packet_transition", + "input_shape": "source_packet", + "ambiguity_level": "high", + "domain_profile": "company_foundation", + "primary_risks": [ + "invented_certainty", + "scope_bleed", + "weak_boundaries" + ], "prompt": "Use the source packet in `fixtures/vercel/raw_notes.md` to draft a top-level foundation document for Vercel. Preserve ambiguity where the positioning is in transition. 
Do not produce a `SPEC.md`, implementation plan, or architecture diagram.", "expected_output": "A top-level foundation document that frames Vercel as a developer cloud/platform company, captures the current tension between `Frontend Cloud` and `AI Cloud`, identifies core surfaces such as deployment workflow, collaboration, security, AI infrastructure, and platform-building, clarifies that Vercel is not just static hosting or general-purpose IaaS, and preserves open questions or strategic bets instead of inventing certainty.", "expected_file": "fixtures/vercel/expected_criteria.md", @@ -31,6 +40,15 @@ { "id": 1, "eval_name": "create-foundation-from-cloudflare-source-packet", + "scenario_type": "source_packet_transition", + "input_shape": "source_packet", + "ambiguity_level": "high", + "domain_profile": "company_foundation", + "primary_risks": [ + "invented_certainty", + "scope_bleed", + "weak_boundaries" + ], "prompt": "Use the source packet in `fixtures/cloudflare/raw_notes.md` to draft a top-level foundation document for Cloudflare. Preserve the tension between the connectivity cloud, developer platform, and AI/agents platform framings. 
Do not produce a `SPEC.md`, implementation plan, or architecture diagram.", "expected_output": "A top-level foundation document that frames Cloudflare as a unified platform spanning security/connectivity, developer infrastructure, and AI surfaces; preserves the tension between `connectivity cloud` and developer/AI platform identities; identifies durable surfaces like network/security control plane, developer runtime, AI infrastructure, and platform-building primitives; clarifies boundaries against generic hyperscaler or single-product framings; and preserves open questions rather than inventing certainty.", "expected_file": "fixtures/cloudflare/expected_criteria.md", @@ -41,6 +59,52 @@ "files": [ "fixtures/cloudflare/raw_notes.md" ] + }, + { + "id": 2, + "eval_name": "create-foundation-from-lightfast-founder-notes", + "scenario_type": "founder_notes_ambiguity", + "input_shape": "source_packet", + "ambiguity_level": "high", + "domain_profile": "company_foundation", + "primary_risks": [ + "invented_certainty", + "scope_bleed", + "weak_boundaries" + ], + "prompt": "Use the source packet in `fixtures/lightfast_founder_notes/raw_notes.md` to draft a top-level foundation document for Lightfast. Preserve unresolved framing tension. 
Do not produce a `SPEC.md`, implementation plan, org design, or business strategy memo.", + "expected_output": "A top-level foundation document that frames Lightfast as a durable artifact/constraint layer for agent work, preserves the tension between installable skills, compiler/eval/document substrate, and a broader operating layer for durable agent work, identifies surfaces like skill packages, document artifacts, typed contracts, evals, and possible discovery/distribution, clarifies boundaries against chat wrappers, no-code automation, agencies, and fully autonomous company-in-a-box framings, and keeps open questions unresolved instead of inventing certainty.", + "expected_file": "fixtures/lightfast_founder_notes/expected_criteria.md", + "packet_files": { + "raw_notes": "fixtures/lightfast_founder_notes/raw_notes.md", + "expected_criteria": "fixtures/lightfast_founder_notes/expected_criteria.md" + }, + "files": [ + "fixtures/lightfast_founder_notes/raw_notes.md" + ] + }, + { + "id": 3, + "eval_name": "create-foundation-from-harbor-care-source-packet", + "scenario_type": "cross_domain_generalization", + "input_shape": "source_packet", + "ambiguity_level": "high", + "domain_profile": "non_developer_domain", + "primary_risks": [ + "source_overfitting", + "invented_certainty", + "weak_boundaries" + ], + "prompt": "Use the source packet in `fixtures/harbor_care/raw_notes.md` to draft a top-level foundation document for Harbor Care. Treat it as a trust-heavy care navigation and coordination primitive, not a developer platform. 
Preserve unresolved framing tension and do not invent clinical authority, business strategy, or implementation detail.", + "expected_output": "A top-level foundation document that frames Harbor Care as a care navigation or coordination primitive, preserves the tension between shared longitudinal care picture, coordination operating layer, and benefits/logistics interpreter framings, identifies durable surfaces like intake, shared timeline, coordination, handoffs, and benefits interpretation, clarifies boundaries against telehealth, insurance, EHR, and clinician marketplace framings, and keeps open questions about automation, sponsor shape, and clinical boundary unresolved.", + "expected_file": "fixtures/harbor_care/expected_criteria.md", + "packet_files": { + "raw_notes": "fixtures/harbor_care/raw_notes.md", + "expected_criteria": "fixtures/harbor_care/expected_criteria.md" + }, + "files": [ + "fixtures/harbor_care/raw_notes.md" + ] } ] } diff --git a/skills/foundation-creator/evals/fixtures/harbor_care/expected_criteria.md b/skills/foundation-creator/evals/fixtures/harbor_care/expected_criteria.md new file mode 100644 index 0000000..e3ed403 --- /dev/null +++ b/skills/foundation-creator/evals/fixtures/harbor_care/expected_criteria.md @@ -0,0 +1,24 @@ +# Expected Criteria + +- The output should identify Harbor Care as a care navigation or coordination + primitive, not as telehealth, insurance, an EHR, or a clinician marketplace. +- The output should preserve at least some tension between the plausible + framings: + shared longitudinal care picture, + coordination operating layer, + or benefits/logistics interpreter. +- The output should identify multiple durable surfaces such as intake/context, + shared timeline or longitudinal record, coordination, handoff support, and + benefits/eligibility interpretation. 
+- The output should include an actor model that recognizes caregivers or + families, human advocates/coordinators, and at least one institutional actor + such as providers, payers, or employers. +- The output should set clear boundaries against clinical diagnosis, telehealth, + insurance, or generic marketplaces. +- The output should preserve open questions about business shape, automation vs + human-in-the-loop, and the clinical boundary instead of collapsing them into + certainty. +- The output should not invent medical claims, treatment outcomes, revenue + model, internal org structure, or unsupported platform surfaces not present in + the packet. + diff --git a/skills/foundation-creator/evals/fixtures/harbor_care/raw_notes.md b/skills/foundation-creator/evals/fixtures/harbor_care/raw_notes.md new file mode 100644 index 0000000..b08c67e --- /dev/null +++ b/skills/foundation-creator/evals/fixtures/harbor_care/raw_notes.md @@ -0,0 +1,69 @@ +# Harbor Care Source Packet + +Assembled on April 20, 2026 from synthetic product-positioning notes. + +This packet exists to test whether `foundation-creator` generalizes beyond +developer infrastructure and still writes a strong foundation document when the +domain is trust-heavy, operational, and human-centered. + +## Positioning notes + +- Harbor Care helps families and care teams navigate fragmented eldercare and + chronic-care coordination. +- The problem is not diagnosis. The problem is that care episodes span many + people, systems, approvals, and handoffs, and nobody has a stable operating + picture. +- Families keep becoming accidental project managers for health and home-care + logistics. +- Human care advocates still matter. Trust breaks if the product acts like a + fully autonomous care robot. +- Not telehealth. +- Not an insurer. +- Not an electronic health record. +- Not a marketplace for doctors or home aides. 
+- It might be best framed as a care navigation layer or coordination fabric, + but those are still slightly different framings. +- One framing is "shared longitudinal picture of a care journey." +- Another framing is "operating layer for care coordination across people and + institutions." +- Another framing is "benefits + logistics interpreter for families." +- The right center of gravity is not fully settled. + +## Durable surfaces that seem to recur + +- Intake and context gathering across patient, family, and care situation. +- Shared timeline of events, decisions, documents, and upcoming tasks. +- Coordination surface for family members, advocates, providers, and service + organizations. +- Benefits or eligibility interpretation support. +- Handoff support across hospital, clinic, home care, rehab, pharmacy, and + payer contexts. +- Longitudinal record of what happened, what is pending, and who owns which + next action. + +## Actor notes + +- Patients may be involved directly, but family caregivers are often the active + operators. +- Human care advocates or coordinators are central, not incidental. +- Provider offices and discharge planners interact with the system unevenly. +- Employers or payers might sponsor access in some versions of the business, + but that is not the core product truth. +- Regulated and trust-heavy settings mean the product cannot casually overclaim + clinical authority. + +## Strategic tension and open questions + +- The company probably wins on trust, continuity, and coordination quality more + than pure automation. +- There may be a bet on asynchronous coordination rather than forcing every + interaction into a live call center model. +- There may be a bet on keeping humans in the loop while using software and AI + to structure the work around them. +- Open question: + is Harbor Care primarily for families, for employer/payer-sponsored programs, + or for provider-linked coordination models? 
+- Open question: + how much intelligence should be automated vs escalated to human advocates? +- Open question: + where is the clean boundary between care coordination and clinical guidance? diff --git a/skills/foundation-creator/evals/fixtures/lightfast_founder_notes/expected_criteria.md b/skills/foundation-creator/evals/fixtures/lightfast_founder_notes/expected_criteria.md new file mode 100644 index 0000000..28cc166 --- /dev/null +++ b/skills/foundation-creator/evals/fixtures/lightfast_founder_notes/expected_criteria.md @@ -0,0 +1,28 @@ +# Expected Criteria + +- The output should identify Lightfast as a durable artifact or constraint layer + for agent work, not merely a chat wrapper, prompt library, or generic AI + assistant. +- The output should preserve the tension between at least two plausible + framings: + installable skills/distribution, + compiler/eval/document substrate, + or a broader operating layer for durable agent work. +- The output should identify multiple durable surfaces such as skill packages, + foundation/spec artifacts, typed contracts or compiler surfaces, evals, and + possibly discovery or distribution. +- The output should include an actor model that recognizes at least some mix of + founders/operators, builders or engineering teams, and other teams using + agents for repeatable work. +- The output should set clear boundaries: + not project management, + not generic no-code workflow automation, + not an agency, + and not a fully autonomous company-in-a-box. +- The output should preserve open questions around repo-native vs hosted, + authoring vs evaluation vs distribution, and coding-first vs broader + applicability instead of pretending those choices are settled. +- The output should not invent monetization, marketplace certainty, internal + org structure, financial claims, or execution plans that are not present in + the notes. 
+ diff --git a/skills/foundation-creator/evals/fixtures/lightfast_founder_notes/raw_notes.md b/skills/foundation-creator/evals/fixtures/lightfast_founder_notes/raw_notes.md new file mode 100644 index 0000000..ee51510 --- /dev/null +++ b/skills/foundation-creator/evals/fixtures/lightfast_founder_notes/raw_notes.md @@ -0,0 +1,76 @@ +# Lightfast Founder Notes Packet + +Assembled on April 20, 2026 from synthetic founder-style notes for Lightfast. + +This packet is intentionally messy, incomplete, and slightly contradictory. It +tests whether `foundation-creator` can preserve ambiguity and write a durable +foundation document without inventing resolution, monetization, or execution +plans. + +## Raw notes + +- Lightfast feels like it sits in the layer before code and after vague intent: + turn messy human direction into durable working artifacts. +- Skills are probably the first wedge, but the company does not obviously stop + at "prompt packs" or a "skills marketplace." +- Not another chat wrapper. +- Not an agency that manually delivers strategy decks or ops work. +- Not generic no-code workflow automation. +- Probably not a fully autonomous "company in a box" either. +- Strong belief: prompts vanish, transcripts vanish, and code is too late; + durable artifacts like foundation docs, specs, typed contracts, evals, and + reusable skill packages survive model churn better. +- The product should help rough notes become clearer artifacts: + messy notes -> foundation -> spec -> working automation. +- There is a real boundary here: the system should structure judgment, not + silently replace it. +- Human supervision matters. The product should not quietly decide company + strategy or product direction because a model sounded confident. +- Maybe the primitive is "turn ambiguity into stable constraints." +- Maybe the primitive is "operating system for durable agent work." +- That second framing might be too grand or premature. 
+- Feels repo-native and artifact-native, not just chat-native. +- There is something important about local-first work loops, versioned files, + and outputs that can be inspected and edited. +- Possible durable surfaces: + skills, compiler contracts, eval runners, document templates, reference + artifacts, maybe later a discovery/catalog layer. +- Maybe distribution ends up mattering as much as authoring: + not only creating skills, but making operational knowledge installable. +- Unsure whether the main buyer/user is: + solo builder, + founder/operator, + product/engineering team, + or internal ops team running agents. +- Coding is the easiest first wedge because eval loops are tighter, but the + company should probably not be framed as coding-only forever. +- It may eventually apply across coding, ops, research, support, GTM, and other + repeatable work with heavy ambiguity. +- Tension: + is this mainly better instructions for agents, + or a durable interface layer between humans, models, repositories, and + outputs? +- Tension: + local-first repo tooling + vs a hosted control plane, catalog, or distribution surface. +- Tension: + standardization that makes artifacts portable + vs customization that matches how each team actually works. +- Good boundary: + not project management, + not ticketing, + not a drag-and-drop workflow builder. +- Another boundary: + not a general AI assistant that answers everything. +- Another boundary: + not a substitute for direct ownership or decision-making. +- Open question: + when does a foundation document become a spec, and when does a spec become + code? +- Open question: + how much of the value is in authoring, how much in evaluation, and how much in + distribution? +- Open question: + should Lightfast remain repo-native first, or eventually become a hosted + system of record for agent work? 
+ diff --git a/skills/foundation-creator/references/language.md b/skills/foundation-creator/references/language.md index d51ba80..49a4370 100644 --- a/skills/foundation-creator/references/language.md +++ b/skills/foundation-creator/references/language.md @@ -16,7 +16,21 @@ How the foundation document should be worded. - Declarative statements over persuasive rhetoric. - Prefer short, dense paragraphs and compact bullets. -## 3. Restraint Rules +## 3. Structural Conventions + +- Use `#` for the document title: `# Foundation`. +- Use `##` for every major section in `references/template.md`. +- Keep the section order exactly as the template defines it. +- Use bullets for `Core Thesis`, `Boundaries`, `Actor Model`, `Durable Surfaces`, + `Strategic Bets`, and `Open Questions`. +- `Open Questions` bullets should be written as actual open questions and usually + end with `?`. +- `Strategic Bets` should be framed as observed directional signals or bets + rather than recommendations. Wording like `materials suggest a bet on...`, + `the company appears to be betting on...`, or `public signals indicate...` + is preferred. + +## 4. Restraint Rules - Prefer omission over invention. - If a point is plausible but not supported, omit it or convert it into an @@ -29,7 +43,7 @@ How the foundation document should be worded. - Do not assert market leadership, competitive superiority, or winner/loser framing unless directly supported by the source packet. -## 4. Allowed Section Behavior +## 5. Allowed Section Behavior - `What This Is` explains the primitive at a durable level. - `Core Thesis` contains only source-backed, thesis-level claims. @@ -39,13 +53,16 @@ How the foundation document should be worded. implementation components. If a surface is source-visible but still in transition, qualify it explicitly as emerging or evolving. - `Strategic Bets` should be minimal and clearly grounded in repeated signals. 
- Phrase them as observed directional bets (`public materials suggest a bet - on...`, `the company appears to be betting on...`) rather than settled - declarations or recommendations. + Each bullet should start with visible-evidence language such as `the notes + suggest a bet on...`, `public materials suggest...`, `there are visible + signals that...`, or `the company appears to be betting on...` rather than + settled declarations or recommendations. + Avoid naked labels like `Bet:` and avoid categorical claims like `X is the + wedge` or `Y is a defensible primitive`. - `Open Questions` should remain open rather than being quietly resolved in prose elsewhere. -## 5. Disallowed Drift +## 6. Disallowed Drift - No `Success Signals`, KPI, or metrics section. - No monetization strategy or revenue language. @@ -55,7 +72,7 @@ How the foundation document should be worded. - No market-leadership or competitive-positioning claims unless explicit in the source. -## 6. Tone +## 7. Tone - Dense, calm, and specific. - No hype language. diff --git a/skills/foundation-creator/references/template.md b/skills/foundation-creator/references/template.md index 7fc4adf..2a323ab 100644 --- a/skills/foundation-creator/references/template.md +++ b/skills/foundation-creator/references/template.md @@ -1,6 +1,8 @@ # {Primitive Name} Foundation Use only the sections below unless the user explicitly asks for more. +All headings in the final document must use markdown heading syntax exactly as +shown here. ## What This Is @@ -28,13 +30,13 @@ Use only the sections below unless the user explicitly asks for more. 
## Strategic Bets -- {Only if clearly supported by the material.} -- {Use fewer bullets rather than speculative ones.} +- {The notes suggest a bet on ...} +- {There are visible signals that ...} ## Open Questions -- {Unresolved tension or ambiguity the source does not settle.} -- {Another open question.} +- {Unresolved tension or ambiguity the source does not settle?} +- {Another open question?} ## Disallowed Sections diff --git a/skills/spec-creator/baml_src/spec_compiler/compiler_functions.baml b/skills/spec-creator/baml_src/spec_compiler/compiler_functions.baml index 83ba3a7..3339db7 100644 --- a/skills/spec-creator/baml_src/spec_compiler/compiler_functions.baml +++ b/skills/spec-creator/baml_src/spec_compiler/compiler_functions.baml @@ -73,6 +73,28 @@ function RenderSpecCreatorPrompt(brief: SpecBrief) -> string { - Ask for a top-level `SPEC.md`. - Keep the output language-agnostic and behavioral. - Preserve unresolved questions instead of inventing decisions. + - If the brief includes `existing_spec`, treat the task as update mode and preserve the existing document verbatim except for the requested edits. + - Require the canonical `spec-creator` section shape: + - `# Specification` + - `Status: Draft v1 (language-agnostic)` + - `Purpose: ` + - `## 1. Problem Statement` + - `## 2. Goals and Non-Goals` + - `### 2.1 Goals` + - `### 2.2 Non-Goals` + - `## 3. System Overview` + - `### 3.1 Main Components` + - `### 3.2 External Dependencies` + - `## 4. Core Domain Model` + - `### 4.1 Entities` + - Require `## 1. Problem Statement` to contain: + - one opening paragraph + - a line like `The service solves operational problems:` + - a bullet list of explicit operational problems + - Require `Important boundary:` as a labeled block inside the Problem Statement. + - Require numbered components and template-shaped field lines: `- `field_name` (type, constraints)`. 
+ - Require external dependencies to stay at the system/service level rather than naming low-level API methods or SDK calls. + - In update mode, do not add new top-level sections or template-completion material unless the request explicitly asks for them. - Emphasize problem statement, goals, non-goals, components, dependencies, and entities. - Make the prompt directly usable by an agent. diff --git a/skills/spec-creator/baml_src/spec_compiler/eval_runner.baml b/skills/spec-creator/baml_src/spec_compiler/eval_runner.baml index b94e59a..d093142 100644 --- a/skills/spec-creator/baml_src/spec_compiler/eval_runner.baml +++ b/skills/spec-creator/baml_src/spec_compiler/eval_runner.baml @@ -33,11 +33,43 @@ function RenderSpecDocumentDraft(brief: SpecBrief) -> string { Draft a `SPEC.md` from the brief below. Rules: + - Output valid markdown. - Keep the document behavioral and language-agnostic. - - Use problem statement, goals, non-goals, boundaries, components, - dependencies, and entities. + - If `existing_spec` is present, treat this as update mode and let the update-mode rules override the create-mode template-completion rules below. + - Follow the `spec-creator` template shape directly. Do not rename the core headings or collapse them into ad hoc labels. + - Use exactly this opening shape: + - `# Specification` + - `Status: Draft v1 (language-agnostic)` + - `Purpose: ` + - Include these sections in order: + - `## 1. Problem Statement` + - `## 2. Goals and Non-Goals` + - `### 2.1 Goals` + - `### 2.2 Non-Goals` + - `## 3. System Overview` + - `### 3.1 Main Components` + - `### 3.2 External Dependencies` + - `## 4. Core Domain Model` + - `### 4.1 Entities` + - In `## 1. Problem Statement`, write: + - one opening paragraph describing the service at a high level + - a line like `The service solves operational problems:` + - a bullet list of concrete operational problems + - Keep `Important boundary:` as a labeled block inside `## 1. Problem Statement`. 
Do not turn it into a standalone section. + - Use a numbered list for `### 3.1 Main Components` in the form `1. `Component Name`` followed by indented responsibility bullets. + - For entities, use `#### 4.1.x EntityName`, then `Fields:`, then field lines in the form `- `field_name` (type, constraints)` with indented semantic bullets and optional `Default: `value`` lines. + - Use logical field types (`string`, `integer`, `boolean`, `timestamp`, `list of strings`, `map`, `string or null`), not implementation types. + - List external dependencies as systems or services, not low-level API method names, SDK calls, classes, or internal implementation choices. - Preserve uncertainty where the source packet is in transition. - Avoid implementation detail. + - Optional extra sections are allowed only when the brief materially needs them; if used, place them after `## 4. Core Domain Model`. + - Do not add end markers, appendix-only filler, or operational/program-management sections unless the brief explicitly requires them. + - If `existing_spec` is present, treat this as update mode: + - start from `existing_spec` + - preserve all unchanged lines verbatim + - apply only the edits implied by `update_request` + - do not rewrite existing paragraphs, bullets, or component descriptions unless the request requires it + - do not add new top-level sections, domain model content, dependencies, or other template-completion material unless `update_request` explicitly asks for them Brief: {{ brief|format(type="yaml") }} @@ -63,6 +95,7 @@ function EvaluateSpecDocument( - Grade against the expected criteria explicitly. - Penalize invented capabilities, invented certainty, or implementation leakage. - Reward correct scope boundaries and careful handling of transition states. + - Penalize missing core template sections, malformed field formatting, or moving `Important boundary:` out of the Problem Statement block. - Use `Pass`, `Partial`, or `Fail` for each criterion. 
{{ ctx.output_format }} diff --git a/skills/spec-creator/baml_src/spec_compiler/spec_types.baml b/skills/spec-creator/baml_src/spec_compiler/spec_types.baml index 3d116b5..2e03a1e 100644 --- a/skills/spec-creator/baml_src/spec_compiler/spec_types.baml +++ b/skills/spec-creator/baml_src/spec_compiler/spec_types.baml @@ -27,6 +27,8 @@ class SpecBrief { external_dependencies string[] entities EntityBrief[] unresolved_questions string[] + update_request string? + existing_spec string? } class SpecCritique { diff --git a/skills/spec-creator/evals/evals.json b/skills/spec-creator/evals/evals.json index cec004a..93083be 100644 --- a/skills/spec-creator/evals/evals.json +++ b/skills/spec-creator/evals/evals.json @@ -17,6 +17,14 @@ { "id": 0, "eval_name": "create-from-clear-intent", + "scenario_type": "clear_intent_prompt", + "input_shape": "direct_prompt", + "ambiguity_level": "low", + "domain_profile": "developer_infrastructure", + "primary_risks": [ + "template_drift", + "implementation_leakage" + ], "prompt": "Can you write a SPEC.md for a service we're building called Glacier Tier Manager? It watches S3 buckets and moves old objects to Glacier Deep Archive after policy-defined age thresholds. Polling, not event-driven — we want deterministic cadence. Main components: a Bucket Poller that lists objects on a schedule, a Policy Evaluator that decides which objects are eligible, and a Tiering Executor that issues the storage-class change. External deps: S3 API and per-bucket lifecycle policy files. Not a general-purpose cost optimizer — it only does tiering.", "expected_output": "A SPEC.md at repo root covering Purpose, Problem Statement (with 3 bullets + 'Important boundary' block), Goals + Non-Goals (non-goal should reflect 'not a cost optimizer'), System Overview with the three named components, External Dependencies, and a Core Domain Model. 
Voice is third-person, fields use the `name` (type) format, obligation keywords are lowercase.", "files": [] @@ -24,6 +32,14 @@ { "id": 1, "eval_name": "create-from-unstructured-notes", + "scenario_type": "unstructured_notes_prompt", + "input_shape": "notes_prompt", + "ambiguity_level": "medium", + "domain_profile": "developer_infrastructure", + "primary_risks": [ + "template_drift", + "weak_boundaries" + ], "prompt": "I have rough notes on a service I want formalized into a SPEC.md. Here's what I have:\n\n- name: Hookshot. it's a webhook retry daemon.\n- retries failed outbound webhooks with exponential backoff\n- keeps state per endpoint (success rate, last attempt)\n- has a per-URL circuit breaker so one broken target doesn't starve others\n- retry schedules configurable per endpoint\n- NOT a general-purpose message queue. teams shouldn't push arbitrary jobs through it\n- reads webhook delivery records from a postgres table called webhook_deliveries\n- marks them delivered or dead-lettered\n\nTurn this into a proper spec.", "expected_output": "A SPEC.md that reorganizes the notes into the template structure: Purpose (one sentence about webhook retry), Problem Statement with bullets, 'Important boundary' + Non-Goals covering the 'not a queue' constraint, Main Components (retry scheduler, circuit breaker, state tracker or equivalent), External Dependencies (postgres), Domain Model with an entity for the delivery record. Notes-style phrasings are rewritten in spec voice.", "files": [] @@ -31,13 +47,40 @@ { "id": 2, "eval_name": "update-add-nongoal-and-component", + "scenario_type": "update_existing_doc", + "input_shape": "existing_doc_update", + "ambiguity_level": "low", + "domain_profile": "developer_infrastructure", + "primary_risks": [ + "update_regression", + "template_drift" + ], "prompt": "We have an existing SPEC.md at the repo root for Log Shipper. 
Please update it: add a non-goal stating that cross-region log replication is out of scope, and add a new main component called `Offset Store` that persists per-file read offsets. Keep everything else as-is.", "expected_output": "SPEC.md with the existing sections preserved verbatim except: (1) a new bullet added under Non-Goals covering cross-region replication, phrased as a gerund/noun phrase per the language guide; (2) a new numbered component `Offset Store` added under 3.1 with a verb-led description. Section numbering remains 1, 2, 3; component numbering extends to 4. No first-person pronouns introduced.", + "validation_contract": { + "type": "reference_document_checks", + "validator": "spec-update-v1", + "language_file": "references/language.md", + "existing_spec_file": "evals/fixtures/existing_spec.md" + }, + "packet_files": { + "existing_spec": "fixtures/existing_spec.md" + }, "files": ["fixtures/existing_spec.md"] }, { "id": 3, "eval_name": "create-from-vercel-mcp-source-packet", + "scenario_type": "source_packet_transition", + "input_shape": "source_packet", + "ambiguity_level": "high", + "domain_profile": "developer_infrastructure", + "primary_risks": [ + "invented_capabilities", + "invented_certainty", + "template_drift", + "implementation_leakage" + ], "prompt": "Use the source packet in `fixtures/vercel_mcp/raw_notes.md` to write a `SPEC.md` for a service called `Vercel MCP`. Treat it as a long-running remote MCP service, not as a company-level foundation document. 
Preserve timeline-specific ambiguity where the source packet is in transition, and do not invent write capabilities that the notes do not justify.", "expected_output": "A `SPEC.md` that frames `Vercel MCP` as an OAuth-protected remote MCP service for AI tools, includes a Problem Statement about secure structured access to Vercel docs, projects, deployments, and logs, sets clear goals and non-goals, identifies boundaries around approved clients and official endpoint usage, captures the current beta/read-only tension without pretending the service already has unconstrained write access, and stays behavioral rather than implementation-level.", "expected_file": "fixtures/vercel_mcp/expected_criteria.md", From 65d821ffac690d945251bf55168845b4f40b9b57 Mon Sep 17 00:00:00 2001 From: Jeevan Pillay <169354619+jeevanpillay@users.noreply.github.com> Date: Tue, 21 Apr 2026 00:28:31 +1000 Subject: [PATCH 06/30] Add foundation update-mode eval coverage --- README.md | 2 + evals/TAXONOMY.md | 4 +- scripts/run-baml-eval.mjs | 105 ++++++++++++++++-- skills/foundation-creator/SKILL.md | 49 ++++++++ .../compiler_functions.baml | 2 + .../foundation_compiler/eval_runner.baml | 16 +++ .../foundation_compiler/eval_types.baml | 1 + .../foundation_compiler/foundation_types.baml | 2 + skills/foundation-creator/evals/evals.json | 53 +++++++++ .../evals/fixtures/existing_foundation.md | 56 ++++++++++ .../expected_criteria.md | 13 +++ .../foundation-creator/references/language.md | 2 + 12 files changed, 293 insertions(+), 12 deletions(-) create mode 100644 skills/foundation-creator/evals/fixtures/existing_foundation.md create mode 100644 skills/foundation-creator/evals/fixtures/lightfast_foundation_update/expected_criteria.md diff --git a/README.md b/README.md index e38a7db..9381b4c 100644 --- a/README.md +++ b/README.md @@ -29,6 +29,7 @@ This repo now includes BAML-backed fixture evals for `foundation-creator` and bun install bun run eval:foundation -- 
create-foundation-from-vercel-source-packet bun run eval:foundation -- create-foundation-from-lightfast-founder-notes +bun run eval:foundation -- update-lightfast-foundation-boundary-surface-question bun run eval:spec -- create-from-vercel-mcp-source-packet bun run with-env -- bun run ./scripts/run-baml-eval.mjs foundation-creator create-foundation-from-cloudflare-source-packet --trials 3 ``` @@ -42,6 +43,7 @@ Current `foundation-creator` corpus includes: - `create-foundation-from-cloudflare-source-packet` - `create-foundation-from-lightfast-founder-notes` - `create-foundation-from-harbor-care-source-packet` +- `update-lightfast-foundation-boundary-surface-question` The runner now also writes: diff --git a/evals/TAXONOMY.md b/evals/TAXONOMY.md index 3f843e7..7cb33b3 100644 --- a/evals/TAXONOMY.md +++ b/evals/TAXONOMY.md @@ -74,8 +74,8 @@ values: The next missing slices are: -- `update_existing_doc` for `foundation-creator` once revise-in-place behavior - is defined +- a second `update_existing_doc` packet for `foundation-creator` that requires + replacing or tightening existing language rather than additive edits only - baseline comparison runs (`current skill` vs `previous skill` / `no skill`) when the local harness is ready to compare deltas directly - optional Braintrust-style scorer/export integration if local JSON artifacts are diff --git a/scripts/run-baml-eval.mjs b/scripts/run-baml-eval.mjs index d1e7e52..4f01658 100644 --- a/scripts/run-baml-eval.mjs +++ b/scripts/run-baml-eval.mjs @@ -192,6 +192,9 @@ async function buildPacket(evalEntry, evalsDir, packetType) { const existingSpecPath = packetFiles.existing_spec ? path.join(evalsDir, packetFiles.existing_spec) : null; + const existingFoundationPath = packetFiles.existing_foundation + ? 
path.join(evalsDir, packetFiles.existing_foundation) + : null; const packet = { packet_name: evalEntry.eval_name, @@ -206,6 +209,12 @@ async function buildPacket(evalEntry, evalsDir, packetType) { packet.existing_spec = existingSpecPath ? await loadText(existingSpecPath) : null; } + if (packetType === "FoundationEvalPacket") { + packet.existing_foundation = existingFoundationPath + ? await loadText(existingFoundationPath) + : null; + } + return packet; } @@ -327,6 +336,34 @@ function extractFoundationSectionBodies(candidateDocument) { }; } +function extractMarkdownBullets(sectionBody) { + const bullets = []; + let current = null; + + for (const rawLine of sectionBody.split(/\r?\n/)) { + if (/^\s*-\s+/.test(rawLine)) { + if (current) { + bullets.push(current); + } + current = normalizeLine(rawLine.replace(/^\s*-\s+/, "")); + continue; + } + + const line = normalizeLine(rawLine); + if (!current || line.length === 0) { + continue; + } + + current = `${current} ${line}`.trim(); + } + + if (current) { + bullets.push(current); + } + + return bullets; +} + function validateFoundationDocument(candidateDocument, templateText) { const requiredSections = extractFoundationTemplateSections(templateText); const disallowedHeadings = extractFoundationDisallowedHeadings(templateText); @@ -383,24 +420,18 @@ function validateFoundationDocument(candidateDocument, templateText) { ); const strategicBetsBody = bodies.get("Strategic Bets") ?? ""; const openQuestionsBody = bodies.get("Open Questions") ?? 
""; - const openQuestionBullets = openQuestionsBody - .split(/\r?\n/) - .map((line) => normalizeLine(line)) - .filter((line) => line.startsWith("- ")); + const openQuestionBullets = extractMarkdownBullets(openQuestionsBody); const openQuestionsLookOpen = openQuestionBullets.length > 0 && openQuestionBullets.every((line) => line.endsWith("?")); - const strategicBetLines = strategicBetsBody - .split(/\r?\n/) - .map((line) => normalizeLine(line)) - .filter((line) => line.startsWith("- ")); + const strategicBetLines = extractMarkdownBullets(strategicBetsBody); const strategicBetsBodyHasDirectionalPreamble = /\b(observed directional bets|public signals)\b/i.test( strategicBetsBody, ); const hedgedStrategicBets = strategicBetLines.length === 0 || strategicBetLines.every((line) => - /^\-\s+Bet:/i.test(line) + /^Bet:/i.test(line) ? strategicBetsBodyHasDirectionalPreamble : /\b(appears?|suggests?|signals?|signaling|indicates?|indicating|directional bet|directional bets|observed bet|observed bets|a bet that|bet that|bet on)\b/i.test( line, @@ -469,6 +500,41 @@ function validateFoundationDocument(candidateDocument, templateText) { ]; } +function validateFoundationUpdateDocument( + candidateDocument, + existingFoundationText, + templateText, + validationContract, +) { + const baseChecks = validateFoundationDocument(candidateDocument, templateText); + const existingLines = extractNonEmptyNormalizedLines(existingFoundationText); + const candidateLines = extractNonEmptyNormalizedLines(candidateDocument); + const normalizedCandidate = normalizeLine(candidateDocument); + const preservesExistingContent = linesAppearInOrder(existingLines, candidateLines); + const requiredPatternChecks = (validationContract.required_patterns ?? []).map((patternCheck) => { + const expression = new RegExp(patternCheck.pattern, patternCheck.flags ?? "i"); + const passed = expression.test(normalizedCandidate); + + return createCheck( + patternCheck.id, + passed, + passed ? 
patternCheck.details_pass : patternCheck.details_fail, + ); + }); + + return [ + ...baseChecks, + createCheck( + "existing_content_preserved_in_order", + preservesExistingContent, + preservesExistingContent + ? "All non-empty lines from the existing foundation appear in order in the candidate." + : "One or more non-empty lines from the existing foundation were removed or reordered.", + ), + ...requiredPatternChecks, + ]; +} + function extractSpecMajorSections(templateText) { const sections = []; const lines = templateText.split(/\r?\n/); @@ -650,10 +716,14 @@ async function runDeterministicChecks(skillRoot, validationContract, candidateDo const existingSpecPath = validationContract.existing_spec_file ? path.join(skillRoot, validationContract.existing_spec_file) : null; - const [templateText, languageText, existingSpecText] = await Promise.all([ + const existingFoundationPath = validationContract.existing_foundation_file + ? path.join(skillRoot, validationContract.existing_foundation_file) + : null; + const [templateText, languageText, existingSpecText, existingFoundationText] = await Promise.all([ validationContract.template_file ? loadText(templatePath) : Promise.resolve(""), languagePath ? loadText(languagePath) : Promise.resolve(""), existingSpecPath ? loadText(existingSpecPath) : Promise.resolve(""), + existingFoundationPath ? 
loadText(existingFoundationPath) : Promise.resolve(""), ]); let checks; @@ -661,6 +731,14 @@ async function runDeterministicChecks(skillRoot, validationContract, candidateDo case "foundation-v1": checks = validateFoundationDocument(candidateDocument, templateText, languageText); break; + case "foundation-update-v1": + checks = validateFoundationUpdateDocument( + candidateDocument, + existingFoundationText, + templateText, + validationContract, + ); + break; case "spec-v1": checks = validateSpecDocument(candidateDocument, templateText, languageText); break; @@ -715,6 +793,13 @@ async function runSingleTrial({ } } + if (runner.packet_type === "FoundationEvalPacket") { + brief.update_request = packet.task_prompt; + if (packet.existing_foundation) { + brief.existing_foundation = packet.existing_foundation; + } + } + const renderStartedAt = Date.now(); const candidateDocument = await b[renderFnName](brief); timing.render_ms = Date.now() - renderStartedAt; diff --git a/skills/foundation-creator/SKILL.md b/skills/foundation-creator/SKILL.md index 811bb59..88f7bad 100644 --- a/skills/foundation-creator/SKILL.md +++ b/skills/foundation-creator/SKILL.md @@ -39,9 +39,57 @@ Load on demand, not upfront. - Prefer explicit open questions over invented certainty. - Separate durable beliefs from speculative bets. - Avoid implementation detail unless the user explicitly wants it. +- When updating an existing foundation document, preserve section order and + untouched wording unless the request explicitly requires a rewrite. - Escalate to `spec-creator` only when a subsystem is concrete enough to deserve a `SPEC.md`. +## Decide: create or update + +Before writing: + +1. If the user provides an existing foundation document or asks to revise one, + use **update mode**. +2. Otherwise, use **create mode**. + +Do not treat a revise-in-place request as a greenfield rewrite. 
The distinction +matters because foundation documents often preserve useful ambiguity that +should not be flattened during an edit. + +## Create mode + +Gather only what is still missing from the notes: + +- what the primitive or company is +- what it is not +- durable thesis-level beliefs +- meaningful actors +- durable surfaces +- unresolved tensions or open questions + +Then draft the document using `references/template.md` and validate it against +`references/language.md`. + +## Update mode + +1. Read the existing foundation document fully. +2. Scope the requested change narrowly. +3. Edit in place: + - preserve the existing heading structure + - keep unchanged lines and bullets verbatim where possible + - add or revise only what the request requires + - do not add new sections, planning material, or stronger certainty unless + the request explicitly justifies it +4. Re-read and validate the whole document, not just the changed section. + +Typical update shapes: + +- add a sharper boundary +- add or remove a durable surface +- clarify a strategic bet without turning it into a recommendation +- add an open question that the earlier draft omitted +- tighten an overconfident sentence back into source-bound language + ## Allowed content - What the primitive is. 
@@ -78,6 +126,7 @@ Before finalizing, check for these failure modes: - business-model speculation - metrics or operational milestones not present in the source - missing explicit open questions where the notes remain unsettled +- update drift that rewrites unchanged material or quietly resolves ambiguity ## Current compiler surface diff --git a/skills/foundation-creator/baml_src/foundation_compiler/compiler_functions.baml b/skills/foundation-creator/baml_src/foundation_compiler/compiler_functions.baml index f793b6f..6a9f9a4 100644 --- a/skills/foundation-creator/baml_src/foundation_compiler/compiler_functions.baml +++ b/skills/foundation-creator/baml_src/foundation_compiler/compiler_functions.baml @@ -109,6 +109,7 @@ function RenderFoundationCreatorPrompt(brief: FoundationBrief) -> string { - Avoid implementation detail. - Emphasize thesis, boundaries, actor model, surfaces, and strategic bets. - State that the writer is a source-bound synthesizer, not a strategy consultant. + - If the brief includes `existing_foundation`, treat the task as update mode and preserve the existing document verbatim except for the requested edits. - Require markdown output with exactly this heading structure unless the user asks otherwise: - `# Foundation` - `## What This Is` @@ -125,6 +126,7 @@ function RenderFoundationCreatorPrompt(brief: FoundationBrief) -> string { - Require recently emerging or transitional surfaces to be qualified explicitly rather than flattened as fully settled. - Require `Open Questions` bullets to remain actual unresolved questions rather than disguised conclusions. - Forbid market-leadership or superiority claims unless they are explicit in the brief. + - In update mode, do not add new sections or broad new framing unless the request explicitly asks for them. - Make the prompt directly usable by an agent. 
Brief: diff --git a/skills/foundation-creator/baml_src/foundation_compiler/eval_runner.baml b/skills/foundation-creator/baml_src/foundation_compiler/eval_runner.baml index fb8fd23..edaa130 100644 --- a/skills/foundation-creator/baml_src/foundation_compiler/eval_runner.baml +++ b/skills/foundation-creator/baml_src/foundation_compiler/eval_runner.baml @@ -10,6 +10,9 @@ function CompileFoundationBriefFromPacket(packet: FoundationEvalPacket) -> Found Raw notes: {{ packet.raw_notes }} + Existing foundation: + {{ packet.existing_foundation }} + Expected criteria: {{ packet.expected_criteria }} @@ -18,6 +21,11 @@ function CompileFoundationBriefFromPacket(packet: FoundationEvalPacket) -> Found - Preserve ambiguity where the notes do not settle the framing. - Treat expected criteria as evaluation guidance, not as license to invent. - Avoid implementation detail. + - If `existing_foundation` is present, treat this as update mode: + - preserve the existing section structure + - carry forward unchanged framing and language where possible + - extract a narrow `update_request` rather than rewriting the entire document + - do not resolve existing ambiguity unless the task prompt explicitly asks for it - Do not infer monetization, metrics, org structure, GTM strategy, or operating plans unless the packet explicitly supports them. - If `strategic_bets` are included, phrase them as observed directional signals or evidence. - Each `strategic_bets` item should start with explicit hedge language such as `The notes suggest a bet on...`, `There are visible signals that...`, or `The company appears to be betting on...`. @@ -41,6 +49,7 @@ function RenderFoundationDocumentDraft(brief: FoundationBrief) -> string { - Start from thesis and boundaries, not architecture. - Preserve unresolved questions explicitly. - Avoid implementation detail. 
+ - If `existing_foundation` is present, treat this as update mode and let the update-mode rules override the create-mode template-completion rules below. - Stay source-bound: do not invent monetization, KPIs, org structure, partnerships, operating guidance, or next-step plans. - Prefer omission over plausible-sounding speculation. - Return markdown. @@ -60,6 +69,12 @@ function RenderFoundationDocumentDraft(brief: FoundationBrief) -> string { - When a surface is visible but still evolving in the packet, qualify it explicitly as emerging, evolving, or unsettled. - Write `Open Questions` as actual unresolved questions, typically ending with `?`. - Do not use market-leadership or competitive-superiority language unless the packet explicitly supports it. + - If `existing_foundation` is present, treat this as update mode: + - start from `existing_foundation` + - preserve all unchanged lines verbatim + - apply only the edits implied by `update_request` + - do not rewrite existing paragraphs or bullets unless the request requires it + - do not add new sections or broad new framing unless `update_request` explicitly asks for them Brief: {{ brief|format(type="yaml") }} @@ -85,6 +100,7 @@ function EvaluateFoundationDocument( - Grade against the expected criteria explicitly. - Reward preservation of uncertainty when the source packet is genuinely mixed. - Penalize invented certainty, invented capabilities, or implementation leakage. + - If `existing_foundation` is present, penalize rewriting unchanged content or adding broad new material outside the requested edit. - Penalize unsupported business-model, monetization, KPI, org, partnership, or operating-plan language. - Penalize consulting-style sections such as `Success Signals`, `Metrics`, `Decision Agenda`, `Next Steps`, `Operating Guidance`, or similar drift. - Penalize missing markdown heading structure if the document drifts from the required template shape. 
diff --git a/skills/foundation-creator/baml_src/foundation_compiler/eval_types.baml b/skills/foundation-creator/baml_src/foundation_compiler/eval_types.baml index 94058d1..b8ef525 100644 --- a/skills/foundation-creator/baml_src/foundation_compiler/eval_types.baml +++ b/skills/foundation-creator/baml_src/foundation_compiler/eval_types.baml @@ -22,4 +22,5 @@ class FoundationEvalPacket { task_prompt string @assert(nonempty_task_prompt, {{ this|length > 0 }}) raw_notes string @assert(nonempty_raw_notes, {{ this|length > 0 }}) expected_criteria string @assert(nonempty_expected_criteria, {{ this|length > 0 }}) + existing_foundation string? } diff --git a/skills/foundation-creator/baml_src/foundation_compiler/foundation_types.baml b/skills/foundation-creator/baml_src/foundation_compiler/foundation_types.baml index 002c42d..2d7e916 100644 --- a/skills/foundation-creator/baml_src/foundation_compiler/foundation_types.baml +++ b/skills/foundation-creator/baml_src/foundation_compiler/foundation_types.baml @@ -29,4 +29,6 @@ class FoundationBrief { surfaces string[] strategic_bets string[] unresolved_questions string[] + update_request string? + existing_foundation string? } diff --git a/skills/foundation-creator/evals/evals.json b/skills/foundation-creator/evals/evals.json index d8c6e80..198fdbe 100644 --- a/skills/foundation-creator/evals/evals.json +++ b/skills/foundation-creator/evals/evals.json @@ -105,6 +105,59 @@ "files": [ "fixtures/harbor_care/raw_notes.md" ] + }, + { + "id": 4, + "eval_name": "update-lightfast-foundation-boundary-surface-question", + "scenario_type": "update_existing_doc", + "input_shape": "existing_doc_update", + "ambiguity_level": "low", + "domain_profile": "company_foundation", + "primary_risks": [ + "update_regression", + "template_drift", + "invented_certainty" + ], + "prompt": "We have an existing Lightfast foundation document. 
Please update it in place: add a boundary clarifying that Lightfast is not a hosted control plane or system of record by default, add a durable surface bullet covering versioned evaluation artifacts and run histories, and add an open question about whether repo-native usage remains the center of gravity or shifts toward a hosted system of record. Keep everything else unchanged.", + "expected_output": "The existing Lightfast foundation document is preserved verbatim except for three requested additions: (1) a new boundary clarifying that Lightfast is not a hosted control plane or system of record by default; (2) a new durable surface bullet about versioned evaluation artifacts and run histories; and (3) a new open question about repo-native versus hosted center of gravity. No new sections, planning drift, or first-person language are introduced.", + "expected_file": "fixtures/lightfast_foundation_update/expected_criteria.md", + "validation_contract": { + "type": "reference_document_checks", + "validator": "foundation-update-v1", + "template_file": "references/template.md", + "language_file": "references/language.md", + "existing_foundation_file": "evals/fixtures/existing_foundation.md", + "required_patterns": [ + { + "id": "hosted_control_plane_boundary_present", + "pattern": "not a hosted control plane or system of record", + "flags": "i", + "details_pass": "Detected the requested hosted-control-plane/system-of-record boundary.", + "details_fail": "Did not detect the requested hosted-control-plane/system-of-record boundary." + }, + { + "id": "evaluation_artifacts_surface_present", + "pattern": "versioned evaluation artifacts and run histor", + "flags": "i", + "details_pass": "Detected the requested durable-surface bullet about evaluation artifacts and run histories.", + "details_fail": "Did not detect the requested durable-surface bullet about evaluation artifacts and run histories." 
+ }, + { + "id": "repo_native_vs_hosted_question_present", + "pattern": "repo-native usage remain[s]? the center of gravity.*hosted system of record", + "flags": "i", + "details_pass": "Detected the requested open question about repo-native versus hosted center of gravity.", + "details_fail": "Did not detect the requested open question about repo-native versus hosted center of gravity." + } + ] + }, + "packet_files": { + "expected_criteria": "fixtures/lightfast_foundation_update/expected_criteria.md", + "existing_foundation": "fixtures/existing_foundation.md" + }, + "files": [ + "fixtures/existing_foundation.md" + ] } ] } diff --git a/skills/foundation-creator/evals/fixtures/existing_foundation.md b/skills/foundation-creator/evals/fixtures/existing_foundation.md new file mode 100644 index 0000000..e1bd1b1 --- /dev/null +++ b/skills/foundation-creator/evals/fixtures/existing_foundation.md @@ -0,0 +1,56 @@ +# Lightfast Foundation + +## What This Is +Lightfast is a durable-artifact layer for agent work. It turns messy direction +into inspectable, versioned artifacts that structure human judgment across +authoring, review, and execution. + +## Core Thesis +- Durable artifacts such as foundation documents, specs, typed contracts, and + reusable skill packages survive model churn better than prompts or + transcripts. +- The core primitive is the translation of ambiguity into stable constraints + that humans and agents can inspect and reuse. +- Repo-native and artifact-native workflows are a strong default because + outputs can be versioned, reviewed, and edited directly. +- Coding workflows are a plausible first wedge, but the product may extend to + other repeatable work with heavy ambiguity. + +## Boundaries +- Not a chat wrapper, prompt library, or general AI assistant. +- Not an agency delivering bespoke strategy or operational work. +- Not project management, ticketing, or a drag-and-drop workflow builder. 
+- Not a fully autonomous company-in-a-box that replaces human ownership. + +## Actor Model +- Solo builders use reusable skill packages and local-first artifact workflows. +- Founders and operators use foundation documents and specs to align intent + while preserving decision ownership. +- Product, engineering, and operations teams use typed contracts and repeatable + artifacts to review and distribute working knowledge. + +## Durable Surfaces +- Foundation documents and templates that capture durable intent and scope. +- Specs that turn thesis-level framing into reviewable behavioral documents. +- Reusable skill packages that make operational knowledge installable. +- Typed contracts that make expectations explicit across humans, agents, and + code. +- Repo-native files and reference artifacts that support versioned local + editing and review. + +## Strategic Bets +- The notes suggest a bet on skills and coding workflows as an initial wedge + because evaluation loops are tighter there. +- There are visible signals that repo-native artifacts are a preferred + stability and inspection boundary. +- The company appears to be betting that formal constraints and reusable + artifacts will outlast raw prompts and transcripts. + +## Open Questions +- Is the main primitive better described as durable instructions for agents or + as a broader interface layer between humans, models, repositories, and + outputs? +- How much of the long-term value comes from authoring artifacts versus + distributing them? +- Will the company remain coding-first, or extend meaningfully into research, + operations, support, and GTM work? 
diff --git a/skills/foundation-creator/evals/fixtures/lightfast_foundation_update/expected_criteria.md b/skills/foundation-creator/evals/fixtures/lightfast_foundation_update/expected_criteria.md new file mode 100644 index 0000000..d167d07 --- /dev/null +++ b/skills/foundation-creator/evals/fixtures/lightfast_foundation_update/expected_criteria.md @@ -0,0 +1,13 @@ +# Expected Criteria + +- The output should preserve the existing Lightfast foundation document's title, + section order, and unchanged wording instead of rewriting it from scratch. +- The output should add one new `Boundaries` bullet clarifying that Lightfast + is not a hosted control plane or system of record by default. +- The output should add one new `Durable Surfaces` bullet covering versioned + evaluation artifacts and run histories. +- The output should add one new `Open Questions` bullet asking whether + repo-native usage remains the center of gravity or shifts toward a hosted + system of record. +- The output should not add new sections, planning language, monetization + claims, or stronger certainty than the request supports. diff --git a/skills/foundation-creator/references/language.md b/skills/foundation-creator/references/language.md index 49a4370..60decb0 100644 --- a/skills/foundation-creator/references/language.md +++ b/skills/foundation-creator/references/language.md @@ -21,6 +21,8 @@ How the foundation document should be worded. - Use `#` for the document title: `# Foundation`. - Use `##` for every major section in `references/template.md`. - Keep the section order exactly as the template defines it. +- In update mode, preserve the existing section order and keep unchanged wording + intact unless the requested edit requires a local rewrite. - Use bullets for `Core Thesis`, `Boundaries`, `Actor Model`, `Durable Surfaces`, `Strategic Bets`, and `Open Questions`. 
- `Open Questions` bullets should be written as actual open questions and usually From a5e56f3ce2002bd0da2bc7f4c6fd76fbb9ece637 Mon Sep 17 00:00:00 2001 From: Jeevan Pillay <169354619+jeevanpillay@users.noreply.github.com> Date: Tue, 21 Apr 2026 12:35:30 +1000 Subject: [PATCH 07/30] Add replacement-heavy foundation update eval --- README.md | 2 + evals/TAXONOMY.md | 2 - scripts/run-baml-eval.mjs | 112 +++++++++++++--- .../compiler_functions.baml | 2 + .../foundation_compiler/eval_runner.baml | 4 +- skills/foundation-creator/evals/evals.json | 120 ++++++++++++++++++ .../existing_foundation_overconfident.md | 47 +++++++ .../expected_criteria.md | 21 +++ 8 files changed, 291 insertions(+), 19 deletions(-) create mode 100644 skills/foundation-creator/evals/fixtures/existing_foundation_overconfident.md create mode 100644 skills/foundation-creator/evals/fixtures/lightfast_foundation_tighten/expected_criteria.md diff --git a/README.md b/README.md index 9381b4c..b0c6ecc 100644 --- a/README.md +++ b/README.md @@ -30,6 +30,7 @@ bun install bun run eval:foundation -- create-foundation-from-vercel-source-packet bun run eval:foundation -- create-foundation-from-lightfast-founder-notes bun run eval:foundation -- update-lightfast-foundation-boundary-surface-question +bun run eval:foundation -- update-lightfast-foundation-tighten-overreach bun run eval:spec -- create-from-vercel-mcp-source-packet bun run with-env -- bun run ./scripts/run-baml-eval.mjs foundation-creator create-foundation-from-cloudflare-source-packet --trials 3 ``` @@ -44,6 +45,7 @@ Current `foundation-creator` corpus includes: - `create-foundation-from-lightfast-founder-notes` - `create-foundation-from-harbor-care-source-packet` - `update-lightfast-foundation-boundary-surface-question` +- `update-lightfast-foundation-tighten-overreach` The runner now also writes: diff --git a/evals/TAXONOMY.md b/evals/TAXONOMY.md index 7cb33b3..b7fecdc 100644 --- a/evals/TAXONOMY.md +++ b/evals/TAXONOMY.md @@ -74,8 +74,6 @@ 
values: The next missing slices are: -- a second `update_existing_doc` packet for `foundation-creator` that requires - replacing or tightening existing language rather than additive edits only - baseline comparison runs (`current skill` vs `previous skill` / `no skill`) when the local harness is ready to compare deltas directly - optional Braintrust-style scorer/export integration if local JSON artifacts are diff --git a/scripts/run-baml-eval.mjs b/scripts/run-baml-eval.mjs index 4f01658..4bd5b1d 100644 --- a/scripts/run-baml-eval.mjs +++ b/scripts/run-baml-eval.mjs @@ -266,6 +266,38 @@ function createCheck(id, passed, details) { return { id, passed, details }; } +function compilePatternSpec(patternSpec) { + if (typeof patternSpec === "string") { + return new RegExp(patternSpec, "i"); + } + + return new RegExp(patternSpec.pattern, patternSpec.flags ?? "i"); +} + +function filterLinesByPatternSpecs(lines, patternSpecs = []) { + if (patternSpecs.length === 0) { + return lines; + } + + return lines.filter((line) => + !patternSpecs.some((patternSpec) => compilePatternSpec(patternSpec).test(line)), + ); +} + +function createPatternChecks(normalizedCandidate, patternChecks = [], expectedPresence = true) { + return patternChecks.map((patternCheck) => { + const expression = compilePatternSpec(patternCheck); + const matched = expression.test(normalizedCandidate); + const passed = expectedPresence ? matched : !matched; + + return createCheck( + patternCheck.id, + passed, + passed ? 
patternCheck.details_pass : patternCheck.details_fail, + ); + }); +} + function extractFoundationTemplateSections(templateText) { const sections = []; const lines = templateText.split(/\r?\n/); @@ -364,6 +396,49 @@ function extractMarkdownBullets(sectionBody) { return bullets; } +function extractNormalizedMarkdownBlocks(text) { + const blocks = []; + let current = null; + + function flush() { + if (current && normalizeLine(current).length > 0) { + blocks.push(normalizeLine(current)); + } + current = null; + } + + for (const rawLine of text.split(/\r?\n/)) { + const trimmed = rawLine.trim(); + + if (trimmed.length === 0) { + flush(); + continue; + } + + if (/^\s*#+\s+/.test(rawLine)) { + flush(); + blocks.push(normalizeLine(rawLine)); + continue; + } + + if (/^\s*-\s+/.test(rawLine)) { + flush(); + current = rawLine.replace(/^\s*-\s+/, "- "); + continue; + } + + if (current) { + current = `${current} ${trimmed}`.trim(); + continue; + } + + current = trimmed; + } + + flush(); + return blocks; +} + function validateFoundationDocument(candidateDocument, templateText) { const requiredSections = extractFoundationTemplateSections(templateText); const disallowedHeadings = extractFoundationDisallowedHeadings(templateText); @@ -433,7 +508,7 @@ function validateFoundationDocument(candidateDocument, templateText) { strategicBetLines.every((line) => /^Bet:/i.test(line) ? 
strategicBetsBodyHasDirectionalPreamble - : /\b(appears?|suggests?|signals?|signaling|indicates?|indicating|directional bet|directional bets|observed bet|observed bets|a bet that|bet that|bet on)\b/i.test( + : /\b(appears?|suggests?|signals?|signaling|indicates?|indicating|indications?|directional bet|directional bets|observed bet|observed bets|a bet that|bet that|bet on)\b/i.test( line, ), ); @@ -507,20 +582,24 @@ function validateFoundationUpdateDocument( validationContract, ) { const baseChecks = validateFoundationDocument(candidateDocument, templateText); - const existingLines = extractNonEmptyNormalizedLines(existingFoundationText); - const candidateLines = extractNonEmptyNormalizedLines(candidateDocument); + const existingBlocks = extractNormalizedMarkdownBlocks(existingFoundationText); + const candidateBlocks = extractNormalizedMarkdownBlocks(candidateDocument); const normalizedCandidate = normalizeLine(candidateDocument); - const preservesExistingContent = linesAppearInOrder(existingLines, candidateLines); - const requiredPatternChecks = (validationContract.required_patterns ?? []).map((patternCheck) => { - const expression = new RegExp(patternCheck.pattern, patternCheck.flags ?? "i"); - const passed = expression.test(normalizedCandidate); - - return createCheck( - patternCheck.id, - passed, - passed ? patternCheck.details_pass : patternCheck.details_fail, - ); - }); + const preservedExistingBlocks = filterLinesByPatternSpecs( + existingBlocks, + validationContract.allowed_removed_patterns ?? [], + ); + const preservesExistingContent = linesAppearInOrder(preservedExistingBlocks, candidateBlocks); + const requiredPatternChecks = createPatternChecks( + normalizedCandidate, + validationContract.required_patterns ?? [], + true, + ); + const forbiddenPatternChecks = createPatternChecks( + normalizedCandidate, + validationContract.forbidden_patterns ?? 
[], + false, + ); return [ ...baseChecks, @@ -528,10 +607,11 @@ function validateFoundationUpdateDocument( "existing_content_preserved_in_order", preservesExistingContent, preservesExistingContent - ? "All non-empty lines from the existing foundation appear in order in the candidate." - : "One or more non-empty lines from the existing foundation were removed or reordered.", + ? "All existing markdown blocks appear in order in the candidate, except blocks explicitly marked as replaceable." + : "One or more existing markdown blocks were removed or reordered outside the explicitly replaceable blocks.", ), ...requiredPatternChecks, + ...forbiddenPatternChecks, ]; } diff --git a/skills/foundation-creator/baml_src/foundation_compiler/compiler_functions.baml b/skills/foundation-creator/baml_src/foundation_compiler/compiler_functions.baml index 6a9f9a4..2a1d340 100644 --- a/skills/foundation-creator/baml_src/foundation_compiler/compiler_functions.baml +++ b/skills/foundation-creator/baml_src/foundation_compiler/compiler_functions.baml @@ -110,6 +110,7 @@ function RenderFoundationCreatorPrompt(brief: FoundationBrief) -> string { - Emphasize thesis, boundaries, actor model, surfaces, and strategic bets. - State that the writer is a source-bound synthesizer, not a strategy consultant. - If the brief includes `existing_foundation`, treat the task as update mode and preserve the existing document verbatim except for the requested edits. + - In update mode, tell the writer to copy every unchanged bullet and question verbatim instead of paraphrasing it. - Require markdown output with exactly this heading structure unless the user asks otherwise: - `# Foundation` - `## What This Is` @@ -127,6 +128,7 @@ function RenderFoundationCreatorPrompt(brief: FoundationBrief) -> string { - Require `Open Questions` bullets to remain actual unresolved questions rather than disguised conclusions. - Forbid market-leadership or superiority claims unless they are explicit in the brief. 
- In update mode, do not add new sections or broad new framing unless the request explicitly asks for them. + - In update mode, if a boundary, actor, surface, strategic bet, or open question is not part of the requested edit, preserve its wording exactly. - Make the prompt directly usable by an agent. Brief: diff --git a/skills/foundation-creator/baml_src/foundation_compiler/eval_runner.baml b/skills/foundation-creator/baml_src/foundation_compiler/eval_runner.baml index edaa130..3407e0d 100644 --- a/skills/foundation-creator/baml_src/foundation_compiler/eval_runner.baml +++ b/skills/foundation-creator/baml_src/foundation_compiler/eval_runner.baml @@ -23,9 +23,10 @@ function CompileFoundationBriefFromPacket(packet: FoundationEvalPacket) -> Found - Avoid implementation detail. - If `existing_foundation` is present, treat this as update mode: - preserve the existing section structure - - carry forward unchanged framing and language where possible + - carry forward unchanged framing and language verbatim whenever possible - extract a narrow `update_request` rather than rewriting the entire document - do not resolve existing ambiguity unless the task prompt explicitly asks for it + - if a boundary, actor, surface, strategic bet, or open question is not explicitly targeted by the task prompt, keep its wording unchanged in the brief - Do not infer monetization, metrics, org structure, GTM strategy, or operating plans unless the packet explicitly supports them. - If `strategic_bets` are included, phrase them as observed directional signals or evidence. - Each `strategic_bets` item should start with explicit hedge language such as `The notes suggest a bet on...`, `There are visible signals that...`, or `The company appears to be betting on...`. 
@@ -72,6 +73,7 @@ function RenderFoundationDocumentDraft(brief: FoundationBrief) -> string { - If `existing_foundation` is present, treat this as update mode: - start from `existing_foundation` - preserve all unchanged lines verbatim + - preserve unchanged bullet wording and unchanged open questions exactly as written - apply only the edits implied by `update_request` - do not rewrite existing paragraphs or bullets unless the request requires it - do not add new sections or broad new framing unless `update_request` explicitly asks for them diff --git a/skills/foundation-creator/evals/evals.json b/skills/foundation-creator/evals/evals.json index 198fdbe..1c7935c 100644 --- a/skills/foundation-creator/evals/evals.json +++ b/skills/foundation-creator/evals/evals.json @@ -158,6 +158,126 @@ "files": [ "fixtures/existing_foundation.md" ] + }, + { + "id": 5, + "eval_name": "update-lightfast-foundation-tighten-overreach", + "scenario_type": "update_existing_doc", + "input_shape": "existing_doc_update", + "ambiguity_level": "medium", + "domain_profile": "company_foundation", + "primary_risks": [ + "update_regression", + "invented_certainty", + "weak_boundaries" + ], + "prompt": "We have an older Lightfast foundation draft that overstates a few conclusions. 
Update it in place: replace the `operating system for durable agent work` framing with narrower source-bound language about a durable artifact or constraint layer for agent work; replace the thesis line about `replacing ambiguous human direction with machine-executable constraints` so it instead frames Lightfast as structuring human judgment through stable constraints; remove settled claims that repo-native is the clear long-term center of gravity or that hosted control planes matter less, and replace them with an explicit open question about repo-native usage versus a hosted control plane, catalog, or distribution surface; rewrite the wedge language so coding workflows are the easiest first wedge because eval loops are tighter, without implying coding is the permanent center of gravity; and soften any remaining categorical `Strategic Bets` language into directional, source-bound signals. Keep the existing heading structure and preserve everything else unless one of these local rewrites requires adjacent wording changes.", + "expected_output": "The older Lightfast foundation document is updated in place: the overreaching `operating system` framing is narrowed, the primitive is reframed around structuring human judgment rather than replacing it, settled repo-native superiority claims are removed and restored as an open tension, the wedge language now says coding workflows are the easiest first wedge because eval loops are tighter without implying permanent scope, and any remaining categorical strategic-bet language is softened into directional evidence. 
Unchanged content stays in place and no new planning or strategy sections are added.", + "expected_file": "fixtures/lightfast_foundation_tighten/expected_criteria.md", + "validation_contract": { + "type": "reference_document_checks", + "validator": "foundation-update-v1", + "template_file": "references/template.md", + "language_file": "references/language.md", + "existing_foundation_file": "evals/fixtures/existing_foundation_overconfident.md", + "allowed_removed_patterns": [ + { + "pattern": "^Lightfast is an operating system for durable agent work\\.$", + "flags": "i" + }, + { + "pattern": "^- The core primitive is replacing ambiguous human direction with machine-executable constraints\\.$", + "flags": "i" + }, + { + "pattern": "^- Repo-native workflows are the clear long-term center of gravity\\.$", + "flags": "i" + }, + { + "pattern": "^- Coding is the product center of gravity, not just the initial wedge\\.$", + "flags": "i" + }, + { + "pattern": "^- Skills are the wedge and will remain the product center of gravity\\.$", + "flags": "i" + }, + { + "pattern": "^- Repo-native distribution matters more than any hosted control plane\\.$", + "flags": "i" + }, + { + "pattern": "^- Formal constraints and reusable artifacts will outlast raw prompts and transcripts\\.$", + "flags": "i" + } + ], + "required_patterns": [ + { + "id": "narrower_artifact_layer_framing_present", + "pattern": "source-bound.*durable artifact and constraint layer|source-bound.*durable artifacts? and constraints?|durable artifact and constraint layer.*agent(?:-enabled)? work", + "flags": "i", + "details_pass": "Detected the narrower durable-artifact/constraint-layer framing in `What This Is`.", + "details_fail": "Did not detect the requested narrower durable-artifact/constraint-layer framing." + }, + { + "id": "human_judgment_thesis_present", + "pattern": "structur(?:e|ing) human judgment.*stable(?:,? [a-z-]+)? constraints?|stable(?:,? [a-z-]+)? 
constraints?.*structur(?:e|ing) human judgment", + "flags": "i", + "details_pass": "Detected the requested thesis language about structuring human judgment through stable constraints.", + "details_fail": "Did not detect the requested thesis language about structuring human judgment through stable constraints." + }, + { + "id": "coding_first_wedge_is_qualified", + "pattern": "coding workflows.*easiest (?:first|initial) wedge because eval(?:uation)? loops are tighter", + "flags": "i", + "details_pass": "Detected the requested qualified coding-first wedge language.", + "details_fail": "Did not detect the requested qualified coding-first wedge language." + }, + { + "id": "repo_native_vs_hosted_tension_restored", + "pattern": "repo-native.*hosted (?:control plane|catalog|distribution surface)", + "flags": "i", + "details_pass": "Detected the requested repo-native versus hosted tension.", + "details_fail": "Did not detect the requested repo-native versus hosted tension." + } + ], + "forbidden_patterns": [ + { + "id": "operating_system_overreach_removed", + "pattern": "operating system for durable agent work", + "flags": "i", + "details_pass": "Did not detect the overreaching `operating system` framing.", + "details_fail": "Still detected the overreaching `operating system` framing." + }, + { + "id": "machine_executable_constraint_line_removed", + "pattern": "replacing ambiguous human direction with machine-executable constraints", + "flags": "i", + "details_pass": "Did not detect the old thesis language about replacing ambiguous human direction.", + "details_fail": "Still detected the old thesis language about replacing ambiguous human direction." 
+ }, + { + "id": "repo_native_winner_claim_removed", + "pattern": "repo-native workflows are the clear long-term center of gravity|repo-native distribution matters more than any hosted control plane", + "flags": "i", + "details_pass": "Did not detect the old settled repo-native winner claims.", + "details_fail": "Still detected a settled repo-native winner claim that should have been removed." + }, + { + "id": "permanent_wedge_claim_removed", + "pattern": "coding is the product center of gravity, not just the initial wedge|skills are the wedge and will remain the product center of gravity", + "flags": "i", + "details_pass": "Did not detect the old permanent-center-of-gravity wedge claims.", + "details_fail": "Still detected a permanent-center-of-gravity wedge claim that should have been removed." + } + ] + }, + "packet_files": { + "expected_criteria": "fixtures/lightfast_foundation_tighten/expected_criteria.md", + "existing_foundation": "fixtures/existing_foundation_overconfident.md" + }, + "files": [ + "fixtures/existing_foundation_overconfident.md" + ] } ] } diff --git a/skills/foundation-creator/evals/fixtures/existing_foundation_overconfident.md b/skills/foundation-creator/evals/fixtures/existing_foundation_overconfident.md new file mode 100644 index 0000000..8ad7f78 --- /dev/null +++ b/skills/foundation-creator/evals/fixtures/existing_foundation_overconfident.md @@ -0,0 +1,47 @@ +# Lightfast Foundation + +## What This Is +Lightfast is an operating system for durable agent work. + +## Core Thesis +- Durable artifacts such as foundation documents, specs, typed contracts, and + reusable skill packages survive model churn better than prompts or + transcripts. +- The core primitive is replacing ambiguous human direction with + machine-executable constraints. +- Repo-native workflows are the clear long-term center of gravity. +- Coding is the product center of gravity, not just the initial wedge. 
+ +## Boundaries +- Not a chat wrapper, prompt library, or general AI assistant. +- Not an agency delivering bespoke strategy or operational work. +- Not project management, ticketing, or a drag-and-drop workflow builder. +- Not a fully autonomous company-in-a-box that replaces human ownership. + +## Actor Model +- Solo builders use reusable skill packages and local-first artifact workflows. +- Founders and operators use foundation documents and specs to align intent + while preserving decision ownership. +- Product, engineering, and operations teams use typed contracts and repeatable + artifacts to review and distribute working knowledge. + +## Durable Surfaces +- Foundation documents and templates that capture durable intent and scope. +- Specs that turn thesis-level framing into reviewable behavioral documents. +- Reusable skill packages that make operational knowledge installable. +- Typed contracts that make expectations explicit across humans, agents, and + code. +- Repo-native files and reference artifacts that support versioned local + editing and review. + +## Strategic Bets +- Skills are the wedge and will remain the product center of gravity. +- Repo-native distribution matters more than any hosted control plane. +- Formal constraints and reusable artifacts will outlast raw prompts and + transcripts. + +## Open Questions +- How much of the long-term value comes from authoring artifacts versus + distributing them? +- Will the company extend meaningfully beyond coding into operations, support, + and GTM work? 
diff --git a/skills/foundation-creator/evals/fixtures/lightfast_foundation_tighten/expected_criteria.md b/skills/foundation-creator/evals/fixtures/lightfast_foundation_tighten/expected_criteria.md new file mode 100644 index 0000000..a5ebabb --- /dev/null +++ b/skills/foundation-creator/evals/fixtures/lightfast_foundation_tighten/expected_criteria.md @@ -0,0 +1,21 @@ +# Expected Criteria + +- The update should keep the existing Lightfast foundation structure and retain + unchanged lines in place. +- The `What This Is` framing should be narrowed from `operating system for + durable agent work` to a more source-bound durable artifact or constraint + layer framing. +- The thesis should no longer say Lightfast replaces ambiguous human direction + with machine-executable constraints; it should instead say the system + structures human judgment through stable constraints or inspectable artifacts. +- The document should no longer claim repo-native workflows or distribution are + already the clear long-term winner. That tension should be restored as an + open question about repo-native usage versus a hosted control plane, catalog, + or distribution surface. +- The wedge language should be tightened so coding workflows are framed as the + easiest first wedge because eval loops are tighter, without implying coding + is the permanent center of gravity. +- Remaining categorical `Strategic Bets` language should be softened into + directional, source-bound signals rather than settled conclusions. +- The update should not introduce new sections, roadmap content, or invented + certainty beyond the requested rewrites. 
From c7710303d58acb56aa57fb080865a527cfaa3d3a Mon Sep 17 00:00:00 2001 From: Jeevan Pillay <169354619+jeevanpillay@users.noreply.github.com> Date: Tue, 21 Apr 2026 13:50:15 +1000 Subject: [PATCH 08/30] Add baseline comparison eval harness --- README.md | 15 + evals/TAXONOMY.md | 10 +- scripts/run-baml-eval.mjs | 497 ++++++++++++++++-- .../foundation_compiler/eval_runner.baml | 90 ++++ skills/foundation-creator/evals/evals.json | 2 +- 5 files changed, 562 insertions(+), 52 deletions(-) create mode 100644 skills/foundation-creator/eval_profiles/no-skill/baml_src/foundation_compiler/eval_runner.baml diff --git a/README.md b/README.md index b0c6ecc..d1208ad 100644 --- a/README.md +++ b/README.md @@ -33,6 +33,7 @@ bun run eval:foundation -- update-lightfast-foundation-boundary-surface-question bun run eval:foundation -- update-lightfast-foundation-tighten-overreach bun run eval:spec -- create-from-vercel-mcp-source-packet bun run with-env -- bun run ./scripts/run-baml-eval.mjs foundation-creator create-foundation-from-cloudflare-source-packet --trials 3 +bun run with-env -- bun run ./scripts/run-baml-eval.mjs foundation-creator update-lightfast-foundation-tighten-overreach --compare previous,profile:no-skill ``` Each run writes packet, brief, candidate document, and evaluation report @@ -56,6 +57,20 @@ The runner now also writes: - `benchmark.json` — aggregated status counts and timing summaries across all trials +When `--compare` is used, the run directory also includes: + +- `comparison.json` — head-to-head summary across variants, all judged by the + current skill's evaluator +- `variants/