From a0a59682db03e40d9e65486957ac02fc06fcd4ff Mon Sep 17 00:00:00 2001 From: Jeevan Pillay <169354619+jeevanpillay@users.noreply.github.com> Date: Thu, 23 Apr 2026 21:09:17 +1000 Subject: [PATCH] Add Braintrust eval inspection workflow --- README.md | 29 + package.json | 5 +- scripts/braintrust-evals.ts | 547 ++++++++++++++++++ scripts/evals/README.md | 2 + scripts/evals/static-checks.ts | 1 + scripts/evals/validators/foundation.ts | 5 +- skills/foundation-creator/evals/evals.json | 3 + .../spec_compiler/compiler_functions.baml | 5 + .../baml_src/spec_compiler/eval_runner.baml | 7 + skills/spec-creator/evals/evals.json | 2 +- 10 files changed, 603 insertions(+), 3 deletions(-) create mode 100644 scripts/braintrust-evals.ts diff --git a/README.md b/README.md index c9ac91d..a311fe7 100644 --- a/README.md +++ b/README.md @@ -97,6 +97,11 @@ directory, or a suite directory: bun run eval:spec -- update-add-single-nongoal-preserve-system-overview --deterministic-only skills/spec-creator/evals/runs//candidate.md ``` +Update-mode validation contracts can use `skip_base_check_ids` when a packet +explicitly asks to preserve legacy text that would fail a generic create-mode +style rule. Keep these skips narrow and pair them with required/forbidden +patterns for the actual requested edit. + Current comparison variants: - `current` — working tree prompt stack @@ -149,6 +154,30 @@ skill package names. Current values are `foundation-doc` and `service-spec`. Optional Braintrust environment variables are `BRAINTRUST_EXPERIMENT` for manual curated runs and `BRAINTRUST_ORG` for org selection. 
+Braintrust can also be inspected from the terminal without opening the UI: + +```bash +bun run braintrust:list -- --limit 5 +bun run braintrust:latest -- --capability foundation-doc +bun run braintrust:latest -- --capability service-spec +bun run braintrust:show -- foundation-doc.smoke.fast.model.20260423-1015.0a10e79 +``` + +These commands use Braintrust's API and BTQL directly, summarize experiment +rows, and print combined status counts, LLM status counts, deterministic +failures, open issues, timing, and per-eval row status. They require +`BRAINTRUST_API_KEY` and use `BRAINTRUST_PROJECT` when set. + +Braintrust also provides an optional beta `bt` CLI for listing experiments, +running BTQL, and syncing experiment data locally: + +```bash +curl -fsSL https://bt.dev/cli/install.sh | bash +bt experiments list --project lightfast-skills --env-file .env --json --no-input +bt sql "SELECT id, input, scores FROM experiment('<experiment-id>') LIMIT 20" --env-file .env --json --no-input +bt sync pull experiment:<experiment-name> --project lightfast-skills --env-file .env +``` + Eval manifests also carry lightweight taxonomy metadata (`scenario_type`, `input_shape`, `ambiguity_level`, `domain_profile`, `primary_risks`) so benchmark runs can be grouped by failure mode. 
Shared diff --git a/package.json b/package.json index 640b5ba..f295cdf 100644 --- a/package.json +++ b/package.json @@ -15,7 +15,10 @@ "eval:spec": "bun run with-env -- bun ./scripts/run-baml-eval.ts spec-creator", "eval:spec:smoke": "bun run eval:spec -- --smoke", "eval:check": "bun ./scripts/check-eval-fixtures.ts foundation-creator spec-creator", - "eval:typecheck": "tsc --noEmit --allowImportingTsExtensions --moduleResolution bundler --module esnext --target esnext --skipLibCheck --types node scripts/check-eval-fixtures.ts scripts/run-baml-eval.ts scripts/evals/*.ts scripts/evals/validators/*.ts", + "eval:typecheck": "tsc --noEmit --allowImportingTsExtensions --moduleResolution bundler --module esnext --target esnext --skipLibCheck --types node scripts/check-eval-fixtures.ts scripts/run-baml-eval.ts scripts/braintrust-evals.ts scripts/evals/*.ts scripts/evals/validators/*.ts", + "braintrust:list": "bun run with-env -- bun ./scripts/braintrust-evals.ts list", + "braintrust:latest": "bun run with-env -- bun ./scripts/braintrust-evals.ts latest", + "braintrust:show": "bun run with-env -- bun ./scripts/braintrust-evals.ts show", "ci:check": "bun run eval:check && bun run baml:generate:foundation && bun run baml:generate:spec && bun run eval:typecheck" }, "dependencies": { diff --git a/scripts/braintrust-evals.ts b/scripts/braintrust-evals.ts new file mode 100644 index 0000000..7f82349 --- /dev/null +++ b/scripts/braintrust-evals.ts @@ -0,0 +1,547 @@ +type JsonObject = Record; + +type BraintrustExperiment = { + id: string; + name: string; + project_id?: string; + created?: string; + commit?: string | null; + tags?: string[] | null; + metadata?: JsonObject | null; +}; + +type BraintrustRow = { + id?: string; + input?: JsonObject | null; + scores?: JsonObject | null; + metadata?: JsonObject | null; + metrics?: JsonObject | null; +}; + +type Filters = { + capability?: string; + skill?: string; + profile?: string; + suite?: string; +}; + +type Options = Filters & { + 
apiUrl: string; + command: string; + experiment?: string; + json: boolean; + limit: number; + org?: string; + project: string; +}; + +const DEFAULT_PROJECT = "lightfast-skills"; +const DEFAULT_API_URL = "https://api.braintrust.dev"; + +function fail(message: string): never { + console.error(message); + process.exit(1); +} + +function usage(): never { + fail(`Usage: + bun run braintrust:list -- [--limit N] [--capability ID] [--skill NAME] [--profile NAME] [--suite MODE] [--json] + bun run braintrust:latest -- [--capability ID] [--skill NAME] [--profile NAME] [--suite MODE] [--json] + bun run braintrust:show -- [--limit N] [--json] + +Environment: + BRAINTRUST_API_KEY is required. + BRAINTRUST_PROJECT defaults to '${DEFAULT_PROJECT}'. + BRAINTRUST_API_URL defaults to '${DEFAULT_API_URL}'.`); +} + +function parseArgs(argv: string[]): Options { + const args = [...argv]; + const command = args.shift() ?? "list"; + const options: Options = { + apiUrl: process.env.BRAINTRUST_API_URL ?? DEFAULT_API_URL, + command, + json: false, + limit: command === "show" || command === "latest" ? 1000 : 10, + org: process.env.BRAINTRUST_ORG ?? process.env.BRAINTRUST_ORG_NAME, + project: + process.env.BRAINTRUST_PROJECT ?? + process.env.BRAINTRUST_DEFAULT_PROJECT ?? 
+ DEFAULT_PROJECT, + }; + + while (args.length > 0) { + const arg = args.shift(); + if (!arg) { + continue; + } + + switch (arg) { + case "--json": + options.json = true; + break; + case "--project": + options.project = requireValue(arg, args.shift()); + break; + case "--api-url": + options.apiUrl = requireValue(arg, args.shift()); + break; + case "--org": + options.org = requireValue(arg, args.shift()); + break; + case "--limit": + options.limit = parsePositiveInteger(requireValue(arg, args.shift()), arg); + break; + case "--capability": + options.capability = requireValue(arg, args.shift()); + break; + case "--skill": + options.skill = requireValue(arg, args.shift()); + break; + case "--profile": + options.profile = requireValue(arg, args.shift()); + break; + case "--suite": + options.suite = requireValue(arg, args.shift()); + break; + default: + if (arg.startsWith("--")) { + usage(); + } + if (options.experiment) { + usage(); + } + options.experiment = arg; + } + } + + return options; +} + +function requireValue(flag: string, value?: string): string { + if (!value || value.startsWith("--")) { + fail(`${flag} requires a value.`); + } + return value; +} + +function parsePositiveInteger(value: string, flag: string): number { + const parsed = Number.parseInt(value, 10); + if (!Number.isInteger(parsed) || parsed <= 0) { + fail(`${flag} must be a positive integer.`); + } + return parsed; +} + +function getApiKey(): string { + const apiKey = process.env.BRAINTRUST_API_KEY; + if (!apiKey) { + fail("BRAINTRUST_API_KEY is required."); + } + return apiKey; +} + +async function braintrustFetch( + options: Options, + path: string, + init: RequestInit = {}, +): Promise { + const apiUrl = options.apiUrl.replace(/\/+$/, ""); + const response = await fetch(`${apiUrl}${path}`, { + ...init, + headers: { + Authorization: `Bearer ${getApiKey()}`, + "Content-Type": "application/json", + ...(init.headers ?? 
{}), + }, + }); + + if (!response.ok) { + fail(`Braintrust API error ${response.status}: ${await response.text()}`); + } + + return await response.json(); +} + +function metadataOf(experiment: BraintrustExperiment): JsonObject { + return experiment.metadata ?? {}; +} + +function nestedObject(value: unknown): JsonObject { + return value && typeof value === "object" && !Array.isArray(value) + ? (value as JsonObject) + : {}; +} + +function stringValue(value: unknown): string | null { + return typeof value === "string" ? value : null; +} + +function booleanValue(value: unknown): boolean | null { + return typeof value === "boolean" ? value : null; +} + +function numberValue(value: unknown): number | null { + return typeof value === "number" && Number.isFinite(value) ? value : null; +} + +function arrayValue(value: unknown): unknown[] { + return Array.isArray(value) ? value : []; +} + +function statusFromScore(score: unknown): string | null { + if (score === 1) { + return "Pass"; + } + if (score === 0.5) { + return "Partial"; + } + if (score === 0) { + return "Fail"; + } + return null; +} + +function filterExperiment(experiment: BraintrustExperiment, filters: Filters): boolean { + const metadata = metadataOf(experiment); + const tags = experiment.tags ?? 
[]; + const evalProfile = nestedObject(metadata.eval_profile); + + return ( + (!filters.capability || + metadata.capability_id === filters.capability || + tags.includes(filters.capability)) && + (!filters.skill || metadata.skill_name === filters.skill || tags.includes(filters.skill)) && + (!filters.profile || + evalProfile.name === filters.profile || + tags.includes(filters.profile)) && + (!filters.suite || metadata.suite_mode === filters.suite || tags.includes(filters.suite)) + ); +} + +async function listExperiments(options: Options, requestedLimit = options.limit) { + const params = new URLSearchParams({ + limit: String(Math.max(requestedLimit * 5, 50)), + project_name: options.project, + }); + + if (options.org) { + params.set("org_name", options.org); + } + + const response = (await braintrustFetch( + options, + `/v1/experiment?${params}`, + )) as { objects?: BraintrustExperiment[] }; + + return (response.objects ?? []) + .filter((experiment) => filterExperiment(experiment, options)) + .slice(0, requestedLimit); +} + +async function resolveExperiment(options: Options): Promise { + const selector = options.experiment ?? "latest"; + + if (selector === "latest") { + const [latest] = await listExperiments(options, 1); + if (!latest) { + fail("No matching Braintrust experiments found."); + } + return latest; + } + + const params = new URLSearchParams({ + limit: "10", + project_name: options.project, + }); + + if (options.org) { + params.set("org_name", options.org); + } + + if (isUuid(selector)) { + params.append("ids", selector); + } else { + params.set("experiment_name", selector); + } + + const response = (await braintrustFetch( + options, + `/v1/experiment?${params}`, + )) as { objects?: BraintrustExperiment[] }; + const [experiment] = response.objects ?? 
[]; + + if (!experiment) { + fail(`Braintrust experiment not found: ${selector}`); + } + + return experiment; +} + +function isUuid(value: string): boolean { + return /^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$/i.test( + value, + ); +} + +function btqlString(value: string): string { + return `'${value.replaceAll("'", "''")}'`; +} + +async function fetchExperimentRows(options: Options, experimentId: string) { + const query = ` + SELECT id, input, scores, metadata, metrics + FROM experiment(${btqlString(experimentId)}) + LIMIT ${options.limit} + `; + + const response = (await braintrustFetch(options, "/btql", { + method: "POST", + body: JSON.stringify({ query, fmt: "json" }), + })) as { data?: BraintrustRow[] }; + + return response.data ?? []; +} + +function summarizeRow(row: BraintrustRow) { + const input = nestedObject(row.input); + const scores = nestedObject(row.scores); + const metadata = nestedObject(row.metadata); + const metrics = nestedObject(row.metrics); + const summary = nestedObject(metadata.summary); + const report = nestedObject(metadata.report); + const deterministicChecks = nestedObject(metadata.deterministic_checks); + const checks = arrayValue(deterministicChecks.checks).map(nestedObject); + const failedChecks = checks + .filter((check) => check.passed === false) + .map((check) => stringValue(check.id)) + .filter((id): id is string => Boolean(id)); + const openIssues = arrayValue(report.open_issues) + .map((issue) => String(issue)) + .filter(Boolean); + + return { + id: row.id ?? null, + eval_name: stringValue(input.eval_name) ?? "unknown-eval", + variant: stringValue(input.variant) ?? "current", + trial: numberValue(input.trial) ?? null, + llm_status: + stringValue(summary.llm_status) ?? + stringValue(report.overall_status) ?? + statusFromScore(scores.llm_status) ?? + "Unknown", + combined_status: + stringValue(summary.combined_status) ?? + statusFromScore(scores.combined_status) ?? 
+ "Unknown", + deterministic_pass: + booleanValue(summary.deterministic_pass) ?? + (scores.deterministic_pass === 1 ? true : scores.deterministic_pass === 0 ? false : null), + failed_checks: failedChecks, + open_issues: openIssues, + total_ms: numberValue(metrics.total_ms), + artifact_dir: stringValue(metadata.artifact_dir), + }; +} + +function buildExperimentSummary(experiment: BraintrustExperiment, rows: BraintrustRow[]) { + const rowSummaries = rows.map(summarizeRow); + const combinedStatusCounts = countStatuses(rowSummaries.map((row) => row.combined_status)); + const llmStatusCounts = countStatuses(rowSummaries.map((row) => row.llm_status)); + const totalMs = rowSummaries + .map((row) => row.total_ms) + .filter((value): value is number => value !== null); + + return { + experiment, + rows: rowSummaries, + summary: { + row_count: rowSummaries.length, + llm_status_counts: llmStatusCounts, + combined_status_counts: combinedStatusCounts, + deterministic_failures: rowSummaries + .filter((row) => row.failed_checks.length > 0) + .map((row) => ({ + eval_name: row.eval_name, + trial: row.trial, + failed_checks: row.failed_checks, + })), + open_issues: rowSummaries + .filter((row) => row.open_issues.length > 0) + .map((row) => ({ + eval_name: row.eval_name, + trial: row.trial, + open_issues: row.open_issues, + })), + timing_ms: + totalMs.length === 0 + ? 
null + : { + min: Math.min(...totalMs), + max: Math.max(...totalMs), + avg: Math.round(totalMs.reduce((sum, value) => sum + value, 0) / totalMs.length), + }, + }, + }; +} + +function countStatuses(statuses: string[]) { + return { + Pass: statuses.filter((status) => status === "Pass").length, + Partial: statuses.filter((status) => status === "Partial").length, + Fail: statuses.filter((status) => status === "Fail").length, + Unknown: statuses.filter((status) => !["Pass", "Partial", "Fail"].includes(status)).length, + }; +} + +function printExperimentList(experiments: BraintrustExperiment[]) { + if (experiments.length === 0) { + console.log("No matching Braintrust experiments found."); + return; + } + + const rows = experiments.map((experiment) => { + const metadata = metadataOf(experiment); + const profile = nestedObject(metadata.eval_profile); + const git = nestedObject(metadata.git); + + return { + created: formatDate(experiment.created), + capability: String(metadata.capability_id ?? ""), + suite: String(metadata.suite_mode ?? ""), + profile: String(profile.name ?? ""), + sha: String(git.short_sha ?? experiment.commit ?? ""), + name: experiment.name, + id: experiment.id, + }; + }); + + printTable(rows, ["created", "capability", "suite", "profile", "sha", "name"]); +} + +function printExperimentSummary(summary: ReturnType) { + const experiment = summary.experiment; + const metadata = metadataOf(experiment); + const profile = nestedObject(metadata.eval_profile); + const git = nestedObject(metadata.git); + + console.log(`Experiment: ${experiment.name}`); + console.log(`ID: ${experiment.id}`); + console.log(`Created: ${formatDate(experiment.created)}`); + console.log( + `Run: ${metadata.skill_name ?? "unknown-skill"} / ${metadata.capability_id ?? "unknown-capability"} / ${metadata.suite_mode ?? "unknown-suite"} / ${profile.name ?? "unknown-profile"}`, + ); + console.log(`Commit: ${git.short_sha ?? experiment.commit ?? 
"unknown"}`); + console.log(`Rows: ${summary.summary.row_count}`); + console.log( + `Combined: Pass ${summary.summary.combined_status_counts.Pass}, Partial ${summary.summary.combined_status_counts.Partial}, Fail ${summary.summary.combined_status_counts.Fail}, Unknown ${summary.summary.combined_status_counts.Unknown}`, + ); + console.log( + `LLM: Pass ${summary.summary.llm_status_counts.Pass}, Partial ${summary.summary.llm_status_counts.Partial}, Fail ${summary.summary.llm_status_counts.Fail}, Unknown ${summary.summary.llm_status_counts.Unknown}`, + ); + + if (summary.summary.timing_ms) { + console.log( + `Timing total ms: avg ${summary.summary.timing_ms.avg}, min ${summary.summary.timing_ms.min}, max ${summary.summary.timing_ms.max}`, + ); + } + + if (summary.summary.deterministic_failures.length > 0) { + console.log(""); + console.log("Deterministic failures:"); + for (const failure of summary.summary.deterministic_failures) { + console.log( + `- ${failure.eval_name}${failure.trial ? ` trial ${failure.trial}` : ""}: ${failure.failed_checks.join(", ")}`, + ); + } + } + + if (summary.summary.open_issues.length > 0) { + console.log(""); + console.log("LLM open issues:"); + for (const issueGroup of summary.summary.open_issues) { + console.log( + `- ${issueGroup.eval_name}${issueGroup.trial ? ` trial ${issueGroup.trial}` : ""}: ${issueGroup.open_issues.join(" | ")}`, + ); + } + } + + console.log(""); + printTable( + summary.rows.map((row) => ({ + combined: row.combined_status, + llm: row.llm_status, + deterministic: row.deterministic_pass === null ? "?" : row.deterministic_pass ? "yes" : "no", + total_ms: row.total_ms === null ? "" : String(row.total_ms), + eval: row.eval_name, + trial: row.trial === null ? 
"" : String(row.trial), + })), + ["combined", "llm", "deterministic", "total_ms", "trial", "eval"], + ); +} + +function formatDate(value?: string): string { + if (!value) { + return ""; + } + const date = new Date(value); + if (Number.isNaN(date.getTime())) { + return value; + } + return date.toISOString().replace("T", " ").replace(/\.\d+Z$/, "Z"); +} + +function printTable(rows: JsonObject[], columns: string[]) { + const widths = new Map( + columns.map((column) => [ + column, + Math.max( + column.length, + ...rows.map((row) => String(row[column] ?? "").length), + ), + ]), + ); + const formatRow = (row: JsonObject) => + columns.map((column) => String(row[column] ?? "").padEnd(widths.get(column) ?? 0)).join(" "); + + console.log(formatRow(Object.fromEntries(columns.map((column) => [column, column])))); + console.log(columns.map((column) => "-".repeat(widths.get(column) ?? 0)).join(" ")); + for (const row of rows) { + console.log(formatRow(row)); + } +} + +async function main() { + const options = parseArgs(process.argv.slice(2)); + + switch (options.command) { + case "list": { + const experiments = await listExperiments(options); + if (options.json) { + console.log(JSON.stringify(experiments, null, 2)); + } else { + printExperimentList(experiments); + } + break; + } + case "latest": + case "show": { + const experiment = await resolveExperiment(options); + const rows = await fetchExperimentRows(options, experiment.id); + const summary = buildExperimentSummary(experiment, rows); + if (options.json) { + console.log(JSON.stringify(summary, null, 2)); + } else { + printExperimentSummary(summary); + } + break; + } + default: + usage(); + } +} + +main().catch((error) => { + console.error(error instanceof Error ? 
error.message : String(error)); + process.exit(1); +}); diff --git a/scripts/evals/README.md b/scripts/evals/README.md index 76a162a..976edda 100644 --- a/scripts/evals/README.md +++ b/scripts/evals/README.md @@ -9,6 +9,8 @@ Current modules: - `artifacts.ts` owns local artifact writes and deterministic-only candidate artifact discovery. - `baml.ts` regenerates and imports generated BAML clients. +- `../braintrust-evals.ts` is a terminal inspector for Braintrust experiments; + it stays outside the runner core and talks to Braintrust through API/BTQL. - `cli.ts` parses command-line flags into a runner request. - `git.ts` captures lightweight git metadata for experiment names and reporter metadata. diff --git a/scripts/evals/static-checks.ts b/scripts/evals/static-checks.ts index d089d4f..515cc9e 100644 --- a/scripts/evals/static-checks.ts +++ b/scripts/evals/static-checks.ts @@ -195,6 +195,7 @@ async function checkValidationContract(contract, skillRoot, issues, scope) { issue(issues, scope, `'${field}' must be an array when present.`); } } + checkStringArray(contract.skip_base_check_ids, issues, scope, "skip_base_check_ids"); } async function readBamlSourceFiles(directory, issues, scope) { diff --git a/scripts/evals/validators/foundation.ts b/scripts/evals/validators/foundation.ts index b7cf4d4..66ee29c 100644 --- a/scripts/evals/validators/foundation.ts +++ b/scripts/evals/validators/foundation.ts @@ -255,7 +255,10 @@ export function validateFoundationUpdateDocument( validationContract, packet, ) { - const baseChecks = validateFoundationDocument(candidateDocument, templateText, "", packet); + const skippedBaseCheckIds = new Set(validationContract.skip_base_check_ids ?? []); + const baseChecks = validateFoundationDocument(candidateDocument, templateText, "", packet).filter( + (check) => !skippedBaseCheckIds.has(check.id), + ); const filteredExistingFoundationText = removeReplaceableSectionContent( existingFoundationText, validationContract.replaceable_sections ?? 
[], diff --git a/skills/foundation-creator/evals/evals.json b/skills/foundation-creator/evals/evals.json index 5d1f97d..f88499d 100644 --- a/skills/foundation-creator/evals/evals.json +++ b/skills/foundation-creator/evals/evals.json @@ -129,6 +129,9 @@ "template_file": "references/template.md", "language_file": "references/language.md", "existing_foundation_file": "evals/fixtures/existing_foundation.md", + "skip_base_check_ids": [ + "strategic_bets_use_directional_language" + ], "required_patterns": [ { "id": "hosted_control_plane_boundary_present", diff --git a/skills/spec-creator/baml_src/spec_compiler/compiler_functions.baml b/skills/spec-creator/baml_src/spec_compiler/compiler_functions.baml index e4aec0b..3bcfbbc 100644 --- a/skills/spec-creator/baml_src/spec_compiler/compiler_functions.baml +++ b/skills/spec-creator/baml_src/spec_compiler/compiler_functions.baml @@ -103,6 +103,9 @@ function CompileSpecBrief(raw_notes: string, existing_spec: string?) -> SpecBrie - In update mode, do not restate or expand unchanged Purpose, Problem Statement, Goal, Non-Goal, component, dependency, or domain-model content just because you can infer a cleaner version. - In update mode, if an unchanged paragraph, bullet, or component description already exists in `existing_spec`, preserve that wording exactly instead of restating it from summary fields. - In update mode, do not make stylistic micro-edits to untouched text. Avoid synonym swaps, helper-word insertion, tense normalization, article changes, singular/plural cleanup, or punctuation-only rewriting unless the requested edit requires it. + - In update mode, if the request or criteria use words such as `refresh`, `replace`, `rewrite`, `stale`, `old`, `older`, `generic`, or `shorthand` for a named section or subsection, treat that section's corresponding brief field as a complete replacement list for that section, not as an append-only merge with the old list. 
+ - In update mode, when a refreshed section says old bullets should be removed, do not carry those bullets forward verbatim in the corresponding brief field even if they are still semantically related. Replace them with the sharper source-backed wording from the packet. + - In update mode, when the packet lists distinct replacement concepts for a refreshed section, preserve those concepts as distinct brief items unless the packet says they are alternatives. Do not collapse separate dependency categories, boundary categories, or open questions into one broad bullet. - In update mode, if the request asks for a new component, boundary bullet, or open-question section, do not automatically propagate that concept into Purpose, Problem Statement, Goals, Abstraction Levels, External Dependencies, or the domain model unless the request explicitly asks for those extra edits. - In update mode, if the request does not explicitly ask to revise `## 4. Core Domain Model`, keep the entity set unchanged. A newly added component, surface, boundary, or open question does not by itself justify a new entity or new fields. - In update mode, prefer expressing newly requested behavior in component text, boundary bullets, or unresolved questions before considering any domain-model edit. @@ -271,7 +274,9 @@ function RenderSpecCreatorPrompt(brief: SpecBrief) -> string { - In update mode, if `update_request` uses words like refresh, replace, or rewrite for a named section or subsection, treat that named section or subsection as replaceable. Keep its heading and neighboring sections unchanged, but replace the section body with the corresponding brief content. - For a refreshed or replaced section, the old section body is not authoritative. Do not copy old bullets or questions from `existing_spec` into the refreshed section unless the same bullet or question appears in the corresponding brief field. - In update mode, insertion is preferred only for add-only requests. 
Do not preserve stale bullets inside a section that the request explicitly asks to refresh or replace. + - In update mode, if the request or criteria call existing bullets stale, old, older, generic, shorthand, loose, or too vague, remove those bullets from the refreshed section even when they are adjacent to newly requested material. - If `update_request` asks to refresh `### 2.2 Non-Goals`, render `brief.non_goals` as the complete new bullet list for that subsection and remove old non-goal bullets that are absent from `brief.non_goals`. + - If `update_request` asks to refresh `### 3.3 External Dependencies`, render `brief.external_dependencies` as the complete new bullet list for that subsection and remove old dependency bullets that are absent from `brief.external_dependencies`. - If `update_request` asks to refresh `## 5. Open Questions`, render `brief.unresolved_questions` as the complete new bullet list for that section and remove old questions that are absent from `brief.unresolved_questions`. - In update mode, prefer insertion over rewrite. When adding a new bullet or component beside existing text, keep neighboring unchanged lines exactly as they appear in `existing_spec`. - If `update_request` asks for a new bullet, component, boundary, or open question, insert only that material into the existing structure instead of regenerating neighboring paragraphs or adding unrelated template sections. This insertion rule does not apply to refresh, replace, or rewrite requests. 
diff --git a/skills/spec-creator/baml_src/spec_compiler/eval_runner.baml b/skills/spec-creator/baml_src/spec_compiler/eval_runner.baml index 5689a08..8a6038f 100644 --- a/skills/spec-creator/baml_src/spec_compiler/eval_runner.baml +++ b/skills/spec-creator/baml_src/spec_compiler/eval_runner.baml @@ -89,6 +89,9 @@ function CompileSpecBriefFromPacket(packet: SpecEvalPacket) -> SpecBrief { - In update mode, do not restate or expand unchanged Purpose, Problem Statement, Goal, Non-Goal, component, dependency, or domain-model content just because you can infer a cleaner version. - In update mode, if an unchanged paragraph, bullet, or component description already exists in `existing_spec`, preserve that wording exactly instead of restating it from summary fields. - In update mode, do not make stylistic micro-edits to untouched text. Avoid synonym swaps, helper-word insertion, tense normalization, article changes, singular/plural cleanup, or punctuation-only rewriting unless the requested edit requires it. + - In update mode, if the request or criteria use words such as `refresh`, `replace`, `rewrite`, `stale`, `old`, `older`, `generic`, or `shorthand` for a named section or subsection, treat that section's corresponding brief field as a complete replacement list for that section, not as an append-only merge with the old list. + - In update mode, when a refreshed section says old bullets should be removed, do not carry those bullets forward verbatim in the corresponding brief field even if they are still semantically related. Replace them with the sharper source-backed wording from the packet. + - In update mode, when the packet lists distinct replacement concepts for a refreshed section, preserve those concepts as distinct brief items unless the packet says they are alternatives. Do not collapse separate dependency categories, boundary categories, or open questions into one broad bullet. 
- In update mode, if the request asks for a new component, boundary bullet, or open-question section, do not automatically propagate that concept into Purpose, Problem Statement, Goals, Abstraction Levels, External Dependencies, or the domain model unless the request explicitly asks for those extra edits. - In update mode, if the request does not explicitly ask to revise `## 4. Core Domain Model`, keep the entity set unchanged. A newly added component, surface, boundary, or open question does not by itself justify a new entity or new fields. - In update mode, prefer expressing newly requested behavior in component text, boundary bullets, or unresolved questions before considering any domain-model edit. @@ -230,7 +233,9 @@ function RenderSpecDocumentDraft(brief: SpecBrief) -> string { - for a section or subsection refresh, keep the heading and neighboring sections unchanged, but replace that section body with the corresponding brief content - for a refreshed or replaced section, the old section body is not authoritative; do not copy old bullets or questions from `existing_spec` into the refreshed section unless the same bullet or question appears in the corresponding brief field - insertion is preferred only for add-only requests; do not preserve stale bullets inside a section that the request explicitly asks to refresh or replace + - if the request or criteria call existing bullets stale, old, older, generic, shorthand, loose, or too vague, remove those bullets from the refreshed section even when they are adjacent to newly requested material - if `update_request` asks to refresh `### 2.2 Non-Goals`, render `brief.non_goals` as the complete new bullet list for that subsection and remove old non-goal bullets that are absent from `brief.non_goals` + - if `update_request` asks to refresh `### 3.3 External Dependencies`, render `brief.external_dependencies` as the complete new bullet list for that subsection and remove old dependency bullets that are absent from 
`brief.external_dependencies` - if `update_request` asks to refresh `## 5. Open Questions`, render `brief.unresolved_questions` as the complete new bullet list for that section and remove old questions that are absent from `brief.unresolved_questions` - prefer insertion over rewrite; when adding a new bullet or component beside existing text, keep neighboring unchanged lines exactly as they appear in `existing_spec` - if `update_request` asks for a new bullet, component, boundary, or open question, insert only that material into the existing structure instead of regenerating neighboring paragraphs or adding unrelated template sections; this insertion rule does not apply to refresh, replace, or rewrite requests @@ -266,6 +271,8 @@ function EvaluateSpecDocument( - Penalize invented capabilities, invented certainty, or implementation leakage. - Reward correct scope boundaries and careful handling of transition states. - Penalize missing core template sections, missing `### 3.2 Abstraction Levels`, missing `### 3.3 External Dependencies`, malformed field formatting, or moving `Important boundary:` out of the Problem Statement block. + - In create mode, do not require field-format examples when the candidate explicitly says the service defines no durable service-specific entities or the packet does not support source-backed entities. Field formatting applies only to fields that are actually emitted. + - Do not treat top-level metadata lines such as `Status:` or `Purpose:` as entity fields, and do not call those lines malformed for lacking `name (type)` notation. - If `packet.existing_spec` is present, judge structural completeness relative to the requested update. Do not penalize sections or subsections that were already absent from `existing_spec` unless `task_prompt` or `expected_criteria` explicitly asks for those sections to be added. - In update mode, reward preserving the existing document shape when the task says to keep everything else as-is. 
- In update mode, penalize style-only paraphrases of untouched paragraphs, bullets, or component descriptions even when the meaning is still close. diff --git a/skills/spec-creator/evals/evals.json b/skills/spec-creator/evals/evals.json index 5d3b922..25ca9db 100644 --- a/skills/spec-creator/evals/evals.json +++ b/skills/spec-creator/evals/evals.json @@ -27,7 +27,7 @@ "implementation_leakage" ], "prompt": "Can you write a SPEC.md for a service we're building called Glacier Tier Manager? It watches S3 buckets and moves old objects to Glacier Deep Archive after policy-defined age thresholds. Polling, not event-driven — we want deterministic cadence. Main components: a Bucket Poller that lists objects on a schedule, a Policy Evaluator that decides which objects are eligible, and a Tiering Executor that issues the storage-class change. External deps: S3 API and per-bucket lifecycle policy files. Not a general-purpose cost optimizer — it only does tiering.", - "expected_output": "A SPEC.md at repo root covering Purpose, Problem Statement (with 3 bullets + 'Important boundary' block), Goals + Non-Goals (non-goal should reflect 'not a cost optimizer'), System Overview with the three named components, External Dependencies, and a Core Domain Model. Voice is third-person, fields use the `name` (type) format, obligation keywords are lowercase.", + "expected_output": "A SPEC.md at repo root covering Purpose, Problem Statement (with 3 bullets + 'Important boundary' block), Goals + Non-Goals (non-goal should reflect 'not a cost optimizer'), System Overview with the three named components, External Dependencies, and a Core Domain Model. Voice is third-person and obligation keywords are lowercase. If the source supports durable service-owned entities, fields use the `name` (type) format; if it does not, the Core Domain Model may explicitly state that no durable service-specific entities are defined.", "files": [] }, {