# Patch overview. The text below is a unified diff; its hunk line counts and
# blob hashes describe the exact added content, so the diff body is untouched.
#
# .github/workflows/evals.yml (new file)
#   - "Evals" workflow triggered on pull_request, on push to main, and by
#     workflow_dispatch. The concurrency group is keyed on workflow name and
#     ref, with cancel-in-progress enabled.
#   - One "checks" job on ubuntu-latest (30-minute timeout, contents: read)
#     checks out the repo, sets up Node from .node-version and Bun 1.3.9,
#     installs with --frozen-lockfile, then runs `bun run ci:check`.
#   - AI_GATEWAY_API_KEY and BRAINTRUST_API_KEY are read from `secrets` into
#     job-level env; later steps branch on whether env.AI_GATEWAY_API_KEY is
#     empty. When it is set, the foundation and spec smoke evals run with the
#     "local" reporter, plus "braintrust" if BRAINTRUST_API_KEY is non-empty.
#     When it is empty, a skip message is echoed instead.
#   - Files matching skills/*/evals/runs/** are uploaded as the "eval-runs"
#     artifact (7-day retention, missing files ignored). The step uses
#     always(), so it also runs after a failed step, provided the key is set.
# .gitignore
#   - Drops the __pycache__/, *.py[cod], .venv/, .idea/ and .vscode/ entries
#     and adds node_modules/, generated BAML client directories, local eval
#     run outputs, .env files, *.log and .tmp-baml-client-tsconfig.json.
# .node-version (new file)
#   - Contains 24.15.0; the workflow's setup-node step points at this file
#     through node-version-file.
# README.md
#   - Adds the foundation-creator skill to the table and install commands,
#     and starts a "Local evals" section.
diff --git a/.github/workflows/evals.yml b/.github/workflows/evals.yml new file mode 100644 index 0000000..d9674af --- /dev/null +++ b/.github/workflows/evals.yml @@ -0,0 +1,70 @@ +name: Evals + +on: + pull_request: + push: + branches: + - main + workflow_dispatch: + +concurrency: + group: evals-${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + +env: + FORCE_JAVASCRIPT_ACTIONS_TO_NODE24: true + +jobs: + checks: + name: Typecheck and smoke evals + runs-on: ubuntu-latest + timeout-minutes: 30 + permissions: + contents: read + env: + AI_GATEWAY_API_KEY: ${{ secrets.AI_GATEWAY_API_KEY }} + BRAINTRUST_API_KEY: ${{ secrets.BRAINTRUST_API_KEY }} + BRAINTRUST_PROJECT: lightfast-skills + steps: + - name: Checkout + uses: actions/checkout@v4 + + - name: Set up Node + uses: actions/setup-node@v4 + with: + node-version-file: .node-version + + - name: Set up Bun + uses: oven-sh/setup-bun@v2 + with: + bun-version: 1.3.9 + + - name: Install dependencies + run: bun install --frozen-lockfile + + - name: Static eval checks + run: bun run ci:check + + - name: Run live smoke evals + if: env.AI_GATEWAY_API_KEY != '' + run: | + reporter="local" + if [ -n "${BRAINTRUST_API_KEY:-}" ]; then + reporter="local,braintrust" + fi + + bun run eval:foundation:smoke -- --reporter "$reporter" + bun run eval:spec:smoke -- --reporter "$reporter" + + - name: Skip live smoke evals + if: env.AI_GATEWAY_API_KEY == '' + run: echo "AI_GATEWAY_API_KEY is not configured; skipping model-backed smoke evals." 
+ + - name: Upload eval artifacts + if: always() && env.AI_GATEWAY_API_KEY != '' + uses: actions/upload-artifact@v4 + with: + name: eval-runs + path: skills/*/evals/runs/** + if-no-files-found: ignore + retention-days: 7 diff --git a/.gitignore b/.gitignore index 92c1e04..f399c82 100644 --- a/.gitignore +++ b/.gitignore @@ -1,6 +1,18 @@ +node_modules/ + +# Generated BAML clients +skills/**/baml_client/ +skills/**/baml_client_dist/ + +# Local eval outputs +skills/**/evals/runs/ + +# Local environment files +.env +.env.local +.env.*.local + +# Local OS and tooling noise .DS_Store -__pycache__/ -*.py[cod] -.venv/ -.idea/ -.vscode/ +*.log +.tmp-baml-client-tsconfig.json diff --git a/.node-version b/.node-version new file mode 100644 index 0000000..5bf4400 --- /dev/null +++ b/.node-version @@ -0,0 +1 @@ +24.15.0 diff --git a/README.md b/README.md index 1c9a33a..c9ac91d 100644 --- a/README.md +++ b/README.md @@ -6,6 +6,7 @@ Agent skills published by Lightfast. Compatible with [Claude Code](https://docs. | Skill | Purpose | |---|---| +| [`foundation-creator`](skills/foundation-creator/) | Draft a top-level foundation document for a product or company primitive: thesis, mission, boundaries, actor model, surfaces, strategic bets, and open questions. | | [`spec-creator`](skills/spec-creator/) | Write and update a top-level `SPEC.md` service specification following a strict template and language guide. | ## Install @@ -13,11 +14,165 @@ Agent skills published by Lightfast. Compatible with [Claude Code](https://docs. Each skill is a subdirectory under `skills/`. To install one into a project: ```bash +npx skills add lightfastai/skills --skill foundation-creator npx skills add lightfastai/skills --skill spec-creator ``` Or copy the directory directly into `.claude/skills/` in your project. +## Local evals + +This repo now includes BAML-backed fixture evals for `foundation-creator` and +`spec-creator`. 
+ +```bash +bun install +bun run ci:check +bun run eval:check +bun run eval:typecheck +bun run eval:foundation -- create-foundation-from-vercel-source-packet +bun run eval:foundation -- create-foundation-from-lightfast-founder-notes +bun run eval:foundation -- update-lightfast-foundation-boundary-surface-question +bun run eval:foundation -- update-lightfast-foundation-tighten-overreach +bun run eval:spec -- create-from-vercel-mcp-source-packet +bun run eval:foundation:smoke +bun run eval:spec:smoke +bun run eval:spec -- --all +bun run with-env -- bun ./scripts/run-baml-eval.ts foundation-creator create-foundation-from-cloudflare-source-packet --eval-profile gate --trials 3 +bun run with-env -- bun ./scripts/run-baml-eval.ts foundation-creator update-lightfast-foundation-tighten-overreach --eval-profile fast --compare previous,profile:no-skill +bun run with-env -- bun ./scripts/run-baml-eval.ts foundation-creator create-foundation-from-lightfast-founder-notes --eval-profile cross +``` + +Each run writes packet, brief, candidate document, and evaluation report +artifacts under `skills/<skill>/evals/runs/`. + +`bun run eval:check` is the cheap deterministic CI guard. It validates eval +manifests, fixture paths, validation regexes, smoke membership, and BAML runner +function wiring without calling any model. 
+ +Current `foundation-creator` corpus includes: + +- `create-foundation-from-vercel-source-packet` +- `create-foundation-from-cloudflare-source-packet` +- `create-foundation-from-lightfast-founder-notes` +- `create-foundation-from-harbor-care-source-packet` +- `update-lightfast-foundation-boundary-surface-question` +- `update-lightfast-foundation-tighten-overreach` + +The runner now also writes: + +- `deterministic_checks.json` — reference-driven checks derived from the skill's + `template.md` and `language.md` +- `timing.json` — per-stage local timing +- `summary.json` — per-trial LLM status + combined status +- `benchmark.json` — aggregated status counts and timing summaries across all + trials + +When `--compare` is used, the run directory also includes: + +- `comparison.json` — head-to-head summary across variants, all judged by the + current skill's evaluator +- `variants/