diff --git a/docs/_diagrams/README.md b/docs/_diagrams/README.md new file mode 100644 index 0000000..3efdcbb --- /dev/null +++ b/docs/_diagrams/README.md @@ -0,0 +1,36 @@ +# Diagram Sources + +Excalidraw source files for diagrams embedded in the docs. Edit the `.excalidraw` source here, then export PNG to `website/public/img/`. + +## Files + +| Source | Rendered PNG | Used In | +| ------------------------------- | --------------------------------------------------- | -------------------------------------------- | +| `eval-test-types.excalidraw` | `website/public/img/eval-test-types.png` | explanation/what-are-evals.md | + +## Exporting to PNG + +A headless renderer is included. From the project root: + +```bash +# Install playwright once if you have not (~50 MB) +npm install --no-save playwright + +# Render any source file in this folder +node docs/_diagrams/render.mjs \ + docs/_diagrams/eval-test-types.excalidraw \ + website/public/img/eval-test-types.png \ + 2 +``` + +The third arg is the scale factor (2 = retina). The renderer loads `@excalidraw/utils` from esm.sh, exports to canvas, and writes the PNG. Internet is required on the first run; thereafter esm.sh caches. + +If you prefer the GUI, drag the `.excalidraw` onto [excalidraw.com](https://excalidraw.com) and use File > Export image > PNG (scale 2x, white background). + +The PNG is committed alongside the source so the docs render without a build step. + +## Convention + +- Two-character element indices only (`a0`, `aZ`, `b3`). Three-char indices like `d9a` silently break the file. +- Use distinct fill colors per logical group (e.g., workspace, artifact flow, trigger flow, output). +- Keep diagrams legible at 1200px wide. diff --git a/docs/_diagrams/eval-test-types.excalidraw b/docs/_diagrams/eval-test-types.excalidraw new file mode 100644 index 0000000..cd8b687 --- /dev/null +++ b/docs/_diagrams/eval-test-types.excalidraw @@ -0,0 +1,1357 @@ +{ + "type": "excalidraw", + "version": 2, + "source": "https://excalidraw.com", + "elements": [ + { + "id": "el_1001", + "type": "text", + "x": 350, + "y": 20, + "width": 600, + "height": 40, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "roundness": null, + "seed": 7926919, + "version": 1, + "versionNonce": 6157151, + "isDeleted": false, + "boundElements": null, + "updatedAt": 1778369776591, + "link": null, + "locked": false, + "index": "a0", + "text": "Eval Runner: How the Test Types Flow", + "fontSize": 26, + "fontFamily": 1, + "textAlign": "center", + "verticalAlign": "top", + "containerId": null, + "originalText": "Eval Runner: How the Test Types Flow", + "lineHeight": 1.25, + "baseline": 22 + }, + { + "id": "el_1002", + "type": "rectangle", + "x": 220, + "y": 90, + "width": 860, + "height": 110, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "#fff7e6", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "roundness": { + "type": 3 + }, + "seed": 7934838, + "version": 1, + "versionNonce": 6163302, + "isDeleted": false, + "boundElements": [ + { + "type": "text", + "id": "el_1003" + } + ], + "updatedAt": 1778369776591, + "link": null, + "locked": false, + "index": "a1" + }, + { + "id": "el_1003", + "type": "text", + "x": 220, + "y": 105.0, + "width": 860, + "height": 80, + "angle": 0, + "strokeColor": 
"#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "roundness": null, + "seed": 7942757, + "version": 1, + "versionNonce": 6169453, + "isDeleted": false, + "boundElements": null, + "updatedAt": 1778369776591, + "link": null, + "locked": false, + "index": "a2", + "text": "Isolated Workspace (Docker container or HOME-overridden temp dir)\nProject rsync + Setup overlays (base + per-eval) + Skill under test + Fixtures + Credentials", + "fontSize": 16, + "fontFamily": 1, + "textAlign": "center", + "verticalAlign": "top", + "containerId": "el_1002", + "originalText": "Isolated Workspace (Docker container or HOME-overridden temp dir)\nProject rsync + Setup overlays (base + per-eval) + Skill under test + Fixtures + Credentials", + "lineHeight": 1.25, + "baseline": 13 + }, + { + "id": "el_1004", + "type": "arrow", + "x": 650, + "y": 200, + "width": 300, + "height": 80, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "roundness": null, + "seed": 7950676, + "version": 1, + "versionNonce": 6175604, + "isDeleted": false, + "boundElements": null, + "updatedAt": 1778369776591, + "link": null, + "locked": false, + "index": "a3", + "points": [ + [ + 0, + 0 + ], + [ + -300, + 80 + ] + ], + "lastCommittedPoint": null, + "startBinding": { + "elementId": "el_1002", + "focus": 0, + "gap": 4 + }, + "endBinding": null, + "startArrowhead": null, + "endArrowhead": "arrow", + "elbowed": false + }, + { + "id": "el_1005", + "type": "arrow", + "x": 650, + "y": 200, + "width": 300, + "height": 80, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "roundness": null, + "seed": 7958595, + "version": 1, + "versionNonce": 6181755, + "isDeleted": false, + "boundElements": null, + "updatedAt": 1778369776591, + "link": null, + "locked": false, + "index": "a4", + "points": [ + [ + 0, + 0 + ], + [ + 300, + 80 + ] + ], + "lastCommittedPoint": null, + "startBinding": { + "elementId": "el_1002", + "focus": 0, + "gap": 4 + }, + "endBinding": null, + "startArrowhead": null, + "endArrowhead": "arrow", + "elbowed": false + }, + { + "id": "el_1006", + "type": "text", + "x": 200, + "y": 230, + "width": 300, + "height": 30, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "roundness": null, + "seed": 7966514, + "version": 1, + "versionNonce": 6187906, + "isDeleted": false, + "boundElements": null, + "updatedAt": 1778369776591, + "link": null, + "locked": false, + "index": "a5", + "text": "Artifact Eval", + "fontSize": 20, + "fontFamily": 1, + "textAlign": "center", + "verticalAlign": "top", + "containerId": null, + "originalText": "Artifact Eval", + "lineHeight": 1.25, + "baseline": 17 + }, + { + "id": "el_1007", + "type": "text", + "x": 800, + "y": 230, + "width": 300, + "height": 30, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + 
"frameId": null, + "roundness": null, + "seed": 7974433, + "version": 1, + "versionNonce": 6194057, + "isDeleted": false, + "boundElements": null, + "updatedAt": 1778369776591, + "link": null, + "locked": false, + "index": "a6", + "text": "Trigger Eval", + "fontSize": 20, + "fontFamily": 1, + "textAlign": "center", + "verticalAlign": "top", + "containerId": null, + "originalText": "Trigger Eval", + "lineHeight": 1.25, + "baseline": 17 + }, + { + "id": "el_1008", + "type": "rectangle", + "x": 200, + "y": 280, + "width": 300, + "height": 70, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "#e6f4ff", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "roundness": { + "type": 3 + }, + "seed": 7982352, + "version": 1, + "versionNonce": 6200208, + "isDeleted": false, + "boundElements": [ + { + "type": "text", + "id": "el_1009" + } + ], + "updatedAt": 1778369776591, + "link": null, + "locked": false, + "index": "a7" + }, + { + "id": "el_1009", + "type": "text", + "x": 200, + "y": 303.0, + "width": 300, + "height": 24, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "roundness": null, + "seed": 7990271, + "version": 1, + "versionNonce": 6206359, + "isDeleted": false, + "boundElements": null, + "updatedAt": 1778369776591, + "link": null, + "locked": false, + "index": "a8", + "text": "claude -p with eval prompt", + "fontSize": 16, + "fontFamily": 1, + "textAlign": "center", + "verticalAlign": "top", + "containerId": "el_1008", + "originalText": "claude -p with eval prompt", + "lineHeight": 1.25, + "baseline": 13 + }, + { + "id": "el_1010", + "type": "rectangle", + "x": 200, + "y": 380, + "width": 300, + "height": 70, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "#e6f4ff", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "roundness": { + "type": 3 + }, + "seed": 7998190, + "version": 1, + "versionNonce": 6212510, + "isDeleted": false, + "boundElements": [ + { + "type": "text", + "id": "el_1011" + } + ], + "updatedAt": 1778369776591, + "link": null, + "locked": false, + "index": "a9" + }, + { + "id": "el_1011", + "type": "text", + "x": 200, + "y": 403.0, + "width": 300, + "height": 24, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "roundness": null, + "seed": 8006109, + "version": 1, + "versionNonce": 6218661, + "isDeleted": false, + "boundElements": null, + "updatedAt": 1778369776591, + "link": null, + "locked": false, + "index": "aA", + "text": "transcript.jsonl + artifacts/", + "fontSize": 16, + "fontFamily": 1, + "textAlign": "center", + "verticalAlign": "top", + "containerId": "el_1010", + "originalText": "transcript.jsonl + artifacts/", + "lineHeight": 1.25, + "baseline": 13 + }, + { + "id": "el_1012", + "type": "rectangle", + "x": 200, + "y": 480, + "width": 300, + "height": 70, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "#e6f4ff", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "roundness": { + "type": 3 + }, + 
"seed": 8014028, + "version": 1, + "versionNonce": 6224812, + "isDeleted": false, + "boundElements": [ + { + "type": "text", + "id": "el_1013" + } + ], + "updatedAt": 1778369776591, + "link": null, + "locked": false, + "index": "aB" + }, + { + "id": "el_1013", + "type": "text", + "x": 200, + "y": 503.0, + "width": 300, + "height": 24, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "roundness": null, + "seed": 8021947, + "version": 1, + "versionNonce": 6230963, + "isDeleted": false, + "boundElements": null, + "updatedAt": 1778369776591, + "link": null, + "locked": false, + "index": "aC", + "text": "Grader subagent (parallel)\nreads transcript + artifacts", + "fontSize": 16, + "fontFamily": 1, + "textAlign": "center", + "verticalAlign": "top", + "containerId": "el_1012", + "originalText": "Grader subagent (parallel)\nreads transcript + artifacts", + "lineHeight": 1.25, + "baseline": 13 + }, + { + "id": "el_1014", + "type": "rectangle", + "x": 200, + "y": 580, + "width": 300, + "height": 70, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "#e6f4ff", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "roundness": { + "type": 3 + }, + "seed": 8029866, + "version": 1, + "versionNonce": 6237114, + "isDeleted": false, + "boundElements": [ + { + "type": "text", + "id": "el_1015" + } + ], + "updatedAt": 1778369776591, + "link": null, + "locked": false, + "index": "aD" + }, + { + "id": "el_1015", + "type": "text", + "x": 200, + "y": 603.0, + "width": 300, + "height": 24, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "roundness": null, + "seed": 8037785, + "version": 1, + "versionNonce": 6243265, + "isDeleted": false, + "boundElements": null, + "updatedAt": 1778369776591, + "link": null, + "locked": false, + "index": "aE", + "text": "grading.json\n(PASS/FAIL per expectation)", + "fontSize": 16, + "fontFamily": 1, + "textAlign": "center", + "verticalAlign": "top", + "containerId": "el_1014", + "originalText": "grading.json\n(PASS/FAIL per expectation)", + "lineHeight": 1.25, + "baseline": 13 + }, + { + "id": "el_1016", + "type": "arrow", + "x": 350.0, + "y": 350, + "width": 0.0, + "height": 30, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "roundness": null, + "seed": 8045704, + "version": 1, + "versionNonce": 6249416, + "isDeleted": false, + "boundElements": null, + "updatedAt": 1778369776591, + "link": null, + "locked": false, + "index": "aF", + "points": [ + [ + 0, + 0 + ], + [ + 0.0, + 30 + ] + ], + "lastCommittedPoint": null, + "startBinding": { + "elementId": "el_1008", + "focus": 0, + "gap": 4 + }, + "endBinding": { + "elementId": "el_1010", + "focus": 0, + "gap": 4 + }, + "startArrowhead": null, + "endArrowhead": "arrow", + "elbowed": false + }, + { + "id": "el_1017", + "type": "arrow", + "x": 350.0, + "y": 450, + "width": 0.0, + "height": 30, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + 
"strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "roundness": null, + "seed": 8053623, + "version": 1, + "versionNonce": 6255567, + "isDeleted": false, + "boundElements": null, + "updatedAt": 1778369776591, + "link": null, + "locked": false, + "index": "aG", + "points": [ + [ + 0, + 0 + ], + [ + 0.0, + 30 + ] + ], + "lastCommittedPoint": null, + "startBinding": { + "elementId": "el_1010", + "focus": 0, + "gap": 4 + }, + "endBinding": { + "elementId": "el_1012", + "focus": 0, + "gap": 4 + }, + "startArrowhead": null, + "endArrowhead": "arrow", + "elbowed": false + }, + { + "id": "el_1018", + "type": "arrow", + "x": 350.0, + "y": 550, + "width": 0.0, + "height": 30, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "roundness": null, + "seed": 8061542, + "version": 1, + "versionNonce": 6261718, + "isDeleted": false, + "boundElements": null, + "updatedAt": 1778369776591, + "link": null, + "locked": false, + "index": "aH", + "points": [ + [ + 0, + 0 + ], + [ + 0.0, + 30 + ] + ], + "lastCommittedPoint": null, + "startBinding": { + "elementId": "el_1012", + "focus": 0, + "gap": 4 + }, + "endBinding": { + "elementId": "el_1014", + "focus": 0, + "gap": 4 + }, + "startArrowhead": null, + "endArrowhead": "arrow", + "elbowed": false + }, + { + "id": "el_1019", + "type": "rectangle", + "x": 800, + "y": 280, + "width": 300, + "height": 70, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "#e8f5e9", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "roundness": { + "type": 3 + }, + "seed": 8069461, + "version": 1, + "versionNonce": 6267869, + "isDeleted": false, + "boundElements": [ + { + "type": "text", + "id": "el_1020" + } + ], + "updatedAt": 1778369776591, + "link": null, + "locked": false, + "index": "aI" + }, + { + "id": "el_1020", + "type": "text", + "x": 800, + "y": 303.0, + "width": 300, + "height": 24, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "roundness": null, + "seed": 8077380, + "version": 1, + "versionNonce": 6274020, + "isDeleted": false, + "boundElements": null, + "updatedAt": 1778369776591, + "link": null, + "locked": false, + "index": "aJ", + "text": "synthetic skill staged at\n.claude/skills//", + "fontSize": 16, + "fontFamily": 1, + "textAlign": "center", + "verticalAlign": "top", + "containerId": "el_1019", + "originalText": "synthetic skill staged at\n.claude/skills//", + "lineHeight": 1.25, + "baseline": 13 + }, + { + "id": "el_1021", + "type": "rectangle", + "x": 800, + "y": 380, + "width": 300, + "height": 70, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "#e8f5e9", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "roundness": { + "type": 3 + }, + "seed": 8085299, + "version": 1, + "versionNonce": 6280171, + "isDeleted": false, + "boundElements": [ + { + "type": "text", + "id": "el_1022" + } + ], + "updatedAt": 1778369776591, + "link": null, + "locked": false, + "index": "aK" + }, + { + "id": "el_1022", + "type": "text", + "x": 800, + "y": 403.0, + "width": 300, + 
"height": 24, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "roundness": null, + "seed": 8093218, + "version": 1, + "versionNonce": 6286322, + "isDeleted": false, + "boundElements": null, + "updatedAt": 1778369776591, + "link": null, + "locked": false, + "index": "aL", + "text": "claude -p with query \u00d7 3 runs", + "fontSize": 16, + "fontFamily": 1, + "textAlign": "center", + "verticalAlign": "top", + "containerId": "el_1021", + "originalText": "claude -p with query \u00d7 3 runs", + "lineHeight": 1.25, + "baseline": 13 + }, + { + "id": "el_1023", + "type": "rectangle", + "x": 800, + "y": 480, + "width": 300, + "height": 70, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "#e8f5e9", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "roundness": { + "type": 3 + }, + "seed": 8101137, + "version": 1, + "versionNonce": 6292473, + "isDeleted": false, + "boundElements": [ + { + "type": "text", + "id": "el_1024" + } + ], + "updatedAt": 1778369776591, + "link": null, + "locked": false, + "index": "aM" + }, + { + "id": "el_1024", + "type": "text", + "x": 800, + "y": 503.0, + "width": 300, + "height": 24, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "roundness": null, + "seed": 8109056, + "version": 1, + "versionNonce": 6298624, + "isDeleted": false, + "boundElements": null, + "updatedAt": 1778369776591, + "link": null, + "locked": false, + "index": "aN", + "text": "detect Skill tool fires\nreferencing ", + "fontSize": 16, + "fontFamily": 1, + "textAlign": "center", + "verticalAlign": "top", + "containerId": "el_1023", + "originalText": "detect Skill tool fires\nreferencing ", + "lineHeight": 1.25, + "baseline": 13 + }, + { + "id": "el_1025", + "type": "rectangle", + "x": 800, + "y": 580, + "width": 300, + "height": 70, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "#e8f5e9", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "roundness": { + "type": 3 + }, + "seed": 8116975, + "version": 1, + "versionNonce": 6304775, + "isDeleted": false, + "boundElements": [ + { + "type": "text", + "id": "el_1026" + } + ], + "updatedAt": 1778369776591, + "link": null, + "locked": false, + "index": "aO" + }, + { + "id": "el_1026", + "type": "text", + "x": 800, + "y": 603.0, + "width": 300, + "height": 24, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "roundness": null, + "seed": 8124894, + "version": 1, + "versionNonce": 6310926, + "isDeleted": false, + "boundElements": null, + "updatedAt": 1778369776591, + "link": null, + "locked": false, + "index": "aP", + "text": "fire_rate per query\n(pass = matches should_trigger)", + "fontSize": 16, + "fontFamily": 1, + "textAlign": "center", + "verticalAlign": "top", + "containerId": "el_1025", + "originalText": "fire_rate per query\n(pass = matches should_trigger)", + "lineHeight": 1.25, + "baseline": 13 + }, + { + "id": "el_1027", 
+ "type": "arrow", + "x": 950.0, + "y": 350, + "width": 0.0, + "height": 30, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "roundness": null, + "seed": 8132813, + "version": 1, + "versionNonce": 6317077, + "isDeleted": false, + "boundElements": null, + "updatedAt": 1778369776591, + "link": null, + "locked": false, + "index": "aQ", + "points": [ + [ + 0, + 0 + ], + [ + 0.0, + 30 + ] + ], + "lastCommittedPoint": null, + "startBinding": { + "elementId": "el_1019", + "focus": 0, + "gap": 4 + }, + "endBinding": { + "elementId": "el_1021", + "focus": 0, + "gap": 4 + }, + "startArrowhead": null, + "endArrowhead": "arrow", + "elbowed": false + }, + { + "id": "el_1028", + "type": "arrow", + "x": 950.0, + "y": 450, + "width": 0.0, + "height": 30, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "roundness": null, + "seed": 8140732, + "version": 1, + "versionNonce": 6323228, + "isDeleted": false, + "boundElements": null, + "updatedAt": 1778369776591, + "link": null, + "locked": false, + "index": "aR", + "points": [ + [ + 0, + 0 + ], + [ + 0.0, + 30 + ] + ], + "lastCommittedPoint": null, + "startBinding": { + "elementId": "el_1021", + "focus": 0, + "gap": 4 + }, + "endBinding": { + "elementId": "el_1023", + "focus": 0, + "gap": 4 + }, + "startArrowhead": null, + "endArrowhead": "arrow", + "elbowed": false + }, + { + "id": "el_1029", + "type": "arrow", + "x": 950.0, + "y": 550, + "width": 0.0, + "height": 30, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "roundness": null, + "seed": 8148651, + "version": 1, + "versionNonce": 6329379, + "isDeleted": false, + "boundElements": null, + "updatedAt": 1778369776591, + "link": null, + "locked": false, + "index": "aS", + "points": [ + [ + 0, + 0 + ], + [ + 0.0, + 30 + ] + ], + "lastCommittedPoint": null, + "startBinding": { + "elementId": "el_1023", + "focus": 0, + "gap": 4 + }, + "endBinding": { + "elementId": "el_1025", + "focus": 0, + "gap": 4 + }, + "startArrowhead": null, + "endArrowhead": "arrow", + "elbowed": false + }, + { + "id": "el_1030", + "type": "rectangle", + "x": 450, + "y": 720, + "width": 400, + "height": 70, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "#fff0f0", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "roundness": { + "type": 3 + }, + "seed": 8156570, + "version": 1, + "versionNonce": 6335530, + "isDeleted": false, + "boundElements": [ + { + "type": "text", + "id": "el_1031" + } + ], + "updatedAt": 1778369776591, + "link": null, + "locked": false, + "index": "aT" + }, + { + "id": "el_1031", + "type": "text", + "x": 450, + "y": 743.0, + "width": 400, + "height": 24, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "roundness": null, + "seed": 8164489, + "version": 1, + "versionNonce": 6341681, + "isDeleted": false, + "boundElements": 
null, + "updatedAt": 1778369776591, + "link": null, + "locked": false, + "index": "aU", + "text": "report.html (aggregated run report)", + "fontSize": 18, + "fontFamily": 1, + "textAlign": "center", + "verticalAlign": "top", + "containerId": "el_1030", + "originalText": "report.html (aggregated run report)", + "lineHeight": 1.25, + "baseline": 15 + }, + { + "id": "el_1032", + "type": "arrow", + "x": 350.0, + "y": 650, + "width": 180.0, + "height": 70, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "roundness": null, + "seed": 8172408, + "version": 1, + "versionNonce": 6347832, + "isDeleted": false, + "boundElements": null, + "updatedAt": 1778369776591, + "link": null, + "locked": false, + "index": "aV", + "points": [ + [ + 0, + 0 + ], + [ + 180.0, + 70 + ] + ], + "lastCommittedPoint": null, + "startBinding": { + "elementId": "el_1014", + "focus": 0, + "gap": 4 + }, + "endBinding": { + "elementId": "el_1030", + "focus": 0, + "gap": 4 + }, + "startArrowhead": null, + "endArrowhead": "arrow", + "elbowed": false + }, + { + "id": "el_1033", + "type": "arrow", + "x": 950.0, + "y": 650, + "width": 180.0, + "height": 70, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "roundness": null, + "seed": 8180327, + "version": 1, + "versionNonce": 6353983, + "isDeleted": false, + "boundElements": null, + "updatedAt": 1778369776591, + "link": null, + "locked": false, + "index": "aW", + "points": [ + [ + 0, + 0 + ], + [ + -180.0, + 70 + ] + ], + "lastCommittedPoint": null, + "startBinding": { + "elementId": "el_1025", + "focus": 0, + "gap": 4 + }, + "endBinding": { + "elementId": "el_1030", + "focus": 0, + "gap": 4 + }, + "startArrowhead": null, + "endArrowhead": "arrow", + "elbowed": false + } + ], + "appState": { + "gridSize": 20, + "gridStep": 5, + "gridModeEnabled": false, + "viewBackgroundColor": "#ffffff" + }, + "files": {} +} \ No newline at end of file diff --git a/docs/_diagrams/render.html b/docs/_diagrams/render.html new file mode 100644 index 0000000..39267e5 --- /dev/null +++ b/docs/_diagrams/render.html @@ -0,0 +1,48 @@ + + + + +Excalidraw → PNG + + + +
loading...
+ + + + diff --git a/docs/_diagrams/render.mjs b/docs/_diagrams/render.mjs new file mode 100644 index 0000000..92585c9 --- /dev/null +++ b/docs/_diagrams/render.mjs @@ -0,0 +1,66 @@ +#!/usr/bin/env node +// Render an .excalidraw file to PNG via headless Chromium. +// Usage: node render.mjs <in.excalidraw> <out.png> [scale] + +import { chromium } from 'playwright'; +import { readFileSync, writeFileSync, statSync } from 'node:fs'; +import { resolve } from 'node:path'; +import { fileURLToPath, pathToFileURL } from 'node:url'; + +const [, , inPathArg, outPathArg, scaleArg] = process.argv; +if (!inPathArg || !outPathArg) { + console.error('usage: render.mjs <in.excalidraw> <out.png> [scale]'); + process.exit(2); +} +const inPath = resolve(inPathArg); +const outPath = resolve(outPathArg); +const scale = parseFloat(scaleArg || '2'); +statSync(inPath); + +const sceneJson = JSON.parse(readFileSync(inPath, 'utf-8')); + +const htmlPath = resolve(fileURLToPath(import.meta.url), '..', 'render.html'); +const htmlUrl = pathToFileURL(htmlPath).href; + +const browser = await chromium.launch({ headless: true }); +const ctx = await browser.newContext({ viewport: { width: 2400, height: 1800 } }); +const page = await ctx.newPage(); +page.on('console', (msg) => { + const t = msg.type(); + if (t === 'error' || t === 'warning') console.error(`[browser:${t}]`, msg.text()); +}); +page.on('pageerror', (err) => console.error('[pageerror]', err.message)); + +await page.addInitScript( + ({ scene, scale }) => { + window.__scene = scene; + window.__scale = scale; + }, + { scene: sceneJson, scale }, +); + +console.error('loading:', htmlUrl); +await page.goto(htmlUrl, { waitUntil: 'domcontentloaded' }); + +await page.waitForFunction(() => window.__done === true, { timeout: 90_000 }); + +const err = await page.evaluate(() => window.__error); +if (err) { + console.error('render error:', err); + await browser.close(); + process.exit(1); +} + +const dataUrl = await page.evaluate(() => window.__pngDataUrl); +if (!dataUrl || !dataUrl.startsWith('data:image/png;base64,')) { + console.error('no png produced; got:', String(dataUrl).slice(0, 64)); + await browser.close(); + process.exit(1); +} + +const b64 = dataUrl.split(',', 2)[1]; +writeFileSync(outPath, Buffer.from(b64, 'base64')); +const { size } = statSync(outPath); +console.error(`wrote ${outPath} (${size.toLocaleString()} bytes)`); + +await browser.close(); diff --git a/docs/explanation/index.md b/docs/explanation/index.md index 00a9f56..7506fd3 100644 --- a/docs/explanation/index.md +++ b/docs/explanation/index.md @@ -25,6 +25,13 @@ Create world-class AI agents and workflows with the BMad Builder. 
| **[Skill Authoring Best Practices](/explanation/skill-authoring-best-practices.md)** | Core principles, common patterns, quality dimensions, and anti-patterns | | **[Scripts in Skills](/explanation/scripts-in-skills.md)** | Why deterministic scripts make skills faster, cheaper, and more reliable | +## Evaluating Skills + +| Topic | Description | +| ---------------------------------------------------------------------- | --------------------------------------------------------------------------------- | +| **[What Are Evals](/explanation/what-are-evals.md)** | Artifact and trigger evals; the two single-shot patterns that cover most skills | +| **[Why BMad Eval Runner](/explanation/why-bmad-eval-runner.md)** | What the reference runner misses: isolation, dependency staging, real triggers | + ## Reference | Resource | Description | diff --git a/docs/explanation/what-are-evals.md b/docs/explanation/what-are-evals.md new file mode 100644 index 0000000..587ec6d --- /dev/null +++ b/docs/explanation/what-are-evals.md @@ -0,0 +1,110 @@ +--- +title: 'What Are Evals' +description: How evaluations measure skill behavior, and the two eval shapes BMad supports +--- + +Evals are measurements of how a skill behaves. You write a prompt, declare what should happen, run the skill, and grade the result against your expectations. They are how you know your skill works, that regression tests caught problems, and that a description fires on the right queries. + +## Why Evals Matter + +A skill is a contract. Its description claims when Claude should invoke it. Its body claims a behavior. Evals turn both claims into a test suite you can run after editing a skill, after upgrading the model, or before publishing a module. Every run lands on disk. You can read the transcript, inspect the artifacts, and trust the verdict because the grading is reproducible. + +## How the Test Types Flow + +![Eval Runner test types: shared isolated workspace flowing into parallel artifact and trigger eval pipelines, both aggregated into a final HTML report](/img/eval-test-types.png) + +Both test types share the same isolated workspace setup, then split into parallel pipelines, and converge into a single aggregated report. + +## Two Types of Evals + +BMad evaluates two questions independently. + +### Artifact Evals: Did the Skill Do the Right Thing? + +The runner sends a prompt to the skill in an isolated workspace and captures everything that happened: the full stream-JSON transcript (every tool call the skill made, every assistant message) and every file the skill wrote. A grader subagent then scores your expectations against the captured run. + +Artifact evals live in `evals.json`. Each entry has a `prompt`, optional fixture `files` to stage in the workspace, and a list of `expectations` graded independently. + +Multi-step workflow skills run end-to-end inside the single `claude -p` invocation when the prompt puts them in headless mode. The skill executes its full internal flow (subagent calls, polish passes, finalize sequence) and all of it shows up in the captured run. + +### Trigger Evals: Does the Description Fire on the Right Queries? + +A skill is invisible if Claude never invokes it. The trigger runner places a synthetic copy of your skill in a clean workspace, sends candidate user queries, and watches whether the Skill tool actually fires on each one. + +Trigger evals live in `triggers.json`. Each entry has a `query` and a boolean `should_trigger`. 
The runner repeats each query (3 times by default) and reports a fire rate per query. + +## What Every Artifact Eval Captures + +Every artifact eval, regardless of style, lands the same shape on disk: + +| File | Contents | +| --------------------- | -------------------------------------------------------------- | +| `prompt.txt` | The eval's prompt verbatim | +| `transcript.jsonl` | Stream-JSON: every tool call, every assistant message | +| `artifacts/` | Every file the skill wrote inside its workspace | +| `metrics.json` | Tool-call counts, timing, output sizes | +| `grading.json` | Per-expectation PASS/FAIL with cited evidence | + +You always have both the transcript and any files the skill wrote. Your expectations choose which to grade against. Trigger evals are simpler: the stream is parsed in real time for Skill tool fires and the per-query fire rate is saved to `triggers-result.json`. + +## Two Ways to Grade an Artifact Eval + +You can mix both styles in the same eval. Most well-tested skills do. + +### Output Grading + +Look at the files the skill produced. Assertions check structure, content, and fidelity to inputs. + +Use output grading when: + +- The deliverable is a file (brief, PRD, plan, doc) and you care that it's correct +- You can write a complete pre-discovery prompt so the skill has nothing to ask about +- You want regression coverage on drafting, format, and extraction + +Examples: + +- "brief.md word count is between 250 and 1500" +- "decision-log.md captures the pricing decision with rejected alternative and rationale" +- "frontmatter contains title, status, created (ISO 8601), updated (ISO 8601)" + +### Transcript Grading + +Look at the captured stream-JSON to see what the skill *did* internally. The grader scans for specific tool calls, checks the order things happened, walks side-artifacts (decision logs, addenda, distillates), and checks file timestamps. + +Use transcript grading when: + +- The skill enforces a protocol (decision log, polish phase, finalize sequence) +- The skill has read-only intents (Validate must not write any files) +- You need to catch regressions where drafting still works but the discipline went soft + +Examples: + +- "transcript contains a Skill tool call invoking bmad-editorial-review-prose" +- "polish-phase Skill calls occur after the brief body Write and before the final JSON status block" +- "input fixture brief.md is byte-identical after the run; no Write or Edit tool calls targeted it" + +## A Worked Example + +The `bmad-product-brief` skill in the BMad Method repository ships a complete eval suite that uses every feature in this guide. It mixes output grading and transcript grading in the same suite, stages dependency skills via setup overlays, includes a read-only Validate intent that asserts no fixtures were touched, and runs trigger evals across positive and negative queries. + +| What you can learn from it | Where to look | +| ----------------------------------- | ------------------------------------------------------ | +| Output grading assertions | `evals/bmm-skills/bmad-product-brief/evals.json` A1-A8 | +| Transcript grading assertions | `evals/bmm-skills/bmad-product-brief/evals.json` B1-B8 | +| Trigger queries (positive/negative) | `evals/bmm-skills/bmad-product-brief/triggers.json` | +| Setup overlays for dependencies | `evals/bmm-skills/bmad-product-brief/setup/` | + +See [Eval Format](/reference/eval-format.md) for the complete schema and a deeper tour of the suite. 
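+To make the mix concrete, here is a sketch of what a single eval entry combining both styles might look like. The ID, prompt, and fixture path are hypothetical; the real schema and a full suite are covered in [Eval Format](/reference/eval-format.md).
+
+```json
+{
+  "id": "A9",
+  "prompt": "Run headless. Create a product brief for a hypothetical note-taking app, using the staged brainstorm as the only source.",
+  "files": ["evals/bmad-product-brief/files/brainstorm.md"],
+  "expectations": [
+    "brief.md exists in the run folder and is between 250 and 1500 words",
+    "brief.md incorporates at least 2 specific findings from brainstorm.md",
+    "transcript contains a Skill tool call invoking bmad-editorial-review-prose",
+    "brainstorm.md is byte-identical after the run; no Write or Edit tool calls targeted it"
+  ]
+}
+```
+
+The first two expectations grade the output; the last two grade the transcript and the fixture's integrity.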
+ +## Best Practices for Evals That Hold Up + +- **Discriminating expectations.** Each assertion should fail for a wrong output, not just an absent file. "brief.md exists" passes for an empty file. Pair existence checks with content checks. +- **Specific facts over vibes.** "incorporates at least 2 specific findings from section X" beats "is high quality." Concrete claims are gradable. +- **Negative assertions.** "does not introduce content from unrelated sections" catches drift that positive assertions miss. +- **Bidirectional fidelity.** For skills with side-artifacts, assert in both directions: every entry in the log appears in the brief, and no claim in the brief is absent from the prompt or log. +- **Read-only enforcement.** For Validate-style intents, assert that input fixtures are byte-identical after the run and that no Write or Edit tool calls targeted them. +- **Trust the grader's pushback.** Graders are instructed to flag weak assertions. When they do, fix the assertion rather than ignoring the note. + +## Next Steps + +For practical run instructions, see [Run Evals Against a Skill](/how-to/run-evals-against-a-skill.md). For the complete file format, see [Eval Format](/reference/eval-format.md). For why isolation matters, see [Why BMad Eval Runner](/explanation/why-bmad-eval-runner.md). diff --git a/docs/explanation/why-bmad-eval-runner.md b/docs/explanation/why-bmad-eval-runner.md new file mode 100644 index 0000000..06c7051 --- /dev/null +++ b/docs/explanation/why-bmad-eval-runner.md @@ -0,0 +1,38 @@ +--- +title: 'Why BMad Eval Runner' +description: Isolation, dependency staging, real trigger detection, and a permanent audit trail for every run +--- + +The eval runner is built around a simple goal: produce results that reflect the skill itself, not the host that ran it. That goal drives four design choices. + +## Isolation + +Every eval starts in a clean room. With Docker, the run executes inside a fresh container off `bmad-eval-runner:latest`. Without Docker, the runner falls back to a per-eval temp directory with `HOME` overridden so global memory and global `CLAUDE.md` cannot influence the result. Either way, two developers running the same eval get the same workspace state. + +Why this matters: skills are sensitive to context. Your global `~/.claude/CLAUDE.md`, your auto-memory, an ancestor `CLAUDE.md` in the project tree, cached MCP settings. All of these reach a default `claude -p` invocation. The eval should measure the skill, not the bench it was tested on. + +## Dependency Staging + +Real BMad skills compose. A product brief skill calls a distillator skill, which calls editorial review skills. The runner stages dependencies through a setup overlay system: directories at `evals/setup/` (base, applied to every eval) and `evals/<eval-id>/setup/` (per-eval, applied on top) are rsynced into the workspace before the skill under test is staged. + +Drop the dependency skills into `evals/setup/.claude/skills/` and they are available inside the sandbox. Drop a per-eval `_bmad/config.user.yaml` into `evals/C1/setup/` and it overrides the base for that eval only. + +## Trigger Detection That Reflects Reality + +The runner places the synthetic skill at `<workspace>/.claude/skills/<unique-name>/SKILL.md`, where Claude actually loads skills. The detector watches the stream-JSON transcript for `Skill` tool calls (or `Read` of the synthetic SKILL.md) referencing the unique name. Each query runs three times by default for stability, and the fire rate per query is reported with a configurable threshold. 
+ +This matches how skills surface in production. The trigger rate you see is the rate users will see. + +## Permanent Artifacts + +Every run writes to a dated folder under `~/bmad-evals/`. Each eval gets its own subdirectory containing the prompt, the full stream-JSON transcript, every file the skill wrote, the grading verdict, and timing metrics. + +Nothing is rotated or cleaned up. You can read what happened, share the run folder with a collaborator, or diff it against a future run. Disk management is the user's call, not the runner's. + +## Independent Grading + +After artifact runs complete, the runner spawns a grader subagent per eval (in parallel) using the Agent tool. Each grader reads the eval's transcript and artifacts, scores each expectation independently with cited evidence, and writes `grading.json`. Graders are instructed to flag weak assertions (passing on technicality) so you can spot evals that look strict but would pass for a wrong output. + +## Next Steps + +For a step-by-step run, see [Run Evals Against a Skill](/how-to/run-evals-against-a-skill.md). For the complete eval file schema, see [Eval Format](/reference/eval-format.md). For why Docker matters, see [Install Docker for Evals](/how-to/install-docker-for-evals.md). diff --git a/docs/how-to/install-docker-for-evals.md b/docs/how-to/install-docker-for-evals.md new file mode 100644 index 0000000..95354e5 --- /dev/null +++ b/docs/how-to/install-docker-for-evals.md @@ -0,0 +1,87 @@ +--- +title: 'Install Docker for Evals' +description: Install Docker Desktop so the eval runner can give you reproducible, hermetic test runs +--- + +Use Docker Desktop to give the eval runner a real isolation boundary. Without Docker, the runner falls back to local mode, which is best-effort and has known leak paths. + +## When to Use This + +- You plan to run trigger evals (local mode can leak host skills into the workspace) +- You want runs to be reproducible across machines +- You publish a module and want the same eval verdicts other developers see +- You want a guaranteed-empty `HOME` so global memory cannot influence results + +## When to Skip This + +- One-off iteration on artifact evals where local fallback is good enough for now +- A constrained environment where installing Docker is not feasible. The runner falls back to local mode and tells you it is doing so. + +:::note[Prerequisites] + +- Administrator access on your machine to install Docker Desktop +- A few GB of disk space for the Docker Desktop application and the eval-runner image +::: + +## Why Docker + +The eval runner needs to start each run from a clean slate. It is trying to measure the skill, not the host's accumulated state. Without isolation, three things contaminate the result. + +1. **Global memory and CLAUDE.md.** Your `~/.claude/CLAUDE.md` and auto-memory load on every Claude Code invocation. They influence outputs in ways the skill author cannot control. +2. **Ancestor configuration.** A `CLAUDE.md` anywhere above the skill in the directory tree gets discovered and loaded. +3. **Host-installed skills.** When `claude -p` runs in a directory with `.claude/skills/` somewhere up the tree, those skills are discoverable and can fire instead of (or alongside) the skill under test. This is especially harmful for trigger evals. + +Docker solves all three. The container has its own filesystem, its own `HOME`, and its own `.claude/`. Local mode patches `HOME` and creates a temp directory but cannot prevent ancestor discovery. 
+ +## Step 1: Install Docker Desktop + +Download Docker Desktop for your platform: + +| Platform | Where to Get It | +| -------- | ----------------------------------------------------------------------------------------------------- | +| macOS | [docker.com/products/docker-desktop](https://www.docker.com/products/docker-desktop) | +| Windows | [docker.com/products/docker-desktop](https://www.docker.com/products/docker-desktop) | +| Linux | Docker Engine via your distribution's package manager, or Docker Desktop for Linux | + +Follow the installer's prompts. On macOS, drag the Docker app to Applications and launch it. On Windows, the installer enables WSL 2 if needed. + +## Step 2: Start Docker Desktop + +Launch Docker Desktop. Wait for the whale icon to indicate Docker is running. The eval runner shells out to the `docker` CLI; if Docker is not running, the runner falls back to local mode and tells you why. + +## Step 3: Verify Installation + +Confirm Docker is reachable from your terminal: + +```bash +docker info +``` + +A successful response means the eval runner can use Docker. An error means Docker is not running, or the CLI cannot reach the daemon. + +## Step 4: Let the Runner Build the Image + +The first time you invoke the eval runner with `--isolation docker` (or `auto` when Docker is available), the runner builds `bmad-eval-runner:latest` from a Dockerfile shipped with the skill. This takes a few minutes once. Subsequent runs reuse the cached image. + +The image is a minimal Node 20 base with Claude Code, Python 3, and standard tools. Nothing skill-specific or user-specific lives in the image. Your credentials are mounted in at run time, not baked in. + +:::tip[Credential Safety] +The Dockerfile contains no tokens, API keys, or credentials. Your authentication (macOS Keychain credential or `ANTHROPIC_API_KEY`) is staged into a per-run temp directory and mounted into the container as a read-only volume that disappears when the container exits. +::: + +## What You Get + +- Reproducible runs: the same eval produces the same workspace state on any machine with the image +- Real `HOME` isolation: the container's `/home/evaluator` is empty, not just overridden +- Trigger evals you can trust: only the synthetic skill staged for the test is discoverable, not your host's installed skills +- Network can be locked down per run if your evals do not need internet access + +## Tips + +- Rebuild the image with `python3 scripts/docker_setup.py --rebuild` if you ever need to reset it +- Per-eval container resource use is small (a few hundred MB). Parallel workers each spin up their own container. +- If `docker info` works in one terminal but not in your editor's integrated terminal, your shell PATH probably differs. Open a fresh terminal session. + +## Next Steps + +Run the eval runner against a skill: see [Run Evals Against a Skill](/how-to/run-evals-against-a-skill.md). For isolation internals, see the eval-runner skill's `references/isolation.md`. diff --git a/docs/how-to/run-evals-against-a-skill.md b/docs/how-to/run-evals-against-a-skill.md new file mode 100644 index 0000000..1d4371e --- /dev/null +++ b/docs/how-to/run-evals-against-a-skill.md @@ -0,0 +1,122 @@ +--- +title: 'Run Evals Against a Skill' +description: Use the bmad-eval-runner skill to evaluate a skill, capture artifacts, and grade the results +--- + +Use the `bmad-eval-runner` skill to run a skill's evals in a clean workspace and produce a graded report. 
+ +## When to Use This + +- After editing a skill, to confirm nothing regressed +- Before publishing a module, to validate every skill you ship +- When debugging a description that fires on the wrong queries +- When checking that dependency skills are wired correctly + +## When to Skip This + +- Quick iteration where you are running the skill manually and reading the output yourself +- Skills with no defined evals (the runner halts on missing evals; it does not invent them) + +:::note[Prerequisites] + +- The skill you want to evaluate, with `evals.json` and/or `triggers.json` defined +- Either Docker Desktop installed (preferred) or willingness to run in best-effort local isolation. See [Install Docker for Evals](/how-to/install-docker-for-evals.md). +- An Anthropic account authenticated through Claude Code (the runner reuses your existing credential) +::: + +:::tip[Quick Path] +Invoke the eval runner with the path to your skill: `bmad-eval-runner ./skills/my-skill`. The runner discovers your evals, picks isolation, runs everything in parallel, and tells you where the report lives. +::: + +## Step 1: Confirm Eval Discovery + +The runner looks for evals in this order, taking the first match: + +1. The path you pass via `--evals` +2. `<skill-dir>/evals/` +3. `<skill-dir>/../../evals/<skill-name>/` +4. `<project-root>/evals/<skill-name>/` +5. `<project-root>/evals/**/<skill-name>/` (fuzzy) + +If discovery fails, the runner halts. It does not invent evals. + +## Step 2: Choose Isolation + +Pass `--isolation docker|local|auto`. Default is `auto`, which picks Docker when available and local when not. + +| Mode | When to Use | +| ------ | ------------------------------------------------------------------------ | +| docker | Trigger evals (host skills can leak in local mode); reproducible runs | +| local | Quick iteration when you have not installed Docker | +| auto | Default; lets the runner pick the best available option | + +The first time Docker is selected, the runner builds the `bmad-eval-runner:latest` image. This takes a few minutes once. Subsequent runs reuse the cached image. + +## Step 3: Pick Mode + +Pass `--mode artifact|trigger|both`. Default is `both` if both eval files are found. + +| Mode | Effect | +| -------- | ------------------------------- | +| artifact | Runs `evals.json` only | +| trigger | Runs `triggers.json` only | +| both | Runs everything in parallel | + +## Step 4: Run the Skill + +Invoke the eval runner from your project. A typical invocation: + +```bash +bmad-eval-runner ./src/skills/my-skill --isolation docker --workers 8 +``` + +The runner stages each eval's workspace, executes `claude -p` against the prompt, captures the stream-JSON transcript, and rsyncs any files the skill wrote. After all evals complete, it spawns a grader subagent per eval (in parallel) and aggregates the verdicts. + +## Step 5: Inspect Results + +When the run finishes, the runner emits two paths: + +- The run folder, at `~/bmad-evals/<run-folder>/` (or your configured `bmad_builder_reports` location) +- An HTML report at `<run-folder>/report.html` + +Open the report for the summary view. Drop into the run folder for full transcripts, artifacts, and grading details for any eval you want to examine. 
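+A minimal way to poke at a finished run from the shell, using the example run-folder name shown below (substitute the path the runner printed):
+
+```bash
+# Example run folder; yours will carry its own timestamp and skill name
+RUN=~/bmad-evals/20260509-172903-my-skill
+
+# Per-expectation verdicts for eval A1
+cat "$RUN/A1/grading.json"
+
+# Every file the skill wrote during A1
+ls -R "$RUN/A1/artifacts"
+
+# Aggregate report (macOS `open`; use xdg-open on Linux)
+open "$RUN/report.html"
+```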
+ +## What You Get + +``` +~/bmad-evals/20260509-172903-my-skill/ +├── run.json # Run metadata +├── report.html # Aggregate HTML report +├── A1/ +│ ├── prompt.txt # The eval's prompt verbatim +│ ├── transcript.jsonl # Stream-JSON tool calls and messages +│ ├── artifacts/ # Files the skill wrote +│ ├── grading.json # Per-expectation verdicts +│ └── metrics.json # Timing and tool-call counts +├── A2/ +│ └── ... +└── triggers-result.json # Trigger eval rates +``` + +Run folders are never deleted automatically. Disk management is your call. + +## Tips + +- Pass `--eval-ids A1,B3` to run only specific evals while iterating +- Pass `--workers 8` to parallelize aggressively (default is 4) +- A specific eval can override the default timeout by setting `"timeout": 900` in its `evals.json` entry +- For trigger evals, prefer Docker. Local mode can let host-installed skills bleed in via cwd-based discovery and bias the fire rate. + +## A Worked Example + +The `bmad-product-brief` skill in the BMad Method repository (`bmad-code-org/BMAD-METHOD`) ships a complete eval suite at `evals/bmm-skills/bmad-product-brief/`. To run it end-to-end: + +```bash +bmad-eval-runner ./src/bmm-skills/1-analysis/bmad-product-brief --isolation docker --workers 8 +``` + +The run produces 17 graded artifact evals (A1-A8 output grading, B1-B8 transcript grading, C1 configuration compliance), 15 trigger eval verdicts, and an aggregated HTML report. Use it as the model when writing evals for your own skills. + +## Next Steps + +For the complete `evals.json` and `triggers.json` schema, see [Eval Format](/reference/eval-format.md). For concepts and patterns, see [What Are Evals](/explanation/what-are-evals.md). diff --git a/docs/reference/eval-format.md b/docs/reference/eval-format.md new file mode 100644 index 0000000..c1d05eb --- /dev/null +++ b/docs/reference/eval-format.md @@ -0,0 +1,208 @@ +--- +title: 'Eval Format' +description: Complete schema for evals.json and triggers.json including fixtures, setup overlays, per-eval timeouts, and grading expectations +--- + +The runner accepts two file shapes. Both are JSON. Both live in an evals directory the runner can discover. + +## File Layout + +A typical evals directory: + +``` +evals/<skill-name>/ +├── evals.json # Artifact evals +├── triggers.json # Trigger evals +├── setup/ # Base overlay applied to every eval +│ └── .claude/ +│ └── skills/ +│ └── bmad-distillator/ +├── A1/ +│ └── setup/ # Per-eval overlay applied on top of base +├── files/ # Fixture files staged via the eval's `files` field +│ └── some-fixture.md +``` + +## Artifact Evals (evals.json) + +```json +{ + "skill_name": "bmad-product-brief", + "evals": [ + { + "id": "A1", + "prompt": "Run headless. 
Create a product brief for ...", + "expected_output": "A run folder with brief.md and decision-log.md ...", + "files": ["evals/.../files/q2-brainstorm.md"], + "expectations": [ + "brief.md exists in the run folder", + "decision-log.md captures the pricing decision with rationale", + "Final assistant message contains JSON with intent='create'" + ], + "timeout": 900 + } + ] +} +``` + +### Field Reference + +| Field | Type | Required | Description | +| ----------------- | -------- | -------- | -------------------------------------------------------------------------------------------- | +| `id` | string | yes | Stable identifier; used as the eval's directory name in the run folder | +| `prompt` | string | yes | Literal user message sent to `claude -p` | +| `expected_output` | string | no | Human-readable description; the grader reads it for context but does not score against it | +| `files` | string[] | no | Fixture paths to stage into the workspace before execution | +| `expectations` | string[] | yes | Pass/fail assertions evaluated by the grader subagent | +| `timeout` | int | no | Per-eval timeout in seconds; overrides the runner's default | + +### Top-Level Optional Fields + +| Field | Type | Purpose | +| --------------- | ------ | -------------------------------------------------------------------------------- | +| `skill_name` | string | Documentation only; the runner derives the skill name from the SKILL.md path | +| `_design_notes` | string | Free-form notes for human readers; ignored by the runner | + +### Fixture Files + +Each entry in `files` is staged into the eval's workspace before execution. + +- A nested path (`evals/skill-x/files/some-brief/brief.md`) preserves directory structure inside the workspace +- The eval prompt then references the staged path explicitly so the skill can find it +- Fixtures are read-only inputs. The skill may read them; assertions may check that the skill did not modify them. + +### Per-Eval Timeout + +When omitted, the runner's default applies (currently 600 seconds, overridable via `--timeout`). Set per-eval timeouts when an eval invokes a slow dependency. Example for an eval that triggers a distillator subagent: + +```json +{ + "id": "B8", + "timeout": 900, + "prompt": "Run headless. Create a product brief ... and generate a distillate." +} +``` + +### Expectations + +Expectations are assertions the grader scores independently. Each is graded PASS or FAIL with cited evidence from the transcript or artifacts. The grader is instructed to flag weak assertions, so a passing eval that only proves the skill did not crash is surfaced as a problem, not a victory. + +See [What Are Evals](/explanation/what-are-evals.md) for the two patterns (artifact correctness and process discipline) and the kinds of assertions that work well for each. + +## Trigger Evals (triggers.json) + +```json +[ + { "query": "Help me write a product brief for ...", "should_trigger": true }, + { "query": "Help me brainstorm ideas for ...", "should_trigger": false } +] +``` + +### Field Reference + +| Field | Type | Required | Description | +| ---------------- | ------- | -------- | ---------------------------------------------------------- | +| `query` | string | yes | The user message sent to a clean Claude session | +| `should_trigger` | boolean | yes | Whether the skill's description should fire on this query | + +### How Trigger Detection Works + +The runner places a synthetic copy of the skill at `<workspace>/.claude/skills/<unique-name>/SKILL.md`. 
It then runs `claude -p` against each query and watches the stream-JSON output for a `Skill` tool call (or a `Read` of the synthetic SKILL.md) referencing the unique name. Each query runs three times by default for stability. The fire rate is the fraction of runs that fired. + +A query passes when: + +- `should_trigger=true` and `trigger_rate >= --trigger-threshold` (default 0.5) +- `should_trigger=false` and `trigger_rate < --trigger-threshold` + +:::caution[Trigger Evals Need Docker] +Local-mode trigger evals can be biased by host-installed skills that are discoverable via cwd-based skill discovery. The detector may see a real skill fire instead of the synthetic. Use Docker isolation for trigger evals whenever it is available. +::: + +## Setup Overlays + +The runner supports a two-tier overlay system that rsyncs files into the workspace before the skill under test is staged. Use overlays to ship dependency skills, project configuration (`_bmad/`), or fixture state with the evals. + +### Base Overlay + +Files at `evals/<skill-name>/setup/` are applied to every eval. Use this for dependency skills the skill under test calls. + +``` +evals/bmad-product-brief/setup/ +└── .claude/ + └── skills/ + ├── bmad-distillator/ + ├── bmad-editorial-review-prose/ + └── bmad-editorial-review-structure/ +``` + +### Per-Eval Overlay + +Files at `evals/<skill-name>/<eval-id>/setup/` are applied after the base overlay, only for that eval. Use this for eval-specific configuration: a custom `_bmad/config.user.yaml`, additional skills, or a different output path. + +``` +evals/bmad-product-brief/C1/setup/ +└── _bmad/ + └── bmm/ + └── config.user.yaml +``` + +The base overlay and per-eval overlay are applied with `rsync -a`, so files at the same path in the per-eval overlay win. + +### Order of Operations + +Inside the workspace, before the eval runs: + +1. Project root is rsynced from the host +2. Base setup overlay is rsynced on top +3. Per-eval setup overlay is rsynced on top +4. The skill under test is symlinked or copied into `.claude/skills/` +5. Fixtures from the `files` array are staged at their declared paths + +The skill under test always wins. Overlays cannot replace the skill being evaluated. + +## Eval Discovery + +The runner discovers evals in this order, taking the first match: + +| Order | Path | +| ----- | ------------------------------------------------------------------- | +| 1 | `--evals <path>` argument (folder or specific JSON file) | +| 2 | `<skill-dir>/evals/` | +| 3 | `<skill-dir>/../../evals/<skill-name>/` | +| 4 | `<project-root>/evals/<skill-name>/` | +| 5 | `<project-root>/evals/**/<skill-name>/` (fuzzy) | + +If both `evals.json` and `triggers.json` are found in the discovered directory, both run unless `--mode` narrows it. + +## Headless Prompts + +Multi-step workflow skills need a signal to suppress clarifying questions and emit a structured JSON status block at the end. Once that signal is present, the skill executes its full internal flow (subagent calls, polish passes, finalize sequence) inside a single `claude -p` invocation, and the runner captures everything. + +Common conventions for the signal: + +- The literal phrase `Run headless.` at the start of the prompt +- Skill-specific keywords documented in the skill's `## Headless Mode` section +- A complete prompt with no genuine ambiguity for the skill to ask about + +When designing evals for a multi-step skill, pack the prompt with everything the skill's discovery phase would have surfaced. The skill then has no reason to ask, runs end-to-end, and the artifacts are ready for grading. 
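+As an illustration of the convention, a hypothetical artifact eval for a multi-step brief-writing skill might carry a prompt like the one below. The ID, wording, and fixture path are invented; the only requirement is that the prompt leaves the skill nothing to ask.
+
+```json
+{
+  "id": "B9",
+  "prompt": "Run headless. Create a product brief for a B2B invoicing tool aimed at freelance accountants. Use the staged brainstorm at files/q3-brainstorm.md as the only source, write the brief to the default output path, and finish with the JSON status block.",
+  "files": ["evals/bmad-product-brief/files/q3-brainstorm.md"],
+  "expectations": [
+    "brief.md exists and names freelance accountants as the target audience",
+    "Final assistant message contains JSON with intent='create'"
+  ]
+}
+```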
+ +## Example Suite + +The `bmad-product-brief` skill in the BMad Method repository ships a complete eval suite that exercises every option in this reference. Read it as a worked example. + +| Group | What It Covers | +| ----- | ---------------------------------------------------------------------------------------------------- | +| A1-A8 | Output grading: brief content, frontmatter, source-filtering, right-sizing across 8 product scenarios | +| B1-B8 | Transcript grading: decision-log fidelity, polish phase ordering, read-only Validate, distillate flow | +| C1 | Configuration compliance: custom output paths, document language, communication style | +| Triggers | 15 queries (positive + negative) covering create, update, validate intents and adjacent skills | + +What it shows in practice: + +- Setup overlays for dependency skills (`bmad-distillator`, editorial review skills) under `evals/setup/.claude/skills/` +- Per-eval setup overlay at `evals/bmad-product-brief/C1/setup/_bmad/` for custom configuration +- Per-eval `timeout` override on B8 (the distillator-invoking eval) +- Read-only fixture staging for Validate-mode evals (A4, B5, B6, B7) using the `files` field +- Mixed expectations in the same eval: some assertions check files, others scan the transcript for tool-call ordering + +See `evals/bmm-skills/bmad-product-brief/` in the BMad Method repository (`bmad-code-org/BMAD-METHOD`). diff --git a/docs/reference/index.md b/docs/reference/index.md index 0cd0ed5..078c7d9 100644 --- a/docs/reference/index.md +++ b/docs/reference/index.md @@ -7,7 +7,8 @@ description: Technical reference for BMad Builder Technical documentation for BMad Builder configuration and schemas. -| Reference | Description | -| ---------------------------------------------------------------- | --------------------------------------------------------------------- | -| **[Builder Skills](/reference/builder-commands.md)** | Agent Builder and Workflow Builder skills, commands, and capabilities | -| **[Workflow & Skill Patterns](/reference/workflow-patterns.md)** | Structure types, design patterns, and execution models | +| Reference | Description | +| ---------------------------------------------------------------- | ------------------------------------------------------------------------------ | +| **[Builder Skills](/reference/builder-commands.md)** | Agent Builder and Workflow Builder skills, commands, and capabilities | +| **[Workflow & Skill Patterns](/reference/workflow-patterns.md)** | Structure types, design patterns, and execution models | +| **[Eval Format](/reference/eval-format.md)** | Complete schema for evals.json and triggers.json, including overlays and timeouts | diff --git a/skills/bmad-eval-runner/SKILL.md b/skills/bmad-eval-runner/SKILL.md new file mode 100644 index 0000000..911dbc9 --- /dev/null +++ b/skills/bmad-eval-runner/SKILL.md @@ -0,0 +1,91 @@ +--- +name: bmad-eval-runner +description: Run a skill's evals in a clean, isolated environment and report results. Use when the user wants to evaluate a skill, run evals, benchmark a skill, validate triggers, or grade skill outputs. +--- + +# Skill Eval Runner + +## Overview + +Run a skill's evals in an environment that does not bleed in the user's global config, auto-memory, or ancestor `CLAUDE.md` files — so the result reflects the skill itself, not the bench it was tested on. Preserve every run's artifacts so the user can inspect what happened, not just whether it passed. 
+ +Two eval shapes are supported and run independently: + +- **Artifact evals** (`evals.json`) — execute the skill against a prompt, capture the run's outputs, and grade each output against the eval's `expectations`. +- **Trigger evals** (`triggers.json`) — measure whether the skill's `description` actually causes Claude to invoke the skill on a given query versus stay clear when it shouldn't. + +You are an experienced eval engineer. The user wants signal, not theatre. Cite specific findings, surface evals that pass for trivial reasons, and never silently widen tolerances to make a run "succeed." + +## Args + +- Positional: a path to the skill being evaluated (directory containing `SKILL.md`). +- `--evals ` — explicit path to evals folder or a specific `evals.json` / `triggers.json` file. If omitted, discover. +- `--mode artifact|trigger|both` — which eval kind to run. Default: `both` if both files are found, else whichever exists. +- `--isolation docker|local|auto` — sandbox strategy. Default: `auto` (Docker when available, otherwise local). +- `--project-root ` — root of the project the skill belongs to. Default: walk up from skill path looking for `_bmad/` or `.git/`. +- `--output-dir ` — where run folders are written. Default: `{bmad_builder_reports}/eval-runs/` if configured, else `~/bmad-evals/`. +- `--workers ` — parallel evals. Default: 4. +- `--headless` / `-H` — non-interactive; emit final JSON only. + +## On Activation + +1. Resolve config the same way `bmad-workflow-builder` does (`{project-root}/_bmad/config.yaml` then `config.user.yaml`, falling back to `bmb/config.yaml`). Resolve `{user_name}`, `{communication_language}`, `{bmad_builder_reports}`. Apply throughout the session. + +2. If `--headless` was passed, set `{headless_mode}=true` and skip every confirmation below; pick the safest defaults and proceed. + +3. Locate the skill. Verify `/SKILL.md` exists; halt with a clear error if it doesn't. + +4. Discover evals — see `## Eval Discovery` below. + +5. Choose isolation — see `## Isolation` below. On the first Docker run on this machine, the image will need to be built; surface that, ask once unless headless, then cache. + +6. Confirm the run summary with the user (skill, evals found, mode, isolation, output dir) unless headless. Then execute. + +## Eval Discovery + +Look in this order, taking the first match: + +1. `--evals` argument if provided. May point to a folder (containing `evals.json` and/or `triggers.json`) or a specific JSON file. +2. `/evals/` — colocated with the skill. +3. `/../../evals//` — sibling-of-parent layout (common in BMad modules where `evals/` is excluded from distribution but lives next to `src/`). +4. `/evals//` — top-level evals tree. +5. `/evals/**//` — anywhere under project evals. + +Surface what you found and where. If no evals are discovered, halt with a clear message — do not attempt to fabricate evals. + +## Isolation + +Run each eval in a fresh workspace so memory, project CLAUDE.md, prior runs, and host shell config cannot bias the result. Two strategies, picked automatically by default: + +- **Docker** (preferred when available): each eval runs in a fresh container off `bmad-eval-runner:latest`. The host's `ANTHROPIC_API_KEY` is the only env passed in. The skill's project is bind-mounted read-only and copied into a writable scratch dir inside the container; `HOME` is a fresh in-container directory; there is no auto-memory and no host CLAUDE.md. 
+ +- **Local fallback** (when Docker is unavailable or the user opts out): each eval runs in a fresh `~/bmad-evals///workspace/` directory with `HOME=/.home` overridden so global memory and global CLAUDE.md do not leak. The project is copied (or hardlinked where supported) into the workspace. Tell the user this is the active mode and acknowledge that local isolation is best-effort, not hermetic. + +The first time Docker is selected on this machine, build the image — `python3 {skill-root}/scripts/docker_setup.py --build` — and tell the user this is happening once. + +Details and the exact mount layout live in `references/isolation.md`. Read that file when you need to debug an isolation issue or explain to the user what is being isolated. + +## Run Execution + +For artifact evals, invoke `python3 {skill-root}/scripts/run_evals.py` with the resolved arguments. The script handles isolation per eval, runs `claude -p` in the sandbox with the eval's prompt and any staged fixture files, and writes a per-eval folder with `prompt.txt`, `transcript.jsonl`, `artifacts/`, and `metrics.json`. + +For trigger evals, invoke `python3 {skill-root}/scripts/run_triggers.py`. The script measures whether the skill's description causes the skill to fire for each query, with `runs-per-query` repeats for stability, and writes `triggers-result.json`. Trigger evals should run under Docker isolation when available — local mode can have the host's installed skills bleed in via cwd-based skill discovery, biasing the trigger signal. If Docker is unavailable, run trigger evals locally but say so explicitly. + +After artifact runs complete, grade each eval. Spawn a grader subagent per eval in parallel (Agent tool, prompt loaded from `{skill-root}/agents/grader.md` plus the eval's `expectations` and the path to its outputs). Each grader writes `grading.json` next to the artifacts. The grader has license to flag weak assertions — relay that feedback to the user. + +After all grading is done, generate the aggregate report — `python3 {skill-root}/scripts/generate_report.py --run-dir ` — which produces `report.html`. Tell the user where the run folder is and where the HTML report is. + +## Outcomes + +- Every eval's prompt, transcript, artifacts, and grading land on disk and stay there. Nothing is silently cleaned up. +- The run honestly reflects the skill's behavior in a clean room — not the behavior of the host shell with its memories and configs. +- The user knows whether Docker or local was used and why. +- Failures cite specific expectations and evidence; passes that look superficial are flagged, not papered over. + +## Constraints + +- **Artifacts are forever.** Never delete, overwrite, or rotate run folders. Disk usage is the user's call. +- **Auth boundary is narrow.** On macOS, the host's Claude Code OAuth credential is staged into each isolated `.claude/.credentials.json` so the subprocess can authenticate without inheriting host config. `ANTHROPIC_API_KEY`, if set, is also forwarded. Nothing else crosses. +- **Trigger evals do not need real artifacts.** They use a stub command file and only measure description firing — keep them cheap and parallel. +- **No silent fallbacks on grading.** If a grader subagent errors, mark that eval `grading_error` rather than substituting a default verdict. +- **Stop when evals are missing.** If discovery returns nothing, halt with diagnostics — the runner does not invent test cases. 
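For orientation, an end-to-end artifact run typically reduces to a sequence like the one below. This is a sketch only — the paths are illustrative, and real values come from the config and argument resolution described above.

```bash
# Execute the evals in isolation
python3 {skill-root}/scripts/run_evals.py \
  --skill-path /path/to/skills/bmad-product-brief \
  --evals-file /path/to/evals/bmad-product-brief/evals.json \
  --project-root /path/to/project \
  --output-dir ~/bmad-evals \
  --isolation local --workers 4

# Grader subagents write grading.json per eval, then aggregate:
python3 {skill-root}/scripts/generate_report.py --run-dir ~/bmad-evals/<run-id>
```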
diff --git a/skills/bmad-eval-runner/agents/grader.md b/skills/bmad-eval-runner/agents/grader.md new file mode 100644 index 0000000..af1d0fb --- /dev/null +++ b/skills/bmad-eval-runner/agents/grader.md @@ -0,0 +1,93 @@ +# Grader Agent + +Evaluate a single eval's expectations against its captured transcript and artifacts. Return pass/fail per expectation with evidence — and flag weak assertions when you see them. + +You are not the executor. You are not allowed to "fix" the artifacts. Your only job is to inspect what was produced and answer: did each expectation hold? + +## Inputs + +You receive in your prompt: + +- **eval_id**: identifier for this eval +- **prompt**: the original user message that was sent to the skill +- **expected_output**: human-readable description of what success looks like (context only, not scored against) +- **expectations**: list of strings — the assertions you grade +- **transcript_path**: absolute path to a stream-JSON transcript (`.jsonl`) +- **artifacts_dir**: absolute path to the directory containing files the skill wrote +- **grading_path**: absolute path where you write `grading.json` + +## Process + +1. **Read the transcript.** Open `transcript_path`. The transcript is stream-JSON: each line is a JSON event. Note: + - The user prompt that was sent + - Every tool call Claude made — `Write`, `Edit`, `Read`, `Skill`, `Bash`, etc. (the event has `type: "assistant"` and `content[].type: "tool_use"` with `name` and `input`) + - The order tool calls happened in (events are line-ordered) + - The final assistant message — often contains a JSON status block for headless runs + - Any errors or warnings logged + +2. **List and inspect artifacts.** Walk `artifacts_dir`. For each expectation, open the files it implicates and read their contents — do not rely on filenames alone. Note file modification times when ordering or read-only behavior matters. + +3. **Grade each expectation independently.** For each entry in `expectations`, identify what kind of check it is and gather the right evidence: + + - **Side-artifact existence + content** ("decision-log.md exists AND captures decision X") → open the file, read it, check the content matches. + - **Transcript tool-call patterns** ("transcript contains a Skill call to bmad-editorial-review-prose") → scan the transcript for `tool_use` events with the matching `name` and `input`. Quote the matching event. + - **Phase ordering** ("polish call occurs after the Write to brief.md and before the final JSON block") → find the line numbers / event indices of each landmark and verify the order. + - **Read-only enforcement** ("input brief.md is byte-identical to the fixture; no Write/Edit calls targeted it") → compare file content if the original is available; AND scan the transcript for any Write/Edit `tool_use` whose `input.file_path` falls in the protected directory. + - **YAML frontmatter** ("frontmatter contains title, status, created (ISO 8601), updated") → parse the frontmatter, check fields and their formats. + - **JSON output blocks** ("final assistant message contains a JSON object with intent='create'") → look at the final `text` content of the last assistant message; extract the JSON object; check the field. + - **Bidirectional fidelity** ("every decision in decision-log.md is reflected in brief.md AND no claim in brief.md is absent from the input prompt or log") → list decisions in the log, verify each appears in the brief; list substantive claims in the brief, verify each traces to either the prompt or the log. + +4. 
**Decide PASS or FAIL with specific evidence.** + - PASS only if there is clear, specific evidence the expectation holds AND the evidence reflects substance, not surface compliance (file exists AND contains correct content, not just the right filename). + - FAIL when no evidence is found, evidence contradicts, or the assertion is technically satisfied but the underlying outcome is wrong. + - Cite the evidence — quote a specific line, name a specific file with a path, point to a specific tool call with its index or input. + +5. **Critique the evals.** After grading, surface assertions that look weak: ones that passed but would also pass for a clearly wrong output, or important outcomes you observed (good or bad) that no assertion checks. Keep the bar high — flag what an eval author would say "good catch" about, not nits. + +6. **Write `grading.json`.** Save to `grading_path`. + +## Output Format + +```json +{ + "eval_id": "", + "expectations": [ + { + "text": "brief.md exists in the run folder", + "passed": true, + "evidence": "Found at artifacts/2026-05-09-insulens/brief.md, 487 words" + }, + { + "text": "decision-log.md references having ingested the memo as source material", + "passed": false, + "evidence": "decision-log.md exists but contains only template placeholders; no mention of the memo" + } + ], + "summary": { + "passed": 1, + "failed": 1, + "total": 2, + "pass_rate": 0.5 + }, + "eval_feedback": { + "suggestions": [ + { + "assertion": "brief.md exists in the run folder", + "reason": "Existence is a weak check — an empty brief.md would also pass. Consider pairing with a content assertion (e.g., word count > 200, contains the project name)." + } + ], + "overall": "Assertions check structure but not content correctness in two places." + } +} +``` + +If `eval_feedback.suggestions` would be empty, set it to `[]` and `overall` to `"No suggestions; assertions look solid."` + +## Guidelines + +- **Be objective.** Verdicts come from evidence, not vibes. +- **Be specific.** Quote, name files, point to line numbers. +- **No partial credit.** Each expectation is pass or fail. +- **Burden of proof is on the expectation.** When uncertain, fail. +- **Do not edit artifacts.** You are read-only against the run folder. +- **Do not silently substitute defaults.** If you genuinely cannot read a file or the transcript is missing, mark the affected expectations failed with that as the evidence. 
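When a read-only or tool-call expectation needs checking, a small script along these lines is usually enough. This is a sketch: the event shapes assume the transcript format described in step 1 of the process, and the function name is illustrative.

```python
import json
from pathlib import Path


def writes_under(transcript_path: str, protected_prefix: str) -> list[dict]:
    """Collect Write/Edit tool_use events whose file_path falls under protected_prefix."""
    hits: list[dict] = []
    for raw in Path(transcript_path).read_text(encoding="utf-8", errors="replace").splitlines():
        raw = raw.strip()
        if not raw:
            continue
        try:
            evt = json.loads(raw)
        except json.JSONDecodeError:
            continue  # ignore non-JSON noise in the stream
        if evt.get("type") != "assistant":
            continue
        # Content may sit at the top level or under "message", depending on the stream-json shape.
        content = evt.get("content") or evt.get("message", {}).get("content", [])
        for item in content:
            if item.get("type") != "tool_use" or item.get("name") not in ("Write", "Edit"):
                continue
            path = str(item.get("input", {}).get("file_path", ""))
            if path.startswith(protected_prefix):
                hits.append({"tool": item["name"], "file_path": path})
    return hits
```

An empty result is evidence for a read-only PASS; any hit is quotable evidence for a FAIL.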
diff --git a/skills/bmad-eval-runner/assets/Dockerfile b/skills/bmad-eval-runner/assets/Dockerfile new file mode 100644 index 0000000..9c791ae --- /dev/null +++ b/skills/bmad-eval-runner/assets/Dockerfile @@ -0,0 +1,29 @@ +FROM node:20-bookworm-slim + +ENV DEBIAN_FRONTEND=noninteractive + +RUN apt-get update \ + && apt-get install -y --no-install-recommends \ + git \ + python3 \ + python3-pip \ + ca-certificates \ + curl \ + jq \ + rsync \ + && rm -rf /var/lib/apt/lists/* + +RUN npm install -g @anthropic-ai/claude-code + +RUN useradd -ms /bin/bash evaluator \ + && mkdir -p /workspace /project /output /home/evaluator/.claude \ + && chown -R evaluator:evaluator /workspace /output /home/evaluator + +USER evaluator +WORKDIR /workspace + +ENV HOME=/home/evaluator +ENV CLAUDE_CONFIG_DIR=/home/evaluator/.claude +ENV PATH=/home/evaluator/.local/bin:$PATH + +CMD ["bash"] diff --git a/skills/bmad-eval-runner/references/eval-formats.md b/skills/bmad-eval-runner/references/eval-formats.md new file mode 100644 index 0000000..6856abc --- /dev/null +++ b/skills/bmad-eval-runner/references/eval-formats.md @@ -0,0 +1,147 @@ +# Eval Formats + +The runner accepts two file shapes, both compatible with Anthropic's skill-creator conventions. + +## Artifact evals — `evals.json` + +```json +{ + "skill_name": "bmad-product-brief", + "evals": [ + { + "id": 1, + "prompt": "I want to create a brief for ...", + "expected_output": "A run folder with brief.md and decision-log.md ...", + "files": [ + "evals/.../files/some-fixture.md" + ], + "expectations": [ + "brief.md exists in the run folder", + "decision-log.md exists", + "brief.md word count is between 250 and 1500" + ] + } + ] +} +``` + +Field semantics: + +- **id**: stable identifier; used as the eval's directory name in the run folder. +- **prompt**: the literal user message Claude will receive. Sent verbatim to `claude -p`. +- **expected_output**: human-readable description, used for context only — the grader reads it but does not score against it directly. +- **files**: optional fixture paths. Resolved relative to the project root (or the evals folder). Each file is staged into the eval's workspace before execution. Path semantics: + - A bare filename is staged at the workspace root. + - A nested path (`some-brief/brief.md`) preserves the directory structure inside the workspace. +- **expectations**: list of pass/fail assertions evaluated by the grader subagent. Each is graded independently. The grader is instructed to flag weak assertions — assertions a wrong output would also trivially pass. + +The grader writes `grading.json` next to each eval's artifacts; the runner aggregates. + +## Trigger evals — `triggers.json` + +```json +[ + { "query": "Help me write a product brief for ...", "should_trigger": true }, + { "query": "Help me brainstorm ideas for ...", "should_trigger": false } +] +``` + +The runner creates a synthetic command file in the sandbox's `.claude/commands/.md` containing the skill's description, then runs each query against `claude -p` with stream-JSON output and detects whether the skill (or a Read of its SKILL.md) appears as a tool call. Each query is run `--runs-per-query` times (default 3); `trigger_rate` is the fraction of runs that fired. + +A query passes when: +- `should_trigger=true` and `trigger_rate >= --trigger-threshold` (default 0.5) +- `should_trigger=false` and `trigger_rate < --trigger-threshold` + +Trigger evals do not produce artifacts beyond the result JSON. They are cheap and parallelize aggressively. 
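The pass decision reduces to a small predicate — a sketch, with the default mirroring `--trigger-threshold`:

```python
def query_passes(fires: int, runs: int, should_trigger: bool, threshold: float = 0.5) -> bool:
    """trigger_rate is the fraction of repeat runs in which the skill fired."""
    trigger_rate = fires / runs if runs else 0.0
    return trigger_rate >= threshold if should_trigger else trigger_rate < threshold
```

A positive query that fires in 2 of 3 runs passes at the default threshold; a negative query with the same rate fails.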
+ +## Where evals can live + +The runner discovers evals in this order: + +1. `--evals ` — explicit. May point to a folder or a specific `*.json`. +2. `/evals/` — colocated with the skill. +3. `/../../evals//` — sibling-of-parent. Common pattern when evals are intentionally excluded from skill distribution. +4. `/evals//`. +5. `/evals/**//` — fuzzy search under the project's evals tree. + +If both `evals.json` and `triggers.json` are found, both run unless `--mode` narrows it. + +## Two patterns for single-shot evals + +Most multi-turn workflow skills can be evaluated single-shot if you design the eval right. Two patterns cover the bulk of what you'd otherwise need a multi-turn simulator for: + +### Pattern A — artifact correctness (headless + rich prompt) + +Force the skill into headless mode and pack the prompt with everything Discovery would have surfaced. Grade what comes out: the artifact, its structure, whether it reflects the inputs without inventing. + +Use when: +- The deliverable is the artifact (brief, PRD, doc, plan) +- You can write a complete pre-Discovery prompt +- You want regression coverage on drafting/format/extraction + +### Pattern B — process discipline (headless + transcript and side-artifact inspection) + +Same single-shot mechanics, but the expectations look at *what the skill did internally* — not just the final output. The grader reads the stream-JSON transcript for tool calls, walks side-artifacts (decision logs, addenda, distillates), checks file mtimes, and verifies phase ordering. + +Use when: +- The skill enforces a protocol (decision log, polish phase, finalize sequence) +- The skill has read-only intents (Validate must not write) +- You need to catch "drafting works but the discipline went soft" regressions + +These are deterministic checks against the transcript and filesystem — no LLM judgment needed for most of them. + +### What single-shot can NOT cover + +Facilitation arc: vague-input → sharper pushback → user clarifies → better artifact. That requires a multi-turn user simulator. Defer it to a separate eval mode for skills where conversation is the value (coaching, brainstorming, design thinking). + +## Writing good expectations + +The grader's job is easier when expectations are *discriminating* — hard to pass without actually doing the work. + +**Weak patterns to avoid:** +- **Filename-only checks** — "brief.md exists" passes for an empty file. Pair with a content check. +- **Wholly subjective phrasing** — "the brief is high quality" cannot be evaluated. State the property concretely. +- **Tautologies** — anything that follows from the prompt being understood is not a useful expectation. 
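As an example of tightening the first weak pattern above, pair the existence check with a content property (the numbers and field names here are illustrative):

```json
{
  "expectations": [
    "brief.md exists in the run folder",
    "brief.md is at least 250 words and names the target user segment from the prompt"
  ]
}
```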
+ +**Strong patterns for artifact correctness (Pattern A):** +- Specific facts that should appear ("incorporates at least 2 specific findings from section X") +- Structural claims a wrong output would fail ("word count between 250 and 1500") +- Negative assertions ("does not introduce content from unrelated sections") +- YAML frontmatter checks ("frontmatter contains title, status, created, updated as ISO 8601") +- Bounded JSON output ("final assistant message contains a JSON object with intent='create'") + +**Strong patterns for process discipline (Pattern B):** +- **Side-artifact existence + content** ("decision-log.md exists AND captures the pricing decision with rejected alternative and rationale") +- **Transcript tool-call patterns** ("the transcript contains a Skill tool call invoking bmad-editorial-review-prose") +- **Phase ordering** ("the polish-phase Skill calls occur after the brief body Write and before the final JSON status block") +- **Read-only enforcement** ("the input brief.md is byte-identical to the staged fixture; no Write or Edit tool calls targeted the run folder") +- **Bidirectional fidelity** ("every substantive entry in decision-log.md has a corresponding reflection in brief.md, AND no claim in brief.md is absent from the input prompt or decision-log.md") +- **Timestamp checks** ("YAML frontmatter 'updated' field is later than 'created'; 'created' is unchanged from the input fixture") + +## Headless mode — getting the skill to behave non-interactively + +Most multi-turn skills expose a headless flag or keyword that suppresses clarifying questions and produces a structured JSON status block at the end. To use Pattern A or B, the eval prompt needs to trigger this. Common signals: + +- The literal phrase `Run headless.` at the start of the prompt +- Skill-specific flags or keywords as documented in the skill's `## Headless Mode` section +- Sufficient context such that no clarification is genuinely needed + +If the skill has no headless mode, single-shot evals will halt at the first clarifying question and you have two options: (1) add a headless mode to the skill, (2) defer that skill's evals to the multi-turn simulator. + +## Pre-staging files (Update / Validate intents) + +For Update and Validate evals, the workspace needs to contain an existing brief, decision log, addendum, etc. Use the `files` field — each path is staged into the workspace at the same relative location. The eval prompt then references the staged path explicitly: + +```json +{ + "id": "B5", + "prompt": "Run headless. Update the brief at evals/skill-x/files/some-brief/brief.md — ...", + "files": [ + "evals/skill-x/files/some-brief/brief.md", + "evals/skill-x/files/some-brief/decision-log.md", + "evals/skill-x/files/some-brief/addendum.md" + ] +} +``` + +For Validate (read-only) expectations, pair the staged files with byte-identical assertions and a no-Write/no-Edit transcript check. diff --git a/skills/bmad-eval-runner/references/isolation.md b/skills/bmad-eval-runner/references/isolation.md new file mode 100644 index 0000000..056fda8 --- /dev/null +++ b/skills/bmad-eval-runner/references/isolation.md @@ -0,0 +1,110 @@ +# Isolation Strategies + +The eval runner offers two strategies. The intent is identical in both: every eval starts from a clean slate so the result reflects the skill itself, not the host's accumulated state. 
+ +## What we are isolating from + +- The user's global `~/.claude/CLAUDE.md` (private global instructions) +- Any ancestor `CLAUDE.md` in the project tree above the skill +- Auto-memory at `~/.claude/projects/.../memory/MEMORY.md` +- Cached settings, MCP configurations, IDE integrations +- Prior conversation context bleeding via the shell + +## Authentication + +The isolated `claude -p` subprocess needs to authenticate, but cannot read the host's `~/.claude/` (HOME is overridden) or the macOS Keychain (Keychain ACLs are scoped to the process that wrote the entry). The runner solves this in the parent process: + +1. On macOS, read the OAuth credential JSON from the Keychain entry `Claude Code-credentials` via `security find-generic-password -s "Claude Code-credentials" -w`. This succeeds because the parent runs as the same user that wrote the entry. +2. Stage that JSON as `/.home/.claude/.credentials.json` (local mode) or copy it into `/home/evaluator/.claude/.credentials.json` inside the container (Docker mode). +3. The subprocess reads `.credentials.json` exactly the way Claude Code normally does, with no other host config bleed. + +If the parent has `ANTHROPIC_API_KEY` set, that env var is also forwarded — and it takes precedence over the Keychain credential. On non-macOS hosts, the Keychain step is skipped and `ANTHROPIC_API_KEY` is the only auth path. + +## Docker (preferred) + +A single image, `bmad-eval-runner:latest`, is built once per machine. It contains Node 20, Claude Code (via `npm install -g @anthropic-ai/claude-code`), Python 3, and standard tools. The image is intentionally minimal — every eval starts from this baseline. + +### Image build + +`scripts/docker_setup.py --build` builds the image from `assets/Dockerfile`. This runs once. Re-runs are a no-op unless `--rebuild` is passed. + +### Per-eval container + +Each eval gets a fresh container: + +``` +docker run --rm \ + -v ":/project:ro" \ + -v "/:/output" \ + -v ":/fixtures:ro" \ + -e ANTHROPIC_API_KEY \ + -e EVAL_PROMPT \ + -e EVAL_ID \ + -e SKILL_PATH \ + bmad-eval-runner:latest \ + /bin/bash -c "/scripts/run_one_eval.sh" +``` + +Inside the container: + +1. The project is copied from `/project` (read-only) to `/workspace` (writable, container-local). Copy is fast because the underlying layer is shared. +2. Fixtures are copied into `/workspace/fixtures/`. +3. `HOME` is `/home/evaluator`, an empty directory created by the image — no global `CLAUDE.md`, no memory. +4. `claude -p "$EVAL_PROMPT" --output-format stream-json --verbose` runs at `/workspace`. +5. The stream-json transcript is captured to `/output/transcript.jsonl`. Any files the skill writes under `/workspace` are rsynced to `/output/artifacts/` after the run completes. +6. The container exits and is removed automatically. + +The host then has `//transcript.jsonl`, `//artifacts/`, and timing data. Nothing on the host is touched. + +### Why Docker is preferred + +- The image is reproducible — every run starts from byte-identical state. +- `HOME` is genuinely empty, not just overridden. +- Filesystem isolation is real, not just convention. +- Network can be locked down (`--network=none` for trigger evals; full network for artifact evals that may need it). + +## Local fallback + +When Docker is unavailable, the runner falls back to per-eval temp directories under `~/bmad-evals///`. 
Layout: + +``` +~/bmad-evals/// + workspace/ # the eval's working directory + .home/ # HOME override — empty .claude/ inside + project/ # rsync'd copy of + fixtures/ # staged fixture files + transcript.jsonl # claude -p stream output + artifacts/ # files Claude wrote under workspace/ + metrics.json +``` + +Per-eval invocation roughly: + +``` +HOME="$WORKSPACE/.home" \ +CLAUDE_CONFIG_DIR="$WORKSPACE/.home/.claude" \ +ANTHROPIC_API_KEY="$ANTHROPIC_API_KEY" \ + claude -p "$EVAL_PROMPT" \ + --output-format stream-json --verbose \ + > transcript.jsonl +``` + +### Limitations of local mode + +- `HOME` override prevents global `CLAUDE.md` and memory loading, but ancestor discovery still happens from the workspace's cwd. If the workspace is created inside a directory tree that contains a `.claude/skills/` further up, the subprocess may discover those skills regardless of `HOME`. This matters most for trigger evals, where stray host skills can fire instead of the synthetic skill we're testing — **prefer Docker for trigger evals**, where filesystem isolation is real. +- Filesystem isolation is by convention only — the skill could write outside its workspace if it tries. We don't sandbox syscalls. +- Network is unrestricted. + +Tell the user clearly when local mode is in use and that it is best-effort. + +## Why a real skill, not a slash command, for trigger evals + +The trigger runner stages a synthetic skill at `/.claude/skills//SKILL.md` — not at `.claude/commands/.md`. Slash commands are user-invoked (`/`); they do not surface as `Skill` tool calls and so a description placed there can never be observed firing the way a real skill would. Anthropic's reference `run_eval.py` uses the commands path and is known to report 0% trigger rates as a result. Placing the synthetic at `.claude/skills/` matches how real skills load and lets the detector observe genuine `Skill` (or `Read` of the synthetic SKILL.md) tool calls. + +## Why not `--add-dir` only? + +`claude -p --add-dir ` would let Claude see the skill but would still inherit the user's `CLAUDE.md` and memory from the cwd's ancestors. The whole point of this runner is to test the skill, not the host's accumulated state. So we always either Docker-isolate or temp-dir-isolate. + +## Artifact retention + +Run folders are never deleted by this skill. Disk management is the user's responsibility. The runner emits the run folder path on completion; users who want to clean up old runs can delete `~/bmad-evals//` directly. diff --git a/skills/bmad-eval-runner/scripts/docker_setup.py b/skills/bmad-eval-runner/scripts/docker_setup.py new file mode 100644 index 0000000..5f6fe7a --- /dev/null +++ b/skills/bmad-eval-runner/scripts/docker_setup.py @@ -0,0 +1,115 @@ +#!/usr/bin/env python3 +# /// script +# requires-python = ">=3.9" +# /// +"""Detect Docker and build the bmad-eval-runner image when needed. 
+ +Usage: + python3 docker_setup.py --check # exit 0 if image is ready, 1 otherwise + python3 docker_setup.py --build # build the image (no-op if present) + python3 docker_setup.py --rebuild # force rebuild +""" + +from __future__ import annotations + +import argparse +import json +import shutil +import subprocess +import sys +from pathlib import Path + + +IMAGE_TAG = "bmad-eval-runner:latest" +SCRIPT_DIR = Path(__file__).resolve().parent +DOCKERFILE = SCRIPT_DIR.parent / "assets" / "Dockerfile" + + +def docker_available() -> tuple[bool, str]: + if shutil.which("docker") is None: + return False, "docker CLI not found on PATH" + try: + result = subprocess.run( + ["docker", "info"], + capture_output=True, + text=True, + timeout=5, + ) + if result.returncode != 0: + return False, f"`docker info` failed: {result.stderr.strip().splitlines()[-1] if result.stderr.strip() else 'unknown'}" + return True, "ok" + except subprocess.TimeoutExpired: + return False, "`docker info` timed out" + except Exception as e: + return False, f"docker check error: {e}" + + +def image_present(tag: str = IMAGE_TAG) -> bool: + try: + result = subprocess.run( + ["docker", "image", "inspect", tag], + stdout=subprocess.DEVNULL, + stderr=subprocess.DEVNULL, + timeout=10, + ) + return result.returncode == 0 + except Exception: + return False + + +def build_image(tag: str = IMAGE_TAG, force: bool = False, verbose: bool = True) -> int: + if not DOCKERFILE.is_file(): + print(f"Dockerfile missing at {DOCKERFILE}", file=sys.stderr) + return 2 + + cmd = ["docker", "build", "-t", tag, "-f", str(DOCKERFILE), str(DOCKERFILE.parent)] + if force: + cmd.insert(2, "--no-cache") + + if verbose: + print(f"Building {tag} from {DOCKERFILE} ...", file=sys.stderr) + + proc = subprocess.run(cmd, stdout=sys.stderr if verbose else subprocess.DEVNULL, stderr=sys.stderr) + return proc.returncode + + +def main() -> int: + parser = argparse.ArgumentParser(description="Manage the bmad-eval-runner Docker image") + group = parser.add_mutually_exclusive_group(required=True) + group.add_argument("--check", action="store_true", help="Report status as JSON; exit 0 if image is ready") + group.add_argument("--build", action="store_true", help="Build the image (no-op if already present)") + group.add_argument("--rebuild", action="store_true", help="Force rebuild") + parser.add_argument("--quiet", action="store_true") + args = parser.parse_args() + + available, reason = docker_available() + present = image_present() if available else False + + if args.check: + print(json.dumps({ + "docker_available": available, + "docker_reason": reason, + "image_present": present, + "image_tag": IMAGE_TAG, + }, indent=2)) + return 0 if (available and present) else 1 + + if not available: + print(f"Docker is not available: {reason}", file=sys.stderr) + return 3 + + if args.rebuild: + return build_image(force=True, verbose=not args.quiet) + + if args.build: + if present: + if not args.quiet: + print(f"{IMAGE_TAG} already present; skipping build (use --rebuild to force).", file=sys.stderr) + return 0 + return build_image(force=False, verbose=not args.quiet) + + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/skills/bmad-eval-runner/scripts/generate_report.py b/skills/bmad-eval-runner/scripts/generate_report.py new file mode 100644 index 0000000..7596d02 --- /dev/null +++ b/skills/bmad-eval-runner/scripts/generate_report.py @@ -0,0 +1,184 @@ +#!/usr/bin/env python3 +# /// script +# requires-python = ">=3.9" +# /// +"""Generate an aggregate HTML report 
for a run folder. + +Reads run.json, execution-summary.json, each /grading.json (if present), +and triggers-result.json (if present), then renders a single-file HTML report. + +Usage: + python3 generate_report.py --run-dir PATH [-o report.html] +""" + +from __future__ import annotations + +import argparse +import html as html_lib +import json +import sys +from pathlib import Path + + +def esc(s: object) -> str: + return html_lib.escape(str(s), quote=True) + + +def load(path: Path) -> dict | list | None: + if not path.is_file(): + return None + try: + return json.loads(path.read_text(encoding="utf-8")) + except json.JSONDecodeError: + return None + + +def render(run_dir: Path) -> str: + run_meta = load(run_dir / "run.json") or {} + exec_summary = load(run_dir / "execution-summary.json") or {} + triggers = load(run_dir / "triggers-result.json") + + eval_blocks: list[str] = [] + grading_total = 0 + grading_passed = 0 + + for res in exec_summary.get("results", []): + eval_id = str(res.get("eval_id", "?")) + eval_dir = run_dir / eval_id + grading = load(eval_dir / "grading.json") + metrics = res.get("metrics") or load(eval_dir / "metrics.json") or {} + rc = res.get("return_code") + + rows: list[str] = [] + if grading: + for exp in grading.get("expectations", []): + passed = bool(exp.get("passed")) + grading_total += 1 + if passed: + grading_passed += 1 + rows.append( + f'' + f'{ "✔" if passed else "✘" }' + f'{esc(exp.get("text", ""))}' + f'{esc(exp.get("evidence", ""))}' + ) + + feedback = (grading or {}).get("eval_feedback") or {} + feedback_html = "" + if feedback: + sugg = feedback.get("suggestions") or [] + sugg_html = "".join( + f"
<li>{esc(s.get('assertion','(general)'))}: {esc(s.get('reason',''))}</li>"
                for s in sugg
            )
            overall = esc(feedback.get("overall", ""))
            feedback_html = (
                f'<details><summary>Eval feedback</summary>'
                f'<ul>{sugg_html}</ul><p>{overall}</p></details>'
            )

        artifacts_listing = ""
        artifacts_dir = eval_dir / "artifacts"
        if artifacts_dir.is_dir():
            files = sorted(p for p in artifacts_dir.rglob("*") if p.is_file())
            if files:
                artifacts_listing = "<ul>" + "".join(
                    f'<li><code>{esc(p.relative_to(eval_dir))}</code> '
                    f'({p.stat().st_size}b)</li>'
                    for p in files
                ) + "</ul>"

        tool_calls = metrics.get("tool_calls", {})
        tool_summary = ", ".join(f"{k}={v}" for k, v in sorted(tool_calls.items())) or "—"

        eval_blocks.append(f"""
<section>
  <h2>Eval {esc(eval_id)} rc={esc(rc)} · {esc(metrics.get('elapsed_s', '?'))}s</h2>
  <p>Tool calls: {esc(tool_summary)} · output {esc(metrics.get('output_chars', 0))}b · transcript {esc(metrics.get('transcript_chars', 0))}b</p>
  {'<table><tr><th></th><th>Expectation</th><th>Evidence</th></tr>' + ''.join(rows) + '</table>' if rows else '<p>No grading.json yet.</p>'}
  {feedback_html}
  <details><summary>Artifacts</summary>{artifacts_listing or '<p>No artifacts captured.</p>'}</details>
</section>
""")

    triggers_html = ""
    if triggers:
        rows = []
        for r in triggers.get("results", []):
            rows.append(
                f'<tr>'
                f'<td>{ "✔" if r["pass"] else "✘" }</td>'
                f'<td>{esc(r["query"])}</td>'
                f'<td>{esc(r["should_trigger"])}</td>'
                f'<td>{r["triggers"]}/{r["runs"]} ({r["trigger_rate"]:.2f})</td>'
                f'</tr>'
            )
        s = triggers.get("summary", {})
        triggers_html = f"""
<section>
  <h2>Trigger Evals — {s.get('passed',0)}/{s.get('total',0)} pass</h2>
  <table><tr><th></th><th>Query</th><th>Should fire</th><th>Rate</th></tr>
  {''.join(rows)}</table>
</section>
"""

    artifact_summary = ""
    if exec_summary:
        artifact_summary = (
            f"<p>Executed {exec_summary.get('executed', 0)} / {exec_summary.get('total', 0)} "
            f"evals · {exec_summary.get('exec_failures', 0)} execution failures · "
            f"grader: {grading_passed}/{grading_total} expectations passed</p>"
        )

    return f"""<!DOCTYPE html>
<html><head><meta charset="utf-8">
<title>Eval Run — {esc(run_meta.get('skill_name','?'))}</title>
</head>
<body>
<h1>{esc(run_meta.get('skill_name','?'))} — eval run</h1>
<p>
  Run id: {esc(run_meta.get('run_id','?'))} ·
  isolation: {esc(run_meta.get('isolation','?'))} ·
  started: {esc(run_meta.get('started_at','?'))}
</p>
    +{artifact_summary} +{''.join(eval_blocks)} +{triggers_html} + +""" + + +def main() -> int: + parser = argparse.ArgumentParser(description="Generate HTML report for an eval run folder") + parser.add_argument("--run-dir", required=True, type=Path) + parser.add_argument("-o", "--output", type=Path, default=None) + args = parser.parse_args() + + run_dir = args.run_dir.resolve() + if not run_dir.is_dir(): + print(f"run-dir not found: {run_dir}", file=sys.stderr) + return 2 + + out = args.output or (run_dir / "report.html") + out.write_text(render(run_dir), encoding="utf-8") + print(str(out)) + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/skills/bmad-eval-runner/scripts/pty_runner.py b/skills/bmad-eval-runner/scripts/pty_runner.py new file mode 100644 index 0000000..5b58658 --- /dev/null +++ b/skills/bmad-eval-runner/scripts/pty_runner.py @@ -0,0 +1,171 @@ +#!/usr/bin/env python3 +# /// script +# requires-python = ">=3.9" +# /// +"""Run claude interactively via PTY so the Skill tool is available. + +In `claude -p` (print mode) the Skill tool is never offered — Claude handles +everything inline. Running `claude` in interactive mode activates the Skill +tool so dependency skills installed in .claude/skills/ can be properly invoked. + +The PTY tricks claude into thinking it has a terminal (interactive mode) while +we capture its stream-json output programmatically. + +Usage: + python3 pty_runner.py --prompt-file /path/to/prompt.txt \\ + --output /path/to/transcript.jsonl \\ + [--timeout 600] + python3 pty_runner.py --prompt "Run headless. ..." --output transcript.jsonl +""" + +from __future__ import annotations + +import argparse +import json +import os +import pty +import re +import select +import subprocess +import sys +import time +from pathlib import Path + +ANSI_RE = re.compile(r"\x1b(?:[@-Z\\-_]|\[[0-?]*[ -/]*[@-~])|\r") + +# How long to wait for claude to initialize before sending the prompt. +# Claude loads skill registry, checks credentials, etc. on startup. +INIT_WAIT_S = 5.0 + +# How long to wait after the stream-json 'result' event before killing claude. +# Trailing tool-result output sometimes follows the result event. +POST_RESULT_S = 4.0 + + +def _strip_ansi(text: str) -> str: + return ANSI_RE.sub("", text) + + +def run_interactive(prompt: str, output: Path, timeout: int = 600) -> None: + """Spawn claude interactively via PTY, send one prompt, capture transcript.""" + master, slave = pty.openpty() + + proc = subprocess.Popen( + [ + "claude", + "--output-format", "stream-json", + "--verbose", + "--dangerously-skip-permissions", + ], + stdin=slave, + stdout=slave, + stderr=slave, + close_fds=True, + ) + os.close(slave) + + json_lines: list[str] = [] + buf = b"" + prompt_sent = False + done_at: float | None = None + start = time.time() + + try: + while True: + elapsed = time.time() - start + if elapsed > timeout: + print(f"[pty_runner] timeout after {elapsed:.0f}s", file=sys.stderr) + break + if done_at is not None and (time.time() - done_at) > POST_RESULT_S: + break + + # Short select so we stay responsive but don't spin. + r, _, _ = select.select([master], [], [], 0.3) + + if r: + try: + chunk = os.read(master, 8192) + except OSError: + break # PTY closed — claude exited + buf += chunk + + # Process all complete lines in buffer. 
+ while b"\n" in buf: + raw, buf = buf.split(b"\n", 1) + line = _strip_ansi(raw.decode("utf-8", errors="replace")).strip() + if not line.startswith("{"): + continue + json_lines.append(line) + try: + obj = json.loads(line) + # 'result' marks end of a claude turn. + if obj.get("type") == "result" and done_at is None: + done_at = time.time() + print( + f"[pty_runner] result event at t={time.time()-start:.1f}s " + f"({len(json_lines)} lines so far)", + file=sys.stderr, + ) + except json.JSONDecodeError: + pass + else: + # Silence window — send prompt once claude has had time to init. + if not prompt_sent and (time.time() - start) >= INIT_WAIT_S: + os.write(master, (prompt + "\n").encode()) + prompt_sent = True + print( + f"[pty_runner] prompt sent at t={time.time()-start:.1f}s", + file=sys.stderr, + ) + + finally: + # Politely ask claude to exit, then hard-kill if needed. + try: + os.write(master, b"exit\n") + time.sleep(0.3) + except OSError: + pass + try: + proc.terminate() + proc.wait(timeout=5) + except Exception: + try: + proc.kill() + except Exception: + pass + try: + os.close(master) + except OSError: + pass + + output.parent.mkdir(parents=True, exist_ok=True) + content = "\n".join(json_lines) + ("\n" if json_lines else "") + output.write_text(content, encoding="utf-8") + print( + f"[pty_runner] wrote {len(json_lines)} transcript lines → {output}", + file=sys.stderr, + ) + + +def main() -> int: + p = argparse.ArgumentParser( + description="Run claude interactively via PTY and capture stream-json transcript" + ) + grp = p.add_mutually_exclusive_group(required=True) + grp.add_argument("--prompt", help="Prompt text") + grp.add_argument("--prompt-file", type=Path, help="File containing the prompt") + p.add_argument("--output", type=Path, required=True, help="Output .jsonl transcript file") + p.add_argument("--timeout", type=int, default=600, help="Hard timeout in seconds") + args = p.parse_args() + + prompt = ( + args.prompt_file.read_text(encoding="utf-8").strip() + if args.prompt_file + else args.prompt + ) + run_interactive(prompt, args.output, args.timeout) + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/skills/bmad-eval-runner/scripts/run_evals.py b/skills/bmad-eval-runner/scripts/run_evals.py new file mode 100644 index 0000000..fd8438b --- /dev/null +++ b/skills/bmad-eval-runner/scripts/run_evals.py @@ -0,0 +1,492 @@ +#!/usr/bin/env python3 +# /// script +# requires-python = ">=3.9" +# /// +"""Run a skill's artifact evals in isolated workspaces. + +For each eval, the runner: + 1. Stages a fresh workspace (Docker container or local tmp dir under ~/bmad-evals). + 2. Applies the setup overlay (base then per-eval) so _bmad/ config and dependency + skills land in the workspace BEFORE the skill is staged — the skill's own copy + always wins over overlay content. + 3. Copies the skill into .claude/skills/ so it is discoverable by claude. + 4. Stages any fixture files declared in the eval's `files` list. + 5. Runs `claude -p '' --output-format stream-json --verbose`, capturing + the transcript. The Skill tool is available in -p mode and fires for installed + skills, so dependency skills provided by the setup overlay are properly invokable. + 6. Rsyncs any files claude wrote into `//artifacts/`. + 7. Writes `metrics.json` (tool-call counts, timing, output sizes). + +Grading is performed separately by the parent skill's grader subagents. 
+ +Usage: + python3 run_evals.py \\ + --skill-path PATH \\ + --evals-file PATH/evals.json \\ + --project-root PATH \\ + --output-dir PATH \\ + --isolation docker|local \\ + [--workers N] [--timeout SECS] [--eval-ids A1,B3] [--quiet] +""" + +from __future__ import annotations + +import argparse +import json +import os +import shutil +import subprocess +import sys +import time +from concurrent.futures import ThreadPoolExecutor, as_completed +from pathlib import Path + +SCRIPT_DIR = Path(__file__).resolve().parent +sys.path.insert(0, str(SCRIPT_DIR)) + +from utils import ( # noqa: E402 + apply_setup_overlay, + discover_setup_dirs, + new_run_id, + parse_skill_md, + read_json, + read_macos_keychain_credentials, + stage_credentials, + utc_now_iso, + write_json, +) + +DOCKER_IMAGE = "bmad-eval-runner:latest" +_KEYCHAIN_CREDS: str | None = read_macos_keychain_credentials() +RSYNC_EXCLUDES = ( + ".git", ".bare", "node_modules", ".venv", "__pycache__", + ".pytest_cache", ".next", "dist", "build", ".cache", + ".DS_Store", "*.pyc", +) + + +def stage_workspace_local( + workspace: Path, + project_root: Path, + skill_path: Path, + fixtures: list[tuple[Path, str]], + setup_dirs: list[Path] | None = None, +) -> Path: + """Build a clean local workspace. Returns the project root inside workspace.""" + workspace.mkdir(parents=True, exist_ok=True) + project_dest = workspace / "project" + home_dir = workspace / ".home" + (home_dir / ".claude").mkdir(parents=True, exist_ok=True) + + excludes: list[str] = [] + for pat in RSYNC_EXCLUDES: + excludes.extend(["--exclude", pat]) + + if shutil.which("rsync"): + subprocess.run( + ["rsync", "-a", *excludes, f"{project_root}/", f"{project_dest}/"], + check=True, + ) + else: + shutil.copytree(project_root, project_dest, dirs_exist_ok=True, + ignore=shutil.ignore_patterns(*RSYNC_EXCLUDES)) + + # Apply setup overlay before staging the skill — the skill's own copy wins. 
+ if setup_dirs: + apply_setup_overlay(setup_dirs, project_dest) + + skill_link_dir = project_dest / ".claude" / "skills" + skill_link_dir.mkdir(parents=True, exist_ok=True) + skill_dest = skill_link_dir / skill_path.name + if not skill_dest.exists(): + try: + os.symlink(skill_path, skill_dest) + except OSError: + shutil.copytree(skill_path, skill_dest, dirs_exist_ok=True) + + for src, dest_rel in fixtures: + dest = project_dest / dest_rel + dest.parent.mkdir(parents=True, exist_ok=True) + shutil.copy2(src, dest) + + return project_dest + + +def run_eval_local( + eval_item: dict, + run_dir: Path, + skill_path: Path, + project_root: Path, + timeout: int, + setup_dirs: list[Path] | None = None, +) -> dict: + eval_id = str(eval_item.get("id", "unnamed")) + eval_dir = run_dir / eval_id + workspace_root = eval_dir / "workspace" + artifacts_dir = eval_dir / "artifacts" + transcript_path = eval_dir / "transcript.jsonl" + + eval_dir.mkdir(parents=True, exist_ok=True) + artifacts_dir.mkdir(parents=True, exist_ok=True) + + fixtures = resolve_fixtures(eval_item.get("files", []), project_root) + workspace_project = stage_workspace_local( + workspace_root, project_root, skill_path, fixtures, setup_dirs + ) + + (eval_dir / "prompt.txt").write_text(eval_item["prompt"], encoding="utf-8") + workspace_snapshot_before = snapshot_files(workspace_project) + + home_dir = workspace_root / ".home" + stage_credentials(home_dir / ".claude", _KEYCHAIN_CREDS) + env = { + "HOME": str(home_dir), + "CLAUDE_CONFIG_DIR": str(home_dir / ".claude"), + "PATH": os.environ.get("PATH", ""), + "ANTHROPIC_API_KEY": os.environ.get("ANTHROPIC_API_KEY", ""), + } + + cmd = [ + "claude", + "-p", eval_item["prompt"], + "--output-format", "stream-json", + "--verbose", + "--dangerously-skip-permissions", + ] + + start = time.time() + try: + with transcript_path.open("wb") as out: + proc = subprocess.run( + cmd, + stdout=out, + stderr=subprocess.PIPE, + cwd=str(workspace_project), + env=env, + timeout=timeout, + ) + elapsed = time.time() - start + return_code = proc.returncode + stderr_tail = (proc.stderr or b"").decode("utf-8", errors="replace")[-2000:] + except subprocess.TimeoutExpired as e: + elapsed = time.time() - start + return_code = -1 + stderr_tail = f"TIMEOUT after {timeout}s" + if e.stderr: + stderr_tail += "\n" + e.stderr.decode("utf-8", errors="replace")[-2000:] + + new_files = diff_workspace(workspace_project, workspace_snapshot_before) + sync_artifacts(workspace_project, new_files, artifacts_dir) + + metrics = compute_metrics(transcript_path, artifacts_dir, elapsed, return_code, stderr_tail) + write_json(eval_dir / "metrics.json", metrics) + + return { + "eval_id": eval_id, + "elapsed_s": elapsed, + "return_code": return_code, + "transcript": str(transcript_path.relative_to(run_dir)), + "artifacts_dir": str(artifacts_dir.relative_to(run_dir)), + "metrics": metrics, + } + + +def run_eval_docker( + eval_item: dict, + run_dir: Path, + skill_path: Path, + project_root: Path, + timeout: int, + setup_dirs: list[Path] | None = None, +) -> dict: + eval_id = str(eval_item.get("id", "unnamed")) + eval_dir = run_dir / eval_id + artifacts_dir = eval_dir / "artifacts" + transcript_path = eval_dir / "transcript.jsonl" + + eval_dir.mkdir(parents=True, exist_ok=True) + artifacts_dir.mkdir(parents=True, exist_ok=True) + fixtures_staging = eval_dir / "fixtures_in" + fixtures_staging.mkdir(parents=True, exist_ok=True) + + fixtures = resolve_fixtures(eval_item.get("files", []), project_root) + for src, dest_rel in fixtures: + dest = 
fixtures_staging / dest_rel + dest.parent.mkdir(parents=True, exist_ok=True) + shutil.copy2(src, dest) + + (eval_dir / "prompt.txt").write_text(eval_item["prompt"], encoding="utf-8") + + # Pre-merge setup overlay dirs on the host; mount as /setup:ro in the container. + setup_merged: Path | None = None + if setup_dirs: + setup_merged = eval_dir / "setup_merged" + apply_setup_overlay(setup_dirs, setup_merged) + if not any(setup_merged.iterdir()): + setup_merged = None + + creds_dir: Path | None = None + if _KEYCHAIN_CREDS: + creds_dir = eval_dir / "creds" + creds_dir.mkdir(parents=True, exist_ok=True) + (creds_dir / ".credentials.json").write_text(_KEYCHAIN_CREDS, encoding="utf-8") + + container_script = r""" +set -e +mkdir -p /workspace +rsync -a \ + --exclude=.git --exclude=.bare --exclude=node_modules --exclude=.venv \ + --exclude=__pycache__ --exclude=.pytest_cache --exclude=.next \ + --exclude=dist --exclude=build --exclude=.cache --exclude=.DS_Store \ + /project/ /workspace/ +if [ -d /setup ]; then + rsync -a /setup/ /workspace/ +fi +mkdir -p /workspace/.claude/skills +cp -R "$SKILL_SRC" "/workspace/.claude/skills/$SKILL_NAME" +if [ -d /fixtures ]; then + cp -R /fixtures/. /workspace/ +fi +if [ -f /creds/.credentials.json ]; then + mkdir -p /home/evaluator/.claude + cp /creds/.credentials.json /home/evaluator/.claude/.credentials.json +fi +cd /workspace +claude -p "$EVAL_PROMPT" \ + --output-format stream-json --verbose \ + --dangerously-skip-permissions \ + > /output/transcript.jsonl 2> /output/stderr.log || true +mkdir -p /output/artifacts +rsync -a --exclude=.claude --exclude=node_modules --exclude=.git \ + --filter='+ */' --filter='+ *' \ + /workspace/ /output/artifacts/ +""" + + skill_name = skill_path.name + cmd = [ + "docker", "run", "--rm", + "-v", f"{project_root}:/project:ro", + "-v", f"{skill_path}:/skill_src:ro", + "-v", f"{eval_dir}:/output", + "-e", "ANTHROPIC_API_KEY", + "-e", f"EVAL_PROMPT={eval_item['prompt']}", + "-e", f"SKILL_SRC=/skill_src", + "-e", f"SKILL_NAME={skill_name}", + ] + if creds_dir: + cmd += ["-v", f"{creds_dir}:/creds:ro"] + if fixtures: + cmd += ["-v", f"{fixtures_staging}:/fixtures:ro"] + if setup_merged: + cmd += ["-v", f"{setup_merged}:/setup:ro"] + cmd += [DOCKER_IMAGE, "bash", "-c", container_script] + + start = time.time() + try: + proc = subprocess.run( + cmd, + capture_output=True, + timeout=timeout + 30, + ) + elapsed = time.time() - start + return_code = proc.returncode + stderr_tail = proc.stderr.decode("utf-8", errors="replace")[-2000:] + if proc.stdout: + (eval_dir / "docker.stdout.log").write_bytes(proc.stdout) + except subprocess.TimeoutExpired as e: + elapsed = time.time() - start + return_code = -1 + stderr_tail = f"TIMEOUT after {timeout}s" + if e.stderr: + stderr_tail += "\n" + e.stderr.decode("utf-8", errors="replace")[-2000:] + + metrics = compute_metrics(transcript_path, artifacts_dir, elapsed, return_code, stderr_tail) + write_json(eval_dir / "metrics.json", metrics) + shutil.rmtree(fixtures_staging, ignore_errors=True) + + return { + "eval_id": eval_id, + "elapsed_s": elapsed, + "return_code": return_code, + "transcript": str(transcript_path.relative_to(run_dir)), + "artifacts_dir": str(artifacts_dir.relative_to(run_dir)), + "metrics": metrics, + } + + +def resolve_fixtures(files: list[str], project_root: Path) -> list[tuple[Path, str]]: + out: list[tuple[Path, str]] = [] + for entry in files: + candidate = (project_root / entry).resolve() + if not candidate.is_file(): + alt = Path(entry).resolve() + if alt.is_file(): + 
candidate = alt + else: + print(f"Warning: fixture not found: {entry}", file=sys.stderr) + continue + out.append((candidate, entry)) + return out + + +def snapshot_files(root: Path) -> set[str]: + snap: set[str] = set() + for p in root.rglob("*"): + if p.is_file(): + snap.add(str(p.relative_to(root))) + return snap + + +def diff_workspace(root: Path, before: set[str]) -> list[str]: + after = snapshot_files(root) + return sorted(after - before) + + +def sync_artifacts(workspace: Path, new_files: list[str], dest: Path) -> None: + for rel in new_files: + src = workspace / rel + if not src.is_file(): + continue + if any(part in (".claude", "node_modules", ".git", ".venv") for part in src.parts): + continue + target = dest / rel + target.parent.mkdir(parents=True, exist_ok=True) + shutil.copy2(src, target) + + +def compute_metrics(transcript: Path, artifacts: Path, elapsed: float, + rc: int, stderr_tail: str) -> dict: + tool_calls: dict[str, int] = {} + total_steps = 0 + if transcript.is_file(): + for raw in transcript.read_text(encoding="utf-8", errors="replace").splitlines(): + raw = raw.strip() + if not raw: + continue + try: + evt = json.loads(raw) + except json.JSONDecodeError: + continue + if evt.get("type") == "assistant": + total_steps += 1 + for item in evt.get("message", {}).get("content", []): + if item.get("type") == "tool_use": + name = item.get("name", "?") + tool_calls[name] = tool_calls.get(name, 0) + 1 + + output_chars = 0 + for f in artifacts.rglob("*"): + if f.is_file(): + try: + output_chars += f.stat().st_size + except OSError: + pass + + return { + "elapsed_s": round(elapsed, 2), + "return_code": rc, + "tool_calls": tool_calls, + "total_tool_calls": sum(tool_calls.values()), + "total_steps": total_steps, + "output_chars": output_chars, + "transcript_chars": transcript.stat().st_size if transcript.is_file() else 0, + "stderr_tail": stderr_tail, + } + + +def main() -> int: + parser = argparse.ArgumentParser(description="Run a skill's artifact evals in isolation") + parser.add_argument("--skill-path", required=True, type=Path) + parser.add_argument("--evals-file", required=True, type=Path) + parser.add_argument("--project-root", required=True, type=Path) + parser.add_argument("--output-dir", required=True, type=Path) + parser.add_argument("--isolation", choices=("docker", "local"), required=True) + parser.add_argument("--workers", type=int, default=8) + parser.add_argument("--timeout", type=int, default=600) + parser.add_argument("--eval-ids", default=None, help="Comma-separated subset of eval ids to run") + parser.add_argument("--quiet", action="store_true") + args = parser.parse_args() + + skill_path = args.skill_path.resolve() + project_root = args.project_root.resolve() + evals_file = args.evals_file.resolve() + if not evals_file.is_file(): + print(f"evals file not found: {evals_file}", file=sys.stderr) + return 2 + + skill_name, _, _ = parse_skill_md(skill_path) + data = read_json(evals_file) + evals = data["evals"] if isinstance(data, dict) and "evals" in data else data + + if args.eval_ids: + wanted = {x.strip() for x in args.eval_ids.split(",") if x.strip()} + evals = [e for e in evals if str(e.get("id")) in wanted] + + run_id = new_run_id(skill_name) + run_dir = (args.output_dir / run_id).resolve() + run_dir.mkdir(parents=True, exist_ok=True) + + write_json(run_dir / "run.json", { + "run_id": run_id, + "skill_name": skill_name, + "skill_path": str(skill_path), + "project_root": str(project_root), + "evals_file": str(evals_file), + "isolation": args.isolation, + 
"started_at": utc_now_iso(), + "eval_count": len(evals), + }) + + runner = run_eval_docker if args.isolation == "docker" else run_eval_local + + results: list[dict] = [] + if not args.quiet: + print( + f"[run_evals] {len(evals)} evals, isolation={args.isolation}, run_dir={run_dir}", + file=sys.stderr, + ) + + with ThreadPoolExecutor(max_workers=args.workers) as pool: + future_to_eval = { + pool.submit( + runner, + item, + run_dir, + skill_path, + project_root, + int(item.get("timeout", args.timeout)), + discover_setup_dirs(evals_file, str(item.get("id", ""))), + ): item + for item in evals + } + for fut in as_completed(future_to_eval): + item = future_to_eval[fut] + try: + res = fut.result() + except Exception as e: + res = {"eval_id": str(item.get("id")), "error": str(e), "return_code": -1} + results.append(res) + if not args.quiet: + rc = res.get("return_code") + status = "ok" if rc == 0 else f"rc={rc}" + print( + f" [{status}] eval {res.get('eval_id')} ({res.get('elapsed_s', 0):.1f}s)", + file=sys.stderr, + ) + + summary = { + "run_id": run_id, + "completed_at": utc_now_iso(), + "total": len(evals), + "executed": len(results), + "exec_failures": sum(1 for r in results if r.get("return_code") != 0), + "run_dir": str(run_dir), + "results": results, + } + write_json(run_dir / "execution-summary.json", summary) + print(json.dumps(summary, indent=2)) + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/skills/bmad-eval-runner/scripts/run_triggers.py b/skills/bmad-eval-runner/scripts/run_triggers.py new file mode 100644 index 0000000..9c1bb96 --- /dev/null +++ b/skills/bmad-eval-runner/scripts/run_triggers.py @@ -0,0 +1,366 @@ +#!/usr/bin/env python3 +# /// script +# requires-python = ">=3.9" +# /// +"""Run trigger evals: does the skill's description fire on each query? + +Adapted from Anthropic skill-creator's run_eval.py +(https://github.com/anthropics/skills/tree/main/skills/skill-creator) with two +adaptations: + + 1. Isolation. Each query runs in either a fresh Docker container off + bmad-eval-runner:latest, or a fresh local tmp dir under ~/bmad-evals// + with HOME overridden to a clean directory. This prevents the host's global + CLAUDE.md and auto-memory from biasing whether the skill fires. + + 2. Output. Results are written to a run folder alongside the artifact eval + run-folder layout (so triggers and artifacts can share a single report). + +Usage: + python3 run_triggers.py \\ + --skill-path PATH \\ + --triggers-file PATH/triggers.json \\ + --output-dir PATH \\ + --isolation docker|local \\ + [--workers N] [--runs-per-query N] [--timeout SECS] [--threshold 0.5] +""" + +from __future__ import annotations + +import argparse +import json +import os +import shutil +import subprocess +import sys +import time +import uuid +from concurrent.futures import ThreadPoolExecutor, as_completed +from pathlib import Path + +SCRIPT_DIR = Path(__file__).resolve().parent +sys.path.insert(0, str(SCRIPT_DIR)) + +from utils import ( # noqa: E402 + new_run_id, + parse_skill_md, + read_json, + read_macos_keychain_credentials, + stage_credentials, + utc_now_iso, + write_json, +) + +DOCKER_IMAGE = "bmad-eval-runner:latest" +_KEYCHAIN_CREDS: str | None = read_macos_keychain_credentials() + + +def write_synthetic_skill(skills_dir: Path, skill_name: str, description: str, unique_id: str) -> tuple[Path, str]: + """Place a synthetic skill at //SKILL.md. 
+
+    The Skill tool only fires for entries discovered as actual skills (frontmatter
+    `name` + `description` under a `.claude/skills/<name>/SKILL.md`). Slash-commands
+    under `.claude/commands/` do not auto-invoke the Skill tool, so the previous
+    implementation could never observe a positive trigger. This places the synthetic
+    skill where Claude Code looks for skills, with a unique name so the detector
+    can disambiguate it from any pre-existing skill of the same display name.
+    """
+    clean_name = f"{skill_name}-skill-{unique_id}"
+    skill_root = skills_dir / clean_name
+    skill_root.mkdir(parents=True, exist_ok=True)
+    path = skill_root / "SKILL.md"
+    indented_desc = "\n ".join(description.split("\n"))
+    path.write_text(
+        f"---\n"
+        f"name: {clean_name}\n"
+        f"description: |\n"
+        f" {indented_desc}\n"
+        f"---\n\n"
+        f"# {skill_name}\n\n"
+        f"This skill handles: {description}\n",
+        encoding="utf-8",
+    )
+    return path, clean_name
+
+
+def parse_stream_for_trigger(buffer: str, clean_name: str) -> tuple[bool | None, str]:
+    """Return (triggered_or_none, leftover_buffer). None means undecided yet."""
+    triggered: bool | None = None
+    pending_tool: str | None = None
+    accumulated_json = ""
+    leftover = ""
+
+    while "\n" in buffer:
+        line, buffer = buffer.split("\n", 1)
+        line = line.strip()
+        if not line:
+            continue
+        try:
+            evt = json.loads(line)
+        except json.JSONDecodeError:
+            continue
+
+        if evt.get("type") == "stream_event":
+            se = evt.get("event", {})
+            t = se.get("type", "")
+            if t == "content_block_start":
+                cb = se.get("content_block", {})
+                if cb.get("type") == "tool_use":
+                    name = cb.get("name", "")
+                    if name in ("Skill", "Read"):
+                        pending_tool = name
+                        accumulated_json = ""
+                    else:
+                        return False, ""
+            elif t == "content_block_delta" and pending_tool:
+                delta = se.get("delta", {})
+                if delta.get("type") == "input_json_delta":
+                    accumulated_json += delta.get("partial_json", "")
+                    if clean_name in accumulated_json:
+                        return True, ""
+            elif t in ("content_block_stop", "message_stop"):
+                if pending_tool:
+                    return clean_name in accumulated_json, ""
+                if t == "message_stop":
+                    return False, ""
+        elif evt.get("type") == "assistant":
+            for item in evt.get("message", {}).get("content", []):
+                if item.get("type") != "tool_use":
+                    continue
+                tname = item.get("name", "")
+                tinput = item.get("input", {})
+                if tname == "Skill" and clean_name in tinput.get("skill", ""):
+                    return True, ""
+                if tname == "Read" and clean_name in tinput.get("file_path", ""):
+                    return True, ""
+            return False, ""
+        elif evt.get("type") == "result":
+            return triggered if triggered is not None else False, ""
+    leftover = buffer
+    return triggered, leftover
+
+
+def run_query_local(query: str, skill_name: str, description: str,
+                    workspace_root: Path, timeout: int) -> bool:
+    workspace_root.mkdir(parents=True, exist_ok=True)
+    home_dir = workspace_root / ".home"
+    (home_dir / ".claude").mkdir(parents=True, exist_ok=True)
+    stage_credentials(home_dir / ".claude", _KEYCHAIN_CREDS)
+    project_dir = workspace_root / "project"
+    skills_dir = project_dir / ".claude" / "skills"
+    project_dir.mkdir(parents=True, exist_ok=True)
+
+    unique = uuid.uuid4().hex[:8]
+    cmd_file, clean_name = write_synthetic_skill(skills_dir, skill_name, description, unique)
+
+    env = {
+        "HOME": str(home_dir),
+        "CLAUDE_CONFIG_DIR": str(home_dir / ".claude"),
+        "PATH": os.environ.get("PATH", ""),
+        "ANTHROPIC_API_KEY": os.environ.get("ANTHROPIC_API_KEY", ""),
+    }
+
+    cmd = [
+        "claude", "-p", query,
+        "--output-format", "stream-json",
+        "--verbose",
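+        # --include-partial-messages (below) adds the stream_event deltas that
+        # parse_stream_for_trigger scans, so a trigger can be detected as soon as a
+        # Skill/Read tool call mentions the synthetic skill name.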
"--include-partial-messages", + "--dangerously-skip-permissions", + ] + + try: + proc = subprocess.Popen( + cmd, + stdout=subprocess.PIPE, + stderr=subprocess.DEVNULL, + cwd=str(project_dir), + env=env, + ) + buffer = "" + triggered: bool | None = None + start = time.time() + try: + while time.time() - start < timeout: + if proc.poll() is not None: + rest = proc.stdout.read() + if rest: + buffer += rest.decode("utf-8", errors="replace") + break + chunk = proc.stdout.read1(8192) if hasattr(proc.stdout, "read1") else proc.stdout.read(8192) + if not chunk: + time.sleep(0.05) + continue + buffer += chunk.decode("utf-8", errors="replace") + decided, buffer = parse_stream_for_trigger(buffer, clean_name) + if decided is not None: + triggered = decided + break + finally: + if proc.poll() is None: + proc.kill() + proc.wait() + if triggered is None: + decided, _ = parse_stream_for_trigger(buffer + "\n", clean_name) + triggered = bool(decided) + return bool(triggered) + finally: + try: + shutil.rmtree(cmd_file.parent, ignore_errors=True) + except OSError: + pass + + +def run_query_docker(query: str, skill_name: str, description: str, + workspace_root: Path, timeout: int) -> bool: + workspace_root.mkdir(parents=True, exist_ok=True) + unique = uuid.uuid4().hex[:8] + skills_in = workspace_root / "skills_in" + skills_in.mkdir(parents=True, exist_ok=True) + _, clean_name = write_synthetic_skill(skills_in, skill_name, description, unique) + + creds_dir: Path | None = None + if _KEYCHAIN_CREDS: + creds_dir = workspace_root / "creds_in" + creds_dir.mkdir(parents=True, exist_ok=True) + (creds_dir / ".credentials.json").write_text(_KEYCHAIN_CREDS, encoding="utf-8") + + container_script = f""" +set -e +mkdir -p /workspace/.claude/skills +cp -R /skills/. /workspace/.claude/skills/ 2>/dev/null || true +if [ -f /creds/.credentials.json ]; then + mkdir -p /home/evaluator/.claude + cp /creds/.credentials.json /home/evaluator/.claude/.credentials.json +fi +cd /workspace +claude -p "$EVAL_QUERY" \\ + --output-format stream-json --verbose --include-partial-messages \\ + --dangerously-skip-permissions \\ + > /output/stream.jsonl 2>/dev/null || true +""" + + output_dir = workspace_root / "output" + output_dir.mkdir(parents=True, exist_ok=True) + + cmd = [ + "docker", "run", "--rm", + "-v", f"{skills_in}:/skills:ro", + "-v", f"{output_dir}:/output", + "-e", "ANTHROPIC_API_KEY", + "-e", f"EVAL_QUERY={query}", + ] + if creds_dir: + cmd += ["-v", f"{creds_dir}:/creds:ro"] + cmd += [DOCKER_IMAGE, "bash", "-c", container_script] + + try: + subprocess.run(cmd, capture_output=True, timeout=timeout + 30) + except subprocess.TimeoutExpired: + pass + + stream_file = output_dir / "stream.jsonl" + if not stream_file.is_file(): + return False + decided, _ = parse_stream_for_trigger(stream_file.read_text(encoding="utf-8", errors="replace") + "\n", clean_name) + return bool(decided) + + +def main() -> int: + parser = argparse.ArgumentParser(description="Run trigger evals in isolation") + parser.add_argument("--skill-path", required=True, type=Path) + parser.add_argument("--triggers-file", required=True, type=Path) + parser.add_argument("--output-dir", required=True, type=Path) + parser.add_argument("--isolation", choices=("docker", "local"), required=True) + parser.add_argument("--workers", type=int, default=8) + parser.add_argument("--runs-per-query", type=int, default=3) + parser.add_argument("--timeout", type=int, default=45) + parser.add_argument("--threshold", type=float, default=0.5) + parser.add_argument("--quiet", 
action="store_true") + args = parser.parse_args() + + skill_path = args.skill_path.resolve() + triggers_file = args.triggers_file.resolve() + if not triggers_file.is_file(): + print(f"triggers file not found: {triggers_file}", file=sys.stderr) + return 2 + + skill_name, description, _ = parse_skill_md(skill_path) + queries = read_json(triggers_file) + + run_id = new_run_id(f"{skill_name}-triggers") + run_dir = (args.output_dir / run_id).resolve() + (run_dir / "queries").mkdir(parents=True, exist_ok=True) + + write_json(run_dir / "run.json", { + "run_id": run_id, + "skill_name": skill_name, + "description": description, + "isolation": args.isolation, + "started_at": utc_now_iso(), + "query_count": len(queries), + "runs_per_query": args.runs_per_query, + "threshold": args.threshold, + }) + + runner = run_query_docker if args.isolation == "docker" else run_query_local + + def run_one(idx: int, q: dict, run_idx: int) -> tuple[int, bool]: + ws = run_dir / "queries" / f"q{idx:03d}-r{run_idx}" + triggered = runner(q["query"], skill_name, description, ws, args.timeout) + return idx, triggered + + per_query: dict[int, list[bool]] = {} + if not args.quiet: + print(f"[run_triggers] {len(queries)} queries × {args.runs_per_query} runs, isolation={args.isolation}", file=sys.stderr) + + with ThreadPoolExecutor(max_workers=args.workers) as pool: + futures = [] + for idx, q in enumerate(queries): + for run_idx in range(args.runs_per_query): + futures.append(pool.submit(run_one, idx, q, run_idx)) + for fut in as_completed(futures): + try: + idx, triggered = fut.result() + except Exception as e: + print(f"Warning: query failed: {e}", file=sys.stderr) + continue + per_query.setdefault(idx, []).append(triggered) + + results = [] + for idx, q in enumerate(queries): + triggers = per_query.get(idx, []) + rate = (sum(triggers) / len(triggers)) if triggers else 0.0 + should = bool(q["should_trigger"]) + if should: + passed = rate >= args.threshold + else: + passed = rate < args.threshold + results.append({ + "query": q["query"], + "should_trigger": should, + "trigger_rate": rate, + "triggers": int(sum(triggers)), + "runs": len(triggers), + "pass": passed, + }) + + output = { + "run_id": run_id, + "completed_at": utc_now_iso(), + "skill_name": skill_name, + "description": description, + "isolation": args.isolation, + "results": results, + "summary": { + "total": len(results), + "passed": sum(1 for r in results if r["pass"]), + "failed": sum(1 for r in results if not r["pass"]), + }, + } + write_json(run_dir / "triggers-result.json", output) + print(json.dumps(output, indent=2)) + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/skills/bmad-eval-runner/scripts/utils.py b/skills/bmad-eval-runner/scripts/utils.py new file mode 100644 index 0000000..92b6436 --- /dev/null +++ b/skills/bmad-eval-runner/scripts/utils.py @@ -0,0 +1,260 @@ +#!/usr/bin/env python3 +# /// script +# requires-python = ">=3.9" +# /// +"""Shared helpers for the eval runner.""" + +from __future__ import annotations + +import json +import re +import shutil +import subprocess +import sys +from datetime import datetime, timezone +from pathlib import Path + + +def parse_skill_md(skill_path: Path) -> tuple[str, str, str]: + """Return (name, description, body) from the skill's SKILL.md frontmatter.""" + text = (skill_path / "SKILL.md").read_text(encoding="utf-8") + fm_match = re.match(r"^---\s*\n(.*?)\n---\s*\n(.*)$", text, re.DOTALL) + if not fm_match: + raise ValueError(f"SKILL.md at {skill_path} is missing frontmatter") + 
frontmatter, body = fm_match.group(1), fm_match.group(2) + + name = None + description_lines: list[str] = [] + in_description = False + for line in frontmatter.splitlines(): + if line.startswith("name:"): + name = line.split(":", 1)[1].strip() + in_description = False + elif line.startswith("description:"): + value = line.split(":", 1)[1].strip() + if value in ("|", ">"): + in_description = True + else: + description_lines = [value] + in_description = False + elif in_description and line.startswith((" ", "\t")): + description_lines.append(line.strip()) + elif in_description: + in_description = False + + if not name: + raise ValueError(f"SKILL.md at {skill_path} is missing a name") + return name, " ".join(description_lines).strip(), body + + +def discover_project_root(skill_path: Path) -> Path: + """Walk up from the skill looking for _bmad/ or .git; default to skill's grandparent.""" + for parent in [skill_path, *skill_path.parents]: + if (parent / "_bmad").is_dir() or (parent / ".git").exists(): + return parent + return skill_path.parent.parent + + +def discover_evals( + skill_path: Path, + project_root: Path, + explicit: Path | None, +) -> dict[str, Path]: + """Locate evals.json and triggers.json. Return dict with keys 'evals' and/or 'triggers'.""" + found: dict[str, Path] = {} + + def check_dir(d: Path) -> None: + if not d.is_dir(): + return + for key, fname in (("evals", "evals.json"), ("triggers", "triggers.json")): + candidate = d / fname + if candidate.is_file() and key not in found: + found[key] = candidate + + if explicit is not None: + explicit = explicit.resolve() + if explicit.is_file(): + if explicit.name == "evals.json": + found["evals"] = explicit + elif explicit.name == "triggers.json": + found["triggers"] = explicit + elif explicit.is_dir(): + check_dir(explicit) + return found + + skill_name = skill_path.name + candidates: list[Path] = [ + skill_path / "evals", + skill_path.parent.parent / "evals" / skill_name, + project_root / "evals" / skill_name, + ] + for d in candidates: + check_dir(d) + if found: + break + + if not found: + evals_root = project_root / "evals" + if evals_root.is_dir(): + for sub in evals_root.rglob(skill_name): + if sub.is_dir(): + check_dir(sub) + if found: + break + + return found + + +def utc_now_iso() -> str: + return datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ") + + +def new_run_id(skill_name: str) -> str: + return f"{datetime.now().strftime('%Y%m%d-%H%M%S')}-{skill_name}" + + +def have_docker() -> bool: + if shutil.which("docker") is None: + return False + try: + result = subprocess.run( + ["docker", "info"], + stdout=subprocess.DEVNULL, + stderr=subprocess.DEVNULL, + timeout=5, + ) + return result.returncode == 0 + except Exception: + return False + + +def docker_image_present(image: str = "bmad-eval-runner:latest") -> bool: + if not have_docker(): + return False + try: + result = subprocess.run( + ["docker", "image", "inspect", image], + stdout=subprocess.DEVNULL, + stderr=subprocess.DEVNULL, + timeout=10, + ) + return result.returncode == 0 + except Exception: + return False + + +def read_macos_keychain_credentials() -> str | None: + """Read the Claude Code OAuth credentials JSON from the macOS Keychain. + + Returns the raw JSON string stored under service "Claude Code-credentials", + or None if unavailable (non-macOS, entry missing, or access denied). 
+
+    Called in the parent process — which owns the Keychain ACL — so the credential
+    can be staged into each isolated workspace's `.claude/.credentials.json` before
+    `claude -p` is launched. Without this, an isolated subprocess with HOME pointed
+    at an empty dir has no auth and every eval fails with "Not logged in."
+    """
+    if sys.platform != "darwin":
+        return None
+    try:
+        result = subprocess.run(
+            ["security", "find-generic-password", "-s", "Claude Code-credentials", "-w"],
+            capture_output=True,
+            timeout=5,
+        )
+        if result.returncode != 0:
+            return None
+        val = result.stdout.decode("utf-8", errors="replace").strip()
+        return val if val else None
+    except Exception:
+        return None
+
+
+def stage_credentials(claude_dir: Path, credentials_json: str | None) -> None:
+    """Write credentials_json to <claude_dir>/.credentials.json. No-op if None."""
+    if not credentials_json:
+        return
+    claude_dir.mkdir(parents=True, exist_ok=True)
+    (claude_dir / ".credentials.json").write_text(credentials_json, encoding="utf-8")
+
+
+def write_json(path: Path, data: object) -> None:
+    path.parent.mkdir(parents=True, exist_ok=True)
+    path.write_text(json.dumps(data, indent=2) + "\n", encoding="utf-8")
+
+
+def read_json(path: Path) -> object:
+    return json.loads(path.read_text(encoding="utf-8"))
+
+
+def parse_skill_dependencies(skill_path: Path) -> list[str]:
+    """Return skill names declared under 'dependencies:' in SKILL.md frontmatter."""
+    try:
+        text = (skill_path / "SKILL.md").read_text(encoding="utf-8")
+    except (FileNotFoundError, OSError):
+        return []
+    fm = re.match(r"^---\s*\n(.*?)\n---", text, re.DOTALL)
+    if not fm:
+        return []
+    deps: list[str] = []
+    in_deps = False
+    for line in fm.group(1).splitlines():
+        if re.match(r"^dependencies\s*:", line):
+            in_deps = True
+        elif in_deps:
+            m = re.match(r"^\s+-\s+(\S+)", line)
+            if m:
+                deps.append(m.group(1))
+            elif not line.startswith((" ", "\t")):
+                break
+    return deps
+
+
+def discover_setup_dirs(evals_file: Path, eval_id: str | None = None) -> list[Path]:
+    """Return ordered list of setup overlay dirs that exist.
+
+    base:     <evals-dir>/setup/
+    per-eval: <evals-dir>/<eval-id>/setup/
+
+    Applied base-first so per-eval overlays win on conflict.
+    """
+    evals_dir = evals_file.parent
+    dirs: list[Path] = []
+    base = evals_dir / "setup"
+    if base.is_dir():
+        dirs.append(base)
+    if eval_id:
+        per_eval = evals_dir / eval_id / "setup"
+        if per_eval.is_dir():
+            dirs.append(per_eval)
+    return dirs
+
+
+def apply_setup_overlay(setup_dirs: list[Path], dest: Path) -> None:
+    """Rsync each setup dir onto dest in order (base first, per-eval last)."""
+    dest.mkdir(parents=True, exist_ok=True)
+    for src in setup_dirs:
+        if not src.is_dir():
+            continue
+        subprocess.run(
+            ["rsync", "-a", f"{src}/", f"{dest}/"],
+            check=False,
+        )
+
+
+__all__ = [
+    "parse_skill_md",
+    "discover_project_root",
+    "discover_evals",
+    "utc_now_iso",
+    "new_run_id",
+    "have_docker",
+    "docker_image_present",
+    "read_macos_keychain_credentials",
+    "stage_credentials",
+    "write_json",
+    "read_json",
+    "parse_skill_dependencies",
+    "discover_setup_dirs",
+    "apply_setup_overlay",
+]
diff --git a/website/public/img/eval-test-types.png b/website/public/img/eval-test-types.png
new file mode 100644
index 0000000..aaa4064
Binary files /dev/null and b/website/public/img/eval-test-types.png differ