diff --git a/scripts/agent-evals/package-lock.json b/scripts/agent-evals/package-lock.json index 66c56a31cae..d5861a4978b 100644 --- a/scripts/agent-evals/package-lock.json +++ b/scripts/agent-evals/package-lock.json @@ -15,9 +15,11 @@ "tsx": "^4.11.0" }, "devDependencies": { + "@types/chai": "^4.3.0", + "@types/chai-fs": "^2.0.5", "@types/mocha": "^9.0.0", "@types/node": "^24.9.0", - "rxjs": "^7.8.2" + "chai-fs": "^2.0.0" } }, "node_modules/@esbuild/darwin-arm64": { @@ -60,6 +62,21 @@ "node": ">=14" } }, + "node_modules/@types/chai": { + "version": "4.3.20", + "resolved": "https://registry.npmjs.org/@types/chai/-/chai-4.3.20.tgz", + "integrity": "sha512-/pC9HAB5I/xMlc5FP77qjCnI16ChlJfW0tGa0IUcFn38VJrTV6DeZ60NU5KZBtaOZqjdpwTWohz5HU1RrhiYxQ==", + "dev": true + }, + "node_modules/@types/chai-fs": { + "version": "2.0.5", + "resolved": "https://registry.npmjs.org/@types/chai-fs/-/chai-fs-2.0.5.tgz", + "integrity": "sha512-Fusfcwil87QBYVN9vqDf46/+mb19bbDyQ7+dRNBZuLetgI5aipjHI0kI9sl2o02w9jTw70y7R58Ns6e40cfa1w==", + "dev": true, + "dependencies": { + "@types/chai": "*" + } + }, "node_modules/@types/mocha": { "version": "9.1.1", "resolved": "https://registry.npmjs.org/@types/mocha/-/mocha-9.1.1.tgz", @@ -91,11 +108,58 @@ "resolved": "https://registry.npmjs.org/argparse/-/argparse-2.0.1.tgz", "integrity": "sha512-8+9WqebbFzpX9OR+Wa6O29asIogeRMzcGtAINdpMHHyAg10f05aSFVBbcEqGf/PXw1EjAZ+q2/bEBg3DvurK3Q==" }, + "node_modules/array-events": { + "version": "0.2.0", + "resolved": "https://registry.npmjs.org/array-events/-/array-events-0.2.0.tgz", + "integrity": "sha512-Js6+JM/MxB72WeODWcUOOD/BWRqx6QTff8FWvweERQ0MdzViScUJV4XwRFnXvyvbfhuwWNrwhid7IJe2ux3r4Q==", + "dev": true, + "dependencies": { + "async-arrays": "*", + "extended-emitter": "*" + }, + "engines": { + "node": "*" + } + }, + "node_modules/assertion-error": { + "version": "1.1.0", + "resolved": "https://registry.npmjs.org/assertion-error/-/assertion-error-1.1.0.tgz", + "integrity": "sha512-jgsaNduz+ndvGyFt3uSuWqvy4lCnIJiovtouQN5JZHOKCS2QuhEdbcQHFhVksz2N2U9hXJo8odG7ETyWlEeuDw==", + "dev": true, + "peer": true, + "engines": { + "node": "*" + } + }, + "node_modules/async-arrays": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/async-arrays/-/async-arrays-2.0.0.tgz", + "integrity": "sha512-lMm6njQEX7gHbdX/b+PGBDXD/Vwg40BKSatlOaWNxrW/O5wYzARmoh+50h58s3hsyzGPU5+xYndwtc+m91yLiw==", + "dev": true, + "dependencies": { + "sift": "*" + }, + "engines": { + "node": "*" + } + }, "node_modules/balanced-match": { "version": "1.0.2", "resolved": "https://registry.npmjs.org/balanced-match/-/balanced-match-1.0.2.tgz", "integrity": "sha512-3oSeUO0TMV67hN1AmbXsK4yaqU7tjiHlbxRDZOpH0KW9+CeX4bRAaX0Anxt0tx2MrpRpWwQaPwIlISEJhYU5Pw==" }, + "node_modules/bit-mask": { + "version": "1.0.2", + "resolved": "https://registry.npmjs.org/bit-mask/-/bit-mask-1.0.2.tgz", + "integrity": "sha512-UGtq08LSiazxL4zVmBzrhdCWnT4RWx3JhhD/3crhfv8xxjnVHxf/WoVjEstjSUaZeZRP7kZrWNqup1VvUClCaQ==", + "dev": true, + "dependencies": { + "array-events": "^0.2.0" + }, + "engines": { + "node": "*" + } + }, "node_modules/brace-expansion": { "version": "2.0.2", "resolved": "https://registry.npmjs.org/brace-expansion/-/brace-expansion-2.0.2.tgz", @@ -109,6 +173,12 @@ "resolved": "https://registry.npmjs.org/browser-stdout/-/browser-stdout-1.3.1.tgz", "integrity": "sha512-qhAVI1+Av2X7qelOfAIYwXONood6XlZE/fXaBSmW/T5SzLAmCgzi+eiWE7fUvbHaeNBQH13UftjpXxsfLkMpgw==" }, + "node_modules/call-me-maybe": { + "version": "1.0.2", + "resolved": "https://registry.npmjs.org/call-me-maybe/-/call-me-maybe-1.0.2.tgz", + "integrity": "sha512-HpX65o1Hnr9HH25ojC1YGs7HCQLq0GCOibSaWER0eNpgJ/Z1MZv2mTc7+xh6WOPxbRVcmgbv4hGU+uSQ/2xFZQ==", + "dev": true + }, "node_modules/camelcase": { "version": "6.3.0", "resolved": "https://registry.npmjs.org/camelcase/-/camelcase-6.3.0.tgz", @@ -120,6 +190,41 @@ "url": "https://github.com/sponsors/sindresorhus" } }, + "node_modules/chai": { + "version": "4.5.0", + "resolved": "https://registry.npmjs.org/chai/-/chai-4.5.0.tgz", + "integrity": "sha512-RITGBfijLkBddZvnn8jdqoTypxvqbOLYQkGGxXzeFjVHvudaPw0HNFD9x928/eUwYWd2dPCugVqspGALTZZQKw==", + "dev": true, + "peer": true, + "dependencies": { + "assertion-error": "^1.1.0", + "check-error": "^1.0.3", + "deep-eql": "^4.1.3", + "get-func-name": "^2.0.2", + "loupe": "^2.3.6", + "pathval": "^1.1.1", + "type-detect": "^4.1.0" + }, + "engines": { + "node": ">=4" + } + }, + "node_modules/chai-fs": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/chai-fs/-/chai-fs-2.0.0.tgz", + "integrity": "sha512-PGfINFH/7XrQBnbp5/MnbFtzBL1//erKs+uoUdyo7KnW0mUX13L6bTO3Jm8OIexSVSh0Y+aaFhhbxyDtb679DA==", + "dev": true, + "dependencies": { + "bit-mask": "^1.0.1", + "readdir-enhanced": "^1.4.0" + }, + "engines": { + "node": ">=4" + }, + "peerDependencies": { + "chai": ">= 1.6.1 < 5" + } + }, "node_modules/chalk": { "version": "4.1.2", "resolved": "https://registry.npmjs.org/chalk/-/chalk-4.1.2.tgz", @@ -160,6 +265,19 @@ "node": ">=8" } }, + "node_modules/check-error": { + "version": "1.0.3", + "resolved": "https://registry.npmjs.org/check-error/-/check-error-1.0.3.tgz", + "integrity": "sha512-iKEoDYaRmd1mxM90a2OEfWhjsjPpYPuQ+lMYsoxB126+t8fw7ySEO48nmDg5COTjxDI65/Y2OWpeEHk3ZOe8zg==", + "dev": true, + "peer": true, + "dependencies": { + "get-func-name": "^2.0.2" + }, + "engines": { + "node": "*" + } + }, "node_modules/chokidar": { "version": "4.0.3", "resolved": "https://registry.npmjs.org/chokidar/-/chokidar-4.0.3.tgz", @@ -310,6 +428,19 @@ "url": "https://github.com/sponsors/sindresorhus" } }, + "node_modules/deep-eql": { + "version": "4.1.4", + "resolved": "https://registry.npmjs.org/deep-eql/-/deep-eql-4.1.4.tgz", + "integrity": "sha512-SUwdGfqdKOwxCPeVYjwSyRpJ7Z+fhpwIAtmCUdZIWZ/YP5R9WAsyuSgpLVDi9bjWoN2LXHNss/dk3urXtdQxGg==", + "dev": true, + "peer": true, + "dependencies": { + "type-detect": "^4.0.0" + }, + "engines": { + "node": ">=6" + } + }, "node_modules/diff": { "version": "7.0.0", "resolved": "https://registry.npmjs.org/diff/-/diff-7.0.0.tgz", @@ -328,6 +459,12 @@ "resolved": "https://registry.npmjs.org/emoji-regex/-/emoji-regex-9.2.2.tgz", "integrity": "sha512-L18DaJsXSUk2+42pv8mLs5jJT2hqFkFE4j21wOmgbUqsZ2hL72NsUU785g9RXgo3s0ZNgVl42TiHp3ZtOv/Vyg==" }, + "node_modules/es6-promise": { + "version": "4.2.8", + "resolved": "https://registry.npmjs.org/es6-promise/-/es6-promise-4.2.8.tgz", + "integrity": "sha512-HJDGx5daxeIvxdBxvG2cb9g4tEvwIk3i8+nhX0yGrYmZUzbkdg8QbDevheDB8gd0//uPj4c1EQua8Q+MViT0/w==", + "dev": true + }, "node_modules/esbuild": { "version": "0.25.10", "resolved": "https://registry.npmjs.org/esbuild/-/esbuild-0.25.10.tgz", @@ -387,6 +524,18 @@ "url": "https://github.com/sponsors/sindresorhus" } }, + "node_modules/extended-emitter": { + "version": "1.6.0", + "resolved": "https://registry.npmjs.org/extended-emitter/-/extended-emitter-1.6.0.tgz", + "integrity": "sha512-TNF4xMKL9aKYTR2cTNkKYMUnKzzjfV5Nl6TX45smJ/796CmaFt+KCyidgGdod0Kgj5VSL+ctNIGVf+i1l3e+UA==", + "dev": true, + "dependencies": { + "sift": "*" + }, + "engines": { + "node": "*" + } + }, "node_modules/find-up": { "version": "5.0.0", "resolved": "https://registry.npmjs.org/find-up/-/find-up-5.0.0.tgz", @@ -446,6 +595,16 @@ "node": "6.* || 8.* || >= 10.*" } }, + "node_modules/get-func-name": { + "version": "2.0.2", + "resolved": "https://registry.npmjs.org/get-func-name/-/get-func-name-2.0.2.tgz", + "integrity": "sha512-8vXOvuE167CtIc3OyItco7N/dpRtBbYOsPsXCz7X/PMnlGjYjSGuZJgM1Y7mmew7BKf9BqvLX2tnOVy1BBUsxQ==", + "dev": true, + "peer": true, + "engines": { + "node": "*" + } + }, "node_modules/get-tsconfig": { "version": "4.10.1", "resolved": "https://registry.npmjs.org/get-tsconfig/-/get-tsconfig-4.10.1.tgz", @@ -476,6 +635,12 @@ "url": "https://github.com/sponsors/isaacs" } }, + "node_modules/glob-to-regexp": { + "version": "0.3.0", + "resolved": "https://registry.npmjs.org/glob-to-regexp/-/glob-to-regexp-0.3.0.tgz", + "integrity": "sha512-Iozmtbqv0noj0uDDqoL0zNq0VBEfK2YFoMAZoxJe4cwphvLR+JskfF30QhXHOR4m3KrE6NLRYw+U9MRXvifyig==", + "dev": true + }, "node_modules/has-flag": { "version": "4.0.0", "resolved": "https://registry.npmjs.org/has-flag/-/has-flag-4.0.0.tgz", @@ -586,6 +751,16 @@ "url": "https://github.com/sponsors/sindresorhus" } }, + "node_modules/loupe": { + "version": "2.3.7", + "resolved": "https://registry.npmjs.org/loupe/-/loupe-2.3.7.tgz", + "integrity": "sha512-zSMINGVYkdpYSOBmLi0D1Uo7JU9nVdQKrHxC8eYlV+9YKK9WePqAlL7lSlorG/U2Fw1w0hTBmaa/jrQ3UbPHtA==", + "dev": true, + "peer": true, + "dependencies": { + "get-func-name": "^2.0.1" + } + }, "node_modules/lru-cache": { "version": "10.4.3", "resolved": "https://registry.npmjs.org/lru-cache/-/lru-cache-10.4.3.tgz", @@ -742,6 +917,16 @@ "url": "https://github.com/sponsors/isaacs" } }, + "node_modules/pathval": { + "version": "1.1.1", + "resolved": "https://registry.npmjs.org/pathval/-/pathval-1.1.1.tgz", + "integrity": "sha512-Dp6zGqpTdETdR63lehJYPeIOqpiNBNtc7BpWSLrOje7UaIsE5aY92r/AunQA7rsXvet3lrJ3JnZX29UPTKXyKQ==", + "dev": true, + "peer": true, + "engines": { + "node": "*" + } + }, "node_modules/picocolors": { "version": "1.1.1", "resolved": "https://registry.npmjs.org/picocolors/-/picocolors-1.1.1.tgz", @@ -755,6 +940,17 @@ "safe-buffer": "^5.1.0" } }, + "node_modules/readdir-enhanced": { + "version": "1.5.2", + "resolved": "https://registry.npmjs.org/readdir-enhanced/-/readdir-enhanced-1.5.2.tgz", + "integrity": "sha512-oncAoS9LLjy/+DeZfSAdZBI/iFJGcPCOp44RPFI6FIMHuxt5CC5P0cUZ9mET+EZB9ONhcEvAids/lVRkj0sTHw==", + "dev": true, + "dependencies": { + "call-me-maybe": "^1.0.1", + "es6-promise": "^4.1.0", + "glob-to-regexp": "^0.3.0" + } + }, "node_modules/readdirp": { "version": "4.1.2", "resolved": "https://registry.npmjs.org/readdirp/-/readdirp-4.1.2.tgz", @@ -783,15 +979,6 @@ "url": "https://github.com/privatenumber/resolve-pkg-maps?sponsor=1" } }, - "node_modules/rxjs": { - "version": "7.8.2", - "resolved": "https://registry.npmjs.org/rxjs/-/rxjs-7.8.2.tgz", - "integrity": "sha512-dhKf903U/PQZY6boNNtAGdWbG85WAbjT/1xYoZIC7FAY0yWapOBQVsVrDl58W86//e1VpMNBtRV4MaXfdMySFA==", - "dev": true, - "dependencies": { - "tslib": "^2.1.0" - } - }, "node_modules/safe-buffer": { "version": "5.2.1", "resolved": "https://registry.npmjs.org/safe-buffer/-/safe-buffer-5.2.1.tgz", @@ -838,6 +1025,12 @@ "node": ">=8" } }, + "node_modules/sift": { + "version": "17.1.3", + "resolved": "https://registry.npmjs.org/sift/-/sift-17.1.3.tgz", + "integrity": "sha512-Rtlj66/b0ICeFzYTuNvX/EF1igRbbnGSvEyT79McoZa/DeGhMyC5pWKOEsZKnpkqtSeovd5FL/bjHWC3CIIvCQ==", + "dev": true + }, "node_modules/signal-exit": { "version": "4.1.0", "resolved": "https://registry.npmjs.org/signal-exit/-/signal-exit-4.1.0.tgz", @@ -962,12 +1155,6 @@ "url": "https://github.com/chalk/supports-color?sponsor=1" } }, - "node_modules/tslib": { - "version": "2.8.1", - "resolved": "https://registry.npmjs.org/tslib/-/tslib-2.8.1.tgz", - "integrity": "sha512-oJFu94HQb+KVduSUQL7wnpmqnfmLsOA/nAh6b6EH0wCEoK0/mPeXU6c3wKDV83MkOuHPRHtSXKKU99IBazS/2w==", - "dev": true - }, "node_modules/tsx": { "version": "4.20.6", "resolved": "https://registry.npmjs.org/tsx/-/tsx-4.20.6.tgz", @@ -986,6 +1173,16 @@ "fsevents": "~2.3.3" } }, + "node_modules/type-detect": { + "version": "4.1.0", + "resolved": "https://registry.npmjs.org/type-detect/-/type-detect-4.1.0.tgz", + "integrity": "sha512-Acylog8/luQ8L7il+geoSxhEkazvkslg7PSNKOX59mbB9cOveP5aq9h74Y7YU8yDpJwetzQQrfIwtf4Wp4LKcw==", + "dev": true, + "peer": true, + "engines": { + "node": ">=4" + } + }, "node_modules/undici-types": { "version": "7.16.0", "resolved": "https://registry.npmjs.org/undici-types/-/undici-types-7.16.0.tgz", diff --git a/scripts/agent-evals/package.json b/scripts/agent-evals/package.json index 749871bf806..4b27e3b6a6d 100644 --- a/scripts/agent-evals/package.json +++ b/scripts/agent-evals/package.json @@ -11,8 +11,11 @@ "author": "", "license": "ISC", "devDependencies": { + "@types/chai": "^4.3.0", + "@types/chai-fs": "^2.0.5", "@types/mocha": "^9.0.0", - "@types/node": "^24.9.0" + "@types/node": "^24.9.0", + "chai-fs": "^2.0.0" }, "dependencies": { "mocha": "^11.7.1", diff --git a/scripts/agent-evals/src/runner/agent-test-runner.ts b/scripts/agent-evals/src/runner/agent-test-runner.ts index 2bb026df91b..b1065da6369 100644 --- a/scripts/agent-evals/src/runner/agent-test-runner.ts +++ b/scripts/agent-evals/src/runner/agent-test-runner.ts @@ -19,4 +19,10 @@ export interface AgentTestRunner { * an event is not found */ expectToolCalls(tools: ToolDef[]): Promise; + + /** + * Returns an assertion object for the path inside the run directory that can + * be asserted on via chai-fs's API: https://www.chaijs.com/plugins/chai-fs/ + */ + expectFs(filePath: string): Chai.Assertion; } diff --git a/scripts/agent-evals/src/runner/expect-files.ts b/scripts/agent-evals/src/runner/expect-files.ts new file mode 100644 index 00000000000..aef5665f88d --- /dev/null +++ b/scripts/agent-evals/src/runner/expect-files.ts @@ -0,0 +1,10 @@ +import * as chai from "chai"; +import chaiFs from "chai-fs"; + +// Prevent file paths in assertions from being cut off +chai.config.truncateThreshold = 0; +chai.use(chaiFs); + +export function expectFile(path: string): Chai.Assertion { + return chai.expect(path); +} diff --git a/scripts/agent-evals/src/runner/gemini-cli-runner.ts b/scripts/agent-evals/src/runner/gemini-cli-runner.ts index 389356f583d..e63877b5ef8 100644 --- a/scripts/agent-evals/src/runner/gemini-cli-runner.ts +++ b/scripts/agent-evals/src/runner/gemini-cli-runner.ts @@ -10,6 +10,7 @@ import { } from "./tool-matcher.js"; import fs from "fs"; import { throwFailure } from "./logging.js"; +import { expectFile } from "./expect-files.js"; const READY_PROMPT = "Type your message"; @@ -42,7 +43,7 @@ export class GeminiCliRunner implements AgentTestRunner { constructor( private readonly testName: string, testDir: string, - runDir: string, + private readonly runDir: string, ) { // Create a settings file to point the CLI to a local telemetry log this.telemetryPath = path.join(testDir, "telemetry.log"); @@ -92,6 +93,10 @@ export class GeminiCliRunner implements AgentTestRunner { await this.cli.kill(); } + expectFs(filePath: string): Chai.Assertion { + return expectFile(path.join(this.runDir, filePath)); + } + /** * Reads the agent's telemetry file and looks for the given event. Throws if * the event is not found diff --git a/scripts/agent-evals/src/tests/firebase-init.spec.ts b/scripts/agent-evals/src/tests/firebase-init.spec.ts index 3769d573d06..6cc522abe1e 100644 --- a/scripts/agent-evals/src/tests/firebase-init.spec.ts +++ b/scripts/agent-evals/src/tests/firebase-init.spec.ts @@ -3,13 +3,14 @@ import { AgentTestRunner } from "../runner/index.js"; import "../helpers/hooks.js"; describe("/firebase:init", function (this: Mocha.Suite) { - this.retries(2); + // this.retries(2); it("backend app", async function (this: Mocha.Context) { const run: AgentTestRunner = await startAgentTest(this, { templateName: "next-app-hello-world", }); + run.expectFs(".gemini/settings.json").to.be.a.file().and.contents.contains(["experimental"]); await run.type("/firebase:init"); await run.expectText("Backend Services"); await run.expectText("AI Logic");