diff --git a/.github/workflows/eval-e2e.yml b/.github/workflows/eval-e2e.yml deleted file mode 100644 index 823f4a72..00000000 --- a/.github/workflows/eval-e2e.yml +++ /dev/null @@ -1,33 +0,0 @@ -on: - workflow_call: - inputs: - os: - required: false - description: "runs-on property, ex: ubuntu-latest, windows-latest" - type: string - default: "ubuntu-latest" - -jobs: - eval: - name: 'yarn:eval' - runs-on: ${{ input.os }} - steps: - - name: Configure git longpaths if on Windows - if: ${{ runner.os == 'Windows' }} - run: git config --system core.longpaths true - - uses: actions/checkout@v4 - - uses: actions/setup-node@v4 - with: - node-version: lts/* - cache: yarn - - uses: actions/setup-java@v4 - with: - distribution: 'temurin' - java-version: '11' - - run: yarn - - run: yarn build - - name: Eval-based E2E tests - shell: bash - run: | - cd packages/mcp - yarn:eval diff --git a/.github/workflows/eval.yml b/.github/workflows/eval.yml new file mode 100644 index 00000000..96200133 --- /dev/null +++ b/.github/workflows/eval.yml @@ -0,0 +1,45 @@ +on: + workflow_dispatch: + +jobs: + eval-discoverability: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - uses: actions/setup-node@v4 + with: + node-version: lts/* + cache: yarn + - uses: actions/setup-java@v4 + with: + distribution: 'temurin' + java-version: '11' + - run: yarn + - run: yarn build + - name: Eval-based discoverability tests + run: | + cd packages/mcp + yarn test:eval:discoverability + env: + GOOGLE_GENERATIVE_AI_API_KEY: ${{ secrets.GOOGLE_GENERATIVE_AI_API_KEY }} + eval-e2e: + needs: [eval-discoverability] + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - uses: actions/setup-node@v4 + with: + node-version: lts/* + cache: yarn + - uses: actions/setup-java@v4 + with: + distribution: 'temurin' + java-version: '11' + - run: yarn + - run: yarn build + - name: Eval-based E2E tests + run: | + cd packages/mcp + yarn test:eval:e2e + env: + GOOGLE_GENERATIVE_AI_API_KEY: ${{ secrets.GOOGLE_GENERATIVE_AI_API_KEY }} diff --git a/TESTING.md b/TESTING.md new file mode 100644 index 00000000..0c983f1e --- /dev/null +++ b/TESTING.md @@ -0,0 +1,254 @@ +# Testing MCP tools + +This doc covers the different types of tests used to validate MCP tools and ensure they work correctly with LLM agents. + +## Types of tests + +### E2E Tool Tests + +E2E tool tests focus on the tool logic (no LLM inference required) by using an MCP client to call the tool. Write test cases to assert that: +* invalid param combinations/values return a tool error ([`isError: true` set in the response](https://modelcontextprotocol.io/specification/2025-06-18/server/tools#error-handling)) +* valid user flows run successfully +* [a specific toolset enables your tools](https://github.com/salesforcecli/mcp/blob/15c13cc8f56cf0360c95989c839bcedd5e67a817/packages/mcp-provider-code-analyzer/test/e2e/run_code_analyzer-e2e.test.ts#L23) + +These tests will run on each PR in this repo in linux and windows. 
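For example, an error-path case might look like the following sketch (it assumes the connected `client` and the `toolSchema` shown in the Setup section below; the invalid alias and the chai-style assertion are illustrative):

```typescript
// Hypothetical error-path case: an unauthenticated username/alias should surface as a
// tool error (isError: true) in the response, not as a thrown exception or a success.
const errorResult = await client.callTool(toolSchema, {
  name: 'run_soql_query',
  params: {
    query: 'SELECT Name FROM Account LIMIT 1',
    usernameOrAlias: 'not-an-authenticated-org', // assumption: this alias is not authenticated
    directory: '/path/to/project',
  },
});

expect(errorResult.isError).to.equal(true);
```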
#### Setup

Use the `@salesforce/mcp-test-client` MCP client to start the server and client:

```typescript
import { McpTestClient, DxMcpTransport } from '@salesforce/mcp-test-client';
import { z } from 'zod';
import { expect, assert } from 'chai'; // chai-style assertions are assumed in this example

const client = new McpTestClient({ timeout: 300_000 });

// Define tool schema
// NOTE: to avoid duplication you may want to import this from your tool
const toolSchema = {
  name: z.literal('run_soql_query'),
  params: z.object({
    query: z.string(),
    usernameOrAlias: z.string(),
    directory: z.string()
  })
};

// Connect with DX transport
const transport = DxMcpTransport();
await client.connect(transport);

// Call tool directly
const result = await client.callTool(toolSchema, {
  name: 'run_soql_query',
  params: {
    query: 'SELECT Name FROM Account LIMIT 8',
    usernameOrAlias: 'test-org',
    directory: '/path/to/project'
  }
});

expect(result.isError).to.equal(false);
expect(result.content.length).to.equal(1);
if (result.content[0].type !== 'text') assert.fail();

const responseText = result.content[0].text;
expect(responseText).to.contain('SOQL query results:');

// Parse the query result JSON
const queryMatch = responseText.match(/SOQL query results:\s*({[\s\S]*})/);
expect(queryMatch).to.not.be.null;

const queryResult = JSON.parse(queryMatch![1]) as QueryResult;
expect(queryResult.totalSize).to.equal(8);
expect(queryResult.done).to.be.true;
expect(queryResult.records).to.be.an('array');
expect(queryResult.records.length).to.equal(8);
```

See [packages/mcp-provider-dx-core/test/e2e/run_soql_query.test.ts](./packages/mcp-provider-dx-core/test/e2e/run_soql_query.test.ts) for a complete example.

> [!IMPORTANT]
> These tests should be located in each tool provider package, not in the main MCP package.
>
> `@salesforce/mcp-test-client` is a package inside this monorepo and isn't published. You should add it as a devDependency matching its current version to get it in your local provider package:
>
> https://github.com/salesforcecli/mcp/tree/main/packages/mcp-test-client

You can use any test runner you want; we recommend Vitest or Mocha.

### Evals

Evaluation tests use LLMs to evaluate test results. We use two types of tests, powered by [vitest-evals](https://github.com/getsentry/vitest-evals/).

#### Discoverability

These tests allow you to validate that certain prompts will call your tool with the right parameters.
Each prompt (`input`) should be accompanied by an expected list of tool calls with their params (`expectedTools`); the `ToolPredictionScorer` will then:
* Load all DX MCP tools into context (even non-GA ones) along with the test data (input & expected tool calls)
* Score the expected tool calls based on the MCP tools' metadata.

Unlike other E2E tests, these don't make any tool calls, so they are cheaper to run (each test case is a single roundtrip whose token count comes mostly from sending all DX MCP tool metadata).
Example:

```typescript
import { describeEval } from 'vitest-evals';
import { NoOpTaskRunner } from '../utils/runners.js';
import { ToolPredictionScorer } from '../utils/scorers/toolPredictionScorer.js';

describeEval('run_soql_query', {
  data: async () => [
    {
      input: 'List the name of the Property__c records in my org, ordered in ascending order by their name.',
      expectedTools: [
        {
          name: 'run_soql_query',
          arguments: {
            query: 'SELECT Name FROM Property__c ORDER BY Name ASC',
            usernameOrAlias: 'ebikes',
            directory: process.env.SF_EVAL_PROMPT_PROJECT_DIR,
          },
        },
      ],
    },
  ],
  task: NoOpTaskRunner(),
  scorers: [ToolPredictionScorer()],
  threshold: 1.0,
  timeout: 30_000,
});
```

See [packages/mcp/test/evals/discoverability/run_soql_query.eval.ts](./packages/mcp/test/evals/discoverability/run_soql_query.eval.ts) for a complete example.


#### E2E Evals

These tests intend to cover a real scenario by running each test case in an agent loop with all DX MCP tools exposed. The agent will stop once the task is finished (or if it can't continue).

Use the Factuality scorer to evaluate the agent response (`response should include X records`, `it should list all tests executed`, etc.).
You can also use vitest-evals' `ToolCallScorer` to evaluate that tools were called correctly.

```typescript
import { describeEval, ToolCallScorer } from 'vitest-evals';
import { TaskRunner } from '../utils/runners.js';
import { Factuality } from '../utils/scorers/factuality.js';

describeEval('SOQL queries', {
  data: async () => [
    {
      input: 'List the name of the Property__c records in my org, ordered in ascending order by their name.',
      expected: 'The response should include these records: Architectural Details, City Living...',
      expectedTools: [
        {
          name: 'run_soql_query',
          arguments: {
            query: 'SELECT Name FROM Property__c ORDER BY Name ASC',
            usernameOrAlias: orgUsername,
            directory: projectDir,
          },
        },
      ],
    },
  ],
  task: (input: string) =>
    TaskRunner({
      promptOptions: {
        currentOpenFile: '',
        currentOpenWorkspace: projectDir,
      },
    })(input),
  scorers: [
    Factuality(),
    ToolCallScorer({
      ordered: true,
      params: 'fuzzy',
    }),
  ],
  threshold: 0.8,
  timeout: 300_000,
});
```

> [!TIP]
> If you need to set up an SFDX project with a scratch org, use the `cli-plugins-testkit` library:
>
> https://github.com/salesforcecli/cli-plugins-testkit


See [packages/mcp/test/evals/e2e/run_soql_query.eval.ts](./packages/mcp/test/evals/e2e/run_soql_query.eval.ts) for a complete example.

#### Setting Prompt Options

Configure the task runner with prompt settings:

```typescript
task: (input: string) =>
  TaskRunner({
    promptOptions: {
      // this sets the file/workspace as if they were open in an IDE chat context
      currentOpenFile: apexClassFilePath,
      currentOpenWorkspace: projectDir,
    },
  })(input),
```

#### Scoring

Both eval types use scoring mechanisms:

- **ToolPredictionScorer**: Validates tool selection and parameters
- **Factuality**: Evaluates response accuracy
- **ToolCallScorer**: Checks tool execution with options like:
  - `ordered: true` - Tools must be called in expected order
  - `params: 'fuzzy'` - Allow parameter variations (useful for queries)

Set thresholds (0.0-1.0) to define passing scores. For discoverability tests, use `threshold: 1.0` for exact matches. For E2E tests, use lower thresholds like `threshold: 0.8` to account for response variations.
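Scorers are plain functions that return an object with a `score` between 0 and 1, so you can also write your own when the built-in ones don't fit. For example, the `describe_code_analyzer_rule` E2E eval uses this simple substring-matching scorer, which awards partial credit for each expected string found in the output:

```typescript
export function outputIncludesExpectationArray(opts: { input: string; output: string; expected: string[] }) {
  let score = 0;
  const increment = 1 / opts.expected.length;
  for (const expected of opts.expected) {
    if (opts.output.toLowerCase().includes(expected.toLowerCase())) {
      score += increment;
    }
  }
  return { score };
}
```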
### What tests should I write for my tools?

This will vary depending on the complexity of your tool; the recommended guidance is:

1. E2E tool tests
Use them to cover the tool logic extensively. Focus on:
* Cross-OS compatibility (Linux and Windows)
* Blocking invalid combinations of parameters
* [Validating the tool response properly sets `isError` on errors](https://modelcontextprotocol.io/specification/2025-06-18/schema#calltoolresult)

Even if your tool is mostly "instructions-only" (it returns instructions for LLM agents), it's good to have coverage: these tests make a real tool call, so they ensure the tool is always callable.

2. Discoverability Eval tests
All tools should have at least one test covering the most common utterances you expect users to type that will call your tool.
These tests evaluate all tools in the server, so having coverage protects your tools from being "shadowed" if a new tool with a similar description is added.

3. E2E Evals
These should only cover the most common user flows (they are more expensive to run). Tips:
* The user utterance should cover a user flow involving at least 2 tool calls
* Use the `Factuality` scorer to validate the final agent response (`input: run the Geocoding apex test` -> `output: successfully ran the 3 Geocoding apex tests ...`)
* Use the `ToolCallScorer` scorer to assert the chain of tools (and their params) was called in the correct order

Unsure if your tool should have an E2E eval test?
* If it's an "instructions-only" tool, discoverability eval tests may be enough since you mostly care about the tool being called (after that the agent might call built-in tools/other CLI commands)
* Is your tool's response critical when combined with other tools in a user task? If not, then discoverability + E2E tool tests might be enough.

Good scenarios for E2E eval tests:

* The `run_soql_query` tool returns raw SOQL results and we want to evaluate that the agent response contains the expected records.
* `get_username` is used to resolve org usernames, so for `deploy_metadata` we validate that a user utterance like:
```
Deploy this file and run the GeocodingServiceTest tests, then summarize the apex test results.
```
calls `get_username` -> `deploy_metadata`

Bad scenarios:

* "instructions-only" tools
* CRUD tools whose responses aren't required for an agent task (e.g. `create_scratch_org` output doesn't matter as long as it's successful; that can be covered in a simple E2E tool test)

### FAQ

* Which LLM models are we testing against?

Only `gemini-2.5-flash`, using the Gemini API.
We plan to support more models once we switch to Salesforce's LLMG.

diff --git a/packages/mcp-provider-dx-core/src/tools/run_soql_query.ts b/packages/mcp-provider-dx-core/src/tools/run_soql_query.ts index b68f624d..d78ae226 100644 --- a/packages/mcp-provider-dx-core/src/tools/run_soql_query.ts +++ b/packages/mcp-provider-dx-core/src/tools/run_soql_query.ts @@ -37,7 +37,7 @@ export const queryOrgParamsSchema = z.object({ query: z.string().describe('SOQL query to run'), usernameOrAlias: usernameOrAliasParam, directory: directoryParam, - useToolingApi: useToolingApiParam, + useToolingApi: useToolingApiParam.describe('Use the Tooling API.
Always set to true when querying a tooling sobject.'), }); type InputArgs = z.infer; diff --git a/packages/mcp/package.json b/packages/mcp/package.json index ca2490fb..b6ba70fd 100644 --- a/packages/mcp/package.json +++ b/packages/mcp/package.json @@ -18,8 +18,8 @@ "lint": "wireit", "start": "yarn build && npm link && mcp-inspector sf-mcp-server", "test": "wireit", - "test:eval": "wireit", - "test:only": "wireit" + "test:eval:discoverability": "wireit", + "test:eval:e2e": "wireit" }, "repository": "salesforcecli/mcp", "bugs": { @@ -59,7 +59,6 @@ }, "devDependencies": { "@ai-sdk/google": "^1.2.22", - "@ai-sdk/openai": "^1.3.23", "@salesforce/cli-plugins-testkit": "^5.3.39", "@salesforce/dev-config": "^4.3.2", "@salesforce/prettier-config": "^0.0.3", @@ -134,10 +133,20 @@ ], "output": [] }, - "test:eval": { - "command": "vitest run", + "test:eval:discoverability": { + "command": "vitest --config test/evals/discoverability/vitest.config.ts", "files": [ - "test/**/*.eval.ts" + "test/evals/discoverability/**/*.eval.ts" + ], + "dependencies": [ + "test:compile" + ], + "output": [] + }, + "test:eval:e2e": { + "command": "vitest --config test/evals/e2e/vitest.config.ts", + "files": [ + "test/evals/e2e/**/*.eval.ts" ], "dependencies": [ "test:compile" diff --git a/packages/mcp/test/evals/discoverability/deploy_metadata.eval.ts b/packages/mcp/test/evals/discoverability/deploy_metadata.eval.ts new file mode 100644 index 00000000..556217b7 --- /dev/null +++ b/packages/mcp/test/evals/discoverability/deploy_metadata.eval.ts @@ -0,0 +1,83 @@ +/* + * Copyright 2025, Salesforce, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +import { describeEval } from 'vitest-evals'; +import { NoOpTaskRunner } from '../utils/runners.js'; +import { ToolPredictionScorer } from '../utils/scorers/toolPredictionScorer.js'; + +describeEval('deploy', { + data: async () => [ + { + input: 'Deploy this file to my default org and run all apex tests in deployment', + expectedTools: [ + { + name: 'get_username', + arguments: { + directory: process.env.SF_EVAL_PROMPT_PROJECT_DIR, + defaultTargetOrg: true, + }, + }, + { + name: 'deploy_metadata', + arguments: { + sourceDir: [process.env.SF_EVAL_PROMPT_OPEN_FILEPATH], + directory: process.env.SF_EVAL_PROMPT_PROJECT_DIR, + apexTestLevel: 'RunAllTestsInOrg', + usernameOrAlias: 'ebikes-default-org', + }, + }, + ], + }, + { + input: 'Deploy this project to my ebikes org', + expectedTools: [ + { + name: 'deploy_metadata', + arguments: { + usernameOrAlias: 'ebikes', + directory: process.env.SF_EVAL_PROMPT_PROJECT_DIR, + }, + }, + ], + }, + { + input: 'Deploy this file and run the GeocodingServiceTest tests', + expectedTools: [ + { + // user doesn't specify which org to deploy to -> discover it via `get_username` + name: 'get_username', + arguments: { + directory: process.env.SF_EVAL_PROMPT_PROJECT_DIR, + defaultTargetOrg: true, + }, + }, + { + name: 'deploy_metadata', + arguments: { + usernameOrAlias: 'default-org', + sourceDir: [process.env.SF_EVAL_PROMPT_OPEN_FILEPATH], + directory: process.env.SF_EVAL_PROMPT_PROJECT_DIR, + // IMPORTANT: there's a `run_apex_test` available but for these "run test during deployment" scenarios we want to ensure they are only run via `deploy_metadata`, it's a pretty common operation for an agentic loop (test failures rollback deployment) + apexTests: ['GeocodingServiceTest'], + }, + }, + ], + }, + ], + task: NoOpTaskRunner(), + scorers: [ToolPredictionScorer()], + threshold: 1.0, + timeout: 30_000, +}); diff --git a/packages/mcp/test/evals/discoverability/run_apex_test.eval.ts b/packages/mcp/test/evals/discoverability/run_apex_test.eval.ts new file mode 100644 index 00000000..ee7f351d --- /dev/null +++ b/packages/mcp/test/evals/discoverability/run_apex_test.eval.ts @@ -0,0 +1,61 @@ +/* + * Copyright 2025, Salesforce, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +import { describeEval } from 'vitest-evals'; +import { NoOpTaskRunner } from '../utils/runners.js'; +import { ToolPredictionScorer } from '../utils/scorers/toolPredictionScorer.js'; + +describeEval('', { + data: async () => [ + { + input: 'Run the GeocodingServiceTest and FileUtilitiesTest tests in the dreamhouse org', + expectedTools: [ + { + name: 'run_apex_test', + arguments: { + usernameOrAlias: 'dreamhouse', + classNames: ['GeocodingServiceTest', 'FileUtilitiesTest'], + testLevel: 'RunSpecifiedTests', + directory: process.env.SF_EVAL_PROMPT_PROJECT_DIR, + }, + }, + ], + }, + { + input: 'Run all apex tests in the org', + expectedTools: [ + { + name: 'get_username', + arguments: { + directory: process.env.SF_EVAL_PROMPT_PROJECT_DIR, + defaultTargetOrg: true, + }, + }, + { + name: 'run_apex_test', + arguments: { + usernameOrAlias: 'default-org', + testLevel: 'RunAllTestsInOrg', + directory: process.env.SF_EVAL_PROMPT_PROJECT_DIR, + }, + }, + ], + }, + ], + task: NoOpTaskRunner(), + scorers: [ToolPredictionScorer()], + threshold: 1.0, + timeout: 30_000, +}); diff --git a/packages/mcp/test/evals/discoverability/run_soql_query.eval.ts b/packages/mcp/test/evals/discoverability/run_soql_query.eval.ts new file mode 100644 index 00000000..12190a6e --- /dev/null +++ b/packages/mcp/test/evals/discoverability/run_soql_query.eval.ts @@ -0,0 +1,61 @@ +/* + * Copyright 2025, Salesforce, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +import { describeEval } from 'vitest-evals'; +import { NoOpTaskRunner } from '../utils/runners.js'; +import { ToolPredictionScorer } from '../utils/scorers/toolPredictionScorer.js'; + +describeEval('run_soql_query', { + data: async () => [ + { + input: 'List the name of the Property__c records in my org, ordered in ascending order by their name.', + expectedTools: [ + { + name: 'get_username', + arguments: { + directory: process.env.SF_EVAL_PROMPT_PROJECT_DIR, + defaultTargetOrg: true, + }, + }, + { + name: 'run_soql_query', + arguments: { + query: 'SELECT Name FROM Property__c ORDER BY Name ASC', + usernameOrAlias: 'ebikes', + directory: process.env.SF_EVAL_PROMPT_PROJECT_DIR, + }, + }, + ], + }, + { + input: 'Get the coverage of the GeocodingService apex class, you can query the ApexCodeCoverage tooling object', + expectedTools: [ + { + name: 'run_soql_query', + arguments: { + usernameOrAlias: 'ebikes', + query: 'SELECT Coverage FROM ApexCodeCoverage WHERE ApexClassOrTriggerId = ‘01pD000000066GR’', + useToolingApi: true, + directory: process.env.SF_EVAL_PROMPT_PROJECT_DIR, + }, + }, + ], + }, + ], + task: NoOpTaskRunner(), + scorers: [ToolPredictionScorer()], + threshold: 1.0, + timeout: 30_000, +}); diff --git a/packages/mcp/test/evals/describe_code_analyzer_rule.eval.ts b/packages/mcp/test/evals/discoverability/vitest.config.ts similarity index 54% rename from packages/mcp/test/evals/describe_code_analyzer_rule.eval.ts rename to packages/mcp/test/evals/discoverability/vitest.config.ts index e7f9f33f..2605bddc 100644 --- a/packages/mcp/test/evals/describe_code_analyzer_rule.eval.ts +++ b/packages/mcp/test/evals/discoverability/vitest.config.ts @@ -13,17 +13,15 @@ * See the License for the specific language governing permissions and * limitations under the License. */ +import { defineConfig } from 'vitest/config'; -import { describeEval } from 'vitest-evals'; -import { TaskRunner, outputIncludesExpectationArray } from './utils.js'; - -describeEval('describe_code_analyzer_rule', { - data: async () => [{ - input: 'tell me the tags that are associated with the Code Analysis Rule named VFUnescapeEl, which is a rule for the pmd engine', - expected: ['Recommended', 'Security', 'Visualforce'] - }], - task: TaskRunner(), - scorers: [outputIncludesExpectationArray], - threshold: 0.9, - timeout: 60_000 -}); \ No newline at end of file +export default defineConfig({ + test: { + include: ['test/evals/discoverability/*.eval.{ts,mts}'], + reporters: ['vitest-evals/reporter'], + env: { + SF_EVAL_PROMPT_PROJECT_DIR: '/Users/codey/projects/dreamhouse-lwc', + SF_EVAL_PROMPT_OPEN_FILEPATH: '/Users/codey/projects/dreamhouse-lwc/force-app/main/default/classes/GeocodingService.cls' + } + }, +}); diff --git a/packages/mcp/test/evals/e2e/deploy_metadata.eval.ts b/packages/mcp/test/evals/e2e/deploy_metadata.eval.ts new file mode 100644 index 00000000..77ad5963 --- /dev/null +++ b/packages/mcp/test/evals/e2e/deploy_metadata.eval.ts @@ -0,0 +1,125 @@ +/* + * Copyright 2025, Salesforce, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +import path from 'node:path'; +import { afterAll, beforeAll } from 'vitest'; +import { TestSession } from '@salesforce/cli-plugins-testkit'; +import { describeEval, ToolCallScorer } from 'vitest-evals'; +import { TaskRunner } from '../utils/runners.js'; +import { Factuality } from '../utils/scorers/factuality.js'; + +let testSession: TestSession; +let orgUsername: string; +let projectDir: string; +let currentOpenFile: string; + +beforeAll(async () => { + testSession = await TestSession.create({ + project: { gitClone: 'https://github.com/trailheadapps/dreamhouse-lwc' }, + scratchOrgs: [{ setDefault: true, config: path.join('config', 'project-scratch-def.json') }], + devhubAuthStrategy: 'AUTO', + }); + + projectDir = testSession.project.dir; + currentOpenFile = path.join(projectDir, 'force-app', 'main', 'default', 'classes', 'GeocodingServiceTest.cls'); + + // get default scratch org username + orgUsername = [...testSession.orgs.keys()][0]; +}, 600_000); + +afterAll(async () => { + await testSession.clean(); +}); + +describeEval('deploy_metadata', { + data: async () => [ + { + input: + 'Deploy this project and run all Apex tests, then assign the dreamhouse permset and summarize the apex test results.', + expected: 'It should have successfully deployed the project and executed all 11 tests without failures', + expectedTools: (() => { + [ + { + name: 'get_username', + arguments: { + defaultTargetOrg: true, + defaultDevHub: false, + directory: projectDir, + }, + }, + { + name: 'deploy_metadata', + arguments: { + apexTestLevel: 'RunAllTestsInOrg', + usernameOrAlias: orgUsername, + directory: projectDir, + }, + }, + { + name: 'assign_permission_set', + arguments: { + permissionSetName: 'dreamhouse', + usernameOrAlias: orgUsername, + directory: projectDir, + }, + }, + ]; + })(), + }, + { + input: 'Deploy this file and run the GeocodingServiceTest tests, then summarize the apex test results.', + expected: + 'It should have deployed 1 component (GeocodingServiceTest class) and successfully executed the "GeocodingServiceTest.successResponse", "GeocodingServiceTest.blankAddress" and "GeocodingServiceTest.errorResponse" tests.', + expectedTools: (() => { + [ + { + name: 'get_username', + arguments: { + defaultTargetOrg: true, + defaultDevHub: false, + directory: projectDir, + }, + }, + { + name: 'deploy_metadata', + arguments: { + apexTestLevel: 'RunAllTestsInOrg', + apexTests: ['GeocodingServiceTest'], + sourceDir: [currentOpenFile], + usernameOrAlias: orgUsername, + directory: projectDir, + }, + }, + ]; + })(), + }, + ], + task: (input: string) => + TaskRunner({ + promptOptions: { + currentOpenFile, + currentOpenWorkspace: projectDir, + }, + })(input), + scorers: [ + Factuality(), + ToolCallScorer({ + ordered: true, + params: 'strict', + }), + ], + threshold: 0.8, + timeout: 600_000, +}); diff --git a/packages/mcp/test/evals/e2e/describe_code_analyzer_rule.eval.ts b/packages/mcp/test/evals/e2e/describe_code_analyzer_rule.eval.ts new file mode 100644 index 00000000..75a6fed8 --- /dev/null +++ b/packages/mcp/test/evals/e2e/describe_code_analyzer_rule.eval.ts @@ -0,0 +1,32 @@ +import { describeEval } from 'vitest-evals'; +import { TaskRunner } from '../utils/runners.js'; + +describeEval('describe_code_analyzer_rule', { + data: async () => [ + { + input: + 'tell me the tags that are associated with the Code Analysis Rule named VFUnescapeEl, which is a rule for the pmd engine', + expected: 
['Recommended', 'Security', 'Visualforce'], + }, + ], + task: TaskRunner(), + scorers: [outputIncludesExpectationArray], + threshold: 0.9, + timeout: 60_000, +}); + +export function outputIncludesExpectationArray(opts: { input: string; output: string; expected: string[] }) { + let score: number = 0; + //console.log(`output is ${opts.output}`); + const increment: number = 1 / opts.expected.length; + for (const expected of opts.expected) { + if (opts.output.toLowerCase().includes(expected.toLowerCase())) { + //console.log(`contained ${expected}, icnrementing`); + score += increment; + } + //console.log(`score is now ${score}`) + } + return { + score, + }; +} diff --git a/packages/mcp/test/evals/e2e/run_code_analyzer.eval.ts b/packages/mcp/test/evals/e2e/run_code_analyzer.eval.ts new file mode 100644 index 00000000..9a55509b --- /dev/null +++ b/packages/mcp/test/evals/e2e/run_code_analyzer.eval.ts @@ -0,0 +1,27 @@ +import path from 'node:path'; +import { fileURLToPath } from 'node:url'; +import { describeEval } from 'vitest-evals'; +import { TaskRunner } from '../utils/runners.js'; + +const __filename = fileURLToPath(import.meta.url); +const __dirname = path.dirname(__filename); + +const pathToTarget: string = path.join(__dirname, '..', 'fixtures', 'sample-targets', 'SampleTarget1.cls'); + +describeEval('run_code_analyzer', { + data: async () => [ + { + input: `Run code analysis against ${pathToTarget}, and tell me the number of violations in that file using the response format "There are X violations".`, + expected: [6], + }, + ], + task: TaskRunner(), + scorers: [ + (opts: { output: string; expected: number }) => { + const score: number = opts.output === `There are ${opts.expected} violations.` ? 1 : 0; + return { score }; + }, + ], + threshold: 1.0, + timeout: 60_000, +}); diff --git a/packages/mcp/test/evals/e2e/run_soql_query.eval.ts b/packages/mcp/test/evals/e2e/run_soql_query.eval.ts new file mode 100644 index 00000000..2399ba28 --- /dev/null +++ b/packages/mcp/test/evals/e2e/run_soql_query.eval.ts @@ -0,0 +1,129 @@ +/* + * Copyright 2025, Salesforce, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +import path from 'node:path'; +import { afterAll, beforeAll } from 'vitest'; +import { execCmd, TestSession } from '@salesforce/cli-plugins-testkit'; +import { describeEval, ToolCallScorer } from 'vitest-evals'; +import { TaskRunner } from '../utils/runners.js'; +import { Factuality } from '../utils/scorers/factuality.js'; + +let testSession: TestSession; +let orgUsername: string; +let projectDir: string; + +beforeAll(async () => { + testSession = await TestSession.create({ + project: { gitClone: 'https://github.com/trailheadapps/dreamhouse-lwc' }, + scratchOrgs: [{ setDefault: true, config: path.join('config', 'project-scratch-def.json') }], + devhubAuthStrategy: 'AUTO', + }); + + projectDir = testSession.project.dir; + + await execCmd('project deploy start', { + cli: 'sf', + ensureExitCode: 0, + async: true, + }); + + await execCmd('org assign permset -n dreamhouse', { + cli: 'sf', + ensureExitCode: 0, + async: true, + }); + + await execCmd(`data tree import -p ${path.join(testSession.project.dir, 'data', 'sample-data-plan.json')}`, { + cli: 'sf', + ensureExitCode: 0, + async: true, + }); + + // get default scratch org username + orgUsername = [...testSession.orgs.keys()][0]; +}, 600_000); + +afterAll(async () => { + await testSession.clean(); +}); + +describeEval('SOQL queries', { + data: async () => [ + { + input: 'List the name of the Property__c records in my org, ordered in ascending order by their name.', + expected: `The response should include these records: +Architectural Details +City Living +Contemporary City Living +Contemporary Luxury +Heart of Harvard Square +Modern City Living +Quiet Retreat +Seaport District Retreat +Stunning Colonial +Stunning Victorian +Ultimate Sophistication +Waterfront in the City +`, + // IMPORTANT: + // Get expected tools data at runtime rather than at module initialization time to be able to access + // test session context (set in the beforeAll hook). + // + // This is needed because `projectDir` and `orgUsername` are not initialized when declared, so we want to + // read them at test runtime. + expectedTools: (() => { + [ + { + name: 'get_username', + arguments: { + defaultTargetOrg: true, + defaultDevHub: false, + directory: projectDir, + }, + }, + { + name: 'run_soql_query', + arguments: { + query: 'SELECT Name FROM Property__c ORDER BY Name ASC', + usernameOrAlias: orgUsername, + directory: projectDir, + }, + }, + ]; + })(), + }, + ], + // IMPORTANT: + // Create the task runner at runtime rather than at module initialization time to be able to access + // test session context (set in the beforeAll hook). + task: (input: string) => + TaskRunner({ + promptOptions: { + // not needed for this test + currentOpenFile: '', + currentOpenWorkspace: projectDir, + }, + })(input), + scorers: [ + Factuality(), + ToolCallScorer({ + ordered: true, + // fuzzy to account for possible SOQL query diffs agains the expected query (different clauses, casing, etc) + params: 'fuzzy', + }), + ], + threshold: 0.8, + timeout: 300_000, +}); diff --git a/packages/mcp/test/evals/e2e/vitest.config.ts b/packages/mcp/test/evals/e2e/vitest.config.ts new file mode 100644 index 00000000..cd471465 --- /dev/null +++ b/packages/mcp/test/evals/e2e/vitest.config.ts @@ -0,0 +1,24 @@ +/* + * Copyright 2025, Salesforce, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +import { defineConfig } from 'vitest/config'; + +export default defineConfig({ + test: { + include: ['test/evals/e2e/*.eval.{ts,mts}'], + reporters: ['vitest-evals/reporter'], + testTimeout: 600_000 + }, +}); diff --git a/packages/mcp/test/evals/run_code_analyzer.eval.ts b/packages/mcp/test/evals/run_code_analyzer.eval.ts deleted file mode 100644 index bb04c375..00000000 --- a/packages/mcp/test/evals/run_code_analyzer.eval.ts +++ /dev/null @@ -1,39 +0,0 @@ -/* - * Copyright 2025, Salesforce, Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import path from 'node:path'; -import {fileURLToPath} from 'node:url'; -import { describeEval } from 'vitest-evals'; -import { TaskRunner } from './utils.js'; - -const fileName = fileURLToPath(import.meta.url); -const dirName = path.dirname(fileName); - -const pathToTarget: string = path.join(dirName, '..', 'fixtures', 'sample-targets', 'SampleTarget1.cls'); - -describeEval('run_code_analyzer', { - data: async () => [{ - input: `Run code analysis against ${pathToTarget}, and tell me the number of violations in that file using the response format "There are X violations".`, - expected: [6] - }], - task: TaskRunner(), - scorers: [(opts: {output: string; expected: number}) => { - const score: number = opts.output === `There are ${opts.expected} violations.` ? 1 : 0; - return {score}; - }], - threshold: 0.9, - timeout: 60_000 -}); \ No newline at end of file diff --git a/packages/mcp/test/evals/sf-query-org.eval.ts b/packages/mcp/test/evals/sf-query-org.eval.ts deleted file mode 100644 index ffc8e263..00000000 --- a/packages/mcp/test/evals/sf-query-org.eval.ts +++ /dev/null @@ -1,52 +0,0 @@ -/* - * Copyright 2025, Salesforce, Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -import { describeEval } from 'vitest-evals'; -import { Factuality, TaskRunner } from './utils.js'; - -describeEval('SOQL queries', { - data: async () => [ - { - input: 'List the name of the Property__c records in my org, ordered in ascending order by their name.', - expected: `The response should include these records: -Architectural Details -City Living -Contemporary City Living -Contemporary Luxury -Heart of Harvard Square -Modern City Living -Quiet Retreat -Seaport District Retreat -Stunning Colonial -Stunning Victorian -Ultimate Sophistication -Waterfront in the City -`, - // expected: `The response should include these records: - // Sophisticated Urban Escape - // Metropolitan Elegance - // Vibrant City Sanctuary - // Downtown Dreamscape - // Sleek Urban Oasis - // Modern Metropole - // Luxe in the Loop - // `, - }, - ], - task: TaskRunner(), - scorers: [Factuality()], - threshold: 0.6, - timeout: 30_000, -}); \ No newline at end of file diff --git a/packages/mcp/test/evals/utils.ts b/packages/mcp/test/evals/utils.ts deleted file mode 100644 index ea8ee97a..00000000 --- a/packages/mcp/test/evals/utils.ts +++ /dev/null @@ -1,155 +0,0 @@ -/* - * Copyright 2025, Salesforce, Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -import * as path from 'node:path'; -import { google } from '@ai-sdk/google'; -import { experimental_createMCPClient, generateObject, streamText, type LanguageModel } from 'ai'; -import { Experimental_StdioMCPTransport } from 'ai/mcp-stdio'; -import { z } from 'zod'; - -// This prompt intends to represent what an IDE context window could look like, some specifics: -// -// * Current open project directory -// * Current open file -const SYSTEM_PROMPT = `You are an assistant responsible for evaluating the results of calling various tools. -You a general purpose LLM-based Agent. Your purpose is to answer the user's query using the tools provided. -- You should ONLY use the tools available to answer the user's query. -- Use as few tool calls as possible to get to the answer. -- Using multiple tool calls to get to the answer is allowed when needed. 
-The current open project dir is "${process.env.SF_EVAL_PROMPT_PROJECT_DIR}" -`; - -// Supported models: https://ai.google.dev/gemini-api/docs/models -const defaultModel = google('gemini-2.5-flash'); - -export function TaskRunner(model: LanguageModel = defaultModel) { - return async function TaskRun(input: string) { - const mcpClient = await experimental_createMCPClient({ - transport: new Experimental_StdioMCPTransport({ - command: 'node', - args: [path.join(import.meta.dirname, '../../bin/run.js'), '--toolsets', 'all', '-o', 'DEFAULT_TARGET_ORG', '--no-telemetry', '--allow-non-ga-tools'] - }), - }); - - const tools = await mcpClient.tools(); - - try { - const result = streamText({ - model, - tools, - system: SYSTEM_PROMPT, - prompt: input, - maxRetries: 1, - maxSteps: 10, - experimental_telemetry: { - isEnabled: false, - }, - onError: (error) => { - // eslint-disable-next-line no-console - console.error(error); - }, - }); - - // TODO: we don't need text streaming here, maybe switch to `generateText`? - // eslint-disable-next-line - for await (const _ of result.fullStream) { - } - - return await result.text; - } catch (error) { - // eslint-disable-next-line no-console - console.error(error); - throw error; - } finally { - await mcpClient.close(); - } - }; -} - -export function outputIncludesExpectationArray(opts: {input: string; output: string; expected: string[]}) { - let score: number = 0; - const increment: number = 1/opts.expected.length; - for (const expected of opts.expected) { - if (opts.output.toLowerCase().includes(expected.toLowerCase())) { - score += increment; - } - } - return { - score - } -} - -/** - * A Factuality checker utilizing the `ai` SDK based on the implementation in `autoevals`. - * - * ``` - * import { openai } from "@ai-sdk/openai"; - * - * scorers: [Factuality(openai("gpt-4o"))] - * ``` - */ -export function Factuality(model: LanguageModel = defaultModel) { - // TODO: remove function wrapper - // eslint-disable-next-line @typescript-eslint/no-shadow - return async function Factuality(opts: { input: string; output: string; expected?: string }) { - const { object } = await generateObject({ - model, - /** - * Prompt implementation from `autoevals`: - * - * {@link https://github.com/braintrustdata/autoevals/blob/5aa20a0a9eb8fc9e07e9e5722ebf71c68d082f32/templates/factuality.yaml} - */ - prompt: ` - You are comparing a submitted answer to an expert answer on a given question. Here is the data: - [BEGIN DATA] - ************ - [Question]: ${opts.input} - ************ - [Expert]: ${opts.expected} - ************ - [Submission]: ${opts.output} - ************ - [END DATA] - Compare the factual content of the submitted answer with the expert answer. Ignore any differences in style, grammar, or punctuation, or overall structure. - The submitted answer may either be a subset or superset of the expert answer, or it may conflict with it. Determine which case applies. Answer the question by selecting one of the following options: - - (A) The submitted answer is a subset of the expert answer and is fully consistent with it. - (B) The submitted answer is a superset of the expert answer and is fully consistent with it. - (C) The submitted answer contains all the same details as the expert answer. - (D) There is a disagreement between the submitted answer and the expert answer. - (E) The answers differ, but these differences don't matter from the perspective of factuality. 
- `, - schema: z.object({ - answer: z.enum(['A', 'B', 'C', 'D', 'E']).describe('Your selection.'), - rationale: z.string().describe('Why you chose this answer. Be very detailed.'), - }), - }); - - const scores = { - A: 0.4, - B: 0.6, - C: 1, - D: 0, - E: 1, - }; - - return { - score: scores[object.answer], - metadata: { - rationale: object.rationale, - }, - }; - }; -} \ No newline at end of file diff --git a/packages/mcp/test/evals/utils/runners.ts b/packages/mcp/test/evals/utils/runners.ts new file mode 100644 index 00000000..66a83eca --- /dev/null +++ b/packages/mcp/test/evals/utils/runners.ts @@ -0,0 +1,159 @@ +/* + * Copyright 2025, Salesforce, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +import * as path from 'node:path'; +import * as fs from 'node:fs'; +import * as os from 'node:os'; +import { google } from '@ai-sdk/google'; +import { experimental_createMCPClient, generateText, type LanguageModel } from 'ai'; +import { Experimental_StdioMCPTransport } from 'ai/mcp-stdio'; +import { TaskResult } from 'vitest-evals'; + +function generateSystemPrompt(opts?: runnerOptions['promptOptions']): string { + let prompt = `You are an assistant responsible for evaluating the results of calling various tools. +You a general purpose LLM-based Agent. Your purpose is to answer the user's query using the tools provided. +- You should ONLY use the tools available to answer the user's query. +- Use as few tool calls as possible to get to the answer. +- Using multiple tool calls to get to the answer is allowed when needed. +`; + + if (opts) { + prompt += ` + +I am working in a workspace with the following folders: +- ${opts.currentOpenWorkspace} + + + +The user's current file is ${opts.currentOpenFile} + +`; + } + + return prompt; +} + +// Supported models: https://ai.google.dev/gemini-api/docs/models +const defaultModel = google('gemini-2.5-flash'); + +type runnerOptions = { + model?: LanguageModel; + promptOptions?: { + currentOpenWorkspace: string; + currentOpenFile: string; + }; +}; + +export function TaskRunner(opts?: runnerOptions) { + return async function TaskRun(input: string): Promise { + const mcpClient = await experimental_createMCPClient({ + transport: new Experimental_StdioMCPTransport({ + command: 'node', + args: [ + path.join(import.meta.dirname, '../../../bin/run.js'), + '--toolsets', + 'all', + '-o', + 'DEFAULT_TARGET_ORG', + '--no-telemetry', + '--allow-non-ga-tools', + ], + // IMPORTANT: + // this is needed because testkit sets it when transferring the hub auth and creating a scratch. + // Without it you get a keychain error/silent failure because the server will look for orgUsername + // in the OS keychain but testkit modifies the home dir in the process so all auth is in the test dir. + env: { + SF_USE_GENERIC_UNIX_KEYCHAIN: 'true', + }, + }), + }); + + const tools = await mcpClient.tools(); + + const systemPrompt = generateSystemPrompt(opts?.promptOptions); + + try { + const { text, steps } = await generateText({ + model: opts?.model ?? 
defaultModel, + tools, + system: systemPrompt, + prompt: input, + maxRetries: 1, + maxSteps: 10, + experimental_telemetry: { + isEnabled: false, + }, + }); + + if (process.env.SF_MCP_DEBUG_EVALS === 'true') { + const tmpDir = os.tmpdir(); + const tmpFile = path.join(tmpDir, `eval-result-${Date.now()}.json`); + const debugData = { + input, + result: text, + toolCalls: steps + .flatMap((step) => step.toolCalls) + .map((call) => ({ + name: call.toolName, + arguments: call.args, + })), + systemPrompt, + timestamp: new Date().toISOString(), + }; + fs.writeFileSync(tmpFile, JSON.stringify(debugData, null, 2)); + // eslint-disable-next-line no-console + console.warn(`Debug: Result written to ${tmpFile}`); + } + + return { + result: text, + // vitest-evals expects args to be: + // ```ts + // arguments?: Record + // ``` + // + // but ai-sdk v3/google adapter returns args as: + // ```ts + // args: unknown; + // ``` + // + // revisit if this got fixed after migrating to ai-sdk v5 with the LLGM adapter + // @ts-ignore + toolCalls: steps + .flatMap((step) => step.toolCalls) + .map((call) => ({ + name: call.toolName, + arguments: call.args, + })), + }; + } catch (error) { + // eslint-disable-next-line no-console + console.error(error); + throw error; + } finally { + await mcpClient.close(); + } + }; +} + +export function NoOpTaskRunner() { + return async function NoOpTaskRunner(input: string) { + // Just return the input as the result, no tool execution + return { + result: input, + toolCalls: [], + }; + }; +} diff --git a/packages/mcp/test/evals/utils/scorers/factuality.ts b/packages/mcp/test/evals/utils/scorers/factuality.ts new file mode 100644 index 00000000..622383de --- /dev/null +++ b/packages/mcp/test/evals/utils/scorers/factuality.ts @@ -0,0 +1,68 @@ +import { google } from '@ai-sdk/google'; +import { generateObject, type LanguageModel } from 'ai'; +import { z } from 'zod'; + +// Supported models: https://ai.google.dev/gemini-api/docs/models +const defaultModel = google('gemini-2.5-flash'); + +/** + * A Factuality checker utilizing the `ai` SDK based on the implementation in `autoevals`. + * + * ``` + * import { openai } from "@ai-sdk/openai"; + * + * scorers: [Factuality(openai("gpt-4o"))] + * ``` + */ +export function Factuality(model: LanguageModel = defaultModel) { + // eslint-disable-next-line @typescript-eslint/no-shadow + return async function Factuality(opts: { input: string; output: string; expected?: string }) { + const { object } = await generateObject({ + model, + /** + * Prompt implementation from `autoevals`: + * + * {@link https://github.com/braintrustdata/autoevals/blob/5aa20a0a9eb8fc9e07e9e5722ebf71c68d082f32/templates/factuality.yaml} + */ + prompt: ` + You are comparing a submitted answer to an expert answer on a given question. Here is the data: + [BEGIN DATA] + ************ + [Question]: ${opts.input} + ************ + [Expert]: ${opts.expected} + ************ + [Submission]: ${opts.output} + ************ + [END DATA] + Compare the factual content of the submitted answer with the expert answer. Ignore any differences in style, grammar, or punctuation, or overall structure. + The submitted answer may either be a subset or superset of the expert answer, or it may conflict with it. Determine which case applies. Answer the question by selecting one of the following options: + + (A) The submitted answer is a subset of the expert answer and is fully consistent with it. + (B) The submitted answer is a superset of the expert answer and is fully consistent with it. 
+ (C) The submitted answer contains all the same details as the expert answer. + (D) There is a disagreement between the submitted answer and the expert answer. + (E) The answers differ, but these differences don't matter from the perspective of factuality. + `, + schema: z.object({ + answer: z.enum(['A', 'B', 'C', 'D', 'E']).describe('Your selection.'), + rationale: z.string().describe('Why you chose this answer. Be very detailed.'), + }), + }); + + const scores = { + A: 0.4, + B: 0.6, + C: 1, + D: 0, + E: 1, + }; + + return { + score: scores[object.answer], + metadata: { + rationale: object.rationale, + }, + }; + }; +} diff --git a/packages/mcp/test/evals/utils/scorers/toolPredictionScorer.ts b/packages/mcp/test/evals/utils/scorers/toolPredictionScorer.ts new file mode 100644 index 00000000..4ec76d2c --- /dev/null +++ b/packages/mcp/test/evals/utils/scorers/toolPredictionScorer.ts @@ -0,0 +1,164 @@ +import { experimental_createMCPClient, generateObject, type LanguageModel } from 'ai'; +import { Experimental_StdioMCPTransport } from 'ai/mcp-stdio'; +import { google } from '@ai-sdk/google'; +import { z } from 'zod'; + +// Supported models: https://ai.google.dev/gemini-api/docs/models +const defaultModel = google('gemini-2.5-flash'); + +let cachedTools: string[] | null = null; + +const predictionSchema = z.object({ + score: z.number().min(0).max(1).describe('Score from 0 to 1'), + rationale: z.string().describe('Explanation of the score'), + predictedTools: z + .array( + z.object({ + name: z.string(), + arguments: z.unknown().optional(), + // arguments: z.record(z.any()).optional().default({}), + }) + ) + .describe('What tools the AI would likely call'), +}); + +interface ToolPredictionScorerOptions { + input: string; + output: string; + expectedTools?: ExpectedToolCall[]; + result?: any; +} + +export interface ExpectedToolCall { + name: string; + arguments: Record; +} + +export function ToolPredictionScorer(model: LanguageModel = defaultModel) { + return async function ToolPredictionScorer(opts: ToolPredictionScorerOptions) { + // If expectedTools is not defined, skip this scorer + if (!opts.expectedTools) { + return { + score: null, + metadata: { + rationale: 'Skipped: No expectedTools defined for this test case', + }, + }; + } + + const expectedTools = opts.expectedTools; + + const AVAILABLE_TOOLS = await getAvailableTools(); + + // Generate a description of the expected tools for the prompt + const expectedDescription = expectedTools + .map((tool) => `- ${tool.name} with arguments: ${JSON.stringify(tool.arguments)}`) + .join('\n'); + + const { object } = await generateObject({ + model, + prompt: generateSystemPrompt(AVAILABLE_TOOLS, opts.input, expectedDescription), + maxRetries: 0, + schema: predictionSchema, + experimental_telemetry: { + isEnabled: false, + }, + }); + + return { + score: object.score, + metadata: { + rationale: object.rationale, + predictedTools: object.predictedTools, + expectedTools: expectedTools, + }, + }; + }; +} + +async function getAvailableTools(): Promise { + if (cachedTools) { + return cachedTools; + } + + const client = await experimental_createMCPClient({ + transport: new Experimental_StdioMCPTransport({ + // when executed via yarn, `sf-mcp-server` points to `packages/mcp/bin/run.js` + command: 'sf-mcp-server', + args: [ + '--toolsets', + 'all', + '-o', + 'DEFAULT_TARGET_ORG', + '--no-telemetry', + '--allow-non-ga-tools', + ], + }), + }); + + // Discover available tools + const toolsMap = await client.tools(); + + cachedTools = 
Object.entries(toolsMap).map(([name, tool]) => { + // Extract the first line of description for a concise summary + const shortDescription = tool.description || ''; + const params = tool.parameters; + return `${name} - ${shortDescription}\n${JSON.stringify(params)}`; + }); + + // Clean up + await client.close(); + + return cachedTools; +} + +function generateSystemPrompt(availableTools: string[], task: string, expectedDescription: string): string { + return ` +You are evaluating whether an AI assistant with access to Salesforce DX MCP tools would make the correct tool calls for a given task. + +[AVAILABLE TOOLS] +${availableTools.join('\n')} + +[TASK] +${task} + +[EXPECTED TOOL CALLS] +${expectedDescription} + + +When using a tool, follow the JSON schema very carefully and make sure to include ALL required properties. + + +Your goal is to evaluate whether the AI assistant would behave correctly based on: +- The user’s task (intent) +- The list of available tools and their documented behavior +- The arguments required by each tool + +IMPORTANT: +- The provided [EXPECTED TOOL CALLS] represents what *should* happen in this specific test case, *assuming it is valid*. +- **If the expected tools are not appropriate for the task or violate the available tool definitions (e.g., wrong tool for the intent, required params missing, invalid params present), score based on correctness, not blind matching.** + +STRICT VALIDATION RULES: +1. You may ONLY use tools listed under [AVAILABLE TOOLS]. If an expected tool is not listed, the test is invalid — score accordingly. +2. Match the user’s task with the most appropriate tool(s) based on the tool definitions and parameter requirements. +3. Validate each predicted tool call: + - Tool name must be correct for the task + - All required arguments must be present + - No unexpected or invalid arguments + - Tool must be available in the [AVAILABLE TOOLS] list + +SCORING: +- 1.0: All predicted tool calls are correct for the task, use valid tools, and match the expected tool behavior exactly +- 0.8: Minor argument mismatches (e.g., extra but harmless params) +- 0.6: Correct tools used but wrong order or missing some arguments +- 0.3: Some correct tools but major issues (e.g. wrong tool order, invalid args) +- 0.0: Critical mistakes: wrong tools for the task, missing essential tools, or tools not in the available list + +NOTE: +- The goal is not to blindly reproduce the expected tool calls, but to validate whether the expected behavior is appropriate and executable given the available tools and the task. +- If the expected tool call includes incorrect tools or invalid arguments, reduce the score appropriately. + +Current open workspace: "${process.env.SF_EVAL_PROMPT_PROJECT_DIR}" +Current open file: "${process.env.SF_EVAL_PROMPT_OPEN_FILEPATH}" +`; +} diff --git a/packages/mcp/vitest.config.ts b/packages/mcp/vitest.config.ts index d2a22168..fb9b6a11 100644 --- a/packages/mcp/vitest.config.ts +++ b/packages/mcp/vitest.config.ts @@ -16,8 +16,8 @@ import { defineConfig } from 'vitest/config'; export default defineConfig({ - test: { - include: ['**/*.eval.{js,mjs,cjs,ts,mts,cts,jsx,tsx}'], - reporters: ['vitest-evals/reporter'], - }, -}); \ No newline at end of file + test: { + include: ['**/*.eval.{js,mjs,cjs,ts,mts,cts,jsx,tsx}'], + reporters: ['vitest-evals/reporter'], + }, +});