From a8ddbe5af32069afea3f958fdc989635ebc1618a Mon Sep 17 00:00:00 2001 From: Joshua Feingold Date: Fri, 26 Sep 2025 09:45:05 -0500 Subject: [PATCH 01/13] @W-18964528@ Ported (non-working) code from old branch --- packages/mcp/package.json | 27 ++-- packages/mcp/test/evals/sf-query-org.eval.ts | 52 +++++++ packages/mcp/test/evals/utils.ts | 142 +++++++++++++++++++ packages/mcp/vitest.config.ts | 23 +++ 4 files changed, 233 insertions(+), 11 deletions(-) create mode 100644 packages/mcp/test/evals/sf-query-org.eval.ts create mode 100644 packages/mcp/test/evals/utils.ts create mode 100644 packages/mcp/vitest.config.ts diff --git a/packages/mcp/package.json b/packages/mcp/package.json index 49d7bfdf..f1c82320 100644 --- a/packages/mcp/package.json +++ b/packages/mcp/package.json @@ -54,25 +54,30 @@ "zod": "^3.25.76" }, "devDependencies": { + "@ai-sdk/google": "^1.2.22", + "@ai-sdk/openai": "^1.3.23", "@salesforce/cli-plugins-testkit": "^5.3.39", "@salesforce/dev-config": "^4.3.2", - "prettier": "^2.8.8", - "@types/node": "^22.16.5", - "wireit": "^0.14.12", - "eslint": "^8.57.1", - "mocha": "11.7.2", - "chai": "^4.3.10", - "@types/mocha": "^10.0.10", + "@salesforce/prettier-config": "^0.0.3", "@types/chai": "^4.3.14", + "@types/mocha": "^10.0.10", + "@types/node": "^22.16.5", "@types/sinon": "^10.0.20", - "sinon": "10.0.0", - "nyc": "^17.0.0", + "ai": "^4.3.17", + "chai": "^4.3.10", + "eslint": "^8.57.1", "eslint-config-salesforce-license": "^1.0.1", "eslint-config-salesforce-typescript": "4.0.1", - "@salesforce/prettier-config": "^0.0.3", + "mocha": "11.7.2", + "nyc": "^17.0.0", "oclif": "^4.21.0", + "prettier": "^2.8.8", + "sinon": "10.0.0", "ts-node": "^10.9.2", - "typescript": "^5.8.3" + "typescript": "^5.8.3", + "vitest": "^3.2.4", + "vitest-evals": "^0.3.0", + "wireit": "^0.14.12" }, "publishConfig": { "access": "public" diff --git a/packages/mcp/test/evals/sf-query-org.eval.ts b/packages/mcp/test/evals/sf-query-org.eval.ts new file mode 100644 index 00000000..ffc8e263 --- /dev/null +++ b/packages/mcp/test/evals/sf-query-org.eval.ts @@ -0,0 +1,52 @@ +/* + * Copyright 2025, Salesforce, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+import { describeEval } from 'vitest-evals';
+import { Factuality, TaskRunner } from './utils.js';
+
+describeEval('SOQL queries', {
+  data: async () => [
+    {
+      input: 'List the name of the Property__c records in my org, ordered in ascending order by their name.',
+      expected: `The response should include these records:
+Architectural Details
+City Living
+Contemporary City Living
+Contemporary Luxury
+Heart of Harvard Square
+Modern City Living
+Quiet Retreat
+Seaport District Retreat
+Stunning Colonial
+Stunning Victorian
+Ultimate Sophistication
+Waterfront in the City
+`,
+      // expected: `The response should include these records:
+      // Sophisticated Urban Escape
+      // Metropolitan Elegance
+      // Vibrant City Sanctuary
+      // Downtown Dreamscape
+      // Sleek Urban Oasis
+      // Modern Metropole
+      // Luxe in the Loop
+      // `,
+    },
+  ],
+  task: TaskRunner(),
+  scorers: [Factuality()],
+  threshold: 0.6,
+  timeout: 30_000,
+});
\ No newline at end of file
diff --git a/packages/mcp/test/evals/utils.ts b/packages/mcp/test/evals/utils.ts
new file mode 100644
index 00000000..8c253388
--- /dev/null
+++ b/packages/mcp/test/evals/utils.ts
@@ -0,0 +1,142 @@
+/*
+ * Copyright 2025, Salesforce, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+import * as path from 'node:path';
+import { google } from '@ai-sdk/google';
+import { experimental_createMCPClient, generateObject, streamText, type LanguageModel } from 'ai';
+import { Experimental_StdioMCPTransport } from 'ai/mcp-stdio';
+import { z } from 'zod';
+
+// This prompt intends to represent what an IDE context window could look like, some specifics:
+//
+// * Current open project directory
+// * Current open file
+const SYSTEM_PROMPT = `You are an assistant responsible for evaluating the results of calling various tools.
+You are a general purpose LLM-based Agent. Your purpose is to answer the user's query using the tools provided.
+- You should ONLY use the tools available to answer the user's query.
+- Use as few tool calls as possible to get to the answer.
+- Using multiple tool calls to get to the answer is allowed when needed.
+The current open project dir is "${process.env.SF_EVAL_PROMPT_PROJECT_DIR}" +`; + +// Supported models: https://ai.google.dev/gemini-api/docs/models +const defaultModel = google('gemini-2.5-flash'); + +export function TaskRunner(model: LanguageModel = defaultModel) { + return async function TaskRun(input: string) { + const mcpClient = await experimental_createMCPClient({ + transport: new Experimental_StdioMCPTransport({ + command: 'node', + args: [path.join(import.meta.dirname, '../../bin/run.js'), '--toolsets', 'all', '-o', 'DEFAULT_TARGET_ORG', '--no-telemetry'] + }), + }); + + const tools = await mcpClient.tools(); + + try { + const result = streamText({ + model, + tools, + system: SYSTEM_PROMPT, + prompt: input, + maxRetries: 1, + maxSteps: 10, + experimental_telemetry: { + isEnabled: false, + }, + onError: (error) => { + // eslint-disable-next-line no-console + console.error(error); + }, + }); + + // TODO: we don't need text streaming here, maybe switch to `generateText`? + // eslint-disable-next-line + for await (const _ of result.fullStream) { + } + + return await result.text; + } catch (error) { + // eslint-disable-next-line no-console + console.error(error); + throw error; + } finally { + await mcpClient.close(); + } + }; +} + +/** + * A Factuality checker utilizing the `ai` SDK based on the implementation in `autoevals`. + * + * ``` + * import { openai } from "@ai-sdk/openai"; + * + * scorers: [Factuality(openai("gpt-4o"))] + * ``` + */ +export function Factuality(model: LanguageModel = defaultModel) { + // TODO: remove function wrapper + // eslint-disable-next-line @typescript-eslint/no-shadow + return async function Factuality(opts: { input: string; output: string; expected?: string }) { + const { object } = await generateObject({ + model, + /** + * Prompt implementation from `autoevals`: + * + * {@link https://github.com/braintrustdata/autoevals/blob/5aa20a0a9eb8fc9e07e9e5722ebf71c68d082f32/templates/factuality.yaml} + */ + prompt: ` + You are comparing a submitted answer to an expert answer on a given question. Here is the data: + [BEGIN DATA] + ************ + [Question]: ${opts.input} + ************ + [Expert]: ${opts.expected} + ************ + [Submission]: ${opts.output} + ************ + [END DATA] + Compare the factual content of the submitted answer with the expert answer. Ignore any differences in style, grammar, or punctuation, or overall structure. + The submitted answer may either be a subset or superset of the expert answer, or it may conflict with it. Determine which case applies. Answer the question by selecting one of the following options: + + (A) The submitted answer is a subset of the expert answer and is fully consistent with it. + (B) The submitted answer is a superset of the expert answer and is fully consistent with it. + (C) The submitted answer contains all the same details as the expert answer. + (D) There is a disagreement between the submitted answer and the expert answer. + (E) The answers differ, but these differences don't matter from the perspective of factuality. + `, + schema: z.object({ + answer: z.enum(['A', 'B', 'C', 'D', 'E']).describe('Your selection.'), + rationale: z.string().describe('Why you chose this answer. 
Be very detailed.'), + }), + }); + + const scores = { + A: 0.4, + B: 0.6, + C: 1, + D: 0, + E: 1, + }; + + return { + score: scores[object.answer], + metadata: { + rationale: object.rationale, + }, + }; + }; +} \ No newline at end of file diff --git a/packages/mcp/vitest.config.ts b/packages/mcp/vitest.config.ts new file mode 100644 index 00000000..d2a22168 --- /dev/null +++ b/packages/mcp/vitest.config.ts @@ -0,0 +1,23 @@ +/* + * Copyright 2025, Salesforce, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +import { defineConfig } from 'vitest/config'; + +export default defineConfig({ + test: { + include: ['**/*.eval.{js,mjs,cjs,ts,mts,cts,jsx,tsx}'], + reporters: ['vitest-evals/reporter'], + }, +}); \ No newline at end of file From f8536685265e81491502b9d44d78f1de4f8ed308 Mon Sep 17 00:00:00 2001 From: Joshua Feingold Date: Fri, 26 Sep 2025 10:47:38 -0500 Subject: [PATCH 02/13] @W-18964528@ Updated dependency --- packages/mcp/package.json | 2 +- yarn.lock | 121 +++++++++++++++++++++++++++++++++++++- 2 files changed, 119 insertions(+), 4 deletions(-) diff --git a/packages/mcp/package.json b/packages/mcp/package.json index f1c82320..e7aec4da 100644 --- a/packages/mcp/package.json +++ b/packages/mcp/package.json @@ -76,7 +76,7 @@ "ts-node": "^10.9.2", "typescript": "^5.8.3", "vitest": "^3.2.4", - "vitest-evals": "^0.3.0", + "vitest-evals": "^0.5.0", "wireit": "^0.14.12" }, "publishConfig": { diff --git a/yarn.lock b/yarn.lock index 20be39a2..19ff844f 100644 --- a/yarn.lock +++ b/yarn.lock @@ -2,6 +2,57 @@ # yarn lockfile v1 +"@ai-sdk/google@^1.2.22": + version "1.2.22" + resolved "https://registry.yarnpkg.com/@ai-sdk/google/-/google-1.2.22.tgz#9993e4781c9a773cd17d47490b9efdc90895abd2" + integrity sha512-Ppxu3DIieF1G9pyQ5O1Z646GYR0gkC57YdBqXJ82qvCdhEhZHu0TWhmnOoeIWe2olSbuDeoOY+MfJrW8dzS3Hw== + dependencies: + "@ai-sdk/provider" "1.1.3" + "@ai-sdk/provider-utils" "2.2.8" + +"@ai-sdk/openai@^1.3.23": + version "1.3.24" + resolved "https://registry.yarnpkg.com/@ai-sdk/openai/-/openai-1.3.24.tgz#169b78a1ccf338e5dbd8696a55f57d3ca2e3d6bc" + integrity sha512-GYXnGJTHRTZc4gJMSmFRgEQudjqd4PUN0ZjQhPwOAYH1yOAvQoG/Ikqs+HyISRbLPCrhbZnPKCNHuRU4OfpW0Q== + dependencies: + "@ai-sdk/provider" "1.1.3" + "@ai-sdk/provider-utils" "2.2.8" + +"@ai-sdk/provider-utils@2.2.8": + version "2.2.8" + resolved "https://registry.yarnpkg.com/@ai-sdk/provider-utils/-/provider-utils-2.2.8.tgz#ad11b92d5a1763ab34ba7b5fc42494bfe08b76d1" + integrity sha512-fqhG+4sCVv8x7nFzYnFo19ryhAa3w096Kmc3hWxMQfW/TubPOmt3A6tYZhl4mUfQWWQMsuSkLrtjlWuXBVSGQA== + dependencies: + "@ai-sdk/provider" "1.1.3" + nanoid "^3.3.8" + secure-json-parse "^2.7.0" + +"@ai-sdk/provider@1.1.3": + version "1.1.3" + resolved "https://registry.yarnpkg.com/@ai-sdk/provider/-/provider-1.1.3.tgz#ebdda8077b8d2b3f290dcba32c45ad19b2704681" + integrity sha512-qZMxYJ0qqX/RfnuIaab+zp8UAeJn/ygXXAffR5I4N0n1IrvA6qBsjc8hXLmBiMV2zoXlifkacF7sEFnYnjBcqg== + dependencies: + json-schema "^0.4.0" + +"@ai-sdk/react@1.2.12": + version "1.2.12" + resolved 
"https://registry.yarnpkg.com/@ai-sdk/react/-/react-1.2.12.tgz#f4250b6df566b170af98a71d5708b52108dd0ce1" + integrity sha512-jK1IZZ22evPZoQW3vlkZ7wvjYGYF+tRBKXtrcolduIkQ/m/sOAVcVeVDUDvh1T91xCnWCdUGCPZg2avZ90mv3g== + dependencies: + "@ai-sdk/provider-utils" "2.2.8" + "@ai-sdk/ui-utils" "1.2.11" + swr "^2.2.5" + throttleit "2.1.0" + +"@ai-sdk/ui-utils@1.2.11": + version "1.2.11" + resolved "https://registry.yarnpkg.com/@ai-sdk/ui-utils/-/ui-utils-1.2.11.tgz#4f815589d08d8fef7292ade54ee5db5d09652603" + integrity sha512-3zcwCc8ezzFlwp3ZD15wAPjf2Au4s3vAbKsXQVyhxODHcmu0iyPO2Eua6D/vicq/AUm/BAo60r97O6HU+EI0+w== + dependencies: + "@ai-sdk/provider" "1.1.3" + "@ai-sdk/provider-utils" "2.2.8" + zod-to-json-schema "^3.24.1" + "@ampproject/remapping@^2.2.0": version "2.3.0" resolved "https://registry.yarnpkg.com/@ampproject/remapping/-/remapping-2.3.0.tgz#ed441b6fa600072520ce18b43d2c8cc8caecc7f4" @@ -2179,7 +2230,7 @@ dependencies: "@opentelemetry/api" "^1.3.0" -"@opentelemetry/api@^1.3.0", "@opentelemetry/api@^1.7.0", "@opentelemetry/api@^1.9.0": +"@opentelemetry/api@1.9.0", "@opentelemetry/api@^1.3.0", "@opentelemetry/api@^1.7.0", "@opentelemetry/api@^1.9.0": version "1.9.0" resolved "https://registry.yarnpkg.com/@opentelemetry/api/-/api-1.9.0.tgz#d03eba68273dc0f7509e2a3d5cba21eae10379fe" integrity sha512-3giAOQvZiH5F9bMlMiv8+GSPMeqg0dbaeo58/0SlA9sxSqZhnUtxzX9/2FzyhS9sWQf5S0GJE0AKBrFqjpeYcg== @@ -3825,6 +3876,11 @@ resolved "https://registry.yarnpkg.com/@types/deep-eql/-/deep-eql-4.0.2.tgz#334311971d3a07121e7eb91b684a605e7eea9cbd" integrity sha512-c9h9dVVMigMPc4bwTvC5dxqtqJZwQPePsWjPlpSOnojbor6pGqdk541lfA7AqFQr5pB1BRdq0juY9db81BwyFw== +"@types/diff-match-patch@^1.0.36": + version "1.0.36" + resolved "https://registry.yarnpkg.com/@types/diff-match-patch/-/diff-match-patch-1.0.36.tgz#dcef10a69d357fe9d43ac4ff2eca6b85dbf466af" + integrity sha512-xFdR6tkm0MWvBfO8xXCSsinYxHcqkQUlcHeSpMC2ukzOb6lwQAfDmW+Qt0AvlGd8HpsS28qKsB+oPeJn9I39jg== + "@types/eslint@^9.6.1": version "9.6.1" resolved "https://registry.yarnpkg.com/@types/eslint/-/eslint-9.6.1.tgz#d5795ad732ce81715f27f75da913004a56751584" @@ -4506,6 +4562,18 @@ aggregate-error@^3.0.0: clean-stack "^2.0.0" indent-string "^4.0.0" +ai@^4.3.17: + version "4.3.19" + resolved "https://registry.yarnpkg.com/ai/-/ai-4.3.19.tgz#e94f5b37f3885bc9c9637f892e13bddd0a1857e5" + integrity sha512-dIE2bfNpqHN3r6IINp9znguYdhIOheKW2LDigAMrgt/upT3B8eBGPSCblENvaZGoq+hxaN9fSMzjWpbqloP+7Q== + dependencies: + "@ai-sdk/provider" "1.1.3" + "@ai-sdk/provider-utils" "2.2.8" + "@ai-sdk/react" "1.2.12" + "@ai-sdk/ui-utils" "1.2.11" + "@opentelemetry/api" "1.9.0" + jsondiffpatch "0.6.0" + ajv@^6.12.4, ajv@^6.12.6: version "6.12.6" resolved "https://registry.yarnpkg.com/ajv/-/ajv-6.12.6.tgz#baf5a62e802b07d977034586f8c3baf5adf26df4" @@ -5778,6 +5846,11 @@ depd@2.0.0, depd@^2.0.0: resolved "https://registry.yarnpkg.com/depd/-/depd-2.0.0.tgz#b696163cc757560d09cf22cc8fad1571b79e76df" integrity sha512-g7nH6P6dyDioJogAAGprGpCtVImJhpPk/roCzdb3fIh61/s/nPsfR6onyMwkCAR/OlC3yBC0lESvUoQEAssIrw== +dequal@^2.0.3: + version "2.0.3" + resolved "https://registry.yarnpkg.com/dequal/-/dequal-2.0.3.tgz#2644214f1997d39ed0ee0ece72335490a7ac67be" + integrity sha512-0je+qPKHEMohvfRTCEo3CrPG6cAzAYgmzKyxRiYSSDkS6eGJdyVJm7WaYA5ECaAD9wLB2T4EEeymA5aFVcYXCA== + destroy@1.2.0: version "1.2.0" resolved "https://registry.yarnpkg.com/destroy/-/destroy-1.2.0.tgz#4803735509ad8be552934c67df614f94e66fa015" @@ -5810,6 +5883,11 @@ diagnostic-channel@1.1.1: dependencies: semver "^7.5.3" +diff-match-patch@^1.0.5: + version "1.0.5" + 
resolved "https://registry.yarnpkg.com/diff-match-patch/-/diff-match-patch-1.0.5.tgz#abb584d5f10cd1196dfc55aa03701592ae3f7b37" + integrity sha512-IayShXAgj/QMXgB0IWmKx+rOPuGMhqm5w6jvFxmVenXKIzRqTAAsbBPT3kWQeGANj3jGgvcvv4yK6SxqYmikgw== + diff3@0.0.3: version "0.0.3" resolved "https://registry.yarnpkg.com/diff3/-/diff3-0.0.3.tgz#d4e5c3a4cdf4e5fe1211ab42e693fcb4321580fc" @@ -8143,6 +8221,11 @@ json-schema-traverse@^1.0.0: resolved "https://registry.yarnpkg.com/json-schema-traverse/-/json-schema-traverse-1.0.0.tgz#ae7bcb3656ab77a73ba5c49bf654f38e6b6860e2" integrity sha512-NM8/P9n3XjXhIZn1lLhkFaACTOURQXjWhV4BA/RnOv8xvgqtqpAX9IO4mRQxSx1Rlo4tqzeqb0sOlruaOy3dug== +json-schema@^0.4.0: + version "0.4.0" + resolved "https://registry.yarnpkg.com/json-schema/-/json-schema-0.4.0.tgz#f7de4cf6efab838ebaeb3236474cbba5a1930ab5" + integrity sha512-es94M3nTIfsEPisRafak+HDLfHXnKBhV3vU5eqPcS3flIWqcxJWgXHXiey3YrpaNsanY5ei1VoYEbOzijuq9BA== + json-stable-stringify-without-jsonify@^1.0.1: version "1.0.1" resolved "https://registry.yarnpkg.com/json-stable-stringify-without-jsonify/-/json-stable-stringify-without-jsonify-1.0.1.tgz#9db7b59496ad3f3cfef30a75142d2d930ad72651" @@ -8175,6 +8258,15 @@ jsonc-parser@^3.0.0: resolved "https://registry.yarnpkg.com/jsonc-parser/-/jsonc-parser-3.3.1.tgz#f2a524b4f7fd11e3d791e559977ad60b98b798b4" integrity sha512-HUgH65KyejrUFPvHFPbqOY0rsFip3Bo5wb4ngvdi1EpCYWUQDC5V+Y7mZws+DLkr4M//zQJoanu1SP+87Dv1oQ== +jsondiffpatch@0.6.0: + version "0.6.0" + resolved "https://registry.yarnpkg.com/jsondiffpatch/-/jsondiffpatch-0.6.0.tgz#daa6a25bedf0830974c81545568d5f671c82551f" + integrity sha512-3QItJOXp2AP1uv7waBkao5nCvhEv+QmJAd38Ybq7wNI74Q+BBmnLn4EDKz6yI9xGAIQoUF87qHt+kc1IVxB4zQ== + dependencies: + "@types/diff-match-patch" "^1.0.36" + chalk "^5.3.0" + diff-match-patch "^1.0.5" + jsonfile@^4.0.0: version "4.0.0" resolved "https://registry.yarnpkg.com/jsonfile/-/jsonfile-4.0.0.tgz#8771aae0799b64076b76640fca058f9c10e33ecb" @@ -8795,7 +8887,7 @@ mute-stream@^2.0.0: resolved "https://registry.yarnpkg.com/mute-stream/-/mute-stream-2.0.0.tgz#a5446fc0c512b71c83c44d908d5c7b7b4c493b2b" integrity sha512-WWdIxpyjEn+FhQJQQv9aQAYlHoNVdzIzUySNV1gHUPDSdZJ3yZn7pAAbQcV7B56Mvu881q9FZV+0Vx2xC44VWA== -nanoid@^3.3.11, nanoid@^3.3.7: +nanoid@^3.3.11, nanoid@^3.3.7, nanoid@^3.3.8: version "3.3.11" resolved "https://registry.yarnpkg.com/nanoid/-/nanoid-3.3.11.tgz#4f4f112cefbe303202f2199838128936266d185b" integrity sha512-N8SpfPUnUp1bK+PMYW8qSWdl9U+wwNWI4QKxOYDy9JAro3WMX7p2OeVRF9v+347pnakNevPmiHhNmZ2HbFA76w== @@ -10203,7 +10295,7 @@ scheduler@^0.23.2: dependencies: loose-envify "^1.1.0" -secure-json-parse@^2.4.0: +secure-json-parse@^2.4.0, secure-json-parse@^2.7.0: version "2.7.0" resolved "https://registry.yarnpkg.com/secure-json-parse/-/secure-json-parse-2.7.0.tgz#5a5f9cd6ae47df23dba3151edd06855d47e09862" integrity sha512-6aU+Rwsezw7VR8/nyvKTx8QpWH9FrcYiXXlqC4z5d5XQBDRqtbfsRjnwGyqbi3gddNtWHuEk9OANUotL26qKUw== @@ -10885,6 +10977,14 @@ svg-tags@^1.0.0: resolved "https://registry.yarnpkg.com/svg-tags/-/svg-tags-1.0.0.tgz#58f71cee3bd519b59d4b2a843b6c7de64ac04764" integrity sha512-ovssysQTa+luh7A5Weu3Rta6FJlFBBbInjOh722LIt6klpU2/HtdUbszju/G4devcvk8PGt7FCLv5wftu3THUA== +swr@^2.2.5: + version "2.3.6" + resolved "https://registry.yarnpkg.com/swr/-/swr-2.3.6.tgz#5fee0ee8a0762a16871ee371075cb09422b64f50" + integrity sha512-wfHRmHWk/isGNMwlLGlZX5Gzz/uTgo0o2IRuTMcf4CPuPFJZlq0rDaKUx+ozB5nBOReNV1kiOyzMfj+MBMikLw== + dependencies: + dequal "^2.0.3" + use-sync-external-store "^1.4.0" + table@^6.9.0: version "6.9.0" resolved 
"https://registry.yarnpkg.com/table/-/table-6.9.0.tgz#50040afa6264141c7566b3b81d4d82c47a8668f5" @@ -10965,6 +11065,11 @@ thread-stream@^3.0.0: dependencies: real-require "^0.2.0" +throttleit@2.1.0: + version "2.1.0" + resolved "https://registry.yarnpkg.com/throttleit/-/throttleit-2.1.0.tgz#a7e4aa0bf4845a5bd10daa39ea0c783f631a07b4" + integrity sha512-nt6AMGKW1p/70DF/hGBdJB57B8Tspmbp5gfJ8ilhLnt7kkr2ye7hzD6NVG8GGErk2HWF34igrL2CXmNIkzKqKw== + "through@>=2.2.7 <3", through@^2.3.6: version "2.3.8" resolved "https://registry.yarnpkg.com/through/-/through-2.3.8.tgz#0dd4c9ffaabc357960b1b724115d7e0e86a2e1f5" @@ -11385,6 +11490,11 @@ use-sidecar@^1.1.3: detect-node-es "^1.1.0" tslib "^2.0.0" +use-sync-external-store@^1.4.0: + version "1.5.0" + resolved "https://registry.yarnpkg.com/use-sync-external-store/-/use-sync-external-store-1.5.0.tgz#55122e2a3edd2a6c106174c27485e0fd59bcfca0" + integrity sha512-Rb46I4cGGVBmjamjphe8L/UnvJD+uPPtTkNvX5mZgqdbavhI4EbgIWJiIHXJ8bc/i9EQGPRh4DwEURJ552Do0A== + util-deprecate@^1.0.1, util-deprecate@^1.0.2, util-deprecate@~1.0.1: version "1.0.2" resolved "https://registry.yarnpkg.com/util-deprecate/-/util-deprecate-1.0.2.tgz#450d4dc9fa70de732762fbd2d4a28981419a0ccf" @@ -11467,6 +11577,11 @@ vite@^7.0.6: optionalDependencies: fsevents "~2.3.3" +vitest-evals@^0.5.0: + version "0.5.0" + resolved "https://registry.yarnpkg.com/vitest-evals/-/vitest-evals-0.5.0.tgz#543f3c59ad7ffe7f05437d85666836ffa1512a9f" + integrity sha512-kg8r6NKNBD6jkmyjdO64qduATW6IUG+62atCJj3dLzaRZEG4TIdfFVy64iEh0dSSo1CCpAJi+dJqSN2Y2qv2Sw== + vitest@^3.2.4: version "3.2.4" resolved "https://registry.yarnpkg.com/vitest/-/vitest-3.2.4.tgz#0637b903ad79d1539a25bc34c0ed54b5c67702ea" From cc4549c7f951d0e5eebcc9d9906c7612cf75f23d Mon Sep 17 00:00:00 2001 From: Joshua Feingold Date: Fri, 26 Sep 2025 13:46:33 -0500 Subject: [PATCH 03/13] @W-18964528@ Implemented speculative E2E test for describe_code_analyzer_rule --- .../evals/describe_code_analyzer_rule.eval.ts | 13 +++++++++++++ packages/mcp/test/evals/utils.ts | 15 ++++++++++++++- 2 files changed, 27 insertions(+), 1 deletion(-) create mode 100644 packages/mcp/test/evals/describe_code_analyzer_rule.eval.ts diff --git a/packages/mcp/test/evals/describe_code_analyzer_rule.eval.ts b/packages/mcp/test/evals/describe_code_analyzer_rule.eval.ts new file mode 100644 index 00000000..fa92cee8 --- /dev/null +++ b/packages/mcp/test/evals/describe_code_analyzer_rule.eval.ts @@ -0,0 +1,13 @@ +import { describeEval } from 'vitest-evals'; +import { TaskRunner, outputIncludesExpectationArray } from './utils.js'; + +describeEval('describe_code_analyzer_rule', { + data: async () => [{ + input: 'tell me the tags that are associated with the Code Analysis Rule named VFUnescapeEl, which is a rule for the pmd engine', + expected: ['Recommended', 'Security', 'Visualforce'] + }], + task: TaskRunner(), + scorers: [outputIncludesExpectationArray], + threshold: 0.9, + timeout: 60_000 +}); \ No newline at end of file diff --git a/packages/mcp/test/evals/utils.ts b/packages/mcp/test/evals/utils.ts index 8c253388..38717654 100644 --- a/packages/mcp/test/evals/utils.ts +++ b/packages/mcp/test/evals/utils.ts @@ -39,7 +39,7 @@ export function TaskRunner(model: LanguageModel = defaultModel) { const mcpClient = await experimental_createMCPClient({ transport: new Experimental_StdioMCPTransport({ command: 'node', - args: [path.join(import.meta.dirname, '../../bin/run.js'), '--toolsets', 'all', '-o', 'DEFAULT_TARGET_ORG', '--no-telemetry'] + args: [path.join(import.meta.dirname, '../../bin/run.js'), 
'--toolsets', 'all', '-o', 'DEFAULT_TARGET_ORG', '--no-telemetry', '--allow-non-ga-tools'] }), }); @@ -78,6 +78,19 @@ export function TaskRunner(model: LanguageModel = defaultModel) { }; } +export function outputIncludesExpectationArray(opts: {input: string, output: string, expected: string[]}) { + let score: number = 0; + const increment: number = 1/opts.expected.length; + for (const expected of opts.expected) { + if (opts.output.toLowerCase().includes(expected.toLowerCase())) { + score += increment; + } + } + return { + score + } +} + /** * A Factuality checker utilizing the `ai` SDK based on the implementation in `autoevals`. * From 1ab18736e4e36b2f118a5ecc0d6b1507ca4c5d11 Mon Sep 17 00:00:00 2001 From: Joshua Feingold Date: Fri, 26 Sep 2025 15:44:12 -0500 Subject: [PATCH 04/13] @W-18964528@ Added test coverage for run_code_analyzer --- .../mcp/test/evals/run_code_analyzer.eval.ts | 23 +++++++++++++++++++ packages/mcp/test/evals/utils.ts | 3 +++ .../fixtures/sample-targets/SampleTarget1.cls | 9 ++++++++ 3 files changed, 35 insertions(+) create mode 100644 packages/mcp/test/evals/run_code_analyzer.eval.ts create mode 100644 packages/mcp/test/fixtures/sample-targets/SampleTarget1.cls diff --git a/packages/mcp/test/evals/run_code_analyzer.eval.ts b/packages/mcp/test/evals/run_code_analyzer.eval.ts new file mode 100644 index 00000000..edb617bf --- /dev/null +++ b/packages/mcp/test/evals/run_code_analyzer.eval.ts @@ -0,0 +1,23 @@ +import path from "node:path"; +import {fileURLToPath} from "node:url"; +import { describeEval } from 'vitest-evals'; +import { TaskRunner } from './utils.js'; + +const __filename = fileURLToPath(import.meta.url); +const __dirname = path.dirname(__filename); + +const pathToTarget: string = path.join(__dirname, '..', 'fixtures', 'sample-targets', 'SampleTarget1.cls'); + +describeEval('run_code_analyzer', { + data: async () => [{ + input: `Run code analysis against ${pathToTarget}, and tell me the number of violations in that file using the response format "There are X violations".` , + expected: [6] + }], + task: TaskRunner(), + scorers: [(opts: {output: string, expected: number}) => { + const score: number = opts.output === `There are ${opts.expected} violations.` ? 
1 : 0;
+    return {score};
+  }],
+  threshold: 0.9,
+  timeout: 60_000
+});
\ No newline at end of file
diff --git a/packages/mcp/test/evals/utils.ts b/packages/mcp/test/evals/utils.ts
index 38717654..21007c82 100644
--- a/packages/mcp/test/evals/utils.ts
+++ b/packages/mcp/test/evals/utils.ts
@@ -80,11 +80,14 @@ export function TaskRunner(model: LanguageModel = defaultModel) {
 
 export function outputIncludesExpectationArray(opts: {input: string, output: string, expected: string[]}) {
   let score: number = 0;
+  //console.log(`output is ${opts.output}`);
   const increment: number = 1/opts.expected.length;
   for (const expected of opts.expected) {
     if (opts.output.toLowerCase().includes(expected.toLowerCase())) {
+      //console.log(`contained ${expected}, incrementing`);
       score += increment;
     }
+    //console.log(`score is now ${score}`)
   }
   return {
     score
diff --git a/packages/mcp/test/fixtures/sample-targets/SampleTarget1.cls b/packages/mcp/test/fixtures/sample-targets/SampleTarget1.cls
new file mode 100644
index 00000000..30cec28b
--- /dev/null
+++ b/packages/mcp/test/fixtures/sample-targets/SampleTarget1.cls
@@ -0,0 +1,9 @@
+public class SampleTarget1 {
+  public static boolean doSomething() {
+    Integer i = 0;
+    for (i = 0; i < 10; i++) {
+      Account[] accs = [SELECT Name FROM Account WITH SYSTEM_MODE];
+    }
+    return false;
+  }
+}
\ No newline at end of file

From 894aed734640094d8f8b8fd265090e895c21b004 Mon Sep 17 00:00:00 2001
From: Joshua Feingold
Date: Mon, 29 Sep 2025 10:12:51 -0500
Subject: [PATCH 05/13] @W-18964528@ Added GHA for yarn:eval script

---
 .github/workflows/eval-e2e.yml | 33 +++++++++++++++++++++++++++++++++
 packages/mcp/package.json      | 11 +++++++++++
 2 files changed, 44 insertions(+)
 create mode 100644 .github/workflows/eval-e2e.yml

diff --git a/.github/workflows/eval-e2e.yml b/.github/workflows/eval-e2e.yml
new file mode 100644
index 00000000..823f4a72
--- /dev/null
+++ b/.github/workflows/eval-e2e.yml
@@ -0,0 +1,33 @@
+on:
+  workflow_call:
+    inputs:
+      os:
+        required: false
+        description: "runs-on property, ex: ubuntu-latest, windows-latest"
+        type: string
+        default: "ubuntu-latest"
+
+jobs:
+  eval:
+    name: 'yarn:eval'
+    runs-on: ${{ inputs.os }}
+    steps:
+      - name: Configure git longpaths if on Windows
+        if: ${{ runner.os == 'Windows' }}
+        run: git config --system core.longpaths true
+      - uses: actions/checkout@v4
+      - uses: actions/setup-node@v4
+        with:
+          node-version: lts/*
+          cache: yarn
+      - uses: actions/setup-java@v4
+        with:
+          distribution: 'temurin'
+          java-version: '11'
+      - run: yarn
+      - run: yarn build
+      - name: Eval-based E2E tests
+        shell: bash
+        run: |
+          cd packages/mcp
+          yarn test:eval
diff --git a/packages/mcp/package.json b/packages/mcp/package.json
index e7aec4da..beee0e60 100644
--- a/packages/mcp/package.json
+++ b/packages/mcp/package.json
@@ -18,6 +18,7 @@
     "lint": "wireit",
     "start": "yarn build && npm link && mcp-inspector sf-mcp-server",
     "test": "wireit",
+    "test:eval": "wireit",
     "test:only": "wireit"
   },
   "repository": "salesforcecli/mcp",
@@ -130,6 +131,16 @@
     ],
     "output": []
   },
+  "test:eval": {
+    "command": "vitest run",
+    "files": [
+      "test/**/*.eval.ts"
+    ],
+    "dependencies": [
+      "test:compile"
+    ],
+    "output": []
+  },
   "test": {
     "command": "nyc mocha \"test/**/*.test.ts\"",
     "env": {

From 85825a0fbf8a7e20edfbc38208709f4f30bb1880 Mon Sep 17 00:00:00 2001
From: Cristian Dominguez
Date: Thu, 2 Oct 2025 17:18:25 -0300
Subject: [PATCH 06/13] test: add tool prediction scorer for light evals

---
 packages/mcp/test/evals/deploy.eval.ts       |  85 +++++++++
 packages/mcp/test/evals/utils.ts             |  14 +-
 .../mcp/test/utils/toolPredictionScorer.ts   | 175 ++++++++++++++++++
 3 files changed, 273 insertions(+), 1 deletion(-)
 create mode 100644 packages/mcp/test/evals/deploy.eval.ts
 create mode 100644 packages/mcp/test/utils/toolPredictionScorer.ts

diff --git a/packages/mcp/test/evals/deploy.eval.ts b/packages/mcp/test/evals/deploy.eval.ts
new file mode 100644
index 00000000..98de1860
--- /dev/null
+++ b/packages/mcp/test/evals/deploy.eval.ts
@@ -0,0 +1,85 @@
+/*
+ * Copyright 2025, Salesforce, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+import { describeEval } from 'vitest-evals';
+import { ToolPredictionScorer } from '../utils/toolPredictionScorer.js';
+import { NoOpTaskRunner } from './utils.js';
+
+describeEval('deploy', {
+  data: async () => [
+    {
+      input: 'Deploy this file to my default org and run all apex tests in deployment',
+      expectedTools: [
+        {
+          name: 'get_username',
+          arguments: {
+            directory: process.env.SF_EVAL_PROMPT_PROJECT_DIR,
+            defaultTargetOrg: true
+          },
+        },
+        {
+          name: 'deploy_metadata',
+          arguments: {
+            sourceDir: [process.env.SF_EVAL_PROMPT_OPEN_FILEPATH],
+            directory: process.env.SF_EVAL_PROMPT_PROJECT_DIR,
+            apexTestLevel: 'RunAllTestsInOrg',
+            usernameOrAlias: "ebikes-default-org"
+          },
+        },
+      ],
+    },
+    {
+      input: 'Deploy this project to my ebikes org',
+      expectedTools: [
+        {
+          name: 'deploy_metadata',
+          arguments: {
+            sourceDir: [process.env.SF_EVAL_PROMPT_OPEN_FILEPATH],
+            directory: process.env.SF_EVAL_PROMPT_PROJECT_DIR,
+            apexTests: ['GeocodingServiceTest']
+            // apexTestLevel: 'RunAllTestsInOrg',
+          },
+        },
+      ],
+    },
+    {
+      input: 'Deploy this file and run the GeocodingServiceTest tests',
+      expectedTools: [
+        {
+          name: 'get_username',
+          arguments: {
+            directory: process.env.SF_EVAL_PROMPT_PROJECT_DIR,
+            // TODO: can we do a "fuzzy" match per tool param?
+            // defaultTargetOrg: true
+          },
+        },
+        {
+          name: 'deploy_metadata',
+          arguments: {
+            sourceDir: [process.env.SF_EVAL_PROMPT_OPEN_FILEPATH],
+            directory: process.env.SF_EVAL_PROMPT_PROJECT_DIR,
+            apexTests: ['GeocodingServiceTest']
+            // apexTestLevel: 'RunAllTestsInOrg',
+          },
+        },
+      ],
+    }
+  ],
+  task: NoOpTaskRunner(),
+  scorers: [ToolPredictionScorer()],
+  // TODO(cristian): revise this based on how flexible our params are around get_username and document default
+  threshold: 1.0,
+  timeout: 30_000,
+});
diff --git a/packages/mcp/test/evals/utils.ts b/packages/mcp/test/evals/utils.ts
index 21007c82..d3d057a9 100644
--- a/packages/mcp/test/evals/utils.ts
+++ b/packages/mcp/test/evals/utils.ts
@@ -28,7 +28,9 @@ You are a general purpose LLM-based Agent. Your purpose is to answer the user's quer
 - You should ONLY use the tools available to answer the user's query.
 - Use as few tool calls as possible to get to the answer.
 - Using multiple tool calls to get to the answer is allowed when needed.
-The current open project dir is "${process.env.SF_EVAL_PROMPT_PROJECT_DIR}"
+
+Current open workspace: "${process.env.SF_EVAL_PROMPT_PROJECT_DIR}"
+Current open file: "${process.env.SF_EVAL_PROMPT_OPEN_FILEPATH}"
 `;
 
 // Supported models: https://ai.google.dev/gemini-api/docs/models
@@ -155,4 +157,14 @@ export function Factuality(model: LanguageModel = defaultModel) {
       },
     };
   };
+}
+
+export function NoOpTaskRunner() {
+  return async function NoOpTaskRunner(input: string) {
+    // Just return the input as the result, no tool execution
+    return {
+      result: input,
+      toolCalls: [],
+    };
+  };
 }
\ No newline at end of file
diff --git a/packages/mcp/test/utils/toolPredictionScorer.ts b/packages/mcp/test/utils/toolPredictionScorer.ts
new file mode 100644
index 00000000..7cfba2fa
--- /dev/null
+++ b/packages/mcp/test/utils/toolPredictionScorer.ts
@@ -0,0 +1,175 @@
+import { experimental_createMCPClient, generateObject, type LanguageModel } from 'ai';
+import { Experimental_StdioMCPTransport } from 'ai/mcp-stdio';
+import { google } from '@ai-sdk/google';
+import * as path from 'node:path';
+import { z } from 'zod';
+
+// Supported models: https://ai.google.dev/gemini-api/docs/models
+const defaultModel = google('gemini-2.5-flash');
+
+let cachedTools: string[] | null = null;
+
+const predictionSchema = z.object({
+  score: z.number().min(0).max(1).describe('Score from 0 to 1'),
+  rationale: z.string().describe('Explanation of the score'),
+  predictedTools: z
+    .array(
+      z.object({
+        name: z.string(),
+        arguments: z.unknown().optional(),
+        // arguments: z.record(z.any()).optional().default({}),
+      })
+    )
+    .describe('What tools the AI would likely call'),
+});
+
+interface ToolPredictionScorerOptions {
+  input: string;
+  output: string;
+  expectedTools?: ExpectedToolCall[];
+  result?: any;
+}
+
+export interface ExpectedToolCall {
+  name: string;
+  arguments: Record<string, unknown>;
+}
+
+export function ToolPredictionScorer(model: LanguageModel = defaultModel) {
+  return async function ToolPredictionScorer(opts: ToolPredictionScorerOptions) {
+    // If expectedTools is not defined, skip this scorer
+    if (!opts.expectedTools) {
+      return {
+        score: null,
+        metadata: {
+          rationale: 'Skipped: No expectedTools defined for this test case',
+        },
+      };
+    }
+
+    const expectedTools = opts.expectedTools;
+
+    // Get available tools from the MCP server
+    // TODO(cristian): validate that all expected tools are included here, throw if not.
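+    // getAvailableTools() spawns the MCP server over stdio on its first call and caches the
+    // flattened name/description/parameter list module-wide, so later scorer invocations in
+    // the same eval run reuse that snapshot instead of restarting the server.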
+    const AVAILABLE_TOOLS = await getAvailableTools();
+
+    // Generate a description of the expected tools for the prompt
+    const expectedDescription = expectedTools
+      .map((tool) => `- ${tool.name} with arguments: ${JSON.stringify(tool.arguments)}`)
+      .join('\n');
+
+    const { object } = await generateObject({
+      model,
+      prompt: generateSystemPrompt(AVAILABLE_TOOLS, opts.input, expectedDescription),
+      maxRetries: 0,
+      schema: predictionSchema,
+      experimental_telemetry: {
+        isEnabled: false,
+      },
+    });
+    // console.log('*'.repeat(process.stdout.columns))
+    // console.log(AVAILABLE_TOOLS)
+    // console.log('*'.repeat(process.stdout.columns))
+    // console.log(JSON.stringify(object,null,2))
+    // console.log('*'.repeat(process.stdout.columns))
+
+    return {
+      score: object.score,
+      metadata: {
+        rationale: object.rationale,
+        predictedTools: object.predictedTools,
+        expectedTools: expectedTools,
+      },
+    };
+  };
+}
+
+async function getAvailableTools(): Promise<string[]> {
+  if (cachedTools) {
+    return cachedTools;
+  }
+
+  const client = await experimental_createMCPClient({
+    transport: new Experimental_StdioMCPTransport({
+      command: 'node',
+      args: [
+        path.join(import.meta.dirname, '../../../mcp/bin/run.js'),
+        '--toolsets',
+        'orgs,metadata,testing',
+        '-o',
+        'DEFAULT_TARGET_ORG',
+        '--no-telemetry',
+        '--allow-non-ga-tools',
+      ],
+    }),
+  });
+
+  // Discover available tools
+  const toolsMap = await client.tools();
+
+  // TODO(cristian): this should include full tool desc and params
+  // Convert tools to the format expected by the scorer
+  cachedTools = Object.entries(toolsMap).map(([name, tool]) => {
+    // Extract the first line of description for a concise summary
+    const shortDescription = tool.description || '';
+    const params = tool.parameters;
+    return `${name} - ${shortDescription}\n${JSON.stringify(params)}`;
+  });
+
+  // Clean up
+  await client.close();
+  // console.log(JSON.stringify(cachedTools,null,2));
+
+  return cachedTools;
+}
+
+function generateSystemPrompt(availableTools: string[], task: string, expectedDescription: string): string {
+  return `
+You are evaluating whether an AI assistant with access to Salesforce DX MCP tools would make the correct tool calls for a given task.
+
+[AVAILABLE TOOLS]
+${availableTools.join('\n')}
+
+[TASK]
+${task}
+
+[EXPECTED TOOL CALLS]
+${expectedDescription}
+
+When using a tool, follow the JSON schema very carefully and make sure to include ALL required properties.
+
+Your goal is to evaluate whether the AI assistant would behave correctly based on:
+- The user’s task (intent)
+- The list of available tools and their documented behavior
+- The arguments required by each tool
+
+IMPORTANT:
+- The provided [EXPECTED TOOL CALLS] represents what *should* happen in this specific test case, *assuming it is valid*.
+- **If the expected tools are not appropriate for the task or violate the available tool definitions (e.g., wrong tool for the intent, required params missing, invalid params present), score based on correctness, not blind matching.**
+
+STRICT VALIDATION RULES:
+1. You may ONLY use tools listed under [AVAILABLE TOOLS]. If an expected tool is not listed, the test is invalid — score accordingly.
+2. Match the user’s task with the most appropriate tool(s) based on the tool definitions and parameter requirements.
+3.
Validate each predicted tool call: + - Tool name must be correct for the task + - All required arguments must be present + - No unexpected or invalid arguments + - Tool must be available in the [AVAILABLE TOOLS] list + +SCORING: +- 1.0: All predicted tool calls are correct for the task, use valid tools, and match the expected tool behavior exactly +- 0.8: Minor argument mismatches (e.g., extra but harmless params) +- 0.6: Correct tools used but wrong order or missing some arguments +- 0.3: Some correct tools but major issues (e.g. wrong tool order, invalid args) +- 0.0: Critical mistakes: wrong tools for the task, missing essential tools, or tools not in the available list + +NOTE: +- The goal is not to blindly reproduce the expected tool calls, but to validate whether the expected behavior is appropriate and executable given the available tools and the task. +- If the expected tool call includes incorrect tools or invalid arguments, reduce the score appropriately. + +Current open workspace: "${process.env.SF_EVAL_PROMPT_PROJECT_DIR}" +Current open file: "${process.env.SF_EVAL_PROMPT_OPEN_FILEPATH}" +`; +} From 172c5c3a6f49bfe44790a2c850cca513981fb3c2 Mon Sep 17 00:00:00 2001 From: Cristian Dominguez Date: Fri, 10 Oct 2025 09:08:59 -0300 Subject: [PATCH 07/13] chore: refactor + split eval tests --- .../src/tools/run_soql_query.ts | 2 +- packages/mcp/package.json | 1 - .../evals/describe_code_analyzer_rule.eval.ts | 13 -- .../deploy_metadata.eval.ts} | 24 ++- .../discoverability/run_apex_test.eval.ts | 61 +++++++ .../discoverability/run_soql_query.eval.ts | 54 ++++++ .../test/evals/e2e/deploy_metadata.eval.ts | 125 +++++++++++++ .../e2e/describe_code_analyzer_rule.eval.ts | 32 ++++ .../test/evals/e2e/run_code_analyzer.eval.ts | 27 +++ .../mcp/test/evals/e2e/run_soql_query.eval.ts | 129 +++++++++++++ .../mcp/test/evals/run_code_analyzer.eval.ts | 23 --- packages/mcp/test/evals/sf-query-org.eval.ts | 52 ------ packages/mcp/test/evals/utils.ts | 170 ------------------ packages/mcp/test/evals/utils/runners.ts | 159 ++++++++++++++++ .../test/evals/utils/scorers/factuality.ts | 68 +++++++ .../utils/scorers}/toolPredictionScorer.ts | 14 +- packages/mcp/vitest.config.ts | 10 +- 17 files changed, 674 insertions(+), 290 deletions(-) delete mode 100644 packages/mcp/test/evals/describe_code_analyzer_rule.eval.ts rename packages/mcp/test/evals/{deploy.eval.ts => discoverability/deploy_metadata.eval.ts} (73%) create mode 100644 packages/mcp/test/evals/discoverability/run_apex_test.eval.ts create mode 100644 packages/mcp/test/evals/discoverability/run_soql_query.eval.ts create mode 100644 packages/mcp/test/evals/e2e/deploy_metadata.eval.ts create mode 100644 packages/mcp/test/evals/e2e/describe_code_analyzer_rule.eval.ts create mode 100644 packages/mcp/test/evals/e2e/run_code_analyzer.eval.ts create mode 100644 packages/mcp/test/evals/e2e/run_soql_query.eval.ts delete mode 100644 packages/mcp/test/evals/run_code_analyzer.eval.ts delete mode 100644 packages/mcp/test/evals/sf-query-org.eval.ts delete mode 100644 packages/mcp/test/evals/utils.ts create mode 100644 packages/mcp/test/evals/utils/runners.ts create mode 100644 packages/mcp/test/evals/utils/scorers/factuality.ts rename packages/mcp/test/{utils => evals/utils/scorers}/toolPredictionScorer.ts (89%) diff --git a/packages/mcp-provider-dx-core/src/tools/run_soql_query.ts b/packages/mcp-provider-dx-core/src/tools/run_soql_query.ts index b68f624d..d78ae226 100644 --- a/packages/mcp-provider-dx-core/src/tools/run_soql_query.ts +++ 
b/packages/mcp-provider-dx-core/src/tools/run_soql_query.ts @@ -37,7 +37,7 @@ export const queryOrgParamsSchema = z.object({ query: z.string().describe('SOQL query to run'), usernameOrAlias: usernameOrAliasParam, directory: directoryParam, - useToolingApi: useToolingApiParam, + useToolingApi: useToolingApiParam.describe('Use the Tooling API. Always set to true when querying a tooling sobject.'), }); type InputArgs = z.infer; diff --git a/packages/mcp/package.json b/packages/mcp/package.json index beee0e60..5d7a64a7 100644 --- a/packages/mcp/package.json +++ b/packages/mcp/package.json @@ -56,7 +56,6 @@ }, "devDependencies": { "@ai-sdk/google": "^1.2.22", - "@ai-sdk/openai": "^1.3.23", "@salesforce/cli-plugins-testkit": "^5.3.39", "@salesforce/dev-config": "^4.3.2", "@salesforce/prettier-config": "^0.0.3", diff --git a/packages/mcp/test/evals/describe_code_analyzer_rule.eval.ts b/packages/mcp/test/evals/describe_code_analyzer_rule.eval.ts deleted file mode 100644 index fa92cee8..00000000 --- a/packages/mcp/test/evals/describe_code_analyzer_rule.eval.ts +++ /dev/null @@ -1,13 +0,0 @@ -import { describeEval } from 'vitest-evals'; -import { TaskRunner, outputIncludesExpectationArray } from './utils.js'; - -describeEval('describe_code_analyzer_rule', { - data: async () => [{ - input: 'tell me the tags that are associated with the Code Analysis Rule named VFUnescapeEl, which is a rule for the pmd engine', - expected: ['Recommended', 'Security', 'Visualforce'] - }], - task: TaskRunner(), - scorers: [outputIncludesExpectationArray], - threshold: 0.9, - timeout: 60_000 -}); \ No newline at end of file diff --git a/packages/mcp/test/evals/deploy.eval.ts b/packages/mcp/test/evals/discoverability/deploy_metadata.eval.ts similarity index 73% rename from packages/mcp/test/evals/deploy.eval.ts rename to packages/mcp/test/evals/discoverability/deploy_metadata.eval.ts index 98de1860..556217b7 100644 --- a/packages/mcp/test/evals/deploy.eval.ts +++ b/packages/mcp/test/evals/discoverability/deploy_metadata.eval.ts @@ -14,8 +14,8 @@ * limitations under the License. */ import { describeEval } from 'vitest-evals'; -import { ToolPredictionScorer } from '../utils/toolPredictionScorer.js'; -import { NoOpTaskRunner } from './utils.js'; +import { NoOpTaskRunner } from '../utils/runners.js'; +import { ToolPredictionScorer } from '../utils/scorers/toolPredictionScorer.js'; describeEval('deploy', { data: async () => [ @@ -26,7 +26,7 @@ describeEval('deploy', { name: 'get_username', arguments: { directory: process.env.SF_EVAL_PROMPT_PROJECT_DIR, - defaultTargetOrg: true + defaultTargetOrg: true, }, }, { @@ -35,7 +35,7 @@ describeEval('deploy', { sourceDir: [process.env.SF_EVAL_PROMPT_OPEN_FILEPATH], directory: process.env.SF_EVAL_PROMPT_PROJECT_DIR, apexTestLevel: 'RunAllTestsInOrg', - usernameOrAlias: "ebikes-default-org" + usernameOrAlias: 'ebikes-default-org', }, }, ], @@ -46,10 +46,8 @@ describeEval('deploy', { { name: 'deploy_metadata', arguments: { - sourceDir: [process.env.SF_EVAL_PROMPT_OPEN_FILEPATH], + usernameOrAlias: 'ebikes', directory: process.env.SF_EVAL_PROMPT_PROJECT_DIR, - apexTests: ['GeocodingServiceTest'] - // apexTestLevel: 'RunAllTestsInOrg', }, }, ], @@ -58,28 +56,28 @@ describeEval('deploy', { input: 'Deploy this file and run the GeocodingServiceTest tests', expectedTools: [ { + // user doesn't specify which org to deploy to -> discover it via `get_username` name: 'get_username', arguments: { directory: process.env.SF_EVAL_PROMPT_PROJECT_DIR, - // TODO: can we do a "fuzzy" match per tool param? 
-            // defaultTargetOrg: true
+            defaultTargetOrg: true,
           },
         },
         {
           name: 'deploy_metadata',
           arguments: {
+            usernameOrAlias: 'default-org',
             sourceDir: [process.env.SF_EVAL_PROMPT_OPEN_FILEPATH],
             directory: process.env.SF_EVAL_PROMPT_PROJECT_DIR,
-            apexTests: ['GeocodingServiceTest']
-            // apexTestLevel: 'RunAllTestsInOrg',
+            // IMPORTANT: there's a `run_apex_test` available but for these "run test during deployment" scenarios we want to ensure they are only run via `deploy_metadata`, it's a pretty common operation for an agentic loop (test failures rollback deployment)
+            apexTests: ['GeocodingServiceTest'],
           },
         },
       ],
-    }
+    },
   ],
   task: NoOpTaskRunner(),
   scorers: [ToolPredictionScorer()],
-  // TODO(cristian): revise this based on how flexible our params are around get_username and document default
   threshold: 1.0,
   timeout: 30_000,
 });
diff --git a/packages/mcp/test/evals/discoverability/run_apex_test.eval.ts b/packages/mcp/test/evals/discoverability/run_apex_test.eval.ts
new file mode 100644
index 00000000..ee7f351d
--- /dev/null
+++ b/packages/mcp/test/evals/discoverability/run_apex_test.eval.ts
@@ -0,0 +1,61 @@
+/*
+ * Copyright 2025, Salesforce, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+import { describeEval } from 'vitest-evals';
+import { NoOpTaskRunner } from '../utils/runners.js';
+import { ToolPredictionScorer } from '../utils/scorers/toolPredictionScorer.js';
+
+describeEval('run_apex_test', {
+  data: async () => [
+    {
+      input: 'Run the GeocodingServiceTest and FileUtilitiesTest tests in the dreamhouse org',
+      expectedTools: [
+        {
+          name: 'run_apex_test',
+          arguments: {
+            usernameOrAlias: 'dreamhouse',
+            classNames: ['GeocodingServiceTest', 'FileUtilitiesTest'],
+            testLevel: 'RunSpecifiedTests',
+            directory: process.env.SF_EVAL_PROMPT_PROJECT_DIR,
+          },
+        },
+      ],
+    },
+    {
+      input: 'Run all apex tests in the org',
+      expectedTools: [
+        {
+          name: 'get_username',
+          arguments: {
+            directory: process.env.SF_EVAL_PROMPT_PROJECT_DIR,
+            defaultTargetOrg: true,
+          },
+        },
+        {
+          name: 'run_apex_test',
+          arguments: {
+            usernameOrAlias: 'default-org',
+            testLevel: 'RunAllTestsInOrg',
+            directory: process.env.SF_EVAL_PROMPT_PROJECT_DIR,
+          },
+        },
+      ],
+    },
+  ],
+  task: NoOpTaskRunner(),
+  scorers: [ToolPredictionScorer()],
+  threshold: 1.0,
+  timeout: 30_000,
+});
diff --git a/packages/mcp/test/evals/discoverability/run_soql_query.eval.ts b/packages/mcp/test/evals/discoverability/run_soql_query.eval.ts
new file mode 100644
index 00000000..b735bce1
--- /dev/null
+++ b/packages/mcp/test/evals/discoverability/run_soql_query.eval.ts
@@ -0,0 +1,54 @@
+/*
+ * Copyright 2025, Salesforce, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+import { describeEval } from 'vitest-evals';
+import { NoOpTaskRunner } from '../utils/runners.js';
+import { ToolPredictionScorer } from '../utils/scorers/toolPredictionScorer.js';
+
+describeEval('run_soql_query', {
+  data: async () => [
+    {
+      input: 'List the name of the Property__c records in my org, ordered in ascending order by their name.',
+      expectedTools: [
+        {
+          name: 'run_soql_query',
+          arguments: {
+            query: 'SELECT Name FROM Property__c ORDER BY Name ASC',
+            usernameOrAlias: 'ebikes',
+            directory: process.env.SF_EVAL_PROMPT_PROJECT_DIR,
+          },
+        },
+      ],
+    },
+    {
+      input: 'Get the coverage of the GeocodingService apex class, you can query the ApexCodeCoverage tooling object',
+      expectedTools: [
+        {
+          name: 'run_soql_query',
+          arguments: {
+            usernameOrAlias: 'ebikes',
+            query: "SELECT Coverage FROM ApexCodeCoverage WHERE ApexClassOrTriggerId = '01pD000000066GR'",
+            useToolingApi: true,
+            directory: process.env.SF_EVAL_PROMPT_PROJECT_DIR,
+          },
+        },
+      ],
+    },
+  ],
+  task: NoOpTaskRunner(),
+  scorers: [ToolPredictionScorer()],
+  threshold: 1.0,
+  timeout: 30_000,
+});
diff --git a/packages/mcp/test/evals/e2e/deploy_metadata.eval.ts b/packages/mcp/test/evals/e2e/deploy_metadata.eval.ts
new file mode 100644
index 00000000..77ad5963
--- /dev/null
+++ b/packages/mcp/test/evals/e2e/deploy_metadata.eval.ts
@@ -0,0 +1,125 @@
+/*
+ * Copyright 2025, Salesforce, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+import path from 'node:path';
+import { afterAll, beforeAll } from 'vitest';
+import { TestSession } from '@salesforce/cli-plugins-testkit';
+import { describeEval, ToolCallScorer } from 'vitest-evals';
+import { TaskRunner } from '../utils/runners.js';
+import { Factuality } from '../utils/scorers/factuality.js';
+
+let testSession: TestSession;
+let orgUsername: string;
+let projectDir: string;
+let currentOpenFile: string;
+
+beforeAll(async () => {
+  testSession = await TestSession.create({
+    project: { gitClone: 'https://github.com/trailheadapps/dreamhouse-lwc' },
+    scratchOrgs: [{ setDefault: true, config: path.join('config', 'project-scratch-def.json') }],
+    devhubAuthStrategy: 'AUTO',
+  });
+
+  projectDir = testSession.project.dir;
+  currentOpenFile = path.join(projectDir, 'force-app', 'main', 'default', 'classes', 'GeocodingServiceTest.cls');
+
+  // get default scratch org username
+  orgUsername = [...testSession.orgs.keys()][0];
+}, 600_000);
+
+afterAll(async () => {
+  await testSession.clean();
+});
+
+describeEval('deploy_metadata', {
+  data: async () => [
+    {
+      input:
+        'Deploy this project and run all Apex tests, then assign the dreamhouse permset and summarize the apex test results.',
+      expected: 'It should have successfully deployed the project and executed all 11 tests without failures',
+      expectedTools: (() => [
+        {
+          name: 'get_username',
+          arguments: {
+            defaultTargetOrg: true,
+            defaultDevHub: false,
+            directory: projectDir,
+          },
+        },
+        {
+          name: 'deploy_metadata',
+          arguments: {
+            apexTestLevel: 'RunAllTestsInOrg',
+            usernameOrAlias: orgUsername,
+            directory: projectDir,
+          },
+        },
+        {
+          name: 'assign_permission_set',
+          arguments: {
+            permissionSetName: 'dreamhouse',
+            usernameOrAlias: orgUsername,
+            directory: projectDir,
+          },
+        },
+      ])(),
+    },
+    {
+      input: 'Deploy this file and run the GeocodingServiceTest tests, then summarize the apex test results.',
+      expected:
+        'It should have deployed 1 component (GeocodingServiceTest class) and successfully executed the "GeocodingServiceTest.successResponse", "GeocodingServiceTest.blankAddress" and "GeocodingServiceTest.errorResponse" tests.',
+      expectedTools: (() => [
+        {
+          name: 'get_username',
+          arguments: {
+            defaultTargetOrg: true,
+            defaultDevHub: false,
+            directory: projectDir,
+          },
+        },
+        {
+          name: 'deploy_metadata',
+          arguments: {
+            apexTestLevel: 'RunAllTestsInOrg',
+            apexTests: ['GeocodingServiceTest'],
+            sourceDir: [currentOpenFile],
+            usernameOrAlias: orgUsername,
+            directory: projectDir,
+          },
+        },
+      ])(),
+    },
+  ],
+  task: (input: string) =>
+    TaskRunner({
+      promptOptions: {
+        currentOpenFile,
+        currentOpenWorkspace: projectDir,
+      },
+    })(input),
+  scorers: [
+    Factuality(),
+    ToolCallScorer({
+      ordered: true,
+      params: 'strict',
+    }),
+  ],
+  threshold: 0.8,
+  timeout: 600_000,
+});
diff --git a/packages/mcp/test/evals/e2e/describe_code_analyzer_rule.eval.ts b/packages/mcp/test/evals/e2e/describe_code_analyzer_rule.eval.ts
new file mode 100644
index 00000000..75a6fed8
--- /dev/null
+++ b/packages/mcp/test/evals/e2e/describe_code_analyzer_rule.eval.ts
@@ -0,0 +1,32 @@
+import { describeEval } from 'vitest-evals';
+import { TaskRunner } from '../utils/runners.js';
+
+describeEval('describe_code_analyzer_rule', {
+  data: async () => [
+    {
+      input:
+        'tell me the tags that are associated with the Code Analysis Rule named VFUnescapeEl, which is a rule for the pmd engine',
+      expected: ['Recommended', 'Security', 'Visualforce'],
+    },
+  ],
+  task: TaskRunner(),
+  scorers: [outputIncludesExpectationArray],
+  threshold: 0.9,
+  timeout: 60_000,
+});
+
+export function outputIncludesExpectationArray(opts: { input: string; output: string; expected: string[] }) {
+  let score: number = 0;
+  //console.log(`output is ${opts.output}`);
+  const increment: number = 1 / opts.expected.length;
+  for (const expected of opts.expected) {
+    if (opts.output.toLowerCase().includes(expected.toLowerCase())) {
+      //console.log(`contained ${expected}, incrementing`);
+      score += increment;
+    }
+    //console.log(`score is now ${score}`)
+  }
+  return {
+    score,
+  };
+}
diff --git a/packages/mcp/test/evals/e2e/run_code_analyzer.eval.ts b/packages/mcp/test/evals/e2e/run_code_analyzer.eval.ts
new file mode 100644
index 00000000..55dd3c0b
--- /dev/null
+++ b/packages/mcp/test/evals/e2e/run_code_analyzer.eval.ts
@@ -0,0 +1,27 @@
+import path from 'node:path';
+import { fileURLToPath } from 'node:url';
+import { describeEval } from 'vitest-evals';
+import { TaskRunner } from '../utils/runners.js';
+
+const __filename = fileURLToPath(import.meta.url);
+const __dirname = path.dirname(__filename);
+
+const pathToTarget: string = path.join(__dirname, '..', '..', 'fixtures', 'sample-targets', 'SampleTarget1.cls');
+
+describeEval('run_code_analyzer', {
+  data: async () => [
+    {
+      input: `Run code analysis against ${pathToTarget}, and tell me the number of violations in that file using the response format "There are X violations".`,
+      expected: [6],
+    },
+  ],
+  task: TaskRunner(),
+  scorers: [
+    (opts: { output: string; expected: number }) => {
+      const score: number = opts.output === `There are ${opts.expected} violations.` ? 1 : 0;
+      return { score };
+    },
+  ],
+  threshold: 0.9,
+  timeout: 60_000,
+});
diff --git a/packages/mcp/test/evals/e2e/run_soql_query.eval.ts b/packages/mcp/test/evals/e2e/run_soql_query.eval.ts
new file mode 100644
index 00000000..2399ba28
--- /dev/null
+++ b/packages/mcp/test/evals/e2e/run_soql_query.eval.ts
@@ -0,0 +1,129 @@
+/*
+ * Copyright 2025, Salesforce, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+import path from 'node:path';
+import { afterAll, beforeAll } from 'vitest';
+import { execCmd, TestSession } from '@salesforce/cli-plugins-testkit';
+import { describeEval, ToolCallScorer } from 'vitest-evals';
+import { TaskRunner } from '../utils/runners.js';
+import { Factuality } from '../utils/scorers/factuality.js';
+
+let testSession: TestSession;
+let orgUsername: string;
+let projectDir: string;
+
+beforeAll(async () => {
+  testSession = await TestSession.create({
+    project: { gitClone: 'https://github.com/trailheadapps/dreamhouse-lwc' },
+    scratchOrgs: [{ setDefault: true, config: path.join('config', 'project-scratch-def.json') }],
+    devhubAuthStrategy: 'AUTO',
+  });
+
+  projectDir = testSession.project.dir;
+
+  await execCmd('project deploy start', {
+    cli: 'sf',
+    ensureExitCode: 0,
+    async: true,
+  });
+
+  await execCmd('org assign permset -n dreamhouse', {
+    cli: 'sf',
+    ensureExitCode: 0,
+    async: true,
+  });
+
+  await execCmd(`data tree import -p ${path.join(testSession.project.dir, 'data', 'sample-data-plan.json')}`, {
+    cli: 'sf',
+    ensureExitCode: 0,
+    async: true,
+  });
+
+  // get default scratch org username
+  orgUsername = [...testSession.orgs.keys()][0];
+}, 600_000);
+
+afterAll(async () => {
+  await testSession.clean();
+});
+
+describeEval('SOQL queries', {
+  data: async () => [
+    {
+      input: 'List the name of the Property__c records in my org, ordered in ascending order by their name.',
+      expected: `The response should include these records:
+Architectural Details
+City Living
+Contemporary City Living
+Contemporary Luxury
+Heart of Harvard Square
+Modern City Living
+Quiet Retreat
+Seaport District Retreat
+Stunning Colonial
+Stunning Victorian
+Ultimate Sophistication
+Waterfront in the City
+`,
+      // IMPORTANT:
+      // Get expected tools data at runtime rather than at module initialization time to be able to access
+      // test session context (set in the beforeAll hook).
+      //
+      // This is needed because `projectDir` and `orgUsername` are not initialized when declared, so we want to
+      // read them at test runtime.
+      expectedTools: (() => [
+        {
+          name: 'get_username',
+          arguments: {
+            defaultTargetOrg: true,
+            defaultDevHub: false,
+            directory: projectDir,
+          },
+        },
+        {
+          name: 'run_soql_query',
+          arguments: {
+            query: 'SELECT Name FROM Property__c ORDER BY Name ASC',
+            usernameOrAlias: orgUsername,
+            directory: projectDir,
+          },
+        },
+      ])(),
+    },
+  ],
+  // IMPORTANT:
+  // Create the task runner at runtime rather than at module initialization time to be able to access
+  // test session context (set in the beforeAll hook).
+  task: (input: string) =>
+    TaskRunner({
+      promptOptions: {
+        // not needed for this test
+        currentOpenFile: '',
+        currentOpenWorkspace: projectDir,
+      },
+    })(input),
+  scorers: [
+    Factuality(),
+    ToolCallScorer({
+      ordered: true,
+      // fuzzy to account for possible SOQL query diffs against the expected query (different clauses, casing, etc.)
+      params: 'fuzzy',
+    }),
+  ],
+  threshold: 0.8,
+  timeout: 300_000,
+});
diff --git a/packages/mcp/test/evals/run_code_analyzer.eval.ts b/packages/mcp/test/evals/run_code_analyzer.eval.ts
deleted file mode 100644
index edb617bf..00000000
--- a/packages/mcp/test/evals/run_code_analyzer.eval.ts
+++ /dev/null
@@ -1,23 +0,0 @@
-import path from "node:path";
-import {fileURLToPath} from "node:url";
-import { describeEval } from 'vitest-evals';
-import { TaskRunner } from './utils.js';
-
-const __filename = fileURLToPath(import.meta.url);
-const __dirname = path.dirname(__filename);
-
-const pathToTarget: string = path.join(__dirname, '..', 'fixtures', 'sample-targets', 'SampleTarget1.cls');
-
-describeEval('run_code_analyzer', {
-  data: async () => [{
-    input: `Run code analysis against ${pathToTarget}, and tell me the number of violations in that file using the response format "There are X violations".` ,
-    expected: [6]
-  }],
-  task: TaskRunner(),
-  scorers: [(opts: {output: string, expected: number}) => {
-    const score: number = opts.output === `There are ${opts.expected} violations.` ? 1 : 0;
-    return {score};
-  }],
-  threshold: 0.9,
-  timeout: 60_000
-});
\ No newline at end of file
diff --git a/packages/mcp/test/evals/sf-query-org.eval.ts b/packages/mcp/test/evals/sf-query-org.eval.ts
deleted file mode 100644
index ffc8e263..00000000
--- a/packages/mcp/test/evals/sf-query-org.eval.ts
+++ /dev/null
@@ -1,52 +0,0 @@
-/*
- * Copyright 2025, Salesforce, Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */ -import { describeEval } from 'vitest-evals'; -import { Factuality, TaskRunner } from './utils.js'; - -describeEval('SOQL queries', { - data: async () => [ - { - input: 'List the name of the Property__c records in my org, ordered in ascending order by their name.', - expected: `The response should include these records: -Architectural Details -City Living -Contemporary City Living -Contemporary Luxury -Heart of Harvard Square -Modern City Living -Quiet Retreat -Seaport District Retreat -Stunning Colonial -Stunning Victorian -Ultimate Sophistication -Waterfront in the City -`, - // expected: `The response should include these records: - // Sophisticated Urban Escape - // Metropolitan Elegance - // Vibrant City Sanctuary - // Downtown Dreamscape - // Sleek Urban Oasis - // Modern Metropole - // Luxe in the Loop - // `, - }, - ], - task: TaskRunner(), - scorers: [Factuality()], - threshold: 0.6, - timeout: 30_000, -}); \ No newline at end of file diff --git a/packages/mcp/test/evals/utils.ts b/packages/mcp/test/evals/utils.ts deleted file mode 100644 index d3d057a9..00000000 --- a/packages/mcp/test/evals/utils.ts +++ /dev/null @@ -1,170 +0,0 @@ -/* - * Copyright 2025, Salesforce, Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -import * as path from 'node:path'; -import { google } from '@ai-sdk/google'; -import { experimental_createMCPClient, generateObject, streamText, type LanguageModel } from 'ai'; -import { Experimental_StdioMCPTransport } from 'ai/mcp-stdio'; -import { z } from 'zod'; - -// This prompt intends to represent what an IDE context window could look like, some specifics: -// -// * Current open project directory -// * Current open file -const SYSTEM_PROMPT = `You are an assistant responsible for evaluating the results of calling various tools. -You a general purpose LLM-based Agent. Your purpose is to answer the user's query using the tools provided. -- You should ONLY use the tools available to answer the user's query. -- Use as few tool calls as possible to get to the answer. -- Using multiple tool calls to get to the answer is allowed when needed. 
- -Current open workspace: "${process.env.SF_EVAL_PROMPT_PROJECT_DIR}" -Current open file: "${process.env.SF_EVAL_PROMPT_OPEN_FILEPATH} -`; - -// Supported models: https://ai.google.dev/gemini-api/docs/models -const defaultModel = google('gemini-2.5-flash'); - -export function TaskRunner(model: LanguageModel = defaultModel) { - return async function TaskRun(input: string) { - const mcpClient = await experimental_createMCPClient({ - transport: new Experimental_StdioMCPTransport({ - command: 'node', - args: [path.join(import.meta.dirname, '../../bin/run.js'), '--toolsets', 'all', '-o', 'DEFAULT_TARGET_ORG', '--no-telemetry', '--allow-non-ga-tools'] - }), - }); - - const tools = await mcpClient.tools(); - - try { - const result = streamText({ - model, - tools, - system: SYSTEM_PROMPT, - prompt: input, - maxRetries: 1, - maxSteps: 10, - experimental_telemetry: { - isEnabled: false, - }, - onError: (error) => { - // eslint-disable-next-line no-console - console.error(error); - }, - }); - - // TODO: we don't need text streaming here, maybe switch to `generateText`? - // eslint-disable-next-line - for await (const _ of result.fullStream) { - } - - return await result.text; - } catch (error) { - // eslint-disable-next-line no-console - console.error(error); - throw error; - } finally { - await mcpClient.close(); - } - }; -} - -export function outputIncludesExpectationArray(opts: {input: string, output: string, expected: string[]}) { - let score: number = 0; - //console.log(`output is ${opts.output}`); - const increment: number = 1/opts.expected.length; - for (const expected of opts.expected) { - if (opts.output.toLowerCase().includes(expected.toLowerCase())) { - //console.log(`contained ${expected}, icnrementing`); - score += increment; - } - //console.log(`score is now ${score}`) - } - return { - score - } -} - -/** - * A Factuality checker utilizing the `ai` SDK based on the implementation in `autoevals`. - * - * ``` - * import { openai } from "@ai-sdk/openai"; - * - * scorers: [Factuality(openai("gpt-4o"))] - * ``` - */ -export function Factuality(model: LanguageModel = defaultModel) { - // TODO: remove function wrapper - // eslint-disable-next-line @typescript-eslint/no-shadow - return async function Factuality(opts: { input: string; output: string; expected?: string }) { - const { object } = await generateObject({ - model, - /** - * Prompt implementation from `autoevals`: - * - * {@link https://github.com/braintrustdata/autoevals/blob/5aa20a0a9eb8fc9e07e9e5722ebf71c68d082f32/templates/factuality.yaml} - */ - prompt: ` - You are comparing a submitted answer to an expert answer on a given question. Here is the data: - [BEGIN DATA] - ************ - [Question]: ${opts.input} - ************ - [Expert]: ${opts.expected} - ************ - [Submission]: ${opts.output} - ************ - [END DATA] - Compare the factual content of the submitted answer with the expert answer. Ignore any differences in style, grammar, or punctuation, or overall structure. - The submitted answer may either be a subset or superset of the expert answer, or it may conflict with it. Determine which case applies. Answer the question by selecting one of the following options: - - (A) The submitted answer is a subset of the expert answer and is fully consistent with it. - (B) The submitted answer is a superset of the expert answer and is fully consistent with it. - (C) The submitted answer contains all the same details as the expert answer. - (D) There is a disagreement between the submitted answer and the expert answer. 
- (E) The answers differ, but these differences don't matter from the perspective of factuality.
-      `,
-      schema: z.object({
-        answer: z.enum(['A', 'B', 'C', 'D', 'E']).describe('Your selection.'),
-        rationale: z.string().describe('Why you chose this answer. Be very detailed.'),
-      }),
-    });
-
-    const scores = {
-      A: 0.4,
-      B: 0.6,
-      C: 1,
-      D: 0,
-      E: 1,
-    };
-
-    return {
-      score: scores[object.answer],
-      metadata: {
-        rationale: object.rationale,
-      },
-    };
-  };
-}
-
-export function NoOpTaskRunner() {
-  return async function NoOpTaskRunner(input: string) {
-    // Just return the input as the result, no tool execution
-    return {
-      result: input,
-      toolCalls: [],
-    };
-  };
-}
\ No newline at end of file
diff --git a/packages/mcp/test/evals/utils/runners.ts b/packages/mcp/test/evals/utils/runners.ts
new file mode 100644
index 00000000..ca1d01ea
--- /dev/null
+++ b/packages/mcp/test/evals/utils/runners.ts
@@ -0,0 +1,159 @@
+/*
+ * Copyright 2025, Salesforce, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+import * as path from 'node:path';
+import * as fs from 'node:fs';
+import * as os from 'node:os';
+import { google } from '@ai-sdk/google';
+import { experimental_createMCPClient, generateText, type LanguageModel } from 'ai';
+import { Experimental_StdioMCPTransport } from 'ai/mcp-stdio';
+import { TaskResult } from 'vitest-evals';
+
+function generateSystemPrompt(opts?: runnerOptions['promptOptions']): string {
+  let prompt = `You are an assistant responsible for evaluating the results of calling various tools.
+You are a general-purpose LLM-based agent. Your purpose is to answer the user's query using the tools provided.
+- You should ONLY use the tools available to answer the user's query.
+- Use as few tool calls as possible to get to the answer.
+- Using multiple tool calls to get to the answer is allowed when needed.
+`;
+
+  if (opts) {
+    prompt += `
+
+I am working in a workspace with the following folders:
+- ${opts.currentOpenWorkspace}
+
+
+
+The user's current file is ${opts.currentOpenFile}
+
+`;
+  }
+
+  return prompt;
+}
+
+// Supported models: https://ai.google.dev/gemini-api/docs/models
+const defaultModel = google('gemini-2.5-flash');
+
+type runnerOptions = {
+  model?: LanguageModel;
+  promptOptions?: {
+    currentOpenWorkspace: string;
+    currentOpenFile: string;
+  };
+};
+
+export function TaskRunner(opts: runnerOptions) {
+  return async function TaskRun(input: string): Promise<TaskResult> {
+    const mcpClient = await experimental_createMCPClient({
+      transport: new Experimental_StdioMCPTransport({
+        command: 'node',
+        args: [
+          path.join(import.meta.dirname, '../../../bin/run.js'),
+          '--toolsets',
+          'all',
+          '-o',
+          'DEFAULT_TARGET_ORG',
+          '--no-telemetry',
+          '--allow-non-ga-tools',
+        ],
+        // IMPORTANT:
+        // this is needed because testkit sets it when transferring the hub auth and creating a scratch.
+        // Without it, you get a keychain error or a silent failure: the server looks for orgUsername
+        // in the OS keychain, but testkit modifies the home dir in the process, so all auth lives in the test dir.
+        env: {
+          SF_USE_GENERIC_UNIX_KEYCHAIN: 'true',
+        },
+      }),
+    });
+
+    const tools = await mcpClient.tools();
+
+    const systemPrompt = generateSystemPrompt(opts.promptOptions);
+
+    try {
+      const { text, steps } = await generateText({
+        model: opts.model ?? defaultModel,
+        tools,
+        system: systemPrompt,
+        prompt: input,
+        maxRetries: 1,
+        maxSteps: 10,
+        experimental_telemetry: {
+          isEnabled: false,
+        },
+      });
+
+      if (process.env.SF_MCP_DEBUG_EVALS === 'true') {
+        const tmpDir = os.tmpdir();
+        const tmpFile = path.join(tmpDir, `eval-result-${Date.now()}.json`);
+        const debugData = {
+          input,
+          result: text,
+          toolCalls: steps
+            .flatMap((step) => step.toolCalls)
+            .map((call) => ({
+              name: call.toolName,
+              arguments: call.args,
+            })),
+          systemPrompt,
+          timestamp: new Date().toISOString(),
+        };
+        fs.writeFileSync(tmpFile, JSON.stringify(debugData, null, 2));
+        // eslint-disable-next-line no-console
+        console.warn(`Debug: Result written to ${tmpFile}`);
+      }
+
+      return {
+        result: text,
+        // vitest-evals expects args to be:
+        // ```ts
+        // arguments?: Record<string, unknown>
+        // ```
+        //
+        // but ai-sdk v3/google adapter returns args as:
+        // ```ts
+        // args: unknown;
+        // ```
+        //
+        // revisit if this got fixed after migrating to ai-sdk v5 with the LLGM adapter
+        // @ts-ignore
+        toolCalls: steps
+          .flatMap((step) => step.toolCalls)
+          .map((call) => ({
+            name: call.toolName,
+            arguments: call.args,
+          })),
+      };
+    } catch (error) {
+      // eslint-disable-next-line no-console
+      console.error(error);
+      throw error;
+    } finally {
+      await mcpClient.close();
+    }
+  };
}
+
+export function NoOpTaskRunner() {
+  return async function NoOpTaskRunner(input: string) {
+    // Just return the input as the result, no tool execution
+    return {
+      result: input,
+      toolCalls: [],
+    };
+  };
+}
diff --git a/packages/mcp/test/evals/utils/scorers/factuality.ts b/packages/mcp/test/evals/utils/scorers/factuality.ts
new file mode 100644
index 00000000..622383de
--- /dev/null
+++ b/packages/mcp/test/evals/utils/scorers/factuality.ts
@@ -0,0 +1,68 @@
+import { google } from '@ai-sdk/google';
+import { generateObject, type LanguageModel } from 'ai';
+import { z } from 'zod';
+
+// Supported models: https://ai.google.dev/gemini-api/docs/models
+const defaultModel = google('gemini-2.5-flash');
+
+/**
+ * A Factuality checker utilizing the `ai` SDK based on the implementation in `autoevals`.
+ *
+ * ```
+ * import { openai } from "@ai-sdk/openai";
+ *
+ * scorers: [Factuality(openai("gpt-4o"))]
+ * ```
+ */
+export function Factuality(model: LanguageModel = defaultModel) {
+  // eslint-disable-next-line @typescript-eslint/no-shadow
+  return async function Factuality(opts: { input: string; output: string; expected?: string }) {
+    const { object } = await generateObject({
+      model,
+      /**
+       * Prompt implementation from `autoevals`:
+       *
+       * {@link https://github.com/braintrustdata/autoevals/blob/5aa20a0a9eb8fc9e07e9e5722ebf71c68d082f32/templates/factuality.yaml}
+       */
+      prompt: `
+      You are comparing a submitted answer to an expert answer on a given question. Here is the data:
+      [BEGIN DATA]
+      ************
+      [Question]: ${opts.input}
+      ************
+      [Expert]: ${opts.expected}
+      ************
+      [Submission]: ${opts.output}
+      ************
+      [END DATA]
+      Compare the factual content of the submitted answer with the expert answer. Ignore any differences in style, grammar, or punctuation, or overall structure.
+      The submitted answer may either be a subset or superset of the expert answer, or it may conflict with it. Determine which case applies. Answer the question by selecting one of the following options:
+
+      (A) The submitted answer is a subset of the expert answer and is fully consistent with it.
+      (B) The submitted answer is a superset of the expert answer and is fully consistent with it.
+      (C) The submitted answer contains all the same details as the expert answer.
+      (D) There is a disagreement between the submitted answer and the expert answer.
+      (E) The answers differ, but these differences don't matter from the perspective of factuality.
+      `,
+      schema: z.object({
+        answer: z.enum(['A', 'B', 'C', 'D', 'E']).describe('Your selection.'),
+        rationale: z.string().describe('Why you chose this answer. Be very detailed.'),
+      }),
+    });
+
+    const scores = {
+      A: 0.4,
+      B: 0.6,
+      C: 1,
+      D: 0,
+      E: 1,
+    };
+
+    return {
+      score: scores[object.answer],
+      metadata: {
+        rationale: object.rationale,
+      },
+    };
+  };
+}
diff --git a/packages/mcp/test/utils/toolPredictionScorer.ts b/packages/mcp/test/evals/utils/scorers/toolPredictionScorer.ts
similarity index 89%
rename from packages/mcp/test/utils/toolPredictionScorer.ts
rename to packages/mcp/test/evals/utils/scorers/toolPredictionScorer.ts
index 7cfba2fa..6771b68c 100644
--- a/packages/mcp/test/utils/toolPredictionScorer.ts
+++ b/packages/mcp/test/evals/utils/scorers/toolPredictionScorer.ts
@@ -49,8 +49,6 @@ export function ToolPredictionScorer(model: LanguageModel = defaultModel) {
 
   const expectedTools = opts.expectedTools;
 
-  // Get available tools from the MCP server
-  // TODO(cristian): validate that all expected tools are included here, throw if not.
const AVAILABLE_TOOLS = await getAvailableTools();
 
   // Generate a description of the expected tools for the prompt
@@ -67,11 +65,6 @@ export function ToolPredictionScorer(model: LanguageModel = defaultModel) {
           isEnabled: false,
         },
       });
-      // console.log('*'.repeat(process.stdout.columns))
-      // console.log(AVAILABLE_TOOLS)
-      // console.log('*'.repeat(process.stdout.columns))
-      // console.log(JSON.stringify(object,null,2))
-      // console.log('*'.repeat(process.stdout.columns))
 
       return {
         score: object.score,
@@ -95,7 +88,7 @@ async function getAvailableTools(): Promise<string[]> {
       args: [
         path.join(import.meta.dirname, '../../../mcp/bin/run.js'),
         '--toolsets',
-        'orgs,metadata,testing',
+        'orgs,metadata,testing,data',
         '-o',
         'DEFAULT_TARGET_ORG',
         '--no-telemetry',
@@ -107,18 +100,15 @@ async function getAvailableTools(): Promise<string[]> {
   // Discover available tools
   const toolsMap = await client.tools();
 
-  // TODO(cristian): this should include full tool desc and params
-  // Convert tools to the format expected by the scorer
   cachedTools = Object.entries(toolsMap).map(([name, tool]) => {
     // Extract the first line of description for a concise summary
    const shortDescription = tool.description || '';
-    const params = tool.parameters
+    const params = tool.parameters;
 
    return `${name} - ${shortDescription}\n${JSON.stringify(params)}`;
  });
 
  // Clean up
  await client.close();
-  // console.log(JSON.stringify(cachedTools,null,2));
 
  return cachedTools;
 }
diff --git a/packages/mcp/vitest.config.ts b/packages/mcp/vitest.config.ts
index d2a22168..fb9b6a11 100644
--- a/packages/mcp/vitest.config.ts
+++ b/packages/mcp/vitest.config.ts
@@ -16,8 +16,8 @@ import { defineConfig } from 'vitest/config';
 
 export default defineConfig({
-  test: {
-    include: ['**/*.eval.{js,mjs,cjs,ts,mts,cts,jsx,tsx}'],
-    reporters: ['vitest-evals/reporter'],
-  },
-});
\ No newline at end of file
+  test: {
+    include: ['**/*.eval.{js,mjs,cjs,ts,mts,cts,jsx,tsx}'],
+    reporters: ['vitest-evals/reporter'],
+  },
+});

From 3e53cf7d94fae1a859417712bd50b966e422ce30 Mon Sep 17 00:00:00 2001
From: Cristian Dominguez <6853656+cristiand391@users.noreply.github.com>
Date: Thu, 16 Oct 2025 18:00:46 -0300
Subject: [PATCH 08/13] add TESTING.md [skip ci]

---
 TESTING.md | 208 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 208 insertions(+)
 create mode 100644 TESTING.md

diff --git a/TESTING.md b/TESTING.md
new file mode 100644
index 00000000..e4057d8c
--- /dev/null
+++ b/TESTING.md
@@ -0,0 +1,208 @@
+# Testing MCP tools
+
+This doc covers the different types of tests used to validate MCP tools and ensure they work correctly with LLM agents.
+
+## Types of tests
+
+### E2E Tool Tests
+
+E2E tool tests focus on the tool logic (no LLM inference required) by using an MCP client to call the tool. Write test cases to assert that:
+* invalid param combinations/values return a tool error ([`isError: true` set in the response](https://modelcontextprotocol.io/specification/2025-06-18/server/tools#error-handling))
+* valid user flows run successfully
+* [a specific toolset enables your tools](https://github.com/salesforcecli/mcp/blob/15c13cc8f56cf0360c95989c839bcedd5e67a817/packages/mcp-provider-code-analyzer/test/e2e/run_code_analyzer-e2e.test.ts#L23)
+
+These tests will run on each PR in this repo on Linux and Windows.
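+
+For example, using the client and `toolSchema` shown in the Setup section below, a minimal invalid-params assertion could look like this (the empty query is a hypothetical invalid input; whether and how a tool rejects it is tool-specific):
+
+```typescript
+// Hypothetical invalid-params case: an empty SOQL query should make the tool
+// return an error response (isError: true) rather than throw.
+const badResult = await client.callTool(toolSchema, {
+  name: 'run_soql_query',
+  params: {
+    query: '', // invalid: empty query
+    usernameOrAlias: 'test-org',
+    directory: '/path/to/project'
+  }
+});
+
+expect(badResult.isError).to.equal(true);
+```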
+
+#### Setup
+
+Use the `@salesforce/mcp-test-client` MCP client to start the server and client:
+
+```typescript
+import { McpTestClient, DxMcpTransport } from '@salesforce/mcp-test-client';
+import { z } from 'zod';
+
+const client = new McpTestClient({ timeout: 300_000 });
+
+// Define tool schema
+// NOTE: to avoid duplication you may want to import this from your tool
+const toolSchema = {
+  name: z.literal('run_soql_query'),
+  params: z.object({
+    query: z.string(),
+    usernameOrAlias: z.string(),
+    directory: z.string()
+  })
+};
+
+// Connect with DX transport
+const transport = DxMcpTransport();
+await client.connect(transport);
+
+// Call tool directly
+const result = await client.callTool(toolSchema, {
+  name: 'run_soql_query',
+  params: {
+    query: 'SELECT Name FROM Account LIMIT 8',
+    usernameOrAlias: 'test-org',
+    directory: '/path/to/project'
+  }
+});
+
+expect(result.isError).to.equal(false);
+expect(result.content.length).to.equal(1);
+if (result.content[0].type !== 'text') assert.fail();
+
+const responseText = result.content[0].text;
+expect(responseText).to.contain('SOQL query results:');
+
+// Parse the query result JSON
+const queryMatch = responseText.match(/SOQL query results:\s*({[\s\S]*})/);
+expect(queryMatch).to.not.be.null;
+
+const queryResult = JSON.parse(queryMatch![1]) as QueryResult;
+expect(queryResult.totalSize).to.equal(8);
+expect(queryResult.done).to.be.true;
+expect(queryResult.records).to.be.an('array');
+expect(queryResult.records.length).to.equal(8);
+```
+
+See [packages/mcp-provider-dx-core/test/e2e/run_soql_query.test.ts](./packages/mcp-provider-dx-core/test/e2e/run_soql_query.test.ts) for a complete example.
+
+> [!IMPORTANT]
+> These tests should be located in each tool provider package, not in the main MCP package.
+>
+> `@salesforce/mcp-test-client` is a package inside this monorepo and isn't published. You should add it as a devDep matching its current version to get it into your local provider package:
+>
+> https://github.com/salesforcecli/mcp/tree/main/packages/mcp-test-client
+
+You can use any test runner you want; we recommend Vitest or Mocha.
+
+### Evals
+
+Evaluation tests use LLMs to evaluate test results. We use two types of tests powered by [vitest-evals](https://github.com/getsentry/vitest-evals/).
+
+#### Discoverability
+
+These tests allow you to validate that certain prompts will call your tool with the right parameters.
+Each prompt (`input`) should be accompanied by an expected list of tool calls with their params (`expectedTools`); the `ToolPredictionScorer` scorer will then:
+* Load all DX MCP tools into context (even non-GA) with the test data (input & expected tool calls)
+* Score the expected tool calls based on the MCP tools' metadata.
+
+Unlike other E2E tests, these don't make any tool calls, so they are cheaper to run (each test case makes a single roundtrip, with all DX MCP tool metadata counting toward its token usage).
+
+Example:
+
+```typescript
+import { describeEval } from 'vitest-evals';
+import { NoOpTaskRunner } from '../utils/runners.js';
+import { ToolPredictionScorer } from '../utils/scorers/toolPredictionScorer.js';
+
+describeEval('run_soql_query', {
+  data: async () => [
+    {
+      input: 'List the name of the Property__c records in my org, ordered in ascending order by their name.',
+      expectedTools: [
+        {
+          name: 'run_soql_query',
+          arguments: {
+            query: 'SELECT Name FROM Property__c ORDER BY Name ASC',
+            usernameOrAlias: 'ebikes',
+            directory: process.env.SF_EVAL_PROMPT_PROJECT_DIR,
+          },
+        },
+      ],
+    },
+  ],
+  task: NoOpTaskRunner(),
+  scorers: [ToolPredictionScorer()],
+  threshold: 1.0,
+  timeout: 30_000,
+});
+```
+
+See [packages/mcp/test/evals/discoverability/run_soql_query.eval.ts](./packages/mcp/test/evals/discoverability/run_soql_query.eval.ts) for a complete example.
+
+
+#### E2E Evals
+
+These tests intend to cover a real scenario by running each test case in an agent loop with all DX MCP tools exposed. The agent will stop once the task is finished (or if it can't continue).
+
+Use the Factuality scorer to evaluate the agent response (`response should include X records`, `it should list all tests executed`, etc.).
+You can also use vitest-evals' `ToolCallScorer` to evaluate that tools were called correctly.
+
+```typescript
+import { describeEval, ToolCallScorer } from 'vitest-evals';
+import { TaskRunner } from '../utils/runners.js';
+import { Factuality } from '../utils/scorers/factuality.js';
+
+describeEval('SOQL queries', {
+  data: async () => [
+    {
+      input: 'List the name of the Property__c records in my org, ordered in ascending order by their name.',
+      expected: 'The response should include these records: Architectural Details, City Living...',
+      expectedTools: [
+        {
+          name: 'run_soql_query',
+          arguments: {
+            query: 'SELECT Name FROM Property__c ORDER BY Name ASC',
+            usernameOrAlias: orgUsername,
+            directory: projectDir,
+          },
+        },
+      ],
+    },
+  ],
+  task: (input: string) =>
+    TaskRunner({
+      promptOptions: {
+        currentOpenFile: '',
+        currentOpenWorkspace: projectDir,
+      },
+    })(input),
+  scorers: [
+    Factuality(),
+    ToolCallScorer({
+      ordered: true,
+      params: 'fuzzy',
+    }),
+  ],
+  threshold: 0.8,
+  timeout: 300_000,
+});
+```
+
+> [!TIP]
+> If you need to set up an SFDX project with a scratch org, use the `cli-plugins-testkit` library:
+>
+> https://github.com/salesforcecli/cli-plugins-testkit
+
+
+See [packages/mcp/test/evals/e2e/run_soql_query.eval.ts](./packages/mcp/test/evals/e2e/run_soql_query.eval.ts) for a complete example.
From 2cfddacf69ac0d57816ba35efbcb4794e9faff3c Mon Sep 17 00:00:00 2001 From: Cristian Dominguez Date: Thu, 16 Oct 2025 18:03:29 -0300 Subject: [PATCH 09/13] chore: use mcp binf --- .../mcp/test/evals/utils/scorers/toolPredictionScorer.ts | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/packages/mcp/test/evals/utils/scorers/toolPredictionScorer.ts b/packages/mcp/test/evals/utils/scorers/toolPredictionScorer.ts index 6771b68c..545c1c1a 100644 --- a/packages/mcp/test/evals/utils/scorers/toolPredictionScorer.ts +++ b/packages/mcp/test/evals/utils/scorers/toolPredictionScorer.ts @@ -1,7 +1,6 @@ import { experimental_createMCPClient, generateObject, type LanguageModel } from 'ai'; import { Experimental_StdioMCPTransport } from 'ai/mcp-stdio'; import { google } from '@ai-sdk/google'; -import * as path from 'node:path'; import { z } from 'zod'; // Supported models: https://ai.google.dev/gemini-api/docs/models @@ -84,9 +83,9 @@ async function getAvailableTools(): Promise { const client = await experimental_createMCPClient({ transport: new Experimental_StdioMCPTransport({ - command: 'node', + // when executed via yarn, `sf-mcp-server` points to `packages/mcp/bin/run.js` + command: 'sf-mcp-server', args: [ - path.join(import.meta.dirname, '../../../mcp/bin/run.js'), '--toolsets', 'orgs,metadata,testing,data', '-o', From 0a4502ca80c60b1e2b23e3a7761f51ca75c8b08e Mon Sep 17 00:00:00 2001 From: Cristian Dominguez Date: Mon, 20 Oct 2025 14:15:35 -0300 Subject: [PATCH 10/13] chore: update workflow + updates --- .github/workflows/eval-e2e.yml | 33 -------------- .github/workflows/eval.yml | 44 +++++++++++++++++++ packages/mcp/package.json | 20 ++++++--- .../discoverability/run_soql_query.eval.ts | 7 +++ .../evals/discoverability/vitest.config.ts | 27 ++++++++++++ .../test/evals/e2e/run_code_analyzer.eval.ts | 2 +- packages/mcp/test/evals/e2e/vitest.config.ts | 24 ++++++++++ packages/mcp/test/evals/utils/runners.ts | 6 +-- 8 files changed, 121 insertions(+), 42 deletions(-) delete mode 100644 .github/workflows/eval-e2e.yml create mode 100644 .github/workflows/eval.yml create mode 100644 packages/mcp/test/evals/discoverability/vitest.config.ts create mode 100644 packages/mcp/test/evals/e2e/vitest.config.ts diff --git a/.github/workflows/eval-e2e.yml b/.github/workflows/eval-e2e.yml deleted file mode 100644 index 823f4a72..00000000 --- a/.github/workflows/eval-e2e.yml +++ /dev/null @@ -1,33 +0,0 @@ -on: - workflow_call: - inputs: - os: - required: false - description: "runs-on property, ex: ubuntu-latest, windows-latest" - type: string - default: "ubuntu-latest" - -jobs: - eval: - name: 'yarn:eval' - runs-on: ${{ input.os }} - steps: - - name: Configure git longpaths if on Windows - if: ${{ runner.os == 'Windows' }} - run: git config --system core.longpaths true - - uses: actions/checkout@v4 - - uses: actions/setup-node@v4 - with: - node-version: lts/* - cache: yarn - - uses: actions/setup-java@v4 - with: - distribution: 'temurin' - java-version: '11' - - run: yarn - - run: yarn build - - name: Eval-based E2E tests - shell: bash - run: | - cd packages/mcp - yarn:eval diff --git a/.github/workflows/eval.yml b/.github/workflows/eval.yml new file mode 100644 index 00000000..a189916c --- /dev/null +++ b/.github/workflows/eval.yml @@ -0,0 +1,44 @@ +on: + workflow_dispatch: + +jobs: + eval-discoverability: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - uses: actions/setup-node@v4 + with: + node-version: lts/* + cache: yarn + - uses: actions/setup-java@v4 + with: + 
distribution: 'temurin' + java-version: '11' + - run: yarn + - run: yarn build + - name: Eval-based discoverability tests + run: | + cd packages/mcp + test:eval:discoverability + env: + GOOGLE_GENERATIVE_AI_API_KEY: ${{ secrets.GOOGLE_GENERATIVE_AI_API_KEY }} + eval-e2e: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - uses: actions/setup-node@v4 + with: + node-version: lts/* + cache: yarn + - uses: actions/setup-java@v4 + with: + distribution: 'temurin' + java-version: '11' + - run: yarn + - run: yarn build + - name: Eval-based E2E tests + run: | + cd packages/mcp + test:eval:e2e + env: + GOOGLE_GENERATIVE_AI_API_KEY: ${{ secrets.GOOGLE_GENERATIVE_AI_API_KEY }} diff --git a/packages/mcp/package.json b/packages/mcp/package.json index a1ae010a..79d823f8 100644 --- a/packages/mcp/package.json +++ b/packages/mcp/package.json @@ -18,8 +18,8 @@ "lint": "wireit", "start": "yarn build && npm link && mcp-inspector sf-mcp-server", "test": "wireit", - "test:eval": "wireit", - "test:only": "wireit" + "test:eval:discoverability": "wireit", + "test:eval:e2e": "wireit" }, "repository": "salesforcecli/mcp", "bugs": { @@ -130,10 +130,20 @@ ], "output": [] }, - "test:eval": { - "command": "vitest run", + "test:eval:discoverability": { + "command": "vitest --config test/evals/discoverability/vitest.config.ts", "files": [ - "test/**/*.eval.ts" + "test/evals/discoverability/**/*.eval.ts" + ], + "dependencies": [ + "test:compile" + ], + "output": [] + }, + "test:eval:e2e": { + "command": "vitest --config test/evals/e2e/vitest.config.ts", + "files": [ + "test/evals/e2e/**/*.eval.ts" ], "dependencies": [ "test:compile" diff --git a/packages/mcp/test/evals/discoverability/run_soql_query.eval.ts b/packages/mcp/test/evals/discoverability/run_soql_query.eval.ts index b735bce1..12190a6e 100644 --- a/packages/mcp/test/evals/discoverability/run_soql_query.eval.ts +++ b/packages/mcp/test/evals/discoverability/run_soql_query.eval.ts @@ -22,6 +22,13 @@ describeEval('run_soql_query', { { input: 'List the name of the Property__c records in my org, ordered in ascending order by their name.', expectedTools: [ + { + name: 'get_username', + arguments: { + directory: process.env.SF_EVAL_PROMPT_PROJECT_DIR, + defaultTargetOrg: true, + }, + }, { name: 'run_soql_query', arguments: { diff --git a/packages/mcp/test/evals/discoverability/vitest.config.ts b/packages/mcp/test/evals/discoverability/vitest.config.ts new file mode 100644 index 00000000..2605bddc --- /dev/null +++ b/packages/mcp/test/evals/discoverability/vitest.config.ts @@ -0,0 +1,27 @@ +/* + * Copyright 2025, Salesforce, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +import { defineConfig } from 'vitest/config'; + +export default defineConfig({ + test: { + include: ['test/evals/discoverability/*.eval.{ts,mts}'], + reporters: ['vitest-evals/reporter'], + env: { + SF_EVAL_PROMPT_PROJECT_DIR: '/Users/codey/projects/dreamhouse-lwc', + SF_EVAL_PROMPT_OPEN_FILEPATH: '/Users/codey/projects/dreamhouse-lwc/force-app/main/default/classes/GeocodingService.cls' + } + }, +}); diff --git a/packages/mcp/test/evals/e2e/run_code_analyzer.eval.ts b/packages/mcp/test/evals/e2e/run_code_analyzer.eval.ts index 55dd3c0b..9a55509b 100644 --- a/packages/mcp/test/evals/e2e/run_code_analyzer.eval.ts +++ b/packages/mcp/test/evals/e2e/run_code_analyzer.eval.ts @@ -22,6 +22,6 @@ describeEval('run_code_analyzer', { return { score }; }, ], - threshold: 0.9, + threshold: 1.0, timeout: 60_000, }); diff --git a/packages/mcp/test/evals/e2e/vitest.config.ts b/packages/mcp/test/evals/e2e/vitest.config.ts new file mode 100644 index 00000000..cd471465 --- /dev/null +++ b/packages/mcp/test/evals/e2e/vitest.config.ts @@ -0,0 +1,24 @@ +/* + * Copyright 2025, Salesforce, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +import { defineConfig } from 'vitest/config'; + +export default defineConfig({ + test: { + include: ['test/evals/e2e/*.eval.{ts,mts}'], + reporters: ['vitest-evals/reporter'], + testTimeout: 600_000 + }, +}); diff --git a/packages/mcp/test/evals/utils/runners.ts b/packages/mcp/test/evals/utils/runners.ts index ca1d01ea..66a83eca 100644 --- a/packages/mcp/test/evals/utils/runners.ts +++ b/packages/mcp/test/evals/utils/runners.ts @@ -56,7 +56,7 @@ type runnerOptions = { }; }; -export function TaskRunner(opts: runnerOptions) { +export function TaskRunner(opts?: runnerOptions) { return async function TaskRun(input: string): Promise { const mcpClient = await experimental_createMCPClient({ transport: new Experimental_StdioMCPTransport({ @@ -82,11 +82,11 @@ export function TaskRunner(opts: runnerOptions) { const tools = await mcpClient.tools(); - const systemPrompt = generateSystemPrompt(opts.promptOptions); + const systemPrompt = generateSystemPrompt(opts?.promptOptions); try { const { text, steps } = await generateText({ - model: opts.model ?? defaultModel, + model: opts?.model ?? defaultModel, tools, system: systemPrompt, prompt: input, From ec6b95a852b31e199695a07fdb0c5f7e9d9c1c01 Mon Sep 17 00:00:00 2001 From: Cristian Dominguez <6853656+cristiand391@users.noreply.github.com> Date: Mon, 20 Oct 2025 16:07:40 -0300 Subject: [PATCH 11/13] update TESTING.md [skip ci] --- TESTING.md | 48 +++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 47 insertions(+), 1 deletion(-) diff --git a/TESTING.md b/TESTING.md index e4057d8c..0c983f1e 100644 --- a/TESTING.md +++ b/TESTING.md @@ -179,7 +179,6 @@ describeEval('SOQL queries', { See [packages/mcp/test/evals/e2e/run_soql_query.eval.ts](./packages/mcp/test/evals/e2e/run_soql_query.eval.ts) for a complete example. 
-
 #### Setting Prompt Options
 
 Configure the task runner with prompt settings:
@@ -206,3 +205,50 @@ Both eval types use scoring mechanisms:
 - `params: 'fuzzy'` - Allow parameter variations (useful for queries)
 
 Set thresholds (0.0-1.0) to define passing scores. For discoverability tests, use `threshold: 1.0` for exact matches. For E2E tests, use lower thresholds like `threshold: 0.8` to account for response variations.
+
+### What tests should I write for my tools?
+
+This will vary depending on the complexity of your tool; the recommended guidance is:
+
+1. E2E tool tests
+Use them to cover the tool logic extensively, focusing on:
+* Cross-OS compatibility (Linux and Windows)
+* Blocking invalid combinations of parameters
+* [Validating the tool response properly sets `isError` on errors](https://modelcontextprotocol.io/specification/2025-06-18/schema#calltoolresult)
+
+Even if your tool is mostly "instructions-only" (it returns instructions for LLM agents), it's good to have coverage since these tests make a real tool call, ensuring the tool is always callable.
+
+2. Discoverability Eval tests
+All tools should have at least one test covering the most common utterances you expect users to type that will call your tool.
+These tests evaluate all tools in the server, so this coverage protects your tools from being "shadowed" if a new tool with a similar description is added.
+
+3. E2E Evals
+These should only cover the most common user flows (they are more expensive to run). Tips:
+* The user utterance should cover a user flow involving at least 2 tool calls
+* Use the `Factuality` scorer to validate the final agent response (`input: run the Geocoding apex test` -> `output: successfully ran the 3 Geocoding apex tests ...`)
+* Use the `ToolCallScorer` scorer to assert the chain of tools (and their params) was called in the correct order
+
+Unsure if your tool should have an E2E eval test?
+* If it's an "instructions-only" tool, discoverability eval tests may be enough since you mostly care about the tool being called (after that, the agent might call built-in tools/other CLI commands)
+* Is your tool response critical when combined with other tools in a user task? If not, then discoverability + E2E tool tests might be enough.
+
+Good scenarios for E2E eval tests:
+
+* The `run_soql_query` tool returns raw SOQL results and we want to evaluate that the agent response contains the expected records.
+* `get_username` is used to resolve org usernames; validating that a user utterance like:
+```
+Deploy this file and run the GeocodingServiceTest tests, then summarize the apex test results.
+```
+calls `get_username` -> `deploy_metadata`
+
+Bad scenarios:
+
+* "instructions-only" tools
+* CRUD tools whose responses aren't required for an agent task (e.g. `create_scratch_org` output doesn't matter as long as it's successful; that can be covered in a simple E2E tool test)
+
+### FAQ
+
+* Which LLM models are we testing against?
+
+Only `gemini-2.5-flash`, using the Gemini API.
+We plan to support more models once we switch to Salesforce's LLMG.
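+
+* How do I run the evals locally?
+
+A minimal sketch, assuming you have a Gemini API key exported as `GOOGLE_GENERATIVE_AI_API_KEY` (the same variable the CI workflows read):
+
+```sh
+export GOOGLE_GENERATIVE_AI_API_KEY=<your-key>
+cd packages/mcp
+yarn test:eval:discoverability # or: yarn test:eval:e2e
+```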
From 4a4db8c638919c1f527ef46cb5d7717deed657b0 Mon Sep 17 00:00:00 2001 From: Cristian Dominguez Date: Mon, 20 Oct 2025 16:20:59 -0300 Subject: [PATCH 12/13] chore: remove unused files [skip ci] --- .../evals/describe_code_analyzer_rule.eval.ts | 29 ---- .../mcp/test/evals/run_code_analyzer.eval.ts | 39 ----- packages/mcp/test/evals/sf-query-org.eval.ts | 52 ------ packages/mcp/test/evals/utils.ts | 155 ------------------ 4 files changed, 275 deletions(-) delete mode 100644 packages/mcp/test/evals/describe_code_analyzer_rule.eval.ts delete mode 100644 packages/mcp/test/evals/run_code_analyzer.eval.ts delete mode 100644 packages/mcp/test/evals/sf-query-org.eval.ts delete mode 100644 packages/mcp/test/evals/utils.ts diff --git a/packages/mcp/test/evals/describe_code_analyzer_rule.eval.ts b/packages/mcp/test/evals/describe_code_analyzer_rule.eval.ts deleted file mode 100644 index e7f9f33f..00000000 --- a/packages/mcp/test/evals/describe_code_analyzer_rule.eval.ts +++ /dev/null @@ -1,29 +0,0 @@ -/* - * Copyright 2025, Salesforce, Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import { describeEval } from 'vitest-evals'; -import { TaskRunner, outputIncludesExpectationArray } from './utils.js'; - -describeEval('describe_code_analyzer_rule', { - data: async () => [{ - input: 'tell me the tags that are associated with the Code Analysis Rule named VFUnescapeEl, which is a rule for the pmd engine', - expected: ['Recommended', 'Security', 'Visualforce'] - }], - task: TaskRunner(), - scorers: [outputIncludesExpectationArray], - threshold: 0.9, - timeout: 60_000 -}); \ No newline at end of file diff --git a/packages/mcp/test/evals/run_code_analyzer.eval.ts b/packages/mcp/test/evals/run_code_analyzer.eval.ts deleted file mode 100644 index bb04c375..00000000 --- a/packages/mcp/test/evals/run_code_analyzer.eval.ts +++ /dev/null @@ -1,39 +0,0 @@ -/* - * Copyright 2025, Salesforce, Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -import path from 'node:path'; -import {fileURLToPath} from 'node:url'; -import { describeEval } from 'vitest-evals'; -import { TaskRunner } from './utils.js'; - -const fileName = fileURLToPath(import.meta.url); -const dirName = path.dirname(fileName); - -const pathToTarget: string = path.join(dirName, '..', 'fixtures', 'sample-targets', 'SampleTarget1.cls'); - -describeEval('run_code_analyzer', { - data: async () => [{ - input: `Run code analysis against ${pathToTarget}, and tell me the number of violations in that file using the response format "There are X violations".`, - expected: [6] - }], - task: TaskRunner(), - scorers: [(opts: {output: string; expected: number}) => { - const score: number = opts.output === `There are ${opts.expected} violations.` ? 1 : 0; - return {score}; - }], - threshold: 0.9, - timeout: 60_000 -}); \ No newline at end of file diff --git a/packages/mcp/test/evals/sf-query-org.eval.ts b/packages/mcp/test/evals/sf-query-org.eval.ts deleted file mode 100644 index ffc8e263..00000000 --- a/packages/mcp/test/evals/sf-query-org.eval.ts +++ /dev/null @@ -1,52 +0,0 @@ -/* - * Copyright 2025, Salesforce, Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -import { describeEval } from 'vitest-evals'; -import { Factuality, TaskRunner } from './utils.js'; - -describeEval('SOQL queries', { - data: async () => [ - { - input: 'List the name of the Property__c records in my org, ordered in ascending order by their name.', - expected: `The response should include these records: -Architectural Details -City Living -Contemporary City Living -Contemporary Luxury -Heart of Harvard Square -Modern City Living -Quiet Retreat -Seaport District Retreat -Stunning Colonial -Stunning Victorian -Ultimate Sophistication -Waterfront in the City -`, - // expected: `The response should include these records: - // Sophisticated Urban Escape - // Metropolitan Elegance - // Vibrant City Sanctuary - // Downtown Dreamscape - // Sleek Urban Oasis - // Modern Metropole - // Luxe in the Loop - // `, - }, - ], - task: TaskRunner(), - scorers: [Factuality()], - threshold: 0.6, - timeout: 30_000, -}); \ No newline at end of file diff --git a/packages/mcp/test/evals/utils.ts b/packages/mcp/test/evals/utils.ts deleted file mode 100644 index ea8ee97a..00000000 --- a/packages/mcp/test/evals/utils.ts +++ /dev/null @@ -1,155 +0,0 @@ -/* - * Copyright 2025, Salesforce, Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -import * as path from 'node:path'; -import { google } from '@ai-sdk/google'; -import { experimental_createMCPClient, generateObject, streamText, type LanguageModel } from 'ai'; -import { Experimental_StdioMCPTransport } from 'ai/mcp-stdio'; -import { z } from 'zod'; - -// This prompt intends to represent what an IDE context window could look like, some specifics: -// -// * Current open project directory -// * Current open file -const SYSTEM_PROMPT = `You are an assistant responsible for evaluating the results of calling various tools. -You a general purpose LLM-based Agent. Your purpose is to answer the user's query using the tools provided. -- You should ONLY use the tools available to answer the user's query. -- Use as few tool calls as possible to get to the answer. -- Using multiple tool calls to get to the answer is allowed when needed. -The current open project dir is "${process.env.SF_EVAL_PROMPT_PROJECT_DIR}" -`; - -// Supported models: https://ai.google.dev/gemini-api/docs/models -const defaultModel = google('gemini-2.5-flash'); - -export function TaskRunner(model: LanguageModel = defaultModel) { - return async function TaskRun(input: string) { - const mcpClient = await experimental_createMCPClient({ - transport: new Experimental_StdioMCPTransport({ - command: 'node', - args: [path.join(import.meta.dirname, '../../bin/run.js'), '--toolsets', 'all', '-o', 'DEFAULT_TARGET_ORG', '--no-telemetry', '--allow-non-ga-tools'] - }), - }); - - const tools = await mcpClient.tools(); - - try { - const result = streamText({ - model, - tools, - system: SYSTEM_PROMPT, - prompt: input, - maxRetries: 1, - maxSteps: 10, - experimental_telemetry: { - isEnabled: false, - }, - onError: (error) => { - // eslint-disable-next-line no-console - console.error(error); - }, - }); - - // TODO: we don't need text streaming here, maybe switch to `generateText`? - // eslint-disable-next-line - for await (const _ of result.fullStream) { - } - - return await result.text; - } catch (error) { - // eslint-disable-next-line no-console - console.error(error); - throw error; - } finally { - await mcpClient.close(); - } - }; -} - -export function outputIncludesExpectationArray(opts: {input: string; output: string; expected: string[]}) { - let score: number = 0; - const increment: number = 1/opts.expected.length; - for (const expected of opts.expected) { - if (opts.output.toLowerCase().includes(expected.toLowerCase())) { - score += increment; - } - } - return { - score - } -} - -/** - * A Factuality checker utilizing the `ai` SDK based on the implementation in `autoevals`. - * - * ``` - * import { openai } from "@ai-sdk/openai"; - * - * scorers: [Factuality(openai("gpt-4o"))] - * ``` - */ -export function Factuality(model: LanguageModel = defaultModel) { - // TODO: remove function wrapper - // eslint-disable-next-line @typescript-eslint/no-shadow - return async function Factuality(opts: { input: string; output: string; expected?: string }) { - const { object } = await generateObject({ - model, - /** - * Prompt implementation from `autoevals`: - * - * {@link https://github.com/braintrustdata/autoevals/blob/5aa20a0a9eb8fc9e07e9e5722ebf71c68d082f32/templates/factuality.yaml} - */ - prompt: ` - You are comparing a submitted answer to an expert answer on a given question. 
Here is the data: - [BEGIN DATA] - ************ - [Question]: ${opts.input} - ************ - [Expert]: ${opts.expected} - ************ - [Submission]: ${opts.output} - ************ - [END DATA] - Compare the factual content of the submitted answer with the expert answer. Ignore any differences in style, grammar, or punctuation, or overall structure. - The submitted answer may either be a subset or superset of the expert answer, or it may conflict with it. Determine which case applies. Answer the question by selecting one of the following options: - - (A) The submitted answer is a subset of the expert answer and is fully consistent with it. - (B) The submitted answer is a superset of the expert answer and is fully consistent with it. - (C) The submitted answer contains all the same details as the expert answer. - (D) There is a disagreement between the submitted answer and the expert answer. - (E) The answers differ, but these differences don't matter from the perspective of factuality. - `, - schema: z.object({ - answer: z.enum(['A', 'B', 'C', 'D', 'E']).describe('Your selection.'), - rationale: z.string().describe('Why you chose this answer. Be very detailed.'), - }), - }); - - const scores = { - A: 0.4, - B: 0.6, - C: 1, - D: 0, - E: 1, - }; - - return { - score: scores[object.answer], - metadata: { - rationale: object.rationale, - }, - }; - }; -} \ No newline at end of file From 7230418533ceff15a579eace27f27210d38b0e8a Mon Sep 17 00:00:00 2001 From: Cristian Dominguez Date: Mon, 27 Oct 2025 16:50:55 -0300 Subject: [PATCH 13/13] chore: update CI workflow + tool pred. scorer loads all tools [skip ci] --- .github/workflows/eval.yml | 5 +++-- .../mcp/test/evals/utils/scorers/toolPredictionScorer.ts | 2 +- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/.github/workflows/eval.yml b/.github/workflows/eval.yml index a189916c..96200133 100644 --- a/.github/workflows/eval.yml +++ b/.github/workflows/eval.yml @@ -19,10 +19,11 @@ jobs: - name: Eval-based discoverability tests run: | cd packages/mcp - test:eval:discoverability + yarn test:eval:discoverability env: GOOGLE_GENERATIVE_AI_API_KEY: ${{ secrets.GOOGLE_GENERATIVE_AI_API_KEY }} eval-e2e: + needs: [eval-discoverability] runs-on: ubuntu-latest steps: - uses: actions/checkout@v4 @@ -39,6 +40,6 @@ jobs: - name: Eval-based E2E tests run: | cd packages/mcp - test:eval:e2e + yarn test:eval:e2e env: GOOGLE_GENERATIVE_AI_API_KEY: ${{ secrets.GOOGLE_GENERATIVE_AI_API_KEY }} diff --git a/packages/mcp/test/evals/utils/scorers/toolPredictionScorer.ts b/packages/mcp/test/evals/utils/scorers/toolPredictionScorer.ts index 545c1c1a..4ec76d2c 100644 --- a/packages/mcp/test/evals/utils/scorers/toolPredictionScorer.ts +++ b/packages/mcp/test/evals/utils/scorers/toolPredictionScorer.ts @@ -87,7 +87,7 @@ async function getAvailableTools(): Promise { command: 'sf-mcp-server', args: [ '--toolsets', - 'orgs,metadata,testing,data', + 'all', '-o', 'DEFAULT_TARGET_ORG', '--no-telemetry',