From cc4eba1c12705ac93bb20a1fc1efde5c464e6813 Mon Sep 17 00:00:00 2001
From: Mike Donnalley <mdonnalley@salesforce.com>
Date: Tue, 10 Jun 2025 14:30:02 -0600
Subject: [PATCH 01/51] feat: add telemetry

---
 package.json                                |   1 +
 src/index.ts                                |  58 ++-
 src/sf-mcp-server.ts                        | 131 +++++++
 src/telemetry.ts                            | 158 +++++++++
 src/tools/core/sf-get-username.ts           |   4 +-
 src/tools/data/sf-query-org.ts              |   4 +-
 src/tools/metadata/sf-deploy-metadata.ts    |   4 +-
 src/tools/metadata/sf-retrieve-metadata.ts  |   4 +-
 src/tools/orgs/sf-list-all-orgs.ts          |   4 +
 src/tools/users/sf-assign-permission-set.ts |   4 +-
 yarn.lock                                   | 375 ++++++++++++++++++--
 11 files changed, 695 insertions(+), 52 deletions(-)
 create mode 100644 src/sf-mcp-server.ts
 create mode 100644 src/telemetry.ts
diff --git a/package.json b/package.json
index e452aadc..08aafed5 100644
--- a/package.json
+++ b/package.json
@@ -46,6 +46,7 @@
     "@salesforce/kit": "^3.1.6",
     "@salesforce/source-deploy-retrieve": "^12.19.7",
     "@salesforce/source-tracking": "^7.4.1",
+    "@salesforce/telemetry": "^6.0.39",
     "@salesforce/ts-types": "^2.0.11",
     "zod": "^3.25.42"
   },
diff --git a/src/index.ts b/src/index.ts
index 7d4426f3..fac5769c 100644
--- a/src/index.ts
+++ b/src/index.ts
@@ -16,7 +16,6 @@
 
 /* eslint-disable no-console */
 
-import { McpServer } from '@modelcontextprotocol/sdk/server/mcp.js';
 import { StdioServerTransport } from '@modelcontextprotocol/sdk/server/stdio.js';
 import { Command, Flags, ux } from '@oclif/core';
 import * as core from './tools/core/index.js';
@@ -25,9 +24,30 @@ import * as data from './tools/data/index.js';
 import * as users from './tools/users/index.js';
 import * as metadata from './tools/metadata/index.js';
 import Cache from './shared/cache.js';
+import { Telemetry } from './telemetry.js';
+import { SfMcpServer } from './sf-mcp-server.js';
 
 const TOOLSETS = ['all', 'orgs', 'data', 'users', 'metadata'] as const;
 
+/**
+ * Sanitizes an array of org usernames by replacing specific orgs with a placeholder.
+ * Special values (DEFAULT_TARGET_ORG, DEFAULT_TARGET_DEV_HUB, ALLOW_ALL_ORGS) are preserved.
+ *
+ * @param {string[]} input - Array of org identifiers to sanitize
+ * @returns {string} Comma-separated string of sanitized org identifiers
+ */
+function sanitizeOrgInput(input: string[]): string {
+  // Keywords that identify no particular org and can therefore be reported verbatim.
+  const reportable = ['DEFAULT_TARGET_ORG', 'DEFAULT_TARGET_DEV_HUB', 'ALLOW_ALL_ORGS'];
+  const sanitized: string[] = [];
+
+  for (const entry of input) {
+    // Every other value is a concrete org identifier, so it is swapped for the placeholder.
+    sanitized.push(reportable.includes(entry) ? entry : 'SANITIZED_ORG');
+  }
+  return sanitized.join(', ');
+}
+
 export default class McpServerCommand extends Command {
   public static summary = 'Start the Salesforce MCP server';
   public static description = `This command starts the Model Context Protocol (MCP) server for Salesforce, allowing access to various tools and orgs.
@@ -50,7 +70,7 @@ You can also use special values to control access to orgs:
       delimiter: ',',
       parse: async (input: string) => {
         if (input === 'ALLOW_ALL_ORGS') {
-          ux.warn('WARNING: ALLOW_ALL_ORGS is set. This allows access to all authenticated orgs. Use with caution.');
+          ux.warn('ALLOW_ALL_ORGS is set. This allows access to all authenticated orgs. Use with caution.');
         }
 
         if (
@@ -76,6 +96,9 @@ You can also use special values to control access to orgs:
       default: ['all'],
     })(),
     version: Flags.version(),
+    'no-telemetry': Flags.boolean({
+      summary: 'Disable telemetry',
+    }),
   };
 
   public static examples = [
@@ -93,18 +116,35 @@ You can also use special values to control access to orgs:
     },
   ];
 
+  private telemetry?: Telemetry;
+
   public async run(): Promise<void> {
     const { flags } = await this.parse(McpServerCommand);
+    if (!flags['no-telemetry']) {
+      this.telemetry = new Telemetry(this.config);
+      await this.telemetry.start({
+        toolsets: flags.toolsets.join(', '),
+        orgs: sanitizeOrgInput(flags.orgs),
+      });
+
+      process.stdin.on('close', () => {
+        this.telemetry?.stop();
+      });
+    }
+
     Cache.getInstance().set('allowedOrgs', new Set(flags.orgs));
     this.logToStderr(`Allowed orgs:\n${flags.orgs.map((org) => `- ${org}`).join('\n')}`);
-    const server = new McpServer({
-      name: 'sf-mcp-server',
-      version: this.config.version,
-      capabilities: {
-        resources: {},
-        tools: {},
+    const server = new SfMcpServer(
+      {
+        name: 'sf-mcp-server',
+        version: this.config.version,
+        capabilities: {
+          resources: {},
+          tools: {},
+        },
       },
-    });
+      { telemetry: this.telemetry }
+    );
 
     // // TODO: Should we add annotations to our tools? https://modelcontextprotocol.io/docs/concepts/tools#tool-definition-structure
     // // TODO: Move tool names into a shared file, that way if we reference them in multiple places, we can update them in one place
diff --git a/src/sf-mcp-server.ts b/src/sf-mcp-server.ts
new file mode 100644
index 00000000..6c29379c
--- /dev/null
+++ b/src/sf-mcp-server.ts
@@ -0,0 +1,131 @@
+/*
+ * Copyright 2025, Salesforce, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import { McpServer, RegisteredTool, ToolCallback } from '@modelcontextprotocol/sdk/server/mcp.js';
+import {
+  CallToolResult,
+  Implementation,
+  ServerNotification,
+  ServerRequest,
+  ToolAnnotations,
+} from '@modelcontextprotocol/sdk/types.js';
+import { ServerOptions } from '@modelcontextprotocol/sdk/server/index.js';
+import { RequestHandlerExtra } from '@modelcontextprotocol/sdk/shared/protocol.js';
+import { ZodRawShape } from 'zod';
+import { Telemetry } from './telemetry.js';
+
+/**
+ * A server implementation that extends the base MCP server with telemetry capabilities.
+ *
+ * @extends {McpServer}
+ */
+export class SfMcpServer extends McpServer {
+  /** Optional telemetry instance for tracking server events */
+  private telemetry?: Telemetry;
+
+  /**
+   * Creates a new SfMcpServer instance
+   *
+   * @param {Implementation} serverInfo - The server implementation details
+   * @param {ServerOptions & { telemetry?: Telemetry }} [options] - Optional server configuration including telemetry
+   */
+  public constructor(serverInfo: Implementation, options?: ServerOptions & { telemetry?: Telemetry }) {
+    super(serverInfo, options);
+    this.telemetry = options?.telemetry;
+  }
+
+  /**
+   * Registers a zero-argument tool `name`, which will run the given function when the client calls it.
+   */
+  public tool(name: string, cb: ToolCallback): RegisteredTool;
+  /**
+   * Registers a zero-argument tool `name` (with a description) which will run the given function when the client calls it.
+   */
+  public tool(name: string, description: string, cb: ToolCallback): RegisteredTool;
+  /**
+   * Registers a tool taking either a parameter schema for validation or annotations for additional metadata.
+   * This unified overload handles both `tool(name, paramsSchema, cb)` and `tool(name, annotations, cb)` cases.
+   *
+   * Note: We use a union type for the second parameter because TypeScript cannot reliably disambiguate
+   * between ToolAnnotations and ZodRawShape during overload resolution, as both are plain object types.
+   */
+  public tool<Args extends ZodRawShape>(
+    name: string,
+    paramsSchemaOrAnnotations: Args | ToolAnnotations,
+    cb: ToolCallback<Args>
+  ): RegisteredTool;
+  /**
+   * Registers a tool `name` (with a description) taking either parameter schema or annotations.
+   * This unified overload handles both `tool(name, description, paramsSchema, cb)` and
+   * `tool(name, description, annotations, cb)` cases.
+   *
+   * Note: We use a union type for the third parameter because TypeScript cannot reliably disambiguate
+   * between ToolAnnotations and ZodRawShape during overload resolution, as both are plain object types.
+   */
+  public tool<Args extends ZodRawShape>(
+    name: string,
+    description: string,
+    paramsSchemaOrAnnotations: Args | ToolAnnotations,
+    cb: ToolCallback<Args>
+  ): RegisteredTool;
+  /**
+   * Registers a tool with both parameter schema and annotations.
+   */
+  public tool<Args extends ZodRawShape>(
+    name: string,
+    paramsSchema: Args,
+    annotations: ToolAnnotations,
+    cb: ToolCallback<Args>
+  ): RegisteredTool;
+  /**
+   * Registers a tool with description, parameter schema, and annotations.
+   */
+  public tool<Args extends ZodRawShape>(
+    name: string,
+    description: string,
+    paramsSchema: Args,
+    annotations: ToolAnnotations,
+    cb: ToolCallback<Args>
+  ): RegisteredTool;
+
+  /**
+   * Registers a tool with the server and wraps its callback with telemetry tracking
+   *
+   * @param {string} name - The name of the tool to register
+   * @param {...unknown[]} rest - Additional arguments for tool registration, with the last argument being the callback
+   * @returns {RegisteredTool} The registered tool instance
+   */
+  public tool(name: string, ...rest: unknown[]): RegisteredTool {
+    type Extra = RequestHandlerExtra<ServerRequest, ServerNotification>;
+    // The SDK calls back with (extra) for zero-argument tools and (args, extra) for tools with a params schema.
+    type CallbackArgs = [Extra] | [Record<string, unknown>, Extra];
+    // Given the signature of the tool function, the last argument is always the callback
+    const cb = rest[rest.length - 1] as (...cbArgs: CallbackArgs) => CallToolResult | Promise<CallToolResult>;
+
+    // Forward every argument; passing only the first would silently drop `extra` for tools that take params.
+    const wrappedCb = async (...cbArgs: CallbackArgs): Promise<CallToolResult> => {
+      this.telemetry?.sendEvent('MCP_SERVER_TOOL_CALLED', { name });
+      const result = await cb(...cbArgs);
+      if (result.isError) {
+        this.telemetry?.sendEvent('MCP_SERVER_TOOL_ERROR', { name });
+      }
+      return result;
+    };
+
+    // @ts-expect-error because we no longer know what the type of rest is
+    return super.tool(name, ...rest.slice(0, -1), wrappedCb);
+  }
+}
diff --git a/src/telemetry.ts b/src/telemetry.ts
new file mode 100644
index 00000000..1abdf625
--- /dev/null
+++ b/src/telemetry.ts
@@ -0,0 +1,158 @@
+/*
+ * Copyright 2025, Salesforce, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// acknowledge telemetry unless the user has explicitly disabled it
+// create a session id that is sent with every event
+// use the @salesforce/telemetry package to send all events
+// find the user id stored at /Users/<username>/Library/Caches/sf/CLIID.txt
+//   this path is configurable by the user and differs by OS so we need to make a best guess at where it is and then default to a new one
+//   if the file doesn't exist.
+//   a best guess might be to access this.config.cacheDir and replace 'sf-mcp-server' with 'sf'. That will get us the OS-specific paths
+//   but it won't work if the user has a different cache directory set via env var.
+
+import { randomBytes } from 'node:crypto';
+import { readFileSync } from 'node:fs';
+import { join } from 'node:path';
+import { Attributes, TelemetryReporter } from '@salesforce/telemetry';
+import { warn } from '@oclif/core/ux';
+import { Config } from '@oclif/core';
+
+const PROJECT = 'salesforce-mcp-server';
+const APP_INSIGHTS_KEY =
+  'InstrumentationKey=2ca64abb-6123-4c7b-bd9e-4fe73e71fe9c;IngestionEndpoint=https://eastus-1.in.applicationinsights.azure.com/;LiveEndpoint=https://eastus.livediagnostics.monitor.azure.com/;ApplicationId=ecd8fa7a-0e0d-4109-94db-4d7878ada862';
+
+const generateRandomId = (): string => randomBytes(20).toString('hex');
+
+const getCliId = (cacheDir: string): string => {
+  // We need to find sf's cache directory and read the CLIID.txt file from there.
+  // The problem is that sf's cache directory is OS specific and we don't want to
+  // hardcode all the potential paths. oclif does this for us already during startup
+  // so we can simply replace sf-mcp-server with sf in the cache directory path and
+  // end up with the correct OS specific path.
+  //
+  // If the user relocated the cache directory via env var, or the file is missing,
+  // unreadable, or blank, we fall back to a freshly generated id (rare; acceptable for now).
+  const sfCacheDir = cacheDir.replace('sf-mcp-server', 'sf');
+  const cliIdPath = join(sfCacheDir, 'CLIID.txt');
+  try {
+    // Trim so a trailing newline never leaks into the id; `||` deliberately treats '' as missing.
+    return readFileSync(cliIdPath, 'utf-8').trim() || generateRandomId();
+  } catch {
+    return generateRandomId();
+  }
+};
+
+class McpTelemetryReporter extends TelemetryReporter {
+  /**
+   * TelemetryReporter references sf's config to determine if telemetry is enabled.
+   * The MCP server has its own opt-out (the `--no-telemetry` flag), so we override the check to always return true.
+   * This is okay to do since the Telemetry class won't be instantiated in the MCP server if telemetry is disabled.
+   *
+   * @returns true
+   */
+  // eslint-disable-next-line class-methods-use-this
+  public isSfdxTelemetryEnabled(): boolean {
+    return true;
+  }
+}
+
+export class Telemetry {
+  /** A unique identifier for the session; attached to every event this class sends. */
+  private sessionId: string;
+  /**
+   * The unique identifier generated for the user by the `sf` CLI.
+   * If it doesn't exist, or we can't read it, we'll generate a new one.
+   */
+  private cliId: string;
+  /** Guards `start`/`stop` so repeated calls do not emit duplicate lifecycle events. */
+  private started = false;
+  /** Created by `start`; while undefined, `sendEvent` and `stop` send nothing. */
+  private reporter?: McpTelemetryReporter;
+
+  /** Prints the data-collection notice, then resolves the session and CLI identifiers. */
+  public constructor(private readonly config: Config) {
+    warn(
+      'You acknowledge and agree that the MCP server may collect usage information, user environment, and crash reports for the purposes of providing services or functions that are relevant to use of the MCP server and product improvements.'
+    );
+    this.sessionId = generateRandomId();
+    this.cliId = getCliId(config.cacheDir);
+  }
+
+  /**
+   * Sends a custom event tagged with the session and CLI identifiers.
+   * Does nothing if `start` has not created the reporter yet.
+   *
+   * @param {string} eventName - Name of the event to record
+   * @param {Attributes} attributes - Event-specific attributes; the identifiers win on key clashes
+   */
+  public sendEvent(eventName: string, attributes: Attributes): void {
+    this.reporter?.sendTelemetryEvent(eventName, { ...attributes, sessionId: this.sessionId, cliId: this.cliId });
+  }
+
+  /**
+   * Creates and starts the reporter, then emits `MCP_SERVER_STARTED`. Calls made while already started are ignored.
+   *
+   * @param {Attributes} attributes - Startup attributes; the shared lifecycle attributes win on key clashes
+   * @returns {Promise<void>} Settles after the reporter is created and the start event has been passed to it
+   */
+  public async start(attributes: Attributes): Promise<void> {
+    if (this.started) return;
+    this.started = true;
+
+    this.reporter = await McpTelemetryReporter.create({
+      project: PROJECT,
+      key: APP_INSIGHTS_KEY,
+      userId: this.cliId,
+      waitForConnection: true,
+    });
+    this.reporter.start();
+    this.reporter.sendTelemetryEvent('MCP_SERVER_STARTED', { ...attributes, ...this.getLifecycleAttributes() });
+  }
+
+  /** Emits `MCP_SERVER_STOPPED` and stops the reporter; ignored unless a prior `start` call is still in effect. */
+  public stop(): void {
+    if (!this.started) return;
+    this.started = false;
+
+    this.reporter?.sendTelemetryEvent('MCP_SERVER_STOPPED', this.getLifecycleAttributes());
+    this.reporter?.stop();
+  }
+
+  /**
+   * Builds the attributes shared by the start and stop events so the two payloads cannot drift apart.
+   *
+   * @returns {Attributes} Identifiers, system information, and timestamps captured at call time
+   */
+  private getLifecycleAttributes(): Attributes {
+    return {
+      // Identifiers
+      sessionId: this.sessionId,
+      cliId: this.cliId,
+      // System information
+      version: this.config.version,
+      platform: this.config.platform,
+      arch: this.config.arch,
+      nodeVersion: process.version,
+      nodeEnv: process.env.NODE_ENV,
+      shell: this.config.shell,
+      origin: this.config.userAgent,
+      // Timestamps
+      date: new Date().toUTCString(),
+      timestamp: String(Date.now()),
+      processUptime: process.uptime() * 1000,
+    };
+  }
+}
diff --git a/src/tools/core/sf-get-username.ts b/src/tools/core/sf-get-username.ts
index f604ffb5..bdd8ffc1 100644
--- a/src/tools/core/sf-get-username.ts
+++ b/src/tools/core/sf-get-username.ts
@@ -15,11 +15,11 @@
  */
 
 import { z } from 'zod';
-import { McpServer } from '@modelcontextprotocol/sdk/server/mcp.js';
 import { textResponse } from '../../shared/utils.js';
 import { getDefaultTargetOrg, getDefaultTargetDevHub, suggestUsername } from '../../shared/auth.js';
 import { directoryParam } from '../../shared/params.js';
 import { type ConfigInfoWithCache, type ToolTextResponse } from '../../shared/types.js';
+import { SfMcpServer } from '../../sf-mcp-server.js';
 
 /*
  * Get username for Salesforce org
@@ -58,7 +58,7 @@ Get username for my default dev hub
 
 export type GetUsernameParamsSchema = z.infer<typeof getUsernameParamsSchema>;
 
-export const registerToolGetUsername = (server: McpServer): void => {
+export const registerToolGetUsername = (server: SfMcpServer): void => {
   server.tool(
     'sf-get-username',
     `Intelligently determines the appropriate username or alias for Salesforce operations.
diff --git a/src/tools/data/sf-query-org.ts b/src/tools/data/sf-query-org.ts
index 9a2fe98c..ef68bcde 100644
--- a/src/tools/data/sf-query-org.ts
+++ b/src/tools/data/sf-query-org.ts
@@ -29,10 +29,10 @@
 
 import { z } from 'zod';
 
-import { McpServer } from '@modelcontextprotocol/sdk/server/mcp.js';
 import { getConnection } from '../../shared/auth.js';
 import { textResponse } from '../../shared/utils.js';
 import { directoryParam, usernameOrAliasParam } from '../../shared/params.js';
+import { SfMcpServer } from '../../sf-mcp-server.js';
 
 export const queryOrgParamsSchema = z.object({
   query: z.string().describe('SOQL query to run'),
@@ -42,7 +42,7 @@ export const queryOrgParamsSchema = z.object({
 
 export type QueryOrgOptions = z.infer<typeof queryOrgParamsSchema>;
 
-export const registerToolQueryOrg = (server: McpServer): void => {
+export const registerToolQueryOrg = (server: SfMcpServer): void => {
   server.tool(
     'sf-query-org',
     'Run a SOQL query against a Salesforce org.',
diff --git a/src/tools/metadata/sf-deploy-metadata.ts b/src/tools/metadata/sf-deploy-metadata.ts
index 13a47775..c8f70cc8 100644
--- a/src/tools/metadata/sf-deploy-metadata.ts
+++ b/src/tools/metadata/sf-deploy-metadata.ts
@@ -16,7 +16,6 @@
 
 import { z } from 'zod';
 
-import { McpServer } from '@modelcontextprotocol/sdk/server/mcp.js';
 import { Connection, Org, SfProject } from '@salesforce/core';
 import { SourceTracking } from '@salesforce/source-tracking';
 import { ComponentSet, ComponentSetBuilder } from '@salesforce/source-deploy-retrieve';
@@ -25,6 +24,7 @@ import { Duration } from '@salesforce/kit';
 import { directoryParam, usernameOrAliasParam } from '../../shared/params.js';
 import { textResponse } from '../../shared/utils.js';
 import { getConnection } from '../../shared/auth.js';
+import { SfMcpServer } from '../../sf-mcp-server.js';
 
 const deployMetadataParams = z.object({
   sourceDir: z
@@ -81,7 +81,7 @@ export type DeployMetadata = z.infer<typeof deployMetadataParams>;
  * Returns:
  * - textResponse: Deploy result.
  */
-export const registerToolDeployMetadata = (server: McpServer): void => {
+export const registerToolDeployMetadata = (server: SfMcpServer): void => {
   server.tool(
     'sf-deploy-metadata',
     `Deploy metadata to an org from your local project.
diff --git a/src/tools/metadata/sf-retrieve-metadata.ts b/src/tools/metadata/sf-retrieve-metadata.ts
index d7c15eab..2df2d8cf 100644
--- a/src/tools/metadata/sf-retrieve-metadata.ts
+++ b/src/tools/metadata/sf-retrieve-metadata.ts
@@ -16,7 +16,6 @@
 
 import { z } from 'zod';
 
-import { McpServer } from '@modelcontextprotocol/sdk/server/mcp.js';
 import { Connection, Org, SfProject } from '@salesforce/core';
 import { SourceTracking } from '@salesforce/source-tracking';
 import { ComponentSet, ComponentSetBuilder } from '@salesforce/source-deploy-retrieve';
@@ -25,6 +24,7 @@ import { Duration } from '@salesforce/kit';
 import { directoryParam, usernameOrAliasParam } from '../../shared/params.js';
 import { textResponse } from '../../shared/utils.js';
 import { getConnection } from '../../shared/auth.js';
+import { SfMcpServer } from '../../sf-mcp-server.js';
 
 const retrieveMetadataParams = z.object({
   sourceDir: z
@@ -50,7 +50,7 @@ const retrieveMetadataParams = z.object({
  * Returns:
  * - textResponse: Retrieve result.
  */
-export const registerToolRetrieveMetadata = (server: McpServer): void => {
+export const registerToolRetrieveMetadata = (server: SfMcpServer): void => {
   server.tool(
     'sf-retrieve-metadata',
     `Retrieve metadata from an org to your local project.
diff --git a/src/tools/orgs/sf-list-all-orgs.ts b/src/tools/orgs/sf-list-all-orgs.ts
index b2738bd1..a165e715 100644
--- a/src/tools/orgs/sf-list-all-orgs.ts
+++ b/src/tools/orgs/sf-list-all-orgs.ts
@@ -40,6 +40,8 @@ export const listAllOrgsParamsSchema = z.object({
 export type ListAllOrgsOptions = z.infer<typeof listAllOrgsParamsSchema>;
 
 export const registerToolListAllOrgs = (server: McpServer): void => {
+  // eslint-disable-next-line no-console
+  console.error('registerToolListAllOrgs');
   server.tool(
     'sf-list-all-orgs',
     `Lists all configured Salesforce orgs.
@@ -54,6 +56,8 @@ List all orgs
 `,
     listAllOrgsParamsSchema.shape,
     async ({ directory }) => {
+      // eslint-disable-next-line no-console
+      console.error('listAllOrgs', directory);
       try {
         process.chdir(directory);
         const orgs = await getAllAllowedOrgs();
diff --git a/src/tools/users/sf-assign-permission-set.ts b/src/tools/users/sf-assign-permission-set.ts
index f7401a87..e3d028e0 100644
--- a/src/tools/users/sf-assign-permission-set.ts
+++ b/src/tools/users/sf-assign-permission-set.ts
@@ -15,11 +15,11 @@
  */
 
 import { Org, StateAggregator, User } from '@salesforce/core';
-import { McpServer } from '@modelcontextprotocol/sdk/server/mcp.js';
 import { z } from 'zod';
 import { directoryParam, usernameOrAliasParam } from '../../shared/params.js';
 import { textResponse } from '../../shared/utils.js';
 import { getConnection } from '../../shared/auth.js';
+import { SfMcpServer } from '../../sf-mcp-server.js';
 
 /*
  * Assign permission set
@@ -63,7 +63,7 @@ Set the permission set MyPermSet on behalf of my-alias.`),
 
 export type AssignPermissionSetOptions = z.infer<typeof assignPermissionSetParamsSchema>;
 
-export const registerToolAssignPermissionSet = (server: McpServer): void => {
+export const registerToolAssignPermissionSet = (server: SfMcpServer): void => {
   server.tool(
     'sf-assign-permission-set',
     'Assign a permission set to one or more org users.',
diff --git a/yarn.lock b/yarn.lock
index 3b6e04ce..549ab2c7 100644
--- a/yarn.lock
+++ b/yarn.lock
@@ -624,6 +624,82 @@
     "@smithy/types" "^4.3.1"
     tslib "^2.6.2"
 
+"@azure/abort-controller@^2.0.0":
+  version "2.1.2"
+  resolved "https://registry.yarnpkg.com/@azure/abort-controller/-/abort-controller-2.1.2.tgz#42fe0ccab23841d9905812c58f1082d27784566d"
+  integrity sha512-nBrLsEWm4J2u5LpAPjxADTlq3trDgVZZXHNKabeXZtpq3d3AbN/KGO82R87rdDz5/lYB024rtEf10/q0urNgsA==
+  dependencies:
+    tslib "^2.6.2"
+
+"@azure/core-auth@1.7.2":
+  version "1.7.2"
+  resolved "https://registry.yarnpkg.com/@azure/core-auth/-/core-auth-1.7.2.tgz#558b7cb7dd12b00beec07ae5df5907d74df1ebd9"
+  integrity sha512-Igm/S3fDYmnMq1uKS38Ae1/m37B3zigdlZw+kocwEhh5GjyKjPrXKO2J6rzpC1wAxrNil/jX9BJRqBshyjnF3g==
+  dependencies:
+    "@azure/abort-controller" "^2.0.0"
+    "@azure/core-util" "^1.1.0"
+    tslib "^2.6.2"
+
+"@azure/core-auth@^1.4.0":
+  version "1.9.0"
+  resolved "https://registry.yarnpkg.com/@azure/core-auth/-/core-auth-1.9.0.tgz#ac725b03fabe3c892371065ee9e2041bee0fd1ac"
+  integrity sha512-FPwHpZywuyasDSLMqJ6fhbOK3TqUdviZNF8OqRGA4W5Ewib2lEEZ+pBsYcBa88B2NGO/SEnYPGhyBqNlE8ilSw==
+  dependencies:
+    "@azure/abort-controller" "^2.0.0"
+    "@azure/core-util" "^1.11.0"
+    tslib "^2.6.2"
+
+"@azure/core-rest-pipeline@1.16.3":
+  version "1.16.3"
+  resolved "https://registry.yarnpkg.com/@azure/core-rest-pipeline/-/core-rest-pipeline-1.16.3.tgz#bde3bc3ebad7f885ddd9de6af5e5a8fc254b287e"
+  integrity sha512-VxLk4AHLyqcHsfKe4MZ6IQ+D+ShuByy+RfStKfSjxJoL3WBWq17VNmrz8aT8etKzqc2nAeIyLxScjpzsS4fz8w==
+  dependencies:
+    "@azure/abort-controller" "^2.0.0"
+    "@azure/core-auth" "^1.4.0"
+    "@azure/core-tracing" "^1.0.1"
+    "@azure/core-util" "^1.9.0"
+    "@azure/logger" "^1.0.0"
+    http-proxy-agent "^7.0.0"
+    https-proxy-agent "^7.0.0"
+    tslib "^2.6.2"
+
+"@azure/core-tracing@^1.0.1", "@azure/core-tracing@^1.2.0":
+  version "1.2.0"
+  resolved "https://registry.yarnpkg.com/@azure/core-tracing/-/core-tracing-1.2.0.tgz#7be5d53c3522d639cf19042cbcdb19f71bc35ab2"
+  integrity sha512-UKTiEJPkWcESPYJz3X5uKRYyOcJD+4nYph+KpfdPRnQJVrZfk0KJgdnaAWKfhsBBtAf/D58Az4AvCJEmWgIBAg==
+  dependencies:
+    tslib "^2.6.2"
+
+"@azure/core-util@^1.1.0", "@azure/core-util@^1.11.0", "@azure/core-util@^1.9.0":
+  version "1.12.0"
+  resolved "https://registry.yarnpkg.com/@azure/core-util/-/core-util-1.12.0.tgz#0b8c2837e6d67c3fbaeae20df34cf07f66b3480d"
+  integrity sha512-13IyjTQgABPARvG90+N2dXpC+hwp466XCdQXPCRlbWHgd3SJd5Q1VvaBGv6k1BIa4MQm6hAF1UBU1m8QUxV8sQ==
+  dependencies:
+    "@azure/abort-controller" "^2.0.0"
+    "@typespec/ts-http-runtime" "^0.2.2"
+    tslib "^2.6.2"
+
+"@azure/logger@^1.0.0":
+  version "1.2.0"
+  resolved "https://registry.yarnpkg.com/@azure/logger/-/logger-1.2.0.tgz#a79aefcdd57d2a96603fab59c9a66e0d9022a564"
+  integrity sha512-0hKEzLhpw+ZTAfNJyRrn6s+V0nDWzXk9OjBr2TiGIu0OfMr5s2V4FpKLTAK3Ca5r5OKLbf4hkOGDPyiRjie/jA==
+  dependencies:
+    "@typespec/ts-http-runtime" "^0.2.2"
+    tslib "^2.6.2"
+
+"@azure/opentelemetry-instrumentation-azure-sdk@^1.0.0-beta.5":
+  version "1.0.0-beta.9"
+  resolved "https://registry.yarnpkg.com/@azure/opentelemetry-instrumentation-azure-sdk/-/opentelemetry-instrumentation-azure-sdk-1.0.0-beta.9.tgz#d8451d39c342df2acbc6f4a416902bbd2315f133"
+  integrity sha512-gNCFokEoQQEkhu2T8i1i+1iW2o9wODn2slu5tpqJmjV1W7qf9dxVv6GNXW1P1WC8wMga8BCc2t/oMhOK3iwRQg==
+  dependencies:
+    "@azure/core-tracing" "^1.2.0"
+    "@azure/logger" "^1.0.0"
+    "@opentelemetry/api" "^1.9.0"
+    "@opentelemetry/core" "^2.0.0"
+    "@opentelemetry/instrumentation" "^0.200.0"
+    "@opentelemetry/sdk-trace-web" "^2.0.0"
+    tslib "^2.7.0"
+
 "@babel/code-frame@^7.0.0", "@babel/code-frame@^7.26.2":
   version "7.26.2"
   resolved "https://registry.npmjs.org/@babel/code-frame/-/code-frame-7.26.2.tgz"
@@ -1288,6 +1364,11 @@
     node-fetch "^2.6.1"
     xml2js "^0.6.2"
 
+"@microsoft/applicationinsights-web-snippet@1.0.1":
+  version "1.0.1"
+  resolved "https://registry.yarnpkg.com/@microsoft/applicationinsights-web-snippet/-/applicationinsights-web-snippet-1.0.1.tgz#6bb788b2902e48bf5d460c38c6bb7fedd686ddd7"
+  integrity sha512-2IHAOaLauc8qaAitvWS+U931T+ze+7MNWrDHY47IENP5y2UA0vqJDu67kWZDdpCN1fFC77sfgfB+HV7SrKshnQ==
+
 "@modelcontextprotocol/inspector-cli@^0.14.0":
   version "0.14.0"
   resolved "https://registry.yarnpkg.com/@modelcontextprotocol/inspector-cli/-/inspector-cli-0.14.0.tgz#880ddbd921a98fac5c9765cdff20e4dd13e93bfc"
@@ -1494,6 +1575,95 @@
     lodash "^4.17.21"
     registry-auth-token "^5.1.0"
 
+"@opentelemetry/api-logs@0.200.0":
+  version "0.200.0"
+  resolved "https://registry.yarnpkg.com/@opentelemetry/api-logs/-/api-logs-0.200.0.tgz#f9015fd844920c13968715b3cdccf5a4d4ff907e"
+  integrity sha512-IKJBQxh91qJ+3ssRly5hYEJ8NDHu9oY/B1PXVSCWf7zytmYO9RNLB0Ox9XQ/fJ8m6gY6Q6NtBWlmXfaXt5Uc4Q==
+  dependencies:
+    "@opentelemetry/api" "^1.3.0"
+
+"@opentelemetry/api@^1.3.0", "@opentelemetry/api@^1.7.0", "@opentelemetry/api@^1.9.0":
+  version "1.9.0"
+  resolved "https://registry.yarnpkg.com/@opentelemetry/api/-/api-1.9.0.tgz#d03eba68273dc0f7509e2a3d5cba21eae10379fe"
+  integrity sha512-3giAOQvZiH5F9bMlMiv8+GSPMeqg0dbaeo58/0SlA9sxSqZhnUtxzX9/2FzyhS9sWQf5S0GJE0AKBrFqjpeYcg==
+
+"@opentelemetry/core@1.30.1", "@opentelemetry/core@^1.19.0":
+  version "1.30.1"
+  resolved "https://registry.yarnpkg.com/@opentelemetry/core/-/core-1.30.1.tgz#a0b468bb396358df801881709ea38299fc30ab27"
+  integrity sha512-OOCM2C/QIURhJMuKaekP3TRBxBKxG/TWWA0TL2J6nXUtDnuCtccy49LUJF8xPFXMX+0LMcxFpCo8M9cGY1W6rQ==
+  dependencies:
+    "@opentelemetry/semantic-conventions" "1.28.0"
+
+"@opentelemetry/core@2.0.1", "@opentelemetry/core@^2.0.0":
+  version "2.0.1"
+  resolved "https://registry.yarnpkg.com/@opentelemetry/core/-/core-2.0.1.tgz#44e1149d5666a4743cde943ef89841db3ce0f8bc"
+  integrity sha512-MaZk9SJIDgo1peKevlbhP6+IwIiNPNmswNL4AF0WaQJLbHXjr9SrZMgS12+iqr9ToV4ZVosCcc0f8Rg67LXjxw==
+  dependencies:
+    "@opentelemetry/semantic-conventions" "^1.29.0"
+
+"@opentelemetry/instrumentation@^0.200.0":
+  version "0.200.0"
+  resolved "https://registry.yarnpkg.com/@opentelemetry/instrumentation/-/instrumentation-0.200.0.tgz#29d1d4f70cbf0cb1ca9f2f78966379b0be96bddc"
+  integrity sha512-pmPlzfJd+vvgaZd/reMsC8RWgTXn2WY1OWT5RT42m3aOn5532TozwXNDhg1vzqJ+jnvmkREcdLr27ebJEQt0Jg==
+  dependencies:
+    "@opentelemetry/api-logs" "0.200.0"
+    "@types/shimmer" "^1.2.0"
+    import-in-the-middle "^1.8.1"
+    require-in-the-middle "^7.1.1"
+    shimmer "^1.2.1"
+
+"@opentelemetry/resources@1.30.1":
+  version "1.30.1"
+  resolved "https://registry.yarnpkg.com/@opentelemetry/resources/-/resources-1.30.1.tgz#a4eae17ebd96947fdc7a64f931ca4b71e18ce964"
+  integrity sha512-5UxZqiAgLYGFjS4s9qm5mBVo433u+dSPUFWVWXmLAD4wB65oMCoXaJP1KJa9DIYYMeHu3z4BZcStG3LC593cWA==
+  dependencies:
+    "@opentelemetry/core" "1.30.1"
+    "@opentelemetry/semantic-conventions" "1.28.0"
+
+"@opentelemetry/resources@2.0.1":
+  version "2.0.1"
+  resolved "https://registry.yarnpkg.com/@opentelemetry/resources/-/resources-2.0.1.tgz#0365d134291c0ed18d96444a1e21d0e6a481c840"
+  integrity sha512-dZOB3R6zvBwDKnHDTB4X1xtMArB/d324VsbiPkX/Yu0Q8T2xceRthoIVFhJdvgVM2QhGVUyX9tzwiNxGtoBJUw==
+  dependencies:
+    "@opentelemetry/core" "2.0.1"
+    "@opentelemetry/semantic-conventions" "^1.29.0"
+
+"@opentelemetry/sdk-trace-base@2.0.1":
+  version "2.0.1"
+  resolved "https://registry.yarnpkg.com/@opentelemetry/sdk-trace-base/-/sdk-trace-base-2.0.1.tgz#25808bb6a3d08a501ad840249e4d43d3493eb6e5"
+  integrity sha512-xYLlvk/xdScGx1aEqvxLwf6sXQLXCjk3/1SQT9X9AoN5rXRhkdvIFShuNNmtTEPRBqcsMbS4p/gJLNI2wXaDuQ==
+  dependencies:
+    "@opentelemetry/core" "2.0.1"
+    "@opentelemetry/resources" "2.0.1"
+    "@opentelemetry/semantic-conventions" "^1.29.0"
+
+"@opentelemetry/sdk-trace-base@^1.19.0":
+  version "1.30.1"
+  resolved "https://registry.yarnpkg.com/@opentelemetry/sdk-trace-base/-/sdk-trace-base-1.30.1.tgz#41a42234096dc98e8f454d24551fc80b816feb34"
+  integrity sha512-jVPgBbH1gCy2Lb7X0AVQ8XAfgg0pJ4nvl8/IiQA6nxOsPvS+0zMJaFSs2ltXe0J6C8dqjcnpyqINDJmU30+uOg==
+  dependencies:
+    "@opentelemetry/core" "1.30.1"
+    "@opentelemetry/resources" "1.30.1"
+    "@opentelemetry/semantic-conventions" "1.28.0"
+
+"@opentelemetry/sdk-trace-web@^2.0.0":
+  version "2.0.1"
+  resolved "https://registry.yarnpkg.com/@opentelemetry/sdk-trace-web/-/sdk-trace-web-2.0.1.tgz#ad6f590cbc1a1a2e800a3815bd6b1923c8c78a4d"
+  integrity sha512-R4/i0rISvAujG4Zwk3s6ySyrWG+Db3SerZVM4jZ2lEzjrNylF7nRAy1hVvWe8gTbwIxX+6w6ZvZwdtl2C7UQHQ==
+  dependencies:
+    "@opentelemetry/core" "2.0.1"
+    "@opentelemetry/sdk-trace-base" "2.0.1"
+
+"@opentelemetry/semantic-conventions@1.28.0":
+  version "1.28.0"
+  resolved "https://registry.yarnpkg.com/@opentelemetry/semantic-conventions/-/semantic-conventions-1.28.0.tgz#337fb2bca0453d0726696e745f50064411f646d6"
+  integrity sha512-lp4qAiMTD4sNWW4DbKLBkfiMZ4jbAboJIGOQr5DvciMRI494OapieI9qiODpOt0XBr1LjIDy1xAGAnVs5supTA==
+
+"@opentelemetry/semantic-conventions@^1.19.0", "@opentelemetry/semantic-conventions@^1.29.0":
+  version "1.34.0"
+  resolved "https://registry.yarnpkg.com/@opentelemetry/semantic-conventions/-/semantic-conventions-1.34.0.tgz#8b6a46681b38a4d5947214033ac48128328c1738"
+  integrity sha512-aKcOkyrorBGlajjRdVoJWHTxfxO1vCNHLJVlSDaRHDIdjU+pX8IYQPvPDkYiujKLbRnWU+1TBwEt0QRgSm4SGA==
+
 "@pkgjs/parseargs@^0.11.0":
   version "0.11.0"
   resolved "https://registry.npmjs.org/@pkgjs/parseargs/-/parseargs-0.11.0.tgz"
@@ -1992,6 +2162,17 @@
     isomorphic-git "^1.30.1"
     ts-retry-promise "^0.8.1"
 
+"@salesforce/telemetry@^6.0.39":
+  version "6.0.39"
+  resolved "https://registry.yarnpkg.com/@salesforce/telemetry/-/telemetry-6.0.39.tgz#b27cddd42948ba59d0aab9edde3db8e434ad68b4"
+  integrity sha512-5Wc+c7TOSeevqDpANVBDgJ17/fnOSmw8Ok/d3jhPli757vyHg0FAJyX9x+1mXxZhDtYJag9yJZ1RWEjOB+7RgQ==
+  dependencies:
+    "@salesforce/core" "^8.8.0"
+    "@salesforce/kit" "^3.2.3"
+    applicationinsights "^2.9.6"
+    got "^11"
+    proxy-agent "^6.5.0"
+
 "@salesforce/ts-types@^2.0.10", "@salesforce/ts-types@^2.0.11", "@salesforce/ts-types@^2.0.12":
   version "2.0.12"
   resolved "https://registry.npmjs.org/@salesforce/ts-types/-/ts-types-2.0.12.tgz"
@@ -2775,6 +2956,11 @@
     "@types/glob" "~7.2.0"
     "@types/node" "*"
 
+"@types/shimmer@^1.2.0":
+  version "1.2.0"
+  resolved "https://registry.yarnpkg.com/@types/shimmer/-/shimmer-1.2.0.tgz#9b706af96fa06416828842397a70dfbbf1c14ded"
+  integrity sha512-UE7oxhQLLd9gub6JKIAhDq06T0F6FnztwMNRvYgjeQSBeMc1ZG/tA47EwfduvkuQS8apbkM/lpLpWsaCeYsXVg==
+
 "@types/sinon@^10.0.20":
   version "10.0.20"
   resolved "https://registry.npmjs.org/@types/sinon/-/sinon-10.0.20.tgz"
@@ -2928,6 +3114,15 @@
     "@typescript-eslint/types" "7.18.0"
     eslint-visitor-keys "^3.4.3"
 
+"@typespec/ts-http-runtime@^0.2.2":
+  version "0.2.3"
+  resolved "https://registry.yarnpkg.com/@typespec/ts-http-runtime/-/ts-http-runtime-0.2.3.tgz#5a5796588ba050b57bda58852697d6173377b647"
+  integrity sha512-oRhjSzcVjX8ExyaF8hC0zzTqxlVuRlgMHL/Bh4w3xB9+wjbm0FpXylVU/lBrn+kgphwYTrOk3tp+AVShGmlYCg==
+  dependencies:
+    http-proxy-agent "^7.0.0"
+    https-proxy-agent "^7.0.0"
+    tslib "^2.6.2"
+
 "@ungap/structured-clone@^1.0.0", "@ungap/structured-clone@^1.2.0":
   version "1.3.0"
   resolved "https://registry.npmjs.org/@ungap/structured-clone/-/structured-clone-1.3.0.tgz"
@@ -2956,6 +3151,11 @@ accepts@^2.0.0:
     mime-types "^3.0.0"
     negotiator "^1.0.0"
 
+acorn-import-attributes@^1.9.5:
+  version "1.9.5"
+  resolved "https://registry.yarnpkg.com/acorn-import-attributes/-/acorn-import-attributes-1.9.5.tgz#7eb1557b1ba05ef18b5ed0ec67591bfab04688ef"
+  integrity sha512-n02Vykv5uA3eHGM/Z2dQrcD56kL8TyDb2p1+0P83PClMnC/nc+anbQRhIOWnSq4Ke/KvDPrY3C9hDtC/A3eHnQ==
+
 acorn-jsx@^5.3.2:
   version "5.3.2"
   resolved "https://registry.npmjs.org/acorn-jsx/-/acorn-jsx-5.3.2.tgz"
@@ -2966,6 +3166,11 @@ acorn-walk@^8.1.1:
   resolved "https://registry.npmjs.org/acorn-walk/-/acorn-walk-8.2.0.tgz"
   integrity sha512-k+iyHEuPgSw6SbuDpGQM+06HQUa04DZ3o+F6CSzXMvvI5KMvnaEqXe+YVe555R9nn6GPt404fos4wcgpw12SDA==
 
+acorn@^8.14.0:
+  version "8.15.0"
+  resolved "https://registry.yarnpkg.com/acorn/-/acorn-8.15.0.tgz#a360898bc415edaac46c8241f6383975b930b816"
+  integrity sha512-NZyJarBfL7nWwIq+FDL6Zp/yHEhePMNnnJ0y3qfieCrmNvYct8uvtiV41UvlSe6apAfk0fY1FbWx+NwfmpvtTg==
+
 acorn@^8.4.1, acorn@^8.9.0:
   version "8.11.3"
   resolved "https://registry.npmjs.org/acorn/-/acorn-8.11.3.tgz"
@@ -3065,6 +3270,24 @@ append-transform@^2.0.0:
   dependencies:
     default-require-extensions "^3.0.0"
 
+applicationinsights@^2.9.6:
+  version "2.9.7"
+  resolved "https://registry.yarnpkg.com/applicationinsights/-/applicationinsights-2.9.7.tgz#78a2a8e27497eb697ff0bc0ac28625d95a1db0fa"
+  integrity sha512-dxIVB2AAEMec3FDiYThgEbc9R4u1TatrzL+kFgOf+ABaEgHm8+i8ngVLHfKObjHvy2HHPf810OLWTrqyeWT/oA==
+  dependencies:
+    "@azure/core-auth" "1.7.2"
+    "@azure/core-rest-pipeline" "1.16.3"
+    "@azure/opentelemetry-instrumentation-azure-sdk" "^1.0.0-beta.5"
+    "@microsoft/applicationinsights-web-snippet" "1.0.1"
+    "@opentelemetry/api" "^1.7.0"
+    "@opentelemetry/core" "^1.19.0"
+    "@opentelemetry/sdk-trace-base" "^1.19.0"
+    "@opentelemetry/semantic-conventions" "^1.19.0"
+    cls-hooked "^4.2.2"
+    continuation-local-storage "^3.2.1"
+    diagnostic-channel "1.1.1"
+    diagnostic-channel-publishers "1.0.8"
+
 archy@^1.0.0:
   version "1.0.0"
   resolved "https://registry.npmjs.org/archy/-/archy-1.0.0.tgz"
@@ -3206,6 +3429,21 @@ ast-types@^0.13.4:
   dependencies:
     tslib "^2.0.1"
 
+async-hook-jl@^1.7.6:
+  version "1.7.6"
+  resolved "https://registry.yarnpkg.com/async-hook-jl/-/async-hook-jl-1.7.6.tgz#4fd25c2f864dbaf279c610d73bf97b1b28595e68"
+  integrity sha512-gFaHkFfSxTjvoxDMYqDuGHlcRyUuamF8s+ZTtJdDzqjws4mCt7v0vuV79/E2Wr2/riMQgtG4/yUtXWs1gZ7JMg==
+  dependencies:
+    stack-chain "^1.3.7"
+
+async-listener@^0.6.0:
+  version "0.6.10"
+  resolved "https://registry.yarnpkg.com/async-listener/-/async-listener-0.6.10.tgz#a7c97abe570ba602d782273c0de60a51e3e17cbc"
+  integrity sha512-gpuo6xOyF4D5DE5WvyqZdPA3NGhiT6Qf07l7DCB0wwDEsLvDIbCr6j9S5aj5Ch96dLace5tXVzWBZkxU/c5ohw==
+  dependencies:
+    semver "^5.3.0"
+    shimmer "^1.1.0"
+
 async-lock@^1.4.1:
   version "1.4.1"
   resolved "https://registry.npmjs.org/async-lock/-/async-lock-1.4.1.tgz"
@@ -3584,6 +3822,11 @@ ci-info@^4.0.0:
   resolved "https://registry.npmjs.org/ci-info/-/ci-info-4.0.0.tgz"
   integrity sha512-TdHqgGf9odd8SXNuxtUBVx8Nv+qZOejE6qyqiy5NtbYYQOeFa6zmHkxlPzmaLxWWHsU6nJmB7AETdVPi+2NBUg==
 
+cjs-module-lexer@^1.2.2:
+  version "1.4.3"
+  resolved "https://registry.yarnpkg.com/cjs-module-lexer/-/cjs-module-lexer-1.4.3.tgz#0f79731eb8cfe1ec72acd4066efac9d61991b00d"
+  integrity sha512-9z8TZaGM1pfswYeXrUpzPrkx8UnWYdhJclsiYMm6x/w5+nN+8Tf/LnAgfLGQCm59qAOxU8WwHEq2vNwF6i4j+Q==
+
 class-variance-authority@^0.7.0:
   version "0.7.1"
   resolved "https://registry.npmjs.org/class-variance-authority/-/class-variance-authority-0.7.1.tgz"
@@ -3659,6 +3902,15 @@ clone-response@^1.0.2:
   dependencies:
     mimic-response "^1.0.0"
 
+cls-hooked@^4.2.2:
+  version "4.2.2"
+  resolved "https://registry.yarnpkg.com/cls-hooked/-/cls-hooked-4.2.2.tgz#ad2e9a4092680cdaffeb2d3551da0e225eae1908"
+  integrity sha512-J4Xj5f5wq/4jAvcdgoGsL3G103BtWpZrMo8NEinRltN+xpTZdI+M38pyQqhuFU/P792xkMFvnKSf+Lm81U1bxw==
+  dependencies:
+    async-hook-jl "^1.7.6"
+    emitter-listener "^1.0.1"
+    semver "^5.4.1"
+
 clsx@^2.1.1:
   version "2.1.1"
   resolved "https://registry.npmjs.org/clsx/-/clsx-2.1.1.tgz"
@@ -3778,6 +4030,14 @@ content-type@^1.0.4, content-type@^1.0.5:
   resolved "https://registry.npmjs.org/content-type/-/content-type-1.0.5.tgz"
   integrity sha512-nTjqfcBFEipKdXCv4YDQWCfmcLZKm81ldF0pAopTvyrFGVbcR6P/VAAd5G7N+0tTr8QqiU0tFadD6FK4NtJwOA==
 
+continuation-local-storage@^3.2.1:
+  version "3.2.1"
+  resolved "https://registry.yarnpkg.com/continuation-local-storage/-/continuation-local-storage-3.2.1.tgz#11f613f74e914fe9b34c92ad2d28fe6ae1db7ffb"
+  integrity sha512-jx44cconVqkCEEyLSKWwkvUXwO561jXMa3LPjTPsm5QR22PA0/mhe33FT4Xb5y74JDvt/Cq+5lm8S8rskLv9ZA==
+  dependencies:
+    async-listener "^0.6.0"
+    emitter-listener "^1.1.1"
+
 conventional-changelog-angular@^6.0.0:
   version "6.0.0"
   resolved "https://registry.npmjs.org/conventional-changelog-angular/-/conventional-changelog-angular-6.0.0.tgz"
@@ -4069,6 +4329,18 @@ devlop@^1.0.0:
   dependencies:
     dequal "^2.0.0"
 
+diagnostic-channel-publishers@1.0.8:
+  version "1.0.8"
+  resolved "https://registry.yarnpkg.com/diagnostic-channel-publishers/-/diagnostic-channel-publishers-1.0.8.tgz#700557a902c443cb11f999f19f50a8bb3be490a0"
+  integrity sha512-HmSm9hXxSPxA9BaLGY98QU1zsdjeCk113KjAYGPCen1ZP6mhVaTPzHd6UYv5r21DnWANi+f+NyPOHruGT9jpqQ==
+
+diagnostic-channel@1.1.1:
+  version "1.1.1"
+  resolved "https://registry.yarnpkg.com/diagnostic-channel/-/diagnostic-channel-1.1.1.tgz#44b60972de9ee055c16216535b0e9db3f6a0efd0"
+  integrity sha512-r2HV5qFkUICyoaKlBEpLKHjxMXATUf/l+h8UZPGBHGLy4DDiY2sOLcIctax4eRnTw5wH2jTMExLntGPJ8eOJxw==
+  dependencies:
+    semver "^7.5.3"
+
 diff3@0.0.3:
   version "0.0.3"
   resolved "https://registry.npmjs.org/diff3/-/diff3-0.0.3.tgz"
@@ -4188,6 +4460,13 @@ electron-to-chromium@^1.5.73:
   resolved "https://registry.npmjs.org/electron-to-chromium/-/electron-to-chromium-1.5.141.tgz"
   integrity sha512-qS+qH9oqVYc1ooubTiB9l904WVyM6qNYxtOEEGReoZXw3xlqeYdFr5GclNzbkAufWgwWLEPoDi3d9MoRwwIjGw==
 
+emitter-listener@^1.0.1, emitter-listener@^1.1.1:
+  version "1.1.2"
+  resolved "https://registry.yarnpkg.com/emitter-listener/-/emitter-listener-1.1.2.tgz#56b140e8f6992375b3d7cb2cab1cc7432d9632e8"
+  integrity sha512-Bt1sBAGFHY9DKY+4/2cV6izcKJUf5T7/gkdmkxzX/qv9CcGH8xSwVRW5mtX03SWJtRTWSOpzCuWN9rBFYZepZQ==
+  dependencies:
+    shimmer "^1.2.0"
+
 emoji-regex-xs@^1.0.0:
   version "1.0.0"
   resolved "https://registry.npmjs.org/emoji-regex-xs/-/emoji-regex-xs-1.0.0.tgz"
@@ -5187,7 +5466,7 @@ gopd@^1.0.1, gopd@^1.2.0:
   resolved "https://registry.npmjs.org/gopd/-/gopd-1.2.0.tgz"
   integrity sha512-ZUKRh6/kUFoAiTAtTYPZJ3hw9wNxx+BIBOijnlG9PnrJsCcSjs1wyyD6vJpaYtgnzDrKYRSqf3OO6Rfa93xsRg==
 
-got@^11.8.6:
+got@^11, got@^11.8.6:
   version "11.8.6"
   resolved "https://registry.npmjs.org/got/-/got-11.8.6.tgz"
   integrity sha512-6tfZ91bOr7bOXnK7PRDCGBLa1H4U080YHNaAQ2KsMGlLEzRbk44nsZF2E1IeRc3vtJHPVbKCYgdFbaGO2ljd8g==
@@ -5441,7 +5720,7 @@ https-proxy-agent@^5.0.0:
     agent-base "6"
     debug "4"
 
-https-proxy-agent@^7.0.1, https-proxy-agent@^7.0.6:
+https-proxy-agent@^7.0.0, https-proxy-agent@^7.0.1, https-proxy-agent@^7.0.6:
   version "7.0.6"
   resolved "https://registry.npmjs.org/https-proxy-agent/-/https-proxy-agent-7.0.6.tgz"
   integrity sha512-vK9P5/iUfdl95AI+JVyUuIcVtd4ofvtrOr3HNtM2yxC9bnMbEdp3x01OhQNnjb8IJYi38VlTE3mBXwcfvywuSw==
@@ -5501,6 +5780,16 @@ import-fresh@^3.0.0, import-fresh@^3.2.1, import-fresh@^3.3.0:
     parent-module "^1.0.0"
     resolve-from "^4.0.0"
 
+import-in-the-middle@^1.8.1:
+  version "1.14.0"
+  resolved "https://registry.yarnpkg.com/import-in-the-middle/-/import-in-the-middle-1.14.0.tgz#c9b6cd3400718ff0acbbf5c870adf708bbf5ef39"
+  integrity sha512-g5zLT0HaztRJWysayWYiUq/7E5H825QIiecMD2pI5QO7Wzr847l6GDvPvmZaDIdrDtS2w7qRczywxiK6SL5vRw==
+  dependencies:
+    acorn "^8.14.0"
+    acorn-import-attributes "^1.9.5"
+    cjs-module-lexer "^1.2.2"
+    module-details-from-path "^1.0.3"
+
 imurmurhash@^0.1.4:
   version "0.1.4"
   resolved "https://registry.npmjs.org/imurmurhash/-/imurmurhash-0.1.4.tgz"
@@ -5615,6 +5904,13 @@ is-core-module@^2.13.0, is-core-module@^2.13.1, is-core-module@^2.5.0:
   dependencies:
     hasown "^2.0.0"
 
+is-core-module@^2.16.0:
+  version "2.16.1"
+  resolved "https://registry.yarnpkg.com/is-core-module/-/is-core-module-2.16.1.tgz#2a98801a849f43e2add644fbb6bc6229b19a4ef4"
+  integrity sha512-UfoeMA6fIJ8wTYFEUjelnaGI67v6+N7qXJEvQuIGa99l4xsCruSYOVSQ0uPANn4dAzm8lkYPaKLrrijLq7x23w==
+  dependencies:
+    hasown "^2.0.2"
+
 is-date-object@^1.0.1:
   version "1.0.5"
   resolved "https://registry.npmjs.org/is-date-object/-/is-date-object-1.0.5.tgz"
@@ -6638,6 +6934,11 @@ mocha@^10.7.0:
     yargs-parser "^20.2.9"
     yargs-unparser "^2.0.0"
 
+module-details-from-path@^1.0.3:
+  version "1.0.4"
+  resolved "https://registry.yarnpkg.com/module-details-from-path/-/module-details-from-path-1.0.4.tgz#b662fdcd93f6c83d3f25289da0ce81c8d9685b94"
+  integrity sha512-EGWKgxALGMgzvxYF1UyGTy0HXX/2vHLkw6+NvDKW2jypWbHpjQuj4UMcqQWXHERJhVGKikolT06G3bcKe4fi7w==
+
 mri@^1.2.0:
   version "1.2.0"
   resolved "https://registry.npmjs.org/mri/-/mri-1.2.0.tgz"
@@ -7357,7 +7658,7 @@ proxy-addr@^2.0.7:
     forwarded "0.2.0"
     ipaddr.js "1.9.1"
 
-proxy-agent@^6.4.0:
+proxy-agent@^6.4.0, proxy-agent@^6.5.0:
   version "6.5.0"
   resolved "https://registry.npmjs.org/proxy-agent/-/proxy-agent-6.5.0.tgz"
   integrity sha512-TmatMXdr2KlRiA2CyDu8GqR8EjahTG3aY3nXjdzFyoZbmB8hrBsTyMezhULIXKnC0jpfjlmiZ3+EaCzoInSu/A==
@@ -7650,6 +7951,15 @@ require-from-string@^2.0.2:
   resolved "https://registry.npmjs.org/require-from-string/-/require-from-string-2.0.2.tgz"
   integrity sha512-Xf0nWe6RseziFMu+Ap9biiUbmplq6S9/p+7w7YXP/JBHhrUDDUhwa+vANyubuqfZWTveU//DYVGsDG7RKL/vEw==
 
+require-in-the-middle@^7.1.1:
+  version "7.5.2"
+  resolved "https://registry.yarnpkg.com/require-in-the-middle/-/require-in-the-middle-7.5.2.tgz#dc25b148affad42e570cf0e41ba30dc00f1703ec"
+  integrity sha512-gAZ+kLqBdHarXB64XpAe2VCjB7rIRv+mU8tfRWziHRJ5umKsIHN2tLLv6EtMw7WCdP19S0ERVMldNvxYCHnhSQ==
+  dependencies:
+    debug "^4.3.5"
+    module-details-from-path "^1.0.3"
+    resolve "^1.22.8"
+
 require-main-filename@^2.0.0:
   version "2.0.0"
   resolved "https://registry.npmjs.org/require-main-filename/-/require-main-filename-2.0.0.tgz"
@@ -7691,6 +8001,15 @@ resolve@^1.1.6, resolve@^1.10.0, resolve@^1.22.2, resolve@^1.22.4:
     path-parse "^1.0.7"
     supports-preserve-symlinks-flag "^1.0.0"
 
+resolve@^1.22.8:
+  version "1.22.10"
+  resolved "https://registry.yarnpkg.com/resolve/-/resolve-1.22.10.tgz#b663e83ffb09bbf2386944736baae803029b8b39"
+  integrity sha512-NPRy+/ncIMeDlTAsuqwKIiferiawhefFJtkNSW0qZJEqMEb+qBt/77B/jGeeek+F0uOeN05CDa6HXbbIgtVX4w==
+  dependencies:
+    is-core-module "^2.16.0"
+    path-parse "^1.0.7"
+    supports-preserve-symlinks-flag "^1.0.0"
+
 responselike@^2.0.0:
   version "2.0.1"
   resolved "https://registry.npmjs.org/responselike/-/responselike-2.0.1.tgz"
@@ -7825,6 +8144,11 @@ semver@7.5.4:
   dependencies:
     lru-cache "^6.0.0"
 
+semver@^5.3.0, semver@^5.4.1:
+  version "5.7.2"
+  resolved "https://registry.yarnpkg.com/semver/-/semver-5.7.2.tgz#48d55db737c3287cd4835e17fa13feace1c41ef8"
+  integrity sha512-cBznnQ9KjJqU67B52RMC65CMarK2600WFnbkcaiwWq3xy/5haFJlshgnpjovMVJ+Hff49d8GEn0b87C5pDQ10g==
+
 semver@^6.0.0, semver@^6.3.1:
   version "6.3.1"
   resolved "https://registry.npmjs.org/semver/-/semver-6.3.1.tgz"
@@ -7835,7 +8159,7 @@ semver@^7.3.4, semver@^7.5.4, semver@^7.6.0, semver@^7.6.3:
   resolved "https://registry.npmjs.org/semver/-/semver-7.7.1.tgz"
   integrity sha512-hlq8tAfn0m/61p4BVRcPzIGr6LKiMwo4VM6dGi6pt4qcRkmNzTcWq6eCEjEh+qXjkMDvPlOFFSGwQjoEa6gyMA==
 
-semver@^7.3.5, semver@^7.7.1:
+semver@^7.3.5, semver@^7.5.3, semver@^7.7.1:
   version "7.7.2"
   resolved "https://registry.npmjs.org/semver/-/semver-7.7.2.tgz"
   integrity sha512-RF0Fw+rO5AMf9MAyaRXI4AV0Ulj5lMHqVxxdSgiVbixSCXoEmmX/jk0CuJw4+3SqroYO9VoUh+HcuJivvtJemA==
@@ -7991,6 +8315,11 @@ shiki@^1.16.2:
     "@shikijs/vscode-textmate" "^10.0.1"
     "@types/hast" "^3.0.4"
 
+shimmer@^1.1.0, shimmer@^1.2.0, shimmer@^1.2.1:
+  version "1.2.1"
+  resolved "https://registry.yarnpkg.com/shimmer/-/shimmer-1.2.1.tgz#610859f7de327b587efebf501fb43117f9aff337"
+  integrity sha512-sQTKC1Re/rM6XyFM6fIAGHRPVGvyXfgzIDvzoq608vM+jeyVD0Tu1E6Np0Kc2zAIFWIj963V2800iF/9LPieQw==
+
 side-channel-list@^1.0.0:
   version "1.0.0"
   resolved "https://registry.npmjs.org/side-channel-list/-/side-channel-list-1.0.0.tgz"
@@ -8239,21 +8568,17 @@ srcset@^5.0.0:
   resolved "https://registry.npmjs.org/srcset/-/srcset-5.0.1.tgz"
   integrity sha512-/P1UYbGfJVlxZag7aABNRrulEXAwCSDo7fklafOQrantuPTDmYgijJMks2zusPCVzgW9+4P69mq7w6pYuZpgxw==
 
+stack-chain@^1.3.7:
+  version "1.3.7"
+  resolved "https://registry.yarnpkg.com/stack-chain/-/stack-chain-1.3.7.tgz#d192c9ff4ea6a22c94c4dd459171e3f00cea1285"
+  integrity sha512-D8cWtWVdIe/jBA7v5p5Hwl5yOSOrmZPWDPe2KxQ5UAGD+nxbxU0lKXA4h85Ta6+qgdKVL3vUxsbIZjc1kBG7ug==
+
 statuses@2.0.1, statuses@^2.0.1:
   version "2.0.1"
   resolved "https://registry.npmjs.org/statuses/-/statuses-2.0.1.tgz"
   integrity sha512-RwNA9Z/7PrK06rYLIzFMlaF+l73iwpzsqRIFgbMLbTcLD6cOao82TaWefPXQvB2fOC4AjuYSEndS7N/mTCbkdQ==
 
-"string-width-cjs@npm:string-width@^4.2.0":
-  version "4.2.3"
-  resolved "https://registry.npmjs.org/string-width/-/string-width-4.2.3.tgz"
-  integrity sha512-wKyQRQpjJ0sIp62ErSZdGsjMJWsap5oRNihHhu6G7JVO/9jIB6UyevL+tXuOqrng8j/cxKTWyWUwvSTriiZz/g==
-  dependencies:
-    emoji-regex "^8.0.0"
-    is-fullwidth-code-point "^3.0.0"
-    strip-ansi "^6.0.1"
-
-string-width@^4.0.0, string-width@^4.1.0, string-width@^4.2.0, string-width@^4.2.3:
+"string-width-cjs@npm:string-width@^4.2.0", string-width@^4.0.0, string-width@^4.1.0, string-width@^4.2.0, string-width@^4.2.3:
   version "4.2.3"
   resolved "https://registry.npmjs.org/string-width/-/string-width-4.2.3.tgz"
   integrity sha512-wKyQRQpjJ0sIp62ErSZdGsjMJWsap5oRNihHhu6G7JVO/9jIB6UyevL+tXuOqrng8j/cxKTWyWUwvSTriiZz/g==
@@ -8320,14 +8645,7 @@ stringify-entities@^4.0.0:
     character-entities-html4 "^2.0.0"
     character-entities-legacy "^3.0.0"
 
-"strip-ansi-cjs@npm:strip-ansi@^6.0.1":
-  version "6.0.1"
-  resolved "https://registry.npmjs.org/strip-ansi/-/strip-ansi-6.0.1.tgz"
-  integrity sha512-Y38VPSHcqkFrCpFnQ9vuSXmquuv5oXOKpGeT6aGrr3o3Gc9AlVa6JBfUSOCnbxGGZF+/0ooI7KrPuUSztUdU5A==
-  dependencies:
-    ansi-regex "^5.0.1"
-
-strip-ansi@6.0.1, strip-ansi@^6.0.0, strip-ansi@^6.0.1:
+"strip-ansi-cjs@npm:strip-ansi@^6.0.1", strip-ansi@6.0.1, strip-ansi@^6.0.0, strip-ansi@^6.0.1:
   version "6.0.1"
   resolved "https://registry.npmjs.org/strip-ansi/-/strip-ansi-6.0.1.tgz"
   integrity sha512-Y38VPSHcqkFrCpFnQ9vuSXmquuv5oXOKpGeT6aGrr3o3Gc9AlVa6JBfUSOCnbxGGZF+/0ooI7KrPuUSztUdU5A==
@@ -8553,7 +8871,7 @@ tsconfig-paths@^3.15.0:
     minimist "^1.2.6"
     strip-bom "^3.0.0"
 
-tslib@^2.0.0, tslib@^2.0.1, tslib@^2.0.3, tslib@^2.1.0, tslib@^2.6.2:
+tslib@^2.0.0, tslib@^2.0.1, tslib@^2.0.3, tslib@^2.1.0, tslib@^2.6.2, tslib@^2.7.0:
   version "2.8.1"
   resolved "https://registry.npmjs.org/tslib/-/tslib-2.8.1.tgz"
   integrity sha512-oJFu94HQb+KVduSUQL7wnpmqnfmLsOA/nAh6b6EH0wCEoK0/mPeXU6c3wKDV83MkOuHPRHtSXKKU99IBazS/2w==
@@ -8973,7 +9291,7 @@ workerpool@^6.5.1:
   resolved "https://registry.npmjs.org/workerpool/-/workerpool-6.5.1.tgz"
   integrity sha512-Fs4dNYcsdpYSAfVxhnl1L5zTksjvOJxtC5hzMNl+1t9B8hTJTdKDyZ5ju7ztgPy+ft9tBFXoOlDNiOT9WUXZlA==
 
-"wrap-ansi-cjs@npm:wrap-ansi@^7.0.0":
+"wrap-ansi-cjs@npm:wrap-ansi@^7.0.0", wrap-ansi@^7.0.0:
   version "7.0.0"
   resolved "https://registry.npmjs.org/wrap-ansi/-/wrap-ansi-7.0.0.tgz"
   integrity sha512-YVGIj2kamLSTxw6NsZjoBxfSwsn0ycdesmc4p+Q21c5zPuZ1pl+NfxVdxPtdHvmNVOQ6XSYG4AUtyt/Fi7D16Q==
@@ -8991,15 +9309,6 @@ wrap-ansi@^6.2.0:
     string-width "^4.1.0"
     strip-ansi "^6.0.0"
 
-wrap-ansi@^7.0.0:
-  version "7.0.0"
-  resolved "https://registry.npmjs.org/wrap-ansi/-/wrap-ansi-7.0.0.tgz"
-  integrity sha512-YVGIj2kamLSTxw6NsZjoBxfSwsn0ycdesmc4p+Q21c5zPuZ1pl+NfxVdxPtdHvmNVOQ6XSYG4AUtyt/Fi7D16Q==
-  dependencies:
-    ansi-styles "^4.0.0"
-    string-width "^4.1.0"
-    strip-ansi "^6.0.0"
-
 wrap-ansi@^8.1.0:
   version "8.1.0"
   resolved "https://registry.npmjs.org/wrap-ansi/-/wrap-ansi-8.1.0.tgz"

From e15d1e8d3143b2c91ffa6b130fcb2f398e445d62 Mon Sep 17 00:00:00 2001
From: Mike Donnalley <mdonnalley@salesforce.com>
Date: Tue, 10 Jun 2025 14:39:01 -0600
Subject: [PATCH 02/51] fix: ensure all events have same props

---
 src/telemetry.ts | 50 ++++++++++++++++--------------------------------
 1 file changed, 16 insertions(+), 34 deletions(-)

diff --git a/src/telemetry.ts b/src/telemetry.ts
index 1abdf625..cbfb44fc 100644
--- a/src/telemetry.ts
+++ b/src/telemetry.ts
@@ -90,11 +90,24 @@ export class Telemetry {
     this.cliId = getCliId(config.cacheDir);
   }
 
-  public sendEvent(eventName: string, attributes: Attributes): void {
+  public sendEvent(eventName: string, attributes?: Attributes): void {
     this.reporter?.sendTelemetryEvent(eventName, {
       ...attributes,
+      // Identifiers
       sessionId: this.sessionId,
       cliId: this.cliId,
+      // System information
+      version: this.config.version,
+      platform: this.config.platform,
+      arch: this.config.arch,
+      nodeVersion: process.version,
+      nodeEnv: process.env.NODE_ENV,
+      shell: this.config.shell,
+      origin: this.config.userAgent,
+      // Timestamps
+      date: new Date().toUTCString(),
+      timestamp: String(Date.now()),
+      processUptime: process.uptime() * 1000,
     });
   }
 
@@ -111,23 +124,8 @@ export class Telemetry {
 
     this.reporter.start();
 
-    this.reporter.sendTelemetryEvent('MCP_SERVER_STARTED', {
+    this.sendEvent('MCP_SERVER_STARTED', {
       ...attributes,
-      // Identifiers
-      sessionId: this.sessionId,
-      cliId: this.cliId,
-      // System information
-      version: this.config.version,
-      platform: this.config.platform,
-      arch: this.config.arch,
-      nodeVersion: process.version,
-      nodeEnv: process.env.NODE_ENV,
-      shell: this.config.shell,
-      origin: this.config.userAgent,
-      // Timestamps
-      date: new Date().toUTCString(),
-      timestamp: String(Date.now()),
-      processUptime: process.uptime() * 1000,
     });
   }
 
@@ -135,23 +133,7 @@ export class Telemetry {
     if (!this.started) return;
     this.started = false;
 
-    this.reporter?.sendTelemetryEvent('MCP_SERVER_STOPPED', {
-      // Identifiers
-      sessionId: this.sessionId,
-      cliId: this.cliId,
-      // System information
-      version: this.config.version,
-      platform: this.config.platform,
-      arch: this.config.arch,
-      nodeVersion: process.version,
-      nodeEnv: process.env.NODE_ENV,
-      shell: this.config.shell,
-      origin: this.config.userAgent,
-      // Timestamps
-      date: new Date().toUTCString(),
-      timestamp: String(Date.now()),
-      processUptime: process.uptime() * 1000,
-    });
+    this.sendEvent('MCP_SERVER_STOPPED');
 
     this.reporter?.stop();
   }

From 5ba0e32534a6d3d6de8bd8fb371cfc4aa3a4919d Mon Sep 17 00:00:00 2001
From: Mike Donnalley <mdonnalley@salesforce.com>
Date: Tue, 10 Jun 2025 15:28:22 -0600
Subject: [PATCH 03/51] chore: clean up

---
 src/sf-mcp-server.ts | 3 +++
 src/telemetry.ts     | 9 ---------
 2 files changed, 3 insertions(+), 9 deletions(-)

diff --git a/src/sf-mcp-server.ts b/src/sf-mcp-server.ts
index 6c29379c..beb5527e 100644
--- a/src/sf-mcp-server.ts
+++ b/src/sf-mcp-server.ts
@@ -30,6 +30,9 @@ import { Telemetry } from './telemetry.js';
 /**
  * A server implementation that extends the base MCP server with telemetry capabilities.
  *
+ * The method overloads for `tool` are taken directly from the source code for the original McpServer. They're
+ * copied here so that the types don't get lost.
+ *
  * @extends {McpServer}
  */
 export class SfMcpServer extends McpServer {
diff --git a/src/telemetry.ts b/src/telemetry.ts
index cbfb44fc..7805f5e9 100644
--- a/src/telemetry.ts
+++ b/src/telemetry.ts
@@ -14,15 +14,6 @@
  * limitations under the License.
  */
 
-// acknowledge telemetry unless the user has explicitly disabled it
-// create a session id that is sent with every event
-// use the @salesforce/telemetry package to send all events
-// find the user id stored at /Users/<username>/Library/Caches/sf/CLIID.txt
-//   this path is configurable by the user and differs by OS so we need to make a best guess at where it is and then default to a new one
-//   if the file doesn't exist.
-//   a best guess might be to access this.config.cacheDir and replace 'sf' with 'sf-mcp-server'. That will get use the OS specific paths
-//   but it won't work if the user has a different cache directory set via env var.
-
 import { randomBytes } from 'node:crypto';
 import { readFileSync } from 'node:fs';
 import { join } from 'node:path';

From 4df91ce80f5ed44bcb0b54fe350813bbb3dad7e3 Mon Sep 17 00:00:00 2001
From: Mike Donnalley <mdonnalley@salesforce.com>
Date: Wed, 11 Jun 2025 09:27:09 -0600
Subject: [PATCH 04/51] feat: add runtimeMs

---
 src/sf-mcp-server.ts               | 9 ++++++++-
 src/tools/orgs/sf-list-all-orgs.ts | 4 ----
 2 files changed, 8 insertions(+), 5 deletions(-)

diff --git a/src/sf-mcp-server.ts b/src/sf-mcp-server.ts
index beb5527e..781b92de 100644
--- a/src/sf-mcp-server.ts
+++ b/src/sf-mcp-server.ts
@@ -116,15 +116,22 @@ export class SfMcpServer extends McpServer {
     const cb = rest[rest.length - 1] as ToolCallback;
 
     const wrappedCb = async (args: RequestHandlerExtra<ServerRequest, ServerNotification>): Promise<CallToolResult> => {
+      const startTime = Date.now();
+      const result = await cb(args);
+      const runtimeMs = Date.now() - startTime;
+
       this.telemetry?.sendEvent('MCP_SERVER_TOOL_CALLED', {
         name,
+        runtimeMs,
       });
-      const result = await cb(args);
+
       if (result.isError) {
         this.telemetry?.sendEvent('MCP_SERVER_TOOL_ERROR', {
           name,
+          runtimeMs,
         });
       }
+
       return result;
     };
 
diff --git a/src/tools/orgs/sf-list-all-orgs.ts b/src/tools/orgs/sf-list-all-orgs.ts
index a165e715..b2738bd1 100644
--- a/src/tools/orgs/sf-list-all-orgs.ts
+++ b/src/tools/orgs/sf-list-all-orgs.ts
@@ -40,8 +40,6 @@ export const listAllOrgsParamsSchema = z.object({
 export type ListAllOrgsOptions = z.infer<typeof listAllOrgsParamsSchema>;
 
 export const registerToolListAllOrgs = (server: McpServer): void => {
-  // eslint-disable-next-line no-console
-  console.error('registerToolListAllOrgs');
   server.tool(
     'sf-list-all-orgs',
     `Lists all configured Salesforce orgs.
@@ -56,8 +54,6 @@ List all orgs
 `,
     listAllOrgsParamsSchema.shape,
     async ({ directory }) => {
-      // eslint-disable-next-line no-console
-      console.error('listAllOrgs', directory);
       try {
         process.chdir(directory);
         const orgs = await getAllAllowedOrgs();

From e5c062aa8b4f7171aef722d67c5b8f09470ccdb2 Mon Sep 17 00:00:00 2001
From: Mike Donnalley <mdonnalley@salesforce.com>
Date: Wed, 11 Jun 2025 11:13:57 -0600
Subject: [PATCH 05/51] fix: handle failed connection to appinsights

---
 src/telemetry.ts | 24 ++++++++++++++----------
 1 file changed, 14 insertions(+), 10 deletions(-)

diff --git a/src/telemetry.ts b/src/telemetry.ts
index 7805f5e9..314d6633 100644
--- a/src/telemetry.ts
+++ b/src/telemetry.ts
@@ -106,18 +106,22 @@ export class Telemetry {
     if (this.started) return;
     this.started = true;
 
-    this.reporter = await McpTelemetryReporter.create({
-      project: PROJECT,
-      key: APP_INSIGHTS_KEY,
-      userId: this.cliId,
-      waitForConnection: true,
-    });
+    try {
+      this.reporter = await McpTelemetryReporter.create({
+        project: PROJECT,
+        key: APP_INSIGHTS_KEY,
+        userId: this.cliId,
+        waitForConnection: true,
+      });
 
-    this.reporter.start();
+      this.reporter.start();
 
-    this.sendEvent('MCP_SERVER_STARTED', {
-      ...attributes,
-    });
+      this.sendEvent('MCP_SERVER_STARTED', {
+        ...attributes,
+      });
+    } catch {
+      // connection probably failed, but we can continue without telemetry
+    }
   }
 
   public stop(): void {

From 8e393fa0c9029ffb4f05bcdbe5cd69c109f73cdb Mon Sep 17 00:00:00 2001
From: Mike Donnalley <mdonnalley@salesforce.com>
Date: Thu, 12 Jun 2025 09:59:29 -0600
Subject: [PATCH 06/51] feat: logging

---
 bin/run.js           | 6 ++++++
 src/index.ts         | 8 ++++++++
 src/sf-mcp-server.ts | 6 ++++++
 3 files changed, 20 insertions(+)

diff --git a/bin/run.js b/bin/run.js
index 176d2af5..17a05a34 100755
--- a/bin/run.js
+++ b/bin/run.js
@@ -1,5 +1,11 @@
 #!/usr/bin/env node
 
+if (process.argv.includes('--debug')) {
+  process.env.DEBUG = 'sf*';
+  process.env.SF_LOG_COLORIZE = 'false';
+  process.env.SF_LOG_STDERR = 'true';
+}
+
 import { execute } from '@oclif/core';
 
 await execute({ dir: import.meta.url });
diff --git a/src/index.ts b/src/index.ts
index fac5769c..e69b4f49 100644
--- a/src/index.ts
+++ b/src/index.ts
@@ -99,6 +99,9 @@ You can also use special values to control access to orgs:
     'no-telemetry': Flags.boolean({
       summary: 'Disable telemetry',
     }),
+    debug: Flags.boolean({
+      summary: 'Enable debug logging',
+    }),
   };
 
   public static examples = [
@@ -155,6 +158,7 @@ You can also use special values to control access to orgs:
     // ************************
     // CORE TOOLS (always on)
     // ************************
+    this.logToStderr('Registering core tools');
     // get username
     core.registerToolGetUsername(server);
 
@@ -162,6 +166,7 @@ You can also use special values to control access to orgs:
     // ORG TOOLS
     // ************************
     if (all || enabledToolsets.has('orgs')) {
+      this.logToStderr('Registering org tools');
       // list all orgs
       orgs.registerToolListAllOrgs(server);
     }
@@ -170,6 +175,7 @@ You can also use special values to control access to orgs:
     // DATA TOOLS
     // ************************
     if (all || enabledToolsets.has('data')) {
+      this.logToStderr('Registering data tools');
       // query org
       data.registerToolQueryOrg(server);
     }
@@ -178,6 +184,7 @@ You can also use special values to control access to orgs:
     // USER TOOLS
     // ************************
     if (all || enabledToolsets.has('users')) {
+      this.logToStderr('Registering user tools');
       // assign permission set
       users.registerToolAssignPermissionSet(server);
     }
@@ -186,6 +193,7 @@ You can also use special values to control access to orgs:
     // METADATA TOOLS
     // ************************
     if (all || enabledToolsets.has('metadata')) {
+      this.logToStderr('Registering metadata tools');
       // deploy metadata
       metadata.registerToolDeployMetadata(server);
       // retrieve metadata
diff --git a/src/sf-mcp-server.ts b/src/sf-mcp-server.ts
index 781b92de..538b3979 100644
--- a/src/sf-mcp-server.ts
+++ b/src/sf-mcp-server.ts
@@ -25,6 +25,7 @@ import {
 import { ServerOptions } from '@modelcontextprotocol/sdk/server/index.js';
 import { RequestHandlerExtra } from '@modelcontextprotocol/sdk/shared/protocol.js';
 import { ZodRawShape } from 'zod';
+import { Logger } from '@salesforce/core';
 import { Telemetry } from './telemetry.js';
 
 /**
@@ -36,6 +37,7 @@ import { Telemetry } from './telemetry.js';
  * @extends {McpServer}
  */
 export class SfMcpServer extends McpServer {
+  private logger = Logger.childFromRoot('mcp-server');
   /** Optional telemetry instance for tracking server events */
   private telemetry?: Telemetry;
 
@@ -116,16 +118,20 @@ export class SfMcpServer extends McpServer {
     const cb = rest[rest.length - 1] as ToolCallback;
 
     const wrappedCb = async (args: RequestHandlerExtra<ServerRequest, ServerNotification>): Promise<CallToolResult> => {
+      this.logger.debug(`Tool ${name} called`);
       const startTime = Date.now();
       const result = await cb(args);
       const runtimeMs = Date.now() - startTime;
 
+      this.logger.debug(`Tool ${name} completed in ${runtimeMs}ms`);
+
       this.telemetry?.sendEvent('MCP_SERVER_TOOL_CALLED', {
         name,
         runtimeMs,
       });
 
       if (result.isError) {
+        this.logger.debug(`Tool ${name} errored`);
         this.telemetry?.sendEvent('MCP_SERVER_TOOL_ERROR', {
           name,
           runtimeMs,

From fe2d52b40eaf5e11ca516d59fa84d0dc9afb7b2f Mon Sep 17 00:00:00 2001
From: Cristian Dominguez <cdominguez@salesforce.com>
Date: Thu, 12 Jun 2025 11:13:42 -0300
Subject: [PATCH 07/51] chore: extract method signatures

---
 src/sf-mcp-server.ts | 80 +++++---------------------------------------
 1 file changed, 8 insertions(+), 72 deletions(-)

diff --git a/src/sf-mcp-server.ts b/src/sf-mcp-server.ts
index 781b92de..dbe832fb 100644
--- a/src/sf-mcp-server.ts
+++ b/src/sf-mcp-server.ts
@@ -15,18 +15,15 @@
  */
 
 import { McpServer, RegisteredTool, ToolCallback } from '@modelcontextprotocol/sdk/server/mcp.js';
-import {
-  CallToolResult,
-  Implementation,
-  ServerNotification,
-  ServerRequest,
-  ToolAnnotations,
-} from '@modelcontextprotocol/sdk/types.js';
+import { CallToolResult, Implementation, ServerNotification, ServerRequest } from '@modelcontextprotocol/sdk/types.js';
 import { ServerOptions } from '@modelcontextprotocol/sdk/server/index.js';
 import { RequestHandlerExtra } from '@modelcontextprotocol/sdk/shared/protocol.js';
-import { ZodRawShape } from 'zod';
 import { Telemetry } from './telemetry.js';
 
+type ToolMethodSignatures = {
+  tool: McpServer['tool'];
+};
+
 /**
  * A server implementation that extends the base MCP server with telemetry capabilities.
  *
@@ -35,7 +32,7 @@ import { Telemetry } from './telemetry.js';
  *
  * @extends {McpServer}
  */
-export class SfMcpServer extends McpServer {
+export class SfMcpServer extends McpServer implements ToolMethodSignatures {
   /** Optional telemetry instance for tracking server events */
   private telemetry?: Telemetry;
 
@@ -50,68 +47,7 @@ export class SfMcpServer extends McpServer {
     this.telemetry = options?.telemetry;
   }
 
-  /**
-   * Registers a zero-argument tool `name`, which will run the given function when the client calls it.
-   */
-  public tool(name: string, cb: ToolCallback): RegisteredTool;
-  /**
-   * Registers a zero-argument tool `name` (with a description) which will run the given function when the client calls it.
-   */
-  public tool(name: string, description: string, cb: ToolCallback): RegisteredTool;
-  /**
-   * Registers a tool taking either a parameter schema for validation or annotations for additional metadata.
-   * This unified overload handles both `tool(name, paramsSchema, cb)` and `tool(name, annotations, cb)` cases.
-   *
-   * Note: We use a union type for the second parameter because TypeScript cannot reliably disambiguate
-   * between ToolAnnotations and ZodRawShape during overload resolution, as both are plain object types.
-   */
-  public tool<Args extends ZodRawShape>(
-    name: string,
-    paramsSchemaOrAnnotations: Args | ToolAnnotations,
-    cb: ToolCallback<Args>
-  ): RegisteredTool;
-  /**
-   * Registers a tool `name` (with a description) taking either parameter schema or annotations.
-   * This unified overload handles both `tool(name, description, paramsSchema, cb)` and
-   * `tool(name, description, annotations, cb)` cases.
-   *
-   * Note: We use a union type for the third parameter because TypeScript cannot reliably disambiguate
-   * between ToolAnnotations and ZodRawShape during overload resolution, as both are plain object types.
-   */
-  public tool<Args extends ZodRawShape>(
-    name: string,
-    description: string,
-    paramsSchemaOrAnnotations: Args | ToolAnnotations,
-    cb: ToolCallback<Args>
-  ): RegisteredTool;
-  /**
-   * Registers a tool with both parameter schema and annotations.
-   */
-  public tool<Args extends ZodRawShape>(
-    name: string,
-    paramsSchema: Args,
-    annotations: ToolAnnotations,
-    cb: ToolCallback<Args>
-  ): RegisteredTool;
-  /**
-   * Registers a tool with description, parameter schema, and annotations.
-   */
-  public tool<Args extends ZodRawShape>(
-    name: string,
-    description: string,
-    paramsSchema: Args,
-    annotations: ToolAnnotations,
-    cb: ToolCallback<Args>
-  ): RegisteredTool;
-
-  /**
-   * Registers a tool with the server and wraps its callback with telemetry tracking
-   *
-   * @param {string} name - The name of the tool to register
-   * @param {...unknown[]} rest - Additional arguments for tool registration, with the last argument being the callback
-   * @returns {RegisteredTool} The registered tool instance
-   */
-  public tool(name: string, ...rest: unknown[]): RegisteredTool {
+  public tool: McpServer['tool'] = (name: string, ...rest: unknown[]): RegisteredTool => {
     // Given the signature of the tool function, the last argument is always the callback
     const cb = rest[rest.length - 1] as ToolCallback;
 
@@ -137,5 +73,5 @@ export class SfMcpServer extends McpServer {
 
     // @ts-expect-error because we no longer know what the type of rest is
     return super.tool(name, ...rest.slice(0, -1), wrappedCb);
-  }
+  };
 }

From 5e85f75fdba6954d4cabe12b722f4998e0deb7aa Mon Sep 17 00:00:00 2001
From: Mike Donnalley <mdonnalley@salesforce.com>
Date: Thu, 12 Jun 2025 10:45:15 -0600
Subject: [PATCH 08/51] chore: code review

---
 src/index.ts         | 21 ++++++++++++++-----
 src/sf-mcp-server.ts |  4 ++--
 src/telemetry.ts     | 49 +++++++++++++++++++++-----------------------
 3 files changed, 41 insertions(+), 33 deletions(-)

diff --git a/src/index.ts b/src/index.ts
index fac5769c..050f1b6c 100644
--- a/src/index.ts
+++ b/src/index.ts
@@ -120,14 +120,13 @@ You can also use special values to control access to orgs:
 
   public async run(): Promise<void> {
     const { flags } = await this.parse(McpServerCommand);
+
     if (!flags['no-telemetry']) {
       this.telemetry = new Telemetry(this.config);
-      await this.telemetry.start({
-        toolsets: flags.toolsets.join(', '),
-        orgs: sanitizeOrgInput(flags.orgs),
-      });
+      await this.telemetry.start();
 
-      process.stdin.on('close', () => {
+      process.stdin.on('close', (err) => {
+        this.telemetry?.sendEvent(err ? 'SERVER_STOPPED_ERROR' : 'SERVER_STOPPED_SUCCESS');
         this.telemetry?.stop();
       });
     }
@@ -195,5 +194,17 @@ You can also use special values to control access to orgs:
     const transport = new StdioServerTransport();
     await server.connect(transport);
     console.error(`✅ Salesforce MCP Server v${this.config.version} running on stdio`);
+    this.telemetry?.sendEvent('SERVER_START_SUCCESS', {
+      toolsets: flags.toolsets.join(', '),
+      orgs: sanitizeOrgInput(flags.orgs),
+    });
+  }
+
+  protected async catch(error: Error): Promise<void> {
+    this.telemetry?.sendEvent('SERVER_START_ERROR', {
+      error: error.message,
+      stack: error.stack,
+    });
+    await super.catch(error);
   }
 }
diff --git a/src/sf-mcp-server.ts b/src/sf-mcp-server.ts
index dbe832fb..ffc295c5 100644
--- a/src/sf-mcp-server.ts
+++ b/src/sf-mcp-server.ts
@@ -56,13 +56,13 @@ export class SfMcpServer extends McpServer implements ToolMethodSignatures {
       const result = await cb(args);
       const runtimeMs = Date.now() - startTime;
 
-      this.telemetry?.sendEvent('MCP_SERVER_TOOL_CALLED', {
+      this.telemetry?.sendEvent('TOOL_CALLED', {
         name,
         runtimeMs,
       });
 
       if (result.isError) {
-        this.telemetry?.sendEvent('MCP_SERVER_TOOL_ERROR', {
+        this.telemetry?.sendEvent('TOOL_ERROR', {
           name,
           runtimeMs,
         });
diff --git a/src/telemetry.ts b/src/telemetry.ts
index 314d6633..b4a3604c 100644
--- a/src/telemetry.ts
+++ b/src/telemetry.ts
@@ -82,27 +82,31 @@ export class Telemetry {
   }
 
   public sendEvent(eventName: string, attributes?: Attributes): void {
-    this.reporter?.sendTelemetryEvent(eventName, {
-      ...attributes,
-      // Identifiers
-      sessionId: this.sessionId,
-      cliId: this.cliId,
-      // System information
-      version: this.config.version,
-      platform: this.config.platform,
-      arch: this.config.arch,
-      nodeVersion: process.version,
-      nodeEnv: process.env.NODE_ENV,
-      shell: this.config.shell,
-      origin: this.config.userAgent,
-      // Timestamps
-      date: new Date().toUTCString(),
-      timestamp: String(Date.now()),
-      processUptime: process.uptime() * 1000,
-    });
+    try {
+      this.reporter?.sendTelemetryEvent(eventName, {
+        ...attributes,
+        // Identifiers
+        sessionId: this.sessionId,
+        cliId: this.cliId,
+        // System information
+        version: this.config.version,
+        platform: this.config.platform,
+        arch: this.config.arch,
+        nodeVersion: process.version,
+        nodeEnv: process.env.NODE_ENV,
+        shell: this.config.shell,
+        origin: this.config.userAgent,
+        // Timestamps
+        date: new Date().toUTCString(),
+        timestamp: String(Date.now()),
+        processUptime: process.uptime() * 1000,
+      });
+    } catch {
+      /* empty */
+    }
   }
 
-  public async start(attributes: Attributes): Promise<void> {
+  public async start(): Promise<void> {
     if (this.started) return;
     this.started = true;
 
@@ -115,10 +119,6 @@ export class Telemetry {
       });
 
       this.reporter.start();
-
-      this.sendEvent('MCP_SERVER_STARTED', {
-        ...attributes,
-      });
     } catch {
       // connection probably failed, but we can continue without telemetry
     }
@@ -127,9 +127,6 @@ export class Telemetry {
   public stop(): void {
     if (!this.started) return;
     this.started = false;
-
-    this.sendEvent('MCP_SERVER_STOPPED');
-
     this.reporter?.stop();
   }
 }

From c6d7f79f4433db271c8eb00794a3857840e76f20 Mon Sep 17 00:00:00 2001
From: Mike Donnalley <mdonnalley@salesforce.com>
Date: Thu, 12 Jun 2025 13:23:17 -0600
Subject: [PATCH 09/51] feat: add client info to telemetry events

---
 src/index.ts         | 12 ++++++------
 src/sf-mcp-server.ts | 28 ++++++++++++++++++++++++++++
 src/telemetry.ts     |  8 ++++++--
 3 files changed, 40 insertions(+), 8 deletions(-)

diff --git a/src/index.ts b/src/index.ts
index 050f1b6c..1f89a309 100644
--- a/src/index.ts
+++ b/src/index.ts
@@ -122,7 +122,11 @@ You can also use special values to control access to orgs:
     const { flags } = await this.parse(McpServerCommand);
 
     if (!flags['no-telemetry']) {
-      this.telemetry = new Telemetry(this.config);
+      this.telemetry = new Telemetry(this.config, {
+        toolsets: flags.toolsets.join(', '),
+        orgs: sanitizeOrgInput(flags.orgs),
+      });
+
       await this.telemetry.start();
 
       process.stdin.on('close', (err) => {
@@ -194,14 +198,10 @@ You can also use special values to control access to orgs:
     const transport = new StdioServerTransport();
     await server.connect(transport);
     console.error(`✅ Salesforce MCP Server v${this.config.version} running on stdio`);
-    this.telemetry?.sendEvent('SERVER_START_SUCCESS', {
-      toolsets: flags.toolsets.join(', '),
-      orgs: sanitizeOrgInput(flags.orgs),
-    });
   }
 
   protected async catch(error: Error): Promise<void> {
-    this.telemetry?.sendEvent('SERVER_START_ERROR', {
+    this.telemetry?.sendEvent('START_ERROR', {
       error: error.message,
       stack: error.stack,
     });
diff --git a/src/sf-mcp-server.ts b/src/sf-mcp-server.ts
index ffc295c5..05786567 100644
--- a/src/sf-mcp-server.ts
+++ b/src/sf-mcp-server.ts
@@ -18,10 +18,12 @@ import { McpServer, RegisteredTool, ToolCallback } from '@modelcontextprotocol/s
 import { CallToolResult, Implementation, ServerNotification, ServerRequest } from '@modelcontextprotocol/sdk/types.js';
 import { ServerOptions } from '@modelcontextprotocol/sdk/server/index.js';
 import { RequestHandlerExtra } from '@modelcontextprotocol/sdk/shared/protocol.js';
+import { Transport } from '@modelcontextprotocol/sdk/shared/transport.js';
 import { Telemetry } from './telemetry.js';
 
 type ToolMethodSignatures = {
   tool: McpServer['tool'];
+  connect: McpServer['connect'];
 };
 
 /**
@@ -45,8 +47,34 @@ export class SfMcpServer extends McpServer implements ToolMethodSignatures {
   public constructor(serverInfo: Implementation, options?: ServerOptions & { telemetry?: Telemetry }) {
     super(serverInfo, options);
     this.telemetry = options?.telemetry;
+    this.server.oninitialized = (): void => {
+      const clientInfo = this.server.getClientVersion();
+      if (clientInfo) {
+        this.telemetry?.addAttributes({
+          clientName: clientInfo.name,
+          clientVersion: clientInfo.version,
+        });
+      }
+      this.telemetry?.sendEvent('SERVER_START_SUCCESS');
+    };
   }
 
+  public connect: McpServer['connect'] = async (transport: Transport): Promise<void> => {
+    try {
+      await super.connect(transport);
+      if (!this.isConnected()) {
+        this.telemetry?.sendEvent('SERVER_START_ERROR', {
+          error: 'Server not connected',
+        });
+      }
+    } catch (error: unknown) {
+      this.telemetry?.sendEvent('SERVER_START_ERROR', {
+        error: error instanceof Error ? error.message : 'Unknown error',
+        stack: error instanceof Error ? error.stack : undefined,
+      });
+    }
+  };
+
   public tool: McpServer['tool'] = (name: string, ...rest: unknown[]): RegisteredTool => {
     // Given the signature of the tool function, the last argument is always the callback
     const cb = rest[rest.length - 1] as ToolCallback;
diff --git a/src/telemetry.ts b/src/telemetry.ts
index b4a3604c..2a2461a0 100644
--- a/src/telemetry.ts
+++ b/src/telemetry.ts
@@ -73,7 +73,7 @@ export class Telemetry {
   private started = false;
   private reporter?: McpTelemetryReporter;
 
-  public constructor(private readonly config: Config) {
+  public constructor(private readonly config: Config, private attributes: Attributes) {
     warn(
       'You acknowledge and agree that the MCP server may collect usage information, user environment, and crash reports for the purposes of providing services or functions that are relevant to use of the MCP server and product improvements.'
     );
@@ -81,9 +81,14 @@ export class Telemetry {
     this.cliId = getCliId(config.cacheDir);
   }
 
+  public addAttributes(attributes: Attributes): void {
+    this.attributes = { ...this.attributes, ...attributes };
+  }
+
   public sendEvent(eventName: string, attributes?: Attributes): void {
     try {
       this.reporter?.sendTelemetryEvent(eventName, {
+        ...this.attributes,
         ...attributes,
         // Identifiers
         sessionId: this.sessionId,
@@ -94,7 +99,6 @@ export class Telemetry {
         arch: this.config.arch,
         nodeVersion: process.version,
         nodeEnv: process.env.NODE_ENV,
-        shell: this.config.shell,
         origin: this.config.userAgent,
         // Timestamps
         date: new Date().toUTCString(),

From 7eda5d1a70860846d9f05cadcf5453a6c6dd7f45 Mon Sep 17 00:00:00 2001
From: Mike Donnalley <mdonnalley@salesforce.com>
Date: Thu, 12 Jun 2025 14:44:23 -0600
Subject: [PATCH 10/51] fix: init telemetry in catch

---
 src/index.ts     | 6 ++++++
 src/telemetry.ts | 2 +-
 2 files changed, 7 insertions(+), 1 deletion(-)

diff --git a/src/index.ts b/src/index.ts
index 1f89a309..becfdbfb 100644
--- a/src/index.ts
+++ b/src/index.ts
@@ -201,10 +201,16 @@ You can also use special values to control access to orgs:
   }
 
   protected async catch(error: Error): Promise<void> {
+    if (!this.telemetry) {
+      this.telemetry = new Telemetry(this.config);
+      await this.telemetry.start();
+    }
+
     this.telemetry?.sendEvent('START_ERROR', {
       error: error.message,
       stack: error.stack,
     });
+
     await super.catch(error);
   }
 }
diff --git a/src/telemetry.ts b/src/telemetry.ts
index 2a2461a0..7b5b538c 100644
--- a/src/telemetry.ts
+++ b/src/telemetry.ts
@@ -73,7 +73,7 @@ export class Telemetry {
   private started = false;
   private reporter?: McpTelemetryReporter;
 
-  public constructor(private readonly config: Config, private attributes: Attributes) {
+  public constructor(private readonly config: Config, private attributes: Attributes = {}) {
     warn(
       'You acknowledge and agree that the MCP server may collect usage information, user environment, and crash reports for the purposes of providing services or functions that are relevant to use of the MCP server and product improvements.'
     );

From 05d188ba80a4274d9c6119cf9af5d4ed897b46b1 Mon Sep 17 00:00:00 2001
From: Mike Donnalley <mdonnalley@salesforce.com>
Date: Thu, 12 Jun 2025 15:24:28 -0600
Subject: [PATCH 11/51] fix: consolidate TOOL_CALLED and TOOL_ERROR

---
 src/sf-mcp-server.ts | 8 +-------
 1 file changed, 1 insertion(+), 7 deletions(-)

diff --git a/src/sf-mcp-server.ts b/src/sf-mcp-server.ts
index 05786567..43100195 100644
--- a/src/sf-mcp-server.ts
+++ b/src/sf-mcp-server.ts
@@ -87,15 +87,9 @@ export class SfMcpServer extends McpServer implements ToolMethodSignatures {
       this.telemetry?.sendEvent('TOOL_CALLED', {
         name,
         runtimeMs,
+        isError: result.isError,
       });
 
-      if (result.isError) {
-        this.telemetry?.sendEvent('TOOL_ERROR', {
-          name,
-          runtimeMs,
-        });
-      }
-
       return result;
     };
 

From 024fa770470bd4fddec3c1362304d88ad28f1841 Mon Sep 17 00:00:00 2001
From: Mike Donnalley <mdonnalley@salesforce.com>
Date: Thu, 12 Jun 2025 15:23:51 -0600
Subject: [PATCH 12/51] feat: count tokens of each tool

---
 package.json         |  3 +-
 src/index.ts         |  4 ++-
 src/sf-mcp-server.ts | 76 ++++++++++++++++++++++++++++++++++++++++++--
 yarn.lock            |  5 +++
 4 files changed, 84 insertions(+), 4 deletions(-)

diff --git a/package.json b/package.json
index 08aafed5..7d1d6e58 100644
--- a/package.json
+++ b/package.json
@@ -48,7 +48,8 @@
     "@salesforce/source-tracking": "^7.4.1",
     "@salesforce/telemetry": "^6.0.39",
     "@salesforce/ts-types": "^2.0.11",
-    "zod": "^3.25.42"
+    "zod": "^3.25.42",
+    "zod-to-json-schema": "^3.24.5"
   },
   "devDependencies": {
     "@modelcontextprotocol/inspector": "^0.14.0",
diff --git a/src/index.ts b/src/index.ts
index 7994264e..f14794a4 100644
--- a/src/index.ts
+++ b/src/index.ts
@@ -149,7 +149,9 @@ You can also use special values to control access to orgs:
           tools: {},
         },
       },
-      { telemetry: this.telemetry }
+      {
+        telemetry: this.telemetry,
+      }
     );
 
     // // TODO: Should we add annotations to our tools? https://modelcontextprotocol.io/docs/concepts/tools#tool-definition-structure
diff --git a/src/sf-mcp-server.ts b/src/sf-mcp-server.ts
index 3a45d294..b77f479d 100644
--- a/src/sf-mcp-server.ts
+++ b/src/sf-mcp-server.ts
@@ -15,11 +15,18 @@
  */
 
 import { McpServer, RegisteredTool, ToolCallback } from '@modelcontextprotocol/sdk/server/mcp.js';
-import { CallToolResult, Implementation, ServerNotification, ServerRequest } from '@modelcontextprotocol/sdk/types.js';
+import {
+  CallToolResult,
+  Implementation,
+  ServerNotification,
+  ServerRequest,
+  Tool,
+} from '@modelcontextprotocol/sdk/types.js';
 import { ServerOptions } from '@modelcontextprotocol/sdk/server/index.js';
 import { RequestHandlerExtra } from '@modelcontextprotocol/sdk/shared/protocol.js';
 import { Logger } from '@salesforce/core';
 import { Transport } from '@modelcontextprotocol/sdk/shared/transport.js';
+import { zodToJsonSchema } from 'zod-to-json-schema';
 import { Telemetry } from './telemetry.js';
 
 type ToolMethodSignatures = {
@@ -27,6 +34,39 @@ type ToolMethodSignatures = {
   connect: McpServer['connect'];
 };
 
+const EMPTY_OBJECT_JSON_SCHEMA = {
+  type: 'object' as const,
+};
+
+function countTokens(tool: Tool): number {
+  let totalTokens = 0; // running total; NOTE: this sums string lengths (characters), used as a proxy for tokens
+
+  // Length of the tool name
+  totalTokens += tool.name.length;
+
+  // Length of the description, when one is set
+  if (tool.description) {
+    totalTokens += tool.description.length;
+  }
+
+  // Length of the JSON-serialized input schema
+  if (tool.inputSchema) {
+    totalTokens += JSON.stringify(tool.inputSchema).length;
+  }
+
+  // Length of the JSON-serialized output schema, when present
+  if (tool.outputSchema) {
+    totalTokens += JSON.stringify(tool.outputSchema).length;
+  }
+
+  // Length of the JSON-serialized annotations, when present
+  if (tool.annotations) {
+    totalTokens += JSON.stringify(tool.annotations).length;
+  }
+
+  return totalTokens;
+}
+
 /**
  * A server implementation that extends the base MCP server with telemetry capabilities.
  *
@@ -37,6 +77,7 @@ type ToolMethodSignatures = {
  */
 export class SfMcpServer extends McpServer implements ToolMethodSignatures {
   private logger = Logger.childFromRoot('mcp-server');
+  private tokenCounts: Record<string, number> = {};
 
   /** Optional telemetry instance for tracking server events */
   private telemetry?: Telemetry;
@@ -59,6 +100,13 @@ export class SfMcpServer extends McpServer implements ToolMethodSignatures {
         });
       }
       this.telemetry?.sendEvent('SERVER_START_SUCCESS');
+      // eslint-disable-next-line no-console
+      console.error('Token counts', this.tokenCounts);
+      // eslint-disable-next-line no-console
+      console.error(
+        'Total tokens',
+        Object.values(this.tokenCounts).reduce((acc, count) => acc + count, 0)
+      );
     };
   }
 
@@ -101,6 +149,30 @@ export class SfMcpServer extends McpServer implements ToolMethodSignatures {
     };
 
     // @ts-expect-error because we no longer know what the type of rest is
-    return super.tool(name, ...rest.slice(0, -1), wrappedCb);
+    const tool = super.tool(name, ...rest.slice(0, -1), wrappedCb);
+
+    // Count the number of tokens for the tool definition
+    // Implementation copied from the typescript sdk:
+    // https://github.com/modelcontextprotocol/typescript-sdk/blob/dd69efa1de8646bb6b195ff8d5f52e13739f4550/src/server/mcp.ts#L110
+    const toolDefinition: Tool = {
+      name,
+      description: tool.description,
+      inputSchema: tool.inputSchema
+        ? (zodToJsonSchema(tool.inputSchema, {
+            strictUnions: true,
+          }) as Tool['inputSchema'])
+        : EMPTY_OBJECT_JSON_SCHEMA,
+      annotations: tool.annotations,
+    };
+
+    if (tool.outputSchema) {
+      toolDefinition.outputSchema = zodToJsonSchema(tool.outputSchema, {
+        strictUnions: true,
+      }) as Tool['outputSchema'];
+    }
+
+    this.tokenCounts[name] = countTokens(toolDefinition);
+
+    return tool;
   };
 }
diff --git a/yarn.lock b/yarn.lock
index 549ab2c7..cd6b09e7 100644
--- a/yarn.lock
+++ b/yarn.lock
@@ -9472,6 +9472,11 @@ zod-to-json-schema@^3.24.1:
   resolved "https://registry.npmjs.org/zod-to-json-schema/-/zod-to-json-schema-3.24.5.tgz"
   integrity sha512-/AuWwMP+YqiPbsJx5D6TfgRTc4kTLjsh5SOcd4bLsfUg2RcEXrFMJl1DGgdHy2aCfsIA/cr/1JM0xcB2GZji8g==
 
+zod-to-json-schema@^3.24.5:
+  version "3.24.5"
+  resolved "https://registry.yarnpkg.com/zod-to-json-schema/-/zod-to-json-schema-3.24.5.tgz#d1095440b147fb7c2093812a53c54df8d5df50a3"
+  integrity sha512-/AuWwMP+YqiPbsJx5D6TfgRTc4kTLjsh5SOcd4bLsfUg2RcEXrFMJl1DGgdHy2aCfsIA/cr/1JM0xcB2GZji8g==
+
 zod@^3.23.8, zod@^3.25.42:
   version "3.25.42"
   resolved "https://registry.npmjs.org/zod/-/zod-3.25.42.tgz"

From a909d69497bc8e5076da1fb7762b1e88fe1285f8 Mon Sep 17 00:00:00 2001
From: Mike Donnalley <mdonnalley@salesforce.com>
Date: Fri, 13 Jun 2025 11:20:14 -0600
Subject: [PATCH 13/51] chore: clean up token count logging

---
 src/sf-mcp-server.ts | 15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)

diff --git a/src/sf-mcp-server.ts b/src/sf-mcp-server.ts
index b77f479d..dbd2ef32 100644
--- a/src/sf-mcp-server.ts
+++ b/src/sf-mcp-server.ts
@@ -100,13 +100,16 @@ export class SfMcpServer extends McpServer implements ToolMethodSignatures {
         });
       }
       this.telemetry?.sendEvent('SERVER_START_SUCCESS');
-      // eslint-disable-next-line no-console
-      console.error('Token counts', this.tokenCounts);
-      // eslint-disable-next-line no-console
-      console.error(
-        'Total tokens',
-        Object.values(this.tokenCounts).reduce((acc, count) => acc + count, 0)
+
+      this.logger.debug(
+        `Total tokens: ${Object.values(this.tokenCounts)
+          .reduce((acc, count) => acc + count, 0)
+          .toString()}`
       );
+
+      for (const [name, count] of Object.entries(this.tokenCounts)) {
+        this.logger.debug(`${name}: ${count}`);
+      }
     };
   }
 

From c5bb91061ef5fd63e444f7ce7f8606b51f3e9b65 Mon Sep 17 00:00:00 2001
From: Cristian Dominguez <cdominguez@salesforce.com>
Date: Mon, 16 Jun 2025 09:56:17 -0300
Subject: [PATCH 14/51] chore: bump core

---
 package.json |   2 +-
 yarn.lock    | 147 ++++++++-------------------------------------------
 2 files changed, 23 insertions(+), 126 deletions(-)

diff --git a/package.json b/package.json
index 123fa33c..fc2c413e 100644
--- a/package.json
+++ b/package.json
@@ -42,7 +42,7 @@
     "@jsforce/jsforce-node": "^3.8.2",
     "@modelcontextprotocol/sdk": "^1.12.3",
     "@oclif/core": "^4.3.3",
-    "@salesforce/core": "^8.11.4",
+    "@salesforce/core": "^8.14.0",
     "@salesforce/kit": "^3.1.6",
     "@salesforce/source-deploy-retrieve": "^12.19.10",
     "@salesforce/source-tracking": "^7.4.1",
diff --git a/yarn.lock b/yarn.lock
index 2eea2e82..a3fbff8f 100644
--- a/yarn.lock
+++ b/yarn.lock
@@ -1474,55 +1474,7 @@
     "@nodelib/fs.scandir" "2.1.5"
     fastq "^1.6.0"
 
-"@oclif/core@^4", "@oclif/core@^4.3.2":
-  version "4.3.2"
-  resolved "https://registry.yarnpkg.com/@oclif/core/-/core-4.3.2.tgz#522d5f94044ebde5f4041b0bad4f1abfe7a73f61"
-  integrity sha512-3OVGENifC2NzYn/t31fYOrZOGJ5WpUfRktz8v/W4QbP4Su3S/BcBuVuIde65B1mHrnAE/62yOFA/kLx4w1Vf8g==
-  dependencies:
-    ansi-escapes "^4.3.2"
-    ansis "^3.17.0"
-    clean-stack "^3.0.1"
-    cli-spinners "^2.9.2"
-    debug "^4.4.0"
-    ejs "^3.1.10"
-    get-package-type "^0.1.0"
-    indent-string "^4.0.0"
-    is-wsl "^2.2.0"
-    lilconfig "^3.1.3"
-    minimatch "^9.0.5"
-    semver "^7.6.3"
-    string-width "^4.2.3"
-    supports-color "^8"
-    tinyglobby "^0.2.13"
-    widest-line "^3.1.0"
-    wordwrap "^1.0.0"
-    wrap-ansi "^7.0.0"
-
-"@oclif/core@^4.2.10":
-  version "4.3.0"
-  resolved "https://registry.npmjs.org/@oclif/core/-/core-4.3.0.tgz"
-  integrity sha512-lIzHY+JMP6evrS5E/sGijNnwrCoNtGy8703jWXcMuPOYKiFhWoAqnIm1BGgoRgmxczkbSfRsHUL/lwsSgh74Lw==
-  dependencies:
-    ansi-escapes "^4.3.2"
-    ansis "^3.17.0"
-    clean-stack "^3.0.1"
-    cli-spinners "^2.9.2"
-    debug "^4.4.0"
-    ejs "^3.1.10"
-    get-package-type "^0.1.0"
-    globby "^11.1.0"
-    indent-string "^4.0.0"
-    is-wsl "^2.2.0"
-    lilconfig "^3.1.3"
-    minimatch "^9.0.5"
-    semver "^7.6.3"
-    string-width "^4.2.3"
-    supports-color "^8"
-    widest-line "^3.1.0"
-    wordwrap "^1.0.0"
-    wrap-ansi "^7.0.0"
-
-"@oclif/core@^4.3.3":
+"@oclif/core@^4", "@oclif/core@^4.2.10", "@oclif/core@^4.3.2", "@oclif/core@^4.3.3":
   version "4.3.3"
   resolved "https://registry.yarnpkg.com/@oclif/core/-/core-4.3.3.tgz#a527536b62ef202c58d2b69ce9cd1e64eb3a94b1"
   integrity sha512-A0mk4nlVE+r34fl91OdglXVPwhhfzM59IhSxnOigqMkwxFgT8z3i2WlUgzmazzvzSccs2KM4N2HkTS3NEvW96g==
@@ -2048,10 +2000,10 @@
     strip-ansi "6.0.1"
     ts-retry-promise "^0.8.1"
 
-"@salesforce/core@^8.11.4", "@salesforce/core@^8.12.0", "@salesforce/core@^8.8.0":
-  version "8.12.0"
-  resolved "https://registry.yarnpkg.com/@salesforce/core/-/core-8.12.0.tgz#a458cc3e39f4e7df57d94f0deaaa0fd0660b18c9"
-  integrity sha512-LJIjoQ3UQJ1r/xxdQcaG5bU8MfxeO/LJhrfK/7LZeHVtp1iOIgedbwPuVNzTzYciDWh8elborarrPM4uWjtu5g==
+"@salesforce/core@^8.12.0", "@salesforce/core@^8.14.0", "@salesforce/core@^8.8.0":
+  version "8.14.0"
+  resolved "https://registry.yarnpkg.com/@salesforce/core/-/core-8.14.0.tgz#fcdd8b641221fee668b95ed2ede56b251668077c"
+  integrity sha512-Ta1aY15TfgxLyFNNlkw60Mm3dDtiEb50TSp3/wzrbuMgkEGvFBEZQca/ChrjANXhpw8pURDUTzL4VV/1eGCHrQ==
   dependencies:
     "@jsforce/jsforce-node" "^3.8.2"
     "@salesforce/kit" "^3.2.2"
@@ -3166,16 +3118,11 @@ acorn-walk@^8.1.1:
   resolved "https://registry.npmjs.org/acorn-walk/-/acorn-walk-8.2.0.tgz"
   integrity sha512-k+iyHEuPgSw6SbuDpGQM+06HQUa04DZ3o+F6CSzXMvvI5KMvnaEqXe+YVe555R9nn6GPt404fos4wcgpw12SDA==
 
-acorn@^8.14.0:
+acorn@^8.14.0, acorn@^8.4.1, acorn@^8.9.0:
   version "8.15.0"
   resolved "https://registry.yarnpkg.com/acorn/-/acorn-8.15.0.tgz#a360898bc415edaac46c8241f6383975b930b816"
   integrity sha512-NZyJarBfL7nWwIq+FDL6Zp/yHEhePMNnnJ0y3qfieCrmNvYct8uvtiV41UvlSe6apAfk0fY1FbWx+NwfmpvtTg==
 
-acorn@^8.4.1, acorn@^8.9.0:
-  version "8.11.3"
-  resolved "https://registry.npmjs.org/acorn/-/acorn-8.11.3.tgz"
-  integrity sha512-Y9rRfJG5jcKOE0CLisYbojUjIrIEE7AGMzA/Sm4BslANhbS+cDMpgBdcPT91oJ7OuJ9hYJBx59RjbhxVnrF8Xg==
-
 agent-base@6:
   version "6.0.2"
   resolved "https://registry.npmjs.org/agent-base/-/agent-base-6.0.2.tgz"
@@ -4170,10 +4117,10 @@ dateformat@^4.6.3:
   resolved "https://registry.npmjs.org/dateformat/-/dateformat-4.6.3.tgz"
   integrity sha512-2P0p0pFGzHS5EMnhdxQi7aJN+iMheud0UhG4dlE1DLAlvL8JHjJJTX/CSm4JXwV0Ka5nGk3zC5mcb5bUQUxxMA==
 
-debug@4, debug@^4.1.0, debug@^4.1.1, debug@^4.3.1, debug@^4.3.2, debug@^4.3.4, debug@^4.3.5, debug@^4.3.7, debug@^4.4.0:
-  version "4.4.0"
-  resolved "https://registry.npmjs.org/debug/-/debug-4.4.0.tgz"
-  integrity sha512-6WTZ/IxCY/T6BALoZHaE4ctp9xm+Z5kY/pzYaCHRFeyVhojxlrm+46y68HA6hr0TcwEssoxNiDEUJQjfPZ/RYA==
+debug@4, debug@^4.1.0, debug@^4.1.1, debug@^4.3.1, debug@^4.3.2, debug@^4.3.4, debug@^4.3.5, debug@^4.3.7, debug@^4.4.0, debug@^4.4.1:
+  version "4.4.1"
+  resolved "https://registry.npmjs.org/debug/-/debug-4.4.1.tgz"
+  integrity sha512-KcKCqiftBJcZr++7ykoDIEwSa3XWowTfNPo92BYxjXiyYEVrUQh2aLyhxBCwww+heortUFxEJYcRzosstTEBYQ==
   dependencies:
     ms "^2.1.3"
 
@@ -4184,13 +4131,6 @@ debug@^3.2.7:
   dependencies:
     ms "^2.1.1"
 
-debug@^4.4.1:
-  version "4.4.1"
-  resolved "https://registry.npmjs.org/debug/-/debug-4.4.1.tgz"
-  integrity sha512-KcKCqiftBJcZr++7ykoDIEwSa3XWowTfNPo92BYxjXiyYEVrUQh2aLyhxBCwww+heortUFxEJYcRzosstTEBYQ==
-  dependencies:
-    ms "^2.1.3"
-
 decamelize-keys@^1.1.0:
   version "1.1.0"
   resolved "https://registry.npmjs.org/decamelize-keys/-/decamelize-keys-1.1.0.tgz"
@@ -4267,14 +4207,7 @@ define-lazy-prop@^3.0.0:
   resolved "https://registry.npmjs.org/define-lazy-prop/-/define-lazy-prop-3.0.0.tgz"
   integrity sha512-N+MeXYoqr3pOgn8xfyRPREN7gHakLYjhsHhWGT3fWAiL4IkAt0iDw14QiiEm2bE30c5XX5q0FtAA3CK5f9/BUg==
 
-define-properties@^1.1.3:
-  version "1.1.3"
-  resolved "https://registry.npmjs.org/define-properties/-/define-properties-1.1.3.tgz"
-  integrity sha512-3MqfYKj2lLzdMSf8ZIZE/V+Zuy+BgD6f164e8K2w7dgnpKArBDerGYpM46IYYcjnkdPNMjPk9A6VFB8+3SKlXQ==
-  dependencies:
-    object-keys "^1.0.12"
-
-define-properties@^1.2.0, define-properties@^1.2.1:
+define-properties@^1.1.3, define-properties@^1.2.0, define-properties@^1.2.1:
   version "1.2.1"
   resolved "https://registry.npmjs.org/define-properties/-/define-properties-1.2.1.tgz"
   integrity sha512-8QmQKqEASLd5nx0U1B1okLElbUuuttJ/AnYmRXbbbGDWh6uS208EjD4Xqq/I9wK7u0v6O08XhTWnt5XtEbR6Dg==
@@ -5650,12 +5583,7 @@ htmlparser2@^9.0.0:
     domutils "^3.1.0"
     entities "^4.5.0"
 
-http-cache-semantics@^4.0.0:
-  version "4.1.1"
-  resolved "https://registry.npmjs.org/http-cache-semantics/-/http-cache-semantics-4.1.1.tgz"
-  integrity sha512-er295DKPVsV82j5kw1Gjt+ADA/XYHsajl82cGNQG2eyoPkvgUhX+nDIyelzhIWbbsXP39EHcI6l5tYs2FYqYXQ==
-
-http-cache-semantics@^4.1.1:
+http-cache-semantics@^4.0.0, http-cache-semantics@^4.1.1:
   version "4.2.0"
   resolved "https://registry.npmjs.org/http-cache-semantics/-/http-cache-semantics-4.2.0.tgz"
   integrity sha512-dTxcvPXqPvXBQpq5dUr6mEMJX4oIEFv6bwom3FDwKRDsuIjjJGANqhBuoAn9c1RQJIdAKav33ED65E2ys+87QQ==
@@ -5897,14 +5825,7 @@ is-callable@^1.1.3, is-callable@^1.1.4, is-callable@^1.2.7:
   resolved "https://registry.npmjs.org/is-callable/-/is-callable-1.2.7.tgz"
   integrity sha512-1BC0BVFhS/p0qtw6enp8e+8OD0UrK0oFLztSjNzhcKA3WDuJxxAPXzPuPtKkjEY9UUoEWlX/8fgKeu2S8i9JTA==
 
-is-core-module@^2.13.0, is-core-module@^2.13.1, is-core-module@^2.5.0:
-  version "2.13.1"
-  resolved "https://registry.npmjs.org/is-core-module/-/is-core-module-2.13.1.tgz"
-  integrity sha512-hHrIjvZsftOsvKSn2TRYl63zvxsgE0K+0mYMoH6gD4omR5IWB2KynivBQczo3+wF1cCkjzvptnI9Q0sPU66ilw==
-  dependencies:
-    hasown "^2.0.0"
-
-is-core-module@^2.16.0:
+is-core-module@^2.13.0, is-core-module@^2.13.1, is-core-module@^2.16.0, is-core-module@^2.5.0:
   version "2.16.1"
   resolved "https://registry.yarnpkg.com/is-core-module/-/is-core-module-2.16.1.tgz#2a98801a849f43e2add644fbb6bc6229b19a4ef4"
   integrity sha512-UfoeMA6fIJ8wTYFEUjelnaGI67v6+N7qXJEvQuIGa99l4xsCruSYOVSQ0uPANn4dAzm8lkYPaKLrrijLq7x23w==
@@ -6598,7 +6519,7 @@ lowercase-keys@^3.0.0:
   resolved "https://registry.npmjs.org/lowercase-keys/-/lowercase-keys-3.0.0.tgz"
   integrity sha512-ozCC6gdQ+glXOQsveKD0YsDy8DSQFjDTz4zyzEHNV5+JP5D62LmfDZ6o1cycFx9ouG940M5dE8C8CTewdj2YWQ==
 
-lru-cache@^10.0.1:
+lru-cache@^10.0.1, "lru-cache@^9.1.1 || ^10.0.0":
   version "10.4.3"
   resolved "https://registry.npmjs.org/lru-cache/-/lru-cache-10.4.3.tgz"
   integrity sha512-JNAzZcXrCt42VGLuYz0zfAzDfAvJWW6AfYlDBQyDV5DClI2m5sAmK+OIO7s59XfsRsWHp02jAJrRadPRGTt6SQ==
@@ -6622,11 +6543,6 @@ lru-cache@^7.14.1:
   resolved "https://registry.npmjs.org/lru-cache/-/lru-cache-7.18.3.tgz"
   integrity sha512-jumlc0BIUrS3qJGgIkWZsyfAM7NCWiBcCDhnd+3NNM5KbBmLTgHVfWBcg6W+rLUsIpzpERPsvwUP7CckAQSOoA==
 
-"lru-cache@^9.1.1 || ^10.0.0":
-  version "10.2.0"
-  resolved "https://registry.npmjs.org/lru-cache/-/lru-cache-10.2.0.tgz"
-  integrity sha512-2bIM8x+VAf6JT4bKAljS1qUWgMsqZRPGJS6FSahIMPVvctcNhyVp7AJu7quxOW9jwkryBReKZY5tY5JYv2n/7Q==
-
 lucide-react@^0.447.0:
   version "0.447.0"
   resolved "https://registry.npmjs.org/lucide-react/-/lucide-react-0.447.0.tgz"
@@ -7125,7 +7041,7 @@ object-inspect@^1.13.1, object-inspect@^1.13.3:
   resolved "https://registry.npmjs.org/object-inspect/-/object-inspect-1.13.4.tgz"
   integrity sha512-W67iLl4J2EXEGTbfeHCffrjDfitvLANg0UlX3wFUUSTx92KXRFegMHUVgSqE+wvhAbi4WqjGg9czysTV2Epbew==
 
-object-keys@^1.0.12, object-keys@^1.1.1:
+object-keys@^1.1.1:
   version "1.1.1"
   resolved "https://registry.npmjs.org/object-keys/-/object-keys-1.1.1.tgz"
   integrity sha512-NuAESUOUMrlIXOfHKzD6bpPu3tYt3xvjNdRIQ+FeT0lNb4K8WR70CaDxhuNguS2XG+GjkyMwOzsN5ZktImfhLA==
@@ -7992,16 +7908,7 @@ resolve-global@1.0.0, resolve-global@^1.0.0:
   dependencies:
     global-dirs "^0.1.1"
 
-resolve@^1.1.6, resolve@^1.10.0, resolve@^1.22.2, resolve@^1.22.4:
-  version "1.22.8"
-  resolved "https://registry.npmjs.org/resolve/-/resolve-1.22.8.tgz"
-  integrity sha512-oKWePCxqpd6FlLvGV1VU0x7bkPmmCNolxzjMf4NczoDnQcIWrAF+cPtZn5i6n+RfD2d9i0tzpKnG6Yk168yIyw==
-  dependencies:
-    is-core-module "^2.13.0"
-    path-parse "^1.0.7"
-    supports-preserve-symlinks-flag "^1.0.0"
-
-resolve@^1.22.8:
+resolve@^1.1.6, resolve@^1.10.0, resolve@^1.22.2, resolve@^1.22.4, resolve@^1.22.8:
   version "1.22.10"
   resolved "https://registry.yarnpkg.com/resolve/-/resolve-1.22.10.tgz#b663e83ffb09bbf2386944736baae803029b8b39"
   integrity sha512-NPRy+/ncIMeDlTAsuqwKIiferiawhefFJtkNSW0qZJEqMEb+qBt/77B/jGeeek+F0uOeN05CDa6HXbbIgtVX4w==
@@ -8132,10 +8039,10 @@ secure-json-parse@^2.4.0:
   resolved "https://registry.npmjs.org/secure-json-parse/-/secure-json-parse-2.7.0.tgz"
   integrity sha512-6aU+Rwsezw7VR8/nyvKTx8QpWH9FrcYiXXlqC4z5d5XQBDRqtbfsRjnwGyqbi3gddNtWHuEk9OANUotL26qKUw==
 
-"semver@2 || 3 || 4 || 5":
-  version "5.7.1"
-  resolved "https://registry.npmjs.org/semver/-/semver-5.7.1.tgz"
-  integrity sha512-sauaDf/PZdVgrLTNYHRtpXa1iRiKcaebiKQ1BJdpQlWH2lCvexQdX55snPFyK7QzpudqbCI0qXFfOasHdyNDGQ==
+"semver@2 || 3 || 4 || 5", semver@^5.3.0, semver@^5.4.1:
+  version "5.7.2"
+  resolved "https://registry.yarnpkg.com/semver/-/semver-5.7.2.tgz#48d55db737c3287cd4835e17fa13feace1c41ef8"
+  integrity sha512-cBznnQ9KjJqU67B52RMC65CMarK2600WFnbkcaiwWq3xy/5haFJlshgnpjovMVJ+Hff49d8GEn0b87C5pDQ10g==
 
 semver@7.5.4:
   version "7.5.4"
@@ -8144,22 +8051,12 @@ semver@7.5.4:
   dependencies:
     lru-cache "^6.0.0"
 
-semver@^5.3.0, semver@^5.4.1:
-  version "5.7.2"
-  resolved "https://registry.yarnpkg.com/semver/-/semver-5.7.2.tgz#48d55db737c3287cd4835e17fa13feace1c41ef8"
-  integrity sha512-cBznnQ9KjJqU67B52RMC65CMarK2600WFnbkcaiwWq3xy/5haFJlshgnpjovMVJ+Hff49d8GEn0b87C5pDQ10g==
-
 semver@^6.0.0, semver@^6.3.1:
   version "6.3.1"
   resolved "https://registry.npmjs.org/semver/-/semver-6.3.1.tgz"
   integrity sha512-BR7VvDCVHO+q2xBEWskxS6DJE1qRnb7DxzUrogb71CWoSficBxYsiAGd+Kl0mmq/MprG9yArRkyrQxTO6XjMzA==
 
-semver@^7.3.4, semver@^7.5.4, semver@^7.6.0, semver@^7.6.3:
-  version "7.7.1"
-  resolved "https://registry.npmjs.org/semver/-/semver-7.7.1.tgz"
-  integrity sha512-hlq8tAfn0m/61p4BVRcPzIGr6LKiMwo4VM6dGi6pt4qcRkmNzTcWq6eCEjEh+qXjkMDvPlOFFSGwQjoEa6gyMA==
-
-semver@^7.3.5, semver@^7.5.3, semver@^7.7.1:
+semver@^7.3.4, semver@^7.3.5, semver@^7.5.3, semver@^7.5.4, semver@^7.6.0, semver@^7.6.3, semver@^7.7.1:
   version "7.7.2"
   resolved "https://registry.npmjs.org/semver/-/semver-7.7.2.tgz"
   integrity sha512-RF0Fw+rO5AMf9MAyaRXI4AV0Ulj5lMHqVxxdSgiVbixSCXoEmmX/jk0CuJw4+3SqroYO9VoUh+HcuJivvtJemA==
@@ -8779,7 +8676,7 @@ tiny-jsonc@^1.0.2:
   resolved "https://registry.npmjs.org/tiny-jsonc/-/tiny-jsonc-1.0.2.tgz"
   integrity sha512-f5QDAfLq6zIVSyCZQZhhyl0QS6MvAyTxgz4X4x3+EoCktNWEYJ6PeoEA97fyb98njpBNNi88ybpD7m+BDFXaCw==
 
-tinyglobby@^0.2.13, tinyglobby@^0.2.14, tinyglobby@^0.2.9:
+tinyglobby@^0.2.14, tinyglobby@^0.2.9:
   version "0.2.14"
   resolved "https://registry.yarnpkg.com/tinyglobby/-/tinyglobby-0.2.14.tgz#5280b0cf3f972b050e74ae88406c0a6a58f4079d"
   integrity sha512-tX5e7OM1HnYr2+a2C/4V0htOcSQcoSTH9KgJnVvNm5zm/cyEWKJ7j7YutsH9CxMdtOkkLFy2AHrMci9IM8IPZQ==

From e93b915542312753d529460e4825d5927a796537 Mon Sep 17 00:00:00 2001
From: Mike Donnalley <mdonnalley@salesforce.com>
Date: Mon, 16 Jun 2025 09:13:19 -0600
Subject: [PATCH 15/51] fix: remove faulty token counting

---
 package.json         |  3 +-
 src/sf-mcp-server.ts | 79 ++------------------------------------------
 yarn.lock            |  5 ---
 3 files changed, 3 insertions(+), 84 deletions(-)

diff --git a/package.json b/package.json
index f6dcadc0..9b0f4439 100644
--- a/package.json
+++ b/package.json
@@ -48,8 +48,7 @@
     "@salesforce/source-tracking": "^7.4.1",
     "@salesforce/telemetry": "^6.0.39",
     "@salesforce/ts-types": "^2.0.11",
-    "zod": "^3.25.42",
-    "zod-to-json-schema": "^3.24.5"
+    "zod": "^3.25.42"
   },
   "devDependencies": {
     "@modelcontextprotocol/inspector": "^0.14.0",
diff --git a/src/sf-mcp-server.ts b/src/sf-mcp-server.ts
index dbd2ef32..3a45d294 100644
--- a/src/sf-mcp-server.ts
+++ b/src/sf-mcp-server.ts
@@ -15,18 +15,11 @@
  */
 
 import { McpServer, RegisteredTool, ToolCallback } from '@modelcontextprotocol/sdk/server/mcp.js';
-import {
-  CallToolResult,
-  Implementation,
-  ServerNotification,
-  ServerRequest,
-  Tool,
-} from '@modelcontextprotocol/sdk/types.js';
+import { CallToolResult, Implementation, ServerNotification, ServerRequest } from '@modelcontextprotocol/sdk/types.js';
 import { ServerOptions } from '@modelcontextprotocol/sdk/server/index.js';
 import { RequestHandlerExtra } from '@modelcontextprotocol/sdk/shared/protocol.js';
 import { Logger } from '@salesforce/core';
 import { Transport } from '@modelcontextprotocol/sdk/shared/transport.js';
-import { zodToJsonSchema } from 'zod-to-json-schema';
 import { Telemetry } from './telemetry.js';
 
 type ToolMethodSignatures = {
@@ -34,39 +27,6 @@ type ToolMethodSignatures = {
   connect: McpServer['connect'];
 };
 
-const EMPTY_OBJECT_JSON_SCHEMA = {
-  type: 'object' as const,
-};
-
-function countTokens(tool: Tool): number {
-  let totalTokens = 0;
-
-  // Count tokens in tool name
-  totalTokens += tool.name.length;
-
-  // Count tokens in description
-  if (tool.description) {
-    totalTokens += tool.description.length;
-  }
-
-  // Count tokens in input schema
-  if (tool.inputSchema) {
-    totalTokens += JSON.stringify(tool.inputSchema).length;
-  }
-
-  // Count tokens in output schema
-  if (tool.outputSchema) {
-    totalTokens += JSON.stringify(tool.outputSchema).length;
-  }
-
-  // Count tokens in annotations
-  if (tool.annotations) {
-    totalTokens += JSON.stringify(tool.annotations).length;
-  }
-
-  return totalTokens;
-}
-
 /**
  * A server implementation that extends the base MCP server with telemetry capabilities.
  *
@@ -77,7 +37,6 @@ function countTokens(tool: Tool): number {
  */
 export class SfMcpServer extends McpServer implements ToolMethodSignatures {
   private logger = Logger.childFromRoot('mcp-server');
-  private tokenCounts: Record<string, number> = {};
 
   /** Optional telemetry instance for tracking server events */
   private telemetry?: Telemetry;
@@ -100,16 +59,6 @@ export class SfMcpServer extends McpServer implements ToolMethodSignatures {
         });
       }
       this.telemetry?.sendEvent('SERVER_START_SUCCESS');
-
-      this.logger.debug(
-        `Total tokens: ${Object.values(this.tokenCounts)
-          .reduce((acc, count) => acc + count, 0)
-          .toString()}`
-      );
-
-      for (const [name, count] of Object.entries(this.tokenCounts)) {
-        this.logger.debug(`${name}: ${count}`);
-      }
     };
   }
 
@@ -152,30 +101,6 @@ export class SfMcpServer extends McpServer implements ToolMethodSignatures {
     };
 
     // @ts-expect-error because we no longer know what the type of rest is
-    const tool = super.tool(name, ...rest.slice(0, -1), wrappedCb);
-
-    // Count the number to tokens for the tool definition
-    // Implementation copied from the typescript sdk:
-    // https://github.com/modelcontextprotocol/typescript-sdk/blob/dd69efa1de8646bb6b195ff8d5f52e13739f4550/src/server/mcp.ts#L110
-    const toolDefinition: Tool = {
-      name,
-      description: tool.description,
-      inputSchema: tool.inputSchema
-        ? (zodToJsonSchema(tool.inputSchema, {
-            strictUnions: true,
-          }) as Tool['inputSchema'])
-        : EMPTY_OBJECT_JSON_SCHEMA,
-      annotations: tool.annotations,
-    };
-
-    if (tool.outputSchema) {
-      toolDefinition.outputSchema = zodToJsonSchema(tool.outputSchema, {
-        strictUnions: true,
-      }) as Tool['outputSchema'];
-    }
-
-    this.tokenCounts[name] = countTokens(toolDefinition);
-
-    return tool;
+    return super.tool(name, ...rest.slice(0, -1), wrappedCb);
   };
 }
diff --git a/yarn.lock b/yarn.lock
index cd6b09e7..549ab2c7 100644
--- a/yarn.lock
+++ b/yarn.lock
@@ -9472,11 +9472,6 @@ zod-to-json-schema@^3.24.1:
   resolved "https://registry.npmjs.org/zod-to-json-schema/-/zod-to-json-schema-3.24.5.tgz"
   integrity sha512-/AuWwMP+YqiPbsJx5D6TfgRTc4kTLjsh5SOcd4bLsfUg2RcEXrFMJl1DGgdHy2aCfsIA/cr/1JM0xcB2GZji8g==
 
-zod-to-json-schema@^3.24.5:
-  version "3.24.5"
-  resolved "https://registry.yarnpkg.com/zod-to-json-schema/-/zod-to-json-schema-3.24.5.tgz#d1095440b147fb7c2093812a53c54df8d5df50a3"
-  integrity sha512-/AuWwMP+YqiPbsJx5D6TfgRTc4kTLjsh5SOcd4bLsfUg2RcEXrFMJl1DGgdHy2aCfsIA/cr/1JM0xcB2GZji8g==
-
 zod@^3.23.8, zod@^3.25.42:
   version "3.25.42"
   resolved "https://registry.npmjs.org/zod/-/zod-3.25.42.tgz"

From 91523e25a37b3b594ccc386b094bfa4b8c38ff2a Mon Sep 17 00:00:00 2001
From: Mike Donnalley <mdonnalley@salesforce.com>
Date: Mon, 16 Jun 2025 09:17:44 -0600
Subject: [PATCH 16/51] fix: set SF_LOG_LEVEL when using --debug

---
 bin/run.js | 1 +
 1 file changed, 1 insertion(+)

diff --git a/bin/run.js b/bin/run.js
index 17a05a34..792d403a 100755
--- a/bin/run.js
+++ b/bin/run.js
@@ -4,6 +4,7 @@ if (process.argv.includes('--debug')) {
   process.env.DEBUG = 'sf*';
   process.env.SF_LOG_COLORIZE = 'false';
   process.env.SF_LOG_STDERR = 'true';
+  process.env.SF_LOG_LEVEL = 'trace';
 }
 
 import { execute } from '@oclif/core';

From 03b7a3d923e54282d587f93ddf35f0327f08f4a5 Mon Sep 17 00:00:00 2001
From: Mike Donnalley <mdonnalley@salesforce.com>
Date: Mon, 16 Jun 2025 15:10:05 -0600
Subject: [PATCH 17/51] test: add testing against LLM Gateway

---
 package.json |   1 +
 test/llmg.ts | 250 +++++++++++++++++++++++++++++++++++++++++++++++
 yarn.lock    | 269 ++++++++++++++++++++++++++++++++++++++++++++++++++-
 3 files changed, 516 insertions(+), 4 deletions(-)
 create mode 100644 test/llmg.ts

diff --git a/package.json b/package.json
index 492ed987..2684e329 100644
--- a/package.json
+++ b/package.json
@@ -52,6 +52,7 @@
   },
   "devDependencies": {
     "@modelcontextprotocol/inspector": "^0.14.1",
+    "@oclif/table": "^0.4.8",
     "@salesforce/cli-plugins-testkit": "^5.3.39",
     "@salesforce/dev-scripts": "11.0.2",
     "@types/node": "^22.15.31",
diff --git a/test/llmg.ts b/test/llmg.ts
new file mode 100644
index 00000000..d78dd5e6
--- /dev/null
+++ b/test/llmg.ts
@@ -0,0 +1,250 @@
+/*
+ * Copyright 2025, Salesforce, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+const API_KEY = process.env.SF_LLMG_API_KEY;
+// NOTE(review): '0' disables TLS certificate verification process-wide - confirm the test endpoint requires it.
+process.env.NODE_TLS_REJECT_UNAUTHORIZED = '0';
+if (!API_KEY) {
+  throw new Error('SF_LLMG_API_KEY is not set');
+}
+
+import { spawn } from 'node:child_process';
+import { Tool } from '@modelcontextprotocol/sdk/types.js';
+import { printTable } from '@oclif/table';
+
+/** The `function` part of a gateway tool entry; `parameters` carries the MCP tool's `inputSchema`. */
+type InvocableToolFunction = { name: string; description: string | undefined; parameters: Tool['inputSchema'] };
+
+/** A tool entry in the shape this script sends in the `tools` field of the gateway request body. */
+type InvocableTool = {
+  name: string;
+  function: InvocableToolFunction;
+};
+
+/** One tool call requested by a model; `arguments` holds JSON text. */
+type ToolInvocation = {
+  function: { name: string; arguments: string };
+};
+
+/** One entry of `generation_details.generations`. */
+type Generation = {
+  content: string;
+  tool_invocations: ToolInvocation[];
+};
+
+/** The portion of the gateway response body that this script reads. */
+type GatewayResponse = { generation_details: { generations: Generation[] } };
+
+/** Lists the server's tools via `mcp-inspector --cli` and maps them to the gateway's tool format. */
+const getToolsList = async (): Promise<InvocableTool[]> => {
+  const toolsList = await new Promise<string>((resolve, reject) => {
+    const serverCommand = ['node', 'bin/run.js', '-o', 'DEFAULT_TARGET_ORG'];
+    const child = spawn('npx', ['mcp-inspector', '--cli', ...serverCommand, '--method', 'tools/list']);
+
+    let output = '';
+    let errorOutput = '';
+
+    child.stdout.on('data', (data: Buffer) => {
+      output += data.toString();
+    });
+
+    // stderr may carry harmless warnings (e.g. from npx), so it is buffered and only reported when the
+    // process actually fails, instead of rejecting on the first chunk.
+    child.stderr.on('data', (data: Buffer) => {
+      errorOutput += data.toString();
+    });
+
+    // 'error' fires when the process cannot be spawned; without a listener Node throws it as uncaught.
+    child.on('error', reject);
+
+    child.on('close', (code: number | null) => {
+      if (code === 0) {
+        resolve(output);
+      } else {
+        reject(new Error(`Process exited with code ${code}\n${errorOutput}`.trimEnd()));
+      }
+    });
+  });
+
+  // stdout is parsed as the JSON `tools/list` result; a missing `tools` key yields an empty list.
+  const parsedToolsList = JSON.parse(toolsList) as { tools: Tool[] };
+  return (parsedToolsList.tools ?? []).map((tool) => ({
+    name: tool.name,
+    function: {
+      name: tool.name,
+      description: tool.description,
+      parameters: tool.inputSchema,
+    },
+  }));
+};
+
+const tools = await getToolsList(); // resolved once at module load; reused in every gateway request body
+
+/**
+ * Sends one user prompt, together with this server's tool list, to the LLM Gateway API using the specified model.
+ *
+ * @param {string} prompt - The user message to send
+ * @param {string} model - The model identifier to use for generation (e.g., 'llmgateway__AzureOpenAIGPT4Omni')
+ * @returns {Promise<{ model: string; response: GatewayResponse }>} The model identifier and the parsed JSON response
+ * @throws {Error} If the API responds with a non-2xx status
+ *
+ * @see {@link https://git.soma.salesforce.com/pages/tech-enablement/einstein/docs/gateway/get-started/#make-your-first-gateway-request} Make Your First Gateway Request Documentation
+ * @see {@link https://git.soma.salesforce.com/pages/tech-enablement/einstein/docs/gateway/models-and-providers/} Models and Providers Documentation
+ * @see {@link https://git.soma.salesforce.com/pages/tech-enablement/einstein/docs/gateway/apis/rest/#operation/chatMessages} REST API Documentation
+ * @see {@link https://git.soma.salesforce.com/pages/tech-enablement/einstein/docs/gateway/function-calling/} Function Calling Documentation
+ * @see {@link https://git.soma.salesforce.com/pages/tech-enablement/einstein/docs/gateway/get-started/auth/#api-key-limitations} API Key Authentication Documentation
+ */
+const generateResponse = async (
+  prompt: string,
+  model: string
+): Promise<{ model: string; response: GatewayResponse }> => {
+  const response = await fetch(
+    'https://bot-svc-llm.sfproxy.einsteintest1.test1-uswest2.aws.sfdc.cl/v1.0/chat/generations',
+    {
+      method: 'POST',
+      headers: {
+        Authorization: `API_KEY ${API_KEY}`,
+        'Content-Type': 'application/json',
+        // We need to figure out which tenant, context, and feature id to use
+        // Maybe this is something that will be given to us once the client registration completes???
+        'x-sfdc-core-tenant-id': 'core/prod1/00DDu0000008cuqMAA',
+        'x-sfdc-app-context': 'EinsteinGPT',
+        'x-client-feature-id': 'EinsteinDocsAnswers',
+      },
+      body: JSON.stringify({
+        model,
+        tools,
+        tool_config: { mode: 'auto' },
+        //   {
+        //     type: 'function',
+        //     function: {
+        //       name: 'get_current_weather',
+        //       description: 'Get the current weather in a given location.',
+        //       parameters: {
+        //         type: 'object',
+        //         properties: {
+        //           location: {
+        //             type: 'string',
+        //             description: 'The city and state, e.g. San Francisco, CA',
+        //           },
+        //           format: {
+        //             type: 'string',
+        //             enum: ['celsius', 'fahrenheit'],
+        //             description: 'The temperature unit to use. Infer this from the users location.',
+        //           },
+        //         },
+        //       },
+        //     },
+        //   },
+        //   {
+        //     name: 'sf-get-username',
+        //     function: {
+        //       name: 'sf-get-username',
+        //       description:
+        //         'Intelligently determines the appropriate username or alias for Salesforce operations.\n\nAGENT/LLM INSTRUCTIONS:\nUse this tool when uncertain which username/org a user wants for Salesforce operations.\nThis tool handles three distinct scenarios:\n\n1. When defaultTargetOrg=true: Fetches the default target org configuration\n   - Use when user says "for my default org" or "for my default target org"\n\n2. When defaultDevHub=true: Fetches the default dev hub configuration\n   - Use when user says "for my default dev hub" or "for my default target dev hub"\n\n3. When both are false (default): Uses suggestUsername to intelligently determine the appropriate org\n   - Use when user is vague and says something like "for my org" or doesn\'t specify\n\nEXAMPLE USAGE:\n- When user says "Do X for my org" → defaultTargetOrg=false, defaultDevHub=false\n- When user says "For my default org" → defaultTargetOrg=true\n- When user says "For my default dev hub" → defaultDevHub=true',
+        //       parameters: {
+        //         type: 'object',
+        //         properties: {
+        //           defaultTargetOrg: {
+        //             type: 'boolean',
+        //             default: false,
+        //             description:
+        //               'Try to find default org\nAGENT INSTRUCTIONS:\nONLY SET TO TRUE when the user explicitly asks for the default org or default target org.\nLeave it as false when the user is vague and says something like "for my org" or "for my-alias".\n\nUSAGE EXAMPLE:\nGet username for my default org\n...for my default target org',
+        //           },
+        //           defaultDevHub: {
+        //             type: 'boolean',
+        //             default: false,
+        //             description:
+        //               'Try to find default dev hub\nAGENT INSTRUCTIONS:\nONLY SET TO TRUE when the user explicitly asks for the default dev hub or default target devhub.\nLeave it as false when the user is vague and says something like "for my org" or "for my-alias".\n\nUSAGE EXAMPLE:\nGet username for my default dev hub\n...for my default target dev hub\n...for my default devhub',
+        //           },
+        //           directory: {
+        //             type: 'string',
+        //             description:
+        //               'The directory to run this tool from.\nAGENT INSTRUCTIONS:\nWe need to know where the user wants to run this tool from.\nLook at your current Workspace Context to determine this filepath.\nALWAYS USE A FULL PATH TO THE DIRECTORY.\nUnless the user explicitly asks for a different directory, or a new directory is created from the action of a tool, use this same directory for future tool calls.\n',
+        //           },
+        //         },
+        //         required: ['directory'],
+        //         additionalProperties: false,
+        //         $schema: 'http://json-schema.org/draft-07/schema#',
+        //       },
+        //     },
+        //   },
+        // ],
+        messages: [{ role: 'user', content: prompt }],
+        generation_settings: {
+          max_tokens: 500,
+          temperature: 0.5,
+          parameters: {},
+        },
+      }),
+    }
+  );
+
+  // Fail loudly on HTTP errors instead of treating an error payload as a GatewayResponse.
+  if (!response.ok) {
+    const detail = await response.text();
+    throw new Error(`Gateway request for ${model} failed: ${response.status} ${response.statusText}\n${detail}`);
+  }
+
+  const json = (await response.json()) as GatewayResponse;
+
+  return {
+    response: json,
+    model,
+  };
+};
+
+const models = [
+  'llmgateway__OpenAIGPT35Turbo_01_25',
+  'llmgateway__OpenAIGPT4OmniMini',
+  'llmgateway__BedrockAnthropicClaude4Sonnet',
+];
+// Every prompt below is sent to each of the models above, one table per prompt.
+const prompts = [
+  "What's my salesforce username?",
+  'List all my orgs',
+  'Deploy my project (~/my-project) using the my-sf-org alias',
+];
+
+/** Sends `prompt` to every model in parallel and prints one table row per model. */
+async function displayModelResponses(prompt: string): Promise<void> {
+  const responses = await Promise.all(models.map((model) => generateResponse(prompt, model)));
+  printTable({
+    title: `Prompt: ${prompt}`,
+    data: responses.map(({ model, response }) => {
+      // tool_config mode is 'auto', so a model may reply with text only and no tool invocation at all.
+      const generation = response.generation_details?.generations?.[0];
+      const invocation = generation?.tool_invocations?.[0]?.function;
+      const args = JSON.parse(invocation?.arguments || '{}') as Record<string, unknown>;
+      return {
+        model,
+        response: generation?.content ?? '',
+        tool: invocation?.name ?? '',
+        arguments: Object.entries(args)
+          .map(([key, value]) => `${key}: ${typeof value === 'string' ? value : JSON.stringify(value)}`)
+          .join('\n'),
+      };
+    }),
+    columns: ['model', 'response', 'tool', 'arguments'],
+    headerOptions: { formatter: 'capitalCase' },
+    overflow: 'wrap',
+  });
+}
+
+for (const prompt of prompts) {
+  // eslint-disable-next-line no-await-in-loop -- one prompt at a time, so each table prints before the next request
+  await displayModelResponses(prompt);
+}
diff --git a/yarn.lock b/yarn.lock
index 422570fb..88fe831f 100644
--- a/yarn.lock
+++ b/yarn.lock
@@ -7,6 +7,14 @@
   resolved "https://registry.npmjs.org/@aashutoshrathi/word-wrap/-/word-wrap-1.2.6.tgz"
   integrity sha512-1Yjs2SvM8TflER/OD3cOjhWWOZb58A2t7wpE2S9XfBYTiIl+XFhQG2bjy4Pu1I+EAlCNUzRDYDdFwFYUKvXcIA==
 
+"@alcalzone/ansi-tokenize@^0.1.3":
+  version "0.1.3"
+  resolved "https://registry.yarnpkg.com/@alcalzone/ansi-tokenize/-/ansi-tokenize-0.1.3.tgz#9f89839561325a8e9a0c32360b8d17e48489993f"
+  integrity sha512-3yWxPTq3UQ/FY9p1ErPxIyfT64elWaMvM9lIHnaqpyft63tkxodF5aUElYHrdisWve5cETkh1+KBw1yJuW0aRw==
+  dependencies:
+    ansi-styles "^6.2.1"
+    is-fullwidth-code-point "^4.0.0"
+
 "@ampproject/remapping@^2.2.0":
   version "2.3.0"
   resolved "https://registry.npmjs.org/@ampproject/remapping/-/remapping-2.3.0.tgz"
@@ -1527,6 +1535,21 @@
     lodash "^4.17.21"
     registry-auth-token "^5.1.0"
 
+"@oclif/table@^0.4.8":
+  version "0.4.8"
+  resolved "https://registry.yarnpkg.com/@oclif/table/-/table-0.4.8.tgz#38c38fc771ccc3754d2fe37f7279ce65c3d9ab8b"
+  integrity sha512-HgyeNTyUF67OQ2eOCFia0mfxyPFcPwa8sIq1SiiZf8oxw6JtUciWGXb0cmmo5vnbxRJ3er0PHLwMV0/hBG6NWw==
+  dependencies:
+    "@types/react" "^18.3.12"
+    change-case "^5.4.4"
+    cli-truncate "^4.0.0"
+    ink "5.0.1"
+    natural-orderby "^3.0.2"
+    object-hash "^3.0.0"
+    react "^18.3.1"
+    strip-ansi "^7.1.0"
+    wrap-ansi "^9.0.0"
+
 "@opentelemetry/api-logs@0.200.0":
   version "0.200.0"
   resolved "https://registry.yarnpkg.com/@opentelemetry/api-logs/-/api-logs-0.200.0.tgz#f9015fd844920c13968715b3cdccf5a4d4ff907e"
@@ -2888,6 +2911,19 @@
   resolved "https://registry.npmjs.org/@types/normalize-package-data/-/normalize-package-data-2.4.1.tgz"
   integrity sha512-Gj7cI7z+98M282Tqmp2K5EIsoouUEzbBJhQQzDE3jSIRk6r9gsz0oUokqIUR4u1R3dMHo0pDHM7sNOHyhulypw==
 
+"@types/prop-types@*":
+  version "15.7.15"
+  resolved "https://registry.yarnpkg.com/@types/prop-types/-/prop-types-15.7.15.tgz#e6e5a86d602beaca71ce5163fadf5f95d70931c7"
+  integrity sha512-F6bEyamV9jKGAFBEmlQnesRPGOQqS2+Uwi0Em15xenOxHaf2hv6L8YCVn3rPdPJOiJfPiCnLIRyvwVaqMY3MIw==
+
+"@types/react@^18.3.12":
+  version "18.3.23"
+  resolved "https://registry.yarnpkg.com/@types/react/-/react-18.3.23.tgz#86ae6f6b95a48c418fecdaccc8069e0fbb63696a"
+  integrity sha512-/LDXMQh55EzZQ0uVAZmKKhfENivEvWz6E+EYzh+/MCjMhNsotd+ZHhBGIjFDTi6+fz0OhQQQLbTgdQIxxCsC0w==
+  dependencies:
+    "@types/prop-types" "*"
+    csstype "^3.0.2"
+
 "@types/responselike@^1.0.0":
   version "1.0.3"
   resolved "https://registry.npmjs.org/@types/responselike/-/responselike-1.0.3.tgz"
@@ -3175,6 +3211,13 @@ ansi-escapes@^4.3.2:
   dependencies:
     type-fest "^0.21.3"
 
+ansi-escapes@^7.0.0:
+  version "7.0.0"
+  resolved "https://registry.yarnpkg.com/ansi-escapes/-/ansi-escapes-7.0.0.tgz#00fc19f491bbb18e1d481b97868204f92109bfe7"
+  integrity sha512-GdYO7a61mR0fOlAsvC9/rIHf7L96sBc6dEWzeOu+KAea5bZyQRPIpojrVoI4AXGJS/ycu/fBTdLrUkA4ODrvjw==
+  dependencies:
+    environment "^1.0.0"
+
 ansi-regex@^5.0.1:
   version "5.0.1"
   resolved "https://registry.npmjs.org/ansi-regex/-/ansi-regex-5.0.1.tgz"
@@ -3192,7 +3235,7 @@ ansi-styles@^4.0.0, ansi-styles@^4.1.0:
   dependencies:
     color-convert "^2.0.1"
 
-ansi-styles@^6.1.0:
+ansi-styles@^6.0.0, ansi-styles@^6.1.0, ansi-styles@^6.2.1:
   version "6.2.1"
   resolved "https://registry.npmjs.org/ansi-styles/-/ansi-styles-6.2.1.tgz"
   integrity sha512-bN798gFfQX+viw3R7yrGWRqnrN2oRkEkUjjl4JNn4E8GxxbjtG3FbrEIIY3l8/hrwUwIeCZvi4QuOTP4MErVug==
@@ -3418,6 +3461,11 @@ atomic-sleep@^1.0.0:
   resolved "https://registry.npmjs.org/atomic-sleep/-/atomic-sleep-1.0.0.tgz"
   integrity sha512-kNOjDqAh7px0XWNI+4QbzoiR/nTkHAWNud2uvnJquD1/x5a7EQZMJT0AczqK0Qn67oY/TTQ1LbUKajZpp3I9tQ==
 
+auto-bind@^5.0.1:
+  version "5.0.1"
+  resolved "https://registry.yarnpkg.com/auto-bind/-/auto-bind-5.0.1.tgz#50d8e63ea5a1dddcb5e5e36451c1a8266ffbb2ae"
+  integrity sha512-ooviqdwwgfIfNmDwo94wlshcdzfO64XV0Cg6oDsDYBJfITDz1EngD2z7DkbvCWn+XIMsIqW27sEVF6qcpJrRcg==
+
 available-typed-arrays@^1.0.7:
   version "1.0.7"
   resolved "https://registry.npmjs.org/available-typed-arrays/-/available-typed-arrays-1.0.7.tgz"
@@ -3709,6 +3757,11 @@ chalk@^5.0.0:
   resolved "https://registry.npmjs.org/chalk/-/chalk-5.3.0.tgz"
   integrity sha512-dLitG79d+GV1Nb/VYcCDFivJeK1hiukt9QjRNVOsUtTy1rR1YJsmpGGTZ3qJos+uw7WmWF4wUwBd9jxjocFC2w==
 
+chalk@^5.3.0:
+  version "5.4.1"
+  resolved "https://registry.yarnpkg.com/chalk/-/chalk-5.4.1.tgz#1b48bf0963ec158dce2aacf69c093ae2dd2092d8"
+  integrity sha512-zgVZuo2WcZgfUEmsn6eO3kINexW8RAE4maiQ8QNs8CtpPCSyMiYsULR3HQYkm3w8FIA3SberyMJMSldGsW+U3w==
+
 change-case@^4, change-case@^4.1.2:
   version "4.1.2"
   resolved "https://registry.npmjs.org/change-case/-/change-case-4.1.2.tgz"
@@ -3727,6 +3780,11 @@ change-case@^4, change-case@^4.1.2:
     snake-case "^3.0.4"
     tslib "^2.0.3"
 
+change-case@^5.4.4:
+  version "5.4.4"
+  resolved "https://registry.yarnpkg.com/change-case/-/change-case-5.4.4.tgz#0d52b507d8fb8f204343432381d1a6d7bff97a02"
+  integrity sha512-HRQyTk2/YPEkt9TnUPbOpr64Uw3KOicFWPVBb+xiHvd6eBx/qPr9xqfBFDT8P2vWsvvz4jbEkfDe71W3VyNu2w==
+
 character-entities-html4@^2.0.0:
   version "2.1.0"
   resolved "https://registry.npmjs.org/character-entities-html4/-/character-entities-html4-2.1.0.tgz"
@@ -3805,11 +3863,31 @@ clean-stack@^3.0.1:
   dependencies:
     escape-string-regexp "4.0.0"
 
+cli-boxes@^3.0.0:
+  version "3.0.0"
+  resolved "https://registry.yarnpkg.com/cli-boxes/-/cli-boxes-3.0.0.tgz#71a10c716feeba005e4504f36329ef0b17cf3145"
+  integrity sha512-/lzGpEWL/8PfI0BmBOPRwp0c/wFNX1RdUML3jK/RcSBA9T8mZDdQpqYBKtCFTOfQbwPqWEOpjqW+Fnayc0969g==
+
+cli-cursor@^4.0.0:
+  version "4.0.0"
+  resolved "https://registry.yarnpkg.com/cli-cursor/-/cli-cursor-4.0.0.tgz#3cecfe3734bf4fe02a8361cbdc0f6fe28c6a57ea"
+  integrity sha512-VGtlMu3x/4DOtIUwEkRezxUZ2lBacNJCHash0N0WeZDBS+7Ux1dm3XWAgWYxLJFMMdOeXMHXorshEFhbMSGelg==
+  dependencies:
+    restore-cursor "^4.0.0"
+
 cli-spinners@^2.9.2:
   version "2.9.2"
   resolved "https://registry.npmjs.org/cli-spinners/-/cli-spinners-2.9.2.tgz"
   integrity sha512-ywqV+5MmyL4E7ybXgKys4DugZbX0FC6LnwrhjuykIjnK9k8OQacQ7axGKnjDXWNhns0xot3bZI5h55H8yo9cJg==
 
+cli-truncate@^4.0.0:
+  version "4.0.0"
+  resolved "https://registry.yarnpkg.com/cli-truncate/-/cli-truncate-4.0.0.tgz#6cc28a2924fee9e25ce91e973db56c7066e6172a"
+  integrity sha512-nPdaFdQ0h/GEigbPClz11D0v/ZJEwxmeVZGeMo3Z5StPtUTkA9o1lD6QwoirYiSDzbcwn2XcjwmCp68W1IS4TA==
+  dependencies:
+    slice-ansi "^5.0.0"
+    string-width "^7.0.0"
+
 cli-width@^4.1.0:
   version "4.1.0"
   resolved "https://registry.npmjs.org/cli-width/-/cli-width-4.1.0.tgz"
@@ -3873,6 +3951,13 @@ cmdk@^1.0.4:
     "@radix-ui/react-id" "^1.1.0"
     "@radix-ui/react-primitive" "^2.0.2"
 
+code-excerpt@^4.0.0:
+  version "4.0.0"
+  resolved "https://registry.yarnpkg.com/code-excerpt/-/code-excerpt-4.0.0.tgz#2de7d46e98514385cb01f7b3b741320115f4c95e"
+  integrity sha512-xxodCmBen3iy2i0WtAK8FlFNrRzjUqjRsMfho58xT/wvZU1YTM3fCnRjcy1gJPMepaRlgm/0e6w8SpWHpn3/cA==
+  dependencies:
+    convert-to-spaces "^2.0.1"
+
 color-convert@^2.0.1:
   version "2.0.1"
   resolved "https://registry.npmjs.org/color-convert/-/color-convert-2.0.1.tgz"
@@ -4021,6 +4106,11 @@ convert-source-map@^2.0.0:
   resolved "https://registry.npmjs.org/convert-source-map/-/convert-source-map-2.0.0.tgz"
   integrity sha512-Kvp459HrV2FEJ1CAsi1Ku+MY3kasH19TFykTz2xWmMeq6bk2NU3XXvfJ+Q61m0xktWwt+1HSYf3JZsTms3aRJg==
 
+convert-to-spaces@^2.0.1:
+  version "2.0.1"
+  resolved "https://registry.yarnpkg.com/convert-to-spaces/-/convert-to-spaces-2.0.1.tgz#61a6c98f8aa626c16b296b862a91412a33bceb6b"
+  integrity sha512-rcQ1bsQO9799wq24uE5AM2tAILy4gXGIK/njFWcVQkGNZ96edlpY+A7bjwvzjYvLDyzmG1MmMLZhpcsb+klNMQ==
+
 cookie-signature@^1.2.1:
   version "1.2.2"
   resolved "https://registry.npmjs.org/cookie-signature/-/cookie-signature-1.2.2.tgz"
@@ -4092,6 +4182,11 @@ csprng@*:
   dependencies:
     sequin "*"
 
+csstype@^3.0.2:
+  version "3.1.3"
+  resolved "https://registry.yarnpkg.com/csstype/-/csstype-3.1.3.tgz#d80ff294d114fb0e6ac500fbf85b60137d7eff81"
+  integrity sha512-M1uQkMl8rQK/szD0LNhtqxIPLpimGm8sOBwU7lLnCpSbTyY3yeU1Vc7l4KT5zT4s/yOxHH5O7tIuuLOCnLADRw==
+
 csv-parse@^5.5.2:
   version "5.6.0"
   resolved "https://registry.npmjs.org/csv-parse/-/csv-parse-5.6.0.tgz"
@@ -4405,6 +4500,11 @@ emoji-regex-xs@^1.0.0:
   resolved "https://registry.npmjs.org/emoji-regex-xs/-/emoji-regex-xs-1.0.0.tgz"
   integrity sha512-LRlerrMYoIDrT6jgpeZ2YYl/L8EulRTt5hQcYjy5AInh7HWXKimpqx68aknBFpGL2+/IcogTcaydJEgaTmOpDg==
 
+emoji-regex@^10.3.0:
+  version "10.4.0"
+  resolved "https://registry.yarnpkg.com/emoji-regex/-/emoji-regex-10.4.0.tgz#03553afea80b3975749cfcb36f776ca268e413d4"
+  integrity sha512-EC+0oUMY1Rqm4O6LLrgjtYDvcVYTy7chDnM4Q7030tP4Kwj3u/pR6gP9ygnp2CJMK5Gq+9Q2oqmrFJAz01DXjw==
+
 emoji-regex@^8.0.0:
   version "8.0.0"
   resolved "https://registry.npmjs.org/emoji-regex/-/emoji-regex-8.0.0.tgz"
@@ -4432,6 +4532,11 @@ entities@^4.2.0, entities@^4.4.0, entities@^4.5.0:
   resolved "https://registry.npmjs.org/entities/-/entities-4.5.0.tgz"
   integrity sha512-V0hjH4dGPh9Ao5p0MoRY6BVqtwCjhz6vI5LT8AJ55H+4g9/4vbHx1I54fS0XuclLhDHArPQCiMjDxjaL8fPxhw==
 
+environment@^1.0.0:
+  version "1.1.0"
+  resolved "https://registry.yarnpkg.com/environment/-/environment-1.1.0.tgz#8e86c66b180f363c7ab311787e0259665f45a9f1"
+  integrity sha512-xUtoPkMggbz0MPyPiIWr1Kp4aeWJjDZ6SMvURhimjdZgsRuDplF5/s9hcgGhyXMhs+6vpnuoiZ2kFiu3FMnS8Q==
+
 error-ex@^1.3.1:
   version "1.3.2"
   resolved "https://registry.npmjs.org/error-ex/-/error-ex-1.3.2.tgz"
@@ -4558,6 +4663,11 @@ escape-string-regexp@^1.0.5:
   resolved "https://registry.npmjs.org/escape-string-regexp/-/escape-string-regexp-1.0.5.tgz"
   integrity sha1-G2HAViGQqN/2rjuyzwIAyhMLhtQ=
 
+escape-string-regexp@^2.0.0:
+  version "2.0.0"
+  resolved "https://registry.yarnpkg.com/escape-string-regexp/-/escape-string-regexp-2.0.0.tgz#a30304e99daa32e23b2fd20f51babd07cffca344"
+  integrity sha512-UpzcLCXolUWcNu5HtVMHYdXJjArjsF9C0aNnquZYY4uW/Vu0miy5YoWvbV345HauVvcAUnpRuhMMcqTcGOY2+w==
+
 escodegen@^2.1.0:
   version "2.1.0"
   resolved "https://registry.npmjs.org/escodegen/-/escodegen-2.1.0.tgz"
@@ -5204,6 +5314,11 @@ get-caller-file@^2.0.1, get-caller-file@^2.0.5:
   resolved "https://registry.npmjs.org/get-caller-file/-/get-caller-file-2.0.5.tgz"
   integrity sha512-DyFP3BM/3YHTQOCUL/w0OZHR0lpKeGrxotcHWcqNEdnltqFwXVfhEBQ94eIo34AfQpo0rGki4cyIiftY06h2Fg==
 
+get-east-asian-width@^1.0.0:
+  version "1.3.0"
+  resolved "https://registry.yarnpkg.com/get-east-asian-width/-/get-east-asian-width-1.3.0.tgz#21b4071ee58ed04ee0db653371b55b4299875389"
+  integrity sha512-vpeMIQKxczTD/0s2CdEWHcb0eeJe6TFjxb+J5xgX7hScxqrGuyjmv4c1D4A/gelKfyox0gJJwIHF+fLjeaM8kQ==
+
 get-func-name@^2.0.1, get-func-name@^2.0.2:
   version "2.0.2"
   resolved "https://registry.npmjs.org/get-func-name/-/get-func-name-2.0.2.tgz"
@@ -5728,6 +5843,11 @@ indent-string@^4.0.0:
   resolved "https://registry.npmjs.org/indent-string/-/indent-string-4.0.0.tgz"
   integrity sha512-EdDDZu4A2OyIK7Lr/2zG+w5jmbuk1DVBnEwREQvBzspBJkCEbRa8GxU1lghYcaGJCnRWibjDXlq779X1/y5xwg==
 
+indent-string@^5.0.0:
+  version "5.0.0"
+  resolved "https://registry.yarnpkg.com/indent-string/-/indent-string-5.0.0.tgz#4fd2980fccaf8622d14c64d694f4cf33c81951a5"
+  integrity sha512-m6FAo/spmsW2Ab2fU35JTYwtOKa2yAwXSwgjSv1TJzh4Mh7mC3lzAOVLBprb72XsTrgkEIsl7YrFNAiDiRhIGg==
+
 inflight@^1.0.4:
   version "1.0.6"
   resolved "https://registry.npmjs.org/inflight/-/inflight-1.0.6.tgz"
@@ -5751,6 +5871,36 @@ ini@^4.1.3:
   resolved "https://registry.npmjs.org/ini/-/ini-4.1.3.tgz"
   integrity sha512-X7rqawQBvfdjS10YU1y1YVreA3SsLrW9dX2CewP2EbBJM4ypVNLDkO5y04gejPwKIY9lR+7r9gn3rFPt/kmWFg==
 
+ink@5.0.1:
+  version "5.0.1"
+  resolved "https://registry.yarnpkg.com/ink/-/ink-5.0.1.tgz#f2ef9796a3911830c3995dedd227ec84ae27de4b"
+  integrity sha512-ae4AW/t8jlkj/6Ou21H2av0wxTk8vrGzXv+v2v7j4in+bl1M5XRMVbfNghzhBokV++FjF8RBDJvYo+ttR9YVRg==
+  dependencies:
+    "@alcalzone/ansi-tokenize" "^0.1.3"
+    ansi-escapes "^7.0.0"
+    ansi-styles "^6.2.1"
+    auto-bind "^5.0.1"
+    chalk "^5.3.0"
+    cli-boxes "^3.0.0"
+    cli-cursor "^4.0.0"
+    cli-truncate "^4.0.0"
+    code-excerpt "^4.0.0"
+    indent-string "^5.0.0"
+    is-in-ci "^0.1.0"
+    lodash "^4.17.21"
+    patch-console "^2.0.0"
+    react-reconciler "^0.29.0"
+    scheduler "^0.23.0"
+    signal-exit "^3.0.7"
+    slice-ansi "^7.1.0"
+    stack-utils "^2.0.6"
+    string-width "^7.0.0"
+    type-fest "^4.8.3"
+    widest-line "^5.0.0"
+    wrap-ansi "^9.0.0"
+    ws "^8.15.0"
+    yoga-wasm-web "~0.3.3"
+
 internal-slot@^1.0.7:
   version "1.0.7"
   resolved "https://registry.npmjs.org/internal-slot/-/internal-slot-1.0.7.tgz"
@@ -5859,6 +6009,18 @@ is-fullwidth-code-point@^3.0.0:
   resolved "https://registry.npmjs.org/is-fullwidth-code-point/-/is-fullwidth-code-point-3.0.0.tgz"
   integrity sha512-zymm5+u+sCsSWyD9qNaejV3DFvhCKclKdizYaJUuHA83RLjb7nSuGnddCHGv0hk+KY7BMAlsWeK4Ueg6EV6XQg==
 
+is-fullwidth-code-point@^4.0.0:
+  version "4.0.0"
+  resolved "https://registry.yarnpkg.com/is-fullwidth-code-point/-/is-fullwidth-code-point-4.0.0.tgz#fae3167c729e7463f8461ce512b080a49268aa88"
+  integrity sha512-O4L094N2/dZ7xqVdrXhh9r1KODPJpFms8B5sGdJLPy664AgvXsreZUyCQQNItZRDlYug4xStLjNp/sz3HvBowQ==
+
+is-fullwidth-code-point@^5.0.0:
+  version "5.0.0"
+  resolved "https://registry.yarnpkg.com/is-fullwidth-code-point/-/is-fullwidth-code-point-5.0.0.tgz#9609efced7c2f97da7b60145ef481c787c7ba704"
+  integrity sha512-OVa3u9kkBbw7b8Xw5F9P+D/T9X+Z4+JruYVNapTjPYZYUznQ5YfWeFkOj606XYYW8yugTfC8Pj0hYqvi4ryAhA==
+  dependencies:
+    get-east-asian-width "^1.0.0"
+
 is-glob@^4.0.0, is-glob@^4.0.1, is-glob@^4.0.3, is-glob@~4.0.1:
   version "4.0.3"
   resolved "https://registry.npmjs.org/is-glob/-/is-glob-4.0.3.tgz"
@@ -5866,6 +6028,11 @@ is-glob@^4.0.0, is-glob@^4.0.1, is-glob@^4.0.3, is-glob@~4.0.1:
   dependencies:
     is-extglob "^2.1.1"
 
+is-in-ci@^0.1.0:
+  version "0.1.0"
+  resolved "https://registry.yarnpkg.com/is-in-ci/-/is-in-ci-0.1.0.tgz#5e07d6a02ec3a8292d3f590973357efa3fceb0d3"
+  integrity sha512-d9PXLEY0v1iJ64xLiQMJ51J128EYHAaOR4yZqQi8aHGfw6KgifM3/Viw1oZZ1GCVmb3gBuyhLyHj0HgR2DhSXQ==
+
 is-inside-container@^1.0.0:
   version "1.0.0"
   resolved "https://registry.npmjs.org/is-inside-container/-/is-inside-container-1.0.0.tgz"
@@ -6888,6 +7055,11 @@ natural-compare@^1.4.0:
   resolved "https://registry.npmjs.org/natural-compare/-/natural-compare-1.4.0.tgz"
   integrity sha1-Sr6/7tdUHywnrPspvbvRXI1bpPc=
 
+natural-orderby@^3.0.2:
+  version "3.0.2"
+  resolved "https://registry.yarnpkg.com/natural-orderby/-/natural-orderby-3.0.2.tgz#1b874d685fbd68beab2c6e7d14f298e03d631ec3"
+  integrity sha512-x7ZdOwBxZCEm9MM7+eQCjkrNLrW3rkBKNHVr78zbtqnMGVNlnDi6C/eUEYgxHNrcbu0ymvjzcwIL/6H1iHri9g==
+
 negotiator@^1.0.0:
   version "1.0.0"
   resolved "https://registry.npmjs.org/negotiator/-/negotiator-1.0.0.tgz"
@@ -7036,6 +7208,11 @@ object-assign@^4:
   resolved "https://registry.npmjs.org/object-assign/-/object-assign-4.1.1.tgz"
   integrity sha512-rJgTQnkUnH1sFw8yT6VSU3zD3sWmu6sZhIseY8VX+GRu3P6F7Fu+JNDoXfklElbLJSnc3FUQHVe4cU5hj+BcUg==
 
+object-hash@^3.0.0:
+  version "3.0.0"
+  resolved "https://registry.yarnpkg.com/object-hash/-/object-hash-3.0.0.tgz#73f97f753e7baffc0e2cc9d6e079079744ac82e9"
+  integrity sha512-RSn9F68PjH9HqtltsSnqYC1XXoWe9Bju5+213R98cNGttag9q9yAOTzdbsqvIa7aNm5WffBZFpWYr2aWrklWAw==
+
 object-inspect@^1.13.1, object-inspect@^1.13.3:
   version "1.13.4"
   resolved "https://registry.npmjs.org/object-inspect/-/object-inspect-1.13.4.tgz"
@@ -7310,6 +7487,11 @@ pascal-case@^3.1.2:
     no-case "^3.0.4"
     tslib "^2.0.3"
 
+patch-console@^2.0.0:
+  version "2.0.0"
+  resolved "https://registry.yarnpkg.com/patch-console/-/patch-console-2.0.0.tgz#9023f4665840e66f40e9ce774f904a63167433bb"
+  integrity sha512-0YNdUceMdaQwoKce1gatDScmMo5pu/tfABfnzEqeG0gtTmd7mh/WcwgUjtAeOU7N8nFFlbQBnFK2gXW5fGvmMA==
+
 path-browserify@^1.0.1:
   version "1.0.1"
   resolved "https://registry.npmjs.org/path-browserify/-/path-browserify-1.0.1.tgz"
@@ -7683,6 +7865,14 @@ react-dom@^18.3.1:
     loose-envify "^1.1.0"
     scheduler "^0.23.2"
 
+react-reconciler@^0.29.0:
+  version "0.29.2"
+  resolved "https://registry.yarnpkg.com/react-reconciler/-/react-reconciler-0.29.2.tgz#8ecfafca63549a4f4f3e4c1e049dd5ad9ac3a54f"
+  integrity sha512-zZQqIiYgDCTP/f1N/mAR10nJGrPD2ZR+jDSEsKWJHYC7Cm2wodlwbR3upZRdC3cjIjSlTLNVyO7Iu0Yy7t2AYg==
+  dependencies:
+    loose-envify "^1.1.0"
+    scheduler "^0.23.2"
+
 react-remove-scroll-bar@^2.3.7:
   version "2.3.8"
   resolved "https://registry.npmjs.org/react-remove-scroll-bar/-/react-remove-scroll-bar-2.3.8.tgz"
@@ -7931,6 +8121,14 @@ responselike@^3.0.0:
   dependencies:
     lowercase-keys "^3.0.0"
 
+restore-cursor@^4.0.0:
+  version "4.0.0"
+  resolved "https://registry.yarnpkg.com/restore-cursor/-/restore-cursor-4.0.0.tgz#519560a4318975096def6e609d44100edaa4ccb9"
+  integrity sha512-I9fPXU9geO9bHOt9pHHOhOkYerIMsmVaWB0rA2AI9ERh/+x/i7MV5HKBNrg+ljO5eoPVgCcnFuRjJ9uH6I/3eg==
+  dependencies:
+    onetime "^5.1.0"
+    signal-exit "^3.0.2"
+
 retry@0.13.1:
   version "0.13.1"
   resolved "https://registry.npmjs.org/retry/-/retry-0.13.1.tgz"
@@ -8027,7 +8225,7 @@ sax@>=0.6.0:
   resolved "https://registry.npmjs.org/sax/-/sax-1.3.0.tgz"
   integrity sha512-0s+oAmw9zLl1V1cS9BtZN7JAd0cW5e0QH4W3LWEK6a4LaLEA2OTpGYWDY+6XasBLtz6wkm3u1xRw95mRuJ59WA==
 
-scheduler@^0.23.2:
+scheduler@^0.23.0, scheduler@^0.23.2:
   version "0.23.2"
   resolved "https://registry.npmjs.org/scheduler/-/scheduler-0.23.2.tgz"
   integrity sha512-UOShsPwz7NrMUqhR6t0hWjFduvOzbtv7toDH1/hIrfRNIDBnnBWd0CwJTGvTpngVlmwGCdP9/Zl/tVrDqcuYzQ==
@@ -8257,7 +8455,7 @@ side-channel@^1.0.4, side-channel@^1.1.0:
     side-channel-map "^1.0.1"
     side-channel-weakmap "^1.0.2"
 
-signal-exit@^3.0.2, signal-exit@^3.0.3:
+signal-exit@^3.0.2, signal-exit@^3.0.3, signal-exit@^3.0.7:
   version "3.0.7"
   resolved "https://registry.npmjs.org/signal-exit/-/signal-exit-3.0.7.tgz"
   integrity sha512-wnD2ZE+l+SPC/uoS0vXeE9L1+0wuaMqKlfz9AMUo38JsyLSBWSFcHR1Rri62LZc12vLr1gb3jl7iwQhgwpAbGQ==
@@ -8310,6 +8508,22 @@ slash@^3.0.0:
   resolved "https://registry.npmjs.org/slash/-/slash-3.0.0.tgz"
   integrity sha512-g9Q1haeby36OSStwb4ntCGGGaKsaVSjQ68fBxoQcutl5fS1vuY18H3wSt3jFyFtrkx+Kz0V1G85A4MyAdDMi2Q==
 
+slice-ansi@^5.0.0:
+  version "5.0.0"
+  resolved "https://registry.yarnpkg.com/slice-ansi/-/slice-ansi-5.0.0.tgz#b73063c57aa96f9cd881654b15294d95d285c42a"
+  integrity sha512-FC+lgizVPfie0kkhqUScwRu1O/lF6NOgJmlCgK+/LYxDCTk8sGelYaHDhFcDN+Sn3Cv+3VSa4Byeo+IMCzpMgQ==
+  dependencies:
+    ansi-styles "^6.0.0"
+    is-fullwidth-code-point "^4.0.0"
+
+slice-ansi@^7.1.0:
+  version "7.1.0"
+  resolved "https://registry.yarnpkg.com/slice-ansi/-/slice-ansi-7.1.0.tgz#cd6b4655e298a8d1bdeb04250a433094b347b9a9"
+  integrity sha512-bSiSngZ/jWeX93BqeIAbImyTbEihizcwNjFoRUIY/T1wWQsfsm2Vw1agPKylXvQTU7iASGdHhyqRlqQzfz+Htg==
+  dependencies:
+    ansi-styles "^6.2.1"
+    is-fullwidth-code-point "^5.0.0"
+
 smart-buffer@^4.2.0:
   version "4.2.0"
   resolved "https://registry.npmjs.org/smart-buffer/-/smart-buffer-4.2.0.tgz"
@@ -8470,6 +8684,13 @@ stack-chain@^1.3.7:
   resolved "https://registry.yarnpkg.com/stack-chain/-/stack-chain-1.3.7.tgz#d192c9ff4ea6a22c94c4dd459171e3f00cea1285"
   integrity sha512-D8cWtWVdIe/jBA7v5p5Hwl5yOSOrmZPWDPe2KxQ5UAGD+nxbxU0lKXA4h85Ta6+qgdKVL3vUxsbIZjc1kBG7ug==
 
+stack-utils@^2.0.6:
+  version "2.0.6"
+  resolved "https://registry.yarnpkg.com/stack-utils/-/stack-utils-2.0.6.tgz#aaf0748169c02fc33c8232abccf933f54a1cc34f"
+  integrity sha512-XlkWvfIm6RmsWtNJx+uqtKLS8eqFbxUg0ZzLXqY0caEy9l7hruX8IpiDnjsLavoBgqCCR71TqWO8MaXYheJ3RQ==
+  dependencies:
+    escape-string-regexp "^2.0.0"
+
 statuses@2.0.1, statuses@^2.0.1:
   version "2.0.1"
   resolved "https://registry.npmjs.org/statuses/-/statuses-2.0.1.tgz"
@@ -8493,6 +8714,15 @@ string-width@^5.0.1, string-width@^5.1.2:
     emoji-regex "^9.2.2"
     strip-ansi "^7.0.1"
 
+string-width@^7.0.0:
+  version "7.2.0"
+  resolved "https://registry.yarnpkg.com/string-width/-/string-width-7.2.0.tgz#b5bb8e2165ce275d4d43476dd2700ad9091db6dc"
+  integrity sha512-tsaTIkKW9b4N+AEj+SVA+WhJzV7/zMhcSu78mLKWSk7cXMOSHsBKFWUs0fWwq8QyK3MgJBQRX6Gbi4kYbdvGkQ==
+  dependencies:
+    emoji-regex "^10.3.0"
+    get-east-asian-width "^1.0.0"
+    strip-ansi "^7.1.0"
+
 string.prototype.trim@^1.2.8:
   version "1.2.8"
   resolved "https://registry.npmjs.org/string.prototype.trim/-/string.prototype.trim-1.2.8.tgz"
@@ -8549,7 +8779,7 @@ stringify-entities@^4.0.0:
   dependencies:
     ansi-regex "^5.0.1"
 
-strip-ansi@^7.0.1:
+strip-ansi@^7.0.1, strip-ansi@^7.1.0:
   version "7.1.0"
   resolved "https://registry.npmjs.org/strip-ansi/-/strip-ansi-7.1.0.tgz"
   integrity sha512-iq6eVVI64nQQTRYq2KtEg2d2uU7LElhTJwsH4YzIHZshxlgZms/wIc4VoDQTlG/IvVIrBKG06CrZnp0qv7hkcQ==
@@ -8822,6 +9052,11 @@ type-fest@^0.8.0, type-fest@^0.8.1:
   resolved "https://registry.npmjs.org/type-fest/-/type-fest-0.8.1.tgz"
   integrity sha512-4dbzIzqvjtgiM5rw1k5rEHtBANKmdudhGyBEajN01fEyhaAIhsoKNy6y7+IN93IfpFtwY9iqi7kD+xwKhQsNJA==
 
+type-fest@^4.8.3:
+  version "4.41.0"
+  resolved "https://registry.yarnpkg.com/type-fest/-/type-fest-4.41.0.tgz#6ae1c8e5731273c2bf1f58ad39cbae2c91a46c58"
+  integrity sha512-TeTSQ6H5YHvpqVwBRcnLDCBnDOHWYu7IvGbHT6N8AOymcr9PJGjc1GTtiWZTYg0NCgYwvnYWEkVChQAr9bjfwA==
+
 type-is@^2.0.0, type-is@^2.0.1:
   version "2.0.1"
   resolved "https://registry.npmjs.org/type-is/-/type-is-2.0.1.tgz"
@@ -9167,6 +9402,13 @@ widest-line@^3.1.0:
   dependencies:
     string-width "^4.0.0"
 
+widest-line@^5.0.0:
+  version "5.0.0"
+  resolved "https://registry.yarnpkg.com/widest-line/-/widest-line-5.0.0.tgz#b74826a1e480783345f0cd9061b49753c9da70d0"
+  integrity sha512-c9bZp7b5YtRj2wOe6dlj32MK+Bx/M/d+9VB2SHM1OtsUHR0aV0tdP6DWh/iMt0kWi1t5g1Iudu6hQRNd1A4PVA==
+  dependencies:
+    string-width "^7.0.0"
+
 wireit@^0.14.12:
   version "0.14.12"
   resolved "https://registry.npmjs.org/wireit/-/wireit-0.14.12.tgz"
@@ -9215,6 +9457,15 @@ wrap-ansi@^8.1.0:
     string-width "^5.0.1"
     strip-ansi "^7.0.1"
 
+wrap-ansi@^9.0.0:
+  version "9.0.0"
+  resolved "https://registry.yarnpkg.com/wrap-ansi/-/wrap-ansi-9.0.0.tgz#1a3dc8b70d85eeb8398ddfb1e4a02cd186e58b3e"
+  integrity sha512-G8ura3S+3Z2G+mkgNRq8dqaFZAuxfsxpBB8OCTGRTCtp+l/v9nbFNmCUP1BZMts3G1142MsZfn6eeUKrr4PD1Q==
+  dependencies:
+    ansi-styles "^6.2.1"
+    string-width "^7.0.0"
+    strip-ansi "^7.1.0"
+
 wrappy@1:
   version "1.0.2"
   resolved "https://registry.npmjs.org/wrappy/-/wrappy-1.0.2.tgz"
@@ -9230,6 +9481,11 @@ write-file-atomic@^3.0.0:
     signal-exit "^3.0.2"
     typedarray-to-buffer "^3.1.5"
 
+ws@^8.15.0:
+  version "8.18.2"
+  resolved "https://registry.yarnpkg.com/ws/-/ws-8.18.2.tgz#42738b2be57ced85f46154320aabb51ab003705a"
+  integrity sha512-DMricUmwGZUVr++AEAe2uiVM7UoO9MAVZMDu05UQOaUII0lp+zOzLLU4Xqh/JvTqklB1T4uELaaPBKyjE1r4fQ==
+
 ws@^8.18.0:
   version "8.18.1"
   resolved "https://registry.npmjs.org/ws/-/ws-8.18.1.tgz"
@@ -9364,6 +9620,11 @@ yoctocolors-cjs@^2.1.2:
   resolved "https://registry.npmjs.org/yoctocolors-cjs/-/yoctocolors-cjs-2.1.2.tgz"
   integrity sha512-cYVsTjKl8b+FrnidjibDWskAv7UKOfcwaVZdp/it9n1s9fU3IkgDbhdIRKCW4JDsAlECJY0ytoVPT3sK6kideA==
 
+yoga-wasm-web@~0.3.3:
+  version "0.3.3"
+  resolved "https://registry.yarnpkg.com/yoga-wasm-web/-/yoga-wasm-web-0.3.3.tgz#eb8e9fcb18e5e651994732f19a220cb885d932ba"
+  integrity sha512-N+d4UJSJbt/R3wqY7Coqs5pcV0aUj2j9IaQ3rNj9bVCLld8tTGKRa2USARjnvZJWVx1NDmQev8EknoczaOQDOA==
+
 zod-to-json-schema@^3.24.1:
   version "3.24.5"
   resolved "https://registry.npmjs.org/zod-to-json-schema/-/zod-to-json-schema-3.24.5.tgz"

From 5cefe3dbc448e27a90bf16848cd60a1fd98993be Mon Sep 17 00:00:00 2001
From: Mike Donnalley <mdonnalley@salesforce.com>
Date: Mon, 16 Jun 2025 15:16:28 -0600
Subject: [PATCH 18/51] chore: clean up

---
 package.json |  3 ++-
 test/llmg.ts | 26 ++++++++++++++------------
 2 files changed, 16 insertions(+), 13 deletions(-)

diff --git a/package.json b/package.json
index 7f6214f5..0063f6b6 100644
--- a/package.json
+++ b/package.json
@@ -25,7 +25,8 @@
     "prepare": "sf-install",
     "start": "yarn build && npm link && mcp-inspector sf-mcp-server",
     "test": "wireit",
-    "test:only": "wireit"
+    "test:only": "wireit",
+    "test:llmg": "node --loader ts-node/esm test/llmg.ts"
   },
   "repository": "salesforcecli/mcp",
   "bugs": {
diff --git a/test/llmg.ts b/test/llmg.ts
index d78dd5e6..713c65df 100644
--- a/test/llmg.ts
+++ b/test/llmg.ts
@@ -207,18 +207,6 @@ const generateResponse = async (
   };
 };
 
-const models = [
-  'llmgateway__OpenAIGPT35Turbo_01_25',
-  'llmgateway__OpenAIGPT4OmniMini',
-  'llmgateway__BedrockAnthropicClaude4Sonnet',
-];
-
-const prompts = [
-  "What's my salesforce username?",
-  'List all my orgs',
-  'Deploy my project (~/my-project) using the my-sf-org alias',
-];
-
 async function displayModelResponses(prompt: string) {
   const responses = await Promise.all(models.map((model) => generateResponse(prompt, model)));
 
@@ -244,6 +232,20 @@ async function displayModelResponses(prompt: string) {
   });
 }
 
+const models = [
+  'llmgateway__OpenAIGPT35Turbo_01_25',
+  'llmgateway__OpenAIGPT4OmniMini',
+  'llmgateway__BedrockAnthropicClaude4Sonnet',
+];
+
+const prompts = [
+  "What's my salesforce username?",
+  'List all my orgs',
+  'Deploy my project (~/my-project) using the my-sf-org alias',
+];
+
+// eslint-disable-next-line no-console
+console.log();
 for (const prompt of prompts) {
   // eslint-disable-next-line no-await-in-loop
   await displayModelResponses(prompt);

From 15aa4402f19795cf83bc880bdcea27c0c342c0e9 Mon Sep 17 00:00:00 2001
From: Mike Donnalley <mdonnalley@salesforce.com>
Date: Mon, 16 Jun 2025 15:36:04 -0600
Subject: [PATCH 19/51] test: list token counts

---
 package.json |  2 +-
 test/llmg.ts | 62 +++++++++++++++++++++++++++++++++++++++++++++++-----
 2 files changed, 58 insertions(+), 6 deletions(-)

diff --git a/package.json b/package.json
index 0063f6b6..8dae24ea 100644
--- a/package.json
+++ b/package.json
@@ -26,7 +26,7 @@
     "start": "yarn build && npm link && mcp-inspector sf-mcp-server",
     "test": "wireit",
     "test:only": "wireit",
-    "test:llmg": "node --loader ts-node/esm test/llmg.ts"
+    "test:llmg": "node --no-warnings --loader ts-node/esm test/llmg.ts"
   },
   "repository": "salesforcecli/mcp",
   "bugs": {
diff --git a/test/llmg.ts b/test/llmg.ts
index 713c65df..16cdc46c 100644
--- a/test/llmg.ts
+++ b/test/llmg.ts
@@ -24,6 +24,7 @@ if (!API_KEY) {
 import { spawn } from 'node:child_process';
 import { Tool } from '@modelcontextprotocol/sdk/types.js';
 import { printTable } from '@oclif/table';
+import { stdout } from '@oclif/core/ux';
 
 type InvocableTool = {
   name: string;
@@ -48,6 +49,44 @@ type GatewayResponse = {
   };
 };
 
+/**
+ * Approximates token count for a JSON object using a simple algorithm.
+ * This is a rough approximation and may not match exact token counts from specific LLMs.
+ *
+ * For comparison, here are the token counts:
+ *
+ * | Tool                  | OpenAI | countTokens |
+ * |----------------------|---------|-------------|
+ * | sf-get-username      | 632     | 702         |
+ * | sf-list-all-orgs     | 262     | 283         |
+ * | sf-query-org         | 405     | 416         |
+ * | sf-assign-permission | 609     | 631         |
+ * | sf-deploy-metadata   | 779     | 809         |
+ * | sf-retrieve-metadata | 551     | 592         |
+ *
+ * @param obj - The JSON object to count tokens for
+ * @returns Approximate number of tokens
+ */
+function countTokens(obj: unknown): number {
+  // Convert object to string representation
+  const jsonStr = JSON.stringify(obj);
+
+  // Split into words and count
+  const words = jsonStr.split(/\s+/);
+
+  // Count tokens (rough approximation)
+  let tokenCount = 0;
+  for (const word of words) {
+    // Estimate roughly one token per four characters of the word, rounded up
+    tokenCount += Math.ceil(word.length / 4);
+
+    // Add tokens for special characters
+    tokenCount += (word.match(/[{}[\],:]/g) || []).length;
+  }
+
+  return tokenCount;
+}
+
 const getToolsList = async (): Promise<InvocableTool[]> => {
   const toolsList: string = await new Promise<string>((resolve, reject) => {
     const child = spawn('npx', [
@@ -81,6 +120,22 @@ const getToolsList = async (): Promise<InvocableTool[]> => {
   });
 
   const parsedToolsList = JSON.parse(toolsList) as { tools: Tool[] };
+
+  const toolsWithTokens = parsedToolsList.tools?.map((tool) => ({
+    tool: tool.name,
+    tokens: countTokens(tool),
+  }));
+
+  printTable({
+    title: 'Tools List',
+    data: toolsWithTokens,
+    columns: ['tool', { key: 'tokens', name: 'Approximate Tokens' }],
+    headerOptions: {
+      formatter: 'capitalCase',
+    },
+  });
+  stdout('Total tokens: ' + toolsWithTokens.reduce((acc, tool) => acc + tool.tokens, 0));
+
   return (parsedToolsList.tools ?? []).map((tool) => ({
     name: tool.name,
     function: {
@@ -199,10 +254,8 @@ const generateResponse = async (
     }
   );
 
-  const json = (await response.json()) as GatewayResponse;
-
   return {
-    response: json,
+    response: (await response.json()) as GatewayResponse,
     model,
   };
 };
@@ -244,8 +297,7 @@ const prompts = [
   'Deploy my project (~/my-project) using the my-sf-org alias',
 ];
 
-// eslint-disable-next-line no-console
-console.log();
+stdout();
 for (const prompt of prompts) {
   // eslint-disable-next-line no-await-in-loop
   await displayModelResponses(prompt);

From d18028fc7377b4aecd61f53a43009820ddbc3628 Mon Sep 17 00:00:00 2001
From: Mike Donnalley <mdonnalley@salesforce.com>
Date: Mon, 16 Jun 2025 16:47:41 -0600
Subject: [PATCH 20/51] test: make more developer friendly

---
 .gitignore   |   1 +
 package.json |   3 +-
 test/llmg.ts | 119 +++++++++++++++++++++++++++++++++++++++++----------
 yarn.lock    |   5 +++
 4 files changed, 105 insertions(+), 23 deletions(-)

diff --git a/.gitignore b/.gitignore
index 5712aee8..64e581f3 100644
--- a/.gitignore
+++ b/.gitignore
@@ -49,3 +49,4 @@ node_modules
 
 .cursor
 src/tools/test.ts
+llmg-test.yml
diff --git a/package.json b/package.json
index 8dae24ea..e5acc9b0 100644
--- a/package.json
+++ b/package.json
@@ -62,7 +62,8 @@
     "oclif": "^4.18.0",
     "ts-node": "^10.9.2",
     "ts-patch": "^3.3.0",
-    "typescript": "^5.8.3"
+    "typescript": "^5.8.3",
+    "yaml": "^2.8.0"
   },
   "publishConfig": {
     "access": "public"
diff --git a/test/llmg.ts b/test/llmg.ts
index 16cdc46c..022f9ce3 100644
--- a/test/llmg.ts
+++ b/test/llmg.ts
@@ -22,10 +22,14 @@ if (!API_KEY) {
 }
 
 import { spawn } from 'node:child_process';
+import fs from 'node:fs/promises';
+import { dirname } from 'node:path';
 import { Tool } from '@modelcontextprotocol/sdk/types.js';
 import { printTable } from '@oclif/table';
 import { stdout } from '@oclif/core/ux';
+import yaml from 'yaml';
 
+import { Command, Flags, flush, handle } from '@oclif/core';
 type InvocableTool = {
   name: string;
   function: {
@@ -146,8 +150,6 @@ const getToolsList = async (): Promise<InvocableTool[]> => {
   }));
 };
 
-const tools = await getToolsList();
-
 /**
  * Generates a response from the LLM Gateway API using the specified model.
  *
@@ -161,9 +163,10 @@ const tools = await getToolsList();
  * @see {@link https://git.soma.salesforce.com/pages/tech-enablement/einstein/docs/gateway/function-calling/} Function Calling Documentation
  * @see {@link https://git.soma.salesforce.com/pages/tech-enablement/einstein/docs/gateway/get-started/auth/#api-key-limitations} API Key Authentication Documentation
  */
-const generateResponse = async (
+const makeGatewayRequest = async (
   prompt: string,
-  model: string
+  model: string,
+  tools: InvocableTool[]
 ): Promise<{ model: string; response: GatewayResponse }> => {
   const response = await fetch(
     'https://bot-svc-llm.sfproxy.einsteintest1.test1-uswest2.aws.sfdc.cl/v1.0/chat/generations',
@@ -260,8 +263,8 @@ const generateResponse = async (
   };
 };
 
-async function displayModelResponses(prompt: string) {
-  const responses = await Promise.all(models.map((model) => generateResponse(prompt, model)));
+async function compareModelOutputs(prompt: string, models: string[], tools: InvocableTool[]) {
+  const responses = await Promise.all(models.map((model) => makeGatewayRequest(prompt, model, tools)));
 
   printTable({
     title: `Prompt: ${prompt}`,
@@ -285,20 +288,92 @@ async function displayModelResponses(prompt: string) {
   });
 }
 
-const models = [
-  'llmgateway__OpenAIGPT35Turbo_01_25',
-  'llmgateway__OpenAIGPT4OmniMini',
-  'llmgateway__BedrockAnthropicClaude4Sonnet',
-];
-
-const prompts = [
-  "What's my salesforce username?",
-  'List all my orgs',
-  'Deploy my project (~/my-project) using the my-sf-org alias',
-];
-
-stdout();
-for (const prompt of prompts) {
-  // eslint-disable-next-line no-await-in-loop
-  await displayModelResponses(prompt);
+export default class LLMGTest extends Command {
+  public static id = 'llmg';
+  public static summary = 'Test the MCP server against the LLM Gateway API';
+  public static description = `Use this script to verify that the tools in this MCP server can be invoked by various LLM models.
+
+  This script depends on a YAML file that contains the models and prompts to test.
+
+The file is llmg-test.yml by default but can be overridden with the --file flag.
+
+For a complete list of models, see https://git.soma.salesforce.com/pages/tech-enablement/einstein/docs/gateway/models-and-providers/
+
+SF_LLMG_API_KEY must be set in the environment.`;
+
+  public static flags = {
+    file: Flags.file({
+      summary: 'The YAML file to use for the response',
+      description: 'Must contain array of models and prompts',
+      default: 'llmg-test.yml',
+      exists: true,
+      char: 'f',
+    }),
+    help: Flags.help({
+      summary: 'Show help',
+      description: 'Show help for the llmg command',
+      char: 'h',
+    }),
+  };
+
+  public async run(): Promise<void> {
+    const { flags } = await this.parse(LLMGTest);
+
+    const yamlContents = await fs.readFile(flags.file, 'utf8');
+    const yamlObj = yaml.parse(yamlContents) as {
+      models?: string[];
+      prompts?: string[];
+    };
+
+    if (!yamlObj.models) {
+      throw new Error('models is required');
+    }
+
+    if (!yamlObj.prompts) {
+      throw new Error('prompts is required');
+    }
+
+    stdout('Models:');
+    yamlObj.models.forEach((model) => stdout(`  - ${model}`));
+
+    stdout();
+    stdout('Prompts:');
+    yamlObj.prompts.forEach((prompt) => stdout(`  - ${prompt}`));
+
+    stdout();
+    const tools = await getToolsList();
+    stdout();
+
+    for (const prompt of yamlObj.prompts) {
+      // eslint-disable-next-line no-await-in-loop
+      await compareModelOutputs(prompt, yamlObj.models, tools);
+    }
+  }
 }
+
+LLMGTest.run(process.argv.slice(2), {
+  root: dirname(import.meta.dirname),
+  // Tell oclif what the contents of the package.json are.
+  // You could also set these in your package.json but specifying
+  // them here is useful if you're attempting to bundle your CLI
+  // without a package.json
+  pjson: {
+    name: 'llmg',
+    version: '0.0.1',
+    oclif: {
+      // Tell oclif that this is a single command CLI
+      // See: https://oclif.io/docs/command_discovery_strategies
+      commands: {
+        strategy: 'single',
+        target: 'test/llmg.js',
+      },
+    },
+  },
+}).then(
+  async () => {
+    await flush();
+  },
+  async (err) => {
+    await handle(err as Error);
+  }
+);
diff --git a/yarn.lock b/yarn.lock
index 88fe831f..d0da7c8c 100644
--- a/yarn.lock
+++ b/yarn.lock
@@ -9534,6 +9534,11 @@ yaml@^2.5.1, yaml@^2.7.1:
   resolved "https://registry.npmjs.org/yaml/-/yaml-2.7.1.tgz"
   integrity sha512-10ULxpnOCQXxJvBgxsn9ptjq6uviG/htZKk9veJGhlqn3w/DxQ631zFF+nlQXLwmImeS5amR2dl2U8sg6U9jsQ==
 
+yaml@^2.8.0:
+  version "2.8.0"
+  resolved "https://registry.yarnpkg.com/yaml/-/yaml-2.8.0.tgz#15f8c9866211bdc2d3781a0890e44d4fa1a5fff6"
+  integrity sha512-4lLa/EcQCB0cJkyts+FpIRx5G/llPxfP6VQU5KByHEhLxY3IJCH0f0Hy1MHI8sClTvsIb8qwRJ6R/ZdlDJ/leQ==
+
 yargs-parser@^18.1.2:
   version "18.1.3"
   resolved "https://registry.npmjs.org/yargs-parser/-/yargs-parser-18.1.3.tgz"

From bb3754a03f428b97dd145cbd4686c87c4cbf12c8 Mon Sep 17 00:00:00 2001
From: Mike Donnalley <mdonnalley@salesforce.com>
Date: Mon, 16 Jun 2025 21:16:35 -0600
Subject: [PATCH 21/51] chore: clean up

---
 test/llmg.ts | 19 +++++++------------
 1 file changed, 7 insertions(+), 12 deletions(-)

diff --git a/test/llmg.ts b/test/llmg.ts
index 022f9ce3..6d6161e9 100644
--- a/test/llmg.ts
+++ b/test/llmg.ts
@@ -85,7 +85,7 @@ function countTokens(obj: unknown): number {
     tokenCount += Math.ceil(word.length / 4);
 
     // Add tokens for special characters
-    tokenCount += (word.match(/[{}[\],:]/g) || []).length;
+    tokenCount += (word.match(/[{}[\],:]/g) ?? []).length;
   }
 
   return tokenCount;
@@ -146,6 +146,7 @@ const getToolsList = async (): Promise<InvocableTool[]> => {
       name: tool.name,
       description: tool.description,
       parameters: tool.inputSchema,
+      annotations: tool.annotations,
     },
   }));
 };
@@ -288,8 +289,8 @@ async function compareModelOutputs(prompt: string, models: string[], tools: Invo
   });
 }
 
-export default class LLMGTest extends Command {
-  public static id = 'llmg';
+export default class LLMGatewayTest extends Command {
+  public static id = 'llm-gateway-test';
   public static summary = 'Test the MCP server against the LLM Gateway API';
   public static description = `Use this script to verify that the tools in this MCP server can be invoked by various LLM models.
 
@@ -317,7 +318,7 @@ SF_LLMG_API_KEY must be set in the environment.`;
   };
 
   public async run(): Promise<void> {
-    const { flags } = await this.parse(LLMGTest);
+    const { flags } = await this.parse(LLMGatewayTest);
 
     const yamlContents = await fs.readFile(flags.file, 'utf8');
     const yamlObj = yaml.parse(yamlContents) as {
@@ -351,18 +352,12 @@ SF_LLMG_API_KEY must be set in the environment.`;
   }
 }
 
-LLMGTest.run(process.argv.slice(2), {
+LLMGatewayTest.run(process.argv.slice(2), {
   root: dirname(import.meta.dirname),
-  // Tell oclif what the contents of the package.json are.
-  // You could also set these in your package.json but specifying
-  // them here is useful if you're attempting to bundle your CLI
-  // without a package.json
   pjson: {
-    name: 'llmg',
+    name: 'llm-gateway-test',
     version: '0.0.1',
     oclif: {
-      // Tell oclif that this is a single command CLI
-      // See: https://oclif.io/docs/command_discovery_strategies
       commands: {
         strategy: 'single',
         target: 'test/llmg.js',

From 0b534d1dd05276798c64d16c080e1da9159c77c4 Mon Sep 17 00:00:00 2001
From: Mike Donnalley <mdonnalley@salesforce.com>
Date: Tue, 17 Jun 2025 09:05:44 -0600
Subject: [PATCH 22/51] chore: clean up

---
 test/llmg.ts | 72 ++++++----------------------------------------------
 1 file changed, 8 insertions(+), 64 deletions(-)

diff --git a/test/llmg.ts b/test/llmg.ts
index 6d6161e9..cfee581b 100644
--- a/test/llmg.ts
+++ b/test/llmg.ts
@@ -188,61 +188,6 @@ const makeGatewayRequest = async (
         tool_config: {
           mode: 'auto',
         },
-        //   {
-        //     type: 'function',
-        //     function: {
-        //       name: 'get_current_weather',
-        //       description: 'Get the current weather in a given location.',
-        //       parameters: {
-        //         type: 'object',
-        //         properties: {
-        //           location: {
-        //             type: 'string',
-        //             description: 'The city and state, e.g. San Francisco, CA',
-        //           },
-        //           format: {
-        //             type: 'string',
-        //             enum: ['celsius', 'fahrenheit'],
-        //             description: 'The temperature unit to use. Infer this from the users location.',
-        //           },
-        //         },
-        //       },
-        //     },
-        //   },
-        //   {
-        //     name: 'sf-get-username',
-        //     function: {
-        //       name: 'sf-get-username',
-        //       description:
-        //         'Intelligently determines the appropriate username or alias for Salesforce operations.\n\nAGENT/LLM INSTRUCTIONS:\nUse this tool when uncertain which username/org a user wants for Salesforce operations.\nThis tool handles three distinct scenarios:\n\n1. When defaultTargetOrg=true: Fetches the default target org configuration\n   - Use when user says "for my default org" or "for my default target org"\n\n2. When defaultDevHub=true: Fetches the default dev hub configuration\n   - Use when user says "for my default dev hub" or "for my default target dev hub"\n\n3. When both are false (default): Uses suggestUsername to intelligently determine the appropriate org\n   - Use when user is vague and says something like "for my org" or doesn\'t specify\n\nEXAMPLE USAGE:\n- When user says "Do X for my org" → defaultTargetOrg=false, defaultDevHub=false\n- When user says "For my default org" → defaultTargetOrg=true\n- When user says "For my default dev hub" → defaultDevHub=true',
-        //       parameters: {
-        //         type: 'object',
-        //         properties: {
-        //           defaultTargetOrg: {
-        //             type: 'boolean',
-        //             default: false,
-        //             description:
-        //               'Try to find default org\nAGENT INSTRUCTIONS:\nONLY SET TO TRUE when the user explicitly asks for the default org or default target org.\nLeave it as false when the user is vague and says something like "for my org" or "for my-alias".\n\nUSAGE EXAMPLE:\nGet username for my default org\n...for my default target org',
-        //           },
-        //           defaultDevHub: {
-        //             type: 'boolean',
-        //             default: false,
-        //             description:
-        //               'Try to find default dev hub\nAGENT INSTRUCTIONS:\nONLY SET TO TRUE when the user explicitly asks for the default dev hub or default target devhub.\nLeave it as false when the user is vague and says something like "for my org" or "for my-alias".\n\nUSAGE EXAMPLE:\nGet username for my default dev hub\n...for my default target dev hub\n...for my default devhub',
-        //           },
-        //           directory: {
-        //             type: 'string',
-        //             description:
-        //               'The directory to run this tool from.\nAGENT INSTRUCTIONS:\nWe need to know where the user wants to run this tool from.\nLook at your current Workspace Context to determine this filepath.\nALWAYS USE A FULL PATH TO THE DIRECTORY.\nUnless the user explicitly asks for a different directory, or a new directory is created from the action of a tool, use this same directory for future tool calls.\n',
-        //           },
-        //         },
-        //         required: ['directory'],
-        //         additionalProperties: false,
-        //         $schema: 'http://json-schema.org/draft-07/schema#',
-        //       },
-        //     },
-        //   },
-        // ],
         messages: [
           {
             role: 'user',
@@ -292,15 +237,15 @@ async function compareModelOutputs(prompt: string, models: string[], tools: Invo
 export default class LLMGatewayTest extends Command {
   public static id = 'llm-gateway-test';
   public static summary = 'Test the MCP server against the LLM Gateway API';
-  public static description = `Use this script to verify that the tools in this MCP server can be invoked by various LLM models.
+  public static description = `Tests that the MCP server tools are accurately invoked by various LLM models.
 
-  This script depends on a YAML file that contains the models and prompts to test.
+Configuration:
+- Uses a YAML file (default: llmg-test.yml) to specify models and test prompts
+- Override the YAML file using the --file flag
+- Requires SF_LLMG_API_KEY environment variable
 
-The file is llmg-test.yml by default but can be overridden with the --file flag.
-
-For a complete list of models, see https://git.soma.salesforce.com/pages/tech-enablement/einstein/docs/gateway/models-and-providers/
-
-SF_LLMG_API_KEY must be set in the environment.`;
+For available models, see:
+https://git.soma.salesforce.com/pages/tech-enablement/einstein/docs/gateway/models-and-providers/`;
 
   public static flags = {
     file: Flags.file({
@@ -311,8 +256,7 @@ SF_LLMG_API_KEY must be set in the environment.`;
       char: 'f',
     }),
     help: Flags.help({
-      summary: 'Show help',
-      description: 'Show help for the llmg command',
+      description: 'Show help',
       char: 'h',
     }),
   };

From 4856d5bfadaea6188bc9357a67c24be1be180917 Mon Sep 17 00:00:00 2001
From: Mike Donnalley <mdonnalley@salesforce.com>
Date: Tue, 17 Jun 2025 09:16:02 -0600
Subject: [PATCH 23/51] chore: clean up

---
 test/llmg.ts | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/test/llmg.ts b/test/llmg.ts
index cfee581b..23e91197 100644
--- a/test/llmg.ts
+++ b/test/llmg.ts
@@ -57,7 +57,7 @@ type GatewayResponse = {
  * Approximates token count for a JSON object using a simple algorithm.
  * This is a rough approximation and may not match exact token counts from specific LLMs.
  *
- * For comparison, here are the token counts:
+ * For comparison:
  *
  * | Tool                  | OpenAI | countTokens |
  * |----------------------|---------|-------------|
@@ -162,7 +162,7 @@ const getToolsList = async (): Promise<InvocableTool[]> => {
  * @see {@link https://git.soma.salesforce.com/pages/tech-enablement/einstein/docs/gateway/models-and-providers/} Models and Providers Documentation
  * @see {@link https://git.soma.salesforce.com/pages/tech-enablement/einstein/docs/gateway/apis/rest/#operation/chatMessages} REST API Documentation
  * @see {@link https://git.soma.salesforce.com/pages/tech-enablement/einstein/docs/gateway/function-calling/} Function Calling Documentation
- * @see {@link https://git.soma.salesforce.com/pages/tech-enablement/einstein/docs/gateway/get-started/auth/#api-key-limitations} API Key Authentication Documentation
+ * @see {@link https://git.soma.salesforce.com/pages/tech-enablement/einstein/docs/gateway/get-started/auth/#api-key-limitations} API Key Limitations Documentation
  */
 const makeGatewayRequest = async (
   prompt: string,

From 3ce1fc801d19e7b988f143767bf43b8ef1430b15 Mon Sep 17 00:00:00 2001
From: Mike Donnalley <mdonnalley@salesforce.com>
Date: Tue, 17 Jun 2025 13:31:13 -0600
Subject: [PATCH 24/51] chore: make tables prettier

---
 test/llmg.ts | 33 +++++++++++++++++++++++----------
 1 file changed, 23 insertions(+), 10 deletions(-)

diff --git a/test/llmg.ts b/test/llmg.ts
index 23e91197..8edb1f66 100644
--- a/test/llmg.ts
+++ b/test/llmg.ts
@@ -25,11 +25,23 @@ import { spawn } from 'node:child_process';
 import fs from 'node:fs/promises';
 import { dirname } from 'node:path';
 import { Tool } from '@modelcontextprotocol/sdk/types.js';
-import { printTable } from '@oclif/table';
+import { printTable, TableOptions } from '@oclif/table';
 import { stdout } from '@oclif/core/ux';
 import yaml from 'yaml';
-
 import { Command, Flags, flush, handle } from '@oclif/core';
+
+const TABLE_STYLE = {
+  headerOptions: {
+    formatter: 'capitalCase',
+    color: 'cyanBright',
+  },
+  titleOptions: {
+    color: 'yellowBright',
+  },
+  borderColor: 'gray',
+  overflow: 'wrap',
+} satisfies Partial<TableOptions<Record<string, unknown>>>;
+
 type InvocableTool = {
   name: string;
   function: {
@@ -134,9 +146,7 @@ const getToolsList = async (): Promise<InvocableTool[]> => {
     title: 'Tools List',
     data: toolsWithTokens,
     columns: ['tool', { key: 'tokens', name: 'Approximate Tokens' }],
-    headerOptions: {
-      formatter: 'capitalCase',
-    },
+    ...TABLE_STYLE,
   });
   stdout('Total tokens: ' + toolsWithTokens.reduce((acc, tool) => acc + tool.tokens, 0));
 
@@ -226,11 +236,14 @@ async function compareModelOutputs(prompt: string, models: string[], tools: Invo
         .map(([key, value]) => `${key}: ${value}`)
         .join('\n'),
     })),
-    columns: ['model', 'response', 'tool', 'arguments'],
-    headerOptions: {
-      formatter: 'capitalCase',
-    },
-    overflow: 'wrap',
+    columns: [
+      { key: 'model', width: '30%' },
+      { key: 'response', width: '25%' },
+      { key: 'tool', width: '20%' },
+      { key: 'arguments', width: '25%' },
+    ],
+    width: process.stdout.columns,
+    ...TABLE_STYLE,
   });
 }
 

From f66b7931c4181e918ac8e397152630d5840e6edb Mon Sep 17 00:00:00 2001
From: Mike Donnalley <mdonnalley@salesforce.com>
Date: Wed, 18 Jun 2025 13:47:22 -0600
Subject: [PATCH 25/51] test: allow longer chats

---
 test/llmg.ts | 178 ++++++++++++++++++++++++++++++++++-----------------
 1 file changed, 118 insertions(+), 60 deletions(-)

diff --git a/test/llmg.ts b/test/llmg.ts
index 8edb1f66..cb6e8b90 100644
--- a/test/llmg.ts
+++ b/test/llmg.ts
@@ -26,7 +26,7 @@ import fs from 'node:fs/promises';
 import { dirname } from 'node:path';
 import { Tool } from '@modelcontextprotocol/sdk/types.js';
 import { printTable, TableOptions } from '@oclif/table';
-import { stdout } from '@oclif/core/ux';
+import { stdout, colorize } from '@oclif/core/ux';
 import yaml from 'yaml';
 import { Command, Flags, flush, handle } from '@oclif/core';
 
@@ -35,9 +35,6 @@ const TABLE_STYLE = {
     formatter: 'capitalCase',
     color: 'cyanBright',
   },
-  titleOptions: {
-    color: 'yellowBright',
-  },
   borderColor: 'gray',
   overflow: 'wrap',
 } satisfies Partial<TableOptions<Record<string, unknown>>>;
@@ -52,10 +49,12 @@ type InvocableTool = {
 };
 
 type GatewayResponse = {
-  generation_details: {
+  generation_details?: {
     generations: Array<{
       content: string;
-      tool_invocations: Array<{
+      role: string;
+      tool_invocations?: Array<{
+        id: string;
         function: {
           name: string;
           arguments: string;
@@ -146,6 +145,9 @@ const getToolsList = async (): Promise<InvocableTool[]> => {
     title: 'Tools List',
     data: toolsWithTokens,
     columns: ['tool', { key: 'tokens', name: 'Approximate Tokens' }],
+    titleOptions: {
+      color: 'yellowBright',
+    },
     ...TABLE_STYLE,
   });
   stdout('Total tokens: ' + toolsWithTokens.reduce((acc, tool) => acc + tool.tokens, 0));
@@ -162,11 +164,13 @@ const getToolsList = async (): Promise<InvocableTool[]> => {
 };
 
 /**
- * Generates a response from the LLM Gateway API using the specified model.
+ * Makes requests to the LLM Gateway API for multiple prompts using the specified model and tools.
  *
+ * @param {string[]} prompts - Array of prompts to send to the API
  * @param {string} model - The model identifier to use for generation (e.g., 'llmgateway__AzureOpenAIGPT4Omni')
- * @returns {Promise<unknown>} The parsed JSON response from the API
- * @throws {Error} If the API request fails or returns an error
+ * @param {InvocableTool[]} tools - Array of tools that can be invoked by the model
+ * @returns {Promise<{model: string, messages: Array<{role: string, content: string}>, responses: GatewayResponse[]}>} Object containing the model used, conversation messages, and API responses
+ * @throws {Error} If any API request fails or returns an error
  *
  * @see {@link https://git.soma.salesforce.com/pages/tech-enablement/einstein/docs/gateway/get-started/#make-your-first-gateway-request} Make Your First Gateway Request Documentation
  * @see {@link https://git.soma.salesforce.com/pages/tech-enablement/einstein/docs/gateway/models-and-providers/} Models and Providers Documentation
@@ -174,73 +178,106 @@ const getToolsList = async (): Promise<InvocableTool[]> => {
  * @see {@link https://git.soma.salesforce.com/pages/tech-enablement/einstein/docs/gateway/function-calling/} Function Calling Documentation
  * @see {@link https://git.soma.salesforce.com/pages/tech-enablement/einstein/docs/gateway/get-started/auth/#api-key-limitations} API Key Limitations Documentation
  */
-const makeGatewayRequest = async (
-  prompt: string,
+const makeGatewayRequests = async (
+  prompts: string[],
   model: string,
   tools: InvocableTool[]
-): Promise<{ model: string; response: GatewayResponse }> => {
-  const response = await fetch(
-    'https://bot-svc-llm.sfproxy.einsteintest1.test1-uswest2.aws.sfdc.cl/v1.0/chat/generations',
-    {
-      method: 'POST',
-      headers: {
-        Authorization: `API_KEY ${API_KEY}`,
-        'Content-Type': 'application/json',
-        // We need to figure out which tenant, context, and feature id to use
-        // Maybe this is something that will be given to us once the client registration completes???
-        'x-sfdc-core-tenant-id': 'core/prod1/00DDu0000008cuqMAA',
-        'x-sfdc-app-context': 'EinsteinGPT',
-        'x-client-feature-id': 'EinsteinDocsAnswers',
-      },
-      body: JSON.stringify({
-        model,
-        tools,
-        tool_config: {
-          mode: 'auto',
+): Promise<{ model: string; messages: Array<{ role: string; content: string }>; responses: GatewayResponse[] }> => {
+  const messages: Array<{
+    role: string;
+    content: string;
+  }> = [];
+  const responses: GatewayResponse[] = [];
+  for (const prompt of prompts) {
+    // Add the current prompt to messages
+    messages.push({
+      role: 'user',
+      content: prompt,
+    });
+
+    // eslint-disable-next-line no-await-in-loop
+    const response = await fetch(
+      'https://bot-svc-llm.sfproxy.einsteintest1.test1-uswest2.aws.sfdc.cl/v1.0/chat/generations',
+      {
+        method: 'POST',
+        headers: {
+          Authorization: `API_KEY ${API_KEY}`,
+          'Content-Type': 'application/json',
+          // We need to figure out which tenant, context, and feature id to use
+          // Maybe this is something that will be given to us once the client registration completes???
+          'x-sfdc-core-tenant-id': 'core/prod1/00DDu0000008cuqMAA',
+          'x-sfdc-app-context': 'EinsteinGPT',
+          'x-client-feature-id': 'EinsteinDocsAnswers',
         },
-        messages: [
-          {
-            role: 'user',
-            content: prompt,
+        body: JSON.stringify({
+          model,
+          tools,
+          tool_config: {
+            mode: 'auto',
           },
-        ],
-        generation_settings: {
-          max_tokens: 500,
-          temperature: 0.5,
-          parameters: {},
-        },
-      }),
+          messages,
+          generation_settings: {
+            max_tokens: 500,
+            temperature: 0.5,
+            parameters: {},
+          },
+        }),
+      }
+    );
+
+    // eslint-disable-next-line no-await-in-loop
+    const responseData = (await response.json()) as GatewayResponse;
+    responses.push(responseData);
+
+    // Add the assistant's response to messages for the next iteration
+    if (responseData.generation_details?.generations[0]?.content) {
+      messages.push({
+        role: responseData.generation_details.generations[0].role,
+        content: responseData.generation_details.generations[0].content,
+      });
     }
-  );
+  }
 
   return {
-    response: (await response.json()) as GatewayResponse,
+    responses,
     model,
+    messages,
   };
 };
 
-async function compareModelOutputs(prompt: string, models: string[], tools: InvocableTool[]) {
-  const responses = await Promise.all(models.map((model) => makeGatewayRequest(prompt, model, tools)));
+const castToArray = <T>(value: T | T[]): T[] => (Array.isArray(value) ? value : [value]);
+
+async function compareModelOutputs(prompt: string | string[], models: string[], tools: InvocableTool[]) {
+  const prompts = castToArray(prompt);
+  const responses = await Promise.all(models.map((model) => makeGatewayRequests(prompts, model, tools)));
 
   printTable({
-    title: `Prompt: ${prompt}`,
+    title: `${colorize('yellowBright', 'Prompt')}:\n  - ${prompts.join('\n  - ')}`,
     data: responses.map((response) => ({
       model: response.model,
-      response: response.response.generation_details.generations[0].content,
-      tool: response.response.generation_details.generations[0].tool_invocations[0].function.name,
-      arguments: Object.entries(
-        JSON.parse(
-          response.response.generation_details.generations[0].tool_invocations[0].function.arguments
-        ) as Record<string, string>
-      )
-        .map(([key, value]) => `${key}: ${value}`)
-        .join('\n'),
+      chat: response.messages.map((m) => `${colorize('bold', m.role)}: ${m.content}`).join('\n\n'),
+      tools: response.responses
+        .map((r, index) => {
+          const toolInvocation = r.generation_details?.generations[0].tool_invocations?.[0];
+          if (!toolInvocation) {
+            return `Message ${index + 1}: No tool invoked`;
+          }
+
+          const toolArgs = JSON.parse(toolInvocation.function.arguments) as Record<string, string>;
+          const argsString = Object.entries(toolArgs)
+            .map(([key, value]) => `  - ${key}: ${value}`)
+            .join('\n');
+
+          return `Message ${index + 1}: ${colorize('bold', toolInvocation.function.name)}${
+            argsString ? `\n${argsString}` : ''
+          }`;
+        })
+        .join('\n\n'),
     })),
     columns: [
       { key: 'model', width: '30%' },
-      { key: 'response', width: '25%' },
-      { key: 'tool', width: '20%' },
-      { key: 'arguments', width: '25%' },
+      { key: 'chat', width: '40%' },
+      { key: 'tools', width: '30%', name: 'Tool Invocations' },
     ],
     width: process.stdout.columns,
     ...TABLE_STYLE,
@@ -257,6 +294,21 @@ Configuration:
 - Override the YAML file using the --file flag
 - Requires SF_LLMG_API_KEY environment variable
 
+YAML File Format:
+The YAML file should contain:
+- models: Array of model identifiers to test against
+- prompts: Array of test prompts (can be strings or arrays of strings for multi-turn conversations)
+
+Example YAML structure:
+  models:
+    - llmgateway__OpenAIGPT35Turbo_01_25
+    - llmgateway__OpenAIGPT4OmniMini
+  prompts:
+    - "What's my salesforce username?"
+    - ["I am a Salesforce developer", "Deploy my project"]
+    - - I am a Salesforce developer.
+      - Deploy my project
+
 For available models, see:
 https://git.soma.salesforce.com/pages/tech-enablement/einstein/docs/gateway/models-and-providers/`;
 
@@ -280,7 +332,7 @@ https://git.soma.salesforce.com/pages/tech-enablement/einstein/docs/gateway/mode
     const yamlContents = await fs.readFile(flags.file, 'utf8');
     const yamlObj = yaml.parse(yamlContents) as {
       models?: string[];
-      prompts?: string[];
+      prompts?: Array<string | string[]>;
     };
 
     if (!yamlObj.models) {
@@ -296,7 +348,13 @@ https://git.soma.salesforce.com/pages/tech-enablement/einstein/docs/gateway/mode
 
     stdout();
     stdout('Prompts:');
-    yamlObj.prompts.forEach((prompt) => stdout(`  - ${prompt}`));
+    yamlObj.prompts.forEach((prompt) => {
+      if (Array.isArray(prompt)) {
+        stdout(`  - - ${prompt.join('\n    - ')}`);
+      } else {
+        stdout(`  - ${prompt}`);
+      }
+    });
 
     stdout();
     const tools = await getToolsList();

From a6b37056685638b47893e6103dd54efbe135050d Mon Sep 17 00:00:00 2001
From: Mike Donnalley <mdonnalley@salesforce.com>
Date: Wed, 18 Jun 2025 14:15:15 -0600
Subject: [PATCH 26/51] test: clean up implementation

---
 package.json |   1 +
 test/llmg.ts | 159 +++++++++++++++++++++++++--------------------------
 yarn.lock    |   5 ++
 3 files changed, 84 insertions(+), 81 deletions(-)

diff --git a/package.json b/package.json
index e5acc9b0..f6aac8fa 100644
--- a/package.json
+++ b/package.json
@@ -59,6 +59,7 @@
     "@types/node": "^22.15.31",
     "eslint-config-salesforce-license": "^1.0.1",
     "eslint-plugin-sf-plugin": "^1.20.25",
+    "gpt-tokenizer": "^3.0.1",
     "oclif": "^4.18.0",
     "ts-node": "^10.9.2",
     "ts-patch": "^3.3.0",
diff --git a/test/llmg.ts b/test/llmg.ts
index cb6e8b90..7bca374c 100644
--- a/test/llmg.ts
+++ b/test/llmg.ts
@@ -29,6 +29,9 @@ import { printTable, TableOptions } from '@oclif/table';
 import { stdout, colorize } from '@oclif/core/ux';
 import yaml from 'yaml';
 import { Command, Flags, flush, handle } from '@oclif/core';
+import { encode as encodeGPT4oMini } from 'gpt-tokenizer/model/gpt-4o-mini';
+import { encode as encodeO3Mini } from 'gpt-tokenizer/model/o3-mini';
+import { encode as encodeGPT4 } from 'gpt-tokenizer/model/gpt-4';
 
 const TABLE_STYLE = {
   headerOptions: {
@@ -45,6 +48,7 @@ type InvocableTool = {
     name: string;
     description: string | undefined;
     parameters: Tool['inputSchema'];
+    annotations: Tool['annotations'];
   };
 };
 
@@ -64,44 +68,6 @@ type GatewayResponse = {
   };
 };
 
-/**
- * Approximates token count for a JSON object using a simple algorithm.
- * This is a rough approximation and may not match exact token counts from specific LLMs.
- *
- * For comparison:
- *
- * | Tool                  | OpenAI | countTokens |
- * |----------------------|---------|-------------|
- * | sf-get-username      | 632     | 702         |
- * | sf-list-all-orgs     | 262     | 283         |
- * | sf-query-org         | 405     | 416         |
- * | sf-assign-permission | 609     | 631         |
- * | sf-deploy-metadata   | 779     | 809         |
- * | sf-retrieve-metadata | 551     | 592         |
- *
- * @param obj - The JSON object to count tokens for
- * @returns Approximate number of tokens
- */
-function countTokens(obj: unknown): number {
-  // Convert object to string representation
-  const jsonStr = JSON.stringify(obj);
-
-  // Split into words and count
-  const words = jsonStr.split(/\s+/);
-
-  // Count tokens (rough approximation)
-  let tokenCount = 0;
-  for (const word of words) {
-    // Each word is roughly 1.3 tokens
-    tokenCount += Math.ceil(word.length / 4);
-
-    // Add tokens for special characters
-    tokenCount += (word.match(/[{}[\],:]/g) ?? []).length;
-  }
-
-  return tokenCount;
-}
-
 const getToolsList = async (): Promise<InvocableTool[]> => {
   const toolsList: string = await new Promise<string>((resolve, reject) => {
     const child = spawn('npx', [
@@ -138,19 +104,31 @@ const getToolsList = async (): Promise<InvocableTool[]> => {
 
   const toolsWithTokens = parsedToolsList.tools?.map((tool) => ({
     tool: tool.name,
-    tokens: countTokens(tool),
+    tokensGPT4oMini: encodeGPT4oMini(JSON.stringify(tool)).length,
+    tokensO3Mini: encodeO3Mini(JSON.stringify(tool)).length,
+    tokensGPT4: encodeGPT4(JSON.stringify(tool)).length,
   }));
+  toolsWithTokens.push({
+    tool: colorize('bold', 'TOTAL'),
+    tokensGPT4oMini: toolsWithTokens.reduce((acc, tool) => acc + tool.tokensGPT4oMini, 0),
+    tokensO3Mini: toolsWithTokens.reduce((acc, tool) => acc + tool.tokensO3Mini, 0),
+    tokensGPT4: toolsWithTokens.reduce((acc, tool) => acc + tool.tokensGPT4, 0),
+  });
 
   printTable({
     title: 'Tools List',
     data: toolsWithTokens,
-    columns: ['tool', { key: 'tokens', name: 'Approximate Tokens' }],
+    columns: [
+      'tool',
+      { key: 'tokensGPT4oMini', name: 'GPT 4o Mini' },
+      { key: 'tokensO3Mini', name: 'O3 Mini' },
+      { key: 'tokensGPT4', name: 'GPT 4' },
+    ],
     titleOptions: {
       color: 'yellowBright',
     },
     ...TABLE_STYLE,
   });
-  stdout('Total tokens: ' + toolsWithTokens.reduce((acc, tool) => acc + tool.tokens, 0));
 
   return (parsedToolsList.tools ?? []).map((tool) => ({
     name: tool.name,
@@ -163,6 +141,57 @@ const getToolsList = async (): Promise<InvocableTool[]> => {
   }));
 };
 
+const createRequestHeaders = (): Record<string, string> => ({
+  Authorization: `API_KEY ${API_KEY}`,
+  'Content-Type': 'application/json',
+  // We need to figure out which tenant, context, and feature id to use
+  // Maybe this is something that will be given to us once the client registration completes???
+  'x-sfdc-core-tenant-id': 'core/prod1/00DDu0000008cuqMAA',
+  'x-sfdc-app-context': 'EinsteinGPT',
+  'x-client-feature-id': 'EinsteinDocsAnswers',
+});
+
+const createRequestBody = (
+  model: string,
+  tools: InvocableTool[],
+  messages: Array<{ role: string; content: string }>
+): string =>
+  JSON.stringify({
+    model,
+    tools,
+    tool_config: {
+      mode: 'auto',
+    },
+    messages,
+    generation_settings: {
+      max_tokens: 500,
+      temperature: 0.5,
+      parameters: {},
+    },
+  });
+
+const makeSingleGatewayRequest = async (
+  model: string,
+  tools: InvocableTool[],
+  messages: Array<{ role: string; content: string }>
+): Promise<GatewayResponse> => {
+  const response = await fetch(
+    'https://bot-svc-llm.sfproxy.einsteintest1.test1-uswest2.aws.sfdc.cl/v1.0/chat/generations',
+    {
+      method: 'POST',
+      headers: createRequestHeaders(),
+      body: createRequestBody(model, tools, messages),
+    }
+  );
+
+  if (!response.ok) {
+    throw new Error(`HTTP ${response.status}: ${response.statusText}`);
+  }
+
+  const responseData = await response.json();
+  return responseData as GatewayResponse;
+};
+
 /**
  * Makes requests to the LLM Gateway API for multiple prompts using the specified model and tools.
  *
@@ -183,11 +212,9 @@ const makeGatewayRequests = async (
   model: string,
   tools: InvocableTool[]
 ): Promise<{ model: string; messages: Array<{ role: string; content: string }>; responses: GatewayResponse[] }> => {
-  const messages: Array<{
-    role: string;
-    content: string;
-  }> = [];
+  const messages: Array<{ role: string; content: string }> = [];
   const responses: GatewayResponse[] = [];
+
   for (const prompt of prompts) {
     // Add the current prompt to messages
     messages.push({
@@ -196,37 +223,7 @@ const makeGatewayRequests = async (
     });
 
     // eslint-disable-next-line no-await-in-loop
-    const response = await fetch(
-      'https://bot-svc-llm.sfproxy.einsteintest1.test1-uswest2.aws.sfdc.cl/v1.0/chat/generations',
-      {
-        method: 'POST',
-        headers: {
-          Authorization: `API_KEY ${API_KEY}`,
-          'Content-Type': 'application/json',
-          // We need to figure out which tenant, context, and feature id to use
-          // Maybe this is something that will be given to us once the client registration completes???
-          'x-sfdc-core-tenant-id': 'core/prod1/00DDu0000008cuqMAA',
-          'x-sfdc-app-context': 'EinsteinGPT',
-          'x-client-feature-id': 'EinsteinDocsAnswers',
-        },
-        body: JSON.stringify({
-          model,
-          tools,
-          tool_config: {
-            mode: 'auto',
-          },
-          messages,
-          generation_settings: {
-            max_tokens: 500,
-            temperature: 0.5,
-            parameters: {},
-          },
-        }),
-      }
-    );
-
-    // eslint-disable-next-line no-await-in-loop
-    const responseData = (await response.json()) as GatewayResponse;
+    const responseData = await makeSingleGatewayRequest(model, tools, messages);
     responses.push(responseData);
 
     // Add the assistant's response to messages for the next iteration
@@ -263,7 +260,7 @@ async function compareModelOutputs(prompt: string | string[], models: string[],
             return `Message ${index + 1}: No tool invoked`;
           }
 
-          const toolArgs = JSON.parse(toolInvocation.function.arguments) as Record<string, string>;
+          const toolArgs = JSON.parse(toolInvocation.function.arguments ?? '{}') as Record<string, string>;
           const argsString = Object.entries(toolArgs)
             .map(([key, value]) => `  - ${key}: ${value}`)
             .join('\n');
@@ -335,12 +332,12 @@ https://git.soma.salesforce.com/pages/tech-enablement/einstein/docs/gateway/mode
       prompts?: Array<string | string[]>;
     };
 
-    if (!yamlObj.models) {
-      throw new Error('models is required');
+    if (!yamlObj.models?.length) {
+      throw new Error('At least one model is required');
     }
 
-    if (!yamlObj.prompts) {
-      throw new Error('prompts is required');
+    if (!yamlObj.prompts?.length) {
+      throw new Error('At least one prompt is required');
     }
 
     stdout('Models:');
diff --git a/yarn.lock b/yarn.lock
index d0da7c8c..b2bd123a 100644
--- a/yarn.lock
+++ b/yarn.lock
@@ -5548,6 +5548,11 @@ got@^13:
     p-cancelable "^3.0.0"
     responselike "^3.0.0"
 
+gpt-tokenizer@^3.0.1:
+  version "3.0.1"
+  resolved "https://registry.yarnpkg.com/gpt-tokenizer/-/gpt-tokenizer-3.0.1.tgz#19fa42314d15b69a1e82d3898336b5ba1f4f2c86"
+  integrity sha512-5jdaspBq/w4sWw322SvQj1Fku+CN4OAfYZeeEg8U7CWtxBz+zkxZ3h0YOHD43ee+nZYZ5Ud70HRN0ANcdIj4qg==
+
 graceful-fs@4.2.10:
   version "4.2.10"
   resolved "https://registry.npmjs.org/graceful-fs/-/graceful-fs-4.2.10.tgz"

From 6f86505de0e7e6af5e4fbc0712d765624ec1502f Mon Sep 17 00:00:00 2001
From: Mike Donnalley <mdonnalley@salesforce.com>
Date: Wed, 18 Jun 2025 14:38:36 -0600
Subject: [PATCH 27/51] test: add --entry-point flag

---
 test/llmg.ts | 20 ++++++++------------
 1 file changed, 8 insertions(+), 12 deletions(-)

diff --git a/test/llmg.ts b/test/llmg.ts
index 7bca374c..2b656c9f 100644
--- a/test/llmg.ts
+++ b/test/llmg.ts
@@ -68,18 +68,9 @@ type GatewayResponse = {
   };
 };
 
-const getToolsList = async (): Promise<InvocableTool[]> => {
+const getToolsList = async (entryPoint: string): Promise<InvocableTool[]> => {
   const toolsList: string = await new Promise<string>((resolve, reject) => {
-    const child = spawn('npx', [
-      'mcp-inspector',
-      '--cli',
-      'node',
-      'bin/run.js',
-      '-o',
-      'DEFAULT_TARGET_ORG',
-      '--method',
-      'tools/list',
-    ]);
+    const child = spawn('npx', ['mcp-inspector', '--cli', 'node', ...entryPoint.split(' '), '--method', 'tools/list']);
 
     let output = '';
 
@@ -310,6 +301,11 @@ For available models, see:
 https://git.soma.salesforce.com/pages/tech-enablement/einstein/docs/gateway/models-and-providers/`;
 
   public static flags = {
+    'entry-point': Flags.string({
+      summary: 'The entry point to the MCP server',
+      default: 'bin/run.js -o DEFAULT_TARGET_ORG',
+      char: 'e',
+    }),
     file: Flags.file({
       summary: 'The YAML file to use for the response',
       description: 'Must contain array of models and prompts',
@@ -354,7 +350,7 @@ https://git.soma.salesforce.com/pages/tech-enablement/einstein/docs/gateway/mode
     });
 
     stdout();
-    const tools = await getToolsList();
+    const tools = await getToolsList(flags['entry-point']);
     stdout();
 
     for (const prompt of yamlObj.prompts) {

From 5a2cb856b15ab4973c0710ba8536819cb7d0ea78 Mon Sep 17 00:00:00 2001
From: Mike Donnalley <mdonnalley@salesforce.com>
Date: Fri, 18 Jul 2025 09:37:00 -0600
Subject: [PATCH 28/51] fix: use correct inspector package

---
 test/llmg.ts | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/test/llmg.ts b/test/llmg.ts
index 2b656c9f..09930866 100644
--- a/test/llmg.ts
+++ b/test/llmg.ts
@@ -70,7 +70,14 @@ type GatewayResponse = {
 
 const getToolsList = async (entryPoint: string): Promise<InvocableTool[]> => {
   const toolsList: string = await new Promise<string>((resolve, reject) => {
-    const child = spawn('npx', ['mcp-inspector', '--cli', 'node', ...entryPoint.split(' '), '--method', 'tools/list']);
+    const child = spawn('npx', [
+      '@modelcontextprotocol/inspector',
+      '--cli',
+      'node',
+      ...entryPoint.split(' '),
+      '--method',
+      'tools/list',
+    ]);
 
     let output = '';
 

From 77714c735740f28cae63ef8e2aea77bd7fc4e527 Mon Sep 17 00:00:00 2001
From: Mike Donnalley <mdonnalley@salesforce.com>
Date: Fri, 18 Jul 2025 16:14:25 -0600
Subject: [PATCH 29/51] test: convert to confidence test

---
 .gitignore                        |   1 -
 package.json                      |   2 +-
 scripts/.eslintrc.cjs             |  26 ++
 scripts/confidence-test.ts        | 254 +++++++++++++++++++
 scripts/tsconfig.json             |   8 +
 scripts/utils/gateway.ts          | 139 +++++++++++
 scripts/utils/models.ts           |  30 +++
 scripts/utils/table.ts            |  26 ++
 scripts/utils/tools.ts            | 109 +++++++++
 scripts/utils/yaml.ts             |  29 +++
 test-assets/compare-responses.yml |  34 +++
 test/llmg.ts                      | 389 ------------------------------
 12 files changed, 656 insertions(+), 391 deletions(-)
 create mode 100644 scripts/.eslintrc.cjs
 create mode 100644 scripts/confidence-test.ts
 create mode 100644 scripts/tsconfig.json
 create mode 100644 scripts/utils/gateway.ts
 create mode 100644 scripts/utils/models.ts
 create mode 100644 scripts/utils/table.ts
 create mode 100644 scripts/utils/tools.ts
 create mode 100644 scripts/utils/yaml.ts
 create mode 100644 test-assets/compare-responses.yml
 delete mode 100644 test/llmg.ts

diff --git a/.gitignore b/.gitignore
index 64e581f3..5712aee8 100644
--- a/.gitignore
+++ b/.gitignore
@@ -49,4 +49,3 @@ node_modules
 
 .cursor
 src/tools/test.ts
-llmg-test.yml
diff --git a/package.json b/package.json
index 9acbd376..42fdd6bf 100644
--- a/package.json
+++ b/package.json
@@ -26,7 +26,7 @@
     "start": "yarn build && npm link && mcp-inspector sf-mcp-server",
     "test": "wireit",
     "test:only": "wireit",
-    "test:llmg": "node --no-warnings --loader ts-node/esm test/llmg.ts"
+    "test:confidence-test": "node --no-warnings --loader ts-node/esm scripts/confidence-test.ts"
   },
   "repository": "salesforcecli/mcp",
   "bugs": {
diff --git a/scripts/.eslintrc.cjs b/scripts/.eslintrc.cjs
new file mode 100644
index 00000000..e4e8222f
--- /dev/null
+++ b/scripts/.eslintrc.cjs
@@ -0,0 +1,26 @@
+/*
+ * Copyright 2025, Salesforce, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+module.exports = {
+  extends: '../.eslintrc.cjs',
+  parserOptions: {
+    project: [
+      './tsconfig.json',
+      './test/tsconfig.json',
+      './scripts/tsconfig.json', // Add this line
+    ],
+  },
+};
diff --git a/scripts/confidence-test.ts b/scripts/confidence-test.ts
new file mode 100644
index 00000000..8c86b85d
--- /dev/null
+++ b/scripts/confidence-test.ts
@@ -0,0 +1,254 @@
+/*
+ * Copyright 2025, Salesforce, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import { dirname } from 'node:path';
+import { printTable } from '@oclif/table';
+import { stdout, colorize } from '@oclif/core/ux';
+import { Command, Flags, flush, handle } from '@oclif/core';
+import { makeGatewayRequests } from './utils/gateway.js';
+import { getToolsList, InvocableTool } from './utils/tools.js';
+import { TABLE_STYLE } from './utils/table.js';
+import { readYamlFile } from './utils/yaml.js';
+
+const castToArray = <T>(value: T | T[]): T[] => (Array.isArray(value) ? value : [value]);
+
+/** Sends the prompt(s) to every model, prints a comparison table, and returns the invoked tool names per model. */
+async function compareModelOutputs(
+  prompt: string | string[],
+  models: string[],
+  tools: InvocableTool[]
+): Promise<Record<string, string[]>> {
+  const prompts = castToArray(prompt);
+  const responses = await Promise.all(models.map((model) => makeGatewayRequests(prompts, model, tools)));
+
+  const invokedTools = responses.reduce<Record<string, string[]>>((acc, response) => {
+    // eslint-disable-next-line no-param-reassign
+    acc[response.model] = response.responses.flatMap(
+      (r) => r.generation_details?.generations[0]?.tool_invocations?.[0]?.function.name ?? []
+    );
+    return acc;
+  }, {});
+
+  printTable({
+    title: `${colorize('yellowBright', 'Prompt')}:\n  - ${prompts.join('\n  - ')}`,
+    data: responses.map((response) => ({
+      model: response.model,
+      chat: response.messages.map((m) => `${colorize('bold', m.role)}: ${m.content}`).join('\n\n'),
+      tools: response.responses
+        .map((r, index) => {
+          const toolInvocation = r.generation_details?.generations[0]?.tool_invocations?.[0];
+          if (!toolInvocation) {
+            return `Message ${index + 1}: No tool invoked`;
+          }
+          // `arguments` is JSON text from the model; treat a missing or empty string as "no arguments".
+          const toolArgs = JSON.parse(toolInvocation.function.arguments || '{}') as Record<string, unknown>;
+          const argsString = Object.entries(toolArgs)
+            .map(([key, value]) => `  - ${key}: ${typeof value === 'string' ? value : JSON.stringify(value)}`)
+            .join('\n');
+
+          const heading = `Message ${index + 1}: ${colorize('bold', toolInvocation.function.name)}`;
+          return argsString ? `${heading}\n${argsString}` : heading;
+        })
+        .join('\n\n'),
+    })),
+    columns: [
+      { key: 'model', width: '30%' },
+      { key: 'chat', width: '40%' },
+      { key: 'tools', width: '30%', name: 'Tool Invocations' },
+    ],
+    width: process.stdout.columns,
+    ...TABLE_STYLE,
+  });
+
+  return invokedTools;
+}
+
+export default class ConfidenceTest extends Command {
+  public static id = 'confidence-test';
+  public static summary = 'Test the MCP server against the LLM Gateway API';
+  public static description = `Tests that the MCP server tools are accurately invoked by various LLM models.
+
+Configuration:
+- Uses a YAML file (default: test-assets/compare-responses.yml) to specify models and test prompts
+- Override the YAML file using the --file flag
+- Requires SF_LLMG_API_KEY environment variable
+
+YAML File Format:
+The YAML file should contain:
+- models: Array of model identifiers to test against
+- prompts: Array of test prompts (can be strings or arrays of strings for multi-turn conversations)
+
+Example YAML structure:
+  expected-tool: sf-deploy-metadata
+  models:
+    - llmgateway__OpenAIGPT35Turbo_01_25
+    - llmgateway__OpenAIGPT4OmniMini
+  prompts:
+    - "What's my salesforce username?"
+    - ["I am a Salesforce developer", "Deploy my project"]
+    - - I am a Salesforce developer.
+      - Deploy my project
+
+For available models, see:
+https://git.soma.salesforce.com/pages/tech-enablement/einstein/docs/gateway/models-and-providers/`;
+
+  public static flags = {
+    file: Flags.file({
+      summary: 'The YAML file to use for the response',
+      description: 'Must contain array of models and prompts',
+      default: 'test-assets/compare-responses.yml',
+      exists: true,
+      char: 'f',
+    }),
+    help: Flags.help({
+      description: 'Show help',
+      char: 'h',
+    }),
+    'confidence-level': Flags.integer({
+      summary: 'Confidence level for the tool',
+      description: 'If confidence level is below this value, command will fail',
+      min: 0,
+      max: 100,
+      default: 50,
+    }),
+    runs: Flags.integer({
+      summary: 'Number of runs to use for confidence level',
+      description: 'If specified, will run the tool multiple times to determine confidence level',
+      min: 1,
+      default: 5,
+      char: 'r',
+    }),
+  };
+
+  /** Loads the YAML config, runs each prompt `--runs` times per model, and throws when confidence is too low. */
+  public async run(): Promise<void> {
+    const { flags } = await this.parse(ConfidenceTest);
+
+    const yamlObj = await readYamlFile<{
+      'expected-tool': string;
+      models: string[];
+      prompts: Array<string | string[]>;
+    }>(flags.file);
+
+    if (!yamlObj.models?.length) {
+      throw new Error('At least one model is required');
+    }
+
+    if (!yamlObj.prompts?.length) {
+      throw new Error('At least one prompt is required');
+    }
+
+    if (!yamlObj['expected-tool']) {
+      throw new Error('Expected tool is required in the YAML file');
+    }
+
+    stdout('Expected Tool:');
+    stdout(`  - ${yamlObj['expected-tool']}`);
+
+    stdout('Models:');
+    yamlObj.models.forEach((model) => stdout(`  - ${model}`));
+
+    stdout();
+    stdout('Prompts:');
+    yamlObj.prompts.forEach((prompt) => {
+      if (Array.isArray(prompt)) {
+        stdout(`  - - ${prompt.join('\n    - ')}`);
+      } else {
+        stdout(`  - ${prompt}`);
+      }
+    });
+
+    stdout();
+    const tools = await getToolsList({ verbose: true });
+    stdout();
+
+    const runLog: Record<string, Record<string, string[][]>> = {};
+
+    for (let runIndex = 0; runIndex < flags.runs; runIndex += 1) {
+      for (const prompt of yamlObj.prompts) {
+        // eslint-disable-next-line no-await-in-loop
+        const invokedTools = await compareModelOutputs(prompt, yamlObj.models, tools);
+        const promptKey = Array.isArray(prompt) ? prompt.join(' ') : prompt;
+        runLog[promptKey] = runLog[promptKey] || {};
+        Object.entries(invokedTools).forEach(([model, iTools]) => {
+          runLog[promptKey][model] = runLog[promptKey][model] || [];
+          runLog[promptKey][model].push(iTools);
+        });
+      }
+    }
+
+    stdout();
+    let pass = true;
+    for (const [prompt, models] of Object.entries(runLog)) {
+      const tableData = Object.entries(models).map(([model, runs]) => {
+        // Count runs (not individual turns) that invoked the expected tool, so confidence cannot exceed 100%.
+        const expectedToolCount = runs.filter((run) => run.includes(yamlObj['expected-tool'])).length;
+        const totalRuns = runs.length;
+        const confidenceLevel = Math.round((expectedToolCount / totalRuns) * 100);
+        const passed = confidenceLevel >= flags['confidence-level'];
+        pass = pass && passed;
+
+        return {
+          model,
+          expectedTool: yamlObj['expected-tool'],
+          invocations: `${expectedToolCount}/${totalRuns}`,
+          actualInvocations: runs.map((r) => r.join(', ')).join('\n'),
+          confidence: `${confidenceLevel}%`,
+          status: passed ? colorize('green', 'PASS') : colorize('red', 'FAIL'),
+        };
+      });
+
+      printTable({
+        title: `Results for prompt:\n${colorize('yellowBright', prompt)}`,
+        data: tableData,
+        columns: [
+          { key: 'model', name: 'Model' },
+          { key: 'expectedTool', name: 'Expected Tool Invocation' },
+          { key: 'actualInvocations', name: 'Actual Tool Invocations' },
+          { key: 'invocations', name: 'Invocation Count' },
+          { key: 'confidence', name: 'Confidence' },
+          { key: 'status', name: 'Status' },
+        ],
+        ...TABLE_STYLE,
+      });
+    }
+
+    if (!pass) {
+      throw new Error('Confidence level not met');
+    }
+  }
+}
+
+ConfidenceTest.run(process.argv.slice(2), {
+  root: dirname(import.meta.dirname),
+  pjson: {
+    name: 'confidence-test',
+    version: '0.0.1',
+    oclif: {
+      commands: {
+        strategy: 'single',
+        target: 'scripts/confidence-test.js',
+      },
+    },
+  },
+}).then(
+  async () => {
+    await flush();
+  },
+  async (err) => {
+    await handle(err as Error);
+  }
+);
diff --git a/scripts/tsconfig.json b/scripts/tsconfig.json
new file mode 100644
index 00000000..93273c54
--- /dev/null
+++ b/scripts/tsconfig.json
@@ -0,0 +1,8 @@
+{
+  "extends": "../tsconfig.json",
+  "compilerOptions": {
+    "outDir": "../dist/scripts",
+    "rootDir": "."
+  },
+  "include": ["**/*"]
+}
diff --git a/scripts/utils/gateway.ts b/scripts/utils/gateway.ts
new file mode 100644
index 00000000..f1a2c879
--- /dev/null
+++ b/scripts/utils/gateway.ts
@@ -0,0 +1,139 @@
+/*
+ * Copyright 2025, Salesforce, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import type { InvocableTool } from './tools.js';
+
+const API_KEY = process.env.SF_LLMG_API_KEY;
+process.env.NODE_TLS_REJECT_UNAUTHORIZED = '0'; // NOTE(review): turns off TLS certificate checks process-wide
+if (!API_KEY) {
+  throw new Error('SF_LLMG_API_KEY is not set');
+}
+type GatewayResponse = {
+  generation_details?: {
+    generations: Array<{
+      content: string;
+      role: string;
+      tool_invocations?: Array<{
+        id: string;
+        function: {
+          name: string;
+          arguments: string;
+        };
+      }>;
+    }>;
+  };
+};
+
+const createRequestHeaders = (): Record<string, string> => ({
+  Authorization: `API_KEY ${API_KEY}`,
+  'Content-Type': 'application/json',
+  // We need to figure out which tenant, context, and feature id to use
+  // Maybe this is something that will be given to us once the client registration completes???
+  'x-sfdc-core-tenant-id': 'core/prod1/00DDu0000008cuqMAA',
+  'x-sfdc-app-context': 'EinsteinGPT',
+  'x-client-feature-id': 'EinsteinDocsAnswers',
+});
+
+const createRequestBody = (
+  model: string,
+  tools: InvocableTool[],
+  messages: Array<{ role: string; content: string }>
+): string =>
+  JSON.stringify({
+    model,
+    tools,
+    tool_config: {
+      mode: 'auto',
+    },
+    messages,
+    generation_settings: {
+      max_tokens: 500,
+      temperature: 0.5,
+      parameters: {},
+    },
+  });
+
+const makeSingleGatewayRequest = async (
+  model: string,
+  tools: InvocableTool[],
+  messages: Array<{ role: string; content: string }>
+): Promise<GatewayResponse> => {
+  const response = await fetch(
+    'https://bot-svc-llm.sfproxy.einsteintest1.test1-uswest2.aws.sfdc.cl/v1.0/chat/generations',
+    {
+      method: 'POST',
+      headers: createRequestHeaders(),
+      body: createRequestBody(model, tools, messages),
+    }
+  );
+
+  if (!response.ok) {
+    throw new Error(`HTTP ${response.status}: ${response.statusText}`);
+  }
+
+  const responseData = await response.json();
+  return responseData as GatewayResponse;
+};
+
+/**
+ * Makes requests to the LLM Gateway API for multiple prompts using the specified model and tools.
+ *
+ * @param {string[]} prompts - Array of prompts to send to the API
+ * @param {string} model - The model identifier to use for generation (e.g., 'llmgateway__AzureOpenAIGPT4Omni')
+ * @param {InvocableTool[]} tools - Array of tools that can be invoked by the model
+ * @returns {Promise<{model: string, messages: Array<{role: string, content: string}>, responses: GatewayResponse[]}>} Object containing the model used, conversation messages, and API responses
+ * @throws {Error} If any API request fails or returns an error
+ *
+ * @see {@link https://git.soma.salesforce.com/pages/tech-enablement/einstein/docs/gateway/get-started/#make-your-first-gateway-request} Make Your First Gateway Request Documentation
+ * @see {@link https://git.soma.salesforce.com/pages/tech-enablement/einstein/docs/gateway/models-and-providers/} Models and Providers Documentation
+ * @see {@link https://git.soma.salesforce.com/pages/tech-enablement/einstein/docs/gateway/apis/rest/#operation/chatMessages} REST API Documentation
+ * @see {@link https://git.soma.salesforce.com/pages/tech-enablement/einstein/docs/gateway/function-calling/} Function Calling Documentation
+ * @see {@link https://git.soma.salesforce.com/pages/tech-enablement/einstein/docs/gateway/get-started/auth/#api-key-limitations} API Key Limitations Documentation
+ */
+export const makeGatewayRequests = async (
+  prompts: string[],
+  model: string,
+  tools: InvocableTool[]
+): Promise<{ model: string; messages: Array<{ role: string; content: string }>; responses: GatewayResponse[] }> => {
+  const messages: Array<{ role: string; content: string }> = [];
+  const responses: GatewayResponse[] = [];
+
+  for (const prompt of prompts) {
+    // Add the current prompt to messages
+    messages.push({
+      role: 'user',
+      content: prompt,
+    });
+
+    // eslint-disable-next-line no-await-in-loop
+    const responseData = await makeSingleGatewayRequest(model, tools, messages);
+    responses.push(responseData);
+
+    // Add the assistant's response to messages for the next iteration
+    if (responseData.generation_details?.generations[0]?.content) {
+      messages.push({
+        role: responseData.generation_details.generations[0].role,
+        content: responseData.generation_details.generations[0].content,
+      });
+    }
+  }
+
+  return {
+    responses,
+    model,
+    messages,
+  };
+};
diff --git a/scripts/utils/models.ts b/scripts/utils/models.ts
new file mode 100644
index 00000000..dc99281e
--- /dev/null
+++ b/scripts/utils/models.ts
@@ -0,0 +1,30 @@
+/*
+ * Copyright 2025, Salesforce, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// See https://git.soma.salesforce.com/pages/tech-enablement/einstein/docs/gateway/models-and-providers/
+export const MODELS = [
+  'llmgateway__OpenAIGPT35Turbo_01_25',
+  'llmgateway__OpenAIGPT4OmniMini',
+  'llmgateway__BedrockAnthropicClaude4Sonnet',
+  'llmgateway__OpenAIGPT41Nano',
+  'llmgateway__OpenAIGPT41Mini',
+  'llmgateway__BedrockAnthropicClaude37Sonnet',
+  'llmgateway__BedrockAnthropicClaude3Opus',
+  'llmgateway__VertexAIGemini25Flash001',
+] as const;
+
+export type Model = (typeof MODELS)[number];
+export const DEFAULT_MODEL: Model = 'llmgateway__BedrockAnthropicClaude4Sonnet';
diff --git a/scripts/utils/table.ts b/scripts/utils/table.ts
new file mode 100644
index 00000000..784a47ae
--- /dev/null
+++ b/scripts/utils/table.ts
@@ -0,0 +1,26 @@
+/*
+ * Copyright 2025, Salesforce, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import { TableOptions } from '@oclif/table';
+
+export const TABLE_STYLE = {
+  headerOptions: {
+    formatter: 'capitalCase',
+    color: 'cyanBright',
+  },
+  borderColor: 'gray',
+  overflow: 'wrap',
+} satisfies Partial<TableOptions<Record<string, unknown>>>;
diff --git a/scripts/utils/tools.ts b/scripts/utils/tools.ts
new file mode 100644
index 00000000..aa653234
--- /dev/null
+++ b/scripts/utils/tools.ts
@@ -0,0 +1,109 @@
+/*
+ * Copyright 2025, Salesforce, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import { spawn } from 'node:child_process';
+import { Tool } from '@modelcontextprotocol/sdk/types.js';
+import { printTable } from '@oclif/table';
+import { colorize } from '@oclif/core/ux';
+import { encode as encodeGPT4oMini } from 'gpt-tokenizer/model/gpt-4o-mini';
+import { encode as encodeO3Mini } from 'gpt-tokenizer/model/o3-mini';
+import { encode as encodeGPT4 } from 'gpt-tokenizer/model/gpt-4';
+import { TABLE_STYLE } from './table.js';
+
+export type InvocableTool = {
+  name: string;
+  function: {
+    name: string;
+    description: string | undefined;
+    parameters: Tool['inputSchema'];
+    annotations: Tool['annotations'];
+  };
+};
+
+/** Lists the MCP server's tools via the MCP inspector CLI; when `verbose`, also prints per-tool token counts. */
+export const getToolsList = async ({ verbose }: { verbose: boolean }): Promise<InvocableTool[]> => {
+  const toolsList: string = await new Promise<string>((resolve, reject) => {
+    const serverArgs = ['node', 'bin/run.js', '--orgs', 'DEFAULT_TARGET_ORG'];
+    const inspectorArgs = ['@modelcontextprotocol/inspector', '--cli', ...serverArgs, '--method', 'tools/list'];
+    const child = spawn('npx', inspectorArgs);
+
+    let output = '';
+    let errorOutput = '';
+
+    child.stdout.on('data', (data: Buffer) => {
+      output += data.toString();
+    });
+
+    // Output on stderr is not necessarily fatal, so keep it for the error message instead of rejecting right away.
+    child.stderr.on('data', (data: Buffer) => {
+      errorOutput += data.toString();
+    });
+
+    // A process that cannot be spawned at all (e.g. `npx` is missing) reports through the 'error' event.
+    child.on('error', reject);
+
+    child.on('close', (code: number | null) => {
+      if (code === 0) {
+        resolve(output);
+      } else {
+        reject(new Error(`Process exited with code ${code}${errorOutput ? `: ${errorOutput.trim()}` : ''}`));
+      }
+    });
+  });
+
+  const parsedToolsList = JSON.parse(toolsList) as { tools?: Tool[] };
+  const tools = parsedToolsList.tools ?? [];
+
+  if (verbose) {
+    const toolsWithTokens = tools.map((tool) => ({
+      tool: tool.name,
+      tokensGPT4oMini: encodeGPT4oMini(JSON.stringify(tool)).length,
+      tokensO3Mini: encodeO3Mini(JSON.stringify(tool)).length,
+      tokensGPT4: encodeGPT4(JSON.stringify(tool)).length,
+    }));
+    toolsWithTokens.push({
+      tool: colorize('bold', 'TOTAL'),
+      tokensGPT4oMini: toolsWithTokens.reduce((acc, tool) => acc + tool.tokensGPT4oMini, 0),
+      tokensO3Mini: toolsWithTokens.reduce((acc, tool) => acc + tool.tokensO3Mini, 0),
+      tokensGPT4: toolsWithTokens.reduce((acc, tool) => acc + tool.tokensGPT4, 0),
+    });
+
+    printTable({
+      title: 'Tools List',
+      data: toolsWithTokens,
+      columns: [
+        'tool',
+        { key: 'tokensGPT4oMini', name: 'GPT 4o Mini' },
+        { key: 'tokensO3Mini', name: 'O3 Mini' },
+        { key: 'tokensGPT4', name: 'GPT 4' },
+      ],
+      titleOptions: {
+        color: 'yellowBright',
+      },
+      ...TABLE_STYLE,
+    });
+  }
+
+  return tools.map((tool) => ({
+    name: tool.name,
+    function: {
+      name: tool.name,
+      description: tool.description,
+      parameters: tool.inputSchema,
+      annotations: tool.annotations,
+    },
+  }));
+};
diff --git a/scripts/utils/yaml.ts b/scripts/utils/yaml.ts
new file mode 100644
index 00000000..586c3281
--- /dev/null
+++ b/scripts/utils/yaml.ts
@@ -0,0 +1,29 @@
+/*
+ * Copyright 2025, Salesforce, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import fs from 'node:fs/promises';
+import yaml from 'yaml';
+
+export async function readYamlFile<T>(filePath: string): Promise<T> {
+  try {
+    const fileContent = await fs.readFile(filePath, 'utf8');
+    return yaml.parse(fileContent) as T;
+  } catch (error) {
+    throw new Error(
+      `Failed to read or parse YAML file at ${filePath}: ${error instanceof Error ? error.message : String(error)}`
+    );
+  }
+}
diff --git a/test-assets/compare-responses.yml b/test-assets/compare-responses.yml
new file mode 100644
index 00000000..50c768f6
--- /dev/null
+++ b/test-assets/compare-responses.yml
@@ -0,0 +1,34 @@
+models:
+  # - llmgateway__OpenAIGPT35Turbo_01_25
+  - llmgateway__OpenAIGPT4OmniMini
+  - llmgateway__OpenAIGPT41Nano
+  - llmgateway__BedrockAnthropicClaude4Sonnet
+
+expected-tool: sf-deploy-metadata
+
+prompts:
+  # Sparse prompts
+  # - Deploy my changes to the dreamhouse org
+  # - Deploy the manifest in dreamhouse-lwc/force-app/main/default/package.xml to dreamhouse
+  # - Deploy all metadata in dreamhouse-lwc/force-app/main/default to dreamhouse
+
+  # # Context-rich prompts
+  - - My current working directory is /Users/mdonnalley/repos/trailheadapps/dreamhouse-lwc. Please deploy the source in force-app/main/default/classes/Broker.cls to my dreamhouse org.
+
+  # - - I am working on the dreamhouse-lwc project. My org alias is dreamhouse.
+  #   - Deploy only the Lightning Web Components in force-app/main/default/lwc to dreamhouse.
+
+  # - - I want to deploy only the Property object and its fields from dreamhouse-lwc/force-app/main/default/objects/Property__c to dreamhouse.
+  #   - Run all local tests during deployment.
+
+  # - - My org alias is dreamhouse and my project is dreamhouse-lwc.
+  #   - Deploy the changes in force-app/main/default/permissionsets to dreamhouse and run the BrokerTest Apex test.
+
+  # - - I am a Salesforce developer. My org is dreamhouse.
+  #   - Deploy the manifest file dreamhouse-lwc/manifest/package.xml to my org and run all tests.
+
+  # - - I am in dreamhouse-lwc. My org alias is dreamhouse.
+  #   - Deploy only the layouts in force-app/main/default/layouts to dreamhouse.
+
+  # - - My org alias is dreamhouse.
+  #   - Deploy the source in force-app/main/default/objects/Contact__c and run no tests.
diff --git a/test/llmg.ts b/test/llmg.ts
deleted file mode 100644
index 09930866..00000000
--- a/test/llmg.ts
+++ /dev/null
@@ -1,389 +0,0 @@
-/*
- * Copyright 2025, Salesforce, Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-const API_KEY = process.env.SF_LLMG_API_KEY;
-process.env.NODE_TLS_REJECT_UNAUTHORIZED = '0';
-
-if (!API_KEY) {
-  throw new Error('SF_LLMG_API_KEY is not set');
-}
-
-import { spawn } from 'node:child_process';
-import fs from 'node:fs/promises';
-import { dirname } from 'node:path';
-import { Tool } from '@modelcontextprotocol/sdk/types.js';
-import { printTable, TableOptions } from '@oclif/table';
-import { stdout, colorize } from '@oclif/core/ux';
-import yaml from 'yaml';
-import { Command, Flags, flush, handle } from '@oclif/core';
-import { encode as encodeGPT4oMini } from 'gpt-tokenizer/model/gpt-4o-mini';
-import { encode as encodeO3Mini } from 'gpt-tokenizer/model/o3-mini';
-import { encode as encodeGPT4 } from 'gpt-tokenizer/model/gpt-4';
-
-const TABLE_STYLE = {
-  headerOptions: {
-    formatter: 'capitalCase',
-    color: 'cyanBright',
-  },
-  borderColor: 'gray',
-  overflow: 'wrap',
-} satisfies Partial<TableOptions<Record<string, unknown>>>;
-
-type InvocableTool = {
-  name: string;
-  function: {
-    name: string;
-    description: string | undefined;
-    parameters: Tool['inputSchema'];
-    annotations: Tool['annotations'];
-  };
-};
-
-type GatewayResponse = {
-  generation_details?: {
-    generations: Array<{
-      content: string;
-      role: string;
-      tool_invocations?: Array<{
-        id: string;
-        function: {
-          name: string;
-          arguments: string;
-        };
-      }>;
-    }>;
-  };
-};
-
-const getToolsList = async (entryPoint: string): Promise<InvocableTool[]> => {
-  const toolsList: string = await new Promise<string>((resolve, reject) => {
-    const child = spawn('npx', [
-      '@modelcontextprotocol/inspector',
-      '--cli',
-      'node',
-      ...entryPoint.split(' '),
-      '--method',
-      'tools/list',
-    ]);
-
-    let output = '';
-
-    child.stdout.on('data', (data: Buffer) => {
-      output += data.toString();
-    });
-
-    child.stderr.on('data', (data: Buffer) => {
-      reject(new Error(data.toString()));
-    });
-
-    child.on('close', (code: number | null) => {
-      if (code === 0) {
-        resolve(output);
-      } else {
-        reject(new Error(`Process exited with code ${code}`));
-      }
-    });
-  });
-
-  const parsedToolsList = JSON.parse(toolsList) as { tools: Tool[] };
-
-  const toolsWithTokens = parsedToolsList.tools?.map((tool) => ({
-    tool: tool.name,
-    tokensGPT4oMini: encodeGPT4oMini(JSON.stringify(tool)).length,
-    tokensO3Mini: encodeO3Mini(JSON.stringify(tool)).length,
-    tokensGPT4: encodeGPT4(JSON.stringify(tool)).length,
-  }));
-  toolsWithTokens.push({
-    tool: colorize('bold', 'TOTAL'),
-    tokensGPT4oMini: toolsWithTokens.reduce((acc, tool) => acc + tool.tokensGPT4oMini, 0),
-    tokensO3Mini: toolsWithTokens.reduce((acc, tool) => acc + tool.tokensO3Mini, 0),
-    tokensGPT4: toolsWithTokens.reduce((acc, tool) => acc + tool.tokensGPT4, 0),
-  });
-
-  printTable({
-    title: 'Tools List',
-    data: toolsWithTokens,
-    columns: [
-      'tool',
-      { key: 'tokensGPT4oMini', name: 'GPT 4o Mini' },
-      { key: 'tokensO3Mini', name: 'O3 Mini' },
-      { key: 'tokensGPT4', name: 'GPT 4' },
-    ],
-    titleOptions: {
-      color: 'yellowBright',
-    },
-    ...TABLE_STYLE,
-  });
-
-  return (parsedToolsList.tools ?? []).map((tool) => ({
-    name: tool.name,
-    function: {
-      name: tool.name,
-      description: tool.description,
-      parameters: tool.inputSchema,
-      annotations: tool.annotations,
-    },
-  }));
-};
-
-const createRequestHeaders = (): Record<string, string> => ({
-  Authorization: `API_KEY ${API_KEY}`,
-  'Content-Type': 'application/json',
-  // We need to figure out which tenant, context, and feature id to use
-  // Maybe this is something that will be given to us once the client registration completes???
-  'x-sfdc-core-tenant-id': 'core/prod1/00DDu0000008cuqMAA',
-  'x-sfdc-app-context': 'EinsteinGPT',
-  'x-client-feature-id': 'EinsteinDocsAnswers',
-});
-
-const createRequestBody = (
-  model: string,
-  tools: InvocableTool[],
-  messages: Array<{ role: string; content: string }>
-): string =>
-  JSON.stringify({
-    model,
-    tools,
-    tool_config: {
-      mode: 'auto',
-    },
-    messages,
-    generation_settings: {
-      max_tokens: 500,
-      temperature: 0.5,
-      parameters: {},
-    },
-  });
-
-const makeSingleGatewayRequest = async (
-  model: string,
-  tools: InvocableTool[],
-  messages: Array<{ role: string; content: string }>
-): Promise<GatewayResponse> => {
-  const response = await fetch(
-    'https://bot-svc-llm.sfproxy.einsteintest1.test1-uswest2.aws.sfdc.cl/v1.0/chat/generations',
-    {
-      method: 'POST',
-      headers: createRequestHeaders(),
-      body: createRequestBody(model, tools, messages),
-    }
-  );
-
-  if (!response.ok) {
-    throw new Error(`HTTP ${response.status}: ${response.statusText}`);
-  }
-
-  const responseData = await response.json();
-  return responseData as GatewayResponse;
-};
-
-/**
- * Makes requests to the LLM Gateway API for multiple prompts using the specified model and tools.
- *
- * @param {string[]} prompts - Array of prompts to send to the API
- * @param {string} model - The model identifier to use for generation (e.g., 'llmgateway__AzureOpenAIGPT4Omni')
- * @param {InvocableTool[]} tools - Array of tools that can be invoked by the model
- * @returns {Promise<{model: string, messages: Array<{role: string, content: string}>, responses: GatewayResponse[]}>} Object containing the model used, conversation messages, and API responses
- * @throws {Error} If any API request fails or returns an error
- *
- * @see {@link https://git.soma.salesforce.com/pages/tech-enablement/einstein/docs/gateway/get-started/#make-your-first-gateway-request} Make Your First Gateway Request Documentation
- * @see {@link https://git.soma.salesforce.com/pages/tech-enablement/einstein/docs/gateway/models-and-providers/} Models and Providers Documentation
- * @see {@link https://git.soma.salesforce.com/pages/tech-enablement/einstein/docs/gateway/apis/rest/#operation/chatMessages} REST API Documentation
- * @see {@link https://git.soma.salesforce.com/pages/tech-enablement/einstein/docs/gateway/function-calling/} Function Calling Documentation
- * @see {@link https://git.soma.salesforce.com/pages/tech-enablement/einstein/docs/gateway/get-started/auth/#api-key-limitations} API Key Limitations Documentation
- */
-const makeGatewayRequests = async (
-  prompts: string[],
-  model: string,
-  tools: InvocableTool[]
-): Promise<{ model: string; messages: Array<{ role: string; content: string }>; responses: GatewayResponse[] }> => {
-  const messages: Array<{ role: string; content: string }> = [];
-  const responses: GatewayResponse[] = [];
-
-  for (const prompt of prompts) {
-    // Add the current prompt to messages
-    messages.push({
-      role: 'user',
-      content: prompt,
-    });
-
-    // eslint-disable-next-line no-await-in-loop
-    const responseData = await makeSingleGatewayRequest(model, tools, messages);
-    responses.push(responseData);
-
-    // Add the assistant's response to messages for the next iteration
-    if (responseData.generation_details?.generations[0]?.content) {
-      messages.push({
-        role: responseData.generation_details.generations[0].role,
-        content: responseData.generation_details.generations[0].content,
-      });
-    }
-  }
-
-  return {
-    responses,
-    model,
-    messages,
-  };
-};
-
-const castToArray = <T>(value: T | T[]): T[] => (Array.isArray(value) ? value : [value]);
-
-async function compareModelOutputs(prompt: string | string[], models: string[], tools: InvocableTool[]) {
-  const prompts = castToArray(prompt);
-  const responses = await Promise.all(models.map((model) => makeGatewayRequests(prompts, model, tools)));
-
-  printTable({
-    title: `${colorize('yellowBright', 'Prompt')}:\n  - ${prompts.join('\n  - ')}`,
-    data: responses.map((response) => ({
-      model: response.model,
-      chat: response.messages.map((m) => `${colorize('bold', m.role)}: ${m.content}`).join('\n\n'),
-      tools: response.responses
-        .map((r, index) => {
-          const toolInvocation = r.generation_details?.generations[0].tool_invocations?.[0];
-          if (!toolInvocation) {
-            return `Message ${index + 1}: No tool invoked`;
-          }
-
-          const toolArgs = JSON.parse(toolInvocation.function.arguments ?? '{}') as Record<string, string>;
-          const argsString = Object.entries(toolArgs)
-            .map(([key, value]) => `  - ${key}: ${value}`)
-            .join('\n');
-
-          return `Message ${index + 1}: ${colorize('bold', toolInvocation.function.name)}${
-            argsString ? `\n${argsString}` : ''
-          }`;
-        })
-        .join('\n\n'),
-    })),
-    columns: [
-      { key: 'model', width: '30%' },
-      { key: 'chat', width: '40%' },
-      { key: 'tools', width: '30%', name: 'Tool Invocations' },
-    ],
-    width: process.stdout.columns,
-    ...TABLE_STYLE,
-  });
-}
-
-export default class LLMGatewayTest extends Command {
-  public static id = 'llm-gateway-test';
-  public static summary = 'Test the MCP server against the LLM Gateway API';
-  public static description = `Tests that the MCP server tools are accurately invoked by various LLM models.
-
-Configuration:
-- Uses a YAML file (default: llmg-test.yml) to specify models and test prompts
-- Override the YAML file using the --file flag
-- Requires SF_LLMG_API_KEY environment variable
-
-YAML File Format:
-The YAML file should contain:
-- models: Array of model identifiers to test against
-- prompts: Array of test prompts (can be strings or arrays of strings for multi-turn conversations)
-
-Example YAML structure:
-  models:
-    - llmgateway__OpenAIGPT35Turbo_01_25
-    - llmgateway__OpenAIGPT4OmniMini
-  prompts:
-    - "What's my salesforce username?"
-    - ["I am a Salesforce developer", "Deploy my project"]
-    - - I am a Salesforce developer.
-      - Deploy my project
-
-For available models, see:
-https://git.soma.salesforce.com/pages/tech-enablement/einstein/docs/gateway/models-and-providers/`;
-
-  public static flags = {
-    'entry-point': Flags.string({
-      summary: 'The entry point to the MCP server',
-      default: 'bin/run.js -o DEFAULT_TARGET_ORG',
-      char: 'e',
-    }),
-    file: Flags.file({
-      summary: 'The YAML file to use for the response',
-      description: 'Must contain array of models and prompts',
-      default: 'llmg-test.yml',
-      exists: true,
-      char: 'f',
-    }),
-    help: Flags.help({
-      description: 'Show help',
-      char: 'h',
-    }),
-  };
-
-  public async run(): Promise<void> {
-    const { flags } = await this.parse(LLMGatewayTest);
-
-    const yamlContents = await fs.readFile(flags.file, 'utf8');
-    const yamlObj = yaml.parse(yamlContents) as {
-      models?: string[];
-      prompts?: Array<string | string[]>;
-    };
-
-    if (!yamlObj.models?.length) {
-      throw new Error('At least one model is required');
-    }
-
-    if (!yamlObj.prompts?.length) {
-      throw new Error('At least one prompt is required');
-    }
-
-    stdout('Models:');
-    yamlObj.models.forEach((model) => stdout(`  - ${model}`));
-
-    stdout();
-    stdout('Prompts:');
-    yamlObj.prompts.forEach((prompt) => {
-      if (Array.isArray(prompt)) {
-        stdout(`  - - ${prompt.join('\n    - ')}`);
-      } else {
-        stdout(`  - ${prompt}`);
-      }
-    });
-
-    stdout();
-    const tools = await getToolsList(flags['entry-point']);
-    stdout();
-
-    for (const prompt of yamlObj.prompts) {
-      // eslint-disable-next-line no-await-in-loop
-      await compareModelOutputs(prompt, yamlObj.models, tools);
-    }
-  }
-}
-
-LLMGatewayTest.run(process.argv.slice(2), {
-  root: dirname(import.meta.dirname),
-  pjson: {
-    name: 'llm-gateway-test',
-    version: '0.0.1',
-    oclif: {
-      commands: {
-        strategy: 'single',
-        target: 'test/llmg.js',
-      },
-    },
-  },
-}).then(
-  async () => {
-    await flush();
-  },
-  async (err) => {
-    await handle(err as Error);
-  }
-);

From 1fedd54c1f6214f1bb69ad4216bc680f00c95202 Mon Sep 17 00:00:00 2001
From: Mike Donnalley <mdonnalley@salesforce.com>
Date: Fri, 18 Jul 2025 16:17:01 -0600
Subject: [PATCH 30/51] refactor: use model const

---
 scripts/confidence-test.ts |  5 +++--
 scripts/utils/gateway.ts   | 11 +++++++----
 2 files changed, 10 insertions(+), 6 deletions(-)

diff --git a/scripts/confidence-test.ts b/scripts/confidence-test.ts
index 8c86b85d..e83cb6b0 100644
--- a/scripts/confidence-test.ts
+++ b/scripts/confidence-test.ts
@@ -22,12 +22,13 @@ import { makeGatewayRequests } from './utils/gateway.js';
 import { getToolsList, InvocableTool } from './utils/tools.js';
 import { TABLE_STYLE } from './utils/table.js';
 import { readYamlFile } from './utils/yaml.js';
+import { Model } from './utils/models.js';
 
 const castToArray = <T>(value: T | T[]): T[] => (Array.isArray(value) ? value : [value]);
 
 async function compareModelOutputs(
   prompt: string | string[],
-  models: string[],
+  models: Model[],
   tools: InvocableTool[]
 ): Promise<Record<string, string[]>> {
   const prompts = castToArray(prompt);
@@ -137,7 +138,7 @@ https://git.soma.salesforce.com/pages/tech-enablement/einstein/docs/gateway/mode
 
     const yamlObj = await readYamlFile<{
       'expected-tool': string;
-      models: string[];
+      models: Model[];
       prompts: Array<string | string[]>;
     }>(flags.file);
 
diff --git a/scripts/utils/gateway.ts b/scripts/utils/gateway.ts
index f1a2c879..d3cfe1d1 100644
--- a/scripts/utils/gateway.ts
+++ b/scripts/utils/gateway.ts
@@ -14,6 +14,9 @@
  * limitations under the License.
  */
 
+import { Model } from './models.js';
+import { InvocableTool } from './tools.js';
+
 const API_KEY = process.env.SF_LLMG_API_KEY;
 process.env.NODE_TLS_REJECT_UNAUTHORIZED = '0';
 
@@ -48,7 +51,7 @@ const createRequestHeaders = (): Record<string, string> => ({
 });
 
 const createRequestBody = (
-  model: string,
+  model: Model,
   tools: InvocableTool[],
   messages: Array<{ role: string; content: string }>
 ): string =>
@@ -67,7 +70,7 @@ const createRequestBody = (
   });
 
 const makeSingleGatewayRequest = async (
-  model: string,
+  model: Model,
   tools: InvocableTool[],
   messages: Array<{ role: string; content: string }>
 ): Promise<GatewayResponse> => {
@@ -105,9 +108,9 @@ const makeSingleGatewayRequest = async (
  */
 export const makeGatewayRequests = async (
   prompts: string[],
-  model: string,
+  model: Model,
   tools: InvocableTool[]
-): Promise<{ model: string; messages: Array<{ role: string; content: string }>; responses: GatewayResponse[] }> => {
+): Promise<{ model: Model; messages: Array<{ role: string; content: string }>; responses: GatewayResponse[] }> => {
   const messages: Array<{ role: string; content: string }> = [];
   const responses: GatewayResponse[] = [];
 

From 343cc6cb5018729a3c0c5a2ca9c9c620ff3d15d3 Mon Sep 17 00:00:00 2001
From: Mike Donnalley <mdonnalley@salesforce.com>
Date: Mon, 21 Jul 2025 15:24:00 -0600
Subject: [PATCH 31/51] refactor: make into confidence test

---
 package.json                       |   2 +-
 scripts/confidence-test.ts         | 431 ++++++++++++++++++++---------
 scripts/utils/gateway.ts           |  16 +-
 scripts/utils/tools.ts             |  56 ++--
 test-assets/compare-responses.yml  |  34 ---
 test-assets/sf-deploy-metadata.yml |  27 ++
 yarn.lock                          |   8 +-
 7 files changed, 379 insertions(+), 195 deletions(-)
 delete mode 100644 test-assets/compare-responses.yml
 create mode 100644 test-assets/sf-deploy-metadata.yml

diff --git a/package.json b/package.json
index 42fdd6bf..9545b937 100644
--- a/package.json
+++ b/package.json
@@ -56,7 +56,7 @@
   },
   "devDependencies": {
     "@modelcontextprotocol/inspector": "^0.15.0",
-    "@oclif/table": "^0.4.8",
+    "@oclif/table": "^0.4.9",
     "@salesforce/cli-plugins-testkit": "^5.3.39",
     "@salesforce/dev-scripts": "11.0.2",
     "@types/node": "^22.16.3",
diff --git a/scripts/confidence-test.ts b/scripts/confidence-test.ts
index e83cb6b0..ad374313 100644
--- a/scripts/confidence-test.ts
+++ b/scripts/confidence-test.ts
@@ -18,63 +18,99 @@ import { dirname } from 'node:path';
 import { printTable } from '@oclif/table';
 import { stdout, colorize } from '@oclif/core/ux';
 import { Command, Flags, flush, handle } from '@oclif/core';
+import { z } from 'zod';
 import { makeGatewayRequests } from './utils/gateway.js';
 import { getToolsList, InvocableTool } from './utils/tools.js';
 import { TABLE_STYLE } from './utils/table.js';
 import { readYamlFile } from './utils/yaml.js';
 import { Model } from './utils/models.js';
 
+const Spec = z.object({
+  models: z.array(z.custom<Model>()),
+  'initial-context': z.array(z.string()).optional(),
+  tests: z.array(
+    z.object({
+      prompts: z.union([z.string(), z.array(z.string())]),
+      'expected-tool': z.string(),
+      'expected-parameters': z.record(z.string(), z.string()).optional(),
+      'expected-tool-confidence': z.number(),
+      'expected-parameter-confidence': z.number().optional(),
+    })
+  ),
+});
+
+type Spec = z.infer<typeof Spec>;
+
 const castToArray = <T>(value: T | T[]): T[] => (Array.isArray(value) ? value : [value]);
 
+const groupBy = <T, K extends string | number | symbol>(array: T[], key: (item: T) => K): Record<K, T[]> =>
+  array.reduce<Record<K, T[]>>((result, item) => {
+    const groupKey = key(item);
+    if (!result[groupKey]) {
+      return { ...result, [groupKey]: [item] };
+    }
+    return { ...result, [groupKey]: [...result[groupKey], item] };
+    // eslint-disable-next-line
+  }, {} as Record<K, T[]>);
+
 async function compareModelOutputs(
   prompt: string | string[],
-  models: Model[],
+  spec: Spec,
   tools: InvocableTool[]
-): Promise<Record<string, string[]>> {
+): Promise<{
+  tableData: Array<{ model: Model; chat: string; tools: string }>;
+  invocations: Record<string, Array<{ tool: string; parameters: Record<string, string> }>>;
+}> {
+  const models = spec.models;
   const prompts = castToArray(prompt);
-  const responses = await Promise.all(models.map((model) => makeGatewayRequests(prompts, model, tools)));
+  const responses = await Promise.all(
+    models.map((model) => makeGatewayRequests(prompts, model, tools, spec['initial-context']))
+  );
 
-  const invokedTools = responses.reduce<Record<string, string[]>>((acc, response) => {
-    // eslint-disable-next-line no-param-reassign
-    acc[response.model] = response.responses.flatMap(
-      (r) => r.generation_details?.generations[0].tool_invocations?.[0]?.function.name ?? []
-    );
-    return acc;
-  }, {});
-
-  printTable({
-    title: `${colorize('yellowBright', 'Prompt')}:\n  - ${prompts.join('\n  - ')}`,
-    data: responses.map((response) => ({
-      model: response.model,
-      chat: response.messages.map((m) => `${colorize('bold', m.role)}: ${m.content}`).join('\n\n'),
-      tools: response.responses
-        .map((r, index) => {
-          const toolInvocation = r.generation_details?.generations[0].tool_invocations?.[0];
-          if (!toolInvocation) {
-            return `Message ${index + 1}: No tool invoked`;
-          }
+  const invocations = responses.reduce<Record<string, Array<{ tool: string; parameters: Record<string, string> }>>>(
+    (acc, response) => {
+      const toolInvocations = response.responses.flatMap((r) => {
+        const toolInvocation = r.generation_details?.generations[0].tool_invocations?.[0];
+        if (!toolInvocation) return [];
+
+        const parameters: Record<string, string> = toolInvocation.function.arguments
+          ? (JSON.parse(toolInvocation.function.arguments) as Record<string, string>)
+          : {};
+
+        return [
+          {
+            tool: toolInvocation.function.name,
+            parameters,
+          },
+        ];
+      });
+      return { ...acc, [response.model]: toolInvocations };
+    },
+    {}
+  );
+
+  const tableData = responses.map((response) => ({
+    model: response.model,
+    chat: response.messages.map((m) => `${colorize('bold', m.role)}: ${m.content}`).join('\n\n'),
+    tools: response.responses
+      .map((r, index) => {
+        const toolInvocation = r.generation_details?.generations[0].tool_invocations?.[0];
+        if (!toolInvocation) {
+          return `Message ${index + 1}: No tool invoked`;
+        }
+
+        const toolArgs = JSON.parse(toolInvocation.function.arguments ?? '{}') as Record<string, string>;
+        const argsString = Object.entries(toolArgs)
+          .map(([key, value]) => `  - ${key}: ${value}`)
+          .join('\n');
 
-          const toolArgs = JSON.parse(toolInvocation.function.arguments ?? '{}') as Record<string, string>;
-          const argsString = Object.entries(toolArgs)
-            .map(([key, value]) => `  - ${key}: ${value}`)
-            .join('\n');
-
-          return `Message ${index + 1}: ${colorize('bold', toolInvocation.function.name)}${
-            argsString ? `\n${argsString}` : ''
-          }`;
-        })
-        .join('\n\n'),
-    })),
-    columns: [
-      { key: 'model', width: '30%' },
-      { key: 'chat', width: '40%' },
-      { key: 'tools', width: '30%', name: 'Tool Invocations' },
-    ],
-    width: process.stdout.columns,
-    ...TABLE_STYLE,
-  });
-
-  return invokedTools;
+        return `Message ${index + 1}: ${colorize('bold', toolInvocation.function.name)}${
+          argsString ? `\n${argsString}` : ''
+        }`;
+      })
+      .join('\n\n'),
+  }));
+  return { invocations, tableData };
 }
 
 export default class ConfidenceTest extends Command {
@@ -83,25 +119,37 @@ export default class ConfidenceTest extends Command {
   public static description = `Tests that the MCP server tools are accurately invoked by various LLM models.
 
 Configuration:
-- Uses a YAML file (default: test-assets/compare-responses.yml) to specify models and test prompts
-- Override the YAML file using the --file flag
+- Uses a YAML file to specify models and test prompts
 - Requires SF_LLMG_API_KEY environment variable
 
 YAML File Format:
 The YAML file should contain:
 - models: Array of model identifiers to test against
-- prompts: Array of test prompts (can be strings or arrays of strings for multi-turn conversations)
+- initial-context: Optional array of strings to set the initial context for the conversation
+- tests: Array of test objects with the following properties:
+  - prompts: String or array of strings for test prompts (supports multi-turn conversations)
+  - expected-tool: String identifying the expected tool to be invoked
+  - expected-parameters: Optional object with expected parameter key-value pairs
+  - expected-tool-confidence: Number representing the minimum confidence level (0-100)
 
 Example YAML structure:
-  expected-tool: sf-deploy-metadata
-  models:
-    - llmgateway__OpenAIGPT35Turbo_01_25
-    - llmgateway__OpenAIGPT4OmniMini
-  prompts:
-    - "What's my salesforce username?"
-    - ["I am a Salesforce developer", "Deploy my project"]
-    - - I am a Salesforce developer.
+models:
+  - llmgateway__OpenAIGPT35Turbo_01_25
+  - llmgateway__OpenAIGPT4OmniMini
+tests:
+  - prompts: "What's my salesforce username?"
+    expected-tool: sf-org-display
+    expected-tool-confidence: 80
+  - prompts: ["I am a Salesforce developer", "Deploy my project"]
+    expected-tool: sf-deploy-metadata
+    expected-parameters:
+      source-dir: "force-app"
+    expected-tool-confidence: 90
+  - prompts:
+      - I am a Salesforce developer.
       - Deploy my project
+    expected-tool: sf-deploy-metadata
+    expected-tool-confidence: 85
 
 For available models, see:
 https://git.soma.salesforce.com/pages/tech-enablement/einstein/docs/gateway/models-and-providers/`;
@@ -110,7 +158,7 @@ https://git.soma.salesforce.com/pages/tech-enablement/einstein/docs/gateway/mode
     file: Flags.file({
       summary: 'The YAML file to use for the response',
       description: 'Must contain array of models and prompts',
-      default: 'test-assets/compare-responses.yml',
+      required: true,
       exists: true,
       char: 'f',
     }),
@@ -118,117 +166,248 @@ https://git.soma.salesforce.com/pages/tech-enablement/einstein/docs/gateway/mode
       description: 'Show help',
       char: 'h',
     }),
-    'confidence-level': Flags.integer({
-      summary: 'Confidence level for the tool',
-      description: 'If confidence level is below this value, command will fail',
-      min: 0,
-      max: 100,
-      default: 50,
-    }),
     runs: Flags.integer({
       summary: 'Number of runs to use for confidence level',
       description: 'If specified, will run the tool multiple times to determine confidence level',
       default: 5,
       char: 'r',
     }),
+    verbose: Flags.boolean({
+      summary: 'Enable verbose output',
+      description: 'If true, will print additional information about the test runs',
+      default: false,
+      char: 'v',
+    }),
   };
 
   public async run(): Promise<void> {
     const { flags } = await this.parse(ConfidenceTest);
 
-    const yamlObj = await readYamlFile<{
-      'expected-tool': string;
-      models: Model[];
-      prompts: Array<string | string[]>;
-    }>(flags.file);
-
-    if (!yamlObj.models?.length) {
-      throw new Error('At least one model is required');
+    const spec = Spec.safeParse(await readYamlFile<Spec>(flags.file));
+    if (!spec.success) {
+      this.error(`Invalid spec file: ${flags.file}\n${spec.error.message}`);
     }
 
-    if (!yamlObj.prompts?.length) {
-      throw new Error('At least one prompt is required');
-    }
+    stdout();
+    const mcpTools = await getToolsList();
+    stdout();
 
-    if (!yamlObj['expected-tool']) {
-      throw new Error('Expected tool is required in the YAML file');
-    }
+    // Generate unique keys for each prompt to track runs
+    // This allows us to group runs by prompt and display results clearly
+    const testIndex = new Map<
+      string,
+      {
+        readable: string;
+        prompts: string[];
+        expectedTool: string;
+        expectedParameters?: Record<string, string>;
+        expectedToolConfidence: number;
+        expectedParameterConfidence: number;
+      }
+    >();
 
-    stdout('Expected Tool:');
-    stdout(`  - ${yamlObj['expected-tool']}`);
+    const runPromises = spec.data.tests.flatMap((test) => {
+      const promptKey = Math.random().toString(36).substring(2, 15);
+      const readablePrompt = `${colorize('yellowBright', 'Prompt')}:\n  - ${castToArray(test.prompts).join('\n  - ')}`;
+      testIndex.set(promptKey, {
+        readable: readablePrompt,
+        prompts: castToArray(test.prompts),
+        expectedTool: test['expected-tool'],
+        expectedParameters: test['expected-parameters'],
+        expectedToolConfidence: test['expected-tool-confidence'],
+        expectedParameterConfidence: test['expected-parameter-confidence'] ?? test['expected-tool-confidence'],
+      });
+      return Array.from({ length: flags.runs }, (_, idx) =>
+        compareModelOutputs(test.prompts, spec.data, mcpTools).then(({ invocations, tableData }) => ({
+          idx,
+          promptKey,
+          invocations,
+          tableData,
+        }))
+      );
+    });
 
-    stdout('Models:');
-    yamlObj.models.forEach((model) => stdout(`  - ${model}`));
+    const results = groupBy(await Promise.all(runPromises), (r) => r.promptKey);
 
-    stdout();
-    stdout('Prompts:');
-    yamlObj.prompts.forEach((prompt) => {
-      if (Array.isArray(prompt)) {
-        stdout(`  - - ${prompt.join('\n    - ')}`);
-      } else {
-        stdout(`  - ${prompt}`);
+    if (flags.verbose) {
+      for (const [promptKey, runs] of Object.entries(results)) {
+        stdout(testIndex.get(promptKey)?.readable ?? 'Unknown Prompt');
+        for (const run of runs) {
+          printTable({
+            title: `Run #${run.idx + 1}`,
+            data: run.tableData,
+            columns: [
+              { key: 'model', width: '30%' },
+              { key: 'chat', width: '40%' },
+              { key: 'tools', width: '30%', name: 'Tool Invocations' },
+            ],
+            width: process.stdout.columns,
+            ...TABLE_STYLE,
+          });
+        }
       }
-    });
+    }
 
     stdout();
-    const tools = await getToolsList({ verbose: true });
+    stdout(colorize('bold', 'SUMMARY'));
+    stdout(`Total Runs: ${Object.values(results).flatMap((m) => Object.values(m)).length}`);
     stdout();
 
-    const runLog: Record<string, Record<string, string[][]>> = {};
-
-    // eslint-disable-next-line @typescript-eslint/no-unused-vars
-    for (const _ of Array.from({ length: flags.runs })) {
-      for (const prompt of yamlObj.prompts) {
-        // eslint-disable-next-line no-await-in-loop
-        const invokedTools = await compareModelOutputs(prompt, yamlObj.models, tools);
-        const promptKey = Array.isArray(prompt) ? prompt.join(' ') : prompt;
-        runLog[promptKey] = runLog[promptKey] || {};
-        Object.entries(invokedTools).forEach(([model, iTools]) => {
-          runLog[promptKey][model] = runLog[promptKey][model] || [];
-          runLog[promptKey][model].push(iTools);
-        });
+    // Initialize all prompt keys as passing
+    const passFailMap = new Map<string, { tools: boolean; parameters: boolean }>(
+      Object.keys(results).map((key) => [key, { tools: true, parameters: true }])
+    );
+
+    for (const [promptKey, testResults] of Object.entries(results)) {
+      const testSpec = testIndex.get(promptKey);
+      if (!testSpec) {
+        stdout(colorize('red', `No test spec found for prompt key: ${promptKey}`));
+        continue;
       }
-    }
 
-    stdout();
-    let pass = true;
-    for (const [prompt, models] of Object.entries(runLog)) {
-      const tableData = Object.entries(models).map(([model, runs]) => {
-        const expectedToolCount = runs.flat().filter((tool) => tool === yamlObj['expected-tool']).length;
+      stdout(testSpec.readable);
+
+      const runsByModel = groupBy(
+        testResults
+          .sort((a, b) => a.idx - b.idx)
+          .flatMap((result) =>
+            Object.entries(result.invocations).map(([model, invocations]) => ({
+              model,
+              tools: invocations.map((inv) => inv.tool),
+              parameters: invocations.map((inv) => inv.parameters),
+            }))
+          ),
+        (r) => r.model
+      );
+
+      const toolTableData = Object.entries(runsByModel).map(([model, runs]) => {
+        const actualToolCount = runs.filter(({ tools }) => tools.includes(testSpec.expectedTool)).length;
         const totalRuns = runs.length;
-        const confidenceLevel = Math.round((expectedToolCount / totalRuns) * 100);
+        const confidence = Math.round((actualToolCount / totalRuns) * 100);
 
-        if (confidenceLevel < flags['confidence-level']) {
-          pass = false;
+        if (confidence < testSpec.expectedToolConfidence) {
+          passFailMap.set(promptKey, {
+            ...(passFailMap.get(promptKey) ?? { tools: true, parameters: true }),
+            tools: false,
+          });
         }
 
         return {
           model,
-          expectedTool: yamlObj['expected-tool'],
-          invocations: `${expectedToolCount}/${totalRuns}`,
-          actualInvocations: runs.map((r) => r.join(', ')).join('\n'),
-          confidence: `${confidenceLevel}%`,
-          status: confidenceLevel >= flags['confidence-level'] ? colorize('green', 'PASS') : colorize('red', 'FAIL'),
+          expectedTool: testSpec.expectedTool,
+          actualTools: runs.map((r, idx) => `Run ${idx + 1}: ${r.tools.join(', ')}`).join('\n'),
+          count: `${actualToolCount}/${totalRuns}`,
+          actualConfidence: `${confidence}%`,
+          expectedConfidence: `${testSpec.expectedToolConfidence}%`,
+          status: confidence >= testSpec.expectedToolConfidence ? colorize('green', 'PASS') : colorize('red', 'FAIL'),
         };
       });
 
       printTable({
-        title: `Results for prompt:\n${colorize('yellowBright', prompt)}`,
-        data: tableData,
+        title: 'Tool Invocations',
+        data: toolTableData,
         columns: [
-          { key: 'model', name: 'Model' },
-          { key: 'expectedTool', name: 'Expected Tool Invocation' },
-          { key: 'actualInvocations', name: 'Actual Tool Invocations' },
-          { key: 'invocations', name: 'Invocation Count' },
-          { key: 'confidence', name: 'Confidence' },
-          { key: 'status', name: 'Status' },
+          { key: 'model', name: 'Model', width: '30%' },
+          { key: 'expectedTool', name: 'Expected Tool Invocation', width: '15%' },
+          { key: 'actualTools', name: 'Actual Tool Invocations', width: '25%' },
+          { key: 'count', name: 'Count', width: '7%' },
+          { key: 'expectedConfidence', name: 'Expected Confidence', width: '8%' },
+          { key: 'actualConfidence', name: 'Actual Confidence', width: '8%' },
+          { key: 'status', name: 'Status', width: '7%' },
         ],
         ...TABLE_STYLE,
+        width: process.stdout.columns,
       });
+
+      if (testSpec.expectedParameters) {
+        const paramTableData = Object.entries(runsByModel).map(([model, runs]) => {
+          const runsThatMatchParameters = runs.filter((run) =>
+            Object.entries(testSpec.expectedParameters ?? {}).every(([key, value]) =>
+              run.parameters.some((param) => param[key] && new RegExp(value).test(param[key]))
+            )
+          ).length;
+
+          const totalRuns = runs.length;
+          const confidence = Math.round((runsThatMatchParameters / totalRuns) * 100);
+
+          if (confidence < testSpec.expectedParameterConfidence) {
+            passFailMap.set(promptKey, {
+              ...(passFailMap.get(promptKey) ?? { tools: true, parameters: true }),
+              parameters: false,
+            });
+          }
+
+          const makeReadableParameters = (params: Array<Record<string, string>>): string =>
+            params
+              .map((param) =>
+                Object.entries(param)
+                  .map(([key, value]) => `  - ${key}: ${value}`)
+                  .join('\n')
+              )
+              .join('\n');
+
+          return {
+            model,
+            count: `${runsThatMatchParameters}/${totalRuns}`,
+            expectedParameters: makeReadableParameters([testSpec.expectedParameters ?? {}]),
+            actualParameters: runs
+              .map((r, idx) => `Run ${idx + 1}:\n${makeReadableParameters(r.parameters)}`)
+              .join('\n'),
+            actualConfidence: `${confidence}%`,
+            expectedConfidence: `${testSpec.expectedParameterConfidence}%`,
+            status:
+              confidence >= testSpec.expectedParameterConfidence ? colorize('green', 'PASS') : colorize('red', 'FAIL'),
+          };
+        });
+
+        printTable({
+          title: 'Parameter Matching',
+          data: paramTableData,
+          columns: [
+            { key: 'model', name: 'Model', width: '30%' },
+            { key: 'expectedParameters', name: 'Expected Parameters', width: '15%' },
+            { key: 'actualParameters', name: 'Actual Parameters', width: '25%' },
+            { key: 'count', name: 'Count', width: '7%' },
+            { key: 'expectedConfidence', name: 'Expected Confidence', width: '8%' },
+            { key: 'actualConfidence', name: 'Actual Confidence', width: '8%' },
+            { key: 'status', name: 'Status', width: '7%' },
+          ],
+          ...TABLE_STYLE,
+          width: process.stdout.columns,
+        });
+      }
+    }
+
+    const failingToolTests = Array.from(passFailMap.entries())
+      .filter(([, result]) => !result.tools)
+      .map(([key]) => testIndex.get(key))
+      .filter((test) => test !== undefined);
+
+    const failingParameterTests = Array.from(passFailMap.entries())
+      .filter(([, result]) => !result.parameters)
+      .map(([key]) => testIndex.get(key))
+      .filter((test) => test !== undefined);
+
+    if (failingToolTests.length > 0) {
+      stdout();
+      stdout(colorize('red', 'Failed Tool Invocations'));
+      stdout('The following prompts did not meet the tool invocation confidence level:');
+      failingToolTests.forEach((test) => stdout(test?.readable ?? 'Unknown Prompt'));
+      stdout();
+    }
+
+    if (failingParameterTests.length > 0) {
+      stdout();
+      stdout(colorize('red', 'Failed Parameter Matching'));
+      stdout('The following prompts did not meet the parameter matching confidence level:');
+      failingParameterTests.forEach((test) => stdout(test?.readable ?? 'Unknown Prompt'));
+      stdout();
     }
 
-    if (!pass) {
-      throw new Error('Confidence level not met');
+    if (failingToolTests.length === 0 && failingParameterTests.length === 0) {
+      stdout(colorize('green', 'All tests passed!'));
+    } else {
+      this.exit(1);
     }
   }
 }
diff --git a/scripts/utils/gateway.ts b/scripts/utils/gateway.ts
index d3cfe1d1..b0d857dd 100644
--- a/scripts/utils/gateway.ts
+++ b/scripts/utils/gateway.ts
@@ -84,6 +84,10 @@ const makeSingleGatewayRequest = async (
   );
 
   if (!response.ok) {
+    // eslint-disable-next-line no-console
+    console.error(`Error making request to LLM Gateway API: ${response.status} ${response.statusText}`);
+    // eslint-disable-next-line no-console
+    console.error('Response body:', JSON.stringify(await response.json(), null, 2));
     throw new Error(`HTTP ${response.status}: ${response.statusText}`);
   }
 
@@ -109,11 +113,21 @@ const makeSingleGatewayRequest = async (
 export const makeGatewayRequests = async (
   prompts: string[],
   model: Model,
-  tools: InvocableTool[]
+  tools: InvocableTool[],
+  initialContext?: string[]
 ): Promise<{ model: Model; messages: Array<{ role: string; content: string }>; responses: GatewayResponse[] }> => {
   const messages: Array<{ role: string; content: string }> = [];
   const responses: GatewayResponse[] = [];
 
+  if (initialContext) {
+    await makeSingleGatewayRequest(model, tools, [
+      {
+        role: 'user',
+        content: initialContext?.join('\n'),
+      },
+    ]);
+  }
+
   for (const prompt of prompts) {
     // Add the current prompt to messages
     messages.push({
diff --git a/scripts/utils/tools.ts b/scripts/utils/tools.ts
index aa653234..7e183b79 100644
--- a/scripts/utils/tools.ts
+++ b/scripts/utils/tools.ts
@@ -33,7 +33,7 @@ export type InvocableTool = {
   };
 };
 
-export const getToolsList = async ({ verbose }: { verbose: boolean }): Promise<InvocableTool[]> => {
+export const getToolsList = async (): Promise<InvocableTool[]> => {
   const toolsList: string = await new Promise<string>((resolve, reject) => {
     const child = spawn('npx', [
       '@modelcontextprotocol/inspector',
@@ -67,35 +67,33 @@ export const getToolsList = async ({ verbose }: { verbose: boolean }): Promise<I
 
   const parsedToolsList = JSON.parse(toolsList) as { tools: Tool[] };
 
-  if (verbose) {
-    const toolsWithTokens = parsedToolsList.tools?.map((tool) => ({
-      tool: tool.name,
-      tokensGPT4oMini: encodeGPT4oMini(JSON.stringify(tool)).length,
-      tokensO3Mini: encodeO3Mini(JSON.stringify(tool)).length,
-      tokensGPT4: encodeGPT4(JSON.stringify(tool)).length,
-    }));
-    toolsWithTokens.push({
-      tool: colorize('bold', 'TOTAL'),
-      tokensGPT4oMini: toolsWithTokens.reduce((acc, tool) => acc + tool.tokensGPT4oMini, 0),
-      tokensO3Mini: toolsWithTokens.reduce((acc, tool) => acc + tool.tokensO3Mini, 0),
-      tokensGPT4: toolsWithTokens.reduce((acc, tool) => acc + tool.tokensGPT4, 0),
-    });
+  const toolsWithTokens = parsedToolsList.tools?.map((tool) => ({
+    tool: tool.name,
+    tokensGPT4oMini: encodeGPT4oMini(JSON.stringify(tool)).length,
+    tokensO3Mini: encodeO3Mini(JSON.stringify(tool)).length,
+    tokensGPT4: encodeGPT4(JSON.stringify(tool)).length,
+  }));
+  toolsWithTokens.push({
+    tool: colorize('bold', 'TOTAL'),
+    tokensGPT4oMini: toolsWithTokens.reduce((acc, tool) => acc + tool.tokensGPT4oMini, 0),
+    tokensO3Mini: toolsWithTokens.reduce((acc, tool) => acc + tool.tokensO3Mini, 0),
+    tokensGPT4: toolsWithTokens.reduce((acc, tool) => acc + tool.tokensGPT4, 0),
+  });
 
-    printTable({
-      title: 'Tools List',
-      data: toolsWithTokens,
-      columns: [
-        'tool',
-        { key: 'tokensGPT4oMini', name: 'GPT 4o Mini' },
-        { key: 'tokensO3Mini', name: 'O3 Mini' },
-        { key: 'tokensGPT4', name: 'GPT 4' },
-      ],
-      titleOptions: {
-        color: 'yellowBright',
-      },
-      ...TABLE_STYLE,
-    });
-  }
+  printTable({
+    title: 'Tools List',
+    data: toolsWithTokens,
+    columns: [
+      'tool',
+      { key: 'tokensGPT4oMini', name: 'GPT 4o Mini' },
+      { key: 'tokensO3Mini', name: 'O3 Mini' },
+      { key: 'tokensGPT4', name: 'GPT 4' },
+    ],
+    titleOptions: {
+      color: 'yellowBright',
+    },
+    ...TABLE_STYLE,
+  });
 
   return (parsedToolsList.tools ?? []).map((tool) => ({
     name: tool.name,
diff --git a/test-assets/compare-responses.yml b/test-assets/compare-responses.yml
deleted file mode 100644
index 50c768f6..00000000
--- a/test-assets/compare-responses.yml
+++ /dev/null
@@ -1,34 +0,0 @@
-models:
-  # - llmgateway__OpenAIGPT35Turbo_01_25
-  - llmgateway__OpenAIGPT4OmniMini
-  - llmgateway__OpenAIGPT41Nano
-  - llmgateway__BedrockAnthropicClaude4Sonnet
-
-expected-tool: sf-deploy-metadata
-
-prompts:
-  # Sparse prompts
-  # - Deploy my changes to the dreamhouse org
-  # - Deploy the manifest in dreamhouse-lwc/force-app/main/default/package.xml to dreamhouse
-  # - Deploy all metadata in dreamhouse-lwc/force-app/main/default to dreamhouse
-
-  # # Context-rich prompts
-  - - My current working directory is /Users/mdonnalley/repos/trailheadapps/dreamhouse-lwc. Please deploy the source in force-app/main/default/classes/Broker.cls to my dreamhouse org.
-
-  # - - I am working on the dreamhouse-lwc project. My org alias is dreamhouse.
-  #   - Deploy only the Lightning Web Components in force-app/main/default/lwc to dreamhouse.
-
-  # - - I want to deploy only the Property object and its fields from dreamhouse-lwc/force-app/main/default/objects/Property__c to dreamhouse.
-  #   - Run all local tests during deployment.
-
-  # - - My org alias is dreamhouse and my project is dreamhouse-lwc.
-  #   - Deploy the changes in force-app/main/default/permissionsets to dreamhouse and run the BrokerTest Apex test.
-
-  # - - I am a Salesforce developer. My org is dreamhouse.
-  #   - Deploy the manifest file dreamhouse-lwc/manifest/package.xml to my org and run all tests.
-
-  # - - I am in dreamhouse-lwc. My org alias is dreamhouse.
-  #   - Deploy only the layouts in force-app/main/default/layouts to dreamhouse.
-
-  # - - My org alias is dreamhouse.
-  #   - Deploy the source in force-app/main/default/objects/Contact__c and run no tests.
diff --git a/test-assets/sf-deploy-metadata.yml b/test-assets/sf-deploy-metadata.yml
new file mode 100644
index 00000000..63109b7e
--- /dev/null
+++ b/test-assets/sf-deploy-metadata.yml
@@ -0,0 +1,27 @@
+models:
+  # - llmgateway__OpenAIGPT35Turbo_01_25
+  - llmgateway__OpenAIGPT4OmniMini
+  - llmgateway__OpenAIGPT41Nano
+  - llmgateway__BedrockAnthropicClaude4Sonnet
+
+initial-context:
+  - 'The user has the following folders open: /Users/sf-dev/dreamhouse-lwc.'
+
+tests:
+  - prompts:
+      - I am working on the dreamhouse-lwc project. My org alias is dreamhouse. Deploy only the Lightning Web Components in force-app/main/default/lwc to dreamhouse.
+    expected-tool: sf-deploy-metadata
+    expected-parameters:
+      sourceDir: force-app/main/default/lwc
+      directory: ^/.*dreamhouse-lwc$
+      usernameOrAlias: dreamhouse
+    expected-tool-confidence: 100
+    expected-parameter-confidence: 100
+  - prompts:
+      - Deploy my changes to the dreamhouse org
+    expected-tool: sf-deploy-metadata
+    expected-parameters:
+      directory: ^/.*dreamhouse-lwc$
+      usernameOrAlias: dreamhouse
+    expected-tool-confidence: 100
+    expected-parameter-confidence: 100
diff --git a/yarn.lock b/yarn.lock
index 3d0c07d3..68279302 100644
--- a/yarn.lock
+++ b/yarn.lock
@@ -1547,10 +1547,10 @@
     lodash "^4.17.21"
     registry-auth-token "^5.1.0"
 
-"@oclif/table@^0.4.8":
-  version "0.4.8"
-  resolved "https://registry.yarnpkg.com/@oclif/table/-/table-0.4.8.tgz#38c38fc771ccc3754d2fe37f7279ce65c3d9ab8b"
-  integrity sha512-HgyeNTyUF67OQ2eOCFia0mfxyPFcPwa8sIq1SiiZf8oxw6JtUciWGXb0cmmo5vnbxRJ3er0PHLwMV0/hBG6NWw==
+"@oclif/table@^0.4.9":
+  version "0.4.9"
+  resolved "https://registry.yarnpkg.com/@oclif/table/-/table-0.4.9.tgz#bf1057e523d948aad8578d4bb721009589bed1b4"
+  integrity sha512-j6M16G2qXhQCZ3e6TffYmJgBdl0sha0/P1X8xpZpaXMvNHE7nWGGvScUACwvMn64XoSLHzLC9yEcaI5IpH0kYg==
   dependencies:
     "@types/react" "^18.3.12"
     change-case "^5.4.4"

From bf0a38ec3e0cb846c926f482f26e86780981317d Mon Sep 17 00:00:00 2001
From: Mike Donnalley <mdonnalley@salesforce.com>
Date: Tue, 22 Jul 2025 10:54:00 -0600
Subject: [PATCH 32/51] chore: clean up

---
 scripts/confidence-test.ts             | 127 ++++++++++++++-----------
 scripts/utils/gateway.ts               |  17 +---
 test-assets/sf-deploy-metadata.yml     |  27 ------
 test/confidence/sf-deploy-metadata.yml |  40 ++++++++
 4 files changed, 116 insertions(+), 95 deletions(-)
 delete mode 100644 test-assets/sf-deploy-metadata.yml
 create mode 100644 test/confidence/sf-deploy-metadata.yml

diff --git a/scripts/confidence-test.ts b/scripts/confidence-test.ts
index ad374313..a1b7b803 100644
--- a/scripts/confidence-test.ts
+++ b/scripts/confidence-test.ts
@@ -30,11 +30,12 @@ const Spec = z.object({
   'initial-context': z.array(z.string()).optional(),
   tests: z.array(
     z.object({
-      prompts: z.union([z.string(), z.array(z.string())]),
+      utterances: z.union([z.string(), z.array(z.string())]),
       'expected-tool': z.string(),
       'expected-parameters': z.record(z.string(), z.string()).optional(),
       'expected-tool-confidence': z.number(),
       'expected-parameter-confidence': z.number().optional(),
+      'allowed-tools': z.array(z.string()).optional(),
     })
   ),
 });
@@ -53,8 +54,14 @@ const groupBy = <T, K extends string | number | symbol>(array: T[], key: (item:
     // eslint-disable-next-line
   }, {} as Record<K, T[]>);
 
+const makeReadableParameters = (param: Record<string, string>): string =>
+  Object.entries(param)
+    .sort(([a], [b]) => a.localeCompare(b))
+    .map(([key, value]) => `  - ${key}: ${value}`)
+    .join('\n');
+
 async function compareModelOutputs(
-  prompt: string | string[],
+  utterances: string | string[],
   spec: Spec,
   tools: InvocableTool[]
 ): Promise<{
@@ -62,9 +69,8 @@ async function compareModelOutputs(
   invocations: Record<string, Array<{ tool: string; parameters: Record<string, string> }>>;
 }> {
   const models = spec.models;
-  const prompts = castToArray(prompt);
   const responses = await Promise.all(
-    models.map((model) => makeGatewayRequests(prompts, model, tools, spec['initial-context']))
+    models.map((model) => makeGatewayRequests(castToArray(utterances), model, tools, spec['initial-context']))
   );
 
   const invocations = responses.reduce<Record<string, Array<{ tool: string; parameters: Record<string, string> }>>>(
@@ -96,15 +102,13 @@ async function compareModelOutputs(
       .map((r, index) => {
         const toolInvocation = r.generation_details?.generations[0].tool_invocations?.[0];
         if (!toolInvocation) {
-          return `Message ${index + 1}: No tool invoked`;
+          return `Generation ${index + 1}: No tool invoked`;
         }
 
         const toolArgs = JSON.parse(toolInvocation.function.arguments ?? '{}') as Record<string, string>;
-        const argsString = Object.entries(toolArgs)
-          .map(([key, value]) => `  - ${key}: ${value}`)
-          .join('\n');
+        const argsString = makeReadableParameters(toolArgs);
 
-        return `Message ${index + 1}: ${colorize('bold', toolInvocation.function.name)}${
+        return `Generation ${index + 1}: ${colorize('bold', toolInvocation.function.name)}${
           argsString ? `\n${argsString}` : ''
         }`;
       })
@@ -119,7 +123,7 @@ export default class ConfidenceTest extends Command {
   public static description = `Tests that the MCP server tools are accurately invoked by various LLM models.
 
 Configuration:
-- Uses a YAML file to specify models and test prompts
+- Uses a YAML file to specify models and test utterances
 - Requires SF_LLMG_API_KEY environment variable
 
 YAML File Format:
@@ -127,7 +131,7 @@ The YAML file should contain:
 - models: Array of model identifiers to test against
 - initial-context: Optional array of strings to set the initial context for the conversation
 - tests: Array of test objects with the following properties:
-  - prompts: String or array of strings for test prompts (supports multi-turn conversations)
+  - utterances: String or array of strings for test utterances (supports multi-turn conversations)
   - expected-tool: String identifying the expected tool to be invoked
   - expected-parameters: Optional object with expected parameter key-value pairs
   - expected-tool-confidence: Number representing the minimum confidence level (0-100)
@@ -137,15 +141,17 @@ models:
   - llmgateway__OpenAIGPT35Turbo_01_25
   - llmgateway__OpenAIGPT4OmniMini
 tests:
-  - prompts: "What's my salesforce username?"
+  - utterances: "What's my salesforce username?"
     expected-tool: sf-org-display
     expected-tool-confidence: 80
-  - prompts: ["I am a Salesforce developer", "Deploy my project"]
+  - utterances: ["I am a Salesforce developer", "Deploy my project"]
     expected-tool: sf-deploy-metadata
     expected-parameters:
       source-dir: "force-app"
     expected-tool-confidence: 90
-  - prompts:
+    allowed-tools:
+      - sf-list-all-orgs
+  - utterances:
       - I am a Salesforce developer.
       - Deploy my project
     expected-tool: sf-deploy-metadata
@@ -157,7 +163,7 @@ https://git.soma.salesforce.com/pages/tech-enablement/einstein/docs/gateway/mode
   public static flags = {
     file: Flags.file({
       summary: 'The YAML file to use for the response',
-      description: 'Must contain array of models and prompts',
+      description: 'Must contain array of models and test cases',
       required: true,
       exists: true,
       char: 'f',
@@ -192,46 +198,47 @@ https://git.soma.salesforce.com/pages/tech-enablement/einstein/docs/gateway/mode
     const mcpTools = await getToolsList();
     stdout();
 
-    // Generate unique keys for each prompt to track runs
-    // This allows us to group runs by prompt and display results clearly
+    // Generate unique keys for each utterance to track runs
+    // This allows us to group runs by utterance and display results clearly
     const testIndex = new Map<
       string,
       {
         readable: string;
-        prompts: string[];
+        utterances: string[];
         expectedTool: string;
         expectedParameters?: Record<string, string>;
         expectedToolConfidence: number;
         expectedParameterConfidence: number;
+        allowedTools: string[];
       }
     >();
 
     const runPromises = spec.data.tests.flatMap((test) => {
-      const promptKey = Math.random().toString(36).substring(2, 15);
-      const readablePrompt = `${colorize('yellowBright', 'Prompt')}:\n  - ${castToArray(test.prompts).join('\n  - ')}`;
-      testIndex.set(promptKey, {
-        readable: readablePrompt,
-        prompts: castToArray(test.prompts),
+      const utteranceKey = Math.random().toString(36).substring(2, 15);
+      testIndex.set(utteranceKey, {
+        readable: `${colorize('yellowBright', 'Utterance')}:\n  - ${castToArray(test.utterances).join('\n  - ')}`,
+        utterances: castToArray(test.utterances),
         expectedTool: test['expected-tool'],
         expectedParameters: test['expected-parameters'],
         expectedToolConfidence: test['expected-tool-confidence'],
         expectedParameterConfidence: test['expected-parameter-confidence'] ?? test['expected-tool-confidence'],
+        allowedTools: [test['expected-tool'], ...(test['allowed-tools'] ?? [])],
       });
       return Array.from({ length: flags.runs }, (_, idx) =>
-        compareModelOutputs(test.prompts, spec.data, mcpTools).then(({ invocations, tableData }) => ({
+        compareModelOutputs(test.utterances, spec.data, mcpTools).then(({ invocations, tableData }) => ({
           idx,
-          promptKey,
+          utteranceKey,
           invocations,
           tableData,
         }))
       );
     });
 
-    const results = groupBy(await Promise.all(runPromises), (r) => r.promptKey);
+    const results = groupBy(await Promise.all(runPromises), (r) => r.utteranceKey);
 
     if (flags.verbose) {
-      for (const [promptKey, runs] of Object.entries(results)) {
-        stdout(testIndex.get(promptKey)?.readable ?? 'Unknown Prompt');
+      for (const [utteranceKey, runs] of Object.entries(results)) {
+        stdout(testIndex.get(utteranceKey)?.readable ?? 'Unknown Test Case');
         for (const run of runs) {
           printTable({
             title: `Run #${run.idx + 1}`,
@@ -253,15 +260,15 @@ https://git.soma.salesforce.com/pages/tech-enablement/einstein/docs/gateway/mode
     stdout(`Total Runs: ${Object.values(results).flatMap((m) => Object.values(m)).length}`);
     stdout();
 
-    // Initialize all prompt keys as passing
+    // Initialize all utterance keys as passing
     const passFailMap = new Map<string, { tools: boolean; parameters: boolean }>(
       Object.keys(results).map((key) => [key, { tools: true, parameters: true }])
     );
 
-    for (const [promptKey, testResults] of Object.entries(results)) {
-      const testSpec = testIndex.get(promptKey);
+    for (const [utteranceKey, testResults] of Object.entries(results)) {
+      const testSpec = testIndex.get(utteranceKey);
       if (!testSpec) {
-        stdout(colorize('red', `No test spec found for prompt key: ${promptKey}`));
+        stdout(colorize('red', `No test spec found for utterance key: ${utteranceKey}`));
         continue;
       }
 
@@ -273,21 +280,26 @@ https://git.soma.salesforce.com/pages/tech-enablement/einstein/docs/gateway/mode
           .flatMap((result) =>
             Object.entries(result.invocations).map(([model, invocations]) => ({
               model,
-              tools: invocations.map((inv) => inv.tool),
-              parameters: invocations.map((inv) => inv.parameters),
+              invocations,
+              // tools: invocations.map((inv) => inv.tool),
+              // parameters: invocations.map((inv) => inv.parameters),
             }))
           ),
         (r) => r.model
       );
 
       const toolTableData = Object.entries(runsByModel).map(([model, runs]) => {
-        const actualToolCount = runs.filter(({ tools }) => tools.includes(testSpec.expectedTool)).length;
+        const actualToolCount = runs.filter(
+          ({ invocations }) =>
+            invocations.some((inv) => inv.tool === testSpec.expectedTool) &&
+            invocations.every((inv) => testSpec.allowedTools.includes(inv.tool))
+        ).length;
         const totalRuns = runs.length;
         const confidence = Math.round((actualToolCount / totalRuns) * 100);
 
         if (confidence < testSpec.expectedToolConfidence) {
-          passFailMap.set(promptKey, {
-            ...(passFailMap.get(promptKey) ?? { tools: true, parameters: true }),
+          passFailMap.set(utteranceKey, {
+            ...(passFailMap.get(utteranceKey) ?? { tools: true, parameters: true }),
             tools: false,
           });
         }
@@ -295,7 +307,9 @@ https://git.soma.salesforce.com/pages/tech-enablement/einstein/docs/gateway/mode
         return {
           model,
           expectedTool: testSpec.expectedTool,
-          actualTools: runs.map((r, idx) => `Run ${idx + 1}: ${r.tools.join(', ')}`).join('\n'),
+          actualTools: runs
+            .map((r, idx) => `Run ${idx + 1}: ${r.invocations.flatMap((inv) => inv.tool).join(', ')}`)
+            .join('\n'),
           count: `${actualToolCount}/${totalRuns}`,
           actualConfidence: `${confidence}%`,
           expectedConfidence: `${testSpec.expectedToolConfidence}%`,
@@ -323,7 +337,12 @@ https://git.soma.salesforce.com/pages/tech-enablement/einstein/docs/gateway/mode
         const paramTableData = Object.entries(runsByModel).map(([model, runs]) => {
           const runsThatMatchParameters = runs.filter((run) =>
             Object.entries(testSpec.expectedParameters ?? {}).every(([key, value]) =>
-              run.parameters.some((param) => param[key] && new RegExp(value).test(param[key]))
+              run.invocations.some(
+                (inv) =>
+                  inv.tool === testSpec.expectedTool &&
+                  inv.parameters[key] &&
+                  new RegExp(value).test(inv.parameters[key])
+              )
             )
           ).length;
 
@@ -331,27 +350,23 @@ https://git.soma.salesforce.com/pages/tech-enablement/einstein/docs/gateway/mode
           const confidence = Math.round((runsThatMatchParameters / totalRuns) * 100);
 
           if (confidence < testSpec.expectedParameterConfidence) {
-            passFailMap.set(promptKey, {
-              ...(passFailMap.get(promptKey) ?? { tools: true, parameters: true }),
+            passFailMap.set(utteranceKey, {
+              ...(passFailMap.get(utteranceKey) ?? { tools: true, parameters: true }),
               parameters: false,
             });
           }
 
-          const makeReadableParameters = (params: Array<Record<string, string>>): string =>
-            params
-              .map((param) =>
-                Object.entries(param)
-                  .map(([key, value]) => `  - ${key}: ${value}`)
-                  .join('\n')
-              )
-              .join('\n');
-
           return {
             model,
             count: `${runsThatMatchParameters}/${totalRuns}`,
-            expectedParameters: makeReadableParameters([testSpec.expectedParameters ?? {}]),
+            expectedParameters: makeReadableParameters(testSpec.expectedParameters ?? {}),
             actualParameters: runs
-              .map((r, idx) => `Run ${idx + 1}:\n${makeReadableParameters(r.parameters)}`)
+              .map(
+                (r, idx) =>
+                  `Run ${idx + 1}:\n${makeReadableParameters(
+                    r.invocations.find((inv) => inv.tool === testSpec.expectedTool)?.parameters ?? {}
+                  )}`
+              )
               .join('\n'),
             actualConfidence: `${confidence}%`,
             expectedConfidence: `${testSpec.expectedParameterConfidence}%`,
@@ -391,16 +406,16 @@ https://git.soma.salesforce.com/pages/tech-enablement/einstein/docs/gateway/mode
     if (failingToolTests.length > 0) {
       stdout();
       stdout(colorize('red', 'Failed Tool Invocations'));
-      stdout('The following prompts did not meet the tool invocation confidence level:');
-      failingToolTests.forEach((test) => stdout(test?.readable ?? 'Unknown Prompt'));
+      stdout('The following test cases did not meet the tool invocation confidence level:');
+      failingToolTests.forEach((test) => stdout(test?.readable ?? 'Unknown Test Case'));
       stdout();
     }
 
     if (failingParameterTests.length > 0) {
       stdout();
       stdout(colorize('red', 'Failed Parameter Matching'));
-      stdout('The following prompts did not meet the parameter matching confidence level:');
-      failingParameterTests.forEach((test) => stdout(test?.readable ?? 'Unknown Prompt'));
+      stdout('The following test cases did not meet the parameter matching confidence level:');
+      failingParameterTests.forEach((test) => stdout(test?.readable ?? 'Unknown Test Case'));
       stdout();
     }
 
diff --git a/scripts/utils/gateway.ts b/scripts/utils/gateway.ts
index b0d857dd..b11ed024 100644
--- a/scripts/utils/gateway.ts
+++ b/scripts/utils/gateway.ts
@@ -111,7 +111,7 @@ const makeSingleGatewayRequest = async (
  * @see {@link https://git.soma.salesforce.com/pages/tech-enablement/einstein/docs/gateway/get-started/auth/#api-key-limitations} API Key Limitations Documentation
  */
 export const makeGatewayRequests = async (
-  prompts: string[],
+  utterances: string[],
   model: Model,
   tools: InvocableTool[],
   initialContext?: string[]
@@ -119,20 +119,13 @@ export const makeGatewayRequests = async (
   const messages: Array<{ role: string; content: string }> = [];
   const responses: GatewayResponse[] = [];
 
-  if (initialContext) {
-    await makeSingleGatewayRequest(model, tools, [
-      {
-        role: 'user',
-        content: initialContext?.join('\n'),
-      },
-    ]);
-  }
+  const allUtterances = initialContext ? [...initialContext, ...utterances] : utterances;
 
-  for (const prompt of prompts) {
-    // Add the current prompt to messages
+  for (const utterance of allUtterances) {
+    // Add the current utterance to messages
     messages.push({
       role: 'user',
-      content: prompt,
+      content: utterance,
     });
 
     // eslint-disable-next-line no-await-in-loop
diff --git a/test-assets/sf-deploy-metadata.yml b/test-assets/sf-deploy-metadata.yml
deleted file mode 100644
index 63109b7e..00000000
--- a/test-assets/sf-deploy-metadata.yml
+++ /dev/null
@@ -1,27 +0,0 @@
-models:
-  # - llmgateway__OpenAIGPT35Turbo_01_25
-  - llmgateway__OpenAIGPT4OmniMini
-  - llmgateway__OpenAIGPT41Nano
-  - llmgateway__BedrockAnthropicClaude4Sonnet
-
-initial-context:
-  - 'The user has the following folders open: /Users/sf-dev/dreamhouse-lwc.'
-
-tests:
-  - prompts:
-      - I am working on the dreamhouse-lwc project. My org alias is dreamhouse. Deploy only the Lightning Web Components in force-app/main/default/lwc to dreamhouse.
-    expected-tool: sf-deploy-metadata
-    expected-parameters:
-      sourceDir: force-app/main/default/lwc
-      directory: ^/.*dreamhouse-lwc$
-      usernameOrAlias: dreamhouse
-    expected-tool-confidence: 100
-    expected-parameter-confidence: 100
-  - prompts:
-      - Deploy my changes to the dreamhouse org
-    expected-tool: sf-deploy-metadata
-    expected-parameters:
-      directory: ^/.*dreamhouse-lwc$
-      usernameOrAlias: dreamhouse
-    expected-tool-confidence: 100
-    expected-parameter-confidence: 100
diff --git a/test/confidence/sf-deploy-metadata.yml b/test/confidence/sf-deploy-metadata.yml
new file mode 100644
index 00000000..1c0bc67e
--- /dev/null
+++ b/test/confidence/sf-deploy-metadata.yml
@@ -0,0 +1,40 @@
+models:
+  # - llmgateway__OpenAIGPT35Turbo_01_25
+  # - llmgateway__OpenAIGPT4OmniMini
+  - llmgateway__OpenAIGPT41Nano
+  # - llmgateway__BedrockAnthropicClaude4Sonnet
+
+initial-context:
+  - 'My current OS is macos. I am working in a workspace with the following folders: /Users/sf-dev/dreamhouse-lwc'
+
+tests:
+  - utterances:
+      - My org alias is dreamhouse. Deploy the Lightning Web Components in force-app/main/default/lwc to the dreamhouse org.
+    expected-tool: sf-deploy-metadata
+    expected-parameters:
+      sourceDir: force-app/main/default/lwc
+      directory: /Users/sf-dev/dreamhouse-lwc
+      usernameOrAlias: dreamhouse
+    expected-tool-confidence: 100
+    expected-parameter-confidence: 100
+    allowed-tools:
+      - sf-list-all-orgs
+  # - utterances:
+  #     - Deploy my changes to the dreamhouse org
+  #   expected-tool: sf-deploy-metadata
+  #   expected-parameters:
+  #     directory: ^/.*dreamhouse-lwc$
+  #     usernameOrAlias: dreamhouse
+  #   expected-tool-confidence: 100
+  #   expected-parameter-confidence: 100
+  # - utterances:
+  #     - Hello. Who are you and what can you do?
+  #     - I am a salesforce developer working on the dreamhouse-lwc project. My org alias is dreamhouse.
+  #     - I want to deploy only the Lightning Web Components in force-app/main/default/lwc to dreamhouse.
+  #   expected-tool: sf-deploy-metadata
+  #   expected-parameters:
+  #     sourceDir: force-app/main/default/lwc
+  #     directory: ^/.*dreamhouse-lwc$
+  #     usernameOrAlias: dreamhouse
+  #   expected-tool-confidence: 100
+  #   expected-parameter-confidence: 100

From 540f0456c946ac32ceea50a9b49e6f04db764bd8 Mon Sep 17 00:00:00 2001
From: Mike Donnalley <mdonnalley@salesforce.com>
Date: Tue, 22 Jul 2025 11:11:12 -0600
Subject: [PATCH 33/51] chore: clean up

---
 scripts/confidence-test.ts | 223 +++++++++++++++++++++----------------
 scripts/utils/tools.ts     |  51 ++++-----
 2 files changed, 144 insertions(+), 130 deletions(-)

diff --git a/scripts/confidence-test.ts b/scripts/confidence-test.ts
index a1b7b803..d82658ec 100644
--- a/scripts/confidence-test.ts
+++ b/scripts/confidence-test.ts
@@ -42,6 +42,16 @@ const Spec = z.object({
 
 type Spec = z.infer<typeof Spec>;
 
+type TestCase = {
+  readable: string;
+  utterances: string[];
+  expectedTool: string;
+  expectedParameters?: Record<string, string>;
+  expectedToolConfidence: number;
+  expectedParameterConfidence: number;
+  allowedTools: string[];
+};
+
 const castToArray = <T>(value: T | T[]): T[] => (Array.isArray(value) ? value : [value]);
 
 const groupBy = <T, K extends string | number | symbol>(array: T[], key: (item: T) => K): Record<K, T[]> =>
@@ -60,6 +70,76 @@ const makeReadableParameters = (param: Record<string, string>): string =>
     .map(([key, value]) => `  - ${key}: ${value}`)
     .join('\n');
 
+/**
+ * Counts the runs in which every expected parameter was matched by an invocation of the expected tool.
+ *
+ * Each expected value is compiled as a regular expression and tested against the string form of the
+ * actual parameter. A parameter counts as present unless it is `undefined` or `null`, so falsy values
+ * such as `''`, `0` and `false` are still tested against the pattern instead of being treated as missing.
+ *
+ * @param testSpec The test case supplying `expectedTool` and the optional `expectedParameters`.
+ * @param runs The runs to evaluate, each carrying the invocations a model produced.
+ * @returns The number of runs that satisfy all expected parameters (all runs when none are expected).
+ */
+const countRunsThatPassParameterMatching = (
+  testSpec: TestCase,
+  runs: Array<{ model: string; invocations: Array<{ tool: string; parameters: Record<string, string> }> }>
+): number => {
+  // Compile each pattern once per call instead of once per invocation.
+  const expected = Object.entries(testSpec.expectedParameters ?? {}).map(
+    ([key, value]) => [key, new RegExp(value)] as const
+  );
+
+  return runs.filter((run) =>
+    expected.every(([key, pattern]) =>
+      run.invocations.some((inv) => {
+        if (inv.tool !== testSpec.expectedTool) {
+          return false;
+        }
+        // Widen to unknown: the declared string type is a cast over parsed JSON, so non-string values can occur.
+        const actual: unknown = inv.parameters[key];
+        return actual !== undefined && actual !== null && pattern.test(String(actual));
+      })
+    )
+  ).length;
+};
+
+/**
+ * Counts the runs that invoked the expected tool at least once and invoked nothing outside `allowedTools`.
+ *
+ * @param testSpec The test case supplying `expectedTool` and `allowedTools`.
+ * @param runs The runs to evaluate, each carrying the invocations a model produced.
+ * @returns The number of passing runs.
+ */
+const countRunsThatPassToolMatching = (
+  testSpec: TestCase,
+  runs: Array<{ model: string; invocations: Array<{ tool: string; parameters: Record<string, string> }> }>
+): number =>
+  runs.filter(
+    ({ invocations }) =>
+      invocations.some((inv) => inv.tool === testSpec.expectedTool) &&
+      invocations.every((inv) => testSpec.allowedTools.includes(inv.tool))
+  ).length;
+
+/**
+ * Looks up the test cases whose `type` check is marked as failed in `passFailMap`.
+ *
+ * @param passFailMap Pass/fail flags per test key.
+ * @param testIndex Test cases indexed by the same keys.
+ * @param type Which check to inspect.
+ * @returns The failing test cases; keys with no entry in `testIndex` are dropped.
+ */
+const filterFailingTests = (
+  passFailMap: Map<string, { tools: boolean; parameters: boolean }>,
+  testIndex: Map<string, TestCase>,
+  type: 'tools' | 'parameters'
+): TestCase[] =>
+  Array.from(passFailMap.entries())
+    .filter(([, result]) => !result[type])
+    .map(([key]) => testIndex.get(key))
+    // Explicit type guard so the result narrows to TestCase[] without relying on inferred predicates (TS 5.5+).
+    .filter((test): test is TestCase => test !== undefined);
+
 async function compareModelOutputs(
   utterances: string | string[],
   spec: Spec,
@@ -123,7 +166,7 @@ export default class ConfidenceTest extends Command {
   public static description = `Tests that the MCP server tools are accurately invoked by various LLM models.
 
 Configuration:
-- Uses a YAML file to specify models and test utterances
+- Uses a YAML file to specify models and test cases
 - Requires SF_LLMG_API_KEY environment variable
 
 YAML File Format:
@@ -135,6 +178,8 @@ The YAML file should contain:
   - expected-tool: String identifying the expected tool to be invoked
   - expected-parameters: Optional object with expected parameter key-value pairs
   - expected-tool-confidence: Number representing the minimum confidence level (0-100)
+  - expected-parameter-confidence: Optional number for parameter confidence (defaults to expected-tool-confidence)
+  - allowed-tools: Optional array of tool names that are acceptable in addition to the expected tool
 
 Example YAML structure:
 models:
@@ -194,24 +239,29 @@ https://git.soma.salesforce.com/pages/tech-enablement/einstein/docs/gateway/mode
       this.error(`Invalid spec file: ${flags.file}\n${spec.error.message}`);
     }
 
-    stdout();
-    const mcpTools = await getToolsList();
-    stdout();
+    const { tools: mcpTools, tokens } = await getToolsList();
+    if (flags.verbose) {
+      stdout();
+      printTable({
+        title: 'Tools List',
+        data: tokens,
+        columns: [
+          'tool',
+          { key: 'tokensGPT4oMini', name: 'GPT 4o Mini' },
+          { key: 'tokensO3Mini', name: 'O3 Mini' },
+          { key: 'tokensGPT4', name: 'GPT 4' },
+        ],
+        titleOptions: {
+          color: 'yellowBright',
+        },
+        ...TABLE_STYLE,
+      });
+      stdout();
+    }
 
     // Generate unique keys for each utterance to track runs
     // This allows us to group runs by utterance and display results clearly
-    const testIndex = new Map<
-      string,
-      {
-        readable: string;
-        utterances: string[];
-        expectedTool: string;
-        expectedParameters?: Record<string, string>;
-        expectedToolConfidence: number;
-        expectedParameterConfidence: number;
-        allowedTools: string[];
-      }
-    >();
+    const testIndex = new Map<string, TestCase>();
 
     const runPromises = spec.data.tests.flatMap((test) => {
       const utteranceKey = Math.random().toString(36).substring(2, 15);
@@ -281,45 +331,37 @@ https://git.soma.salesforce.com/pages/tech-enablement/einstein/docs/gateway/mode
             Object.entries(result.invocations).map(([model, invocations]) => ({
               model,
               invocations,
-              // tools: invocations.map((inv) => inv.tool),
-              // parameters: invocations.map((inv) => inv.parameters),
             }))
           ),
         (r) => r.model
       );
 
-      const toolTableData = Object.entries(runsByModel).map(([model, runs]) => {
-        const actualToolCount = runs.filter(
-          ({ invocations }) =>
-            invocations.some((inv) => inv.tool === testSpec.expectedTool) &&
-            invocations.every((inv) => testSpec.allowedTools.includes(inv.tool))
-        ).length;
-        const totalRuns = runs.length;
-        const confidence = Math.round((actualToolCount / totalRuns) * 100);
-
-        if (confidence < testSpec.expectedToolConfidence) {
-          passFailMap.set(utteranceKey, {
-            ...(passFailMap.get(utteranceKey) ?? { tools: true, parameters: true }),
-            tools: false,
-          });
-        }
-
-        return {
-          model,
-          expectedTool: testSpec.expectedTool,
-          actualTools: runs
-            .map((r, idx) => `Run ${idx + 1}: ${r.invocations.flatMap((inv) => inv.tool).join(', ')}`)
-            .join('\n'),
-          count: `${actualToolCount}/${totalRuns}`,
-          actualConfidence: `${confidence}%`,
-          expectedConfidence: `${testSpec.expectedToolConfidence}%`,
-          status: confidence >= testSpec.expectedToolConfidence ? colorize('green', 'PASS') : colorize('red', 'FAIL'),
-        };
-      });
-
       printTable({
         title: 'Tool Invocations',
-        data: toolTableData,
+        data: Object.entries(runsByModel).map(([model, runs]) => {
+          const actualToolCount = countRunsThatPassToolMatching(testSpec, runs);
+          const totalRuns = runs.length;
+          const confidence = Math.round((actualToolCount / totalRuns) * 100);
+
+          if (confidence < testSpec.expectedToolConfidence) {
+            passFailMap.set(utteranceKey, {
+              ...(passFailMap.get(utteranceKey) ?? { tools: true, parameters: true }),
+              tools: false,
+            });
+          }
+
+          return {
+            model,
+            expectedTool: testSpec.expectedTool,
+            actualTools: runs
+              .map((r, idx) => `Run ${idx + 1}: ${r.invocations.flatMap((inv) => inv.tool).join(', ')}`)
+              .join('\n'),
+            count: `${actualToolCount}/${totalRuns}`,
+            actualConfidence: `${confidence}%`,
+            expectedConfidence: `${testSpec.expectedToolConfidence}%`,
+            status: confidence >= testSpec.expectedToolConfidence ? colorize('green', 'PASS') : colorize('red', 'FAIL'),
+          };
+        }),
         columns: [
           { key: 'model', name: 'Model', width: '30%' },
           { key: 'expectedTool', name: 'Expected Tool Invocation', width: '15%' },
@@ -334,50 +376,40 @@ https://git.soma.salesforce.com/pages/tech-enablement/einstein/docs/gateway/mode
       });
 
       if (testSpec.expectedParameters) {
-        const paramTableData = Object.entries(runsByModel).map(([model, runs]) => {
-          const runsThatMatchParameters = runs.filter((run) =>
-            Object.entries(testSpec.expectedParameters ?? {}).every(([key, value]) =>
-              run.invocations.some(
-                (inv) =>
-                  inv.tool === testSpec.expectedTool &&
-                  inv.parameters[key] &&
-                  new RegExp(value).test(inv.parameters[key])
-              )
-            )
-          ).length;
-
-          const totalRuns = runs.length;
-          const confidence = Math.round((runsThatMatchParameters / totalRuns) * 100);
-
-          if (confidence < testSpec.expectedParameterConfidence) {
-            passFailMap.set(utteranceKey, {
-              ...(passFailMap.get(utteranceKey) ?? { tools: true, parameters: true }),
-              parameters: false,
-            });
-          }
-
-          return {
-            model,
-            count: `${runsThatMatchParameters}/${totalRuns}`,
-            expectedParameters: makeReadableParameters(testSpec.expectedParameters ?? {}),
-            actualParameters: runs
-              .map(
-                (r, idx) =>
-                  `Run ${idx + 1}:\n${makeReadableParameters(
-                    r.invocations.find((inv) => inv.tool === testSpec.expectedTool)?.parameters ?? {}
-                  )}`
-              )
-              .join('\n'),
-            actualConfidence: `${confidence}%`,
-            expectedConfidence: `${testSpec.expectedParameterConfidence}%`,
-            status:
-              confidence >= testSpec.expectedParameterConfidence ? colorize('green', 'PASS') : colorize('red', 'FAIL'),
-          };
-        });
-
         printTable({
           title: 'Parameter Matching',
-          data: paramTableData,
+          data: Object.entries(runsByModel).map(([model, runs]) => {
+            const runsThatMatchParameters = countRunsThatPassParameterMatching(testSpec, runs);
+            const totalRuns = runs.length;
+            const confidence = Math.round((runsThatMatchParameters / totalRuns) * 100);
+
+            if (confidence < testSpec.expectedParameterConfidence) {
+              passFailMap.set(utteranceKey, {
+                ...(passFailMap.get(utteranceKey) ?? { tools: true, parameters: true }),
+                parameters: false,
+              });
+            }
+
+            return {
+              model,
+              count: `${runsThatMatchParameters}/${totalRuns}`,
+              expectedParameters: makeReadableParameters(testSpec.expectedParameters ?? {}),
+              actualParameters: runs
+                .map(
+                  (r, idx) =>
+                    `Run ${idx + 1}:\n${makeReadableParameters(
+                      r.invocations.find((inv) => inv.tool === testSpec.expectedTool)?.parameters ?? {}
+                    )}`
+                )
+                .join('\n'),
+              actualConfidence: `${confidence}%`,
+              expectedConfidence: `${testSpec.expectedParameterConfidence}%`,
+              status:
+                confidence >= testSpec.expectedParameterConfidence
+                  ? colorize('green', 'PASS')
+                  : colorize('red', 'FAIL'),
+            };
+          }),
           columns: [
             { key: 'model', name: 'Model', width: '30%' },
             { key: 'expectedParameters', name: 'Expected Parameters', width: '15%' },
@@ -393,15 +425,8 @@ https://git.soma.salesforce.com/pages/tech-enablement/einstein/docs/gateway/mode
       }
     }
 
-    const failingToolTests = Array.from(passFailMap.entries())
-      .filter(([, result]) => !result.tools)
-      .map(([key]) => testIndex.get(key))
-      .filter((test) => test !== undefined);
-
-    const failingParameterTests = Array.from(passFailMap.entries())
-      .filter(([, result]) => !result.parameters)
-      .map(([key]) => testIndex.get(key))
-      .filter((test) => test !== undefined);
+    const failingToolTests = filterFailingTests(passFailMap, testIndex, 'tools');
+    const failingParameterTests = filterFailingTests(passFailMap, testIndex, 'parameters');
 
     if (failingToolTests.length > 0) {
       stdout();
diff --git a/scripts/utils/tools.ts b/scripts/utils/tools.ts
index 7e183b79..307da218 100644
--- a/scripts/utils/tools.ts
+++ b/scripts/utils/tools.ts
@@ -16,12 +16,10 @@
 
 import { spawn } from 'node:child_process';
 import { Tool } from '@modelcontextprotocol/sdk/types.js';
-import { printTable } from '@oclif/table';
 import { colorize } from '@oclif/core/ux';
 import { encode as encodeGPT4oMini } from 'gpt-tokenizer/model/gpt-4o-mini';
 import { encode as encodeO3Mini } from 'gpt-tokenizer/model/o3-mini';
 import { encode as encodeGPT4 } from 'gpt-tokenizer/model/gpt-4';
-import { TABLE_STYLE } from './table.js';
 
 export type InvocableTool = {
   name: string;
@@ -33,7 +31,10 @@ export type InvocableTool = {
   };
 };
 
-export const getToolsList = async (): Promise<InvocableTool[]> => {
+export const getToolsList = async (): Promise<{
+  tools: InvocableTool[];
+  tokens: Array<{ tool: string; tokensGPT4oMini: number; tokensO3Mini: number; tokensGPT4: number }>;
+}> => {
   const toolsList: string = await new Promise<string>((resolve, reject) => {
     const child = spawn('npx', [
       '@modelcontextprotocol/inspector',
@@ -67,41 +68,29 @@ export const getToolsList = async (): Promise<InvocableTool[]> => {
 
   const parsedToolsList = JSON.parse(toolsList) as { tools: Tool[] };
 
-  const toolsWithTokens = parsedToolsList.tools?.map((tool) => ({
+  const tokens = (parsedToolsList.tools ?? []).map((tool) => ({
     tool: tool.name,
     tokensGPT4oMini: encodeGPT4oMini(JSON.stringify(tool)).length,
     tokensO3Mini: encodeO3Mini(JSON.stringify(tool)).length,
     tokensGPT4: encodeGPT4(JSON.stringify(tool)).length,
   }));
-  toolsWithTokens.push({
+  tokens.push({
     tool: colorize('bold', 'TOTAL'),
-    tokensGPT4oMini: toolsWithTokens.reduce((acc, tool) => acc + tool.tokensGPT4oMini, 0),
-    tokensO3Mini: toolsWithTokens.reduce((acc, tool) => acc + tool.tokensO3Mini, 0),
-    tokensGPT4: toolsWithTokens.reduce((acc, tool) => acc + tool.tokensGPT4, 0),
+    tokensGPT4oMini: tokens.reduce((acc, tool) => acc + tool.tokensGPT4oMini, 0),
+    tokensO3Mini: tokens.reduce((acc, tool) => acc + tool.tokensO3Mini, 0),
+    tokensGPT4: tokens.reduce((acc, tool) => acc + tool.tokensGPT4, 0),
   });
 
-  printTable({
-    title: 'Tools List',
-    data: toolsWithTokens,
-    columns: [
-      'tool',
-      { key: 'tokensGPT4oMini', name: 'GPT 4o Mini' },
-      { key: 'tokensO3Mini', name: 'O3 Mini' },
-      { key: 'tokensGPT4', name: 'GPT 4' },
-    ],
-    titleOptions: {
-      color: 'yellowBright',
-    },
-    ...TABLE_STYLE,
-  });
-
-  return (parsedToolsList.tools ?? []).map((tool) => ({
-    name: tool.name,
-    function: {
+  return {
+    tools: (parsedToolsList.tools ?? []).map((tool) => ({
       name: tool.name,
-      description: tool.description,
-      parameters: tool.inputSchema,
-      annotations: tool.annotations,
-    },
-  }));
+      function: {
+        name: tool.name,
+        description: tool.description,
+        parameters: tool.inputSchema,
+        annotations: tool.annotations,
+      },
+    })),
+    tokens,
+  };
 };

From 889a2673084b39c1e95ac93477b824b9aadfee26 Mon Sep 17 00:00:00 2001
From: Mike Donnalley <mdonnalley@salesforce.com>
Date: Tue, 22 Jul 2025 14:01:42 -0600
Subject: [PATCH 34/51] refactor: make it easier to add more confidence
 commands

---
 {scripts => confidence}/.eslintrc.cjs         |  0
 confidence/bin/dev.js                         | 22 ++++++++
 confidence/bin/run.js                         | 21 ++++++++
 .../src/commands}/confidence-test.ts          | 54 +++++++++----------
 confidence/src/index.ts                       | 17 ++++++
 {scripts => confidence/src}/utils/gateway.ts  |  0
 {scripts => confidence/src}/utils/models.ts   |  0
 {scripts => confidence/src}/utils/table.ts    |  0
 {scripts => confidence/src}/utils/tools.ts    |  2 +-
 {scripts => confidence/src}/utils/yaml.ts     |  0
 confidence/tsconfig.json                      |  9 ++++
 package.json                                  |  4 +-
 scripts/tsconfig.json                         |  8 ---
 yarn.lock                                     | 26 ++++++++-
 14 files changed, 123 insertions(+), 40 deletions(-)
 rename {scripts => confidence}/.eslintrc.cjs (100%)
 create mode 100755 confidence/bin/dev.js
 create mode 100755 confidence/bin/run.js
 rename {scripts => confidence/src/commands}/confidence-test.ts (95%)
 create mode 100644 confidence/src/index.ts
 rename {scripts => confidence/src}/utils/gateway.ts (100%)
 rename {scripts => confidence/src}/utils/models.ts (100%)
 rename {scripts => confidence/src}/utils/table.ts (100%)
 rename {scripts => confidence/src}/utils/tools.ts (97%)
 rename {scripts => confidence/src}/utils/yaml.ts (100%)
 create mode 100644 confidence/tsconfig.json
 delete mode 100644 scripts/tsconfig.json

diff --git a/scripts/.eslintrc.cjs b/confidence/.eslintrc.cjs
similarity index 100%
rename from scripts/.eslintrc.cjs
rename to confidence/.eslintrc.cjs
diff --git a/confidence/bin/dev.js b/confidence/bin/dev.js
new file mode 100755
index 00000000..f5e5d3ac
--- /dev/null
+++ b/confidence/bin/dev.js
@@ -0,0 +1,22 @@
+#!/usr/bin/env -S node --loader ts-node/esm --disable-warning=ExperimentalWarning
+
+import { dirname } from 'node:path';
+import { execute } from '@oclif/core';
+
+await execute({
+  development: true,
+  dir: import.meta.url,
+  loadOptions: {
+    root: dirname(import.meta.dirname),
+    pjson: {
+      name: 'mcp-test',
+      version: '1.0.0',
+      oclif: {
+        bin: 'mcp-test',
+        dirname: 'mcp-test',
+        commands: './lib/commands',
+        topicSeparator: ' ',
+      },
+    },
+  },
+});
diff --git a/confidence/bin/run.js b/confidence/bin/run.js
new file mode 100755
index 00000000..909acd4c
--- /dev/null
+++ b/confidence/bin/run.js
@@ -0,0 +1,21 @@
+#!/usr/bin/env node
+
+import { dirname } from 'node:path';
+import { execute } from '@oclif/core';
+
+await execute({
+  dir: import.meta.url,
+  loadOptions: {
+    root: dirname(import.meta.dirname),
+    pjson: {
+      name: 'mcp-test',
+      version: '1.0.0',
+      oclif: {
+        bin: 'mcp-test',
+        dirname: 'mcp-test',
+        commands: './lib/commands',
+        topicSeparator: ' ',
+      },
+    },
+  },
+});
diff --git a/scripts/confidence-test.ts b/confidence/src/commands/confidence-test.ts
similarity index 95%
rename from scripts/confidence-test.ts
rename to confidence/src/commands/confidence-test.ts
index d82658ec..21320256 100644
--- a/scripts/confidence-test.ts
+++ b/confidence/src/commands/confidence-test.ts
@@ -14,16 +14,15 @@
  * limitations under the License.
  */
 
-import { dirname } from 'node:path';
 import { printTable } from '@oclif/table';
 import { stdout, colorize } from '@oclif/core/ux';
-import { Command, Flags, flush, handle } from '@oclif/core';
+import { Command, Flags } from '@oclif/core';
 import { z } from 'zod';
-import { makeGatewayRequests } from './utils/gateway.js';
-import { getToolsList, InvocableTool } from './utils/tools.js';
-import { TABLE_STYLE } from './utils/table.js';
-import { readYamlFile } from './utils/yaml.js';
-import { Model } from './utils/models.js';
+import { makeGatewayRequests } from '../utils/gateway.js';
+import { getToolsList, InvocableTool } from '../utils/tools.js';
+import { TABLE_STYLE } from '../utils/table.js';
+import { readYamlFile } from '../utils/yaml.js';
+import { Model } from '../utils/models.js';
 
 const Spec = z.object({
   models: z.array(z.custom<Model>()),
@@ -161,7 +160,6 @@ async function compareModelOutputs(
 }
 
 export default class ConfidenceTest extends Command {
-  public static id = 'confidence-test';
   public static summary = 'Test the MCP server against the LLM Gateway API';
   public static description = `Tests that the MCP server tools are accurately invoked by various LLM models.
 
@@ -452,23 +450,23 @@ https://git.soma.salesforce.com/pages/tech-enablement/einstein/docs/gateway/mode
   }
 }
 
-ConfidenceTest.run(process.argv.slice(2), {
-  root: dirname(import.meta.dirname),
-  pjson: {
-    name: 'confidence-test',
-    version: '0.0.1',
-    oclif: {
-      commands: {
-        strategy: 'single',
-        target: 'scripts/confidence-test.js',
-      },
-    },
-  },
-}).then(
-  async () => {
-    await flush();
-  },
-  async (err) => {
-    await handle(err as Error);
-  }
-);
+// ConfidenceTest.run(process.argv.slice(2), {
+//   root: dirname(import.meta.dirname),
+//   pjson: {
+//     name: 'confidence-test',
+//     version: '0.0.1',
+//     oclif: {
+//       commands: {
+//         strategy: 'single',
+//         target: 'scripts/confidence-test.js',
+//       },
+//     },
+//   },
+// }).then(
+//   async () => {
+//     await flush();
+//   },
+//   async (err) => {
+//     await handle(err as Error);
+//   }
+// );
diff --git a/confidence/src/index.ts b/confidence/src/index.ts
new file mode 100644
index 00000000..320514bb
--- /dev/null
+++ b/confidence/src/index.ts
@@ -0,0 +1,17 @@
+/*
+ * Copyright 2025, Salesforce, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+export {};
diff --git a/scripts/utils/gateway.ts b/confidence/src/utils/gateway.ts
similarity index 100%
rename from scripts/utils/gateway.ts
rename to confidence/src/utils/gateway.ts
diff --git a/scripts/utils/models.ts b/confidence/src/utils/models.ts
similarity index 100%
rename from scripts/utils/models.ts
rename to confidence/src/utils/models.ts
diff --git a/scripts/utils/table.ts b/confidence/src/utils/table.ts
similarity index 100%
rename from scripts/utils/table.ts
rename to confidence/src/utils/table.ts
diff --git a/scripts/utils/tools.ts b/confidence/src/utils/tools.ts
similarity index 97%
rename from scripts/utils/tools.ts
rename to confidence/src/utils/tools.ts
index 307da218..9b09167c 100644
--- a/scripts/utils/tools.ts
+++ b/confidence/src/utils/tools.ts
@@ -61,7 +61,7 @@ export const getToolsList = async (): Promise<{
       if (code === 0) {
         resolve(output);
       } else {
-        reject(new Error(`Process exited with code ${code}`));
+        reject(new Error(`Process exited with code ${code ?? 'unknown'}`));
       }
     });
   });
diff --git a/scripts/utils/yaml.ts b/confidence/src/utils/yaml.ts
similarity index 100%
rename from scripts/utils/yaml.ts
rename to confidence/src/utils/yaml.ts
diff --git a/confidence/tsconfig.json b/confidence/tsconfig.json
new file mode 100644
index 00000000..f6681780
--- /dev/null
+++ b/confidence/tsconfig.json
@@ -0,0 +1,9 @@
+{
+  "extends": "../tsconfig.json",
+  "compilerOptions": {
+    "outDir": "./lib",
+    "rootDir": "./src",
+    "skipLibCheck": true
+  },
+  "include": ["./src/**/*.ts"]
+}
diff --git a/package.json b/package.json
index 9545b937..0fb23d7f 100644
--- a/package.json
+++ b/package.json
@@ -26,7 +26,7 @@
     "start": "yarn build && npm link && mcp-inspector sf-mcp-server",
     "test": "wireit",
     "test:only": "wireit",
-    "test:confidence-test": "node --no-warnings --loader ts-node/esm scripts/confidence-test.ts"
+    "test:confidence": "tsc -p scripts/ --pretty --incremental && confidence/bin/run.js confidence-test"
   },
   "repository": "salesforcecli/mcp",
   "bugs": {
@@ -42,7 +42,7 @@
   "dependencies": {
     "@jsforce/jsforce-node": "^3.9.1",
     "@modelcontextprotocol/sdk": "^1.15.1",
-    "@oclif/core": "^4.5.0",
+    "@oclif/core": "^4.5.1",
     "@salesforce/agents": "^0.15.4",
     "@salesforce/apex-node": "^8.2.1",
     "@salesforce/core": "^8.18.0",
diff --git a/scripts/tsconfig.json b/scripts/tsconfig.json
deleted file mode 100644
index 93273c54..00000000
--- a/scripts/tsconfig.json
+++ /dev/null
@@ -1,8 +0,0 @@
-{
-  "extends": "../tsconfig.json",
-  "compilerOptions": {
-    "outDir": "../dist/scripts",
-    "rootDir": "."
-  },
-  "include": ["**/*"]
-}
diff --git a/yarn.lock b/yarn.lock
index 68279302..a5f28b0d 100644
--- a/yarn.lock
+++ b/yarn.lock
@@ -1494,7 +1494,7 @@
     "@nodelib/fs.scandir" "2.1.5"
     fastq "^1.6.0"
 
-"@oclif/core@^4", "@oclif/core@^4.4.0", "@oclif/core@^4.5.0":
+"@oclif/core@^4", "@oclif/core@^4.4.0":
   version "4.5.0"
   resolved "https://registry.yarnpkg.com/@oclif/core/-/core-4.5.0.tgz#0163f933098bfa52f86387f11900da1ad13235d3"
   integrity sha512-UYWyDFNKFyzgXVXO0DHfOvJ/8qpw4yPYe7fOHausDEVU44qjDr90ZnfYTljZPK8dhgMggxiZs9n+TFajnXRp7g==
@@ -1518,6 +1518,30 @@
     wordwrap "^1.0.0"
     wrap-ansi "^7.0.0"
 
+"@oclif/core@^4.5.1":
+  version "4.5.1"
+  resolved "https://registry.yarnpkg.com/@oclif/core/-/core-4.5.1.tgz#7fa9041d13f624e4c00d89605d9f732cf8084748"
+  integrity sha512-JAuARvXOzf75L7rqLL3TIP3OmuTf7N/cjRejkGASfRJH+09180+EGbSkPWSMCns+AaYpDMI+fdaJ6QCoa3f15A==
+  dependencies:
+    ansi-escapes "^4.3.2"
+    ansis "^3.17.0"
+    clean-stack "^3.0.1"
+    cli-spinners "^2.9.2"
+    debug "^4.4.0"
+    ejs "^3.1.10"
+    get-package-type "^0.1.0"
+    indent-string "^4.0.0"
+    is-wsl "^2.2.0"
+    lilconfig "^3.1.3"
+    minimatch "^9.0.5"
+    semver "^7.6.3"
+    string-width "^4.2.3"
+    supports-color "^8"
+    tinyglobby "^0.2.14"
+    widest-line "^3.1.0"
+    wordwrap "^1.0.0"
+    wrap-ansi "^7.0.0"
+
 "@oclif/plugin-help@^6.2.29":
   version "6.2.29"
   resolved "https://registry.yarnpkg.com/@oclif/plugin-help/-/plugin-help-6.2.29.tgz#6c349de65f5a7fee5f94032137fc872ac829d259"

From de623992d97768227b89a8226f589951ac3070a2 Mon Sep 17 00:00:00 2001
From: Mike Donnalley <mdonnalley@salesforce.com>
Date: Wed, 23 Jul 2025 15:13:47 -0600
Subject: [PATCH 35/51] chore: clean up and more test cases

---
 confidence/.eslintrc.cjs                   |   2 +-
 confidence/src/commands/confidence-test.ts |   8 +-
 confidence/src/utils/gateway.ts            |   4 +-
 package.json                               |   4 +-
 test/confidence/sf-deploy-metadata.yml     | 170 ++++++++++++++++++---
 yarn.lock                                  |  20 ++-
 6 files changed, 180 insertions(+), 28 deletions(-)

diff --git a/confidence/.eslintrc.cjs b/confidence/.eslintrc.cjs
index e4e8222f..ca212568 100644
--- a/confidence/.eslintrc.cjs
+++ b/confidence/.eslintrc.cjs
@@ -20,7 +20,7 @@ module.exports = {
     project: [
       './tsconfig.json',
       './test/tsconfig.json',
-      './scripts/tsconfig.json', // Add this line
+      './confidence/tsconfig.json', // tsconfig for the confidence test sources
     ],
   },
 };
diff --git a/confidence/src/commands/confidence-test.ts b/confidence/src/commands/confidence-test.ts
index 21320256..72a54346 100644
--- a/confidence/src/commands/confidence-test.ts
+++ b/confidence/src/commands/confidence-test.ts
@@ -35,6 +35,8 @@ const Spec = z.object({
       'expected-tool-confidence': z.number(),
       'expected-parameter-confidence': z.number().optional(),
       'allowed-tools': z.array(z.string()).optional(),
+      skip: z.boolean().optional(),
+      only: z.boolean().optional(),
     })
   ),
 });
@@ -261,7 +263,11 @@ https://git.soma.salesforce.com/pages/tech-enablement/einstein/docs/gateway/mode
     // This allows us to group runs by utterance and display results clearly
     const testIndex = new Map<string, TestCase>();
 
-    const runPromises = spec.data.tests.flatMap((test) => {
+    // `only` takes precedence over `skip`: run every test flagged `only`, otherwise every test not flagged `skip`
+    const onlyTests = spec.data.tests.filter((test) => test.only);
+    const filteredTests = onlyTests.length > 0 ? onlyTests : spec.data.tests.filter((test) => !test.skip);
+
+    const runPromises = filteredTests.flatMap((test) => {
       const utteranceKey = Math.random().toString(36).substring(2, 15);
       testIndex.set(utteranceKey, {
         readable: `${colorize('yellowBright', 'Utterance')}:\n  - ${castToArray(test.utterances).join('\n  - ')}`,
diff --git a/confidence/src/utils/gateway.ts b/confidence/src/utils/gateway.ts
index b11ed024..c3ec5b49 100644
--- a/confidence/src/utils/gateway.ts
+++ b/confidence/src/utils/gateway.ts
@@ -43,11 +43,9 @@ type GatewayResponse = {
 const createRequestHeaders = (): Record<string, string> => ({
   Authorization: `API_KEY ${API_KEY}`,
   'Content-Type': 'application/json',
-  // We need to figure out which tenant, context, and feature id to use
-  // Maybe this is something that will be given to us once the client registration completes???
   'x-sfdc-core-tenant-id': 'core/prod1/00DDu0000008cuqMAA',
   'x-sfdc-app-context': 'EinsteinGPT',
-  'x-client-feature-id': 'EinsteinDocsAnswers',
+  'x-client-feature-id': 'platform-cli-mcp-tests',
 });
 
 const createRequestBody = (
diff --git a/package.json b/package.json
index 9dabd0b5..6e185cfe 100644
--- a/package.json
+++ b/package.json
@@ -26,7 +26,7 @@
     "start": "yarn build && npm link && mcp-inspector sf-mcp-server",
     "test": "wireit",
     "test:only": "wireit",
-    "test:confidence": "tsc -p scripts/ --pretty --incremental && confidence/bin/run.js confidence-test"
+    "test:confidence": "yarn compile && tsc -p confidence/ --pretty --incremental && confidence/bin/run.js confidence-test"
   },
   "repository": "salesforcecli/mcp",
   "bugs": {
@@ -41,7 +41,7 @@
   ],
   "dependencies": {
     "@jsforce/jsforce-node": "^3.9.1",
-    "@modelcontextprotocol/sdk": "^1.15.1",
+    "@modelcontextprotocol/sdk": "^1.16.0",
     "@oclif/core": "^4.5.1",
     "@salesforce/agents": "^0.15.4",
     "@salesforce/apex-node": "^8.2.1",
diff --git a/test/confidence/sf-deploy-metadata.yml b/test/confidence/sf-deploy-metadata.yml
index 1c0bc67e..d24c7255 100644
--- a/test/confidence/sf-deploy-metadata.yml
+++ b/test/confidence/sf-deploy-metadata.yml
@@ -2,12 +2,13 @@ models:
   # - llmgateway__OpenAIGPT35Turbo_01_25
   # - llmgateway__OpenAIGPT4OmniMini
   - llmgateway__OpenAIGPT41Nano
-  # - llmgateway__BedrockAnthropicClaude4Sonnet
+  - llmgateway__BedrockAnthropicClaude4Sonnet
 
 initial-context:
   - 'My current OS is macos. I am working in a workspace with the following folders: /Users/sf-dev/dreamhouse-lwc'
 
 tests:
+  # Deploy specific source directory (Lightning Web Components)
   - utterances:
       - My org alias is dreamhouse. Deploy the Lightning Web Components in force-app/main/default/lwc to the dreamhouse org.
     expected-tool: sf-deploy-metadata
@@ -19,22 +20,151 @@ tests:
     expected-parameter-confidence: 100
     allowed-tools:
       - sf-list-all-orgs
-  # - utterances:
-  #     - Deploy my changes to the dreamhouse org
-  #   expected-tool: sf-deploy-metadata
-  #   expected-parameters:
-  #     directory: ^/.*dreamhouse-lwc$
-  #     usernameOrAlias: dreamhouse
-  #   expected-tool-confidence: 100
-  #   expected-parameter-confidence: 100
-  # - utterances:
-  #     - Hello. Who are you and what can you do?
-  #     - I am a salesforce developer working on the dreamhouse-lwc project. My org alias is dreamhouse.
-  #     - I want to deploy only the Lightning Web Components in force-app/main/default/lwc to dreamhouse.
-  #   expected-tool: sf-deploy-metadata
-  #   expected-parameters:
-  #     sourceDir: force-app/main/default/lwc
-  #     directory: ^/.*dreamhouse-lwc$
-  #     usernameOrAlias: dreamhouse
-  #   expected-tool-confidence: 100
-  #   expected-parameter-confidence: 100
+
+  # Deploy multiple source directories
+  - utterances:
+      - Deploy the classes and lwc folders to my dreamhouse org.
+    expected-tool: sf-deploy-metadata
+    expected-parameters:
+      sourceDir: force-app/main/default/classes,force-app/main/default/lwc
+      directory: /Users/sf-dev/dreamhouse-lwc
+      usernameOrAlias: dreamhouse
+    expected-tool-confidence: 100
+    expected-parameter-confidence: 100
+
+  # Deploy using manifest file
+  - utterances:
+      - Deploy the components specified in my package.xml manifest to dreamhouse.
+    expected-tool: sf-deploy-metadata
+    expected-parameters:
+      manifest: /Users/sf-dev/dreamhouse-lwc/package.xml
+      directory: /Users/sf-dev/dreamhouse-lwc
+      usernameOrAlias: dreamhouse
+    expected-tool-confidence: 100
+    expected-parameter-confidence: 100
+
+  # Deploy all local changes (no sourceDir or manifest specified)
+  - utterances:
+      - Deploy my changes to the dreamhouse org.
+    expected-tool: sf-deploy-metadata
+    expected-parameters:
+      directory: /Users/sf-dev/dreamhouse-lwc
+      usernameOrAlias: dreamhouse
+    expected-tool-confidence: 100
+    expected-parameter-confidence: 100
+
+  # Deploy with no tests run
+  - utterances:
+      - Deploy my changes to the dreamhouse org without running any tests.
+    expected-tool: sf-deploy-metadata
+    expected-parameters:
+      directory: /Users/sf-dev/dreamhouse-lwc
+      usernameOrAlias: dreamhouse
+      apexTestLevel: NoTestRun
+    expected-tool-confidence: 100
+    expected-parameter-confidence: 100
+
+  # Deploy with local tests
+  - utterances:
+      - Deploy force-app/main/default/classes to the dreamhouse org and run all local tests.
+    expected-tool: sf-deploy-metadata
+    expected-parameters:
+      sourceDir: force-app/main/default/classes
+      directory: /Users/sf-dev/dreamhouse-lwc
+      usernameOrAlias: dreamhouse
+      apexTestLevel: RunLocalTests
+    expected-tool-confidence: 100
+    expected-parameter-confidence: 100
+
+  # Deploy with all org tests
+  - utterances:
+      - Deploy the apex classes and run all tests in the org including managed packages.
+    expected-tool: sf-deploy-metadata
+    expected-parameters:
+      sourceDir: force-app/main/default/classes
+      directory: /Users/sf-dev/dreamhouse-lwc
+      usernameOrAlias: dreamhouse
+      apexTestLevel: RunAllTestsInOrg
+    expected-tool-confidence: 100
+    expected-parameter-confidence: 100
+
+  # Deploy with specific apex tests
+  - utterances:
+      - Deploy my classes to the dreamhouse org and run the PropertyControllerTest and BrokerControllerTest apex tests.
+    expected-tool: sf-deploy-metadata
+    expected-parameters:
+      sourceDir: force-app/main/default/classes
+      directory: /Users/sf-dev/dreamhouse-lwc
+      usernameOrAlias: dreamhouse
+      apexTests: PropertyControllerTest,BrokerControllerTest
+    expected-tool-confidence: 100
+    expected-parameter-confidence: 100
+
+  # Deploy with single apex test
+  - utterances:
+      - Deploy the PropertyController class and run PropertyControllerTest to the dreamhouse org.
+    expected-tool: sf-deploy-metadata
+    expected-parameters:
+      sourceDir: force-app/main/default/classes
+      directory: /Users/sf-dev/dreamhouse-lwc
+      usernameOrAlias: dreamhouse
+      apexTests: PropertyControllerTest
+    expected-tool-confidence: 100
+    expected-parameter-confidence: 100
+
+  # Deploy specific file type mentioned
+  - utterances:
+      - Deploy the flows to the dreamhouse org.
+    expected-tool: sf-deploy-metadata
+    expected-parameters:
+      sourceDir: force-app/main/default/flows
+      directory: /Users/sf-dev/dreamhouse-lwc
+      usernameOrAlias: dreamhouse
+    expected-tool-confidence: 100
+    expected-parameter-confidence: 100
+
+  # Deploy with multiple metadata types
+  - utterances:
+      - Deploy the objects, classes, and tabs to the dreamhouse org.
+    expected-tool: sf-deploy-metadata
+    expected-parameters:
+      sourceDir: force-app/main/default/objects,force-app/main/default/classes,force-app/main/default/tabs
+      directory: /Users/sf-dev/dreamhouse-lwc
+      usernameOrAlias: dreamhouse
+    expected-tool-confidence: 100
+    expected-parameter-confidence: 100
+
+  # Deploy specific classes and run multiple named apex tests
+  - utterances:
+      - Deploy the PropertyController and PropertyTrigger from the classes folder to the dreamhouse org and run PropertyControllerTest, PropertyTriggerTest, and PropertyUtilTest.
+    expected-tool: sf-deploy-metadata
+    expected-parameters:
+      sourceDir: force-app/main/default/classes
+      directory: /Users/sf-dev/dreamhouse-lwc
+      usernameOrAlias: dreamhouse
+      apexTests: PropertyControllerTest,PropertyTriggerTest,PropertyUtilTest
+    expected-tool-confidence: 100
+    expected-parameter-confidence: 100
+
+  # Deploy without org specified (should use sf-get-username)
+  - utterances:
+      - Deploy the lwc components.
+    expected-tool: sf-deploy-metadata
+    expected-parameters:
+      sourceDir: force-app/main/default/lwc
+      directory: /Users/sf-dev/dreamhouse-lwc
+    expected-tool-confidence: 90
+    expected-parameter-confidence: 70
+    allowed-tools:
+      - sf-get-username
+
+  # Deploy with relative path specification
+  - utterances:
+      - Deploy everything in the force-app directory to the dreamhouse org.
+    expected-tool: sf-deploy-metadata
+    expected-parameters:
+      sourceDir: force-app
+      directory: /Users/sf-dev/dreamhouse-lwc
+      usernameOrAlias: dreamhouse
+    expected-tool-confidence: 100
+    expected-parameter-confidence: 100
diff --git a/yarn.lock b/yarn.lock
index d1a423c5..4b4ae797 100644
--- a/yarn.lock
+++ b/yarn.lock
@@ -1714,7 +1714,7 @@
     ts-node "^10.9.2"
     zod "^3.23.8"
 
-"@modelcontextprotocol/sdk@^1.13.1", "@modelcontextprotocol/sdk@^1.15.1":
+"@modelcontextprotocol/sdk@^1.13.1":
   version "1.15.1"
   resolved "https://registry.yarnpkg.com/@modelcontextprotocol/sdk/-/sdk-1.15.1.tgz#30a235f91e144b62ef1810bc63679e423bac37ec"
   integrity sha512-W/XlN9c528yYn+9MQkVjxiTPgPxoxt+oczfjHBDsJx0+59+O7B75Zhsp0B16Xbwbz8ANISDajh6+V7nIcPMc5w==
@@ -1732,6 +1732,24 @@
     zod "^3.23.8"
     zod-to-json-schema "^3.24.1"
 
+"@modelcontextprotocol/sdk@^1.16.0":
+  version "1.16.0"
+  resolved "https://registry.yarnpkg.com/@modelcontextprotocol/sdk/-/sdk-1.16.0.tgz#39a28a4f775778ec90369ddb5ccfb58a5b9b838f"
+  integrity sha512-8ofX7gkZcLj9H9rSd50mCgm3SSF8C7XoclxJuLoV0Cz3rEQ1tv9MZRYYvJtm9n1BiEQQMzSmE/w2AEkNacLYfg==
+  dependencies:
+    ajv "^6.12.6"
+    content-type "^1.0.5"
+    cors "^2.8.5"
+    cross-spawn "^7.0.5"
+    eventsource "^3.0.2"
+    eventsource-parser "^3.0.0"
+    express "^5.0.1"
+    express-rate-limit "^7.5.0"
+    pkce-challenge "^5.0.0"
+    raw-body "^3.0.0"
+    zod "^3.23.8"
+    zod-to-json-schema "^3.24.1"
+
 "@nodelib/fs.scandir@2.1.5":
   version "2.1.5"
   resolved "https://registry.npmjs.org/@nodelib/fs.scandir/-/fs.scandir-2.1.5.tgz"

From 01ad59ac02dcd2244847f1ff1f569c8a6fd227d6 Mon Sep 17 00:00:00 2001
From: Mike Donnalley <mdonnalley@salesforce.com>
Date: Wed, 23 Jul 2025 19:18:18 -0600
Subject: [PATCH 36/51] chore: add comments about client feature

---
 confidence/src/utils/gateway.ts | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/confidence/src/utils/gateway.ts b/confidence/src/utils/gateway.ts
index c3ec5b49..db258254 100644
--- a/confidence/src/utils/gateway.ts
+++ b/confidence/src/utils/gateway.ts
@@ -43,7 +43,10 @@ type GatewayResponse = {
 const createRequestHeaders = (): Record<string, string> => ({
   Authorization: `API_KEY ${API_KEY}`,
   'Content-Type': 'application/json',
+  // taken from example in docs. Theoretically we'd have our own after fully onboarding?
+  // https://git.soma.salesforce.com/pages/tech-enablement/einstein/docs/gateway/access/gateway-access/
   'x-sfdc-core-tenant-id': 'core/prod1/00DDu0000008cuqMAA',
+  // https://git.soma.salesforce.com/einsteingpt/module-llmg-cts-registry/blob/master/docs/features/PLATFORM_C_L_I_M_C_P_TESTS.yml
   'x-sfdc-app-context': 'EinsteinGPT',
   'x-client-feature-id': 'platform-cli-mcp-tests',
 });

From 856532c2621fa350fe795a722e440dab4b9f1a55 Mon Sep 17 00:00:00 2001
From: Mike Donnalley <mdonnalley@salesforce.com>
Date: Wed, 23 Jul 2025 19:52:34 -0600
Subject: [PATCH 37/51] fix: implement retries for 429

---
 confidence/src/utils/gateway.ts | 10 +++++++++-
 package.json                    |  1 +
 yarn.lock                       |  5 +++++
 3 files changed, 15 insertions(+), 1 deletion(-)

diff --git a/confidence/src/utils/gateway.ts b/confidence/src/utils/gateway.ts
index db258254..4f999ed1 100644
--- a/confidence/src/utils/gateway.ts
+++ b/confidence/src/utils/gateway.ts
@@ -14,9 +14,12 @@
  * limitations under the License.
  */
 
+import makeFetch from 'fetch-retry';
 import { Model } from './models.js';
 import { InvocableTool } from './tools.js';
 
+const fetchRetry = makeFetch(fetch);
+
 const API_KEY = process.env.SF_LLMG_API_KEY;
 process.env.NODE_TLS_REJECT_UNAUTHORIZED = '0';
 
@@ -75,12 +78,17 @@ const makeSingleGatewayRequest = async (
   tools: InvocableTool[],
   messages: Array<{ role: string; content: string }>
 ): Promise<GatewayResponse> => {
-  const response = await fetch(
+  const response = await fetchRetry(
     'https://bot-svc-llm.sfproxy.einsteintest1.test1-uswest2.aws.sfdc.cl/v1.0/chat/generations',
     {
       method: 'POST',
       headers: createRequestHeaders(),
       body: createRequestBody(model, tools, messages),
+      retryDelay(attempt) {
+        return Math.pow(2, attempt) * 1000; // 1000, 2000, 4000, 8000, 16000 (attempt starts at 0; retries: 5)
+      },
+      retries: 5,
+      retryOn: [429],
     }
   );
 
diff --git a/package.json b/package.json
index 6e185cfe..dd61a714 100644
--- a/package.json
+++ b/package.json
@@ -51,6 +51,7 @@
     "@salesforce/source-tracking": "^7.4.8",
     "@salesforce/telemetry": "^6.0.39",
     "@salesforce/ts-types": "^2.0.11",
+    "fetch-retry": "^6.0.0",
     "open": "^10.1.2",
     "zod": "^3.25.67"
   },
diff --git a/yarn.lock b/yarn.lock
index 4b4ae797..485b408a 100644
--- a/yarn.lock
+++ b/yarn.lock
@@ -5586,6 +5586,11 @@ fdir@^6.4.4:
   resolved "https://registry.yarnpkg.com/fdir/-/fdir-6.4.5.tgz#328e280f3a23699362f95f2e82acf978a0c0cb49"
   integrity sha512-4BG7puHpVsIYxZUbiUE3RqGloLaSSwzYie5jvasC4LWuBWzZawynvYouhjbQKw2JuIGYdm0DzIxl8iVidKlUEw==
 
+fetch-retry@^6.0.0:
+  version "6.0.0"
+  resolved "https://registry.yarnpkg.com/fetch-retry/-/fetch-retry-6.0.0.tgz#4ffdf92c834d72ae819e42a4ee2a63f1e9454426"
+  integrity sha512-BUFj1aMubgib37I3v4q78fYo63Po7t4HUPTpQ6/QE6yK6cIQrP+W43FYToeTEyg5m2Y7eFUtijUuAv/PDlWuag==
+
 file-entry-cache@^6.0.1:
   version "6.0.1"
   resolved "https://registry.npmjs.org/file-entry-cache/-/file-entry-cache-6.0.1.tgz"

From 6a7ba927e11eee9abb96aed6e6c855d5e96b17d0 Mon Sep 17 00:00:00 2001
From: Mike Donnalley <mdonnalley@salesforce.com>
Date: Wed, 23 Jul 2025 21:39:23 -0600
Subject: [PATCH 38/51] feat: rate limit api requests to avoid 429s

---
 confidence/.eslintrc.cjs                   |   3 +
 confidence/src/commands/confidence-test.ts |  25 ++-
 confidence/src/utils/gateway.ts            |  12 +-
 confidence/src/utils/rate-limiter.ts       | 207 +++++++++++++++++++++
 package.json                               |   4 +-
 yarn.lock                                  |  12 ++
 6 files changed, 252 insertions(+), 11 deletions(-)
 create mode 100644 confidence/src/utils/rate-limiter.ts

diff --git a/confidence/.eslintrc.cjs b/confidence/.eslintrc.cjs
index ca212568..9b35591d 100644
--- a/confidence/.eslintrc.cjs
+++ b/confidence/.eslintrc.cjs
@@ -23,4 +23,7 @@ module.exports = {
       './confidence/tsconfig.json', // Add this line
     ],
   },
+  rules: {
+    'import/no-extraneous-dependencies': ['error', { devDependencies: true }],
+  },
 };
diff --git a/confidence/src/commands/confidence-test.ts b/confidence/src/commands/confidence-test.ts
index 72a54346..373ae78d 100644
--- a/confidence/src/commands/confidence-test.ts
+++ b/confidence/src/commands/confidence-test.ts
@@ -267,6 +267,15 @@ https://git.soma.salesforce.com/pages/tech-enablement/einstein/docs/gateway/mode
       ? [spec.data.tests.find((test) => test.only)!]
       : spec.data.tests.filter((test) => !test.skip);
 
+    const logStatus = (message: string): void => {
+      if (flags.verbose) {
+        stdout(colorize('yellowBright', `Status: ${message}`));
+      }
+    };
+
+    let completedRuns = 0;
+    const totalTestCases = filteredTests.length * flags.runs;
+    logStatus(`Running ${totalTestCases} test cases...`);
     const runPromises = filteredTests.flatMap((test) => {
       const utteranceKey = Math.random().toString(36).substring(2, 15);
       testIndex.set(utteranceKey, {
@@ -279,12 +288,16 @@ https://git.soma.salesforce.com/pages/tech-enablement/einstein/docs/gateway/mode
         allowedTools: [test['expected-tool'], ...(test['allowed-tools'] ?? [])],
       });
       return Array.from({ length: flags.runs }, (_, idx) =>
-        compareModelOutputs(test.utterances, spec.data, mcpTools).then(({ invocations, tableData }) => ({
-          idx,
-          utteranceKey,
-          invocations,
-          tableData,
-        }))
+        compareModelOutputs(test.utterances, spec.data, mcpTools).then(({ invocations, tableData }) => {
+          completedRuns++;
+          logStatus(`Completed run ${completedRuns} of ${totalTestCases}`);
+          return {
+            idx,
+            utteranceKey,
+            invocations,
+            tableData,
+          };
+        })
       );
     });
 
diff --git a/confidence/src/utils/gateway.ts b/confidence/src/utils/gateway.ts
index 4f999ed1..76aa9428 100644
--- a/confidence/src/utils/gateway.ts
+++ b/confidence/src/utils/gateway.ts
@@ -17,6 +17,7 @@
 import makeFetch from 'fetch-retry';
 import { Model } from './models.js';
 import { InvocableTool } from './tools.js';
+import { RateLimiter } from './rate-limiter.js';
 
 const fetchRetry = makeFetch(fetch);
 
@@ -73,14 +74,17 @@ const createRequestBody = (
     },
   });
 
+// We're using a pre-production environment so we currently have the default 40 requests per minute per client-feature-id.
+// See: https://git.soma.salesforce.com/pages/tech-enablement/einstein/docs/gateway/rate-limits/#pre-production-environments
+const rateLimiter = new RateLimiter(40, 60_000);
+
 const makeSingleGatewayRequest = async (
   model: Model,
   tools: InvocableTool[],
   messages: Array<{ role: string; content: string }>
 ): Promise<GatewayResponse> => {
-  const response = await fetchRetry(
-    'https://bot-svc-llm.sfproxy.einsteintest1.test1-uswest2.aws.sfdc.cl/v1.0/chat/generations',
-    {
+  const response = await rateLimiter.enqueue(async () =>
+    fetchRetry('https://bot-svc-llm.sfproxy.einsteintest1.test1-uswest2.aws.sfdc.cl/v1.0/chat/generations', {
       method: 'POST',
       headers: createRequestHeaders(),
       body: createRequestBody(model, tools, messages),
@@ -89,7 +93,7 @@ const makeSingleGatewayRequest = async (
       },
       retries: 5,
       retryOn: [429],
-    }
+    })
   );
 
   if (!response.ok) {
diff --git a/confidence/src/utils/rate-limiter.ts b/confidence/src/utils/rate-limiter.ts
new file mode 100644
index 00000000..6c021b4c
--- /dev/null
+++ b/confidence/src/utils/rate-limiter.ts
@@ -0,0 +1,207 @@
+/*
+ * Copyright 2025, Salesforce, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import makeDebug from 'debug';
+
+const debug = makeDebug('confidence:rate-limiter');
+
+type QueuedRequest<T extends Response> = {
+  execute: () => Promise<T>;
+  resolve: (value: T | PromiseLike<T>) => void;
+  reject: (error: unknown) => void;
+};
+
+/**
+ * A rate limiter that controls the frequency of requests using a sliding window approach.
+ *
+ * This class implements a queue-based rate limiter that ensures no more than a specified
+ * number of requests are started within a given time window. Every request is queued and
+ * started at least windowMs / maxRequests apart, without waiting for earlier ones to finish.
+ *
+ * @example
+ * ```typescript
+ * // Create a rate limiter that allows 10 requests per minute
+ * const rateLimiter = new RateLimiter(10, 60_000);
+ *
+ * // Enqueue API calls
+ * const result1 = await rateLimiter.enqueue(() => fetch('/api/data1'));
+ * const result2 = await rateLimiter.enqueue(() => fetch('/api/data2'));
+ *
+ * // Check current status
+ * const status = rateLimiter.getStatus();
+ * console.log(`Queue length: ${status.queueLength}`);
+ * console.log(`Requests in window: ${status.requestsInWindow}`);
+ * ```
+ */
+export class RateLimiter {
+  private readonly requestTimestamps: number[] = [];
+  private readonly queue: Array<QueuedRequest<Response>> = [];
+  private isProcessing = false;
+  // Counters are per instance so getStatus() never reports another limiter's traffic
+  private completed = 0;
+  private failed = 0;
+
+  public constructor(private readonly maxRequests = 40, private readonly windowMs = 60_000) {}
+
+  /**
+   * Utility function to sleep for a given number of milliseconds
+   */
+  private static sleep(ms: number): Promise<void> {
+    return new Promise((resolve) => setTimeout(resolve, ms));
+  }
+
+  /**
+   * Enqueues a request to be executed when rate limit allows
+   *
+   * @param requestFn Function that returns a promise for the actual request
+   * @returns Promise that settles with the outcome of requestFn once it has been run
+   */
+  public async enqueue(requestFn: () => Promise<Response>): Promise<Response> {
+    debug('Enqueuing request: %O', this.getStatus());
+    return new Promise<Response>((resolve, reject) => {
+      this.queue.push({
+        execute: requestFn,
+        resolve,
+        reject,
+      });
+
+      // Start processing if not already running
+      if (!this.isProcessing) {
+        void this.processQueue();
+      }
+    });
+  }
+
+  /**
+   * Gets current queue status for monitoring/debugging (prunes expired timestamps first)
+   */
+  public getStatus(): {
+    queueLength: number;
+    requestsInWindow: number;
+    maxRequests: number;
+    canExecute: boolean;
+    nextAvailableSlot?: number;
+    isProcessing: boolean;
+    completed: number;
+    failed: number;
+  } {
+    const now = Date.now();
+    this.cleanupOldTimestamps(now);
+
+    return {
+      queueLength: this.queue.length,
+      requestsInWindow: this.requestTimestamps.length,
+      maxRequests: this.maxRequests,
+      canExecute: this.canExecuteRequest(),
+      nextAvailableSlot: this.requestTimestamps.length > 0 ? this.requestTimestamps[0] + this.windowMs : undefined,
+      isProcessing: this.isProcessing,
+      completed: this.completed,
+      failed: this.failed,
+    };
+  }
+
+  /**
+   * Runs one request, settles the caller's promise, and updates this limiter's counters
+   */
+  private async executeRequest(request: QueuedRequest<Response>): Promise<void> {
+    try {
+      const result = await request.execute();
+      this.completed++;
+      request.resolve(result);
+    } catch (error) {
+      this.failed++;
+      request.reject(error);
+    }
+  }
+
+  /**
+   * Processes the queue, executing requests when rate limit allows
+   */
+  private async processQueue(): Promise<void> {
+    this.isProcessing = true;
+
+    while (this.queue.length > 0) {
+      const now = Date.now();
+
+      // Remove timestamps outside the current window
+      this.cleanupOldTimestamps(now);
+
+      if (this.canExecuteRequest()) {
+        debug('Executing request: %O', this.getStatus());
+        // Execute the next request without waiting for it to complete
+        const request = this.queue.shift()!;
+        this.recordRequest(now);
+
+        // Execute the request asynchronously - don't await
+        void this.executeRequest(request);
+
+        // Add a small delay to prevent burst requests from overwhelming the rate limit
+        // This spreads out request initiation while still allowing parallelization
+        // eslint-disable-next-line no-await-in-loop
+        await RateLimiter.sleep(Math.ceil(this.windowMs / this.maxRequests));
+      } else {
+        // Wait until we can make the next request
+        const delay = this.calculateDelay(now);
+        // eslint-disable-next-line no-await-in-loop
+        await RateLimiter.sleep(delay);
+      }
+    }
+
+    this.isProcessing = false;
+  }
+
+  /**
+   * Checks if a request can be executed based on current rate limit
+   */
+  private canExecuteRequest(): boolean {
+    return this.requestTimestamps.length < this.maxRequests;
+  }
+
+  /**
+   * Records the timestamp of a request
+   */
+  private recordRequest(timestamp: number): void {
+    this.requestTimestamps.push(timestamp);
+  }
+
+  /**
+   * Removes timestamps that are outside the current window
+   */
+  private cleanupOldTimestamps(now: number): void {
+    const cutoff = now - this.windowMs;
+    while (this.requestTimestamps.length > 0 && this.requestTimestamps[0] < cutoff) {
+      this.requestTimestamps.shift();
+    }
+  }
+
+  /**
+   * Calculates how long to wait before the next request can be made
+   */
+  private calculateDelay(now: number): number {
+    if (this.requestTimestamps.length === 0) {
+      return 0;
+    }
+
+    // If we're at the limit, wait until the oldest request expires
+    if (this.requestTimestamps.length >= this.maxRequests) {
+      const oldestRequest = this.requestTimestamps[0];
+      const timeUntilExpiry = oldestRequest + this.windowMs - now;
+      return Math.max(0, timeUntilExpiry + 100); // Add 100ms buffer
+    }
+
+    return 0;
+  }
+}
diff --git a/package.json b/package.json
index dd61a714..9f4c6940 100644
--- a/package.json
+++ b/package.json
@@ -51,7 +51,6 @@
     "@salesforce/source-tracking": "^7.4.8",
     "@salesforce/telemetry": "^6.0.39",
     "@salesforce/ts-types": "^2.0.11",
-    "fetch-retry": "^6.0.0",
     "open": "^10.1.2",
     "zod": "^3.25.67"
   },
@@ -60,9 +59,12 @@
     "@oclif/table": "^0.4.9",
     "@salesforce/cli-plugins-testkit": "^5.3.39",
     "@salesforce/dev-scripts": "11.0.2",
+    "@types/debug": "^4.1.12",
     "@types/node": "^22.16.5",
+    "debug": "^4.4.1",
     "eslint-config-salesforce-license": "^1.0.1",
     "eslint-plugin-sf-plugin": "^1.20.26",
+    "fetch-retry": "^6.0.0",
     "gpt-tokenizer": "^3.0.1",
     "oclif": "^4.21.0",
     "ts-node": "^10.9.2",
diff --git a/yarn.lock b/yarn.lock
index 485b408a..ed32d2d3 100644
--- a/yarn.lock
+++ b/yarn.lock
@@ -3219,6 +3219,13 @@
   resolved "https://registry.npmjs.org/@types/chai/-/chai-4.3.20.tgz"
   integrity sha512-/pC9HAB5I/xMlc5FP77qjCnI16ChlJfW0tGa0IUcFn38VJrTV6DeZ60NU5KZBtaOZqjdpwTWohz5HU1RrhiYxQ==
 
+"@types/debug@^4.1.12":
+  version "4.1.12"
+  resolved "https://registry.yarnpkg.com/@types/debug/-/debug-4.1.12.tgz#a155f21690871953410df4b6b6f53187f0500917"
+  integrity sha512-vIChWdVG3LG1SMxEvI/AK+FWJthlrqlTu7fbrlywTkkaONwk/UAGaULXRlf8vkzFBLVm0zkMdCquhL5aOjhXPQ==
+  dependencies:
+    "@types/ms" "*"
+
 "@types/glob@~7.2.0":
   version "7.2.0"
   resolved "https://registry.npmjs.org/@types/glob/-/glob-7.2.0.tgz"
@@ -3297,6 +3304,11 @@
   resolved "https://registry.npmjs.org/@types/mocha/-/mocha-10.0.10.tgz"
   integrity sha512-xPyYSz1cMPnJQhl0CLMH68j3gprKZaTjG3s5Vi+fDgx+uhG9NOXwbVt52eFS8ECyXhyKcjDLCBEqBExKuiZb7Q==
 
+"@types/ms@*":
+  version "2.1.0"
+  resolved "https://registry.yarnpkg.com/@types/ms/-/ms-2.1.0.tgz#052aa67a48eccc4309d7f0191b7e41434b90bb78"
+  integrity sha512-GsCCIZDE/p3i96vtEqx+7dBUGXrc7zeSK3wwPHIaRThS+9OhWIXRqzs4d6k1SVU8g91DrNRWxWUGhp5KXQb2VA==
+
 "@types/mute-stream@^0.0.4":
   version "0.0.4"
   resolved "https://registry.npmjs.org/@types/mute-stream/-/mute-stream-0.0.4.tgz"

From f41a0567a94228ab70114ceb3892fb55f7d1ade7 Mon Sep 17 00:00:00 2001
From: Mike Donnalley <mdonnalley@salesforce.com>
Date: Thu, 24 Jul 2025 09:19:00 -0600
Subject: [PATCH 39/51] feat: better result reporting

---
 confidence/src/commands/confidence-test.ts | 171 +++++++-------
 test/confidence/sf-deploy-metadata.yml     | 250 ++++++++++-----------
 2 files changed, 202 insertions(+), 219 deletions(-)

diff --git a/confidence/src/commands/confidence-test.ts b/confidence/src/commands/confidence-test.ts
index 373ae78d..0ce7440c 100644
--- a/confidence/src/commands/confidence-test.ts
+++ b/confidence/src/commands/confidence-test.ts
@@ -259,54 +259,47 @@ https://git.soma.salesforce.com/pages/tech-enablement/einstein/docs/gateway/mode
       stdout();
     }
 
-    // Generate unique keys for each utterance to track runs
-    // This allows us to group runs by utterance and display results clearly
-    const testIndex = new Map<string, TestCase>();
-
     const filteredTests = spec.data.tests.some((test) => test.only)
       ? [spec.data.tests.find((test) => test.only)!]
       : spec.data.tests.filter((test) => !test.skip);
 
-    const logStatus = (message: string): void => {
-      if (flags.verbose) {
-        stdout(colorize('yellowBright', `Status: ${message}`));
+    // Generate unique keys for each test case to track runs
+    // This allows us to group runs by test case and display results clearly
+    const testIndex = new Map<string, TestCase>();
+
+    // Map to store test results by testCaseKey
+    // Each entry will contain an array of runs for that test case
+    // This allows us to aggregate results and print them after all runs are complete
+    const testResultsByTestCaseKey = new Map<
+      string,
+      Array<{
+        idx: number;
+        testCaseKey: string;
+        invocations: Record<string, Array<{ tool: string; parameters: Record<string, string> }>>;
+        tableData: Array<{ model: Model; chat: string; tools: string }>;
+      }>
+    >();
+
+    // Map to track pass/fail status by testCaseKey
+    const passFailMap = new Map<string, { tools: boolean; parameters: boolean }>();
+
+    const maybePrintTestResults = (testCaseKey: string): void => {
+      const testRuns = (testResultsByTestCaseKey.get(testCaseKey) ?? []).sort((a, b) => a.idx - b.idx);
+      if (testRuns.length < flags.runs) {
+        return; // Not enough runs yet to print results
       }
-    };
 
-    let completedRuns = 0;
-    const totalTestCases = filteredTests.length * flags.runs;
-    logStatus(`Running ${totalTestCases} test cases...`);
-    const runPromises = filteredTests.flatMap((test) => {
-      const utteranceKey = Math.random().toString(36).substring(2, 15);
-      testIndex.set(utteranceKey, {
-        readable: `${colorize('yellowBright', 'Utterance')}:\n  - ${castToArray(test.utterances).join('\n  - ')}`,
-        utterances: castToArray(test.utterances),
-        expectedTool: test['expected-tool'],
-        expectedParameters: test['expected-parameters'],
-        expectedToolConfidence: test['expected-tool-confidence'],
-        expectedParameterConfidence: test['expected-parameter-confidence'] ?? test['expected-tool-confidence'],
-        allowedTools: [test['expected-tool'], ...(test['allowed-tools'] ?? [])],
-      });
-      return Array.from({ length: flags.runs }, (_, idx) =>
-        compareModelOutputs(test.utterances, spec.data, mcpTools).then(({ invocations, tableData }) => {
-          completedRuns++;
-          logStatus(`Completed run ${completedRuns} of ${totalTestCases}`);
-          return {
-            idx,
-            utteranceKey,
-            invocations,
-            tableData,
-          };
-        })
-      );
-    });
+      const testSpec = testIndex.get(testCaseKey);
+      if (!testSpec) {
+        stdout(colorize('red', `No test spec found for utterance key: ${testCaseKey}`));
+        return;
+      }
 
-    const results = groupBy(await Promise.all(runPromises), (r) => r.utteranceKey);
+      stdout(colorize('bold', ' ─── Results for Test Case ───'));
+      stdout(testSpec.readable);
 
-    if (flags.verbose) {
-      for (const [utteranceKey, runs] of Object.entries(results)) {
-        stdout(testIndex.get(utteranceKey)?.readable ?? 'Unknown Test Case');
-        for (const run of runs) {
+      if (flags.verbose) {
+        for (const run of testRuns) {
           printTable({
             title: `Run #${run.idx + 1}`,
             data: run.tableData,
@@ -320,36 +313,14 @@ https://git.soma.salesforce.com/pages/tech-enablement/einstein/docs/gateway/mode
           });
         }
       }
-    }
-
-    stdout();
-    stdout(colorize('bold', 'SUMMARY'));
-    stdout(`Total Runs: ${Object.values(results).flatMap((m) => Object.values(m)).length}`);
-    stdout();
-
-    // Initialize all utterance keys as passing
-    const passFailMap = new Map<string, { tools: boolean; parameters: boolean }>(
-      Object.keys(results).map((key) => [key, { tools: true, parameters: true }])
-    );
-
-    for (const [utteranceKey, testResults] of Object.entries(results)) {
-      const testSpec = testIndex.get(utteranceKey);
-      if (!testSpec) {
-        stdout(colorize('red', `No test spec found for utterance key: ${utteranceKey}`));
-        continue;
-      }
-
-      stdout(testSpec.readable);
 
       const runsByModel = groupBy(
-        testResults
-          .sort((a, b) => a.idx - b.idx)
-          .flatMap((result) =>
-            Object.entries(result.invocations).map(([model, invocations]) => ({
-              model,
-              invocations,
-            }))
-          ),
+        testRuns.flatMap((result) =>
+          Object.entries(result.invocations).map(([model, invocations]) => ({
+            model,
+            invocations,
+          }))
+        ),
         (r) => r.model
       );
 
@@ -361,8 +332,8 @@ https://git.soma.salesforce.com/pages/tech-enablement/einstein/docs/gateway/mode
           const confidence = Math.round((actualToolCount / totalRuns) * 100);
 
           if (confidence < testSpec.expectedToolConfidence) {
-            passFailMap.set(utteranceKey, {
-              ...(passFailMap.get(utteranceKey) ?? { tools: true, parameters: true }),
+            passFailMap.set(testCaseKey, {
+              ...(passFailMap.get(testCaseKey) ?? { tools: true, parameters: true }),
               tools: false,
             });
           }
@@ -401,8 +372,8 @@ https://git.soma.salesforce.com/pages/tech-enablement/einstein/docs/gateway/mode
             const confidence = Math.round((runsThatMatchParameters / totalRuns) * 100);
 
             if (confidence < testSpec.expectedParameterConfidence) {
-              passFailMap.set(utteranceKey, {
-                ...(passFailMap.get(utteranceKey) ?? { tools: true, parameters: true }),
+              passFailMap.set(testCaseKey, {
+                ...(passFailMap.get(testCaseKey) ?? { tools: true, parameters: true }),
                 parameters: false,
               });
             }
@@ -440,7 +411,40 @@ https://git.soma.salesforce.com/pages/tech-enablement/einstein/docs/gateway/mode
           width: process.stdout.columns,
         });
       }
-    }
+    };
+
+    await Promise.all(
+      filteredTests.flatMap((test) => {
+        const testCaseKey = Math.random().toString(36).substring(2, 15);
+        testIndex.set(testCaseKey, {
+          readable: `${colorize('yellowBright', 'Utterance')}:\n  - ${castToArray(test.utterances).join('\n  - ')}`,
+          utterances: castToArray(test.utterances),
+          expectedTool: test['expected-tool'],
+          expectedParameters: test['expected-parameters'],
+          expectedToolConfidence: test['expected-tool-confidence'],
+          expectedParameterConfidence: test['expected-parameter-confidence'] ?? test['expected-tool-confidence'],
+          allowedTools: [test['expected-tool'], ...(test['allowed-tools'] ?? [])],
+        });
+        passFailMap.set(testCaseKey, {
+          tools: true,
+          parameters: true,
+        });
+        return Array.from({ length: flags.runs }, (_, idx) =>
+          compareModelOutputs(test.utterances, spec.data, mcpTools).then(({ invocations, tableData }) => {
+            testResultsByTestCaseKey.set(testCaseKey, [
+              ...(testResultsByTestCaseKey.get(testCaseKey) ?? []),
+              {
+                idx,
+                testCaseKey,
+                invocations,
+                tableData,
+              },
+            ]);
+            maybePrintTestResults(testCaseKey);
+          })
+        );
+      })
+    );
 
     const failingToolTests = filterFailingTests(passFailMap, testIndex, 'tools');
     const failingParameterTests = filterFailingTests(passFailMap, testIndex, 'parameters');
@@ -468,24 +472,3 @@ https://git.soma.salesforce.com/pages/tech-enablement/einstein/docs/gateway/mode
     }
   }
 }
-
-// ConfidenceTest.run(process.argv.slice(2), {
-//   root: dirname(import.meta.dirname),
-//   pjson: {
-//     name: 'confidence-test',
-//     version: '0.0.1',
-//     oclif: {
-//       commands: {
-//         strategy: 'single',
-//         target: 'scripts/confidence-test.js',
-//       },
-//     },
-//   },
-// }).then(
-//   async () => {
-//     await flush();
-//   },
-//   async (err) => {
-//     await handle(err as Error);
-//   }
-// );
diff --git a/test/confidence/sf-deploy-metadata.yml b/test/confidence/sf-deploy-metadata.yml
index d24c7255..7649cebe 100644
--- a/test/confidence/sf-deploy-metadata.yml
+++ b/test/confidence/sf-deploy-metadata.yml
@@ -32,139 +32,139 @@ tests:
     expected-tool-confidence: 100
     expected-parameter-confidence: 100
 
-  # Deploy using manifest file
-  - utterances:
-      - Deploy the components specified in my package.xml manifest to dreamhouse.
-    expected-tool: sf-deploy-metadata
-    expected-parameters:
-      manifest: /Users/sf-dev/dreamhouse-lwc/package.xml
-      directory: /Users/sf-dev/dreamhouse-lwc
-      usernameOrAlias: dreamhouse
-    expected-tool-confidence: 100
-    expected-parameter-confidence: 100
+  # # Deploy using manifest file
+  # - utterances:
+  #     - Deploy the components specified in my package.xml manifest to dreamhouse.
+  #   expected-tool: sf-deploy-metadata
+  #   expected-parameters:
+  #     manifest: /Users/sf-dev/dreamhouse-lwc/package.xml
+  #     directory: /Users/sf-dev/dreamhouse-lwc
+  #     usernameOrAlias: dreamhouse
+  #   expected-tool-confidence: 100
+  #   expected-parameter-confidence: 100
 
-  # Deploy all local changes (no sourceDir or manifest specified)
-  - utterances:
-      - Deploy my changes to the dreamhouse org.
-    expected-tool: sf-deploy-metadata
-    expected-parameters:
-      directory: /Users/sf-dev/dreamhouse-lwc
-      usernameOrAlias: dreamhouse
-    expected-tool-confidence: 100
-    expected-parameter-confidence: 100
+  # # Deploy all local changes (no sourceDir or manifest specified)
+  # - utterances:
+  #     - Deploy my changes to the dreamhouse org.
+  #   expected-tool: sf-deploy-metadata
+  #   expected-parameters:
+  #     directory: /Users/sf-dev/dreamhouse-lwc
+  #     usernameOrAlias: dreamhouse
+  #   expected-tool-confidence: 100
+  #   expected-parameter-confidence: 100
 
-  # Deploy with no tests run
-  - utterances:
-      - Deploy my changes to the dreamhouse org without running any tests.
-    expected-tool: sf-deploy-metadata
-    expected-parameters:
-      directory: /Users/sf-dev/dreamhouse-lwc
-      usernameOrAlias: dreamhouse
-      apexTestLevel: NoTestRun
-    expected-tool-confidence: 100
-    expected-parameter-confidence: 100
+  # # Deploy with no tests run
+  # - utterances:
+  #     - Deploy my changes to the dreamhouse org without running any tests.
+  #   expected-tool: sf-deploy-metadata
+  #   expected-parameters:
+  #     directory: /Users/sf-dev/dreamhouse-lwc
+  #     usernameOrAlias: dreamhouse
+  #     apexTestLevel: NoTestRun
+  #   expected-tool-confidence: 100
+  #   expected-parameter-confidence: 100
 
-  # Deploy with local tests
-  - utterances:
-      - Deploy force-app/main/default/classes to the dreamhouse org and run all local tests.
-    expected-tool: sf-deploy-metadata
-    expected-parameters:
-      sourceDir: force-app/main/default/classes
-      directory: /Users/sf-dev/dreamhouse-lwc
-      usernameOrAlias: dreamhouse
-      apexTestLevel: RunLocalTests
-    expected-tool-confidence: 100
-    expected-parameter-confidence: 100
+  # # Deploy with local tests
+  # - utterances:
+  #     - Deploy force-app/main/default/classes to the dreamhouse org and run all local tests.
+  #   expected-tool: sf-deploy-metadata
+  #   expected-parameters:
+  #     sourceDir: force-app/main/default/classes
+  #     directory: /Users/sf-dev/dreamhouse-lwc
+  #     usernameOrAlias: dreamhouse
+  #     apexTestLevel: RunLocalTests
+  #   expected-tool-confidence: 100
+  #   expected-parameter-confidence: 100
 
-  # Deploy with all org tests
-  - utterances:
-      - Deploy the apex classes and run all tests in the org including managed packages.
-    expected-tool: sf-deploy-metadata
-    expected-parameters:
-      sourceDir: force-app/main/default/classes
-      directory: /Users/sf-dev/dreamhouse-lwc
-      usernameOrAlias: dreamhouse
-      apexTestLevel: RunAllTestsInOrg
-    expected-tool-confidence: 100
-    expected-parameter-confidence: 100
+  # # Deploy with all org tests
+  # - utterances:
+  #     - Deploy the apex classes and run all tests in the org including managed packages.
+  #   expected-tool: sf-deploy-metadata
+  #   expected-parameters:
+  #     sourceDir: force-app/main/default/classes
+  #     directory: /Users/sf-dev/dreamhouse-lwc
+  #     usernameOrAlias: dreamhouse
+  #     apexTestLevel: RunAllTestsInOrg
+  #   expected-tool-confidence: 100
+  #   expected-parameter-confidence: 100
 
-  # Deploy with specific apex tests
-  - utterances:
-      - Deploy my classes to the dreamhouse org and run the PropertyControllerTest and BrokerControllerTest apex tests.
-    expected-tool: sf-deploy-metadata
-    expected-parameters:
-      sourceDir: force-app/main/default/classes
-      directory: /Users/sf-dev/dreamhouse-lwc
-      usernameOrAlias: dreamhouse
-      apexTests: PropertyControllerTest,BrokerControllerTest
-    expected-tool-confidence: 100
-    expected-parameter-confidence: 100
+  # # Deploy with specific apex tests
+  # - utterances:
+  #     - Deploy my classes to the dreamhouse org and run the PropertyControllerTest and BrokerControllerTest apex tests.
+  #   expected-tool: sf-deploy-metadata
+  #   expected-parameters:
+  #     sourceDir: force-app/main/default/classes
+  #     directory: /Users/sf-dev/dreamhouse-lwc
+  #     usernameOrAlias: dreamhouse
+  #     apexTests: PropertyControllerTest,BrokerControllerTest
+  #   expected-tool-confidence: 100
+  #   expected-parameter-confidence: 100
 
-  # Deploy with single apex test
-  - utterances:
-      - Deploy the PropertyController class and run PropertyControllerTest to the dreamhouse org.
-    expected-tool: sf-deploy-metadata
-    expected-parameters:
-      sourceDir: force-app/main/default/classes
-      directory: /Users/sf-dev/dreamhouse-lwc
-      usernameOrAlias: dreamhouse
-      apexTests: PropertyControllerTest
-    expected-tool-confidence: 100
-    expected-parameter-confidence: 100
+  # # Deploy with single apex test
+  # - utterances:
+  #     - Deploy the PropertyController class and run PropertyControllerTest to the dreamhouse org.
+  #   expected-tool: sf-deploy-metadata
+  #   expected-parameters:
+  #     sourceDir: force-app/main/default/classes
+  #     directory: /Users/sf-dev/dreamhouse-lwc
+  #     usernameOrAlias: dreamhouse
+  #     apexTests: PropertyControllerTest
+  #   expected-tool-confidence: 100
+  #   expected-parameter-confidence: 100
 
-  # Deploy specific file type mentioned
-  - utterances:
-      - Deploy the flows to the dreamhouse org.
-    expected-tool: sf-deploy-metadata
-    expected-parameters:
-      sourceDir: force-app/main/default/flows
-      directory: /Users/sf-dev/dreamhouse-lwc
-      usernameOrAlias: dreamhouse
-    expected-tool-confidence: 100
-    expected-parameter-confidence: 100
+  # # Deploy specific file type mentioned
+  # - utterances:
+  #     - Deploy the flows to the dreamhouse org.
+  #   expected-tool: sf-deploy-metadata
+  #   expected-parameters:
+  #     sourceDir: force-app/main/default/flows
+  #     directory: /Users/sf-dev/dreamhouse-lwc
+  #     usernameOrAlias: dreamhouse
+  #   expected-tool-confidence: 100
+  #   expected-parameter-confidence: 100
 
-  # Deploy with multiple metadata types
-  - utterances:
-      - Deploy the objects, classes, and tabs to the dreamhouse org.
-    expected-tool: sf-deploy-metadata
-    expected-parameters:
-      sourceDir: force-app/main/default/objects,force-app/main/default/classes,force-app/main/default/tabs
-      directory: /Users/sf-dev/dreamhouse-lwc
-      usernameOrAlias: dreamhouse
-    expected-tool-confidence: 100
-    expected-parameter-confidence: 100
+  # # Deploy with multiple metadata types
+  # - utterances:
+  #     - Deploy the objects, classes, and tabs to the dreamhouse org.
+  #   expected-tool: sf-deploy-metadata
+  #   expected-parameters:
+  #     sourceDir: force-app/main/default/objects,force-app/main/default/classes,force-app/main/default/tabs
+  #     directory: /Users/sf-dev/dreamhouse-lwc
+  #     usernameOrAlias: dreamhouse
+  #   expected-tool-confidence: 100
+  #   expected-parameter-confidence: 100
 
-  # Deploy with complex folder structure
-  - utterances:
-      - Deploy the PropertyController and PropertyTrigger from the classes folder to the dreamhouse org and run PropertyControllerTest, PropertyTriggerTest, and PropertyUtilTest.
-    expected-tool: sf-deploy-metadata
-    expected-parameters:
-      sourceDir: force-app/main/default/classes
-      directory: /Users/sf-dev/dreamhouse-lwc
-      usernameOrAlias: dreamhouse
-      apexTests: PropertyControllerTest,PropertyTriggerTest,PropertyUtilTest
-    expected-tool-confidence: 100
-    expected-parameter-confidence: 100
+  # # Deploy with complex folder structure
+  # - utterances:
+  #     - Deploy the PropertyController and PropertyTrigger from the classes folder to the dreamhouse org and run PropertyControllerTest, PropertyTriggerTest, and PropertyUtilTest.
+  #   expected-tool: sf-deploy-metadata
+  #   expected-parameters:
+  #     sourceDir: force-app/main/default/classes
+  #     directory: /Users/sf-dev/dreamhouse-lwc
+  #     usernameOrAlias: dreamhouse
+  #     apexTests: PropertyControllerTest,PropertyTriggerTest,PropertyUtilTest
+  #   expected-tool-confidence: 100
+  #   expected-parameter-confidence: 100
 
-  # Deploy without org specified (should use sf-get-username)
-  - utterances:
-      - Deploy the lwc components.
-    expected-tool: sf-deploy-metadata
-    expected-parameters:
-      sourceDir: force-app/main/default/lwc
-      directory: /Users/sf-dev/dreamhouse-lwc
-    expected-tool-confidence: 90
-    expected-parameter-confidence: 70
-    allowed-tools:
-      - sf-get-username
+  # # Deploy without org specified (should use sf-get-username)
+  # - utterances:
+  #     - Deploy the lwc components.
+  #   expected-tool: sf-deploy-metadata
+  #   expected-parameters:
+  #     sourceDir: force-app/main/default/lwc
+  #     directory: /Users/sf-dev/dreamhouse-lwc
+  #   expected-tool-confidence: 90
+  #   expected-parameter-confidence: 70
+  #   allowed-tools:
+  #     - sf-get-username
 
-  # Deploy with relative path specification
-  - utterances:
-      - Deploy everything in the force-app directory to the dreamhouse org.
-    expected-tool: sf-deploy-metadata
-    expected-parameters:
-      sourceDir: force-app
-      directory: /Users/sf-dev/dreamhouse-lwc
-      usernameOrAlias: dreamhouse
-    expected-tool-confidence: 100
-    expected-parameter-confidence: 100
+  # # Deploy with relative path specification
+  # - utterances:
+  #     - Deploy everything in the force-app directory to the dreamhouse org.
+  #   expected-tool: sf-deploy-metadata
+  #   expected-parameters:
+  #     sourceDir: force-app
+  #     directory: /Users/sf-dev/dreamhouse-lwc
+  #     usernameOrAlias: dreamhouse
+  #   expected-tool-confidence: 100
+  #   expected-parameter-confidence: 100

From 02d7f4ed36dd727500fb381715c2612fa3ecca0c Mon Sep 17 00:00:00 2001
From: Mike Donnalley <mdonnalley@salesforce.com>
Date: Thu, 24 Jul 2025 09:35:05 -0600
Subject: [PATCH 40/51] fix: allow bursts in RateLimiter

---
 confidence/src/utils/rate-limiter.ts   | 128 ++++++++++--
 test/confidence/sf-deploy-metadata.yml | 272 +++++++++++++------------
 2 files changed, 254 insertions(+), 146 deletions(-)

diff --git a/confidence/src/utils/rate-limiter.ts b/confidence/src/utils/rate-limiter.ts
index 6c021b4c..e8eb8d46 100644
--- a/confidence/src/utils/rate-limiter.ts
+++ b/confidence/src/utils/rate-limiter.ts
@@ -14,6 +14,8 @@
  * limitations under the License.
  */
 
+/* eslint-disable no-await-in-loop */
+
 import makeDebug from 'debug';
 
 const debug = makeDebug('confidence:rate-limiter');
@@ -25,25 +27,35 @@ type QueuedRequest<T extends Response> = {
 };
 
 /**
- * A rate limiter that controls the frequency of requests using a sliding window approach.
+ * A rate limiter that controls the frequency of requests using a sliding window approach with adaptive burst control.
  *
  * This class implements a queue-based rate limiter that ensures no more than a specified
- * number of requests are executed within a given time window. Requests that exceed the
- * rate limit are queued and executed when the rate limit allows.
+ * number of requests are executed within a given time window. It features intelligent burst
+ * detection that allows rapid execution of small batches while maintaining rate limit compliance
+ * for larger workloads.
+ *
+ * Key Features:
+ * - Sliding window rate limiting
+ * - Adaptive burst control for small request batches
+ * - Gradual transition from burst to controlled spacing as utilization increases
+ * - Comprehensive monitoring and debugging information
  *
  * @example
  * ```typescript
  * // Create a rate limiter that allows 10 requests per minute
  * const rateLimiter = new RateLimiter(10, 60_000);
  *
- * // Enqueue API calls
- * const result1 = await rateLimiter.enqueue(() => fetch('/api/data1'));
- * const result2 = await rateLimiter.enqueue(() => fetch('/api/data2'));
+ * // Small batches execute immediately in burst mode
+ * const results = await Promise.all([
+ *   rateLimiter.enqueue(() => fetch('/api/data1')),
+ *   rateLimiter.enqueue(() => fetch('/api/data2')),
+ *   rateLimiter.enqueue(() => fetch('/api/data3'))
+ * ]);
  *
- * // Check current status
+ * // Check current status including burst mode information
  * const status = rateLimiter.getStatus();
- * console.log(`Queue length: ${status.queueLength}`);
- * console.log(`Requests in window: ${status.requestsInWindow}`);
+ * console.log(`Burst mode active: ${status.burstModeActive}`);
+ * console.log(`Utilization: ${(status.utilizationRatio * 100).toFixed(1)}%`);
  * ```
  */
 export class RateLimiter {
@@ -54,6 +66,24 @@ export class RateLimiter {
   private readonly queue: Array<QueuedRequest<Response>> = [];
   private isProcessing = false;
 
+  /**
+   * Utilization threshold below which burst mode is allowed.
+   * When current window utilization is below this ratio, requests can execute immediately.
+   */
+  private readonly burstUtilizationThreshold = 0.5;
+
+  /**
+   * Total work threshold (current + queued requests) for burst mode.
+   * Burst mode is only allowed when predicted total utilization is below this ratio.
+   */
+  private readonly burstQueueThreshold = 0.75;
+
+  /**
+   * Minimum delay between requests during controlled (non-burst) execution.
+   * Provides a baseline spacing to prevent overwhelming the target service.
+   */
+  private readonly minDelayMs = 50;
+
   public constructor(private readonly maxRequests = 40, private readonly windowMs = 60_000) {}
 
   /**
@@ -111,6 +141,10 @@ export class RateLimiter {
     isProcessing: boolean;
     completed: number;
     failed: number;
+    burstModeActive: boolean;
+    utilizationRatio: number;
+    predictedUtilization: number;
+    timeUntilWindowReset: number;
   } {
     const now = Date.now();
     this.cleanupOldTimestamps(now);
@@ -124,6 +158,10 @@ export class RateLimiter {
       isProcessing: this.isProcessing,
       completed: RateLimiter.completed,
       failed: RateLimiter.failed,
+      burstModeActive: this.shouldAllowBurst(),
+      utilizationRatio: this.requestTimestamps.length / this.maxRequests,
+      predictedUtilization: this.getPredictedWindowUtilization(),
+      timeUntilWindowReset: this.getTimeUntilWindowReset(),
     };
   }
 
@@ -148,21 +186,79 @@ export class RateLimiter {
         // Execute the request asynchronously - don't await
         void RateLimiter.executeRequest(request);
 
-        // Add a small delay to prevent burst requests from overwhelming the rate limit
-        // This spreads out request initiation while still allowing parallelization
-        // eslint-disable-next-line no-await-in-loop
-        await RateLimiter.sleep(Math.ceil(this.windowMs / this.maxRequests));
+        // Use adaptive delay instead of fixed delay
+        const delay = this.calculateAdaptiveDelay();
+        if (delay > 0) {
+          await RateLimiter.sleep(delay);
+        }
       } else {
         // Wait until we can make the next request
-        const delay = this.calculateDelay(now);
-        // eslint-disable-next-line no-await-in-loop
-        await RateLimiter.sleep(delay);
+        await RateLimiter.sleep(this.calculateDelay(now));
       }
     }
 
     this.isProcessing = false;
   }
 
+  /**
+   * Determines if burst mode should be allowed based on current utilization
+   */
+  private shouldAllowBurst(): boolean {
+    const utilizationRatio = this.requestTimestamps.length / this.maxRequests;
+    const queueRatio = this.queue.length / this.maxRequests;
+    const totalWorkRatio = utilizationRatio + queueRatio;
+
+    // Allow bursts when:
+    // 1. Current utilization is below the burst threshold
+    // 2. Total work (current + queued) is below the queue threshold
+    return utilizationRatio < this.burstUtilizationThreshold && totalWorkRatio < this.burstQueueThreshold;
+  }
+
+  /**
+   * Calculates adaptive delay based on current utilization and queue state
+   */
+  private calculateAdaptiveDelay(): number {
+    // Allow immediate execution during burst conditions
+    if (this.shouldAllowBurst()) {
+      return 0;
+    }
+
+    const utilizationRatio = this.requestTimestamps.length / this.maxRequests;
+    const remainingCapacity = this.maxRequests - this.requestTimestamps.length;
+    const queueLength = this.queue.length;
+
+    // If we have enough capacity for all queued requests, use minimal spacing
+    if (remainingCapacity >= queueLength) {
+      return this.minDelayMs;
+    }
+
+    // Calculate base delay and scale it based on utilization
+    const baseDelay = Math.ceil(this.windowMs / this.maxRequests);
+    const scalingFactor = Math.min(utilizationRatio * 2, 1);
+
+    return Math.max(this.minDelayMs, baseDelay * scalingFactor);
+  }
+
+  /**
+   * Gets the time until the current rate limit window resets
+   */
+  private getTimeUntilWindowReset(): number {
+    if (this.requestTimestamps.length === 0) {
+      return 0;
+    }
+    const oldestRequest = this.requestTimestamps[0];
+    return Math.max(0, oldestRequest + this.windowMs - Date.now());
+  }
+
+  /**
+   * Calculates predicted window utilization including queued requests
+   */
+  private getPredictedWindowUtilization(): number {
+    const currentUtilization = this.requestTimestamps.length;
+    const queuedRequests = this.queue.length;
+    return (currentUtilization + queuedRequests) / this.maxRequests;
+  }
+
   /**
    * Checks if a request can be executed based on current rate limit
    */
diff --git a/test/confidence/sf-deploy-metadata.yml b/test/confidence/sf-deploy-metadata.yml
index 7649cebe..60ca46ae 100644
--- a/test/confidence/sf-deploy-metadata.yml
+++ b/test/confidence/sf-deploy-metadata.yml
@@ -16,8 +16,8 @@ tests:
       sourceDir: force-app/main/default/lwc
       directory: /Users/sf-dev/dreamhouse-lwc
       usernameOrAlias: dreamhouse
-    expected-tool-confidence: 100
-    expected-parameter-confidence: 100
+    expected-tool-confidence: 50
+    expected-parameter-confidence: 50
     allowed-tools:
       - sf-list-all-orgs
 
@@ -29,142 +29,154 @@ tests:
       sourceDir: force-app/main/default/classes,force-app/main/default/lwc
       directory: /Users/sf-dev/dreamhouse-lwc
       usernameOrAlias: dreamhouse
-    expected-tool-confidence: 100
-    expected-parameter-confidence: 100
-
-  # # Deploy using manifest file
-  # - utterances:
-  #     - Deploy the components specified in my package.xml manifest to dreamhouse.
-  #   expected-tool: sf-deploy-metadata
-  #   expected-parameters:
-  #     manifest: /Users/sf-dev/dreamhouse-lwc/package.xml
-  #     directory: /Users/sf-dev/dreamhouse-lwc
-  #     usernameOrAlias: dreamhouse
-  #   expected-tool-confidence: 100
-  #   expected-parameter-confidence: 100
+    expected-tool-confidence: 50
+    expected-parameter-confidence: 50
+    allowed-tools:
+      - sf-list-all-orgs
 
-  # # Deploy all local changes (no sourceDir or manifest specified)
-  # - utterances:
-  #     - Deploy my changes to the dreamhouse org.
-  #   expected-tool: sf-deploy-metadata
-  #   expected-parameters:
-  #     directory: /Users/sf-dev/dreamhouse-lwc
-  #     usernameOrAlias: dreamhouse
-  #   expected-tool-confidence: 100
-  #   expected-parameter-confidence: 100
+  # Deploy using manifest file
+  - utterances:
+      - Deploy the components specified in my package.xml manifest to dreamhouse.
+    expected-tool: sf-deploy-metadata
+    expected-parameters:
+      manifest: /Users/sf-dev/dreamhouse-lwc/package.xml
+      directory: /Users/sf-dev/dreamhouse-lwc
+      usernameOrAlias: dreamhouse
+    expected-tool-confidence: 50
+    expected-parameter-confidence: 50
+    allowed-tools:
+      - sf-list-all-orgs
 
-  # # Deploy with no tests run
-  # - utterances:
-  #     - Deploy my changes to the dreamhouse org without running any tests.
-  #   expected-tool: sf-deploy-metadata
-  #   expected-parameters:
-  #     directory: /Users/sf-dev/dreamhouse-lwc
-  #     usernameOrAlias: dreamhouse
-  #     apexTestLevel: NoTestRun
-  #   expected-tool-confidence: 100
-  #   expected-parameter-confidence: 100
+  # Deploy all local changes (no sourceDir or manifest specified)
+  - utterances:
+      - Deploy my changes to the dreamhouse org.
+    expected-tool: sf-deploy-metadata
+    expected-parameters:
+      directory: /Users/sf-dev/dreamhouse-lwc
+      usernameOrAlias: dreamhouse
+    expected-tool-confidence: 50
+    expected-parameter-confidence: 50
+    allowed-tools:
+      - sf-list-all-orgs
 
-  # # Deploy with local tests
-  # - utterances:
-  #     - Deploy force-app/main/default/classes to the dreamhouse org and run all local tests.
-  #   expected-tool: sf-deploy-metadata
-  #   expected-parameters:
-  #     sourceDir: force-app/main/default/classes
-  #     directory: /Users/sf-dev/dreamhouse-lwc
-  #     usernameOrAlias: dreamhouse
-  #     apexTestLevel: RunLocalTests
-  #   expected-tool-confidence: 100
-  #   expected-parameter-confidence: 100
+  # Deploy with no tests run
+  - utterances:
+      - Deploy my changes to the dreamhouse org without running any tests.
+    expected-tool: sf-deploy-metadata
+    expected-parameters:
+      directory: /Users/sf-dev/dreamhouse-lwc
+      usernameOrAlias: dreamhouse
+      apexTestLevel: NoTestRun
+    expected-tool-confidence: 50
+    expected-parameter-confidence: 50
+    allowed-tools:
+      - sf-list-all-orgs
 
-  # # Deploy with all org tests
-  # - utterances:
-  #     - Deploy the apex classes and run all tests in the org including managed packages.
-  #   expected-tool: sf-deploy-metadata
-  #   expected-parameters:
-  #     sourceDir: force-app/main/default/classes
-  #     directory: /Users/sf-dev/dreamhouse-lwc
-  #     usernameOrAlias: dreamhouse
-  #     apexTestLevel: RunAllTestsInOrg
-  #   expected-tool-confidence: 100
-  #   expected-parameter-confidence: 100
+  # Deploy with local tests
+  - utterances:
+      - Deploy force-app/main/default/classes to the dreamhouse org and run all local tests.
+    expected-tool: sf-deploy-metadata
+    expected-parameters:
+      sourceDir: force-app/main/default/classes
+      directory: /Users/sf-dev/dreamhouse-lwc
+      usernameOrAlias: dreamhouse
+      apexTestLevel: RunLocalTests
+    expected-tool-confidence: 50
+    expected-parameter-confidence: 50
+    allowed-tools:
+      - sf-list-all-orgs
 
-  # # Deploy with specific apex tests
-  # - utterances:
-  #     - Deploy my classes to the dreamhouse org and run the PropertyControllerTest and BrokerControllerTest apex tests.
-  #   expected-tool: sf-deploy-metadata
-  #   expected-parameters:
-  #     sourceDir: force-app/main/default/classes
-  #     directory: /Users/sf-dev/dreamhouse-lwc
-  #     usernameOrAlias: dreamhouse
-  #     apexTests: PropertyControllerTest,BrokerControllerTest
-  #   expected-tool-confidence: 100
-  #   expected-parameter-confidence: 100
+  # Deploy with all org tests
+  - utterances:
+      - Deploy the apex classes and run all tests in the org including managed packages.
+    expected-tool: sf-deploy-metadata
+    expected-parameters:
+      sourceDir: force-app/main/default/classes
+      directory: /Users/sf-dev/dreamhouse-lwc
+      usernameOrAlias: dreamhouse
+      apexTestLevel: RunAllTestsInOrg
+    expected-tool-confidence: 50
+    expected-parameter-confidence: 50
+    allowed-tools:
+      - sf-list-all-orgs
 
-  # # Deploy with single apex test
-  # - utterances:
-  #     - Deploy the PropertyController class and run PropertyControllerTest to the dreamhouse org.
-  #   expected-tool: sf-deploy-metadata
-  #   expected-parameters:
-  #     sourceDir: force-app/main/default/classes
-  #     directory: /Users/sf-dev/dreamhouse-lwc
-  #     usernameOrAlias: dreamhouse
-  #     apexTests: PropertyControllerTest
-  #   expected-tool-confidence: 100
-  #   expected-parameter-confidence: 100
+  # Deploy with specific apex tests
+  - utterances:
+      - Deploy my classes to the dreamhouse org and run the PropertyControllerTest and BrokerControllerTest apex tests.
+    expected-tool: sf-deploy-metadata
+    expected-parameters:
+      sourceDir: force-app/main/default/classes
+      directory: /Users/sf-dev/dreamhouse-lwc
+      usernameOrAlias: dreamhouse
+      apexTests: PropertyControllerTest,BrokerControllerTest
+    expected-tool-confidence: 50
+    expected-parameter-confidence: 50
+    allowed-tools:
+      - sf-list-all-orgs
 
-  # # Deploy specific file type mentioned
-  # - utterances:
-  #     - Deploy the flows to the dreamhouse org.
-  #   expected-tool: sf-deploy-metadata
-  #   expected-parameters:
-  #     sourceDir: force-app/main/default/flows
-  #     directory: /Users/sf-dev/dreamhouse-lwc
-  #     usernameOrAlias: dreamhouse
-  #   expected-tool-confidence: 100
-  #   expected-parameter-confidence: 100
+  # Deploy with single apex test
+  - utterances:
+      - Deploy the PropertyController class and run PropertyControllerTest to the dreamhouse org.
+    expected-tool: sf-deploy-metadata
+    expected-parameters:
+      sourceDir: force-app/main/default/classes
+      directory: /Users/sf-dev/dreamhouse-lwc
+      usernameOrAlias: dreamhouse
+      apexTests: PropertyControllerTest
+    expected-tool-confidence: 50
+    expected-parameter-confidence: 50
+    allowed-tools:
+      - sf-list-all-orgs
 
-  # # Deploy with multiple metadata types
-  # - utterances:
-  #     - Deploy the objects, classes, and tabs to the dreamhouse org.
-  #   expected-tool: sf-deploy-metadata
-  #   expected-parameters:
-  #     sourceDir: force-app/main/default/objects,force-app/main/default/classes,force-app/main/default/tabs
-  #     directory: /Users/sf-dev/dreamhouse-lwc
-  #     usernameOrAlias: dreamhouse
-  #   expected-tool-confidence: 100
-  #   expected-parameter-confidence: 100
+  # Deploy specific file type mentioned
+  - utterances:
+      - Deploy the flows to the dreamhouse org.
+    expected-tool: sf-deploy-metadata
+    expected-parameters:
+      sourceDir: force-app/main/default/flows
+      directory: /Users/sf-dev/dreamhouse-lwc
+      usernameOrAlias: dreamhouse
+    expected-tool-confidence: 50
+    expected-parameter-confidence: 50
+    allowed-tools:
+      - sf-list-all-orgs
 
-  # # Deploy with complex folder structure
-  # - utterances:
-  #     - Deploy the PropertyController and PropertyTrigger from the classes folder to the dreamhouse org and run PropertyControllerTest, PropertyTriggerTest, and PropertyUtilTest.
-  #   expected-tool: sf-deploy-metadata
-  #   expected-parameters:
-  #     sourceDir: force-app/main/default/classes
-  #     directory: /Users/sf-dev/dreamhouse-lwc
-  #     usernameOrAlias: dreamhouse
-  #     apexTests: PropertyControllerTest,PropertyTriggerTest,PropertyUtilTest
-  #   expected-tool-confidence: 100
-  #   expected-parameter-confidence: 100
+  # Deploy with multiple metadata types
+  - utterances:
+      - Deploy the objects, classes, and tabs to the dreamhouse org.
+    expected-tool: sf-deploy-metadata
+    expected-parameters:
+      sourceDir: force-app/main/default/objects,force-app/main/default/classes,force-app/main/default/tabs
+      directory: /Users/sf-dev/dreamhouse-lwc
+      usernameOrAlias: dreamhouse
+    expected-tool-confidence: 50
+    expected-parameter-confidence: 50
+    allowed-tools:
+      - sf-list-all-orgs
 
-  # # Deploy without org specified (should use sf-get-username)
-  # - utterances:
-  #     - Deploy the lwc components.
-  #   expected-tool: sf-deploy-metadata
-  #   expected-parameters:
-  #     sourceDir: force-app/main/default/lwc
-  #     directory: /Users/sf-dev/dreamhouse-lwc
-  #   expected-tool-confidence: 90
-  #   expected-parameter-confidence: 70
-  #   allowed-tools:
-  #     - sf-get-username
+  # Deploy with complex folder structure
+  - utterances:
+      - Deploy the PropertyController and PropertyTrigger from the classes folder to the dreamhouse org and run PropertyControllerTest, PropertyTriggerTest, and PropertyUtilTest.
+    expected-tool: sf-deploy-metadata
+    expected-parameters:
+      sourceDir: force-app/main/default/classes
+      directory: /Users/sf-dev/dreamhouse-lwc
+      usernameOrAlias: dreamhouse
+      apexTests: PropertyControllerTest,PropertyTriggerTest,PropertyUtilTest
+    expected-tool-confidence: 50
+    expected-parameter-confidence: 50
+    allowed-tools:
+      - sf-list-all-orgs
 
-  # # Deploy with relative path specification
-  # - utterances:
-  #     - Deploy everything in the force-app directory to the dreamhouse org.
-  #   expected-tool: sf-deploy-metadata
-  #   expected-parameters:
-  #     sourceDir: force-app
-  #     directory: /Users/sf-dev/dreamhouse-lwc
-  #     usernameOrAlias: dreamhouse
-  #   expected-tool-confidence: 100
-  #   expected-parameter-confidence: 100
+  # Deploy with relative path specification
+  - utterances:
+      - Deploy everything in the force-app directory to the dreamhouse org.
+    expected-tool: sf-deploy-metadata
+    expected-parameters:
+      sourceDir: force-app
+      directory: /Users/sf-dev/dreamhouse-lwc
+      usernameOrAlias: dreamhouse
+    expected-tool-confidence: 50
+    expected-parameter-confidence: 50
+    allowed-tools:
+      - sf-list-all-orgs

From 865008fd221a1fa10b2923ed74cbab527373f9d7 Mon Sep 17 00:00:00 2001
From: Mike Donnalley <mdonnalley@salesforce.com>
Date: Thu, 24 Jul 2025 09:59:21 -0600
Subject: [PATCH 41/51] ci: set up GHA for confidence tests

---
 .github/workflows/test.yml             | 38 ++++++++++++++++++++++++++
 test/confidence/sf-deploy-metadata.yml | 28 ++++++++++++++++---
 2 files changed, 62 insertions(+), 4 deletions(-)

diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index 21b34062..ce9173ad 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -7,6 +7,23 @@ on:
 jobs:
   yarn-lockfile-check:
     uses: salesforcecli/github-workflows/.github/workflows/lockFileCheck.yml@main
+
+  # Detect which files have changed to determine what tests to run
+  changes:
+    runs-on: ubuntu-latest
+    outputs:
+      confidence-changed: ${{ steps.changes.outputs.confidence }}
+    steps:
+      - uses: actions/checkout@v4
+      - uses: dorny/paths-filter@v2
+        id: changes
+        with:
+          filters: |
+            confidence:
+              - 'confidence/**'
+              - 'test/confidence/**'
+              - 'src/tools/**'
+
   # Since the Windows unit tests take much longer, we run the linux unit tests first and then run the windows unit tests in parallel with NUTs
   linux-unit-tests:
     needs: yarn-lockfile-check
@@ -15,6 +32,27 @@ jobs:
     needs: linux-unit-tests
     uses: salesforcecli/github-workflows/.github/workflows/unitTestsWindows.yml@main
 
+  # Run the confidence tests after the unit tests
+  confidence-tests:
+    needs: [linux-unit-tests, changes]
+    runs-on: ubuntu-latest
+    if: ${{ needs.changes.outputs.confidence-changed == 'true'}}
+    steps:
+      - uses: actions/checkout@v4
+      - uses: actions/setup-node@v4
+        with:
+          node-version: lts/*
+          cache: yarn
+      - run: yarn install --frozen-lockfile
+      # Note: we cannot parallelize confidence tests because we don't have the rate limits to support it.
+      # The test runner has rate limiting built in to avoid hitting the API limits within a single test run.
+      - name: Run confidence tests
+        run: |
+          for file in test/confidence/*.yml; do
+            echo "Running confidence test for $file"
+            yarn test:confidence --verbose "$file"
+          done
+
   # Uncomment to enable NUT testing in Github Actions
   # nuts:
   #   needs: linux-unit-tests
diff --git a/test/confidence/sf-deploy-metadata.yml b/test/confidence/sf-deploy-metadata.yml
index 60ca46ae..b5267efa 100644
--- a/test/confidence/sf-deploy-metadata.yml
+++ b/test/confidence/sf-deploy-metadata.yml
@@ -1,16 +1,36 @@
 models:
   # - llmgateway__OpenAIGPT35Turbo_01_25
   # - llmgateway__OpenAIGPT4OmniMini
-  - llmgateway__OpenAIGPT41Nano
+  # - llmgateway__OpenAIGPT41Nano
   - llmgateway__BedrockAnthropicClaude4Sonnet
 
 initial-context:
-  - 'My current OS is macos. I am working in a workspace with the following folders: /Users/sf-dev/dreamhouse-lwc'
+  - 'My current OS is macos. I am working in a workspace with the following folders: /Users/sf-dev/dreamhouse-lwc
+    My org alias is dreamhouse.
+    This is the structure of /Users/sf-dev/dreamhouse-lwc:
+    package.xml
+    force-app/main/default/applications
+    force-app/main/default/aura
+    force-app/main/default/aura/pageTemplate_2_7_3
+    force-app/main/default/classes
+    force-app/main/default/contentassets
+    force-app/main/default/cspTrustedSites
+    force-app/main/default/flexipages
+    force-app/main/default/flows
+    force-app/main/default/layouts
+    force-app/main/default/lwc
+    force-app/main/default/messageChannels
+    force-app/main/default/objects
+    force-app/main/default/permissionsets
+    force-app/main/default/prompts
+    force-app/main/default/remoteSiteSettings
+    force-app/main/default/staticresources
+    force-app/main/default/tabs'
 
 tests:
   # Deploy specific source directory (Lightning Web Components)
   - utterances:
-      - My org alias is dreamhouse. Deploy the Lightning Web Components in force-app/main/default/lwc to the dreamhouse org.
+      - Deploy the Lightning Web Components in force-app/main/default/lwc to the dreamhouse org.
     expected-tool: sf-deploy-metadata
     expected-parameters:
       sourceDir: force-app/main/default/lwc
@@ -88,7 +108,7 @@ tests:
 
   # Deploy with all org tests
   - utterances:
-      - Deploy the apex classes and run all tests in the org including managed packages.
+      - Deploy the apex classes and run all tests in the dreamhouse org including managed packages.
     expected-tool: sf-deploy-metadata
     expected-parameters:
       sourceDir: force-app/main/default/classes

From 6e54cbaa76adc1b138f2480a7c51d00f401e8cf6 Mon Sep 17 00:00:00 2001
From: Mike Donnalley <mdonnalley@salesforce.com>
Date: Thu, 24 Jul 2025 10:19:14 -0600
Subject: [PATCH 42/51] docs: add docs about confidence tests

---
 DEVELOPING.md | 52 +++++++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 50 insertions(+), 2 deletions(-)

diff --git a/DEVELOPING.md b/DEVELOPING.md
index c9d4b99c..3cb2c2e1 100644
--- a/DEVELOPING.md
+++ b/DEVELOPING.md
@@ -124,9 +124,57 @@ mcp-inspector --cli node bin/run.js --orgs DEFAULT_TARGET_ORG --method tools/lis
 
 Unit tests are run with `yarn test` and use the Mocha test framework. Tests are located in the `test` directory and are named with the pattern, `test/**/*.test.ts`.
 
+### Confidence Tests
+
+Confidence tests validate that the MCP server tools are accurately invoked by various LLM models through the Salesforce LLM Gateway API. These tests ensure that natural language prompts correctly trigger the expected tools with appropriate parameters, maintaining the quality of the AI-powered tool selection.
+
+#### Running Confidence Tests Locally
+
+1. **Set up API access**: You'll need the `SF_LLMG_API_KEY` environment variable to access the LLM Gateway API. Follow the setup instructions [here](https://git.soma.salesforce.com/pages/tech-enablement/einstein/docs/gateway/get-started/quickstart/).
+
+2. **Export the API key**:
+
+   ```shell
+   export SF_LLMG_API_KEY=your_api_key_here
+   ```
+
+3. **Run a specific confidence test**:
+   ```shell
+   yarn test:confidence --file test/confidence/sf-deploy-metadata.yml --verbose
+   ```
+
+#### Test Structure
+
+Confidence tests are defined in YAML files located in `test/confidence/`. Each test file specifies:
+
+- **Models**: Which LLM models to test against. See LLMGateway documentation for [available models](https://git.soma.salesforce.com/pages/tech-enablement/einstein/docs/gateway/models-and-providers/).
+- **Initial Context**: Background information provided to the model
+- **Test Cases**: Natural language utterances with expected tool invocations and confidence thresholds
+
+The tests run multiple iterations (default: 5) to calculate confidence levels and ensure consistent tool selection across different model runs. This can be adjusted by passing the `--runs` flag when running the tests, like this:
+
+```shell
+yarn test:confidence --verbose test/confidence/sf-deploy-metadata.yml --runs 2
+```
+
+#### Understanding Test Results
+
+Tests measure two types of confidence:
+
+- **Tool Confidence**: Whether the correct tool was invoked
+- **Parameter Confidence**: Whether the tool was called with the expected parameters
+
+Failed tests indicate one of the following:
+
+1. The model selected the wrong tool for a given prompt
+2. The model selected the correct tool but with incorrect parameters
+3. The confidence level fell below the specified threshold
+
+These failures help identify areas where tool descriptions or agent instructions need improvement.
+
 ## Debugging
 
-> [!NOTE]  
+> [!NOTE]
 > This section assumes you're using Visual Studio Code (VS Code).
 
 You can use the VS Code debugger with the MCP Inspector CLI to step through the code of your MCP tools:
@@ -150,7 +198,7 @@ MCP_SERVER_REQUEST_TIMEOUT=120000 mcp-inspector --cli node --inspect-brk bin/run
 We suggest you set `MCP_SERVER_REQUEST_TIMEOUT` to 120000ms (2 minutes) to allow longer debugging sessions without having the MCP Inspector client timeout.
 For other configuration values see: https://github.com/modelcontextprotocol/inspector?tab=readme-ov-file#configuration
 
-> [!IMPORTANT]  
+> [!IMPORTANT]
 > You must compile the local MCP server using `yarn compile` after every change in a TypeScript file, otherwise breakpoints in the TypeScript files might not match the running JavaScript code.
 
 ## Useful yarn Commands

From 1a09b00292ade813f5b0842d1a6d4670162bcf47 Mon Sep 17 00:00:00 2001
From: Mike Donnalley <mdonnalley@salesforce.com>
Date: Thu, 24 Jul 2025 10:21:25 -0600
Subject: [PATCH 43/51] fix: replace --verbose with --concise

---
 .github/workflows/test.yml                 |  2 +-
 DEVELOPING.md                              |  4 ++--
 confidence/src/commands/confidence-test.ts | 12 ++++++------
 3 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index ce9173ad..c64a61fd 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -50,7 +50,7 @@ jobs:
         run: |
           for file in test/confidence/*.yml; do
             echo "Running confidence test for $file"
-            yarn test:confidence --verbose "$file"
+            yarn test:confidence "$file"
           done
 
   # Uncomment to enable NUT testing in Github Actions
diff --git a/DEVELOPING.md b/DEVELOPING.md
index 3cb2c2e1..4bba8216 100644
--- a/DEVELOPING.md
+++ b/DEVELOPING.md
@@ -140,7 +140,7 @@ Confidence tests validate that the MCP server tools are accurately invoked by va
 
 3. **Run a specific confidence test**:
    ```shell
-   yarn test:confidence --file test/confidence/sf-deploy-metadata.yml --verbose
+   yarn test:confidence --file test/confidence/sf-deploy-metadata.yml
    ```
 
 #### Test Structure
@@ -154,7 +154,7 @@ Confidence tests are defined in YAML files located in `test/confidence/`. Each t
 The tests run multiple iterations (default: 5) to calculate confidence levels and ensure consistent tool selection across different model runs. This can be adjusted by passing the `--runs` flag when running the tests, like this:
 
 ```shell
-yarn test:confidence --verbose test/confidence/sf-deploy-metadata.yml --runs 2
+yarn test:confidence --file test/confidence/sf-deploy-metadata.yml --runs 2
 ```
 
 #### Understanding Test Results
diff --git a/confidence/src/commands/confidence-test.ts b/confidence/src/commands/confidence-test.ts
index 0ce7440c..8794ab2e 100644
--- a/confidence/src/commands/confidence-test.ts
+++ b/confidence/src/commands/confidence-test.ts
@@ -223,11 +223,11 @@ https://git.soma.salesforce.com/pages/tech-enablement/einstein/docs/gateway/mode
       default: 5,
       char: 'r',
     }),
-    verbose: Flags.boolean({
-      summary: 'Enable verbose output',
-      description: 'If true, will print additional information about the test runs',
+    concise: Flags.boolean({
+      summary: 'Suppress detailed output for each test run',
+      description: 'If true, will print only the final results of each test run',
       default: false,
-      char: 'v',
+      char: 'c',
     }),
   };
 
@@ -240,7 +240,7 @@ https://git.soma.salesforce.com/pages/tech-enablement/einstein/docs/gateway/mode
     }
 
     const { tools: mcpTools, tokens } = await getToolsList();
-    if (flags.verbose) {
+    if (!flags.concise) {
       stdout();
       printTable({
         title: 'Tools List',
@@ -298,7 +298,7 @@ https://git.soma.salesforce.com/pages/tech-enablement/einstein/docs/gateway/mode
       stdout(colorize('bold', ' ─── Results for Test Case ───'));
       stdout(testSpec.readable);
 
-      if (flags.verbose) {
+      if (!flags.concise) {
         for (const run of testRuns) {
           printTable({
             title: `Run #${run.idx + 1}`,

From 54011bbf328eadb4535150d98360fe99dce3a8d6 Mon Sep 17 00:00:00 2001
From: Mike Donnalley <mdonnalley@salesforce.com>
Date: Thu, 24 Jul 2025 10:31:29 -0600
Subject: [PATCH 44/51] fix: throw better error when SF_LLMG_API_KEY is not set

---
 confidence/src/commands/confidence-test.ts | 10 ++++++--
 confidence/src/utils/gateway.ts            | 27 ++++++++++------------
 2 files changed, 20 insertions(+), 17 deletions(-)

diff --git a/confidence/src/commands/confidence-test.ts b/confidence/src/commands/confidence-test.ts
index 8794ab2e..d7b6eda7 100644
--- a/confidence/src/commands/confidence-test.ts
+++ b/confidence/src/commands/confidence-test.ts
@@ -105,6 +105,7 @@ const filterFailingTests = (
     .filter((test) => test !== undefined);
 
 async function compareModelOutputs(
+  apiKey: string,
   utterances: string | string[],
   spec: Spec,
   tools: InvocableTool[]
@@ -114,7 +115,7 @@ async function compareModelOutputs(
 }> {
   const models = spec.models;
   const responses = await Promise.all(
-    models.map((model) => makeGatewayRequests(castToArray(utterances), model, tools, spec['initial-context']))
+    models.map((model) => makeGatewayRequests(apiKey, castToArray(utterances), model, tools, spec['initial-context']))
   );
 
   const invocations = responses.reduce<Record<string, Array<{ tool: string; parameters: Record<string, string> }>>>(
@@ -234,6 +235,11 @@ https://git.soma.salesforce.com/pages/tech-enablement/einstein/docs/gateway/mode
   public async run(): Promise<void> {
     const { flags } = await this.parse(ConfidenceTest);
 
+    const apiKey = process.env.SF_LLMG_API_KEY;
+    if (!apiKey) {
+      this.error('SF_LLMG_API_KEY environment variable is not set. Please set it to run this command.');
+    }
+
     const spec = Spec.safeParse(await readYamlFile<Spec>(flags.file));
     if (!spec.success) {
       this.error(`Invalid spec file: ${flags.file}\n${spec.error.message}`);
@@ -430,7 +436,7 @@ https://git.soma.salesforce.com/pages/tech-enablement/einstein/docs/gateway/mode
           parameters: true,
         });
         return Array.from({ length: flags.runs }, (_, idx) =>
-          compareModelOutputs(test.utterances, spec.data, mcpTools).then(({ invocations, tableData }) => {
+          compareModelOutputs(apiKey, test.utterances, spec.data, mcpTools).then(({ invocations, tableData }) => {
             testResultsByTestCaseKey.set(testCaseKey, [
               ...(testResultsByTestCaseKey.get(testCaseKey) ?? []),
               {
diff --git a/confidence/src/utils/gateway.ts b/confidence/src/utils/gateway.ts
index 76aa9428..e15f7531 100644
--- a/confidence/src/utils/gateway.ts
+++ b/confidence/src/utils/gateway.ts
@@ -21,13 +21,6 @@ import { RateLimiter } from './rate-limiter.js';
 
 const fetchRetry = makeFetch(fetch);
 
-const API_KEY = process.env.SF_LLMG_API_KEY;
-process.env.NODE_TLS_REJECT_UNAUTHORIZED = '0';
-
-if (!API_KEY) {
-  throw new Error('SF_LLMG_API_KEY is not set');
-}
-
 type GatewayResponse = {
   generation_details?: {
     generations: Array<{
@@ -44,8 +37,8 @@ type GatewayResponse = {
   };
 };
 
-const createRequestHeaders = (): Record<string, string> => ({
-  Authorization: `API_KEY ${API_KEY}`,
+const createRequestHeaders = (apiKey: string): Record<string, string> => ({
+  Authorization: `API_KEY ${apiKey}`,
   'Content-Type': 'application/json',
   // taken from example in docs. Theoretically we'd have our own after fully onboarding?
   // https://git.soma.salesforce.com/pages/tech-enablement/einstein/docs/gateway/access/gateway-access/
@@ -79,6 +72,7 @@ const createRequestBody = (
 const rateLimiter = new RateLimiter(40, 60_000);
 
 const makeSingleGatewayRequest = async (
+  apiKey: string,
   model: Model,
   tools: InvocableTool[],
   messages: Array<{ role: string; content: string }>
@@ -86,7 +80,7 @@ const makeSingleGatewayRequest = async (
   const response = await rateLimiter.enqueue(async () =>
     fetchRetry('https://bot-svc-llm.sfproxy.einsteintest1.test1-uswest2.aws.sfdc.cl/v1.0/chat/generations', {
       method: 'POST',
-      headers: createRequestHeaders(),
+      headers: createRequestHeaders(apiKey),
       body: createRequestBody(model, tools, messages),
       retryDelay(attempt) {
         return Math.pow(2, attempt) * 1000; // 1000, 2000, 4000
@@ -109,12 +103,14 @@ const makeSingleGatewayRequest = async (
 };
 
 /**
- * Makes requests to the LLM Gateway API for multiple prompts using the specified model and tools.
+ * Makes requests to the LLM Gateway API for multiple utterances using the specified model and tools.
  *
- * @param {string[]} prompts - Array of prompts to send to the API
- * @param {string} model - The model identifier to use for generation (e.g., 'llmgateway__AzureOpenAIGPT4Omni')
+ * @param {string} apiKey - API key for authentication with the LLM Gateway
+ * @param {string[]} utterances - Array of utterances to send to the API
+ * @param {Model} model - The model identifier to use for generation
  * @param {InvocableTool[]} tools - Array of tools that can be invoked by the model
- * @returns {Promise<{model: string, messages: Array<{role: string, content: string}>, responses: GatewayResponse[]}>} Object containing the model used, conversation messages, and API responses
+ * @param {string[]} [initialContext] - Optional initial context messages to prepend to the conversation
+ * @returns {Promise<{model: Model, messages: Array<{role: string, content: string}>, responses: GatewayResponse[]}>} Object containing the model used, conversation messages, and API responses
  * @throws {Error} If any API request fails or returns an error
  *
  * @see {@link https://git.soma.salesforce.com/pages/tech-enablement/einstein/docs/gateway/get-started/#make-your-first-gateway-request} Make Your First Gateway Request Documentation
@@ -124,6 +120,7 @@ const makeSingleGatewayRequest = async (
  * @see {@link https://git.soma.salesforce.com/pages/tech-enablement/einstein/docs/gateway/get-started/auth/#api-key-limitations} API Key Limitations Documentation
  */
 export const makeGatewayRequests = async (
+  apiKey: string,
   utterances: string[],
   model: Model,
   tools: InvocableTool[],
@@ -142,7 +139,7 @@ export const makeGatewayRequests = async (
     });
 
     // eslint-disable-next-line no-await-in-loop
-    const responseData = await makeSingleGatewayRequest(model, tools, messages);
+    const responseData = await makeSingleGatewayRequest(apiKey, model, tools, messages);
     responses.push(responseData);
 
     // Add the assistant's response to messages for the next iteration

From 09e5406db3ab949a379281ec22344d235ec0d6fe Mon Sep 17 00:00:00 2001
From: Mike Donnalley <mdonnalley@salesforce.com>
Date: Thu, 24 Jul 2025 10:32:26 -0600
Subject: [PATCH 45/51] ci: give confidence-test job access to env var

---
 .github/workflows/test.yml | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index c64a61fd..a3445799 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -37,6 +37,8 @@ jobs:
     needs: [linux-unit-tests, changes]
     runs-on: ubuntu-latest
     if: ${{ needs.changes.outputs.confidence-changed == 'true'}}
+    env:
+      SF_LLMG_API_KEY: ${{ secrets.SF_LLMG_API_KEY }}
     steps:
       - uses: actions/checkout@v4
       - uses: actions/setup-node@v4

From 5ec65910641e6f885e31284c76f08ea0ccd685bf Mon Sep 17 00:00:00 2001
From: Mike Donnalley <mdonnalley@salesforce.com>
Date: Thu, 24 Jul 2025 10:35:17 -0600
Subject: [PATCH 46/51] ci: fix test command

---
 .github/workflows/test.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index a3445799..38ab8b07 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -52,7 +52,7 @@ jobs:
         run: |
           for file in test/confidence/*.yml; do
             echo "Running confidence test for $file"
-            yarn test:confidence "$file"
+            yarn test:confidence --file "$file"
           done
 
   # Uncomment to enable NUT testing in Github Actions

From f8016794f6a0f2e8969c9b85663d514d87886615 Mon Sep 17 00:00:00 2001
From: Mike Donnalley <mdonnalley@salesforce.com>
Date: Thu, 24 Jul 2025 12:43:46 -0600
Subject: [PATCH 47/51] fix: smarter retry and limiting

---
 confidence/bin/dev.js                |   1 +
 confidence/bin/run.js                |   1 +
 confidence/src/utils/gateway.ts      |  10 +-
 confidence/src/utils/rate-limiter.ts | 282 ++++++++++++++++++++++-----
 package.json                         |   1 -
 yarn.lock                            |   5 -
 6 files changed, 233 insertions(+), 67 deletions(-)

diff --git a/confidence/bin/dev.js b/confidence/bin/dev.js
index f5e5d3ac..30fd6c8d 100755
--- a/confidence/bin/dev.js
+++ b/confidence/bin/dev.js
@@ -3,6 +3,7 @@
 import { dirname } from 'node:path';
 import { execute } from '@oclif/core';
 
+process.env.NODE_TLS_REJECT_UNAUTHORIZED = '0'; // Disable TLS verification for local testing
 await execute({
   development: true,
   dir: import.meta.url,
diff --git a/confidence/bin/run.js b/confidence/bin/run.js
index 909acd4c..4c7af587 100755
--- a/confidence/bin/run.js
+++ b/confidence/bin/run.js
@@ -3,6 +3,7 @@
 import { dirname } from 'node:path';
 import { execute } from '@oclif/core';
 
+process.env.NODE_TLS_REJECT_UNAUTHORIZED = '0'; // Disable TLS verification for local testing
 await execute({
   dir: import.meta.url,
   loadOptions: {
diff --git a/confidence/src/utils/gateway.ts b/confidence/src/utils/gateway.ts
index e15f7531..857cf545 100644
--- a/confidence/src/utils/gateway.ts
+++ b/confidence/src/utils/gateway.ts
@@ -14,13 +14,10 @@
  * limitations under the License.
  */
 
-import makeFetch from 'fetch-retry';
 import { Model } from './models.js';
 import { InvocableTool } from './tools.js';
 import { RateLimiter } from './rate-limiter.js';
 
-const fetchRetry = makeFetch(fetch);
-
 type GatewayResponse = {
   generation_details?: {
     generations: Array<{
@@ -78,15 +75,10 @@ const makeSingleGatewayRequest = async (
   messages: Array<{ role: string; content: string }>
 ): Promise<GatewayResponse> => {
   const response = await rateLimiter.enqueue(async () =>
-    fetchRetry('https://bot-svc-llm.sfproxy.einsteintest1.test1-uswest2.aws.sfdc.cl/v1.0/chat/generations', {
+    fetch('https://bot-svc-llm.sfproxy.einsteintest1.test1-uswest2.aws.sfdc.cl/v1.0/chat/generations', {
       method: 'POST',
       headers: createRequestHeaders(apiKey),
       body: createRequestBody(model, tools, messages),
-      retryDelay(attempt) {
-        return Math.pow(2, attempt) * 1000; // 1000, 2000, 4000
-      },
-      retries: 5,
-      retryOn: [429],
     })
   );
 
diff --git a/confidence/src/utils/rate-limiter.ts b/confidence/src/utils/rate-limiter.ts
index e8eb8d46..cccf7ffb 100644
--- a/confidence/src/utils/rate-limiter.ts
+++ b/confidence/src/utils/rate-limiter.ts
@@ -26,8 +26,42 @@ type QueuedRequest<T extends Response> = {
   reject: (error: unknown) => void;
 };
 
+type RetryConfig = {
+  maxRetries: number;
+  baseDelayMs: number;
+  maxDelayMs: number;
+  retryOn: number[];
+};
+
+type RateLimitStatus = {
+  queueLength: number;
+  requestsInWindow: number;
+  maxRequests: number;
+  canExecute: boolean;
+  nextAvailableSlot?: number;
+  isProcessing: boolean;
+  completed: number;
+  failed: number;
+  burstModeActive: boolean;
+  utilizationRatio: number;
+  timeUntilWindowReset: number;
+  adaptiveMaxRequests: number;
+  backoffMultiplier: number;
+  retryStats: {
+    totalRetries: number;
+    retriesByStatus: Record<number, number>;
+  };
+};
+
+class RateLimitError extends Error {
+  public constructor(message: string, public readonly status?: number, public readonly retryAfter?: number) {
+    super(message);
+    this.name = 'RateLimitError';
+  }
+}
+
 /**
- * A rate limiter that controls the frequency of requests using a sliding window approach with adaptive burst control.
+ * A rate limiter that controls the frequency of requests using a sliding window approach with adaptive burst control and intelligent retry logic.
  *
  * This class implements a queue-based rate limiter that ensures no more than a specified
  * number of requests are executed within a given time window. It features intelligent burst
@@ -35,10 +69,13 @@ type QueuedRequest<T extends Response> = {
  * for larger workloads.
  *
  * Key Features:
- * - Sliding window rate limiting
- * - Adaptive burst control for small request batches
- * - Gradual transition from burst to controlled spacing as utilization increases
+ * - Sliding window rate limiting with adaptive capacity adjustment
+ * - Intelligent burst control for small request batches
+ * - Exponential backoff retry logic with jitter for resilience
+ * - Window-aware 429 retries (NOTE: Retry-After headers are not yet read; see calculateRetryDelay)
+ * - Adaptive rate adjustment based on 429 responses
  * - Comprehensive monitoring and debugging information
+ * - Graceful degradation and recovery mechanisms
  *
  * @example
  * ```typescript
@@ -52,10 +89,11 @@ type QueuedRequest<T extends Response> = {
  *   rateLimiter.enqueue(() => fetch('/api/data3'))
  * ]);
  *
- * // Check current status including burst mode information
+ * // Check current status including burst mode and retry information
  * const status = rateLimiter.getStatus();
  * console.log(`Burst mode active: ${status.burstModeActive}`);
- * console.log(`Utilization: ${(status.utilizationRatio * 100).toFixed(1)}%`);
+ * console.log(`Adaptive capacity: ${status.adaptiveMaxRequests}/${status.maxRequests}`);
+ * console.log(`Retry stats: ${status.retryStats.totalRetries} total retries`);
  * ```
  */
 export class RateLimiter {
@@ -84,7 +122,35 @@ export class RateLimiter {
    */
   private readonly minDelayMs = 50;
 
-  public constructor(private readonly maxRequests = 40, private readonly windowMs = 60_000) {}
+  /**
+   * Configuration for retry logic when handling rate limit errors.
+   */
+  private readonly retryConfig: RetryConfig = {
+    maxRetries: 3,
+    baseDelayMs: 1000,
+    maxDelayMs: 60_000,
+    retryOn: [429, 503, 502, 504],
+  };
+
+  /**
+   * Adaptive rate limiting state.
+   */
+  private adaptiveMaxRequests: number;
+  private readonly originalMaxRequests: number;
+  private backoffMultiplier = 1.0;
+
+  /**
+   * Retry statistics for monitoring.
+   */
+  private retryStats = {
+    totalRetries: 0,
+    retriesByStatus: {} as Record<number, number>,
+  };
+
+  public constructor(private readonly maxRequests = 40, private readonly windowMs = 60_000) {
+    this.adaptiveMaxRequests = maxRequests;
+    this.originalMaxRequests = maxRequests;
+  }
 
   /**
    * Utility function to sleep for a given number of milliseconds
@@ -94,17 +160,19 @@ export class RateLimiter {
   }
 
   /**
-   * Executes a single request and handles its completion
+   * Determines if an error is retryable
    */
-  private static async executeRequest(request: QueuedRequest<Response>): Promise<void> {
-    try {
-      const result = await request.execute();
-      this.completed++;
-      request.resolve(result);
-    } catch (error) {
-      this.failed++;
-      request.reject(error);
+  private static isRetryableError(error: unknown): boolean {
+    if (error instanceof Error) {
+      const message = error.message.toLowerCase();
+      return (
+        message.includes('429') ||
+        message.includes('rate limit') ||
+        message.includes('econnreset') ||
+        message.includes('timeout')
+      );
     }
+    return false;
   }
 
   /**
@@ -132,20 +200,7 @@ export class RateLimiter {
   /**
    * Gets current queue status for monitoring/debugging
    */
-  public getStatus(): {
-    queueLength: number;
-    requestsInWindow: number;
-    maxRequests: number;
-    canExecute: boolean;
-    nextAvailableSlot?: number;
-    isProcessing: boolean;
-    completed: number;
-    failed: number;
-    burstModeActive: boolean;
-    utilizationRatio: number;
-    predictedUtilization: number;
-    timeUntilWindowReset: number;
-  } {
+  public getStatus(): RateLimitStatus {
     const now = Date.now();
     this.cleanupOldTimestamps(now);
 
@@ -159,12 +214,68 @@ export class RateLimiter {
       completed: RateLimiter.completed,
       failed: RateLimiter.failed,
       burstModeActive: this.shouldAllowBurst(),
-      utilizationRatio: this.requestTimestamps.length / this.maxRequests,
-      predictedUtilization: this.getPredictedWindowUtilization(),
+      utilizationRatio: this.requestTimestamps.length / this.adaptiveMaxRequests,
       timeUntilWindowReset: this.getTimeUntilWindowReset(),
+      adaptiveMaxRequests: this.adaptiveMaxRequests,
+      backoffMultiplier: this.backoffMultiplier,
+      retryStats: { ...this.retryStats },
     };
   }
 
+  /**
+   * Executes a single request with retry logic and adaptive rate limiting
+   */
+  private async executeRequest(request: QueuedRequest<Response>): Promise<void> {
+    let lastError: Error | null = null;
+
+    for (let attempt = 0; attempt <= this.retryConfig.maxRetries; attempt++) {
+      try {
+        const result = await request.execute();
+
+        // Check for rate limit response
+        if (!result.ok && this.retryConfig.retryOn.includes(result.status)) {
+          // Immediately adjust rate limit on first 429 to prevent more
+          if (result.status === 429) {
+            this.adjustRateLimit(true);
+          }
+
+          if (attempt === this.retryConfig.maxRetries) {
+            this.recordRetryFailure(result.status);
+            throw new RateLimitError(`Rate limit exceeded after ${this.retryConfig.maxRetries} retries`, result.status);
+          }
+
+          this.recordRetryAttempt(result.status);
+          const delay = this.calculateRetryDelay(attempt, result);
+          debug(`Rate limit hit (${result.status}), retrying in ${delay}ms (attempt ${attempt + 1})`);
+
+          await RateLimiter.sleep(delay);
+          continue;
+        }
+
+        // Success - record and resolve
+        RateLimiter.completed++;
+        this.adjustRateLimit(false);
+        request.resolve(result);
+        return;
+      } catch (error) {
+        lastError = error as Error;
+        if (attempt < this.retryConfig.maxRetries && RateLimiter.isRetryableError(error)) {
+          this.recordRetryAttempt();
+          const delay = this.calculateRetryDelay(attempt);
+          debug(`Retryable error, retrying in ${delay}ms (attempt ${attempt + 1}): ${lastError.message}`);
+
+          await RateLimiter.sleep(delay);
+          continue;
+        }
+        break;
+      }
+    }
+
+    // All retries exhausted
+    RateLimiter.failed++;
+    request.reject(lastError ?? new Error('Max retries exceeded'));
+  }
+
   /**
    * Processes the queue, executing requests when rate limit allows
    */
@@ -184,7 +295,7 @@ export class RateLimiter {
         this.recordRequest(now);
 
         // Execute the request asynchronously - don't await
-        void RateLimiter.executeRequest(request);
+        void this.executeRequest(request);
 
         // Use adaptive delay instead of fixed delay
         const delay = this.calculateAdaptiveDelay();
@@ -204,14 +315,25 @@ export class RateLimiter {
    * Determines if burst mode should be allowed based on current utilization
    */
   private shouldAllowBurst(): boolean {
-    const utilizationRatio = this.requestTimestamps.length / this.maxRequests;
-    const queueRatio = this.queue.length / this.maxRequests;
+    const utilizationRatio = this.requestTimestamps.length / this.adaptiveMaxRequests;
+    const queueRatio = this.queue.length / this.adaptiveMaxRequests;
     const totalWorkRatio = utilizationRatio + queueRatio;
 
+    // Be more conservative with large queues - reduce burst threshold
+    const adjustedBurstThreshold =
+      this.queue.length > this.adaptiveMaxRequests * 0.25
+        ? this.burstUtilizationThreshold * 0.5
+        : this.burstUtilizationThreshold;
+
     // Allow bursts when:
     // 1. Current utilization is below the burst threshold
     // 2. Total work (current + queued) is below the queue threshold
-    return utilizationRatio < this.burstUtilizationThreshold && totalWorkRatio < this.burstQueueThreshold;
+    // 3. We're not in backoff mode
+    return (
+      utilizationRatio < adjustedBurstThreshold &&
+      totalWorkRatio < this.burstQueueThreshold &&
+      this.backoffMultiplier >= 0.9
+    );
   }
 
   /**
@@ -223,22 +345,79 @@ export class RateLimiter {
       return 0;
     }
 
-    const utilizationRatio = this.requestTimestamps.length / this.maxRequests;
-    const remainingCapacity = this.maxRequests - this.requestTimestamps.length;
+    const utilizationRatio = this.requestTimestamps.length / this.adaptiveMaxRequests;
+    const remainingCapacity = this.adaptiveMaxRequests - this.requestTimestamps.length;
     const queueLength = this.queue.length;
 
+    // At high utilization (>90%), use much more conservative delays
+    if (utilizationRatio > 0.9) {
+      const baseDelay = Math.ceil(this.windowMs / this.adaptiveMaxRequests);
+      // Use exponential scaling at high utilization to prevent 429s
+      const aggressiveScaling = Math.pow(utilizationRatio, 3) * 5;
+      return Math.max(baseDelay * aggressiveScaling, 1000); // Minimum 1 second at high utilization
+    }
+
     // If we have enough capacity for all queued requests, use minimal spacing
     if (remainingCapacity >= queueLength) {
       return this.minDelayMs;
     }
 
     // Calculate base delay and scale it based on utilization
-    const baseDelay = Math.ceil(this.windowMs / this.maxRequests);
+    const baseDelay = Math.ceil(this.windowMs / this.adaptiveMaxRequests);
     const scalingFactor = Math.min(utilizationRatio * 2, 1);
 
     return Math.max(this.minDelayMs, baseDelay * scalingFactor);
   }
 
+  /**
+   * Calculates retry delay with exponential backoff
+   */
+  private calculateRetryDelay(attempt: number, response?: Response): number {
+    if (response?.status === 429) {
+      const remainingRequests = this.adaptiveMaxRequests - this.requestTimestamps.length;
+      if (remainingRequests <= 0) {
+        return this.getTimeUntilWindowReset(); // Wait until the window resets
+      }
+    }
+
+    // Exponential backoff with jitter
+    const exponentialDelay = this.retryConfig.baseDelayMs * Math.pow(2, attempt);
+    const jitter = Math.random() * 0.3 * exponentialDelay; // 30% jitter
+    return Math.min(exponentialDelay + jitter, this.retryConfig.maxDelayMs);
+  }
+  /**
+   * Records a retry attempt for statistics
+   */
+  private recordRetryAttempt(status?: number): void {
+    this.retryStats.totalRetries++;
+    if (status) {
+      this.retryStats.retriesByStatus[status] = (this.retryStats.retriesByStatus[status] || 0) + 1;
+    }
+  }
+
+  /**
+   * Records a final retry failure
+   */
+  private recordRetryFailure(status: number): void {
+    debug(`Final retry failure with status ${status} after ${this.retryConfig.maxRetries} attempts`);
+  }
+
+  /**
+   * Adjusts rate limit based on success/failure
+   */
+  private adjustRateLimit(hit429: boolean): void {
+    if (hit429) {
+      // Reduce rate by 25% when we hit rate limits
+      this.backoffMultiplier = Math.max(0.25, this.backoffMultiplier * 0.75);
+      this.adaptiveMaxRequests = Math.floor(this.originalMaxRequests * this.backoffMultiplier);
+      debug(`Rate limit hit, reducing to ${this.adaptiveMaxRequests} requests per window`);
+    } else {
+      // Gradually recover rate limit (2% increase per successful request)
+      this.backoffMultiplier = Math.min(1.0, this.backoffMultiplier * 1.02);
+      this.adaptiveMaxRequests = Math.floor(this.originalMaxRequests * this.backoffMultiplier);
+    }
+  }
+
   /**
    * Gets the time until the current rate limit window resets
    */
@@ -251,19 +430,18 @@ export class RateLimiter {
   }
 
   /**
-   * Calculates predicted window utilization including queued requests
-   */
-  private getPredictedWindowUtilization(): number {
-    const currentUtilization = this.requestTimestamps.length;
-    const queuedRequests = this.queue.length;
-    return (currentUtilization + queuedRequests) / this.maxRequests;
-  }
-
-  /**
-   * Checks if a request can be executed based on current rate limit
+   * Checks if a request can be executed based on current adaptive rate limit
    */
   private canExecuteRequest(): boolean {
-    return this.requestTimestamps.length < this.maxRequests;
+    const utilizationRatio = this.requestTimestamps.length / this.adaptiveMaxRequests;
+
+    // Be more conservative at high utilization to prevent 429s
+    if (utilizationRatio > 0.9) {
+      // Only allow if we have significant capacity remaining
+      return this.requestTimestamps.length <= Math.floor(this.adaptiveMaxRequests * 0.85);
+    }
+
+    return this.requestTimestamps.length < this.adaptiveMaxRequests;
   }
 
   /**
@@ -291,8 +469,8 @@ export class RateLimiter {
       return 0;
     }
 
-    // If we're at the limit, wait until the oldest request expires
-    if (this.requestTimestamps.length >= this.maxRequests) {
+    // If we're at the adaptive limit, wait until the oldest request expires
+    if (this.requestTimestamps.length >= this.adaptiveMaxRequests) {
       const oldestRequest = this.requestTimestamps[0];
       const timeUntilExpiry = oldestRequest + this.windowMs - now;
       return Math.max(0, timeUntilExpiry + 100); // Add 100ms buffer
diff --git a/package.json b/package.json
index 9f4c6940..8b2ada5b 100644
--- a/package.json
+++ b/package.json
@@ -64,7 +64,6 @@
     "debug": "^4.4.1",
     "eslint-config-salesforce-license": "^1.0.1",
     "eslint-plugin-sf-plugin": "^1.20.26",
-    "fetch-retry": "^6.0.0",
     "gpt-tokenizer": "^3.0.1",
     "oclif": "^4.21.0",
     "ts-node": "^10.9.2",
diff --git a/yarn.lock b/yarn.lock
index ed32d2d3..2f5f3532 100644
--- a/yarn.lock
+++ b/yarn.lock
@@ -5598,11 +5598,6 @@ fdir@^6.4.4:
   resolved "https://registry.yarnpkg.com/fdir/-/fdir-6.4.5.tgz#328e280f3a23699362f95f2e82acf978a0c0cb49"
   integrity sha512-4BG7puHpVsIYxZUbiUE3RqGloLaSSwzYie5jvasC4LWuBWzZawynvYouhjbQKw2JuIGYdm0DzIxl8iVidKlUEw==
 
-fetch-retry@^6.0.0:
-  version "6.0.0"
-  resolved "https://registry.yarnpkg.com/fetch-retry/-/fetch-retry-6.0.0.tgz#4ffdf92c834d72ae819e42a4ee2a63f1e9454426"
-  integrity sha512-BUFj1aMubgib37I3v4q78fYo63Po7t4HUPTpQ6/QE6yK6cIQrP+W43FYToeTEyg5m2Y7eFUtijUuAv/PDlWuag==
-
 file-entry-cache@^6.0.1:
   version "6.0.1"
   resolved "https://registry.npmjs.org/file-entry-cache/-/file-entry-cache-6.0.1.tgz"

From f76a5541e61f48a2d5945297a2f8947df04f820e Mon Sep 17 00:00:00 2001
From: Mike Donnalley <mdonnalley@salesforce.com>
Date: Thu, 24 Jul 2025 12:48:55 -0600
Subject: [PATCH 48/51] ci: debug failures

---
 .github/workflows/test.yml           | 3 ++-
 confidence/src/utils/rate-limiter.ts | 9 +++++++++
 2 files changed, 11 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index 38ab8b07..54d3e037 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -34,7 +34,8 @@ jobs:
 
   # Run the confidence tests after the unit tests
   confidence-tests:
-    needs: [linux-unit-tests, changes]
+    # needs: [linux-unit-tests, changes]
+    needs: [changes]
     runs-on: ubuntu-latest
     if: ${{ needs.changes.outputs.confidence-changed == 'true'}}
     env:
diff --git a/confidence/src/utils/rate-limiter.ts b/confidence/src/utils/rate-limiter.ts
index cccf7ffb..e3d007f0 100644
--- a/confidence/src/utils/rate-limiter.ts
+++ b/confidence/src/utils/rate-limiter.ts
@@ -259,6 +259,15 @@ export class RateLimiter {
         return;
       } catch (error) {
         lastError = error as Error;
+        // eslint-disable-next-line no-console
+        console.error(`Error executing request: ${lastError.message}`, {
+          attempt,
+          status: lastError instanceof RateLimitError ? lastError.status : undefined,
+          retryAfter: lastError instanceof RateLimitError ? lastError.retryAfter : undefined,
+        });
+        // eslint-disable-next-line no-console
+        console.error('Full error details:', lastError);
+
         if (attempt < this.retryConfig.maxRetries && RateLimiter.isRetryableError(error)) {
           this.recordRetryAttempt();
           const delay = this.calculateRetryDelay(attempt);

From ee9e17f30e34fb2defe5726236ed521fde946d3e Mon Sep 17 00:00:00 2001
From: Mike Donnalley <mdonnalley@salesforce.com>
Date: Thu, 24 Jul 2025 12:51:49 -0600
Subject: [PATCH 49/51] ci: debug failures

---
 .github/workflows/test.yml           | 3 +--
 confidence/src/utils/rate-limiter.ts | 6 ++----
 2 files changed, 3 insertions(+), 6 deletions(-)

diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index 54d3e037..38ab8b07 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -34,8 +34,7 @@ jobs:
 
   # Run the confidence tests after the unit tests
   confidence-tests:
-    # needs: [linux-unit-tests, changes]
-    needs: [changes]
+    needs: [linux-unit-tests, changes]
     runs-on: ubuntu-latest
     if: ${{ needs.changes.outputs.confidence-changed == 'true'}}
     env:
diff --git a/confidence/src/utils/rate-limiter.ts b/confidence/src/utils/rate-limiter.ts
index e3d007f0..c713f8e8 100644
--- a/confidence/src/utils/rate-limiter.ts
+++ b/confidence/src/utils/rate-limiter.ts
@@ -259,14 +259,12 @@ export class RateLimiter {
         return;
       } catch (error) {
         lastError = error as Error;
-        // eslint-disable-next-line no-console
-        console.error(`Error executing request: ${lastError.message}`, {
+        debug(`Error executing request: ${lastError.message}. %O`, {
           attempt,
           status: lastError instanceof RateLimitError ? lastError.status : undefined,
           retryAfter: lastError instanceof RateLimitError ? lastError.retryAfter : undefined,
         });
-        // eslint-disable-next-line no-console
-        console.error('Full error details:', lastError);
+        debug('Full error details: %O', lastError);
 
         if (attempt < this.retryConfig.maxRetries && RateLimiter.isRetryableError(error)) {
           this.recordRetryAttempt();

From 25af4c6682c3aafb9c959ddf9d178f9850f658bb Mon Sep 17 00:00:00 2001
From: Mike Donnalley <mdonnalley@salesforce.com>
Date: Fri, 1 Aug 2025 09:10:59 -0600
Subject: [PATCH 50/51] refactor: use ECA consumer secret and key to auth

---
 .github/workflows/test.yml                 |  4 +-
 DEVELOPING.md                              | 14 +++---
 confidence/src/commands/confidence-test.ts | 21 +++++----
 confidence/src/utils/gateway.ts            | 33 ++++++--------
 confidence/src/utils/jwt.ts                | 50 ++++++++++++++++++++++
 confidence/src/utils/models.ts             | 17 +++-----
 test/confidence/sf-deploy-metadata.yml     |  5 +--
 7 files changed, 93 insertions(+), 51 deletions(-)
 create mode 100644 confidence/src/utils/jwt.ts

diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index 38ab8b07..f08d064b 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -38,7 +38,9 @@ jobs:
     runs-on: ubuntu-latest
     if: ${{ needs.changes.outputs.confidence-changed == 'true'}}
     env:
-      SF_LLMG_API_KEY: ${{ secrets.SF_LLMG_API_KEY }}
+      SF_MCP_CONFIDENCE_CONSUMER_KEY: ${{ secrets.SF_MCP_CONFIDENCE_CONSUMER_KEY }}
+      SF_MCP_CONFIDENCE_CONSUMER_SECRET: ${{ secrets.SF_MCP_CONFIDENCE_CONSUMER_SECRET }}
+      SF_MCP_CONFIDENCE_INSTANCE_URL: ${{ secrets.SF_MCP_CONFIDENCE_INSTANCE_URL }}
     steps:
       - uses: actions/checkout@v4
       - uses: actions/setup-node@v4
diff --git a/DEVELOPING.md b/DEVELOPING.md
index 4bba8216..e0343bd2 100644
--- a/DEVELOPING.md
+++ b/DEVELOPING.md
@@ -130,15 +130,17 @@ Confidence tests validate that the MCP server tools are accurately invoked by va
 
 #### Running Confidence Tests Locally
 
-1. **Set up API access**: You'll need the `SF_LLMG_API_KEY` environment variable to access the LLM Gateway API. Follow the setup instructions [here](https://git.soma.salesforce.com/pages/tech-enablement/einstein/docs/gateway/get-started/quickstart/).
-
-2. **Export the API key**:
+1. **Set up API access**: Follow this [documentation](https://developer.salesforce.com/docs/einstein/genai/guide/access-models-api-with-rest.html) to set up an External Client App that will give you access to the Models API. Once you have the consumer key and secret from the External Client App, you'll need to add these to environment variables:
 
    ```shell
-   export SF_LLMG_API_KEY=your_api_key_here
+   export SF_MCP_CONFIDENCE_CONSUMER_KEY=your_client_id_here
+   export SF_MCP_CONFIDENCE_CONSUMER_SECRET=your_client_secret_here
+   export SF_MCP_CONFIDENCE_INSTANCE_URL=https://your_instance.salesforce.com
    ```
 
-3. **Run a specific confidence test**:
+   These environment variables are used to generate a JWT token that will be used to authenticate with the Models API.
+
+2. **Run a confidence test**:
    ```shell
    yarn test:confidence --file test/confidence/sf-deploy-metadata.yml
    ```
@@ -147,7 +149,7 @@ Confidence tests validate that the MCP server tools are accurately invoked by va
 
 Confidence tests are defined in YAML files located in `test/confidence/`. Each test file specifies:
 
-- **Models**: Which LLM models to test against. See LLMGateway documentation for [available models](https://git.soma.salesforce.com/pages/tech-enablement/einstein/docs/gateway/models-and-providers/).
+- **Models**: Which LLM models to test against. See the Agentforce Developer Guide for [available models](https://developer.salesforce.com/docs/einstein/genai/guide/supported-models.html).
 - **Initial Context**: Background information provided to the model
 - **Test Cases**: Natural language utterances with expected tool invocations and confidence thresholds
 
diff --git a/confidence/src/commands/confidence-test.ts b/confidence/src/commands/confidence-test.ts
index d7b6eda7..39735e0b 100644
--- a/confidence/src/commands/confidence-test.ts
+++ b/confidence/src/commands/confidence-test.ts
@@ -23,6 +23,7 @@ import { getToolsList, InvocableTool } from '../utils/tools.js';
 import { TABLE_STYLE } from '../utils/table.js';
 import { readYamlFile } from '../utils/yaml.js';
 import { Model } from '../utils/models.js';
+import { mintJWT } from '../utils/jwt.js';
 
 const Spec = z.object({
   models: z.array(z.custom<Model>()),
@@ -105,7 +106,7 @@ const filterFailingTests = (
     .filter((test) => test !== undefined);
 
 async function compareModelOutputs(
-  apiKey: string,
+  jwtToken: string,
   utterances: string | string[],
   spec: Spec,
   tools: InvocableTool[]
@@ -115,7 +116,7 @@ async function compareModelOutputs(
 }> {
   const models = spec.models;
   const responses = await Promise.all(
-    models.map((model) => makeGatewayRequests(apiKey, castToArray(utterances), model, tools, spec['initial-context']))
+    models.map((model) => makeGatewayRequests(jwtToken, castToArray(utterances), model, tools, spec['initial-context']))
   );
 
   const invocations = responses.reduce<Record<string, Array<{ tool: string; parameters: Record<string, string> }>>>(
@@ -168,7 +169,12 @@ export default class ConfidenceTest extends Command {
 
 Configuration:
 - Uses a YAML file to specify models and test cases
-- Requires SF_LLMG_API_KEY environment variable
+- Requires SF_MCP_CONFIDENCE_CONSUMER_KEY environment variable
+- Requires SF_MCP_CONFIDENCE_CONSUMER_SECRET environment variable
+- Requires SF_MCP_CONFIDENCE_INSTANCE_URL environment variable
+
+At runtime, the SF_MCP_CONFIDENCE_CONSUMER_KEY and SF_MCP_CONFIDENCE_CONSUMER_SECRET are used to generate a JWT token from a External Client App in the production org (SF_MCP_CONFIDENCE_INSTANCE_URL).
+This token is then used to authenticate requests to the LLM Gateway API.
 
 YAML File Format:
 The YAML file should contain:
@@ -204,7 +210,7 @@ tests:
     expected-tool-confidence: 85
 
 For available models, see:
-https://git.soma.salesforce.com/pages/tech-enablement/einstein/docs/gateway/models-and-providers/`;
+https://developer.salesforce.com/docs/einstein/genai/guide/supported-models.html`;
 
   public static flags = {
     file: Flags.file({
@@ -235,10 +241,7 @@ https://git.soma.salesforce.com/pages/tech-enablement/einstein/docs/gateway/mode
   public async run(): Promise<void> {
     const { flags } = await this.parse(ConfidenceTest);
 
-    const apiKey = process.env.SF_LLMG_API_KEY;
-    if (!apiKey) {
-      this.error('SF_LLMG_API_KEY environment variable is not set. Please set it to run this command.');
-    }
+    const jwtToken = await mintJWT();
 
     const spec = Spec.safeParse(await readYamlFile<Spec>(flags.file));
     if (!spec.success) {
@@ -436,7 +439,7 @@ https://git.soma.salesforce.com/pages/tech-enablement/einstein/docs/gateway/mode
           parameters: true,
         });
         return Array.from({ length: flags.runs }, (_, idx) =>
-          compareModelOutputs(apiKey, test.utterances, spec.data, mcpTools).then(({ invocations, tableData }) => {
+          compareModelOutputs(jwtToken, test.utterances, spec.data, mcpTools).then(({ invocations, tableData }) => {
             testResultsByTestCaseKey.set(testCaseKey, [
               ...(testResultsByTestCaseKey.get(testCaseKey) ?? []),
               {
diff --git a/confidence/src/utils/gateway.ts b/confidence/src/utils/gateway.ts
index 857cf545..08929d0a 100644
--- a/confidence/src/utils/gateway.ts
+++ b/confidence/src/utils/gateway.ts
@@ -34,15 +34,11 @@ type GatewayResponse = {
   };
 };
 
-const createRequestHeaders = (apiKey: string): Record<string, string> => ({
-  Authorization: `API_KEY ${apiKey}`,
+const createRequestHeaders = (jwtToken: string): Record<string, string> => ({
+  Authorization: `Bearer ${jwtToken}`,
   'Content-Type': 'application/json',
-  // taken from example in docs. Theoretically we'd have our own after fully onboarding?
-  // https://git.soma.salesforce.com/pages/tech-enablement/einstein/docs/gateway/access/gateway-access/
-  'x-sfdc-core-tenant-id': 'core/prod1/00DDu0000008cuqMAA',
-  // https://git.soma.salesforce.com/einsteingpt/module-llmg-cts-registry/blob/master/docs/features/PLATFORM_C_L_I_M_C_P_TESTS.yml
   'x-sfdc-app-context': 'EinsteinGPT',
-  'x-client-feature-id': 'platform-cli-mcp-tests',
+  'x-client-feature-id': 'ai-platform-models-connected-app',
 });
 
 const createRequestBody = (
@@ -64,20 +60,19 @@ const createRequestBody = (
     },
   });
 
-// We're using a pre-production environment so we currently have the default 40 requests per minute per client-feature-id.
-// See: https://git.soma.salesforce.com/pages/tech-enablement/einstein/docs/gateway/rate-limits/#pre-production-environments
-const rateLimiter = new RateLimiter(40, 60_000);
+// See https://developer.salesforce.com/docs/einstein/genai/guide/models-api-rate-limits.html
+const rateLimiter = new RateLimiter(500, 60_000);
 
 const makeSingleGatewayRequest = async (
-  apiKey: string,
+  jwtToken: string,
   model: Model,
   tools: InvocableTool[],
   messages: Array<{ role: string; content: string }>
 ): Promise<GatewayResponse> => {
   const response = await rateLimiter.enqueue(async () =>
-    fetch('https://bot-svc-llm.sfproxy.einsteintest1.test1-uswest2.aws.sfdc.cl/v1.0/chat/generations', {
+    fetch('https://api.salesforce.com/ai/gpt/v1/chat/generations', {
       method: 'POST',
-      headers: createRequestHeaders(apiKey),
+      headers: createRequestHeaders(jwtToken),
       body: createRequestBody(model, tools, messages),
     })
   );
@@ -86,7 +81,7 @@ const makeSingleGatewayRequest = async (
     // eslint-disable-next-line no-console
     console.error(`Error making request to LLM Gateway API: ${response.status} ${response.statusText}`);
     // eslint-disable-next-line no-console
-    console.error('Response body:', JSON.stringify(await response.json(), null, 2));
+    console.error('Response body:', await response.text());
     throw new Error(`HTTP ${response.status}: ${response.statusText}`);
   }
 
@@ -97,7 +92,7 @@ const makeSingleGatewayRequest = async (
 /**
  * Makes requests to the LLM Gateway API for multiple utterances using the specified model and tools.
  *
- * @param {string} apiKey - API key for authentication with the LLM Gateway
+ * @param {string} jwtToken - JWT token for authentication with the Models API
  * @param {string[]} utterances - Array of utterances to send to the API
  * @param {Model} model - The model identifier to use for generation
  * @param {InvocableTool[]} tools - Array of tools that can be invoked by the model
@@ -105,14 +100,10 @@ const makeSingleGatewayRequest = async (
  * @returns {Promise<{model: Model, messages: Array<{role: string, content: string}>, responses: GatewayResponse[]}>} Object containing the model used, conversation messages, and API responses
  * @throws {Error} If any API request fails or returns an error
  *
- * @see {@link https://git.soma.salesforce.com/pages/tech-enablement/einstein/docs/gateway/get-started/#make-your-first-gateway-request} Make Your First Gateway Request Documentation
- * @see {@link https://git.soma.salesforce.com/pages/tech-enablement/einstein/docs/gateway/models-and-providers/} Models and Providers Documentation
- * @see {@link https://git.soma.salesforce.com/pages/tech-enablement/einstein/docs/gateway/apis/rest/#operation/chatMessages} REST API Documentation
  * @see {@link https://git.soma.salesforce.com/pages/tech-enablement/einstein/docs/gateway/function-calling/} Function Calling Documentation
- * @see {@link https://git.soma.salesforce.com/pages/tech-enablement/einstein/docs/gateway/get-started/auth/#api-key-limitations} API Key Limitations Documentation
  */
 export const makeGatewayRequests = async (
-  apiKey: string,
+  jwtToken: string,
   utterances: string[],
   model: Model,
   tools: InvocableTool[],
@@ -131,7 +122,7 @@ export const makeGatewayRequests = async (
     });
 
     // eslint-disable-next-line no-await-in-loop
-    const responseData = await makeSingleGatewayRequest(apiKey, model, tools, messages);
+    const responseData = await makeSingleGatewayRequest(jwtToken, model, tools, messages);
     responses.push(responseData);
 
     // Add the assistant's response to messages for the next iteration
diff --git a/confidence/src/utils/jwt.ts b/confidence/src/utils/jwt.ts
new file mode 100644
index 00000000..7f9132aa
--- /dev/null
+++ b/confidence/src/utils/jwt.ts
@@ -0,0 +1,66 @@
+/*
+ * Copyright 2025, Salesforce, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Mints an access token for the confidence tests using the OAuth 2.0 client credentials flow.
+ *
+ * Reads the consumer key, consumer secret, and instance URL from the
+ * `SF_MCP_CONFIDENCE_CONSUMER_KEY`, `SF_MCP_CONFIDENCE_CONSUMER_SECRET`, and
+ * `SF_MCP_CONFIDENCE_INSTANCE_URL` environment variables, then POSTs them to
+ * `<instance url>/services/oauth2/token`.
+ *
+ * @returns The `access_token` value from the token endpoint's JSON response.
+ * @throws {Error} If any of the environment variables is unset or empty, if the token endpoint
+ * responds with a non-2xx status, or if the response has no `access_token`.
+ */
+export const mintJWT = async (): Promise<string> => {
+  const consumerKey = process.env.SF_MCP_CONFIDENCE_CONSUMER_KEY;
+  const consumerSecret = process.env.SF_MCP_CONFIDENCE_CONSUMER_SECRET;
+  const instanceUrl = process.env.SF_MCP_CONFIDENCE_INSTANCE_URL;
+
+  if (!consumerKey || !consumerSecret || !instanceUrl) {
+    throw new Error(
+      'Missing required environment variables: SF_MCP_CONFIDENCE_CONSUMER_KEY, SF_MCP_CONFIDENCE_CONSUMER_SECRET, or SF_MCP_CONFIDENCE_INSTANCE_URL'
+    );
+  }
+
+  // Drop trailing slashes so an instance URL ending in "/" does not produce "//services/oauth2/token".
+  const tokenUrl = `${instanceUrl.replace(/\/+$/, '')}/services/oauth2/token`;
+
+  const response = await fetch(tokenUrl, {
+    method: 'POST',
+    headers: {
+      'Content-Type': 'application/x-www-form-urlencoded',
+    },
+    body: new URLSearchParams({
+      grant_type: 'client_credentials',
+      client_id: consumerKey,
+      client_secret: consumerSecret,
+    }),
+  });
+
+  if (!response.ok) {
+    // Include the numeric status: statusText can be empty, which would leave the message without any detail.
+    throw new Error(`Failed to mint JWT: ${response.status} ${response.statusText}`);
+  }
+
+  const data = (await response.json()) as { access_token?: string };
+  if (!data.access_token) {
+    throw new Error('Failed to retrieve access token from response');
+  }
+
+  return data.access_token;
+};
diff --git a/confidence/src/utils/models.ts b/confidence/src/utils/models.ts
index dc99281e..c1cf4de0 100644
--- a/confidence/src/utils/models.ts
+++ b/confidence/src/utils/models.ts
@@ -14,17 +14,14 @@
  * limitations under the License.
  */
 
-// See https://git.soma.salesforce.com/pages/tech-enablement/einstein/docs/gateway/models-and-providers/
+// https://developer.salesforce.com/docs/einstein/genai/guide/supported-models.html
 export const MODELS = [
-  'llmgateway__OpenAIGPT35Turbo_01_25',
-  'llmgateway__OpenAIGPT4OmniMini',
-  'llmgateway__BedrockAnthropicClaude4Sonnet',
-  'llmgateway__OpenAIGPT41Nano',
-  'llmgateway__OpenAIGPT41Mini',
-  'llmgateway__BedrockAnthropicClaude37Sonnet',
-  'llmgateway__BedrockAnthropicClaude3Opus',
-  'llmgateway__VertexAIGemini25Flash001',
+  'sfdc_ai__DefaultBedrockAnthropicClaude37Sonnet',
+  'sfdc_ai__DefaultOpenAIGPT35Turbo',
+  'sfdc_ai__DefaultGPT41Mini',
+  'sfdc_ai__DefaultBedrockAnthropicClaude4Sonnet',
+  'sfdc_ai__DefaultOpenAIGPT4OmniMini',
+  'sfdc_ai__DefaultVertexAIGeminiPro25',
 ] as const;
 
 export type Model = (typeof MODELS)[number];
-export const DEFAULT_MODEL: Model = 'llmgateway__BedrockAnthropicClaude4Sonnet';
diff --git a/test/confidence/sf-deploy-metadata.yml b/test/confidence/sf-deploy-metadata.yml
index b5267efa..716fbd80 100644
--- a/test/confidence/sf-deploy-metadata.yml
+++ b/test/confidence/sf-deploy-metadata.yml
@@ -1,8 +1,5 @@
 models:
-  # - llmgateway__OpenAIGPT35Turbo_01_25
-  # - llmgateway__OpenAIGPT4OmniMini
-  # - llmgateway__OpenAIGPT41Nano
-  - llmgateway__BedrockAnthropicClaude4Sonnet
+  - sfdc_ai__DefaultBedrockAnthropicClaude4Sonnet
 
 initial-context:
   - 'My current OS is macos. I am working in a workspace with the following folders: /Users/sf-dev/dreamhouse-lwc

From 52a8440621f13f05cc9a2146e92e581d449b02b2 Mon Sep 17 00:00:00 2001
From: Mike Donnalley <mdonnalley@salesforce.com>
Date: Fri, 1 Aug 2025 10:14:50 -0600
Subject: [PATCH 51/51] chore: bump oclif/table

---
 package.json | 2 +-
 yarn.lock    | 8 ++++----
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/package.json b/package.json
index c8aeccf3..f9dd1278 100644
--- a/package.json
+++ b/package.json
@@ -58,7 +58,7 @@
   },
   "devDependencies": {
     "@modelcontextprotocol/inspector": "^0.15.0",
-    "@oclif/table": "^0.4.9",
+    "@oclif/table": "^0.4.11",
     "@salesforce/cli-plugins-testkit": "^5.3.39",
     "@salesforce/dev-scripts": "11.0.2",
     "@types/debug": "^4.1.12",
diff --git a/yarn.lock b/yarn.lock
index 5a9a2b72..6e63ccef 100644
--- a/yarn.lock
+++ b/yarn.lock
@@ -1957,10 +1957,10 @@
     lodash "^4.17.21"
     registry-auth-token "^5.1.0"
 
-"@oclif/table@^0.4.9":
-  version "0.4.9"
-  resolved "https://registry.yarnpkg.com/@oclif/table/-/table-0.4.9.tgz#bf1057e523d948aad8578d4bb721009589bed1b4"
-  integrity sha512-j6M16G2qXhQCZ3e6TffYmJgBdl0sha0/P1X8xpZpaXMvNHE7nWGGvScUACwvMn64XoSLHzLC9yEcaI5IpH0kYg==
+"@oclif/table@^0.4.11":
+  version "0.4.11"
+  resolved "https://registry.yarnpkg.com/@oclif/table/-/table-0.4.11.tgz#5c6ebcc85554678924099e659ade5b60c0a59bd9"
+  integrity sha512-HKvX4YqabHYrt3juFOldmMedLiHRLBoxO3JnhVQxOCdq7Jr5HP7GM6nrHww4mNSA+Jrc7WhbmSu2GenrHnTOvQ==
   dependencies:
     "@types/react" "^18.3.12"
     change-case "^5.4.4"