From fdfc4253fcff0293a9c345619ff2b3b8dd12d6ad Mon Sep 17 00:00:00 2001 From: jsonbailey Date: Thu, 16 Apr 2026 09:43:33 -0500 Subject: [PATCH 1/8] feat!: Flatten JudgeResponse and EvalScore into new LDJudgeResult Co-Authored-By: Claude Opus 4.6 --- .../sdk/server-ai/__tests__/Judge.test.ts | 137 +++++++++++------- .../__tests__/LDAIConfigTrackerImpl.test.ts | 64 ++++---- .../__tests__/LDGraphTrackerImpl.test.ts | 69 ++++++--- .../examples/direct-judge/src/index.ts | 8 +- .../server-ai/src/LDAIConfigTrackerImpl.ts | 23 ++- .../sdk/server-ai/src/LDGraphTrackerImpl.ts | 22 ++- .../sdk/server-ai/src/api/chat/TrackedChat.ts | 22 ++- packages/sdk/server-ai/src/api/chat/types.ts | 4 +- .../src/api/config/LDAIConfigTracker.ts | 16 +- .../server-ai/src/api/graph/LDGraphTracker.ts | 10 +- packages/sdk/server-ai/src/api/judge/Judge.ts | 64 ++++---- packages/sdk/server-ai/src/api/judge/index.ts | 2 +- packages/sdk/server-ai/src/api/judge/types.ts | 28 ++-- 13 files changed, 273 insertions(+), 196 deletions(-) diff --git a/packages/sdk/server-ai/__tests__/Judge.test.ts b/packages/sdk/server-ai/__tests__/Judge.test.ts index c0def31740..a2d1aef6af 100644 --- a/packages/sdk/server-ai/__tests__/Judge.test.ts +++ b/packages/sdk/server-ai/__tests__/Judge.test.ts @@ -98,13 +98,11 @@ describe('Judge', () => { ); expect(result).toEqual({ - evals: { - relevance: { - score: 0.8, - reasoning: 'The response is relevant to the question', - }, - }, + score: 0.8, + reasoning: 'The response is relevant to the question', + metricKey: 'relevance', success: true, + sampled: true, judgeConfigKey: 'test-judge', }); @@ -148,12 +146,11 @@ describe('Judge', () => { const result = await judge.evaluate('test input', 'test output'); expect(result).toBeDefined(); - expect(result?.evals).toHaveProperty('relevance'); - expect(result?.evals.relevance.score).toBe(0.85); - expect(result?.judgeConfigKey).toBe('test-judge'); - expect(result?.success).toBe(true); - // Verify the evaluationMetricKey from config is used in the result - expect(Object.keys(result?.evals || {})).toContain(judgeConfig.evaluationMetricKey); + expect(result.score).toBe(0.85); + expect(result.metricKey).toBe('relevance'); + expect(result.judgeConfigKey).toBe('test-judge'); + expect(result.success).toBe(true); + expect(result.sampled).toBe(true); }); it('handles sampling rate correctly', async () => { @@ -183,18 +180,23 @@ describe('Judge', () => { const result = await judge.evaluate('test input', 'test output', 0.5); expect(result).toBeDefined(); + expect(result.sampled).toBe(true); expect(mockProvider.invokeStructuredModel).toHaveBeenCalled(); Math.random = originalRandom; }); - it('returns undefined when not sampled', async () => { + it('returns unsampled result when skipped by sampling', async () => { const originalRandom = Math.random; Math.random = jest.fn().mockReturnValue(0.8); const result = await judge.evaluate('test input', 'test output', 0.5); - expect(result).toBeUndefined(); + expect(result).toEqual({ + success: false, + sampled: false, + judgeConfigKey: 'test-judge', + }); expect(mockProvider.invokeStructuredModel).not.toHaveBeenCalled(); expect(mockLogger.debug).toHaveBeenCalledWith( 'Judge evaluation skipped due to sampling rate: 0.5', @@ -203,7 +205,7 @@ describe('Judge', () => { Math.random = originalRandom; }); - it('returns undefined when evaluationMetricKey and evaluationMetricKeys are both missing', async () => { + it('returns error result when evaluationMetricKey and evaluationMetricKeys are both missing', async () => { const configWithoutMetrics: LDAIJudgeConfig = { ...judgeConfig, evaluationMetricKey: undefined, @@ -213,7 +215,12 @@ describe('Judge', () => { const result = await judgeWithoutMetrics.evaluate('test input', 'test output'); - expect(result).toBeUndefined(); + expect(result).toEqual({ + success: false, + sampled: true, + errorMessage: 'Judge configuration is missing required evaluation metric key', + judgeConfigKey: 'test-judge', + }); expect(mockLogger.warn).toHaveBeenCalledWith( 'Judge configuration is missing required evaluation metric key', mockTrackData, @@ -251,10 +258,11 @@ describe('Judge', () => { const result = await judgeWithSingleKey.evaluate('test input', 'test output'); expect(result).toEqual({ - evals: { - relevance: { score: 0.8, reasoning: 'The response is relevant' }, - }, + score: 0.8, + reasoning: 'The response is relevant', + metricKey: 'relevance', success: true, + sampled: true, judgeConfigKey: 'test-judge', }); }); @@ -290,10 +298,11 @@ describe('Judge', () => { const result = await judgeWithLegacyKeys.evaluate('test input', 'test output'); expect(result).toEqual({ - evals: { - relevance: { score: 0.8, reasoning: 'The response is relevant' }, - }, + score: 0.8, + reasoning: 'The response is relevant', + metricKey: 'relevance', success: true, + sampled: true, judgeConfigKey: 'test-judge', }); }); @@ -330,10 +339,11 @@ describe('Judge', () => { // Should skip empty and whitespace strings, use first valid value expect(result).toEqual({ - evals: { - relevance: { score: 0.8, reasoning: 'The response is relevant' }, - }, + score: 0.8, + reasoning: 'The response is relevant', + metricKey: 'relevance', success: true, + sampled: true, judgeConfigKey: 'test-judge', }); }); @@ -369,15 +379,16 @@ describe('Judge', () => { const result = await judgeWithBoth.evaluate('test input', 'test output'); expect(result).toEqual({ - evals: { - helpfulness: { score: 0.7, reasoning: 'The response is helpful' }, - }, + score: 0.7, + reasoning: 'The response is helpful', + metricKey: 'helpfulness', success: true, + sampled: true, judgeConfigKey: 'test-judge', }); }); - it('returns undefined when messages are missing', async () => { + it('returns error result when messages are missing', async () => { const configWithoutMessages: LDAIJudgeConfig = { ...judgeConfig, messages: undefined, @@ -386,14 +397,19 @@ describe('Judge', () => { const result = await judgeWithoutMessages.evaluate('test input', 'test output'); - expect(result).toBeUndefined(); + expect(result).toEqual({ + success: false, + sampled: true, + errorMessage: 'Judge configuration must include messages', + judgeConfigKey: 'test-judge', + }); expect(mockLogger.warn).toHaveBeenCalledWith( 'Judge configuration must include messages', mockTrackData, ); }); - it('returns empty evaluations with success false when expected metric is missing', async () => { + it('returns result with success false when expected metric is missing', async () => { const mockStructuredResponse: StructuredResponse = { data: { evaluations: { @@ -417,13 +433,13 @@ describe('Judge', () => { const result = await judge.evaluate('test input', 'test output'); expect(result).toEqual({ - evals: {}, success: false, + sampled: true, judgeConfigKey: 'test-judge', }); }); - it('returns empty evaluations when response structure is malformed', async () => { + it('returns result with success false when response structure is malformed', async () => { const mockStructuredResponse: StructuredResponse = { data: { relevance: { score: 0.8, reasoning: 'Good' }, @@ -447,8 +463,8 @@ describe('Judge', () => { const result = await judge.evaluate('test input', 'test output'); expect(result).toEqual({ - evals: {}, success: false, + sampled: true, judgeConfigKey: 'test-judge', }); }); @@ -460,9 +476,9 @@ describe('Judge', () => { const result = await judge.evaluate('test input', 'test output'); expect(result).toEqual({ - evals: {}, success: false, - error: 'Provider error', + sampled: true, + errorMessage: 'Provider error', judgeConfigKey: 'test-judge', }); expect(mockLogger.error).toHaveBeenCalledWith('Judge evaluation failed:', error); @@ -474,9 +490,9 @@ describe('Judge', () => { const result = await judge.evaluate('test input', 'test output'); expect(result).toEqual({ - evals: {}, success: false, - error: 'Unknown error', + sampled: true, + errorMessage: 'Unknown error', judgeConfigKey: 'test-judge', }); }); @@ -522,13 +538,11 @@ describe('Judge', () => { const result = await judge.evaluateMessages(messages, response); expect(result).toEqual({ - evals: { - relevance: { - score: 0.8, - reasoning: 'The response is relevant to the question', - }, - }, + score: 0.8, + reasoning: 'The response is relevant to the question', + metricKey: 'relevance', success: true, + sampled: true, judgeConfigKey: 'test-judge', }); @@ -560,7 +574,11 @@ describe('Judge', () => { const result = await judge.evaluateMessages(messages, response, 0.5); - expect(result).toBeUndefined(); + expect(result).toEqual({ + success: false, + sampled: false, + judgeConfigKey: 'test-judge', + }); expect(mockProvider.invokeStructuredModel).not.toHaveBeenCalled(); Math.random = originalRandom; @@ -611,11 +629,13 @@ describe('Judge', () => { const result = parseResponse(responseData, 'relevance', mockTracker); expect(result).toEqual({ - relevance: { score: 0.8, reasoning: 'Good' }, + score: 0.8, + reasoning: 'Good', + metricKey: 'relevance', }); }); - it('returns empty object for invalid response data', () => { + it('returns undefined for invalid response data', () => { // eslint-disable-next-line no-underscore-dangle const parseResponse = (judge as any)._parseEvaluationResponse.bind(judge); const responseData = { @@ -624,7 +644,7 @@ describe('Judge', () => { const result = parseResponse(responseData, 'relevance', mockTracker); - expect(result).toEqual({}); + expect(result).toBeUndefined(); }); it('handles missing score or reasoning fields', () => { @@ -638,7 +658,7 @@ describe('Judge', () => { const result = parseResponse(responseData, 'relevance', mockTracker); - expect(result).toEqual({}); + expect(result).toBeUndefined(); }); it('handles invalid score values out of range', () => { @@ -652,7 +672,7 @@ describe('Judge', () => { const result = parseResponse(responseData, 'relevance', mockTracker); - expect(result).toEqual({}); + expect(result).toBeUndefined(); expect(mockLogger.warn).toHaveBeenCalledWith( expect.stringContaining('Invalid score evaluated for relevance: 1.5'), mockTrackData, @@ -670,7 +690,7 @@ describe('Judge', () => { const result = parseResponse(responseData, 'relevance', mockTracker); - expect(result).toEqual({}); + expect(result).toBeUndefined(); expect(mockLogger.warn).toHaveBeenCalledWith( expect.stringContaining('Invalid score evaluated for relevance: -0.1'), mockTrackData, @@ -688,7 +708,7 @@ describe('Judge', () => { const result = parseResponse(responseData, 'relevance', mockTracker); - expect(result).toEqual({}); + expect(result).toBeUndefined(); expect(mockLogger.warn).toHaveBeenCalledWith( expect.stringContaining('Invalid reasoning evaluated for relevance: 123'), mockTrackData, @@ -706,7 +726,7 @@ describe('Judge', () => { const result = parseResponse(responseData, 'relevance', mockTracker); - expect(result).toEqual({}); + expect(result).toBeUndefined(); expect(mockLogger.warn).toHaveBeenCalledWith( 'Missing evaluation for metric key: relevance', mockTrackData, @@ -723,7 +743,12 @@ describe('Judge', () => { const result = await judgeWithEmptyKeys.evaluate('test input', 'test output'); - expect(result).toBeUndefined(); + expect(result).toEqual({ + success: false, + sampled: true, + errorMessage: 'Judge configuration is missing required evaluation metric key', + judgeConfigKey: 'test-judge', + }); expect(mockLogger.warn).toHaveBeenCalledWith( 'Judge configuration is missing required evaluation metric key', mockTrackData, @@ -741,7 +766,7 @@ describe('Judge', () => { const result = parseResponse(responseData, 'relevance', mockTracker); - expect(result).toEqual({}); + expect(result).toBeUndefined(); expect(mockLogger.warn).toHaveBeenCalledWith( 'Missing evaluation for metric key: relevance', mockTrackData, @@ -759,7 +784,7 @@ describe('Judge', () => { const result = parseResponse(responseData, 'relevance', mockTracker); - expect(result).toEqual({}); + expect(result).toBeUndefined(); expect(mockLogger.warn).toHaveBeenCalledWith( 'Missing evaluation for metric key: relevance', mockTrackData, diff --git a/packages/sdk/server-ai/__tests__/LDAIConfigTrackerImpl.test.ts b/packages/sdk/server-ai/__tests__/LDAIConfigTrackerImpl.test.ts index a4b40b62cb..4263bc3048 100644 --- a/packages/sdk/server-ai/__tests__/LDAIConfigTrackerImpl.test.ts +++ b/packages/sdk/server-ai/__tests__/LDAIConfigTrackerImpl.test.ts @@ -841,8 +841,8 @@ describe('trackMetricsOf', () => { }); }); -describe('trackJudgeResponse', () => { - it('tracks evaluation metric key with score', () => { +describe('trackJudgeResult', () => { + it('tracks metric key with score', () => { const tracker = new LDAIConfigTrackerImpl( mockLdClient, testRunId, @@ -854,15 +854,14 @@ describe('trackJudgeResponse', () => { testContext, ); - const judgeResponse = { + tracker.trackJudgeResult({ judgeConfigKey: 'test-judge', - evals: { - relevance: { score: 0.8, reasoning: 'The response is relevant' }, - }, success: true, - }; - - tracker.trackJudgeResponse(judgeResponse); + sampled: true, + score: 0.8, + reasoning: 'The response is relevant', + metricKey: 'relevance', + }); expect(mockTrack).toHaveBeenCalledWith( 'relevance', @@ -872,7 +871,7 @@ describe('trackJudgeResponse', () => { ); }); - it('tracks multiple evaluation metrics when present', () => { + it('does not track when sampled is false', () => { const tracker = new LDAIConfigTrackerImpl( mockLdClient, testRunId, @@ -884,29 +883,38 @@ describe('trackJudgeResponse', () => { testContext, ); - const judgeResponse = { + tracker.trackJudgeResult({ judgeConfigKey: 'test-judge', - evals: { - relevance: { score: 0.8, reasoning: 'Relevant' }, - accuracy: { score: 0.9, reasoning: 'Accurate' }, - }, - success: true, - }; + success: false, + sampled: false, + score: 0.8, + metricKey: 'relevance', + }); - tracker.trackJudgeResponse(judgeResponse); + expect(mockTrack).not.toHaveBeenCalled(); + }); - expect(mockTrack).toHaveBeenCalledWith( - 'relevance', - testContext, - { ...getExpectedTrackData(), judgeConfigKey: 'test-judge' }, - 0.8, - ); - expect(mockTrack).toHaveBeenCalledWith( - 'accuracy', + it('does not track when success is false', () => { + const tracker = new LDAIConfigTrackerImpl( + mockLdClient, + testRunId, + configKey, + variationKey, + version, + modelName, + providerName, testContext, - { ...getExpectedTrackData(), judgeConfigKey: 'test-judge' }, - 0.9, ); + + tracker.trackJudgeResult({ + judgeConfigKey: 'test-judge', + success: false, + sampled: true, + score: 0.8, + metricKey: 'relevance', + }); + + expect(mockTrack).not.toHaveBeenCalled(); }); }); diff --git a/packages/sdk/server-ai/__tests__/LDGraphTrackerImpl.test.ts b/packages/sdk/server-ai/__tests__/LDGraphTrackerImpl.test.ts index fe42bf4e4d..77af551302 100644 --- a/packages/sdk/server-ai/__tests__/LDGraphTrackerImpl.test.ts +++ b/packages/sdk/server-ai/__tests__/LDGraphTrackerImpl.test.ts @@ -140,7 +140,7 @@ it('tracks path', () => { ); }); -it('tracks judge response', () => { +it('tracks judge result', () => { const tracker = new LDGraphTrackerImpl( mockLdClient, graphKey, @@ -148,15 +148,14 @@ it('tracks judge response', () => { version, testContext, ); - const response = { + tracker.trackJudgeResult({ judgeConfigKey: 'my-judge', - evals: { - relevance: { score: 0.9, reasoning: 'Relevant' }, - accuracy: { score: 0.85, reasoning: 'Accurate' }, - }, success: true, - }; - tracker.trackJudgeResponse(response); + sampled: true, + score: 0.9, + reasoning: 'Relevant', + metricKey: 'relevance', + }); expect(mockTrack).toHaveBeenCalledWith( 'relevance', @@ -164,15 +163,9 @@ it('tracks judge response', () => { { ...getExpectedTrackData(), judgeConfigKey: 'my-judge' }, 0.9, ); - expect(mockTrack).toHaveBeenCalledWith( - 'accuracy', - testContext, - { ...getExpectedTrackData(), judgeConfigKey: 'my-judge' }, - 0.85, - ); }); -it('tracks judge response without judgeConfigKey', () => { +it('tracks judge result without judgeConfigKey', () => { const tracker = new LDGraphTrackerImpl( mockLdClient, graphKey, @@ -180,15 +173,53 @@ it('tracks judge response without judgeConfigKey', () => { version, testContext, ); - const response = { - evals: { relevance: { score: 0.7, reasoning: 'Somewhat relevant' } }, + tracker.trackJudgeResult({ success: true, - }; - tracker.trackJudgeResponse(response); + sampled: true, + score: 0.7, + reasoning: 'Somewhat relevant', + metricKey: 'relevance', + }); expect(mockTrack).toHaveBeenCalledWith('relevance', testContext, getExpectedTrackData(), 0.7); }); +it('does not track judge result when not sampled', () => { + const tracker = new LDGraphTrackerImpl( + mockLdClient, + graphKey, + variationKey, + version, + testContext, + ); + tracker.trackJudgeResult({ + judgeConfigKey: 'my-judge', + success: false, + sampled: false, + }); + + expect(mockTrack).not.toHaveBeenCalled(); +}); + +it('does not track judge result when success is false', () => { + const tracker = new LDGraphTrackerImpl( + mockLdClient, + graphKey, + variationKey, + version, + testContext, + ); + tracker.trackJudgeResult({ + judgeConfigKey: 'my-judge', + success: false, + sampled: true, + score: 0.9, + metricKey: 'relevance', + }); + + expect(mockTrack).not.toHaveBeenCalled(); +}); + it('tracks redirect', () => { const tracker = new LDGraphTrackerImpl( mockLdClient, diff --git a/packages/sdk/server-ai/examples/direct-judge/src/index.ts b/packages/sdk/server-ai/examples/direct-judge/src/index.ts index 349b72f1a9..0be897e32c 100644 --- a/packages/sdk/server-ai/examples/direct-judge/src/index.ts +++ b/packages/sdk/server-ai/examples/direct-judge/src/index.ts @@ -65,13 +65,13 @@ async function main() { console.log('Input:', input); console.log('Output:', output); - const judgeResponse = await judge.evaluate(input, output); + const judgeResult = await judge.evaluate(input, output); - // Track the judge evaluation scores on the tracker for the aiConfig you are evaluating. + // Track the judge result on the tracker for the aiConfig you are evaluating. // Example: - // aiConfig.tracker.trackEvalScores(judgeResponse?.evals); + // aiConfig.tracker.trackJudgeResult(judgeResult); - console.log('Judge Response:', judgeResponse); + console.log('Judge Result:', judgeResult); console.log('Success.'); } catch (err) { diff --git a/packages/sdk/server-ai/src/LDAIConfigTrackerImpl.ts b/packages/sdk/server-ai/src/LDAIConfigTrackerImpl.ts index d87729c14f..9b332b6983 100644 --- a/packages/sdk/server-ai/src/LDAIConfigTrackerImpl.ts +++ b/packages/sdk/server-ai/src/LDAIConfigTrackerImpl.ts @@ -2,7 +2,7 @@ import { LDContext } from '@launchdarkly/js-server-sdk-common'; import { LDAIConfigTracker } from './api/config'; import { LDAIMetricSummary } from './api/config/LDAIConfigTracker'; -import { EvalScore, JudgeResponse } from './api/judge/types'; +import { LDJudgeResult } from './api/judge/types'; import { createBedrockTokenUsage, createOpenAiUsage, @@ -119,21 +119,18 @@ export class LDAIConfigTrackerImpl implements LDAIConfigTracker { ); } - trackEvalScores(scores: Record) { - Object.entries(scores).forEach(([metricKey, evalScore]) => { - this._ldClient.track(metricKey, this._context, this.getTrackData(), evalScore.score); - }); - } - - trackJudgeResponse(response: JudgeResponse) { - Object.entries(response.evals).forEach(([metricKey, evalScore]) => { + trackJudgeResult(result: LDJudgeResult, graphKey?: string) { + if (!result.sampled || !result.success) { + return; + } + if (result.metricKey !== undefined && result.score !== undefined) { this._ldClient.track( - metricKey, + result.metricKey, this._context, - { ...this.getTrackData(), judgeConfigKey: response.judgeConfigKey }, - evalScore.score, + { ...this.getTrackData(graphKey), judgeConfigKey: result.judgeConfigKey }, + result.score, ); - }); + } } trackToolCall(toolKey: string): void { diff --git a/packages/sdk/server-ai/src/LDGraphTrackerImpl.ts b/packages/sdk/server-ai/src/LDGraphTrackerImpl.ts index 4c08e26a58..6a9e59db0c 100644 --- a/packages/sdk/server-ai/src/LDGraphTrackerImpl.ts +++ b/packages/sdk/server-ai/src/LDGraphTrackerImpl.ts @@ -1,7 +1,7 @@ import { LDContext } from '@launchdarkly/js-server-sdk-common'; import { LDGraphMetricSummary, LDGraphTracker } from './api/graph/LDGraphTracker'; -import { JudgeResponse } from './api/judge/types'; +import { LDJudgeResult } from './api/judge/types'; import { LDTokenUsage } from './api/metrics'; import { LDClientMin } from './LDClientMin'; @@ -76,14 +76,20 @@ export class LDGraphTrackerImpl implements LDGraphTracker { this._ldClient.track('$ld:ai:graph:path', this._context, { ...this.getTrackData(), path }, 1); } - trackJudgeResponse(response: JudgeResponse): void { - const trackData = response.judgeConfigKey - ? { ...this.getTrackData(), judgeConfigKey: response.judgeConfigKey } - : this.getTrackData(); + trackJudgeResult(result: LDJudgeResult): void { + if (!result.sampled) { + return; + } + if (!result.success) { + return; + } + if (result.metricKey !== undefined && result.score !== undefined) { + const trackData = result.judgeConfigKey + ? { ...this.getTrackData(), judgeConfigKey: result.judgeConfigKey } + : this.getTrackData(); - Object.entries(response.evals).forEach(([metricKey, evalScore]) => { - this._ldClient.track(metricKey, this._context, trackData, evalScore.score); - }); + this._ldClient.track(result.metricKey, this._context, trackData, result.score); + } } trackRedirect(sourceKey: string, redirectedTarget: string): void { diff --git a/packages/sdk/server-ai/src/api/chat/TrackedChat.ts b/packages/sdk/server-ai/src/api/chat/TrackedChat.ts index 054969dc3d..96ba768a22 100644 --- a/packages/sdk/server-ai/src/api/chat/TrackedChat.ts +++ b/packages/sdk/server-ai/src/api/chat/TrackedChat.ts @@ -2,7 +2,7 @@ import { LDLogger } from '@launchdarkly/js-server-sdk-common'; import { LDAICompletionConfig, LDMessage } from '../config/types'; import { Judge } from '../judge/Judge'; -import { JudgeResponse } from '../judge/types'; +import { LDJudgeResult } from '../judge/types'; import { AIProvider } from '../providers/AIProvider'; import { ChatResponse } from './types'; @@ -54,10 +54,8 @@ export class TrackedChat { ) { response.evaluations = this._evaluateWithJudges(this.messages, response).then( (evaluations) => { - evaluations.forEach((judgeResponse) => { - if (judgeResponse?.success) { - tracker.trackJudgeResponse(judgeResponse); - } + evaluations.forEach((judgeResult) => { + tracker.trackJudgeResult(judgeResult); }); return evaluations; }, @@ -79,7 +77,7 @@ export class TrackedChat { private async _evaluateWithJudges( messages: LDMessage[], response: ChatResponse, - ): Promise> { + ): Promise { const judgeConfigs = this.aiConfig.judgeConfiguration!.judges; // Start all judge evaluations in parallel @@ -89,7 +87,11 @@ export class TrackedChat { this._logger?.warn( `Judge configuration is not enabled for ${judgeConfig.key} in ${this.aiConfig.key}`, ); - return undefined; + return { + success: false, + sampled: true, + errorMessage: `Judge configuration is not enabled for ${judgeConfig.key}`, + } as LDJudgeResult; } return judge.evaluateMessages(messages, response, judgeConfig.samplingRate); @@ -98,7 +100,11 @@ export class TrackedChat { // ensure all evaluations complete even if some fail const results = await Promise.allSettled(evaluationPromises); - return results.map((result) => (result.status === 'fulfilled' ? result.value : undefined)); + return results.map((result) => + result.status === 'fulfilled' + ? result.value + : { success: false, sampled: true, errorMessage: 'Judge evaluation failed' }, + ); } /** diff --git a/packages/sdk/server-ai/src/api/chat/types.ts b/packages/sdk/server-ai/src/api/chat/types.ts index 5b32109fcf..19173e30f8 100644 --- a/packages/sdk/server-ai/src/api/chat/types.ts +++ b/packages/sdk/server-ai/src/api/chat/types.ts @@ -1,5 +1,5 @@ import { LDMessage } from '../config/types'; -import { JudgeResponse } from '../judge/types'; +import { LDJudgeResult } from '../judge/types'; import { LDAIMetrics } from '../metrics/LDAIMetrics'; /** @@ -20,5 +20,5 @@ export interface ChatResponse { * Promise that resolves to judge evaluation results. * Only present when judges are configured for evaluation. */ - evaluations?: Promise>; + evaluations?: Promise; } diff --git a/packages/sdk/server-ai/src/api/config/LDAIConfigTracker.ts b/packages/sdk/server-ai/src/api/config/LDAIConfigTracker.ts index 883177becb..f10cf662c7 100644 --- a/packages/sdk/server-ai/src/api/config/LDAIConfigTracker.ts +++ b/packages/sdk/server-ai/src/api/config/LDAIConfigTracker.ts @@ -1,4 +1,4 @@ -import { EvalScore, JudgeResponse } from '../judge/types'; +import { LDJudgeResult } from '../judge/types'; import { LDAIMetrics, LDFeedbackKind, LDTokenUsage } from '../metrics'; /** @@ -116,18 +116,14 @@ export interface LDAIConfigTracker { trackTimeToFirstToken(timeToFirstTokenMs: number): void; /** - * Track evaluation scores for multiple metrics. + * Track a judge evaluation result. * - * @param scores Record mapping metric keys to their evaluation scores - */ - trackEvalScores(scores: Record): void; - - /** - * Track a judge response containing evaluation scores and judge configuration key. + * No event is emitted when the result was not sampled (result.sampled is false). * - * @param response Judge response containing evaluation scores and judge configuration key + * @param result Judge result containing score, reasoning, and metadata + * @param graphKey When provided, associates this metric with the specified agent graph key. */ - trackJudgeResponse(response: JudgeResponse): void; + trackJudgeResult(result: LDJudgeResult, graphKey?: string): void; /** * Track a single tool invocation. diff --git a/packages/sdk/server-ai/src/api/graph/LDGraphTracker.ts b/packages/sdk/server-ai/src/api/graph/LDGraphTracker.ts index 94cf30658f..9ce432d1db 100644 --- a/packages/sdk/server-ai/src/api/graph/LDGraphTracker.ts +++ b/packages/sdk/server-ai/src/api/graph/LDGraphTracker.ts @@ -1,4 +1,4 @@ -import { JudgeResponse } from '../judge/types'; +import { LDJudgeResult } from '../judge/types'; import { LDTokenUsage } from '../metrics'; /** @@ -83,11 +83,13 @@ export interface LDGraphTracker { trackPath(path: string[]): void; /** - * Track judge responses for the final graph output. + * Track a judge evaluation result for the final graph output. * - * @param response Judge response containing evaluation scores. + * No event is emitted when the result was not sampled (result.sampled is false). + * + * @param result Judge result containing score, reasoning, and metadata. */ - trackJudgeResponse(response: JudgeResponse): void; + trackJudgeResult(result: LDJudgeResult): void; /** * Track when a node redirects to a different target than originally specified. diff --git a/packages/sdk/server-ai/src/api/judge/Judge.ts b/packages/sdk/server-ai/src/api/judge/Judge.ts index 1bab8d1a12..9fd365819d 100644 --- a/packages/sdk/server-ai/src/api/judge/Judge.ts +++ b/packages/sdk/server-ai/src/api/judge/Judge.ts @@ -7,7 +7,7 @@ import { LDAIConfigTracker } from '../config/LDAIConfigTracker'; import { LDAIJudgeConfig, LDMessage } from '../config/types'; import { AIProvider } from '../providers/AIProvider'; import { EvaluationSchemaBuilder } from './EvaluationSchemaBuilder'; -import { EvalScore, JudgeResponse, StructuredResponse } from './types'; +import { LDJudgeResult, StructuredResponse } from './types'; /** * Judge implementation that handles evaluation functionality and conversation management. @@ -57,13 +57,9 @@ export class Judge { * @param input The input prompt or question that was provided to the AI * @param output The AI-generated response to be evaluated * @param samplingRate Sampling rate (0-1) to determine if evaluation should be processed (defaults to 1) - * @returns Promise that resolves to evaluation results or undefined if not sampled + * @returns Promise that resolves to evaluation results; always returns a result, never undefined */ - async evaluate( - input: string, - output: string, - samplingRate: number = 1, - ): Promise { + async evaluate(input: string, output: string, samplingRate: number = 1): Promise { const tracker = this._aiConfig.createTracker!(); try { const evaluationMetricKey = this._getEvaluationMetricKey(); @@ -72,17 +68,31 @@ export class Judge { 'Judge configuration is missing required evaluation metric key', tracker.getTrackData(), ); - return undefined; + return { + success: false, + sampled: true, + errorMessage: 'Judge configuration is missing required evaluation metric key', + judgeConfigKey: this._aiConfig.key, + }; } if (!this._aiConfig.messages) { this._logger?.warn('Judge configuration must include messages', tracker.getTrackData()); - return undefined; + return { + success: false, + sampled: true, + errorMessage: 'Judge configuration must include messages', + judgeConfigKey: this._aiConfig.key, + }; } if (Math.random() > samplingRate) { this._logger?.debug(`Judge evaluation skipped due to sampling rate: ${samplingRate}`); - return undefined; + return { + success: false, + sampled: false, + judgeConfigKey: this._aiConfig.key, + }; } const messages = this._constructEvaluationMessages(input, output); @@ -94,9 +104,9 @@ export class Judge { let { success } = response.metrics; - const evals = this._parseEvaluationResponse(response.data, evaluationMetricKey, tracker); + const evalResult = this._parseEvaluationResponse(response.data, evaluationMetricKey, tracker); - if (!evals[evaluationMetricKey]) { + if (!evalResult) { this._logger?.warn( 'Judge evaluation did not return the expected evaluation', tracker.getTrackData(), @@ -105,16 +115,17 @@ export class Judge { } return { - evals, - success, + success: evalResult ? success : false, + sampled: true, judgeConfigKey: this._aiConfig.key, + ...(evalResult ?? {}), }; } catch (error) { this._logger?.error('Judge evaluation failed:', error); return { - evals: {}, success: false, - error: error instanceof Error ? error.message : 'Unknown error', + sampled: true, + errorMessage: error instanceof Error ? error.message : 'Unknown error', judgeConfigKey: this._aiConfig.key, }; } @@ -126,13 +137,13 @@ export class Judge { * @param messages Array of messages representing the conversation history * @param response The AI response to be evaluated * @param samplingRatio Sampling ratio (0-1) to determine if evaluation should be processed (defaults to 1) - * @returns Promise that resolves to evaluation results or undefined if not sampled + * @returns Promise that resolves to evaluation results; always returns a result, never undefined */ async evaluateMessages( messages: LDMessage[], response: ChatResponse, samplingRatio: number = 1, - ): Promise { + ): Promise { const input = messages.length === 0 ? '' : messages.map((msg) => msg.content).join('\r\n'); const output = response.message.content; @@ -177,18 +188,18 @@ export class Judge { /** * Parses the structured evaluation response from the AI provider. + * Returns the flat score/reasoning/metricKey fields, or undefined if parsing fails. */ private _parseEvaluationResponse( data: Record, evaluationMetricKey: string, tracker: LDAIConfigTracker, - ): Record { + ): { score: number; reasoning: string; metricKey: string } | undefined { const evaluations = data.evaluations as Record; - const results: Record = {}; if (!data.evaluations || typeof data.evaluations !== 'object') { this._logger?.warn('Invalid response: missing or invalid evaluations object'); - return results; + return undefined; } const evaluation = evaluations[evaluationMetricKey]; @@ -198,7 +209,7 @@ export class Judge { `Missing evaluation for metric key: ${evaluationMetricKey}`, tracker.getTrackData(), ); - return results; + return undefined; } const evalData = evaluation as Record; @@ -208,7 +219,7 @@ export class Judge { `Invalid score evaluated for ${evaluationMetricKey}: ${evalData.score}. Score must be a number between 0 and 1 inclusive`, tracker.getTrackData(), ); - return results; + return undefined; } if (typeof evalData.reasoning !== 'string') { @@ -216,14 +227,13 @@ export class Judge { `Invalid reasoning evaluated for ${evaluationMetricKey}: ${evalData.reasoning}. Reasoning must be a string`, tracker.getTrackData(), ); - return results; + return undefined; } - results[evaluationMetricKey] = { + return { score: evalData.score, reasoning: evalData.reasoning, + metricKey: evaluationMetricKey, }; - - return results; } } diff --git a/packages/sdk/server-ai/src/api/judge/index.ts b/packages/sdk/server-ai/src/api/judge/index.ts index 912ec47fb0..ca86630278 100644 --- a/packages/sdk/server-ai/src/api/judge/index.ts +++ b/packages/sdk/server-ai/src/api/judge/index.ts @@ -1,2 +1,2 @@ export { Judge } from './Judge'; -export type { EvalScore, JudgeResponse, StructuredResponse } from './types'; +export type { LDJudgeResult, StructuredResponse } from './types'; diff --git a/packages/sdk/server-ai/src/api/judge/types.ts b/packages/sdk/server-ai/src/api/judge/types.ts index 68ad141c89..0f172ce4fe 100644 --- a/packages/sdk/server-ai/src/api/judge/types.ts +++ b/packages/sdk/server-ai/src/api/judge/types.ts @@ -17,25 +17,21 @@ export interface StructuredResponse { } /** - * Score and reasoning for a single evaluation metric. + * Flat result from a judge evaluation containing score, reasoning, and metadata. */ -export interface EvalScore { - /** Score between 0.0 and 1.0 indicating the evaluation result for this metric */ - score: number; - /** Reasoning behind the provided score for this metric */ - reasoning: string; -} - -/** - * Response from a judge evaluation containing scores and reasoning for multiple metrics. - */ -export interface JudgeResponse { - /** The key of the judge configuration that was used to generate this response */ +export interface LDJudgeResult { + /** The key of the judge configuration that was used to generate this result */ judgeConfigKey?: string; - /** Dictionary where keys are metric names and values contain score and reasoning */ - evals: Record; /** Whether the evaluation completed successfully */ success: boolean; /** Error message if evaluation failed */ - error?: string; + errorMessage?: string; + /** Whether this evaluation was sampled (i.e. actually run). False when skipped by sampling. */ + sampled: boolean; + /** Score between 0.0 and 1.0 indicating the evaluation result */ + score?: number; + /** Reasoning behind the provided score */ + reasoning?: string; + /** The metric key for this evaluation */ + metricKey?: string; } From b81c5c28994c0ce2fa07e5e2ca587a3c466773b7 Mon Sep 17 00:00:00 2001 From: jsonbailey Date: Thu, 16 Apr 2026 10:01:28 -0500 Subject: [PATCH 2/8] refactor: use defaultJudgeResult factory and remove metricKey from _parseEvaluationResponse Co-Authored-By: Claude Opus 4.6 --- .../sdk/server-ai/__tests__/Judge.test.ts | 1 - .../sdk/server-ai/src/api/chat/TrackedChat.ts | 25 ++++---- packages/sdk/server-ai/src/api/judge/Judge.ts | 62 ++++++++----------- packages/sdk/server-ai/src/api/judge/index.ts | 1 + packages/sdk/server-ai/src/api/judge/types.ts | 10 +++ 5 files changed, 50 insertions(+), 49 deletions(-) diff --git a/packages/sdk/server-ai/__tests__/Judge.test.ts b/packages/sdk/server-ai/__tests__/Judge.test.ts index a2d1aef6af..ee9ff0351f 100644 --- a/packages/sdk/server-ai/__tests__/Judge.test.ts +++ b/packages/sdk/server-ai/__tests__/Judge.test.ts @@ -631,7 +631,6 @@ describe('Judge', () => { expect(result).toEqual({ score: 0.8, reasoning: 'Good', - metricKey: 'relevance', }); }); diff --git a/packages/sdk/server-ai/src/api/chat/TrackedChat.ts b/packages/sdk/server-ai/src/api/chat/TrackedChat.ts index 96ba768a22..5d68ae0e43 100644 --- a/packages/sdk/server-ai/src/api/chat/TrackedChat.ts +++ b/packages/sdk/server-ai/src/api/chat/TrackedChat.ts @@ -2,7 +2,7 @@ import { LDLogger } from '@launchdarkly/js-server-sdk-common'; import { LDAICompletionConfig, LDMessage } from '../config/types'; import { Judge } from '../judge/Judge'; -import { LDJudgeResult } from '../judge/types'; +import { defaultJudgeResult, LDJudgeResult } from '../judge/types'; import { AIProvider } from '../providers/AIProvider'; import { ChatResponse } from './types'; @@ -87,11 +87,10 @@ export class TrackedChat { this._logger?.warn( `Judge configuration is not enabled for ${judgeConfig.key} in ${this.aiConfig.key}`, ); - return { - success: false, - sampled: true, - errorMessage: `Judge configuration is not enabled for ${judgeConfig.key}`, - } as LDJudgeResult; + const result = defaultJudgeResult(); + result.sampled = true; + result.errorMessage = `Judge configuration is not enabled for ${judgeConfig.key}`; + return result; } return judge.evaluateMessages(messages, response, judgeConfig.samplingRate); @@ -100,11 +99,15 @@ export class TrackedChat { // ensure all evaluations complete even if some fail const results = await Promise.allSettled(evaluationPromises); - return results.map((result) => - result.status === 'fulfilled' - ? result.value - : { success: false, sampled: true, errorMessage: 'Judge evaluation failed' }, - ); + return results.map((settled) => { + if (settled.status === 'fulfilled') { + return settled.value; + } + const result = defaultJudgeResult(); + result.sampled = true; + result.errorMessage = 'Judge evaluation failed'; + return result; + }); } /** diff --git a/packages/sdk/server-ai/src/api/judge/Judge.ts b/packages/sdk/server-ai/src/api/judge/Judge.ts index 9fd365819d..17f3acfcb5 100644 --- a/packages/sdk/server-ai/src/api/judge/Judge.ts +++ b/packages/sdk/server-ai/src/api/judge/Judge.ts @@ -7,7 +7,7 @@ import { LDAIConfigTracker } from '../config/LDAIConfigTracker'; import { LDAIJudgeConfig, LDMessage } from '../config/types'; import { AIProvider } from '../providers/AIProvider'; import { EvaluationSchemaBuilder } from './EvaluationSchemaBuilder'; -import { LDJudgeResult, StructuredResponse } from './types'; +import { defaultJudgeResult, LDJudgeResult, StructuredResponse } from './types'; /** * Judge implementation that handles evaluation functionality and conversation management. @@ -60,6 +60,9 @@ export class Judge { * @returns Promise that resolves to evaluation results; always returns a result, never undefined */ async evaluate(input: string, output: string, samplingRate: number = 1): Promise { + const result = defaultJudgeResult(); + result.judgeConfigKey = this._aiConfig.key; + const tracker = this._aiConfig.createTracker!(); try { const evaluationMetricKey = this._getEvaluationMetricKey(); @@ -68,42 +71,32 @@ export class Judge { 'Judge configuration is missing required evaluation metric key', tracker.getTrackData(), ); - return { - success: false, - sampled: true, - errorMessage: 'Judge configuration is missing required evaluation metric key', - judgeConfigKey: this._aiConfig.key, - }; + result.sampled = true; + result.errorMessage = 'Judge configuration is missing required evaluation metric key'; + return result; } if (!this._aiConfig.messages) { this._logger?.warn('Judge configuration must include messages', tracker.getTrackData()); - return { - success: false, - sampled: true, - errorMessage: 'Judge configuration must include messages', - judgeConfigKey: this._aiConfig.key, - }; + result.sampled = true; + result.errorMessage = 'Judge configuration must include messages'; + return result; } if (Math.random() > samplingRate) { this._logger?.debug(`Judge evaluation skipped due to sampling rate: ${samplingRate}`); - return { - success: false, - sampled: false, - judgeConfigKey: this._aiConfig.key, - }; + return result; } + result.sampled = true; + const messages = this._constructEvaluationMessages(input, output); const response = await tracker.trackMetricsOf( - (result: StructuredResponse) => result.metrics, + (r: StructuredResponse) => r.metrics, () => this._aiProvider.invokeStructuredModel(messages, this._evaluationResponseStructure), ); - let { success } = response.metrics; - const evalResult = this._parseEvaluationResponse(response.data, evaluationMetricKey, tracker); if (!evalResult) { @@ -111,23 +104,19 @@ export class Judge { 'Judge evaluation did not return the expected evaluation', tracker.getTrackData(), ); - success = false; + return result; } - return { - success: evalResult ? success : false, - sampled: true, - judgeConfigKey: this._aiConfig.key, - ...(evalResult ?? {}), - }; + result.success = response.metrics.success; + result.score = evalResult.score; + result.reasoning = evalResult.reasoning; + result.metricKey = evaluationMetricKey; + return result; } catch (error) { this._logger?.error('Judge evaluation failed:', error); - return { - success: false, - sampled: true, - errorMessage: error instanceof Error ? error.message : 'Unknown error', - judgeConfigKey: this._aiConfig.key, - }; + result.sampled = true; + result.errorMessage = error instanceof Error ? error.message : 'Unknown error'; + return result; } } @@ -188,13 +177,13 @@ export class Judge { /** * Parses the structured evaluation response from the AI provider. - * Returns the flat score/reasoning/metricKey fields, or undefined if parsing fails. + * Returns score and reasoning, or undefined if parsing fails. */ private _parseEvaluationResponse( data: Record, evaluationMetricKey: string, tracker: LDAIConfigTracker, - ): { score: number; reasoning: string; metricKey: string } | undefined { + ): { score: number; reasoning: string } | undefined { const evaluations = data.evaluations as Record; if (!data.evaluations || typeof data.evaluations !== 'object') { @@ -233,7 +222,6 @@ export class Judge { return { score: evalData.score, reasoning: evalData.reasoning, - metricKey: evaluationMetricKey, }; } } diff --git a/packages/sdk/server-ai/src/api/judge/index.ts b/packages/sdk/server-ai/src/api/judge/index.ts index ca86630278..bc55137d6d 100644 --- a/packages/sdk/server-ai/src/api/judge/index.ts +++ b/packages/sdk/server-ai/src/api/judge/index.ts @@ -1,2 +1,3 @@ export { Judge } from './Judge'; +export { defaultJudgeResult } from './types'; export type { LDJudgeResult, StructuredResponse } from './types'; diff --git a/packages/sdk/server-ai/src/api/judge/types.ts b/packages/sdk/server-ai/src/api/judge/types.ts index 0f172ce4fe..cb11a1cf97 100644 --- a/packages/sdk/server-ai/src/api/judge/types.ts +++ b/packages/sdk/server-ai/src/api/judge/types.ts @@ -35,3 +35,13 @@ export interface LDJudgeResult { /** The metric key for this evaluation */ metricKey?: string; } + +/** + * Creates an LDJudgeResult with default values (success: false, sampled: false). + */ +export function defaultJudgeResult(): LDJudgeResult { + return { + success: false, + sampled: false, + }; +} From 4ee6fbbc0f4a578e5967361f833b8374233457b1 Mon Sep 17 00:00:00 2001 From: jsonbailey Date: Thu, 16 Apr 2026 10:05:27 -0500 Subject: [PATCH 3/8] chore: align LDJudgeResult property order to match spec Co-Authored-By: Claude Opus 4.6 --- packages/sdk/server-ai/src/api/judge/types.ts | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/packages/sdk/server-ai/src/api/judge/types.ts b/packages/sdk/server-ai/src/api/judge/types.ts index cb11a1cf97..58550eef10 100644 --- a/packages/sdk/server-ai/src/api/judge/types.ts +++ b/packages/sdk/server-ai/src/api/judge/types.ts @@ -28,12 +28,12 @@ export interface LDJudgeResult { errorMessage?: string; /** Whether this evaluation was sampled (i.e. actually run). False when skipped by sampling. */ sampled: boolean; + /** The metric key for this evaluation */ + metricKey?: string; /** Score between 0.0 and 1.0 indicating the evaluation result */ score?: number; /** Reasoning behind the provided score */ reasoning?: string; - /** The metric key for this evaluation */ - metricKey?: string; } /** From a6396358821e278286e2bf3e7151a95c866941dd Mon Sep 17 00:00:00 2001 From: jsonbailey Date: Thu, 16 Apr 2026 10:08:43 -0500 Subject: [PATCH 4/8] chore: combine sampled and success guards in trackJudgeResult Co-Authored-By: Claude Opus 4.6 --- packages/sdk/server-ai/src/LDGraphTrackerImpl.ts | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/packages/sdk/server-ai/src/LDGraphTrackerImpl.ts b/packages/sdk/server-ai/src/LDGraphTrackerImpl.ts index 6a9e59db0c..d1f0602f50 100644 --- a/packages/sdk/server-ai/src/LDGraphTrackerImpl.ts +++ b/packages/sdk/server-ai/src/LDGraphTrackerImpl.ts @@ -77,10 +77,7 @@ export class LDGraphTrackerImpl implements LDGraphTracker { } trackJudgeResult(result: LDJudgeResult): void { - if (!result.sampled) { - return; - } - if (!result.success) { + if (!result.sampled || !result.success) { return; } if (result.metricKey !== undefined && result.score !== undefined) { From 3c793770a51fafcdf37e02896b65cb0199fc1132 Mon Sep 17 00:00:00 2001 From: Jason Bailey Date: Thu, 16 Apr 2026 10:10:08 -0500 Subject: [PATCH 5/8] Apply suggestions from code review Co-authored-by: Jason Bailey --- packages/sdk/server-ai/src/api/judge/Judge.ts | 4 ++-- packages/sdk/server-ai/src/api/judge/types.ts | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/packages/sdk/server-ai/src/api/judge/Judge.ts b/packages/sdk/server-ai/src/api/judge/Judge.ts index 17f3acfcb5..7105595a2a 100644 --- a/packages/sdk/server-ai/src/api/judge/Judge.ts +++ b/packages/sdk/server-ai/src/api/judge/Judge.ts @@ -57,7 +57,7 @@ export class Judge { * @param input The input prompt or question that was provided to the AI * @param output The AI-generated response to be evaluated * @param samplingRate Sampling rate (0-1) to determine if evaluation should be processed (defaults to 1) - * @returns Promise that resolves to evaluation results; always returns a result, never undefined + * @returns Promise that resolves to evaluation results */ async evaluate(input: string, output: string, samplingRate: number = 1): Promise { const result = defaultJudgeResult(); @@ -126,7 +126,7 @@ export class Judge { * @param messages Array of messages representing the conversation history * @param response The AI response to be evaluated * @param samplingRatio Sampling ratio (0-1) to determine if evaluation should be processed (defaults to 1) - * @returns Promise that resolves to evaluation results; always returns a result, never undefined + * @returns Promise that resolves to evaluation results */ async evaluateMessages( messages: LDMessage[], diff --git a/packages/sdk/server-ai/src/api/judge/types.ts b/packages/sdk/server-ai/src/api/judge/types.ts index 58550eef10..3c91e883fb 100644 --- a/packages/sdk/server-ai/src/api/judge/types.ts +++ b/packages/sdk/server-ai/src/api/judge/types.ts @@ -17,7 +17,7 @@ export interface StructuredResponse { } /** - * Flat result from a judge evaluation containing score, reasoning, and metadata. + * Result from a judge evaluation containing score, reasoning, and metadata. */ export interface LDJudgeResult { /** The key of the judge configuration that was used to generate this result */ From 574ffd79d724a403e0dbedf27a04ad52882ec69f Mon Sep 17 00:00:00 2001 From: jsonbailey Date: Thu, 16 Apr 2026 10:17:10 -0500 Subject: [PATCH 6/8] fix: remove graphKey param from trackJudgeResult after upstream refactor MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit getTrackData() no longer accepts graphKey — it comes from the constructor. Co-Authored-By: Claude Opus 4.6 --- packages/sdk/server-ai/src/LDAIConfigTrackerImpl.ts | 4 ++-- packages/sdk/server-ai/src/api/config/LDAIConfigTracker.ts | 3 +-- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/packages/sdk/server-ai/src/LDAIConfigTrackerImpl.ts b/packages/sdk/server-ai/src/LDAIConfigTrackerImpl.ts index 9b332b6983..b3ed3ae9f1 100644 --- a/packages/sdk/server-ai/src/LDAIConfigTrackerImpl.ts +++ b/packages/sdk/server-ai/src/LDAIConfigTrackerImpl.ts @@ -119,7 +119,7 @@ export class LDAIConfigTrackerImpl implements LDAIConfigTracker { ); } - trackJudgeResult(result: LDJudgeResult, graphKey?: string) { + trackJudgeResult(result: LDJudgeResult) { if (!result.sampled || !result.success) { return; } @@ -127,7 +127,7 @@ export class LDAIConfigTrackerImpl implements LDAIConfigTracker { this._ldClient.track( result.metricKey, this._context, - { ...this.getTrackData(graphKey), judgeConfigKey: result.judgeConfigKey }, + { ...this.getTrackData(), judgeConfigKey: result.judgeConfigKey }, result.score, ); } diff --git a/packages/sdk/server-ai/src/api/config/LDAIConfigTracker.ts b/packages/sdk/server-ai/src/api/config/LDAIConfigTracker.ts index f10cf662c7..e0aff2c6b5 100644 --- a/packages/sdk/server-ai/src/api/config/LDAIConfigTracker.ts +++ b/packages/sdk/server-ai/src/api/config/LDAIConfigTracker.ts @@ -121,9 +121,8 @@ export interface LDAIConfigTracker { * No event is emitted when the result was not sampled (result.sampled is false). * * @param result Judge result containing score, reasoning, and metadata - * @param graphKey When provided, associates this metric with the specified agent graph key. */ - trackJudgeResult(result: LDJudgeResult, graphKey?: string): void; + trackJudgeResult(result: LDJudgeResult): void; /** * Track a single tool invocation. From b33a85ee6bcbf08749dae2704820e4522febbd73 Mon Sep 17 00:00:00 2001 From: jsonbailey Date: Thu, 16 Apr 2026 11:50:28 -0500 Subject: [PATCH 7/8] refactor: remove defaultJudgeResult factory, use inline object literals Co-Authored-By: Claude Opus 4.6 --- .../sdk/server-ai/src/api/chat/TrackedChat.ts | 18 +++++++++++------- packages/sdk/server-ai/src/api/judge/Judge.ts | 9 ++++++--- packages/sdk/server-ai/src/api/judge/index.ts | 1 - packages/sdk/server-ai/src/api/judge/types.ts | 10 ---------- 4 files changed, 17 insertions(+), 21 deletions(-) diff --git a/packages/sdk/server-ai/src/api/chat/TrackedChat.ts b/packages/sdk/server-ai/src/api/chat/TrackedChat.ts index 5d68ae0e43..2d5b21a85f 100644 --- a/packages/sdk/server-ai/src/api/chat/TrackedChat.ts +++ b/packages/sdk/server-ai/src/api/chat/TrackedChat.ts @@ -2,7 +2,7 @@ import { LDLogger } from '@launchdarkly/js-server-sdk-common'; import { LDAICompletionConfig, LDMessage } from '../config/types'; import { Judge } from '../judge/Judge'; -import { defaultJudgeResult, LDJudgeResult } from '../judge/types'; +import { LDJudgeResult } from '../judge/types'; import { AIProvider } from '../providers/AIProvider'; import { ChatResponse } from './types'; @@ -87,9 +87,11 @@ export class TrackedChat { this._logger?.warn( `Judge configuration is not enabled for ${judgeConfig.key} in ${this.aiConfig.key}`, ); - const result = defaultJudgeResult(); - result.sampled = true; - result.errorMessage = `Judge configuration is not enabled for ${judgeConfig.key}`; + const result: LDJudgeResult = { + success: false, + sampled: true, + errorMessage: `Judge configuration is not enabled for ${judgeConfig.key}`, + }; return result; } @@ -103,9 +105,11 @@ export class TrackedChat { if (settled.status === 'fulfilled') { return settled.value; } - const result = defaultJudgeResult(); - result.sampled = true; - result.errorMessage = 'Judge evaluation failed'; + const result: LDJudgeResult = { + success: false, + sampled: true, + errorMessage: 'Judge evaluation failed', + }; return result; }); } diff --git a/packages/sdk/server-ai/src/api/judge/Judge.ts b/packages/sdk/server-ai/src/api/judge/Judge.ts index 7105595a2a..cb19c6797d 100644 --- a/packages/sdk/server-ai/src/api/judge/Judge.ts +++ b/packages/sdk/server-ai/src/api/judge/Judge.ts @@ -7,7 +7,7 @@ import { LDAIConfigTracker } from '../config/LDAIConfigTracker'; import { LDAIJudgeConfig, LDMessage } from '../config/types'; import { AIProvider } from '../providers/AIProvider'; import { EvaluationSchemaBuilder } from './EvaluationSchemaBuilder'; -import { defaultJudgeResult, LDJudgeResult, StructuredResponse } from './types'; +import { LDJudgeResult, StructuredResponse } from './types'; /** * Judge implementation that handles evaluation functionality and conversation management. @@ -60,8 +60,11 @@ export class Judge { * @returns Promise that resolves to evaluation results */ async evaluate(input: string, output: string, samplingRate: number = 1): Promise { - const result = defaultJudgeResult(); - result.judgeConfigKey = this._aiConfig.key; + const result: LDJudgeResult = { + success: false, + sampled: false, + judgeConfigKey: this._aiConfig.key, + }; const tracker = this._aiConfig.createTracker!(); try { diff --git a/packages/sdk/server-ai/src/api/judge/index.ts b/packages/sdk/server-ai/src/api/judge/index.ts index bc55137d6d..ca86630278 100644 --- a/packages/sdk/server-ai/src/api/judge/index.ts +++ b/packages/sdk/server-ai/src/api/judge/index.ts @@ -1,3 +1,2 @@ export { Judge } from './Judge'; -export { defaultJudgeResult } from './types'; export type { LDJudgeResult, StructuredResponse } from './types'; diff --git a/packages/sdk/server-ai/src/api/judge/types.ts b/packages/sdk/server-ai/src/api/judge/types.ts index 3c91e883fb..b9d8a05a46 100644 --- a/packages/sdk/server-ai/src/api/judge/types.ts +++ b/packages/sdk/server-ai/src/api/judge/types.ts @@ -35,13 +35,3 @@ export interface LDJudgeResult { /** Reasoning behind the provided score */ reasoning?: string; } - -/** - * Creates an LDJudgeResult with default values (success: false, sampled: false). - */ -export function defaultJudgeResult(): LDJudgeResult { - return { - success: false, - sampled: false, - }; -} From 97b1ce8f8ef81d954da58b1f49246fdcd76d6a3b Mon Sep 17 00:00:00 2001 From: jsonbailey Date: Thu, 16 Apr 2026 12:03:22 -0500 Subject: [PATCH 8/8] refactor: use object spread for successful return in Judge.evaluate Co-Authored-By: Claude Opus 4.6 --- packages/sdk/server-ai/src/api/judge/Judge.ts | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/packages/sdk/server-ai/src/api/judge/Judge.ts b/packages/sdk/server-ai/src/api/judge/Judge.ts index cb19c6797d..e36ab138cd 100644 --- a/packages/sdk/server-ai/src/api/judge/Judge.ts +++ b/packages/sdk/server-ai/src/api/judge/Judge.ts @@ -110,11 +110,13 @@ export class Judge { return result; } - result.success = response.metrics.success; - result.score = evalResult.score; - result.reasoning = evalResult.reasoning; - result.metricKey = evaluationMetricKey; - return result; + return { + ...result, + success: response.metrics.success, + score: evalResult.score, + reasoning: evalResult.reasoning, + metricKey: evaluationMetricKey, + }; } catch (error) { this._logger?.error('Judge evaluation failed:', error); result.sampled = true;