diff --git a/packages/sdk/server-ai/__tests__/Judge.test.ts b/packages/sdk/server-ai/__tests__/Judge.test.ts
index c0def31740..ee9ff0351f 100644
--- a/packages/sdk/server-ai/__tests__/Judge.test.ts
+++ b/packages/sdk/server-ai/__tests__/Judge.test.ts
@@ -98,13 +98,11 @@ describe('Judge', () => {
     );
 
     expect(result).toEqual({
-      evals: {
-        relevance: {
-          score: 0.8,
-          reasoning: 'The response is relevant to the question',
-        },
-      },
+      score: 0.8,
+      reasoning: 'The response is relevant to the question',
+      metricKey: 'relevance',
       success: true,
+      sampled: true,
       judgeConfigKey: 'test-judge',
     });
 
@@ -148,12 +146,11 @@ describe('Judge', () => {
     const result = await judge.evaluate('test input', 'test output');
 
     expect(result).toBeDefined();
-    expect(result?.evals).toHaveProperty('relevance');
-    expect(result?.evals.relevance.score).toBe(0.85);
-    expect(result?.judgeConfigKey).toBe('test-judge');
-    expect(result?.success).toBe(true);
-    // Verify the evaluationMetricKey from config is used in the result
-    expect(Object.keys(result?.evals || {})).toContain(judgeConfig.evaluationMetricKey);
+    expect(result.score).toBe(0.85);
+    expect(result.metricKey).toBe('relevance');
+    expect(result.judgeConfigKey).toBe('test-judge');
+    expect(result.success).toBe(true);
+    expect(result.sampled).toBe(true);
   });
 
   it('handles sampling rate correctly', async () => {
@@ -183,18 +180,23 @@ describe('Judge', () => {
     const result = await judge.evaluate('test input', 'test output', 0.5);
 
     expect(result).toBeDefined();
+    expect(result.sampled).toBe(true);
     expect(mockProvider.invokeStructuredModel).toHaveBeenCalled();
 
     Math.random = originalRandom;
   });
 
-  it('returns undefined when not sampled', async () => {
+  it('returns unsampled result when skipped by sampling', async () => {
    const originalRandom = Math.random;
     Math.random = jest.fn().mockReturnValue(0.8);
 
     const result = await judge.evaluate('test input', 'test output', 0.5);
 
-    expect(result).toBeUndefined();
+    expect(result).toEqual({
+      success: false,
+      sampled: false,
+      judgeConfigKey: 'test-judge',
+    });
     expect(mockProvider.invokeStructuredModel).not.toHaveBeenCalled();
     expect(mockLogger.debug).toHaveBeenCalledWith(
       'Judge evaluation skipped due to sampling rate: 0.5',
@@ -203,7 +205,7 @@ describe('Judge', () => {
     Math.random = originalRandom;
   });
 
-  it('returns undefined when evaluationMetricKey and evaluationMetricKeys are both missing', async () => {
+  it('returns error result when evaluationMetricKey and evaluationMetricKeys are both missing', async () => {
     const configWithoutMetrics: LDAIJudgeConfig = {
       ...judgeConfig,
       evaluationMetricKey: undefined,
@@ -213,7 +215,12 @@ describe('Judge', () => {
 
     const result = await judgeWithoutMetrics.evaluate('test input', 'test output');
 
-    expect(result).toBeUndefined();
+    expect(result).toEqual({
+      success: false,
+      sampled: true,
+      errorMessage: 'Judge configuration is missing required evaluation metric key',
+      judgeConfigKey: 'test-judge',
+    });
     expect(mockLogger.warn).toHaveBeenCalledWith(
       'Judge configuration is missing required evaluation metric key',
       mockTrackData,
@@ -251,10 +258,11 @@ describe('Judge', () => {
     const result = await judgeWithSingleKey.evaluate('test input', 'test output');
 
     expect(result).toEqual({
-      evals: {
-        relevance: { score: 0.8, reasoning: 'The response is relevant' },
-      },
+      score: 0.8,
+      reasoning: 'The response is relevant',
+      metricKey: 'relevance',
       success: true,
+      sampled: true,
       judgeConfigKey: 'test-judge',
     });
   });
@@ -290,10 +298,11 @@ describe('Judge', () => {
     const result = await judgeWithLegacyKeys.evaluate('test input', 'test output');
 
     expect(result).toEqual({
-      evals: {
-        relevance: { score: 0.8, reasoning: 'The response is relevant' },
-      },
+      score: 0.8,
+      reasoning: 'The response is relevant',
+      metricKey: 'relevance',
       success: true,
+      sampled: true,
       judgeConfigKey: 'test-judge',
     });
   });
@@ -330,10 +339,11 @@ describe('Judge', () => {
 
     // Should skip empty and whitespace strings, use first valid value
     expect(result).toEqual({
-      evals: {
-        relevance: { score: 0.8, reasoning: 'The response is relevant' },
-      },
+      score: 0.8,
+      reasoning: 'The response is relevant',
+      metricKey: 'relevance',
       success: true,
+      sampled: true,
       judgeConfigKey: 'test-judge',
     });
   });
@@ -369,15 +379,16 @@ describe('Judge', () => {
     const result = await judgeWithBoth.evaluate('test input', 'test output');
 
     expect(result).toEqual({
-      evals: {
-        helpfulness: { score: 0.7, reasoning: 'The response is helpful' },
-      },
+      score: 0.7,
+      reasoning: 'The response is helpful',
+      metricKey: 'helpfulness',
       success: true,
+      sampled: true,
       judgeConfigKey: 'test-judge',
     });
   });
 
-  it('returns undefined when messages are missing', async () => {
+  it('returns error result when messages are missing', async () => {
     const configWithoutMessages: LDAIJudgeConfig = {
       ...judgeConfig,
       messages: undefined,
@@ -386,14 +397,19 @@ describe('Judge', () => {
 
     const result = await judgeWithoutMessages.evaluate('test input', 'test output');
 
-    expect(result).toBeUndefined();
+    expect(result).toEqual({
+      success: false,
+      sampled: true,
+      errorMessage: 'Judge configuration must include messages',
+      judgeConfigKey: 'test-judge',
+    });
     expect(mockLogger.warn).toHaveBeenCalledWith(
       'Judge configuration must include messages',
       mockTrackData,
     );
   });
 
-  it('returns empty evaluations with success false when expected metric is missing', async () => {
+  it('returns result with success false when expected metric is missing', async () => {
     const mockStructuredResponse: StructuredResponse = {
       data: {
         evaluations: {
@@ -417,13 +433,13 @@ describe('Judge', () => {
     const result = await judge.evaluate('test input', 'test output');
 
     expect(result).toEqual({
-      evals: {},
       success: false,
+      sampled: true,
       judgeConfigKey: 'test-judge',
     });
   });
 
-  it('returns empty evaluations when response structure is malformed', async () => {
+  it('returns result with success false when response structure is malformed', async () => {
     const mockStructuredResponse: StructuredResponse = {
       data: {
         relevance: { score: 0.8, reasoning: 'Good' },
@@ -447,8 +463,8 @@ describe('Judge', () => {
     const result = await judge.evaluate('test input', 'test output');
 
     expect(result).toEqual({
-      evals: {},
       success: false,
+      sampled: true,
       judgeConfigKey: 'test-judge',
     });
   });
@@ -460,9 +476,9 @@ describe('Judge', () => {
     const result = await judge.evaluate('test input', 'test output');
 
     expect(result).toEqual({
-      evals: {},
       success: false,
-      error: 'Provider error',
+      sampled: true,
+      errorMessage: 'Provider error',
       judgeConfigKey: 'test-judge',
     });
     expect(mockLogger.error).toHaveBeenCalledWith('Judge evaluation failed:', error);
@@ -474,9 +490,9 @@ describe('Judge', () => {
     const result = await judge.evaluate('test input', 'test output');
 
     expect(result).toEqual({
-      evals: {},
       success: false,
-      error: 'Unknown error',
+      sampled: true,
+      errorMessage: 'Unknown error',
       judgeConfigKey: 'test-judge',
     });
   });
@@ -522,13 +538,11 @@ describe('Judge', () => {
     const result = await judge.evaluateMessages(messages, response);
 
     expect(result).toEqual({
-      evals: {
-        relevance: {
-          score: 0.8,
-          reasoning: 'The response is relevant to the question',
-        },
-      },
+      score: 0.8,
+      reasoning: 'The response is relevant to the question',
+      metricKey: 'relevance',
       success: true,
+      sampled: true,
       judgeConfigKey: 'test-judge',
     });
 
@@ -560,7 +574,11 @@ describe('Judge', () => {
     const result = await judge.evaluateMessages(messages, response, 0.5);
 
-    expect(result).toBeUndefined();
+    expect(result).toEqual({
+      success: false,
+      sampled: false,
+      judgeConfigKey: 'test-judge',
+    });
     expect(mockProvider.invokeStructuredModel).not.toHaveBeenCalled();
 
     Math.random = originalRandom;
@@ -611,11 +629,12 @@ describe('Judge', () => {
     const result = parseResponse(responseData, 'relevance', mockTracker);
 
     expect(result).toEqual({
-      relevance: { score: 0.8, reasoning: 'Good' },
+      score: 0.8,
+      reasoning: 'Good',
     });
   });
 
-  it('returns empty object for invalid response data', () => {
+  it('returns undefined for invalid response data', () => {
     // eslint-disable-next-line no-underscore-dangle
     const parseResponse = (judge as any)._parseEvaluationResponse.bind(judge);
     const responseData = {
@@ -624,7 +643,7 @@ describe('Judge', () => {
 
     const result = parseResponse(responseData, 'relevance', mockTracker);
 
-    expect(result).toEqual({});
+    expect(result).toBeUndefined();
   });
 
   it('handles missing score or reasoning fields', () => {
@@ -638,7 +657,7 @@ describe('Judge', () => {
 
     const result = parseResponse(responseData, 'relevance', mockTracker);
 
-    expect(result).toEqual({});
+    expect(result).toBeUndefined();
   });
 
   it('handles invalid score values out of range', () => {
@@ -652,7 +671,7 @@ describe('Judge', () => {
 
     const result = parseResponse(responseData, 'relevance', mockTracker);
 
-    expect(result).toEqual({});
+    expect(result).toBeUndefined();
     expect(mockLogger.warn).toHaveBeenCalledWith(
       expect.stringContaining('Invalid score evaluated for relevance: 1.5'),
       mockTrackData,
@@ -670,7 +689,7 @@ describe('Judge', () => {
 
     const result = parseResponse(responseData, 'relevance', mockTracker);
 
-    expect(result).toEqual({});
+    expect(result).toBeUndefined();
     expect(mockLogger.warn).toHaveBeenCalledWith(
       expect.stringContaining('Invalid score evaluated for relevance: -0.1'),
       mockTrackData,
@@ -688,7 +707,7 @@ describe('Judge', () => {
 
     const result = parseResponse(responseData, 'relevance', mockTracker);
 
-    expect(result).toEqual({});
+    expect(result).toBeUndefined();
     expect(mockLogger.warn).toHaveBeenCalledWith(
       expect.stringContaining('Invalid reasoning evaluated for relevance: 123'),
       mockTrackData,
@@ -706,7 +725,7 @@ describe('Judge', () => {
 
     const result = parseResponse(responseData, 'relevance', mockTracker);
 
-    expect(result).toEqual({});
+    expect(result).toBeUndefined();
     expect(mockLogger.warn).toHaveBeenCalledWith(
       'Missing evaluation for metric key: relevance',
       mockTrackData,
@@ -723,7 +742,12 @@ describe('Judge', () => {
 
     const result = await judgeWithEmptyKeys.evaluate('test input', 'test output');
 
-    expect(result).toBeUndefined();
+    expect(result).toEqual({
+      success: false,
+      sampled: true,
+      errorMessage: 'Judge configuration is missing required evaluation metric key',
+      judgeConfigKey: 'test-judge',
+    });
     expect(mockLogger.warn).toHaveBeenCalledWith(
       'Judge configuration is missing required evaluation metric key',
       mockTrackData,
@@ -741,7 +765,7 @@ describe('Judge', () => {
 
     const result = parseResponse(responseData, 'relevance', mockTracker);
 
-    expect(result).toEqual({});
+    expect(result).toBeUndefined();
     expect(mockLogger.warn).toHaveBeenCalledWith(
       'Missing evaluation for metric key: relevance',
       mockTrackData,
@@ -759,7 +783,7 @@ describe('Judge', () => {
 
     const result = parseResponse(responseData, 'relevance', mockTracker);
 
-    expect(result).toEqual({});
+    expect(result).toBeUndefined();
     expect(mockLogger.warn).toHaveBeenCalledWith(
       'Missing evaluation for metric key: relevance',
       mockTrackData,
diff --git a/packages/sdk/server-ai/__tests__/LDAIConfigTrackerImpl.test.ts b/packages/sdk/server-ai/__tests__/LDAIConfigTrackerImpl.test.ts
index a4b40b62cb..4263bc3048 100644
--- a/packages/sdk/server-ai/__tests__/LDAIConfigTrackerImpl.test.ts
+++ b/packages/sdk/server-ai/__tests__/LDAIConfigTrackerImpl.test.ts
@@ -841,8 +841,8 @@ describe('trackMetricsOf', () => {
   });
 });
 
-describe('trackJudgeResponse', () => {
-  it('tracks evaluation metric key with score', () => {
+describe('trackJudgeResult', () => {
+  it('tracks metric key with score', () => {
     const tracker = new LDAIConfigTrackerImpl(
       mockLdClient,
       testRunId,
@@ -854,15 +854,14 @@ describe('trackJudgeResponse', () => {
       testContext,
     );
 
-    const judgeResponse = {
+    tracker.trackJudgeResult({
       judgeConfigKey: 'test-judge',
-      evals: {
-        relevance: { score: 0.8, reasoning: 'The response is relevant' },
-      },
       success: true,
-    };
-
-    tracker.trackJudgeResponse(judgeResponse);
+      sampled: true,
+      score: 0.8,
+      reasoning: 'The response is relevant',
+      metricKey: 'relevance',
+    });
 
     expect(mockTrack).toHaveBeenCalledWith(
       'relevance',
@@ -872,7 +871,7 @@ describe('trackJudgeResponse', () => {
     );
   });
 
-  it('tracks multiple evaluation metrics when present', () => {
+  it('does not track when sampled is false', () => {
     const tracker = new LDAIConfigTrackerImpl(
       mockLdClient,
       testRunId,
@@ -884,29 +883,38 @@ describe('trackJudgeResponse', () => {
      configKey,
      variationKey,
      version,
      modelName,
      providerName,
       testContext,
     );
 
-    const judgeResponse = {
+    tracker.trackJudgeResult({
       judgeConfigKey: 'test-judge',
-      evals: {
-        relevance: { score: 0.8, reasoning: 'Relevant' },
-        accuracy: { score: 0.9, reasoning: 'Accurate' },
-      },
-      success: true,
-    };
+      success: false,
+      sampled: false,
+      score: 0.8,
+      metricKey: 'relevance',
+    });
 
-    tracker.trackJudgeResponse(judgeResponse);
+    expect(mockTrack).not.toHaveBeenCalled();
+  });
 
-    expect(mockTrack).toHaveBeenCalledWith(
-      'relevance',
-      testContext,
-      { ...getExpectedTrackData(), judgeConfigKey: 'test-judge' },
-      0.8,
-    );
-    expect(mockTrack).toHaveBeenCalledWith(
-      'accuracy',
+  it('does not track when success is false', () => {
+    const tracker = new LDAIConfigTrackerImpl(
+      mockLdClient,
+      testRunId,
+      configKey,
+      variationKey,
+      version,
+      modelName,
+      providerName,
       testContext,
-      { ...getExpectedTrackData(), judgeConfigKey: 'test-judge' },
-      0.9,
     );
+
+    tracker.trackJudgeResult({
+      judgeConfigKey: 'test-judge',
+      success: false,
+      sampled: true,
+      score: 0.8,
+      metricKey: 'relevance',
+    });
+
+    expect(mockTrack).not.toHaveBeenCalled();
   });
 });
diff --git a/packages/sdk/server-ai/__tests__/LDGraphTrackerImpl.test.ts b/packages/sdk/server-ai/__tests__/LDGraphTrackerImpl.test.ts
index fe42bf4e4d..77af551302 100644
--- a/packages/sdk/server-ai/__tests__/LDGraphTrackerImpl.test.ts
+++ b/packages/sdk/server-ai/__tests__/LDGraphTrackerImpl.test.ts
@@ -140,7 +140,7 @@ it('tracks path', () => {
   );
 });
 
-it('tracks judge response', () => {
+it('tracks judge result', () => {
   const tracker = new LDGraphTrackerImpl(
     mockLdClient,
     graphKey,
@@ -148,15 +148,14 @@ it('tracks judge response', () => {
     version,
     testContext,
   );
-  const response = {
+  tracker.trackJudgeResult({
     judgeConfigKey: 'my-judge',
-    evals: {
-      relevance: { score: 0.9, reasoning: 'Relevant' },
-      accuracy: { score: 0.85, reasoning: 'Accurate' },
-    },
     success: true,
-  };
-  tracker.trackJudgeResponse(response);
+    sampled: true,
+    score: 0.9,
+    reasoning: 'Relevant',
+    metricKey: 'relevance',
+  });
 
   expect(mockTrack).toHaveBeenCalledWith(
     'relevance',
@@ -164,15 +163,9 @@ it('tracks judge result', () => {
     { ...getExpectedTrackData(), judgeConfigKey: 'my-judge' },
     0.9,
   );
-  expect(mockTrack).toHaveBeenCalledWith(
-    'accuracy',
-    testContext,
-    { ...getExpectedTrackData(), judgeConfigKey: 'my-judge' },
-    0.85,
-  );
 });
 
-it('tracks judge response without judgeConfigKey', () => {
+it('tracks judge result without judgeConfigKey', () => {
   const tracker = new LDGraphTrackerImpl(
     mockLdClient,
     graphKey,
@@ -180,15 +173,53 @@ it('tracks judge result without judgeConfigKey', () => {
     version,
     testContext,
   );
-  const response = {
-    evals: { relevance: { score: 0.7, reasoning: 'Somewhat relevant' } },
+  tracker.trackJudgeResult({
     success: true,
-  };
-  tracker.trackJudgeResponse(response);
+    sampled: true,
+    score: 0.7,
+    reasoning: 'Somewhat relevant',
+    metricKey: 'relevance',
+  });
 
   expect(mockTrack).toHaveBeenCalledWith('relevance', testContext, getExpectedTrackData(), 0.7);
 });
 
+it('does not track judge result when not sampled', () => {
+  const tracker = new LDGraphTrackerImpl(
+    mockLdClient,
+    graphKey,
+    variationKey,
+    version,
+    testContext,
+  );
+  tracker.trackJudgeResult({
+    judgeConfigKey: 'my-judge',
+    success: false,
+    sampled: false,
+  });
+
+  expect(mockTrack).not.toHaveBeenCalled();
+});
+
+it('does not track judge result when success is false', () => {
+  const tracker = new LDGraphTrackerImpl(
+    mockLdClient,
+    graphKey,
+    variationKey,
+    version,
+    testContext,
+  );
+  tracker.trackJudgeResult({
+    judgeConfigKey: 'my-judge',
+    success: false,
+    sampled: true,
+    score: 0.9,
+    metricKey: 'relevance',
+  });
+
+  expect(mockTrack).not.toHaveBeenCalled();
+});
+
 it('tracks redirect', () => {
   const tracker = new LDGraphTrackerImpl(
     mockLdClient,
diff --git a/packages/sdk/server-ai/examples/direct-judge/src/index.ts b/packages/sdk/server-ai/examples/direct-judge/src/index.ts
index 349b72f1a9..0be897e32c 100644
--- a/packages/sdk/server-ai/examples/direct-judge/src/index.ts
+++ b/packages/sdk/server-ai/examples/direct-judge/src/index.ts
@@ -65,13 +65,13 @@ async function main() {
     console.log('Input:', input);
     console.log('Output:', output);
 
-    const judgeResponse = await judge.evaluate(input, output);
+    const judgeResult = await judge.evaluate(input, output);
 
-    // Track the judge evaluation scores on the tracker for the aiConfig you are evaluating.
+    // Track the judge result on the tracker for the aiConfig you are evaluating.
     // Example:
-    // aiConfig.tracker.trackEvalScores(judgeResponse?.evals);
+    // aiConfig.tracker.trackJudgeResult(judgeResult);
 
-    console.log('Judge Response:', judgeResponse);
+    console.log('Judge Result:', judgeResult);
 
     console.log('Success.');
   } catch (err) {
diff --git a/packages/sdk/server-ai/src/LDAIConfigTrackerImpl.ts b/packages/sdk/server-ai/src/LDAIConfigTrackerImpl.ts
index d87729c14f..b3ed3ae9f1 100644
--- a/packages/sdk/server-ai/src/LDAIConfigTrackerImpl.ts
+++ b/packages/sdk/server-ai/src/LDAIConfigTrackerImpl.ts
@@ -2,7 +2,7 @@ import { LDContext } from '@launchdarkly/js-server-sdk-common';
 
 import { LDAIConfigTracker } from './api/config';
 import { LDAIMetricSummary } from './api/config/LDAIConfigTracker';
-import { EvalScore, JudgeResponse } from './api/judge/types';
+import { LDJudgeResult } from './api/judge/types';
 import {
   createBedrockTokenUsage,
   createOpenAiUsage,
@@ -119,21 +119,18 @@ export class LDAIConfigTrackerImpl implements LDAIConfigTracker {
     );
   }
 
-  trackEvalScores(scores: Record<string, EvalScore>) {
-    Object.entries(scores).forEach(([metricKey, evalScore]) => {
-      this._ldClient.track(metricKey, this._context, this.getTrackData(), evalScore.score);
-    });
-  }
-
-  trackJudgeResponse(response: JudgeResponse) {
-    Object.entries(response.evals).forEach(([metricKey, evalScore]) => {
+  trackJudgeResult(result: LDJudgeResult) {
+    if (!result.sampled || !result.success) {
+      return;
+    }
+    if (result.metricKey !== undefined && result.score !== undefined) {
       this._ldClient.track(
-        metricKey,
+        result.metricKey,
         this._context,
-        { ...this.getTrackData(), judgeConfigKey: response.judgeConfigKey },
-        evalScore.score,
+        { ...this.getTrackData(), judgeConfigKey: result.judgeConfigKey },
+        result.score,
       );
-    });
+    }
   }
 
   trackToolCall(toolKey: string): void {
diff --git a/packages/sdk/server-ai/src/LDGraphTrackerImpl.ts b/packages/sdk/server-ai/src/LDGraphTrackerImpl.ts
index 4c08e26a58..d1f0602f50 100644
--- a/packages/sdk/server-ai/src/LDGraphTrackerImpl.ts
+++ b/packages/sdk/server-ai/src/LDGraphTrackerImpl.ts
@@ -1,7 +1,7 @@
 import { LDContext } from '@launchdarkly/js-server-sdk-common';
 
 import { LDGraphMetricSummary, LDGraphTracker } from './api/graph/LDGraphTracker';
-import { JudgeResponse } from './api/judge/types';
+import { LDJudgeResult } from './api/judge/types';
 import { LDTokenUsage } from './api/metrics';
 import { LDClientMin } from './LDClientMin';
 
@@ -76,14 +76,17 @@ export class LDGraphTrackerImpl implements LDGraphTracker {
     this._ldClient.track('$ld:ai:graph:path', this._context, { ...this.getTrackData(), path }, 1);
   }
 
-  trackJudgeResponse(response: JudgeResponse): void {
-    const trackData = response.judgeConfigKey
-      ? { ...this.getTrackData(), judgeConfigKey: response.judgeConfigKey }
-      : this.getTrackData();
+  trackJudgeResult(result: LDJudgeResult): void {
+    if (!result.sampled || !result.success) {
+      return;
+    }
+    if (result.metricKey !== undefined && result.score !== undefined) {
+      const trackData = result.judgeConfigKey
+        ? { ...this.getTrackData(), judgeConfigKey: result.judgeConfigKey }
+        : this.getTrackData();
 
-    Object.entries(response.evals).forEach(([metricKey, evalScore]) => {
-      this._ldClient.track(metricKey, this._context, trackData, evalScore.score);
-    });
+      this._ldClient.track(result.metricKey, this._context, trackData, result.score);
+    }
   }
 
   trackRedirect(sourceKey: string, redirectedTarget: string): void {
diff --git a/packages/sdk/server-ai/src/api/chat/TrackedChat.ts b/packages/sdk/server-ai/src/api/chat/TrackedChat.ts
index 054969dc3d..2d5b21a85f 100644
--- a/packages/sdk/server-ai/src/api/chat/TrackedChat.ts
+++ b/packages/sdk/server-ai/src/api/chat/TrackedChat.ts
@@ -2,7 +2,7 @@ import { LDLogger } from '@launchdarkly/js-server-sdk-common';
 
 import { LDAICompletionConfig, LDMessage } from '../config/types';
 import { Judge } from '../judge/Judge';
-import { JudgeResponse } from '../judge/types';
+import { LDJudgeResult } from '../judge/types';
 import { AIProvider } from '../providers/AIProvider';
 import { ChatResponse } from './types';
 
@@ -54,10 +54,8 @@ export class TrackedChat {
     ) {
       response.evaluations = this._evaluateWithJudges(this.messages, response).then(
         (evaluations) => {
-          evaluations.forEach((judgeResponse) => {
-            if (judgeResponse?.success) {
-              tracker.trackJudgeResponse(judgeResponse);
-            }
+          evaluations.forEach((judgeResult) => {
+            tracker.trackJudgeResult(judgeResult);
           });
           return evaluations;
         },
@@ -79,7 +77,7 @@ export class TrackedChat {
   private async _evaluateWithJudges(
     messages: LDMessage[],
     response: ChatResponse,
-  ): Promise<Array<JudgeResponse | undefined>> {
+  ): Promise<LDJudgeResult[]> {
     const judgeConfigs = this.aiConfig.judgeConfiguration!.judges;
 
     // Start all judge evaluations in parallel
@@ -89,7 +87,12 @@ export class TrackedChat {
         this._logger?.warn(
           `Judge configuration is not enabled for ${judgeConfig.key} in ${this.aiConfig.key}`,
         );
-        return undefined;
+        const result: LDJudgeResult = {
+          success: false,
+          sampled: true,
+          errorMessage: `Judge configuration is not enabled for ${judgeConfig.key}`,
+        };
+        return result;
       }
 
       return judge.evaluateMessages(messages, response, judgeConfig.samplingRate);
@@ -98,7 +101,17 @@ export class TrackedChat {
     // ensure all evaluations complete even if some fail
     const results = await Promise.allSettled(evaluationPromises);
 
-    return results.map((result) => (result.status === 'fulfilled' ? result.value : undefined));
+    return results.map((settled) => {
+      if (settled.status === 'fulfilled') {
+        return settled.value;
+      }
+      const result: LDJudgeResult = {
+        success: false,
+        sampled: true,
+        errorMessage: 'Judge evaluation failed',
+      };
+      return result;
+    });
   }
 
   /**
diff --git a/packages/sdk/server-ai/src/api/chat/types.ts b/packages/sdk/server-ai/src/api/chat/types.ts
index 5b32109fcf..19173e30f8 100644
--- a/packages/sdk/server-ai/src/api/chat/types.ts
+++ b/packages/sdk/server-ai/src/api/chat/types.ts
@@ -1,5 +1,5 @@
 import { LDMessage } from '../config/types';
-import { JudgeResponse } from '../judge/types';
+import { LDJudgeResult } from '../judge/types';
 import { LDAIMetrics } from '../metrics/LDAIMetrics';
 
 /**
@@ -20,5 +20,5 @@ export interface ChatResponse {
    * Promise that resolves to judge evaluation results.
    * Only present when judges are configured for evaluation.
    */
-  evaluations?: Promise<Array<JudgeResponse | undefined>>;
+  evaluations?: Promise<LDJudgeResult[]>;
 }
diff --git a/packages/sdk/server-ai/src/api/config/LDAIConfigTracker.ts b/packages/sdk/server-ai/src/api/config/LDAIConfigTracker.ts
index 883177becb..e0aff2c6b5 100644
--- a/packages/sdk/server-ai/src/api/config/LDAIConfigTracker.ts
+++ b/packages/sdk/server-ai/src/api/config/LDAIConfigTracker.ts
@@ -1,4 +1,4 @@
-import { EvalScore, JudgeResponse } from '../judge/types';
+import { LDJudgeResult } from '../judge/types';
 import { LDAIMetrics, LDFeedbackKind, LDTokenUsage } from '../metrics';
 
 /**
@@ -116,18 +116,13 @@ export interface LDAIConfigTracker {
   trackTimeToFirstToken(timeToFirstTokenMs: number): void;
 
   /**
-   * Track evaluation scores for multiple metrics.
+   * Track a judge evaluation result.
    *
-   * @param scores Record mapping metric keys to their evaluation scores
-   */
-  trackEvalScores(scores: Record<string, EvalScore>): void;
-
-  /**
-   * Track a judge response containing evaluation scores and judge configuration key.
+   * No event is emitted when the result was not sampled (result.sampled is false) or was unsuccessful (result.success is false).
    *
-   * @param response Judge response containing evaluation scores and judge configuration key
+   * @param result Judge result containing score, reasoning, and metadata
    */
-  trackJudgeResponse(response: JudgeResponse): void;
+  trackJudgeResult(result: LDJudgeResult): void;
 
   /**
    * Track a single tool invocation.
diff --git a/packages/sdk/server-ai/src/api/graph/LDGraphTracker.ts b/packages/sdk/server-ai/src/api/graph/LDGraphTracker.ts
index 94cf30658f..9ce432d1db 100644
--- a/packages/sdk/server-ai/src/api/graph/LDGraphTracker.ts
+++ b/packages/sdk/server-ai/src/api/graph/LDGraphTracker.ts
@@ -1,4 +1,4 @@
-import { JudgeResponse } from '../judge/types';
+import { LDJudgeResult } from '../judge/types';
 import { LDTokenUsage } from '../metrics';
 
 /**
@@ -83,11 +83,13 @@ export interface LDGraphTracker {
   trackPath(path: string[]): void;
 
   /**
-   * Track judge responses for the final graph output.
+   * Track a judge evaluation result for the final graph output.
    *
-   * @param response Judge response containing evaluation scores.
+   * No event is emitted when the result was not sampled (result.sampled is false) or was unsuccessful (result.success is false).
+   *
+   * @param result Judge result containing score, reasoning, and metadata.
    */
-  trackJudgeResponse(response: JudgeResponse): void;
+  trackJudgeResult(result: LDJudgeResult): void;
 
   /**
    * Track when a node redirects to a different target than originally specified.
diff --git a/packages/sdk/server-ai/src/api/judge/Judge.ts b/packages/sdk/server-ai/src/api/judge/Judge.ts
index 1bab8d1a12..e36ab138cd 100644
--- a/packages/sdk/server-ai/src/api/judge/Judge.ts
+++ b/packages/sdk/server-ai/src/api/judge/Judge.ts
@@ -7,7 +7,7 @@ import { LDAIConfigTracker } from '../config/LDAIConfigTracker';
 import { LDAIJudgeConfig, LDMessage } from '../config/types';
 import { AIProvider } from '../providers/AIProvider';
 import { EvaluationSchemaBuilder } from './EvaluationSchemaBuilder';
-import { EvalScore, JudgeResponse, StructuredResponse } from './types';
+import { LDJudgeResult, StructuredResponse } from './types';
 
 /**
  * Judge implementation that handles evaluation functionality and conversation management.
@@ -57,13 +57,15 @@ export class Judge {
    * @param input The input prompt or question that was provided to the AI
    * @param output The AI-generated response to be evaluated
    * @param samplingRate Sampling rate (0-1) to determine if evaluation should be processed (defaults to 1)
-   * @returns Promise that resolves to evaluation results or undefined if not sampled
+   * @returns Promise that resolves to evaluation results
    */
-  async evaluate(
-    input: string,
-    output: string,
-    samplingRate: number = 1,
-  ): Promise<JudgeResponse | undefined> {
+  async evaluate(input: string, output: string, samplingRate: number = 1): Promise<LDJudgeResult> {
+    const result: LDJudgeResult = {
+      success: false,
+      sampled: false,
+      judgeConfigKey: this._aiConfig.key,
+    };
+
     const tracker = this._aiConfig.createTracker!();
     try {
       const evaluationMetricKey = this._getEvaluationMetricKey();
@@ -72,51 +74,54 @@ export class Judge {
           'Judge configuration is missing required evaluation metric key',
           tracker.getTrackData(),
         );
-        return undefined;
+        result.sampled = true;
+        result.errorMessage = 'Judge configuration is missing required evaluation metric key';
+        return result;
       }
 
       if (!this._aiConfig.messages) {
         this._logger?.warn('Judge configuration must include messages', tracker.getTrackData());
-        return undefined;
+        result.sampled = true;
+        result.errorMessage = 'Judge configuration must include messages';
+        return result;
       }
 
       if (Math.random() > samplingRate) {
         this._logger?.debug(`Judge evaluation skipped due to sampling rate: ${samplingRate}`);
-        return undefined;
+        return result;
       }
 
+      result.sampled = true;
+
       const messages = this._constructEvaluationMessages(input, output);
       const response = await tracker.trackMetricsOf(
-        (result: StructuredResponse) => result.metrics,
+        (r: StructuredResponse) => r.metrics,
         () => this._aiProvider.invokeStructuredModel(messages, this._evaluationResponseStructure),
       );
 
-      let { success } = response.metrics;
-
-      const evals = this._parseEvaluationResponse(response.data, evaluationMetricKey, tracker);
+      const evalResult = this._parseEvaluationResponse(response.data, evaluationMetricKey, tracker);
 
-      if (!evals[evaluationMetricKey]) {
+      if (!evalResult) {
         this._logger?.warn(
           'Judge evaluation did not return the expected evaluation',
           tracker.getTrackData(),
         );
-        success = false;
+        return result;
       }
 
       return {
-        evals,
-        success,
-        judgeConfigKey: this._aiConfig.key,
+        ...result,
+        success: response.metrics.success,
+        score: evalResult.score,
+        reasoning: evalResult.reasoning,
+        metricKey: evaluationMetricKey,
       };
     } catch (error) {
       this._logger?.error('Judge evaluation failed:', error);
-      return {
-        evals: {},
-        success: false,
-        error: error instanceof Error ? error.message : 'Unknown error',
-        judgeConfigKey: this._aiConfig.key,
-      };
+      result.sampled = true;
+      result.errorMessage = error instanceof Error ? error.message : 'Unknown error';
+      return result;
     }
   }
 
@@ -126,13 +131,13 @@ export class Judge {
    * @param messages Array of messages representing the conversation history
    * @param response The AI response to be evaluated
    * @param samplingRatio Sampling ratio (0-1) to determine if evaluation should be processed (defaults to 1)
-   * @returns Promise that resolves to evaluation results or undefined if not sampled
+   * @returns Promise that resolves to evaluation results
    */
   async evaluateMessages(
     messages: LDMessage[],
     response: ChatResponse,
     samplingRatio: number = 1,
-  ): Promise<JudgeResponse | undefined> {
+  ): Promise<LDJudgeResult> {
     const input = messages.length === 0 ? '' : messages.map((msg) => msg.content).join('\r\n');
     const output = response.message.content;
 
@@ -177,18 +182,18 @@ export class Judge {
   /**
    * Parses the structured evaluation response from the AI provider.
+   * Returns score and reasoning, or undefined if parsing fails.
    */
   private _parseEvaluationResponse(
     data: Record<string, unknown>,
     evaluationMetricKey: string,
     tracker: LDAIConfigTracker,
-  ): Record<string, EvalScore> {
+  ): { score: number; reasoning: string } | undefined {
     const evaluations = data.evaluations as Record<string, unknown>;
-    const results: Record<string, EvalScore> = {};
 
     if (!data.evaluations || typeof data.evaluations !== 'object') {
       this._logger?.warn('Invalid response: missing or invalid evaluations object');
-      return results;
+      return undefined;
     }
 
     const evaluation = evaluations[evaluationMetricKey];
@@ -198,7 +203,7 @@ export class Judge {
         `Missing evaluation for metric key: ${evaluationMetricKey}`,
         tracker.getTrackData(),
       );
-      return results;
+      return undefined;
     }
 
     const evalData = evaluation as Record<string, unknown>;
@@ -208,7 +213,7 @@ export class Judge {
         `Invalid score evaluated for ${evaluationMetricKey}: ${evalData.score}. Score must be a number between 0 and 1 inclusive`,
         tracker.getTrackData(),
       );
-      return results;
+      return undefined;
     }
 
     if (typeof evalData.reasoning !== 'string') {
@@ -216,14 +221,12 @@ export class Judge {
         `Invalid reasoning evaluated for ${evaluationMetricKey}: ${evalData.reasoning}. Reasoning must be a string`,
         tracker.getTrackData(),
       );
-      return results;
+      return undefined;
     }
 
-    results[evaluationMetricKey] = {
+    return {
       score: evalData.score,
       reasoning: evalData.reasoning,
     };
-
-    return results;
   }
 }
diff --git a/packages/sdk/server-ai/src/api/judge/index.ts b/packages/sdk/server-ai/src/api/judge/index.ts
index 912ec47fb0..ca86630278 100644
--- a/packages/sdk/server-ai/src/api/judge/index.ts
+++ b/packages/sdk/server-ai/src/api/judge/index.ts
@@ -1,2 +1,2 @@
 export { Judge } from './Judge';
-export type { EvalScore, JudgeResponse, StructuredResponse } from './types';
+export type { LDJudgeResult, StructuredResponse } from './types';
diff --git a/packages/sdk/server-ai/src/api/judge/types.ts b/packages/sdk/server-ai/src/api/judge/types.ts
index 68ad141c89..b9d8a05a46 100644
--- a/packages/sdk/server-ai/src/api/judge/types.ts
+++ b/packages/sdk/server-ai/src/api/judge/types.ts
@@ -17,25 +17,21 @@ export interface StructuredResponse {
 }
 
 /**
- * Score and reasoning for a single evaluation metric.
+ * Result from a judge evaluation containing score, reasoning, and metadata.
  */
-export interface EvalScore {
-  /** Score between 0.0 and 1.0 indicating the evaluation result for this metric */
-  score: number;
-  /** Reasoning behind the provided score for this metric */
-  reasoning: string;
-}
-
-/**
- * Response from a judge evaluation containing scores and reasoning for multiple metrics.
- */
-export interface JudgeResponse {
-  /** The key of the judge configuration that was used to generate this response */
+export interface LDJudgeResult {
+  /** The key of the judge configuration that was used to generate this result */
   judgeConfigKey?: string;
-  /** Dictionary where keys are metric names and values contain score and reasoning */
-  evals: Record<string, EvalScore>;
   /** Whether the evaluation completed successfully */
   success: boolean;
   /** Error message if evaluation failed */
-  error?: string;
+  errorMessage?: string;
+  /** Whether this evaluation was sampled (i.e. actually run). False when skipped by sampling. */
+  sampled: boolean;
+  /** The metric key for this evaluation */
+  metricKey?: string;
+  /** Score between 0.0 and 1.0 indicating the evaluation result */
+  score?: number;
+  /** Reasoning behind the provided score */
+  reasoning?: string;
 }
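
Reviewer note: below is a minimal, standalone sketch of the gating rule this change gives both trackers. The LDJudgeResult interface is copied from src/api/judge/types.ts above; shouldEmit is a hypothetical helper (not SDK code) that mirrors the guard at the top of trackJudgeResult in LDAIConfigTrackerImpl and LDGraphTrackerImpl: an event is emitted only for a sampled, successful result that carries both a metric key and a score.

// The result shape introduced by this diff (src/api/judge/types.ts).
interface LDJudgeResult {
  judgeConfigKey?: string;
  success: boolean;
  errorMessage?: string;
  sampled: boolean;
  metricKey?: string;
  score?: number;
  reasoning?: string;
}

// Hypothetical helper mirroring the guard in both trackJudgeResult implementations.
function shouldEmit(result: LDJudgeResult): boolean {
  if (!result.sampled || !result.success) {
    return false;
  }
  return result.metricKey !== undefined && result.score !== undefined;
}

// A sampled-out result carries only bookkeeping fields and emits nothing.
console.log(shouldEmit({ success: false, sampled: false, judgeConfigKey: 'test-judge' })); // false

// A successful evaluation emits one event keyed by metricKey, with the score as its value.
console.log(
  shouldEmit({
    judgeConfigKey: 'test-judge',
    success: true,
    sampled: true,
    metricKey: 'relevance',
    score: 0.8,
    reasoning: 'The response is relevant',
  }),
); // true

Because a result now carries at most one metric, the old multi-metric behavior (and the second 'accuracy' track assertion) was deleted rather than ported.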
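
A hedged consumer-side sketch follows, showing how a caller might branch on the three result shapes evaluate and evaluateMessages can now return. The describeResult helper is illustrative only, not part of the SDK; its parameter is structurally typed so the snippet stands alone.

// Illustrative only: summarizes a judge result for logging.
function describeResult(result: {
  sampled: boolean;
  success: boolean;
  errorMessage?: string;
  metricKey?: string;
  score?: number;
  reasoning?: string;
}): string {
  if (!result.sampled) {
    // Skipped by the sampling rate: the provider was never invoked and nothing is tracked.
    return 'skipped by sampling';
  }
  if (!result.success) {
    // Config errors, provider errors, and malformed judge output all land here;
    // errorMessage is set on the paths that have one.
    return `failed: ${result.errorMessage ?? 'judge did not return the expected evaluation'}`;
  }
  return `${result.metricKey}: ${result.score} (${result.reasoning})`;
}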