From b120e5ee0b9ce879dc0247fa08cc1a56a2f3a291 Mon Sep 17 00:00:00 2001 From: jsonbailey Date: Thu, 16 Apr 2026 09:32:19 -0500 Subject: [PATCH 1/3] chore: fix sampled field semantics on JudgeResult Co-Authored-By: Claude Opus 4.6 --- packages/sdk/server-ai/src/ldai/judge/__init__.py | 6 +++--- packages/sdk/server-ai/src/ldai/providers/types.py | 2 +- packages/sdk/server-ai/src/ldai/tracker.py | 3 +++ packages/sdk/server-ai/tests/test_judge.py | 6 ++++-- 4 files changed, 11 insertions(+), 6 deletions(-) diff --git a/packages/sdk/server-ai/src/ldai/judge/__init__.py b/packages/sdk/server-ai/src/ldai/judge/__init__.py index f05756b5..d091482e 100644 --- a/packages/sdk/server-ai/src/ldai/judge/__init__.py +++ b/packages/sdk/server-ai/src/ldai/judge/__init__.py @@ -51,7 +51,7 @@ async def evaluate( :param input_text: The input prompt or question that was provided to the AI :param output_text: The AI-generated response to be evaluated :param sampling_rate: Sampling rate (0-1) to determine if evaluation should be processed (defaults to 1) - :return: Evaluation result; ``sampled=True`` when skipped due to sampling rate + :return: Evaluation result; ``sampled=True`` when the evaluation was sampled and run """ judge_result = JudgeResult(judge_config_key=self._ai_config.key) @@ -70,9 +70,9 @@ async def evaluate( if random.random() > sampling_rate: log.debug(f'Judge evaluation skipped due to sampling rate: {sampling_rate}') - judge_result.sampled = True return judge_result + judge_result.sampled = True messages = self._construct_evaluation_messages(input_text, output_text) assert self._evaluation_response_structure is not None @@ -110,7 +110,7 @@ async def evaluate_messages( :param messages: Array of messages representing the conversation history :param response: The AI response to be evaluated :param sampling_ratio: Sampling ratio (0-1) to determine if evaluation should be processed (defaults to 1) - :return: Evaluation result; ``sampled=True`` when skipped due to sampling rate + :return: Evaluation result; ``sampled=True`` when the evaluation was sampled and run """ input_text = '\r\n'.join([msg.content for msg in messages]) if messages else '' output_text = response.message.content diff --git a/packages/sdk/server-ai/src/ldai/providers/types.py b/packages/sdk/server-ai/src/ldai/providers/types.py index 4ad626ba..f60fb664 100644 --- a/packages/sdk/server-ai/src/ldai/providers/types.py +++ b/packages/sdk/server-ai/src/ldai/providers/types.py @@ -65,7 +65,7 @@ class JudgeResult: judge_config_key: Optional[str] = None success: bool = False error_message: Optional[str] = None - sampled: bool = False # True when the judge was skipped due to sampling rate + sampled: bool = False # True when the evaluation was sampled and run score: Optional[float] = None reasoning: Optional[str] = None metric_key: Optional[str] = None diff --git a/packages/sdk/server-ai/src/ldai/tracker.py b/packages/sdk/server-ai/src/ldai/tracker.py index b1894f40..e071ae1c 100644 --- a/packages/sdk/server-ai/src/ldai/tracker.py +++ b/packages/sdk/server-ai/src/ldai/tracker.py @@ -253,6 +253,9 @@ def track_judge_result(self, judge_result: Any, *, graph_key: Optional[str] = No :param judge_result: JudgeResult object containing score, metric key, and success status :param graph_key: When set, include ``graphKey`` in the event payload. """ + if not judge_result.sampled: + return + if judge_result.success and judge_result.metric_key: track_data = self.__get_track_data(graph_key=graph_key) if judge_result.judge_config_key: diff --git a/packages/sdk/server-ai/tests/test_judge.py b/packages/sdk/server-ai/tests/test_judge.py index 76bccd0f..b4922d61 100644 --- a/packages/sdk/server-ai/tests/test_judge.py +++ b/packages/sdk/server-ai/tests/test_judge.py @@ -168,6 +168,7 @@ async def test_evaluate_success_with_valid_response( assert isinstance(result, JudgeResult) assert result.success is True + assert result.sampled is True assert result.metric_key == '$ld:ai:judge:relevance' assert result.score == 0.85 assert result.reasoning is not None @@ -194,6 +195,7 @@ async def test_evaluate_success_with_evaluation_response_shape( assert isinstance(result, JudgeResult) assert result.success is True + assert result.sampled is True assert result.metric_key == '$ld:ai:judge:relevance' assert result.score == 0.9 assert result.reasoning is not None @@ -288,13 +290,13 @@ async def test_evaluate_handles_exception( async def test_evaluate_respects_sampling_rate( self, judge_config_with_key: AIJudgeConfig, tracker: LDAIConfigTracker, mock_runner ): - """Evaluate should return sampled=True when skipped due to sampling rate.""" + """Evaluate should return sampled=False when skipped due to sampling rate.""" judge = Judge(judge_config_with_key, tracker, mock_runner) result = await judge.evaluate("input", "output", sampling_rate=0.0) assert isinstance(result, JudgeResult) - assert result.sampled is True + assert result.sampled is False assert result.success is False mock_runner.invoke_structured_model.assert_not_called() From 8ed390f09b663e4842aeab3a69cdf12a9ec0e2c2 Mon Sep 17 00:00:00 2001 From: jsonbailey Date: Thu, 16 Apr 2026 10:05:25 -0500 Subject: [PATCH 2/3] chore: align JudgeResult field order to match spec Co-Authored-By: Claude Opus 4.6 --- packages/sdk/server-ai/src/ldai/providers/types.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/packages/sdk/server-ai/src/ldai/providers/types.py b/packages/sdk/server-ai/src/ldai/providers/types.py index f60fb664..083141d6 100644 --- a/packages/sdk/server-ai/src/ldai/providers/types.py +++ b/packages/sdk/server-ai/src/ldai/providers/types.py @@ -66,9 +66,9 @@ class JudgeResult: success: bool = False error_message: Optional[str] = None sampled: bool = False # True when the evaluation was sampled and run + metric_key: Optional[str] = None score: Optional[float] = None reasoning: Optional[str] = None - metric_key: Optional[str] = None def to_dict(self) -> Dict[str, Any]: """ From c4e949ea16f8ce6d3e886b4536a95ca76bcd3b05 Mon Sep 17 00:00:00 2001 From: jsonbailey Date: Thu, 16 Apr 2026 10:19:33 -0500 Subject: [PATCH 3/3] chore: simplify :return: docstrings on evaluate methods Co-Authored-By: Claude Opus 4.6 --- packages/sdk/server-ai/src/ldai/judge/__init__.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/packages/sdk/server-ai/src/ldai/judge/__init__.py b/packages/sdk/server-ai/src/ldai/judge/__init__.py index d091482e..6db89f68 100644 --- a/packages/sdk/server-ai/src/ldai/judge/__init__.py +++ b/packages/sdk/server-ai/src/ldai/judge/__init__.py @@ -51,7 +51,7 @@ async def evaluate( :param input_text: The input prompt or question that was provided to the AI :param output_text: The AI-generated response to be evaluated :param sampling_rate: Sampling rate (0-1) to determine if evaluation should be processed (defaults to 1) - :return: Evaluation result; ``sampled=True`` when the evaluation was sampled and run + :return: The result of the judge evaluation. """ judge_result = JudgeResult(judge_config_key=self._ai_config.key) @@ -110,7 +110,7 @@ async def evaluate_messages( :param messages: Array of messages representing the conversation history :param response: The AI response to be evaluated :param sampling_ratio: Sampling ratio (0-1) to determine if evaluation should be processed (defaults to 1) - :return: Evaluation result; ``sampled=True`` when the evaluation was sampled and run + :return: The result of the judge evaluation. """ input_text = '\r\n'.join([msg.content for msg in messages]) if messages else '' output_text = response.message.content