From 9d6143b08a93f21a450ba44f15e5ccb495fefb35 Mon Sep 17 00:00:00 2001 From: jsonbailey Date: Tue, 14 Apr 2026 09:36:29 -0500 Subject: [PATCH] feat!: Rename JudgeResponse to JudgeResult and flatten EvalScore MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit BREAKING CHANGE: `JudgeResponse` and `EvalScore` are removed. Replace with the new flat `JudgeResult` dataclass. `track_judge_response` and `track_eval_scores` on `LDAIConfigTracker` are removed; use `track_judge_result` instead. - Replace `JudgeResponse` + nested `EvalScore` dict with a flat `JudgeResult` dataclass (`score`, `reasoning`, `metric_key`, `judge_config_key`, `success`, `sampled`, `error_message`) - Add `sampled: bool` to distinguish skipped-by-sampling-rate from failure - Rename `error` → `error_message` - Rename `track_judge_response` → `track_judge_result` on `LDAIConfigTracker`; remove `track_eval_scores` - Remove `track_judge_response` from `AIGraphTracker` (judges are node-level only) - `Judge.evaluate()` always returns a `JudgeResult` (never `None`); builds the result progressively so `judge_config_key` is always set - Simplify `_parse_evaluation_response` to return `(score, reasoning)` tuple Co-Authored-By: Claude Sonnet 4.6 --- packages/sdk/server-ai/src/ldai/__init__.py | 5 +- .../sdk/server-ai/src/ldai/judge/__init__.py | 69 ++++++++--------- .../sdk/server-ai/src/ldai/managed_model.py | 14 ++-- packages/sdk/server-ai/src/ldai/models.py | 4 +- .../server-ai/src/ldai/providers/__init__.py | 6 +- .../sdk/server-ai/src/ldai/providers/types.py | 49 +++++------- packages/sdk/server-ai/src/ldai/tracker.py | 75 ++++--------------- packages/sdk/server-ai/tests/test_judge.py | 73 +++++++++--------- 8 files changed, 117 insertions(+), 178 deletions(-) diff --git a/packages/sdk/server-ai/src/ldai/__init__.py b/packages/sdk/server-ai/src/ldai/__init__.py index c70d941c..88123164 100644 --- a/packages/sdk/server-ai/src/ldai/__init__.py +++ b/packages/sdk/server-ai/src/ldai/__init__.py @@ -36,7 +36,7 @@ AgentRunner, ToolRegistry, ) -from ldai.providers.types import EvalScore, JudgeResponse +from ldai.providers.types import JudgeResult from ldai.tracker import AIGraphTracker __all__ = [ @@ -60,11 +60,10 @@ 'ManagedAgent', 'ManagedModel', 'ManagedAgentGraph', - 'EvalScore', 'AgentGraphDefinition', 'Judge', 'JudgeConfiguration', - 'JudgeResponse', + 'JudgeResult', 'LDMessage', 'ModelConfig', 'ProviderConfig', diff --git a/packages/sdk/server-ai/src/ldai/judge/__init__.py b/packages/sdk/server-ai/src/ldai/judge/__init__.py index a842db65..f05756b5 100644 --- a/packages/sdk/server-ai/src/ldai/judge/__init__.py +++ b/packages/sdk/server-ai/src/ldai/judge/__init__.py @@ -1,7 +1,7 @@ """Judge implementation for AI evaluation.""" import random -from typing import Any, Dict, Optional +from typing import Any, Dict, Optional, Tuple import chevron @@ -9,7 +9,7 @@ from ldai.judge.evaluation_schema_builder import EvaluationSchemaBuilder from ldai.models import AIJudgeConfig, LDMessage from ldai.providers.model_runner import ModelRunner -from ldai.providers.types import EvalScore, JudgeResponse, ModelResponse +from ldai.providers.types import JudgeResult, ModelResponse from ldai.tracker import LDAIConfigTracker @@ -44,29 +44,34 @@ async def evaluate( input_text: str, output_text: str, sampling_rate: float = 1.0, - ) -> Optional[JudgeResponse]: + ) -> JudgeResult: """ Evaluates an AI response using the judge's configuration. 
:param input_text: The input prompt or question that was provided to the AI :param output_text: The AI-generated response to be evaluated :param sampling_rate: Sampling rate (0-1) to determine if evaluation should be processed (defaults to 1) - :return: Evaluation results or None if not sampled + :return: Evaluation result; ``sampled=True`` when skipped due to sampling rate """ + judge_result = JudgeResult(judge_config_key=self._ai_config.key) + try: if not self._ai_config.evaluation_metric_key: log.warning( 'Judge configuration is missing required evaluationMetricKey' ) - return None + judge_result.error_message = 'Judge configuration is missing required evaluationMetricKey' + return judge_result if not self._ai_config.messages: log.warning('Judge configuration must include messages') - return None + judge_result.error_message = 'Judge configuration must include messages' + return judge_result if random.random() > sampling_rate: log.debug(f'Judge evaluation skipped due to sampling rate: {sampling_rate}') - return None + judge_result.sampled = True + return judge_result messages = self._construct_evaluation_messages(input_text, output_text) assert self._evaluation_response_structure is not None @@ -76,39 +81,36 @@ async def evaluate( lambda result: result.metrics, ) - success = response.metrics.success - evals = self._parse_evaluation_response(response.data) + parsed = self._parse_evaluation_response(response.data) - if not evals: + if parsed is None: log.warning('Judge evaluation did not return the expected evaluation') - success = False - - return JudgeResponse( - judge_config_key=self._ai_config.key, - evals=evals, - success=success, - ) + return judge_result + + score, reasoning = parsed + judge_result.metric_key = self._ai_config.evaluation_metric_key + judge_result.score = score + judge_result.reasoning = reasoning + judge_result.success = response.metrics.success + return judge_result except Exception as error: log.error(f'Judge evaluation failed: {error}') - return JudgeResponse( - evals={}, - success=False, - error=str(error) if isinstance(error, Exception) else 'Unknown error', - ) + judge_result.error_message = str(error) if isinstance(error, Exception) else 'Unknown error' + return judge_result async def evaluate_messages( self, messages: list[LDMessage], response: ModelResponse, sampling_ratio: float = 1.0, - ) -> Optional[JudgeResponse]: + ) -> JudgeResult: """ Evaluates an AI response from chat messages and response. :param messages: Array of messages representing the conversation history :param response: The AI response to be evaluated :param sampling_ratio: Sampling ratio (0-1) to determine if evaluation should be processed (defaults to 1) - :return: Evaluation results or None if not sampled + :return: Evaluation result; ``sampled=True`` when skipped due to sampling rate """ input_text = '\r\n'.join([msg.content for msg in messages]) if messages else '' output_text = response.message.content @@ -172,28 +174,23 @@ def _interpolate_message(self, content: str, variables: Dict[str, str]) -> str: # Use chevron (Mustache) for templating, with no escaping return chevron.render(content, variables) - def _parse_evaluation_response(self, data: Dict[str, Any]) -> Dict[str, EvalScore]: + def _parse_evaluation_response(self, data: Dict[str, Any]) -> Optional[Tuple[float, str]]: """ Parses the structured evaluation response. Expects {"score": n, "reasoning": "..."}. 
- """ - results: Dict[str, EvalScore] = {} - metric_key = self._ai_config.evaluation_metric_key - if not metric_key: - log.warning('Evaluation metric key is missing') - return results + :return: ``(score, reasoning)`` on success, or ``None`` if the response is invalid. + """ if not isinstance(data, dict): log.warning('Invalid response: missing or invalid evaluation') - return results + return None score = data.get('score') reasoning = data.get('reasoning') if not isinstance(score, (int, float)) or score < 0 or score > 1: log.warning(f'Invalid score: {score}. Score must be a number between 0 and 1 inclusive') - return results + return None if not isinstance(reasoning, str): log.warning('Invalid reasoning: must be a string') - return results + return None - results[metric_key] = EvalScore(score=float(score), reasoning=reasoning) - return results + return (float(score), reasoning) diff --git a/packages/sdk/server-ai/src/ldai/managed_model.py b/packages/sdk/server-ai/src/ldai/managed_model.py index c1ef021c..e982cc00 100644 --- a/packages/sdk/server-ai/src/ldai/managed_model.py +++ b/packages/sdk/server-ai/src/ldai/managed_model.py @@ -5,7 +5,7 @@ from ldai.judge import Judge from ldai.models import AICompletionConfig, LDMessage from ldai.providers.model_runner import ModelRunner -from ldai.providers.types import JudgeResponse, ModelResponse +from ldai.providers.types import JudgeResult, ModelResponse from ldai.tracker import LDAIConfigTracker @@ -66,19 +66,19 @@ def _start_judge_evaluations( self, messages: List[LDMessage], response: ModelResponse, - ) -> List[asyncio.Task[Optional[JudgeResponse]]]: + ) -> List[asyncio.Task[Optional[JudgeResult]]]: if not self._ai_config.judge_configuration or not self._ai_config.judge_configuration.judges: return [] - async def evaluate_judge(judge_config: Any) -> Optional[JudgeResponse]: + async def evaluate_judge(judge_config: Any) -> Optional[JudgeResult]: judge = self._judges.get(judge_config.key) if not judge: log.warning(f'Judge configuration is not enabled: {judge_config.key}') return None - eval_result = await judge.evaluate_messages(messages, response, judge_config.sampling_rate) - if eval_result and eval_result.success: - self._tracker.track_judge_response(eval_result) - return eval_result + judge_result = await judge.evaluate_messages(messages, response, judge_config.sampling_rate) + if judge_result.success: + self._tracker.track_judge_result(judge_result) + return judge_result return [ asyncio.create_task(evaluate_judge(jc)) diff --git a/packages/sdk/server-ai/src/ldai/models.py b/packages/sdk/server-ai/src/ldai/models.py index 07b02c23..43d9c9b5 100644 --- a/packages/sdk/server-ai/src/ldai/models.py +++ b/packages/sdk/server-ai/src/ldai/models.py @@ -2,8 +2,6 @@ from dataclasses import dataclass, field from typing import Any, Dict, List, Literal, Optional, Union -from ldai.tracker import LDAIConfigTracker - @dataclass class LDMessage: @@ -182,7 +180,7 @@ class AIConfig: enabled: bool model: Optional[ModelConfig] = None provider: Optional[ProviderConfig] = None - tracker: Optional[LDAIConfigTracker] = None + tracker: Optional[Any] = None def _base_to_dict(self) -> Dict[str, Any]: """ diff --git a/packages/sdk/server-ai/src/ldai/providers/__init__.py b/packages/sdk/server-ai/src/ldai/providers/__init__.py index 0148698b..b2bfa72e 100644 --- a/packages/sdk/server-ai/src/ldai/providers/__init__.py +++ b/packages/sdk/server-ai/src/ldai/providers/__init__.py @@ -6,8 +6,7 @@ from ldai.providers.types import ( AgentGraphResult, AgentResult, - EvalScore, 
- JudgeResponse, + JudgeResult, LDAIMetrics, ModelResponse, StructuredResponse, @@ -20,8 +19,7 @@ 'AgentGraphRunner', 'AgentResult', 'AgentRunner', - 'EvalScore', - 'JudgeResponse', + 'JudgeResult', 'LDAIMetrics', 'ModelResponse', 'ModelRunner', diff --git a/packages/sdk/server-ai/src/ldai/providers/types.py b/packages/sdk/server-ai/src/ldai/providers/types.py index bb87350e..4ad626ba 100644 --- a/packages/sdk/server-ai/src/ldai/providers/types.py +++ b/packages/sdk/server-ai/src/ldai/providers/types.py @@ -44,7 +44,7 @@ class ModelResponse: """ message: LDMessage metrics: LDAIMetrics - evaluations: Optional[List[JudgeResponse]] = None + evaluations: Optional[List[JudgeResult]] = None @dataclass @@ -58,45 +58,36 @@ class StructuredResponse: @dataclass -class EvalScore: +class JudgeResult: """ - Score and reasoning for a single evaluation metric. + Result from a judge evaluation. """ - score: float # Score between 0.0 and 1.0 - reasoning: str # Reasoning behind the provided score + judge_config_key: Optional[str] = None + success: bool = False + error_message: Optional[str] = None + sampled: bool = False # True when the judge was skipped due to sampling rate + score: Optional[float] = None + reasoning: Optional[str] = None + metric_key: Optional[str] = None def to_dict(self) -> Dict[str, Any]: """ - Render the evaluation score as a dictionary object. - """ - return { - 'score': self.score, - 'reasoning': self.reasoning, - } - - -@dataclass -class JudgeResponse: - """ - Response from a judge evaluation containing scores and reasoning for multiple metrics. - """ - evals: Dict[str, EvalScore] # Dictionary where keys are metric names and values contain score and reasoning - success: bool # Whether the evaluation completed successfully - judge_config_key: Optional[str] = None # The key of the judge configuration that was used to generate this response - error: Optional[str] = None # Error message if evaluation failed - - def to_dict(self) -> Dict[str, Any]: - """ - Render the judge response as a dictionary object. + Render the judge result as a dictionary object. """ result: Dict[str, Any] = { - 'evals': {key: eval_score.to_dict() for key, eval_score in self.evals.items()}, 'success': self.success, + 'sampled': self.sampled, } + if self.score is not None: + result['score'] = self.score + if self.reasoning is not None: + result['reasoning'] = self.reasoning + if self.metric_key is not None: + result['metricKey'] = self.metric_key if self.judge_config_key is not None: result['judgeConfigKey'] = self.judge_config_key - if self.error is not None: - result['error'] = self.error + if self.error_message is not None: + result['errorMessage'] = self.error_message return result diff --git a/packages/sdk/server-ai/src/ldai/tracker.py b/packages/sdk/server-ai/src/ldai/tracker.py index c84365ab..b1894f40 100644 --- a/packages/sdk/server-ai/src/ldai/tracker.py +++ b/packages/sdk/server-ai/src/ldai/tracker.py @@ -246,49 +246,23 @@ async def track_metrics_of_async( self.track_duration(duration, graph_key=graph_key) return self._track_from_metrics_extractor(result, metrics_extractor, graph_key=graph_key) - def track_eval_scores(self, scores: Dict[str, Any], *, graph_key: Optional[str] = None) -> None: + def track_judge_result(self, judge_result: Any, *, graph_key: Optional[str] = None) -> None: """ - Track evaluation scores for multiple metrics. + Track a judge result, including the evaluation score with judge config key. 
- :param scores: Dictionary mapping metric keys to their evaluation scores (EvalScore objects) + :param judge_result: JudgeResult object containing score, metric key, and success status :param graph_key: When set, include ``graphKey`` in the event payload. """ - from ldai.providers.types import EvalScore - - # Track each evaluation score individually - for metric_key, eval_score in scores.items(): - if isinstance(eval_score, EvalScore): - self._ld_client.track( - metric_key, - self._context, - self.__get_track_data(graph_key=graph_key), - eval_score.score - ) - - def track_judge_response(self, judge_response: Any, *, graph_key: Optional[str] = None) -> None: - """ - Track a judge response, including evaluation scores with judge config key. - - :param judge_response: JudgeResponse object containing evals and success status - :param graph_key: When set, include ``graphKey`` in the event payload. - """ - from ldai.providers.types import EvalScore, JudgeResponse - - if isinstance(judge_response, JudgeResponse): - # Track evaluation scores with judge config key included in metadata - if judge_response.evals: - track_data = self.__get_track_data(graph_key=graph_key) - if judge_response.judge_config_key: - track_data = {**track_data, 'judgeConfigKey': judge_response.judge_config_key} - - for metric_key, eval_score in judge_response.evals.items(): - if isinstance(eval_score, EvalScore): - self._ld_client.track( - metric_key, - self._context, - track_data, - eval_score.score - ) + if judge_result.success and judge_result.metric_key: + track_data = self.__get_track_data(graph_key=graph_key) + if judge_result.judge_config_key: + track_data = {**track_data, 'judgeConfigKey': judge_result.judge_config_key} + self._ld_client.track( + judge_result.metric_key, + self._context, + track_data, + judge_result.score, + ) def track_feedback(self, feedback: Dict[str, FeedbackKind], *, graph_key: Optional[str] = None) -> None: """ @@ -595,29 +569,6 @@ def track_path(self, path: List[str]) -> None: 1, ) - def track_judge_response(self, response: Any) -> None: - """ - Track judge responses for the final graph output. - - :param response: JudgeResponse object containing evals and success status. - """ - from ldai.providers.types import EvalScore, JudgeResponse - - if isinstance(response, JudgeResponse): - if response.evals: - track_data = self.__get_track_data() - if response.judge_config_key: - track_data = {**track_data, "judgeConfigKey": response.judge_config_key} - - for metric_key, eval_score in response.evals.items(): - if isinstance(eval_score, EvalScore): - self._ld_client.track( - metric_key, - self._context, - track_data, - eval_score.score, - ) - def track_redirect(self, source_key: str, redirected_target: str) -> None: """ Track when a node redirects to a different target than originally specified. 
diff --git a/packages/sdk/server-ai/tests/test_judge.py b/packages/sdk/server-ai/tests/test_judge.py index e61ac4a0..76bccd0f 100644 --- a/packages/sdk/server-ai/tests/test_judge.py +++ b/packages/sdk/server-ai/tests/test_judge.py @@ -9,7 +9,7 @@ from ldai.judge import Judge from ldai.judge.evaluation_schema_builder import EvaluationSchemaBuilder from ldai.models import AIJudgeConfig, AIJudgeConfigDefault, LDMessage, ModelConfig, ProviderConfig -from ldai.providers.types import EvalScore, JudgeResponse, LDAIMetrics, StructuredResponse +from ldai.providers.types import JudgeResult, LDAIMetrics, StructuredResponse from ldai.tracker import LDAIConfigTracker @@ -118,27 +118,31 @@ class TestJudgeEvaluate: """Tests for Judge.evaluate() method.""" @pytest.mark.asyncio - async def test_evaluate_returns_none_when_evaluation_metric_key_missing( + async def test_evaluate_returns_failure_when_evaluation_metric_key_missing( self, judge_config_without_key: AIJudgeConfig, tracker: LDAIConfigTracker, mock_runner ): - """Evaluate should return None when evaluation_metric_key is missing.""" + """Evaluate should return a failed JudgeResult when evaluation_metric_key is missing.""" judge = Judge(judge_config_without_key, tracker, mock_runner) - + result = await judge.evaluate("input text", "output text") - - assert result is None + + assert isinstance(result, JudgeResult) + assert result.success is False + assert result.sampled is False mock_runner.invoke_structured_model.assert_not_called() @pytest.mark.asyncio - async def test_evaluate_returns_none_when_messages_missing( + async def test_evaluate_returns_failure_when_messages_missing( self, judge_config_without_messages: AIJudgeConfig, tracker: LDAIConfigTracker, mock_runner ): - """Evaluate should return None when messages are missing.""" + """Evaluate should return a failed JudgeResult when messages are missing.""" judge = Judge(judge_config_without_messages, tracker, mock_runner) - + result = await judge.evaluate("input text", "output text") - - assert result is None + + assert isinstance(result, JudgeResult) + assert result.success is False + assert result.sampled is False mock_runner.invoke_structured_model.assert_not_called() @pytest.mark.asyncio @@ -162,12 +166,12 @@ async def test_evaluate_success_with_valid_response( result = await judge.evaluate("What is AI?", "AI is artificial intelligence.") - assert result is not None - assert isinstance(result, JudgeResponse) + assert isinstance(result, JudgeResult) assert result.success is True - assert '$ld:ai:judge:relevance' in result.evals - assert result.evals['$ld:ai:judge:relevance'].score == 0.85 - assert 'relevant' in result.evals['$ld:ai:judge:relevance'].reasoning.lower() + assert result.metric_key == '$ld:ai:judge:relevance' + assert result.score == 0.85 + assert result.reasoning is not None + assert 'relevant' in result.reasoning.lower() @pytest.mark.asyncio async def test_evaluate_success_with_evaluation_response_shape( @@ -188,11 +192,12 @@ async def test_evaluate_success_with_evaluation_response_shape( judge = Judge(judge_config_with_key, tracker, mock_runner) result = await judge.evaluate("What is feature flagging?", "Feature flagging is...") - assert result is not None + assert isinstance(result, JudgeResult) assert result.success is True - assert '$ld:ai:judge:relevance' in result.evals - assert result.evals['$ld:ai:judge:relevance'].score == 0.9 - assert 'accurate' in result.evals['$ld:ai:judge:relevance'].reasoning.lower() + assert result.metric_key == '$ld:ai:judge:relevance' + assert 
result.score == 0.9 + assert result.reasoning is not None + assert 'accurate' in result.reasoning.lower() @pytest.mark.asyncio async def test_evaluate_handles_missing_evaluation_in_response( @@ -212,9 +217,9 @@ async def test_evaluate_handles_missing_evaluation_in_response( result = await judge.evaluate("input", "output") - assert result is not None + assert isinstance(result, JudgeResult) assert result.success is False - assert len(result.evals) == 0 + assert result.score is None @pytest.mark.asyncio async def test_evaluate_handles_invalid_score( @@ -237,9 +242,9 @@ async def test_evaluate_handles_invalid_score( result = await judge.evaluate("input", "output") - assert result is not None + assert isinstance(result, JudgeResult) assert result.success is False - assert len(result.evals) == 0 + assert result.score is None @pytest.mark.asyncio async def test_evaluate_handles_missing_reasoning( @@ -259,9 +264,9 @@ async def test_evaluate_handles_missing_reasoning( result = await judge.evaluate("input", "output") - assert result is not None + assert isinstance(result, JudgeResult) assert result.success is False - assert len(result.evals) == 0 + assert result.score is None @pytest.mark.asyncio async def test_evaluate_handles_exception( @@ -275,22 +280,22 @@ async def test_evaluate_handles_exception( result = await judge.evaluate("input", "output") - assert result is not None - assert isinstance(result, JudgeResponse) + assert isinstance(result, JudgeResult) assert result.success is False - assert result.error is not None - assert len(result.evals) == 0 + assert result.error_message is not None @pytest.mark.asyncio async def test_evaluate_respects_sampling_rate( self, judge_config_with_key: AIJudgeConfig, tracker: LDAIConfigTracker, mock_runner ): - """Evaluate should respect sampling rate.""" + """Evaluate should return sampled=True when skipped due to sampling rate.""" judge = Judge(judge_config_with_key, tracker, mock_runner) - + result = await judge.evaluate("input", "output", sampling_rate=0.0) - - assert result is None + + assert isinstance(result, JudgeResult) + assert result.sampled is True + assert result.success is False mock_runner.invoke_structured_model.assert_not_called()
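
Migration sketch for SDK consumers affected by this breaking change. This is illustrative only: `handle_evaluation`, `judge`, `tracker`, and the 0.5 sampling rate are placeholder names/values, while `Judge.evaluate`, `JudgeResult`, and `LDAIConfigTracker.track_judge_result` are the APIs changed in the diff above.

from ldai import JudgeResult


async def handle_evaluation(judge, tracker, user_input: str, model_output: str) -> JudgeResult:
    # Previously, evaluate() returned None when not sampled, or a JudgeResponse whose
    # scores lived in a nested evals[metric_key] EvalScore. It now always returns a
    # flat JudgeResult.
    result = await judge.evaluate(user_input, model_output, sampling_rate=0.5)

    if result.sampled:
        # Skipped by the sampling rate; this is not a failure and nothing is tracked.
        return result

    if result.success:
        # Replaces the removed tracker.track_judge_response(...) / track_eval_scores(...).
        tracker.track_judge_result(result)
        # Flat fields replace the old evals[metric_key] lookup.
        print(result.metric_key, result.score, result.reasoning)
    else:
        print(f'Judge evaluation failed: {result.error_message}')

    return result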