5 changes: 2 additions & 3 deletions packages/sdk/server-ai/src/ldai/__init__.py
@@ -36,7 +36,7 @@
AgentRunner,
ToolRegistry,
)
from ldai.providers.types import EvalScore, JudgeResponse
from ldai.providers.types import JudgeResult
from ldai.tracker import AIGraphTracker

__all__ = [
@@ -60,11 +60,10 @@
'ManagedAgent',
'ManagedModel',
'ManagedAgentGraph',
'EvalScore',
'AgentGraphDefinition',
'Judge',
'JudgeConfiguration',
'JudgeResponse',
'JudgeResult',
'LDMessage',
'ModelConfig',
'ProviderConfig',
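A minimal sketch of what the re-export change means for downstream imports (the consuming snippet is illustrative and not part of this PR):

```python
# Illustrative only: the package now re-exports JudgeResult in place of
# EvalScore and JudgeResponse, so downstream imports change accordingly.
from ldai import JudgeResult  # previously: from ldai import EvalScore, JudgeResponse

result = JudgeResult(judge_config_key='my-judge')  # hypothetical config key
print(result.success)  # False until an evaluation succeeds
```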
69 changes: 33 additions & 36 deletions packages/sdk/server-ai/src/ldai/judge/__init__.py
@@ -1,15 +1,15 @@
"""Judge implementation for AI evaluation."""

import random
from typing import Any, Dict, Optional
from typing import Any, Dict, Optional, Tuple

import chevron

from ldai import log
from ldai.judge.evaluation_schema_builder import EvaluationSchemaBuilder
from ldai.models import AIJudgeConfig, LDMessage
from ldai.providers.model_runner import ModelRunner
from ldai.providers.types import EvalScore, JudgeResponse, ModelResponse
from ldai.providers.types import JudgeResult, ModelResponse
from ldai.tracker import LDAIConfigTracker


@@ -44,29 +44,34 @@ async def evaluate(
input_text: str,
output_text: str,
sampling_rate: float = 1.0,
) -> Optional[JudgeResponse]:
) -> JudgeResult:
"""
Evaluates an AI response using the judge's configuration.

:param input_text: The input prompt or question that was provided to the AI
:param output_text: The AI-generated response to be evaluated
:param sampling_rate: Sampling rate (0-1) to determine if evaluation should be processed (defaults to 1)
:return: Evaluation results or None if not sampled
:return: Evaluation result; ``sampled=True`` when skipped due to sampling rate
"""
judge_result = JudgeResult(judge_config_key=self._ai_config.key)

try:
if not self._ai_config.evaluation_metric_key:
log.warning(
'Judge configuration is missing required evaluationMetricKey'
)
return None
judge_result.error_message = 'Judge configuration is missing required evaluationMetricKey'
return judge_result

if not self._ai_config.messages:
log.warning('Judge configuration must include messages')
return None
judge_result.error_message = 'Judge configuration must include messages'
return judge_result

if random.random() > sampling_rate:
log.debug(f'Judge evaluation skipped due to sampling rate: {sampling_rate}')
return None
judge_result.sampled = True
return judge_result

messages = self._construct_evaluation_messages(input_text, output_text)
assert self._evaluation_response_structure is not None
@@ -76,39 +81,36 @@ async def evaluate(
lambda result: result.metrics,
)

success = response.metrics.success
evals = self._parse_evaluation_response(response.data)
parsed = self._parse_evaluation_response(response.data)

if not evals:
if parsed is None:
log.warning('Judge evaluation did not return the expected evaluation')
success = False

return JudgeResponse(
judge_config_key=self._ai_config.key,
evals=evals,
success=success,
)
return judge_result

score, reasoning = parsed
judge_result.metric_key = self._ai_config.evaluation_metric_key
judge_result.score = score
judge_result.reasoning = reasoning
judge_result.success = response.metrics.success
return judge_result
except Exception as error:
log.error(f'Judge evaluation failed: {error}')
return JudgeResponse(
evals={},
success=False,
error=str(error) if isinstance(error, Exception) else 'Unknown error',
)
judge_result.error_message = str(error) if isinstance(error, Exception) else 'Unknown error'
return judge_result

async def evaluate_messages(
self,
messages: list[LDMessage],
response: ModelResponse,
sampling_ratio: float = 1.0,
) -> Optional[JudgeResponse]:
) -> JudgeResult:
"""
Evaluates an AI response from chat messages and response.

:param messages: Array of messages representing the conversation history
:param response: The AI response to be evaluated
:param sampling_ratio: Sampling ratio (0-1) to determine if evaluation should be processed (defaults to 1)
:return: Evaluation results or None if not sampled
:return: Evaluation result; ``sampled=True`` when skipped due to sampling rate
"""
input_text = '\r\n'.join([msg.content for msg in messages]) if messages else ''
output_text = response.message.content
@@ -172,28 +174,23 @@ def _interpolate_message(self, content: str, variables: Dict[str, str]) -> str:
# Use chevron (Mustache) for templating, with no escaping
return chevron.render(content, variables)

def _parse_evaluation_response(self, data: Dict[str, Any]) -> Dict[str, EvalScore]:
def _parse_evaluation_response(self, data: Dict[str, Any]) -> Optional[Tuple[float, str]]:
"""
Parses the structured evaluation response. Expects {"score": n, "reasoning": "..."}.
"""
results: Dict[str, EvalScore] = {}
metric_key = self._ai_config.evaluation_metric_key
if not metric_key:
log.warning('Evaluation metric key is missing')
return results

:return: ``(score, reasoning)`` on success, or ``None`` if the response is invalid.
"""
if not isinstance(data, dict):
log.warning('Invalid response: missing or invalid evaluation')
return results
return None

score = data.get('score')
reasoning = data.get('reasoning')
if not isinstance(score, (int, float)) or score < 0 or score > 1:
log.warning(f'Invalid score: {score}. Score must be a number between 0 and 1 inclusive')
return results
return None
if not isinstance(reasoning, str):
log.warning('Invalid reasoning: must be a string')
return results
return None

results[metric_key] = EvalScore(score=float(score), reasoning=reasoning)
return results
return (float(score), reasoning)
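A rough sketch of how a caller might consume the new return type now that `evaluate` always returns a `JudgeResult` instead of `Optional[JudgeResponse]` (the caller function and sampling rate below are hypothetical):

```python
# Hypothetical caller; `judge` is assumed to be an already-configured Judge instance.
async def run_evaluation(judge, prompt: str, answer: str) -> None:
    result = await judge.evaluate(prompt, answer, sampling_rate=0.5)

    if result.sampled:
        # Skipped by the sampling rate: no score was produced.
        return
    if not result.success:
        print(f'Judge {result.judge_config_key} failed: {result.error_message}')
        return

    # On success, metric_key, score and reasoning are populated by evaluate().
    print(f'{result.metric_key}: {result.score} ({result.reasoning})')
```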
14 changes: 7 additions & 7 deletions packages/sdk/server-ai/src/ldai/managed_model.py
@@ -5,7 +5,7 @@
from ldai.judge import Judge
from ldai.models import AICompletionConfig, LDMessage
from ldai.providers.model_runner import ModelRunner
from ldai.providers.types import JudgeResponse, ModelResponse
from ldai.providers.types import JudgeResult, ModelResponse
from ldai.tracker import LDAIConfigTracker


@@ -66,19 +66,19 @@ def _start_judge_evaluations(
self,
messages: List[LDMessage],
response: ModelResponse,
) -> List[asyncio.Task[Optional[JudgeResponse]]]:
) -> List[asyncio.Task[Optional[JudgeResult]]]:
if not self._ai_config.judge_configuration or not self._ai_config.judge_configuration.judges:
return []

async def evaluate_judge(judge_config: Any) -> Optional[JudgeResponse]:
async def evaluate_judge(judge_config: Any) -> Optional[JudgeResult]:
judge = self._judges.get(judge_config.key)
if not judge:
log.warning(f'Judge configuration is not enabled: {judge_config.key}')
return None
eval_result = await judge.evaluate_messages(messages, response, judge_config.sampling_rate)
if eval_result and eval_result.success:
self._tracker.track_judge_response(eval_result)
return eval_result
judge_result = await judge.evaluate_messages(messages, response, judge_config.sampling_rate)
if judge_result.success:
self._tracker.track_judge_result(judge_result)
return judge_result

return [
asyncio.create_task(evaluate_judge(jc))
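For context, a sketch of how the tasks returned by `_start_judge_evaluations` might be awaited (the gathering helper is illustrative; successful results are already tracked via `track_judge_result` inside `evaluate_judge`):

```python
import asyncio
from typing import List, Optional

from ldai.providers.types import JudgeResult


# Illustrative helper: await all judge-evaluation tasks and drop the None
# entries produced when a judge configuration is not enabled.
async def collect_judge_results(
    tasks: List[asyncio.Task[Optional[JudgeResult]]],
) -> List[JudgeResult]:
    results = await asyncio.gather(*tasks)
    return [r for r in results if r is not None]
```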
4 changes: 1 addition & 3 deletions packages/sdk/server-ai/src/ldai/models.py
@@ -2,8 +2,6 @@
from dataclasses import dataclass, field
from typing import Any, Dict, List, Literal, Optional, Union

from ldai.tracker import LDAIConfigTracker


@dataclass
class LDMessage:
@@ -182,7 +180,7 @@ class AIConfig:
enabled: bool
model: Optional[ModelConfig] = None
provider: Optional[ProviderConfig] = None
tracker: Optional[LDAIConfigTracker] = None
tracker: Optional[Any] = None

def _base_to_dict(self) -> Dict[str, Any]:
"""
6 changes: 2 additions & 4 deletions packages/sdk/server-ai/src/ldai/providers/__init__.py
@@ -6,8 +6,7 @@
from ldai.providers.types import (
AgentGraphResult,
AgentResult,
EvalScore,
JudgeResponse,
JudgeResult,
LDAIMetrics,
ModelResponse,
StructuredResponse,
@@ -20,8 +19,7 @@
'AgentGraphRunner',
'AgentResult',
'AgentRunner',
'EvalScore',
'JudgeResponse',
'JudgeResult',
'LDAIMetrics',
'ModelResponse',
'ModelRunner',
49 changes: 20 additions & 29 deletions packages/sdk/server-ai/src/ldai/providers/types.py
@@ -44,7 +44,7 @@ class ModelResponse:
"""
message: LDMessage
metrics: LDAIMetrics
evaluations: Optional[List[JudgeResponse]] = None
evaluations: Optional[List[JudgeResult]] = None


@dataclass
@@ -58,45 +58,36 @@


@dataclass
class EvalScore:
class JudgeResult:
"""
Score and reasoning for a single evaluation metric.
Result from a judge evaluation.
"""
score: float # Score between 0.0 and 1.0
reasoning: str # Reasoning behind the provided score
judge_config_key: Optional[str] = None
success: bool = False
error_message: Optional[str] = None
sampled: bool = False # True when the judge was skipped due to sampling rate
score: Optional[float] = None
reasoning: Optional[str] = None
metric_key: Optional[str] = None

def to_dict(self) -> Dict[str, Any]:
"""
Render the evaluation score as a dictionary object.
"""
return {
'score': self.score,
'reasoning': self.reasoning,
}


@dataclass
class JudgeResponse:
"""
Response from a judge evaluation containing scores and reasoning for multiple metrics.
"""
evals: Dict[str, EvalScore] # Dictionary where keys are metric names and values contain score and reasoning
success: bool # Whether the evaluation completed successfully
judge_config_key: Optional[str] = None # The key of the judge configuration that was used to generate this response
error: Optional[str] = None # Error message if evaluation failed

def to_dict(self) -> Dict[str, Any]:
"""
Render the judge response as a dictionary object.
Render the judge result as a dictionary object.
"""
result: Dict[str, Any] = {
'evals': {key: eval_score.to_dict() for key, eval_score in self.evals.items()},
'success': self.success,
'sampled': self.sampled,
}
if self.score is not None:
result['score'] = self.score
if self.reasoning is not None:
result['reasoning'] = self.reasoning
if self.metric_key is not None:
result['metricKey'] = self.metric_key
if self.judge_config_key is not None:
result['judgeConfigKey'] = self.judge_config_key
if self.error is not None:
result['error'] = self.error
if self.error_message is not None:
result['errorMessage'] = self.error_message
return result


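For reference, a small sketch of the serialized shape the new `to_dict` produces (the field values are made up):

```python
from ldai.providers.types import JudgeResult

result = JudgeResult(
    judge_config_key='accuracy-judge',  # hypothetical judge configuration key
    success=True,
    score=0.9,
    reasoning='Answer matches the reference.',
    metric_key='accuracy',
)

# Optional fields are emitted only when set, with camelCase keys:
# {'success': True, 'sampled': False, 'score': 0.9,
#  'reasoning': 'Answer matches the reference.',
#  'metricKey': 'accuracy', 'judgeConfigKey': 'accuracy-judge'}
print(result.to_dict())
```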