66 changes: 66 additions & 0 deletions src/eval_framework/metrics/llm/base.py
@@ -1,11 +1,77 @@
import functools
import logging
import traceback
from collections.abc import Callable
from typing import Any

from eval_framework.llm.base import BaseLLM
from eval_framework.metrics.base import BaseMetric, MetricResult
from eval_framework.shared.types import Completion, Error

logger = logging.getLogger(__name__)


def safe_metric_calculation(func: Callable) -> Callable:
"""
Decorator that wraps LLM judge metric calculate methods with exception handling.

This decorator ensures that exceptions during metric calculation don't crash the
entire evaluation process. Instead, exceptions are caught and converted to
MetricResult objects with appropriate error information.
"""

@functools.wraps(func)
def wrapper(self: Any, response: Completion) -> list[MetricResult]:
# Get metric configuration from the class
metric_names = getattr(self, "NAMES", [self.NAME])
higher_is_better = getattr(self, "_higher_is_better", True)

# Handle pre-existing response error
if response.error is not None:
logger.debug(f"Skipping {self.NAME} calculation - response already has error: {response.error}")
return [
MetricResult(
metric_name=name,
value=None,
higher_is_better=higher_is_better,
error=response.error,
)
for name in metric_names
]

# Execute the actual calculation with exception handling
try:
return func(self, response)
except Exception as e:
logger.warning(f"LLM judge metric {self.NAME} failed with {e.__class__.__name__}: {e}")
error = Error(
error_class=e.__class__.__name__,
message=str(e),
traceback=traceback.format_exc(),
)
return [
MetricResult(
metric_name=name,
value=None,
higher_is_better=higher_is_better,
error=error,
)
for name in metric_names
]

return wrapper


class BaseLLMJudgeMetric(BaseMetric[Completion]):
"""Base class for LLM-as-judge metrics.

Attributes:
_higher_is_better: Override in subclasses where lower values are better (e.g., world knowledge).
Used by the safe_metric_calculation decorator for error results.
"""

_higher_is_better: bool = True

def __init__(self, llm_judge: BaseLLM, randomize_order: bool = False) -> None:
self._llm_judge = llm_judge
self._randomize_order = randomize_order
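For context, a minimal sketch of how a concrete metric is expected to use the new decorator. LLMJudgeFoo, its NAME, and FooGrader (including its grade() signature and score attribute) are hypothetical stand-ins, not part of this PR; the imports and MetricResult fields mirror the diffs below.

from eval_framework.llm.base import BaseLLM
from eval_framework.metrics.base import MetricResult
from eval_framework.metrics.llm.base import BaseLLMJudgeMetric, safe_metric_calculation
from eval_framework.shared.types import Completion


class LLMJudgeFoo(BaseLLMJudgeMetric):
    NAME = "Foo"
    _higher_is_better = True  # set to False for metrics where lower values are better

    def __init__(self, llm_judge: BaseLLM):
        super().__init__(llm_judge)
        self._grader = FooGrader(llm_judge)  # hypothetical grader class

    @safe_metric_calculation
    def calculate(self, response: Completion) -> list[MetricResult]:
        # A pre-existing response.error, or any exception raised below, is converted by
        # the decorator into MetricResult objects with value=None and the error recorded.
        grading = self._grader.grade(completion=response.sanitized_completion)
        return [
            MetricResult(
                metric_name=self.NAME,
                value=grading.score,  # hypothetical attribute on the grading result
                higher_is_better=True,
                llm_judge_prompt=grading.judge_prompt,
                llm_judge_response=grading.judge_response,
            )
        ]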
7 changes: 2 additions & 5 deletions src/eval_framework/metrics/llm/llm_judge_chatbot_style.py
@@ -1,6 +1,6 @@
from eval_framework.llm.base import BaseLLM
from eval_framework.metrics.base import MetricResult
from eval_framework.metrics.llm.base import BaseLLMJudgeMetric
from eval_framework.metrics.llm.base import BaseLLMJudgeMetric, safe_metric_calculation
from eval_framework.metrics.llm.graders.chatbot_style_grader import ChatbotStyleGrader
from eval_framework.metrics.llm.graders.language import Language
from eval_framework.shared.types import Completion
@@ -13,10 +13,8 @@ def __init__(self, llm_judge: BaseLLM):
super().__init__(llm_judge)
self._grader = ChatbotStyleGrader(llm_judge)

@safe_metric_calculation
def calculate(self, response: Completion) -> list[MetricResult]:
if response.error is not None:
return [MetricResult(metric_name=self.NAME, value=None, higher_is_better=True, error=response.error)]

language = Language(response.get_instruction_language())

grading = self._grader.grade(
@@ -31,6 +29,5 @@ def calculate(self, response: Completion) -> list[MetricResult]:
higher_is_better=True,
llm_judge_prompt=grading.judge_prompt,
llm_judge_response=grading.judge_response,
error=response.error,
)
]
29 changes: 11 additions & 18 deletions src/eval_framework/metrics/llm/llm_judge_coherence.py
@@ -1,6 +1,6 @@
from eval_framework.llm.base import BaseLLM
from eval_framework.metrics.base import MetricResult
from eval_framework.metrics.llm.base import BaseLLMJudgeMetric
from eval_framework.metrics.llm.base import BaseLLMJudgeMetric, safe_metric_calculation
from eval_framework.metrics.llm.graders.coherence_grader import CoherenceGrader
from eval_framework.metrics.llm.graders.language import Language
from eval_framework.shared.types import Completion
@@ -16,15 +16,8 @@ def __init__(self, llm_judge: BaseLLM):
super().__init__(llm_judge)
self._grader = CoherenceGrader(llm_judge)

@safe_metric_calculation
def calculate(self, response: Completion) -> list[MetricResult]:
if response.error is not None:
for key in self.KEYS:
return [
MetricResult(
metric_name=f"{self.NAME} - {key}", value=None, higher_is_better=True, error=response.error
)
]

language = Language(response.get_instruction_language())

grading = self._grader.grade(
@@ -33,12 +26,12 @@ def calculate(self, response: Completion) -> list[MetricResult]:
language=language,
)

result = MetricResult(
metric_name=f"{self.NAME}/coherence_score",
value=grading.coherence_score,
higher_is_better=True,
llm_judge_prompt=grading.judge_prompt,
llm_judge_response=grading.judge_response,
error=response.error,
)
return [result]
return [
MetricResult(
metric_name=f"{self.NAME}/coherence_score",
value=grading.coherence_score,
higher_is_better=True,
llm_judge_prompt=grading.judge_prompt,
llm_judge_response=grading.judge_response,
)
]
@@ -1,6 +1,6 @@
from eval_framework.llm.base import BaseLLM
from eval_framework.metrics.base import MetricResult
from eval_framework.metrics.llm.base import BaseLLMJudgeMetric
from eval_framework.metrics.llm.base import BaseLLMJudgeMetric, safe_metric_calculation
from eval_framework.metrics.llm.graders.language import Language
from eval_framework.metrics.llm.graders.long_context_grader import LongContextGrader
from eval_framework.shared.types import Completion
@@ -13,10 +13,8 @@ def __init__(self, llm_judge: BaseLLM):
super().__init__(llm_judge)
self._grader = LongContextGrader(llm_judge)

@safe_metric_calculation
def calculate(self, response: Completion) -> list[MetricResult]:
if response.error is not None:
return [MetricResult(metric_name=self.NAME, value=None, higher_is_better=True, error=response.error)]

assert isinstance(response.ground_truth, str)

language = Language(response.get_instruction_language())
@@ -34,6 +32,5 @@ def calculate(self, response: Completion) -> list[MetricResult]:
higher_is_better=True,
llm_judge_prompt=grading.judge_prompt,
llm_judge_response=grading.judge_response,
error=response.error,
)
]
7 changes: 2 additions & 5 deletions src/eval_framework/metrics/llm/llm_judge_conciseness.py
@@ -1,6 +1,6 @@
from eval_framework.llm.base import BaseLLM
from eval_framework.metrics.base import MetricResult
from eval_framework.metrics.llm.base import BaseLLMJudgeMetric
from eval_framework.metrics.llm.base import BaseLLMJudgeMetric, safe_metric_calculation
from eval_framework.metrics.llm.graders.conciseness_grader import ConcisenessGrader
from eval_framework.metrics.llm.graders.language import Language
from eval_framework.shared.types import Completion
@@ -13,10 +13,8 @@ def __init__(self, llm_judge: BaseLLM):
super().__init__(llm_judge)
self._grader = ConcisenessGrader(llm_judge)

@safe_metric_calculation
def calculate(self, response: Completion) -> list[MetricResult]:
if response.error is not None:
return [MetricResult(metric_name=self.NAME, value=None, higher_is_better=True, error=response.error)]

language = Language(response.get_instruction_language())

grading = self._grader.grade(
@@ -32,6 +30,5 @@ def calculate(self, response: Completion) -> list[MetricResult]:
higher_is_better=True,
llm_judge_prompt=grading.judge_prompt,
llm_judge_response=grading.judge_response,
error=response.error,
)
]
7 changes: 2 additions & 5 deletions src/eval_framework/metrics/llm/llm_judge_contains_names.py
@@ -1,6 +1,6 @@
from eval_framework.llm.base import BaseLLM
from eval_framework.metrics.base import MetricResult
from eval_framework.metrics.llm.base import BaseLLMJudgeMetric
from eval_framework.metrics.llm.base import BaseLLMJudgeMetric, safe_metric_calculation
from eval_framework.metrics.llm.graders.contains_names_grader import ContainsNamesGrader
from eval_framework.metrics.llm.graders.language import Language
from eval_framework.shared.types import Completion
@@ -13,10 +13,8 @@ def __init__(self, llm_judge: BaseLLM):
super().__init__(llm_judge)
self._grader = ContainsNamesGrader(llm_judge)

@safe_metric_calculation
def calculate(self, response: Completion) -> list[MetricResult]:
if response.error is not None:
return [MetricResult(metric_name=self.NAME, value=None, higher_is_better=True, error=response.error)]

language = Language(response.get_instruction_language())

grading = self._grader.grade(
@@ -31,6 +29,5 @@ def calculate(self, response: Completion) -> list[MetricResult]:
higher_is_better=True,
llm_judge_prompt=grading.judge_prompt,
llm_judge_response=grading.judge_response,
error=response.error,
)
]
11 changes: 3 additions & 8 deletions src/eval_framework/metrics/llm/llm_judge_format_correctness.py
@@ -1,8 +1,6 @@
from eval_framework.llm.base import BaseLLM
from eval_framework.metrics.base import (
MetricResult,
)
from eval_framework.metrics.llm.base import BaseLLMJudgeMetric
from eval_framework.metrics.base import MetricResult
from eval_framework.metrics.llm.base import BaseLLMJudgeMetric, safe_metric_calculation
from eval_framework.metrics.llm.graders.format_correctness_grader import FormatCorrectnessGrader
from eval_framework.metrics.llm.graders.language import Language
from eval_framework.shared.types import BaseMetricContext, Completion, LanguageMetricContext, extract_context_metric
@@ -19,10 +17,8 @@ def __init__(self, llm_judge: BaseLLM):
super().__init__(llm_judge)
self._grader = FormatCorrectnessGrader(llm_judge)

@safe_metric_calculation
def calculate(self, response: Completion) -> list[MetricResult]:
if response.error is not None:
return [MetricResult(metric_name=self.NAME, value=None, higher_is_better=True, error=response.error)]

context = extract_context_metric(response, LanguageMetricContext)

grading = self._grader.grade(
@@ -38,6 +34,5 @@ def calculate(self, response: Completion) -> list[MetricResult]:
higher_is_better=True,
llm_judge_prompt=grading.judge_prompt,
llm_judge_response=grading.judge_response,
error=response.error,
)
]
12 changes: 2 additions & 10 deletions src/eval_framework/metrics/llm/llm_judge_instruction.py
@@ -1,6 +1,6 @@
from eval_framework.llm.base import BaseLLM
from eval_framework.metrics.base import MetricResult
from eval_framework.metrics.llm.base import BaseLLMJudgeMetric
from eval_framework.metrics.llm.base import BaseLLMJudgeMetric, safe_metric_calculation
from eval_framework.metrics.llm.graders.instruction_grader import InstructionGrader
from eval_framework.metrics.llm.graders.language import Language
from eval_framework.shared.types import Completion
@@ -22,15 +22,8 @@ def __init__(self, llm_judge: BaseLLM):
super().__init__(llm_judge)
self._grader = InstructionGrader(llm_judge)

@safe_metric_calculation
def calculate(self, response: Completion) -> list[MetricResult]:
if response.error is not None:
for key in self.KEYS:
return [
MetricResult(
metric_name=f"{self.NAME} - {key}", value=None, higher_is_better=True, error=response.error
)
]

language = Language(response.get_instruction_language())

grading = self._grader.grade(
@@ -52,7 +45,6 @@ def calculate(self, response: Completion) -> list[MetricResult]:
higher_is_better=True,
llm_judge_prompt=grading.judge_prompt,
llm_judge_response=grading.judge_response,
error=response.error,
)
results.append(result)
return results
7 changes: 2 additions & 5 deletions src/eval_framework/metrics/llm/llm_judge_refusal.py
@@ -1,6 +1,6 @@
from eval_framework.llm.base import BaseLLM
from eval_framework.metrics.base import MetricResult
from eval_framework.metrics.llm.base import BaseLLMJudgeMetric
from eval_framework.metrics.llm.base import BaseLLMJudgeMetric, safe_metric_calculation
from eval_framework.metrics.llm.graders.language import Language
from eval_framework.metrics.llm.graders.refusal_grader import RefusalGrader
from eval_framework.shared.types import Completion
@@ -13,10 +13,8 @@ def __init__(self, llm_judge: BaseLLM):
super().__init__(llm_judge)
self._grader = RefusalGrader(llm_judge)

@safe_metric_calculation
def calculate(self, response: Completion) -> list[MetricResult]:
if response.error is not None:
return [MetricResult(metric_name=self.NAME, value=None, higher_is_better=False, error=response.error)]

language = response.get_completion_language() or response.get_instruction_language() or "en"
grading = self._grader.grade(
completion=response.sanitized_completion,
@@ -30,6 +28,5 @@ def calculate(self, response: Completion) -> list[MetricResult]:
higher_is_better=True,
llm_judge_prompt=grading.judge_prompt,
llm_judge_response=grading.judge_response,
error=response.error,
)
]
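Illustrative usage note (not part of this PR): with the decorator applied, a judge failure no longer propagates out of calculate(). Assuming metric is an instance of any decorated metric above and its grader raises while grading response:

results = metric.calculate(response)           # does not raise; the decorator catches the exception
assert all(r.value is None for r in results)   # values are cleared on failure
assert results[0].error is not None            # error_class, message and traceback are populated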