From cf54bf2a4db25fc18f1854f5f8fe0e044a6887a3 Mon Sep 17 00:00:00 2001 From: ahmedhammam Date: Tue, 13 Jan 2026 10:14:51 +0000 Subject: [PATCH] feat: add safe_metric_calculation decorator for LLM judge error handling --- src/eval_framework/metrics/llm/base.py | 66 ++++ .../metrics/llm/llm_judge_chatbot_style.py | 7 +- .../metrics/llm/llm_judge_coherence.py | 29 +- .../llm/llm_judge_completion_accuracy.py | 7 +- .../metrics/llm/llm_judge_conciseness.py | 7 +- .../metrics/llm/llm_judge_contains_names.py | 7 +- .../llm/llm_judge_format_correctness.py | 11 +- .../metrics/llm/llm_judge_instruction.py | 12 +- .../metrics/llm/llm_judge_refusal.py | 7 +- .../metrics/llm/llm_judge_sql.py | 30 +- .../metrics/llm/llm_judge_world_knowledge.py | 8 +- .../test_safe_metric_calculation.py | 290 ++++++++++++++++++ 12 files changed, 395 insertions(+), 86 deletions(-) create mode 100644 tests/tests_eval_framework/metrics/llm_metrics/test_safe_metric_calculation.py diff --git a/src/eval_framework/metrics/llm/base.py b/src/eval_framework/metrics/llm/base.py index ab256cc3..4fdc8132 100644 --- a/src/eval_framework/metrics/llm/base.py +++ b/src/eval_framework/metrics/llm/base.py @@ -1,11 +1,77 @@ +import functools +import logging import traceback +from collections.abc import Callable +from typing import Any from eval_framework.llm.base import BaseLLM from eval_framework.metrics.base import BaseMetric, MetricResult from eval_framework.shared.types import Completion, Error +logger = logging.getLogger(__name__) + + +def safe_metric_calculation(func: Callable) -> Callable: + """ + Decorator that wraps LLM judge metric calculate methods with exception handling. + + This decorator ensures that exceptions during metric calculation don't crash the + entire evaluation process. Instead, exceptions are caught and converted to + MetricResult objects with appropriate error information. + """ + + @functools.wraps(func) + def wrapper(self: Any, response: Completion) -> list[MetricResult]: + # Derive metric names (one per KEYS entry, if defined) and score direction from the class + metric_names = [f"{self.NAME}/{k}" for k in getattr(self, "KEYS", [])] or [self.NAME] + higher_is_better = getattr(self, "_higher_is_better", True) + + # Handle pre-existing response error + if response.error is not None: + logger.debug(f"Skipping {self.NAME} calculation - response already has error: {response.error}") + return [ + MetricResult( + metric_name=name, + value=None, + higher_is_better=higher_is_better, + error=response.error, + ) + for name in metric_names + ] + + # Execute the actual calculation with exception handling + try: + return func(self, response) + except Exception as e: + logger.warning(f"LLM judge metric {self.NAME} failed with {e.__class__.__name__}: {e}") + error = Error( + error_class=e.__class__.__name__, + message=str(e), + traceback=traceback.format_exc(), + ) + return [ + MetricResult( + metric_name=name, + value=None, + higher_is_better=higher_is_better, + error=error, + ) + for name in metric_names + ] + + return wrapper + class BaseLLMJudgeMetric(BaseMetric[Completion]): + """Base class for LLM-as-judge metrics. + + Attributes: + _higher_is_better: Override in subclasses where lower values are better (e.g., world knowledge). + Used by the safe_metric_calculation decorator for error results.
+ """ + + _higher_is_better: bool = True + def __init__(self, llm_judge: BaseLLM, randomize_order: bool = False) -> None: self._llm_judge = llm_judge self._randomize_order = randomize_order diff --git a/src/eval_framework/metrics/llm/llm_judge_chatbot_style.py b/src/eval_framework/metrics/llm/llm_judge_chatbot_style.py index 62c8b66e..7be31472 100644 --- a/src/eval_framework/metrics/llm/llm_judge_chatbot_style.py +++ b/src/eval_framework/metrics/llm/llm_judge_chatbot_style.py @@ -1,6 +1,6 @@ from eval_framework.llm.base import BaseLLM from eval_framework.metrics.base import MetricResult -from eval_framework.metrics.llm.base import BaseLLMJudgeMetric +from eval_framework.metrics.llm.base import BaseLLMJudgeMetric, safe_metric_calculation from eval_framework.metrics.llm.graders.chatbot_style_grader import ChatbotStyleGrader from eval_framework.metrics.llm.graders.language import Language from eval_framework.shared.types import Completion @@ -13,10 +13,8 @@ def __init__(self, llm_judge: BaseLLM): super().__init__(llm_judge) self._grader = ChatbotStyleGrader(llm_judge) + @safe_metric_calculation def calculate(self, response: Completion) -> list[MetricResult]: - if response.error is not None: - return [MetricResult(metric_name=self.NAME, value=None, higher_is_better=True, error=response.error)] - language = Language(response.get_instruction_language()) grading = self._grader.grade( @@ -31,6 +29,5 @@ def calculate(self, response: Completion) -> list[MetricResult]: higher_is_better=True, llm_judge_prompt=grading.judge_prompt, llm_judge_response=grading.judge_response, - error=response.error, ) ] diff --git a/src/eval_framework/metrics/llm/llm_judge_coherence.py b/src/eval_framework/metrics/llm/llm_judge_coherence.py index 1d1aa8d6..18d3b867 100644 --- a/src/eval_framework/metrics/llm/llm_judge_coherence.py +++ b/src/eval_framework/metrics/llm/llm_judge_coherence.py @@ -1,6 +1,6 @@ from eval_framework.llm.base import BaseLLM from eval_framework.metrics.base import MetricResult -from eval_framework.metrics.llm.base import BaseLLMJudgeMetric +from eval_framework.metrics.llm.base import BaseLLMJudgeMetric, safe_metric_calculation from eval_framework.metrics.llm.graders.coherence_grader import CoherenceGrader from eval_framework.metrics.llm.graders.language import Language from eval_framework.shared.types import Completion @@ -16,15 +16,8 @@ def __init__(self, llm_judge: BaseLLM): super().__init__(llm_judge) self._grader = CoherenceGrader(llm_judge) + @safe_metric_calculation def calculate(self, response: Completion) -> list[MetricResult]: - if response.error is not None: - for key in self.KEYS: - return [ - MetricResult( - metric_name=f"{self.NAME} - {key}", value=None, higher_is_better=True, error=response.error - ) - ] - language = Language(response.get_instruction_language()) grading = self._grader.grade( @@ -33,12 +26,12 @@ def calculate(self, response: Completion) -> list[MetricResult]: language=language, ) - result = MetricResult( - metric_name=f"{self.NAME}/coherence_score", - value=grading.coherence_score, - higher_is_better=True, - llm_judge_prompt=grading.judge_prompt, - llm_judge_response=grading.judge_response, - error=response.error, - ) - return [result] + return [ + MetricResult( + metric_name=f"{self.NAME}/coherence_score", + value=grading.coherence_score, + higher_is_better=True, + llm_judge_prompt=grading.judge_prompt, + llm_judge_response=grading.judge_response, + ) + ] diff --git a/src/eval_framework/metrics/llm/llm_judge_completion_accuracy.py 
b/src/eval_framework/metrics/llm/llm_judge_completion_accuracy.py index cc31dc27..a2ab2e18 100644 --- a/src/eval_framework/metrics/llm/llm_judge_completion_accuracy.py +++ b/src/eval_framework/metrics/llm/llm_judge_completion_accuracy.py @@ -1,6 +1,6 @@ from eval_framework.llm.base import BaseLLM from eval_framework.metrics.base import MetricResult -from eval_framework.metrics.llm.base import BaseLLMJudgeMetric +from eval_framework.metrics.llm.base import BaseLLMJudgeMetric, safe_metric_calculation from eval_framework.metrics.llm.graders.language import Language from eval_framework.metrics.llm.graders.long_context_grader import LongContextGrader from eval_framework.shared.types import Completion @@ -13,10 +13,8 @@ def __init__(self, llm_judge: BaseLLM): super().__init__(llm_judge) self._grader = LongContextGrader(llm_judge) + @safe_metric_calculation def calculate(self, response: Completion) -> list[MetricResult]: - if response.error is not None: - return [MetricResult(metric_name=self.NAME, value=None, higher_is_better=True, error=response.error)] - assert isinstance(response.ground_truth, str) language = Language(response.get_instruction_language()) @@ -34,6 +32,5 @@ def calculate(self, response: Completion) -> list[MetricResult]: higher_is_better=True, llm_judge_prompt=grading.judge_prompt, llm_judge_response=grading.judge_response, - error=response.error, ) ] diff --git a/src/eval_framework/metrics/llm/llm_judge_conciseness.py b/src/eval_framework/metrics/llm/llm_judge_conciseness.py index 5d8b6b60..2f228cdf 100644 --- a/src/eval_framework/metrics/llm/llm_judge_conciseness.py +++ b/src/eval_framework/metrics/llm/llm_judge_conciseness.py @@ -1,6 +1,6 @@ from eval_framework.llm.base import BaseLLM from eval_framework.metrics.base import MetricResult -from eval_framework.metrics.llm.base import BaseLLMJudgeMetric +from eval_framework.metrics.llm.base import BaseLLMJudgeMetric, safe_metric_calculation from eval_framework.metrics.llm.graders.conciseness_grader import ConcisenessGrader from eval_framework.metrics.llm.graders.language import Language from eval_framework.shared.types import Completion @@ -13,10 +13,8 @@ def __init__(self, llm_judge: BaseLLM): super().__init__(llm_judge) self._grader = ConcisenessGrader(llm_judge) + @safe_metric_calculation def calculate(self, response: Completion) -> list[MetricResult]: - if response.error is not None: - return [MetricResult(metric_name=self.NAME, value=None, higher_is_better=True, error=response.error)] - language = Language(response.get_instruction_language()) grading = self._grader.grade( @@ -32,6 +30,5 @@ def calculate(self, response: Completion) -> list[MetricResult]: higher_is_better=True, llm_judge_prompt=grading.judge_prompt, llm_judge_response=grading.judge_response, - error=response.error, ) ] diff --git a/src/eval_framework/metrics/llm/llm_judge_contains_names.py b/src/eval_framework/metrics/llm/llm_judge_contains_names.py index a17ff884..8faaf037 100644 --- a/src/eval_framework/metrics/llm/llm_judge_contains_names.py +++ b/src/eval_framework/metrics/llm/llm_judge_contains_names.py @@ -1,6 +1,6 @@ from eval_framework.llm.base import BaseLLM from eval_framework.metrics.base import MetricResult -from eval_framework.metrics.llm.base import BaseLLMJudgeMetric +from eval_framework.metrics.llm.base import BaseLLMJudgeMetric, safe_metric_calculation from eval_framework.metrics.llm.graders.contains_names_grader import ContainsNamesGrader from eval_framework.metrics.llm.graders.language import Language from eval_framework.shared.types import 
Completion @@ -13,10 +13,8 @@ def __init__(self, llm_judge: BaseLLM): super().__init__(llm_judge) self._grader = ContainsNamesGrader(llm_judge) + @safe_metric_calculation def calculate(self, response: Completion) -> list[MetricResult]: - if response.error is not None: - return [MetricResult(metric_name=self.NAME, value=None, higher_is_better=True, error=response.error)] - language = Language(response.get_instruction_language()) grading = self._grader.grade( @@ -31,6 +29,5 @@ def calculate(self, response: Completion) -> list[MetricResult]: higher_is_better=True, llm_judge_prompt=grading.judge_prompt, llm_judge_response=grading.judge_response, - error=response.error, ) ] diff --git a/src/eval_framework/metrics/llm/llm_judge_format_correctness.py b/src/eval_framework/metrics/llm/llm_judge_format_correctness.py index c7a1dc0b..bfdcc87f 100644 --- a/src/eval_framework/metrics/llm/llm_judge_format_correctness.py +++ b/src/eval_framework/metrics/llm/llm_judge_format_correctness.py @@ -1,8 +1,6 @@ from eval_framework.llm.base import BaseLLM -from eval_framework.metrics.base import ( - MetricResult, -) -from eval_framework.metrics.llm.base import BaseLLMJudgeMetric +from eval_framework.metrics.base import MetricResult +from eval_framework.metrics.llm.base import BaseLLMJudgeMetric, safe_metric_calculation from eval_framework.metrics.llm.graders.format_correctness_grader import FormatCorrectnessGrader from eval_framework.metrics.llm.graders.language import Language from eval_framework.shared.types import BaseMetricContext, Completion, LanguageMetricContext, extract_context_metric @@ -19,10 +17,8 @@ def __init__(self, llm_judge: BaseLLM): super().__init__(llm_judge) self._grader = FormatCorrectnessGrader(llm_judge) + @safe_metric_calculation def calculate(self, response: Completion) -> list[MetricResult]: - if response.error is not None: - return [MetricResult(metric_name=self.NAME, value=None, higher_is_better=True, error=response.error)] - context = extract_context_metric(response, LanguageMetricContext) grading = self._grader.grade( @@ -38,6 +34,5 @@ def calculate(self, response: Completion) -> list[MetricResult]: higher_is_better=True, llm_judge_prompt=grading.judge_prompt, llm_judge_response=grading.judge_response, - error=response.error, ) ] diff --git a/src/eval_framework/metrics/llm/llm_judge_instruction.py b/src/eval_framework/metrics/llm/llm_judge_instruction.py index 1c53ea54..bf315575 100644 --- a/src/eval_framework/metrics/llm/llm_judge_instruction.py +++ b/src/eval_framework/metrics/llm/llm_judge_instruction.py @@ -1,6 +1,6 @@ from eval_framework.llm.base import BaseLLM from eval_framework.metrics.base import MetricResult -from eval_framework.metrics.llm.base import BaseLLMJudgeMetric +from eval_framework.metrics.llm.base import BaseLLMJudgeMetric, safe_metric_calculation from eval_framework.metrics.llm.graders.instruction_grader import InstructionGrader from eval_framework.metrics.llm.graders.language import Language from eval_framework.shared.types import Completion @@ -22,15 +22,8 @@ def __init__(self, llm_judge: BaseLLM): super().__init__(llm_judge) self._grader = InstructionGrader(llm_judge) + @safe_metric_calculation def calculate(self, response: Completion) -> list[MetricResult]: - if response.error is not None: - for key in self.KEYS: - return [ - MetricResult( - metric_name=f"{self.NAME} - {key}", value=None, higher_is_better=True, error=response.error - ) - ] - language = Language(response.get_instruction_language()) grading = self._grader.grade( @@ -52,7 +45,6 @@ def 
calculate(self, response: Completion) -> list[MetricResult]: higher_is_better=True, llm_judge_prompt=grading.judge_prompt, llm_judge_response=grading.judge_response, - error=response.error, ) results.append(result) return results diff --git a/src/eval_framework/metrics/llm/llm_judge_refusal.py b/src/eval_framework/metrics/llm/llm_judge_refusal.py index 1c77dd9a..bd4dd5a0 100644 --- a/src/eval_framework/metrics/llm/llm_judge_refusal.py +++ b/src/eval_framework/metrics/llm/llm_judge_refusal.py @@ -1,6 +1,6 @@ from eval_framework.llm.base import BaseLLM from eval_framework.metrics.base import MetricResult -from eval_framework.metrics.llm.base import BaseLLMJudgeMetric +from eval_framework.metrics.llm.base import BaseLLMJudgeMetric, safe_metric_calculation from eval_framework.metrics.llm.graders.language import Language from eval_framework.metrics.llm.graders.refusal_grader import RefusalGrader from eval_framework.shared.types import Completion @@ -13,10 +13,8 @@ def __init__(self, llm_judge: BaseLLM): super().__init__(llm_judge) self._grader = RefusalGrader(llm_judge) + @safe_metric_calculation def calculate(self, response: Completion) -> list[MetricResult]: - if response.error is not None: - return [MetricResult(metric_name=self.NAME, value=None, higher_is_better=False, error=response.error)] - language = response.get_completion_language() or response.get_instruction_language() or "en" grading = self._grader.grade( completion=response.sanitized_completion, @@ -30,6 +28,5 @@ def calculate(self, response: Completion) -> list[MetricResult]: higher_is_better=True, llm_judge_prompt=grading.judge_prompt, llm_judge_response=grading.judge_response, - error=response.error, ) ] diff --git a/src/eval_framework/metrics/llm/llm_judge_sql.py b/src/eval_framework/metrics/llm/llm_judge_sql.py index 04d2fbe4..4f17edd2 100644 --- a/src/eval_framework/metrics/llm/llm_judge_sql.py +++ b/src/eval_framework/metrics/llm/llm_judge_sql.py @@ -18,7 +18,7 @@ from eval_framework.llm.base import BaseLLM from eval_framework.metrics.base import MetricResult -from eval_framework.metrics.llm.base import BaseLLMJudgeMetric +from eval_framework.metrics.llm.base import BaseLLMJudgeMetric, safe_metric_calculation from eval_framework.metrics.llm.graders.language import Language from eval_framework.metrics.llm.graders.sql_quality_grader import SqlQualityGrader from eval_framework.shared.types import Completion, LanguageMetricContext, extract_context_metric @@ -59,6 +59,14 @@ class LLMJudgeSqlMetricContext(LanguageMetricContext): class LLMJudgeSql(BaseLLMJudgeMetric): NAME = "SQL Quality" + KEYS = [ + "successfully_runs", + "is_just_sql", + "matches_results_count", + "matches_column_count", + "results_equal", + "llm_quality_score", + ] def __init__(self, llm_judge: BaseLLM): super().__init__(llm_judge) @@ -81,20 +89,8 @@ def __init__(self, llm_judge: BaseLLM): self._start_mysql_db() self._wait_for_db_containers() + @safe_metric_calculation def calculate(self, response: Completion) -> list[MetricResult]: - if response.error is not None: - return [ - MetricResult(metric_name=f"{self.NAME}/{k}", value=None, higher_is_better=True, error=response.error) - for k in [ - "successfully_runs", - "is_just_sql", - "matches_results_count", - "matches_column_count", - "results_equal", - "llm_quality_score", - ] - ] - context = extract_context_metric(response, LLMJudgeSqlMetricContext) assert isinstance(response.ground_truth, str) @@ -124,13 +120,11 @@ def calculate(self, response: Completion) -> list[MetricResult]: 
metric_name=f"{self.NAME}/successfully_runs", value=float(result is not None and result.success), higher_is_better=True, - error=response.error, ), MetricResult( metric_name=f"{self.NAME}/is_just_sql", value=float(completion_query == completion_stripped), higher_is_better=True, - error=response.error, ), ] @@ -147,19 +141,16 @@ def calculate(self, response: Completion) -> list[MetricResult]: metric_name=f"{self.NAME}/matches_results_count", value=float(output_comparison.matches_results_count), higher_is_better=True, - error=response.error, ), MetricResult( metric_name=f"{self.NAME}/matches_column_count", value=float(output_comparison.matches_column_count), higher_is_better=True, - error=response.error, ), MetricResult( metric_name=f"{self.NAME}/results_equal", value=float(output_comparison.results_equal), higher_is_better=True, - error=response.error, ), ] ) @@ -178,7 +169,6 @@ def calculate(self, response: Completion) -> list[MetricResult]: higher_is_better=True, llm_judge_prompt=grading.judge_prompt, llm_judge_response=grading.judge_response, - error=response.error, ) ) diff --git a/src/eval_framework/metrics/llm/llm_judge_world_knowledge.py b/src/eval_framework/metrics/llm/llm_judge_world_knowledge.py index dd78e411..4963dd80 100644 --- a/src/eval_framework/metrics/llm/llm_judge_world_knowledge.py +++ b/src/eval_framework/metrics/llm/llm_judge_world_knowledge.py @@ -1,6 +1,6 @@ from eval_framework.llm.base import BaseLLM from eval_framework.metrics.base import MetricResult -from eval_framework.metrics.llm.base import BaseLLMJudgeMetric +from eval_framework.metrics.llm.base import BaseLLMJudgeMetric, safe_metric_calculation from eval_framework.metrics.llm.graders.language import Language from eval_framework.metrics.llm.graders.summary_world_knowledge_grader import SummarizationWorldKnowledgeGrader from eval_framework.shared.types import Completion @@ -8,15 +8,14 @@ class LLMJudgeWorldKnowledge(BaseLLMJudgeMetric): NAME = "World Knowledge" + _higher_is_better = False # Override: lower world knowledge is better for summarization def __init__(self, llm_judge: BaseLLM): super().__init__(llm_judge) self._grader = SummarizationWorldKnowledgeGrader(llm_judge) + @safe_metric_calculation def calculate(self, response: Completion) -> list[MetricResult]: - if response.error is not None: - return [MetricResult(metric_name=self.NAME, value=None, higher_is_better=False, error=response.error)] - language = Language(response.get_instruction_language()) grading = self._grader.grade( @@ -32,6 +31,5 @@ def calculate(self, response: Completion) -> list[MetricResult]: higher_is_better=False, llm_judge_prompt=grading.judge_prompt, llm_judge_response=grading.judge_response, - error=response.error, ) ] diff --git a/tests/tests_eval_framework/metrics/llm_metrics/test_safe_metric_calculation.py b/tests/tests_eval_framework/metrics/llm_metrics/test_safe_metric_calculation.py new file mode 100644 index 00000000..b17a5f4b --- /dev/null +++ b/tests/tests_eval_framework/metrics/llm_metrics/test_safe_metric_calculation.py @@ -0,0 +1,290 @@ +from unittest.mock import Mock + +import pytest + +from eval_framework.llm.base import BaseLLM +from eval_framework.metrics.base import MetricResult +from eval_framework.metrics.llm.base import BaseLLMJudgeMetric, safe_metric_calculation +from eval_framework.shared.types import Completion, Error +from template_formatting.formatter import Message, Role + + +def create_test_completion( + with_error: bool = False, + error_message: str = "Test error", +) -> Completion: + """Create a test 
Completion object with optional error.""" + return Completion( + id=0, + subject="test_subject", + ground_truth="expected answer", + prompt="test prompt", + prompt_sequence_positions=None, + messages=[ + Message(role=Role.SYSTEM, content="You are a helpful assistant."), + Message(role=Role.USER, content="What is 2+2?"), + ], + completion="The answer is 4.", + raw_completion="The answer is 4.", + raw_completion_sequence_positions=None, + error=Error(error_class="TestError", message=error_message, traceback="") if with_error else None, + ) + + +@pytest.fixture +def completion() -> Completion: + """Fixture providing a default test completion without errors.""" + return create_test_completion() + + +@pytest.fixture +def completion_with_error() -> Completion: + """Fixture providing a test completion with a pre-existing error.""" + return create_test_completion(with_error=True) + + +class SingleMetricJudge(BaseLLMJudgeMetric): + """Test metric class with a single metric name.""" + + NAME = "Test Single Metric" + + def __init__(self, llm_judge: BaseLLM, should_raise: bool = False): + super().__init__(llm_judge) + self._should_raise = should_raise + self._call_count = 0 + + @safe_metric_calculation + def calculate(self, response: Completion) -> list[MetricResult]: + self._call_count += 1 + if self._should_raise: + raise ValueError("Simulated LLM judge failure") + return [ + MetricResult( + metric_name=self.NAME, + value=1.0, + higher_is_better=True, + llm_judge_prompt="test prompt", + llm_judge_response="test response", + ) + ] + + +class MultiMetricJudge(BaseLLMJudgeMetric): + """Test metric class with multiple metric names (like LLMJudgeInstruction).""" + + NAME = "Test Multi Metric" + KEYS = ["quality", "accuracy", "relevance"] + + def __init__(self, llm_judge: BaseLLM, should_raise: bool = False): + super().__init__(llm_judge) + self._should_raise = should_raise + + @safe_metric_calculation + def calculate(self, response: Completion) -> list[MetricResult]: + if self._should_raise: + raise RuntimeError("Simulated multi-metric failure") + return [ + MetricResult( + metric_name=f"{self.NAME}/{key}", + value=0.8, + higher_is_better=True, + ) + for key in self.KEYS + ] + + +class LowerIsBetterMetric(BaseLLMJudgeMetric): + """Test metric where lower values are better (like WorldKnowledge).""" + + NAME = "Test Lower Is Better" + _higher_is_better = False + + def __init__(self, llm_judge: BaseLLM, should_raise: bool = False): + super().__init__(llm_judge) + self._should_raise = should_raise + + @safe_metric_calculation + def calculate(self, response: Completion) -> list[MetricResult]: + if self._should_raise: + raise Exception("Simulated failure") + return [ + MetricResult( + metric_name=self.NAME, + value=0.1, + higher_is_better=False, + ) + ] + + +class TestSuccessfulExecution: + """Tests verifying the decorator doesn't interfere with normal operation.""" + + def test_single_metric_success(self, completion: Completion) -> None: + """Decorator should not interfere with successful single metric calculation.""" + llm = Mock(spec=BaseLLM) + metric = SingleMetricJudge(llm, should_raise=False) + + results = metric.calculate(completion) + + assert len(results) == 1 + assert results[0].metric_name == "Test Single Metric" + assert results[0].value == 1.0 + assert results[0].error is None + + def test_multi_metric_success(self, completion: Completion) -> None: + """Decorator should not interfere with successful multi-metric calculation.""" + llm = Mock(spec=BaseLLM) + metric = MultiMetricJudge(llm, 
should_raise=False) + + results = metric.calculate(completion) + + assert len(results) == 3 + assert all(r.error is None for r in results) + assert {r.metric_name for r in results} == { + "Test Multi Metric/quality", + "Test Multi Metric/accuracy", + "Test Multi Metric/relevance", + } + + +class TestPreExistingResponseError: + """Tests for handling responses that already have errors.""" + + def test_single_metric_with_response_error(self) -> None: + """Should return error result without calling the actual calculate logic.""" + llm = Mock(spec=BaseLLM) + metric = SingleMetricJudge(llm, should_raise=False) + response = create_test_completion(with_error=True, error_message="Pre-existing error") + + results = metric.calculate(response) + + assert len(results) == 1 + assert results[0].metric_name == "Test Single Metric" + assert results[0].value is None + assert results[0].error is not None + assert results[0].error.message == "Pre-existing error" + # Verify the actual calculate logic was never called + assert metric._call_count == 0 + + def test_multi_metric_with_response_error(self, completion_with_error: Completion) -> None: + """Should return error results for ALL metric names when response has error.""" + llm = Mock(spec=BaseLLM) + metric = MultiMetricJudge(llm, should_raise=False) + + results = metric.calculate(completion_with_error) + + # Should return one error result per metric name + assert len(results) == 3 + assert all(r.value is None for r in results) + assert all(r.error is not None for r in results) + assert {r.metric_name for r in results} == { + "Test Multi Metric/quality", + "Test Multi Metric/accuracy", + "Test Multi Metric/relevance", + } + + def test_higher_is_better_preserved_for_response_error(self, completion_with_error: Completion) -> None: + """Error results should preserve the higher_is_better setting.""" + llm = Mock(spec=BaseLLM) + metric = LowerIsBetterMetric(llm, should_raise=False) + + results = metric.calculate(completion_with_error) + + assert len(results) == 1 + assert results[0].higher_is_better is False + + +class TestExceptionHandling: + """Tests for catching exceptions during metric calculation.""" + + def test_single_metric_exception_caught(self, completion: Completion) -> None: + """Should catch exception and return error result instead of crashing.""" + llm = Mock(spec=BaseLLM) + metric = SingleMetricJudge(llm, should_raise=True) + + # Should NOT raise - exception should be caught + results = metric.calculate(completion) + + assert len(results) == 1 + assert results[0].value is None + assert results[0].error is not None + assert results[0].error.error_class == "ValueError" + assert "Simulated LLM judge failure" in results[0].error.message + # Verify traceback is captured for debugging + assert results[0].error.traceback != "" + assert "ValueError" in results[0].error.traceback + assert "Simulated LLM judge failure" in results[0].error.traceback + + def test_multi_metric_exception_caught(self, completion: Completion) -> None: + """Should return error results for ALL metric names when exception occurs.""" + llm = Mock(spec=BaseLLM) + metric = MultiMetricJudge(llm, should_raise=True) + + results = metric.calculate(completion) + + assert len(results) == 3 + assert all(r.value is None for r in results) + assert all(r.error is not None for r in results) + assert all(r.error.error_class == "RuntimeError" for r in results) + + def test_higher_is_better_preserved_for_exception(self, completion: Completion) -> None: + """Error results from exceptions should 
preserve higher_is_better setting.""" + llm = Mock(spec=BaseLLM) + metric = LowerIsBetterMetric(llm, should_raise=True) + + results = metric.calculate(completion) + + assert len(results) == 1 + assert results[0].higher_is_better is False + + +class TestIntegrationWithRealMetrics: + """Integration tests using actual metric classes from the codebase.""" + + def test_conciseness_with_exception(self, completion: Completion) -> None: + """Test LLMJudgeConciseness handles exceptions gracefully.""" + from eval_framework.metrics.llm.llm_judge_conciseness import LLMJudgeConciseness + + llm = Mock(spec=BaseLLM) + # Simulate LLM throwing an exception + llm.generate_from_messages.side_effect = ConnectionError("API timeout") + + metric = LLMJudgeConciseness(llm) + + # Should NOT crash + results = metric.calculate(completion) + + assert len(results) == 1 + assert results[0].value is None + assert results[0].error is not None + assert results[0].error.error_class == "ConnectionError" + + def test_instruction_with_response_error(self) -> None: + """Test LLMJudgeInstruction handles response errors correctly.""" + from eval_framework.metrics.llm.llm_judge_instruction import LLMJudgeInstruction + + llm = Mock(spec=BaseLLM) + metric = LLMJudgeInstruction(llm) + response = create_test_completion(with_error=True, error_message="Model error") + + results = metric.calculate(response) + + # Should return error for all 7 metric keys + assert len(results) == 7 + assert all(r.error is not None for r in results) + assert all(r.value is None for r in results) + + def test_world_knowledge_preserves_higher_is_better(self, completion: Completion) -> None: + """Test LLMJudgeWorldKnowledge error results have higher_is_better=False.""" + from eval_framework.metrics.llm.llm_judge_world_knowledge import LLMJudgeWorldKnowledge + + llm = Mock(spec=BaseLLM) + llm.generate_from_messages.side_effect = Exception("Test exception") + + metric = LLMJudgeWorldKnowledge(llm) + + results = metric.calculate(completion) + + assert len(results) == 1 + assert results[0].higher_is_better is False + assert results[0].error is not None
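Usage note: a minimal sketch of how a metric class opts into the decorator. ExampleJudge and its single "score" key are hypothetical and only mirror the pattern of the test metrics above, not any real grader in eval_framework.metrics.llm.graders:

from eval_framework.llm.base import BaseLLM
from eval_framework.metrics.base import MetricResult
from eval_framework.metrics.llm.base import BaseLLMJudgeMetric, safe_metric_calculation
from eval_framework.shared.types import Completion


class ExampleJudge(BaseLLMJudgeMetric):
    # Hypothetical metric, for illustration only; not part of this patch.
    NAME = "Example"
    KEYS = ["score"]

    @safe_metric_calculation
    def calculate(self, response: Completion) -> list[MetricResult]:
        # No manual response.error check or try/except is needed here: the
        # decorator returns one MetricResult(value=None, error=...) per KEYS
        # entry when the response already failed or when this body raises.
        return [
            MetricResult(
                metric_name=f"{self.NAME}/score",
                value=1.0,
                higher_is_better=True,
            )
        ]

Because the pre-existing-error check runs before the wrapped body, no judge request is issued for samples whose completion already failed.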