54 changes: 54 additions & 0 deletions bug_test_geval_score_mismatch.yaml
@@ -0,0 +1,54 @@
# BUG REPRODUCTION: GEval Score Mismatch Without Warning
# This configuration demonstrates the bug where GEval metrics silently default
# None scores to 0.0 without any warning or logging.
#
# HOW TO REPRODUCE:
# 1. Run: lightspeed-evaluation --system-config config/system.yaml --eval-config bug_test_geval_score_mismatch.yaml
# 2. Observe the output - if GEval returns a None score, it will silently become 0.0
# 3. Check logs - NO WARNING will be shown about the score mismatch
#
# EXPECTED BEHAVIOR: A warning should be logged when score is None
# ACTUAL BEHAVIOR: Silent conversion from None -> 0.0

- conversation_group_id: geval_bug_test
description: Test case to trigger GEval score mismatch bug

turns:
- turn_id: turn_1
query: |
How do I deploy a Kubernetes pod?
response: |
Use kubectl apply -f pod.yaml to deploy a pod.
contexts:
- Kubernetes documentation on pod deployment
expected_response: |
To deploy a pod, use kubectl apply command with a YAML manifest file.

# This GEval metric will be evaluated
turn_metrics:
- geval:technical_accuracy

# You can also add custom GEval with intentionally problematic criteria
# that might cause the LLM judge to fail or return None
turn_metrics_metadata:
geval:technical_accuracy:
threshold: 0.7
# The metric is already defined in system.yaml

# Additional test case with conversation-level GEval
- conversation_group_id: geval_conversation_bug_test
description: Conversation-level GEval bug test

conversation_metrics:
- geval:conversation_coherence

turns:
- turn_id: turn_1
query: Tell me about Kubernetes
response: Kubernetes is a container orchestration platform.
turn_metrics: []

- turn_id: turn_2
query: How do I use it?
response: You use kubectl commands to interact with Kubernetes clusters.
turn_metrics: []
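
For reference, a minimal standalone Python illustration (not project code; the function names are invented for this sketch) of the behaviour the YAML above is meant to expose: with the old logic, a None score from a failed judge call becomes an ordinary 0.0 that is indistinguishable from a genuine zero, while the patched logic logs a warning before applying the same default.

```python
import logging

logger = logging.getLogger(__name__)

def old_behaviour(score: float | None) -> float:
    # Silent default: a failed evaluation looks exactly like a real zero score.
    return score if score is not None else 0.0

def new_behaviour(score: float | None) -> float:
    # Patched pattern: the default still happens, but it is now logged.
    if score is None:
        logger.warning("GEval returned None score; defaulting to 0.0")
        return 0.0
    return score
```
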
2 changes: 1 addition & 1 deletion config/system.yaml
@@ -27,7 +27,7 @@ llm_pool:
cache_enabled: true
cache_dir: ".caches/llm_cache"
timeout: 300
num_retries: 3
num_retries: 3 # Retry attempts for LLM judge (applies to GEval, custom metrics, etc.; default: 6 if not specified)
parameters:
temperature: 0.0
max_completion_tokens: 1024
127 changes: 111 additions & 16 deletions src/lightspeed_evaluation/core/metrics/geval.py
@@ -22,6 +22,13 @@
from deepeval.test_case import LLMTestCase, LLMTestCaseParams

from pydantic import ValidationError
from tenacity import (
retry,
retry_if_exception,
stop_after_attempt,
wait_exponential,
before_sleep_log,
)

from lightspeed_evaluation.core.llm.deepeval import DeepEvalLLMManager
from lightspeed_evaluation.core.metrics.manager import MetricLevel, MetricManager
@@ -57,6 +64,11 @@ def __init__(
self.deepeval_llm_manager = deepeval_llm_manager
self.metric_manager = metric_manager

# Get num_retries from LLM configuration (default: 6 to match DeepEval's hardcoded value)
# Note: DeepEval's internal retry logic uses hardcoded MAX_RETRIES=6,
# but we add our own retry layer to respect user configuration
self.num_retries = self.deepeval_llm_manager.llm_params.get("num_retries", 6)

def evaluate( # pylint: disable=R0913,R0917
self,
metric_name: str,
@@ -202,6 +214,67 @@ def _convert_evaluation_params(
# Return the successfully converted list, or None if it ended up empty
return converted if converted else None

def _is_retryable_exception(self, exception: BaseException) -> bool:
"""Check if exception should trigger a retry.

Retryable conditions:
- Rate limiting (429 errors from LLM provider)
- Timeout errors
- Temporary network failures
- LLM provider temporary errors

Args:
exception: The exception to check

Returns:
True if the exception should trigger a retry, False otherwise
"""
# We retry on all exceptions because DeepEval/LiteLLM internally
# handles specific error types (RateLimitError, Timeout, etc.)
# This matches DeepEval's hardcoded behavior: retryable_exceptions = (Exception,)
return isinstance(exception, Exception)

def _measure_with_retry(
self, metric: GEval, test_case: LLMTestCase, context: str
) -> None:
"""Execute metric.measure() with configurable retry logic.

This method wraps DeepEval's metric.measure() with our own retry layer
to respect user-configured num_retries (DeepEval hardcodes MAX_RETRIES=6).

Args:
metric: GEval metric instance
test_case: LLM test case to evaluate
context: Description for logging (e.g., "turn-level" or "conversation-level")

Raises:
Exception: Re-raises the last exception if all retry attempts fail
"""
retry_decorator = retry(
retry=retry_if_exception(self._is_retryable_exception),
stop=stop_after_attempt(self.num_retries),
wait=wait_exponential(multiplier=1, min=1, max=10),
before_sleep=before_sleep_log(logger, logging.WARNING),
reraise=True,
)

@retry_decorator
def _measure() -> None:
metric.measure(test_case)
self.deepeval_llm_manager.flush_deepevals_pending_tasks()

try:
_measure()
except Exception as e:
logger.error(
"GEval %s evaluation failed after %d retry attempts: %s: %s",
context,
self.num_retries,
type(e).__name__,
str(e),
)
raise

def _evaluate_turn( # pylint: disable=R0913,R0917
self,
turn_data: Any,
@@ -293,22 +366,37 @@ def _evaluate_turn( # pylint: disable=R0913,R0917
# Create test case for a single turn
test_case = LLMTestCase(**test_case_kwargs)

# Evaluate (DeepEval normalizes score to [0, 1]; pass through as-is)
# Evaluate with retry (DeepEval normalizes score to [0, 1]; pass through as-is)
try:
metric.measure(test_case)
self.deepeval_llm_manager.flush_deepevals_pending_tasks()
self._measure_with_retry(metric, test_case, "turn-level")

score = metric.score if metric.score is not None else 0.0
# Extract score and reason
score = metric.score
reason = (
str(metric.reason)
if hasattr(metric, "reason") and metric.reason
else "No reason provided"
)

# CRITICAL: Warn if score is None (indicates evaluation failure)
# Without this warning, silent conversion to 0.0 masks bugs like:
# - Rate limiting (429 errors after all retries exhausted)
# - LLM judge returning malformed JSON that fails parsing
# - Timeout errors from LLM provider
# - API quota/credits exhausted
# This makes debugging nearly impossible as failed evaluations
# appear as low scores (0.0) instead of errors.
if score is None:
logger.warning(
"GEval turn-level metric returned None score; defaulting to 0.0. "
"This typically indicates LLM judge failure (rate limiting, timeout, "
"invalid JSON response, or quota exhausted). Reason: %s",
reason,
)
score = 0.0

return score, reason
except Exception as e: # pylint: disable=W0718
logger.error(
"GEval turn-level evaluation failed: %s: %s", type(e).__name__, str(e)
)
logger.debug(
"Test case input: %s...",
test_case.input[:100] if test_case.input else "None",
@@ -406,24 +494,31 @@ def _evaluate_conversation( # pylint: disable=R0913,R0917,R0914
actual_output="\n".join(conversation_output),
)

# Evaluate (DeepEval normalizes score to [0, 1]; pass through as-is)
# Evaluate with retry (DeepEval normalizes score to [0, 1]; pass through as-is)
try:
metric.measure(test_case)
self.deepeval_llm_manager.flush_deepevals_pending_tasks()
self._measure_with_retry(metric, test_case, "conversation-level")

score = metric.score if metric.score is not None else 0.0
# Extract score and reason
score = metric.score
reason = (
str(metric.reason)
if hasattr(metric, "reason") and metric.reason
else "No reason provided"
)

# CRITICAL: Warn if score is None (indicates evaluation failure)
# See turn-level evaluation for detailed explanation of why this matters
if score is None:
logger.warning(
"GEval conversation-level metric returned None score; defaulting to 0.0. "
"This typically indicates LLM judge failure (rate limiting, timeout, "
"invalid JSON response, or quota exhausted). Reason: %s",
reason,
)
score = 0.0

return score, reason
except Exception as e: # pylint: disable=W0718
logger.error(
"GEval conversation-level evaluation failed: %s: %s",
type(e).__name__,
str(e),
)
logger.debug("Conversation turns: %d", len(conv_data.turns))
logger.debug(
"Test case input preview: %s...",
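
As a standalone sketch (not part of the diff) of the retry policy that `_measure_with_retry` configures: tenacity's `stop_after_attempt(n)` counts total attempts, so `num_retries: 3` means three calls in total, with exponential backoff clamped between 1 and 10 seconds between them. The flaky function below is invented purely to demonstrate the behaviour.

```python
import logging

from tenacity import (
    before_sleep_log,
    retry,
    retry_if_exception,
    stop_after_attempt,
    wait_exponential,
)

logging.basicConfig(level=logging.WARNING)
logger = logging.getLogger("retry-demo")

attempts = {"count": 0}

@retry(
    retry=retry_if_exception(lambda exc: isinstance(exc, Exception)),
    stop=stop_after_attempt(3),  # three attempts in total, not three retries after the first failure
    wait=wait_exponential(multiplier=1, min=1, max=10),
    before_sleep=before_sleep_log(logger, logging.WARNING),
    reraise=True,
)
def flaky_measure() -> str:
    attempts["count"] += 1
    if attempts["count"] < 3:
        raise TimeoutError("simulated transient judge failure")
    return "scored"

print(flaky_measure(), "after", attempts["count"], "attempts")
```
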
2 changes: 2 additions & 0 deletions tests/unit/core/metrics/test_geval.py
@@ -21,6 +21,8 @@ def mock_llm_manager(self, mocker: MockerFixture) -> Any:
mock_manager = mocker.MagicMock()
mock_llm = mocker.MagicMock()
mock_manager.get_llm.return_value = mock_llm
# Mock llm_params to return valid num_retries (needed for retry logic)
mock_manager.llm_params = {"num_retries": 3}
return mock_manager

@pytest.fixture
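
A standalone sketch (not part of the PR) of why the `mock_llm_manager` fixture change above is needed: on a bare `MagicMock`, `llm_params.get("num_retries", 6)` returns another `MagicMock` rather than an int, which would then be handed to `stop_after_attempt()` by the new retry code.

```python
from unittest.mock import MagicMock

bare = MagicMock()
print(type(bare.llm_params.get("num_retries", 6)))  # <class 'unittest.mock.MagicMock'>, not an int

fixed = MagicMock()
fixed.llm_params = {"num_retries": 3}
print(fixed.llm_params.get("num_retries", 6))  # 3
```
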