10 changes: 10 additions & 0 deletions config/system.yaml
@@ -104,6 +104,16 @@ api:
# Legacy authentication (fallback when mcp_headers is not configured or disabled)
# Authentication via API_KEY environment variable only for MCP server (without Server name)

# Quality Score Configuration
# Aggregated score from selected metrics for overall system quality assessment
quality_score:
metrics:
- "ragas:faithfulness"
- "ragas:context_precision_with_reference"
- "custom:tool_eval"
- "custom:answer_correctness"
default: true # If true, every metric in this list gets default: true in its metadata

# Default metrics metadata
metrics_metadata:
# Turn-level metrics metadata
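The score this block configures is a sample-size-weighted mean of the listed metrics' per-metric means (computed by `SystemReport.from_results` in `summary.py` below). A minimal sketch of the arithmetic, with hypothetical counts and means:

```python
# Hypothetical per-metric results; identifiers match the config above.
results = {
    "ragas:faithfulness": (0.90, 120),  # (mean score, sample count)
    "ragas:context_precision_with_reference": (0.80, 120),
    "custom:tool_eval": (0.70, 40),
    "custom:answer_correctness": (0.95, 80),
}

total = sum(count for _, count in results.values())  # 360
score = sum(mean * count / total for mean, count in results.values())
print(f"{score:.4f}")  # 0.8556; metrics with more samples pull harder
```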
110 changes: 110 additions & 0 deletions src/lightspeed_evaluation/core/models/summary.py
@@ -75,6 +75,116 @@ class MetricStats(OverallStats):
)


class QualityScoreStatistics(ScoreStatistics):
"""Score statistics with weight for quality score calculation."""

weight: float = Field(
default=0.0,
description="Weight proportion (sample_size / total_samples) used in weighted average",
)


class SystemReport(BaseModel):
"""Aggregated quality score from selected metrics."""

aggregated_quality_score: float = Field(
default=0.0, description="Weighted average of quality score metrics"
)
quality_metrics: dict[str, QualityScoreStatistics] = Field(
default_factory=dict,
description="Individual metrics used in quality score calculation",
)
extra_metrics: dict[str, ScoreStatistics] = Field(
default_factory=dict,
description="Other evaluated metrics calculated, not used for quality score calculation",
)
api_latency: float = Field(
default=0.0, description="[Placeholder] Average API response time in seconds"
)
api_tokens: int = Field(
default=0,
description="[Placeholder] Total number of tokens consumed across all API calls",
)

@staticmethod
def from_results(
by_metric: dict[str, MetricStats],
quality_score_metrics: list[str],
) -> Optional["SystemReport"]:
"""Compute aggregated quality score from specified metrics.

Calculates a weighted average (by sample size) of the mean scores
for each specified metric using pre-computed metric statistics.

Args:
by_metric: Dictionary of metric statistics (from _compute_metric_stats).
quality_score_metrics: List of metric identifiers to include in quality score.

        Returns:
            SystemReport instance with aggregated score and individual metric stats,
            or None if the selected metrics have no samples.

        Raises:
            ValueError: If any requested metric is missing from the evaluation results.
"""
# Validate all quality score metrics exist in by_metric
missing_metrics = [m for m in quality_score_metrics if m not in by_metric]
if missing_metrics:
raise ValueError(
f"Quality score metrics not found in evaluation results: {missing_metrics}. "
f"Available metrics: {list(by_metric.keys())}"
)

# Calculate total samples from quality score metrics only
total_samples = sum(
by_metric[metric_id].score_statistics.count
for metric_id in quality_score_metrics
)

if total_samples == 0:
return None

quality_metrics: dict[str, QualityScoreStatistics] = {}
extra_metrics: dict[str, ScoreStatistics] = {}

# Separate quality metrics from extra metrics
for metric_id in by_metric:
if metric_id in quality_score_metrics:
score_stats = by_metric[metric_id].score_statistics
sample_size = score_stats.count
weight = sample_size / total_samples

quality_metrics[metric_id] = QualityScoreStatistics(
**score_stats.model_dump(),
weight=weight,
)
else:
extra_metrics[metric_id] = by_metric[metric_id].score_statistics

# Calculate aggregated quality score
aggregated_score = SystemReport._calculate_quality_score(quality_metrics)

return SystemReport(
aggregated_quality_score=aggregated_score,
quality_metrics=quality_metrics,
extra_metrics=extra_metrics,
)

@staticmethod
def _calculate_quality_score(
quality_metrics: dict[str, QualityScoreStatistics],
) -> float:
"""Calculate weighted average quality score from quality metrics.

Args:
quality_metrics: Dictionary of quality score statistics with weights.

Returns:
Weighted average quality score.
"""
weighted_sum = 0.0
for stats in quality_metrics.values():
weighted_sum += stats.mean * stats.weight
return weighted_sum


class ConversationStats(OverallStats):
"""Statistics for a specific conversation group."""

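`_calculate_quality_score` only reads `.mean` and `.weight` from each entry, so a lightweight smoke check does not need full `ScoreStatistics` instances. A sketch (it assumes the package's src layout is importable and that the weights already sum to 1, as `from_results` guarantees):

```python
from types import SimpleNamespace

from lightspeed_evaluation.core.models.summary import SystemReport

# Stand-ins suffice: only .mean and .weight are consumed.
quality_metrics = {
    "ragas:faithfulness": SimpleNamespace(mean=0.90, weight=0.75),
    "custom:tool_eval": SimpleNamespace(mean=0.70, weight=0.25),
}

score = SystemReport._calculate_quality_score(quality_metrics)  # type: ignore[arg-type]
assert abs(score - 0.85) < 1e-9  # 0.90 * 0.75 + 0.70 * 0.25
```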
20 changes: 20 additions & 0 deletions src/lightspeed_evaluation/core/models/system.py
@@ -804,6 +804,21 @@ def from_metadata(cls, raw: dict[str, Any]) -> "GEvalConfig":
return cls.model_validate(data)


class QualityScoreConfig(BaseModel):
"""Quality score configuration."""

model_config = ConfigDict(extra="forbid")

metrics: list[str] = Field(
default_factory=list,
description="List of metric identifiers to use for quality score computation",
)
default: bool = Field(
default=False,
description="If true, set default: true for all metrics in the list",
)


class SystemConfig(BaseModel):
"""System configuration using individual config models."""

@@ -848,6 +863,11 @@ class SystemConfig(BaseModel):
default_factory=VisualizationConfig, description="Visualization configuration"
)

# Quality score configuration
quality_score: Optional[QualityScoreConfig] = Field(
default=None, description="Quality score configuration"
)

# Default metrics metadata from system config
default_turn_metrics_metadata: dict[str, dict[str, Any]] = Field(
default_factory=dict, description="Default turn metrics metadata"
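With `extra="forbid"`, a typo in the YAML block fails fast at config-load time instead of silently disabling the score. A round-trip sketch, assuming PyYAML is what the config loader uses and that the package's src layout is importable:

```python
import yaml

from lightspeed_evaluation.core.models.system import QualityScoreConfig

block = """
metrics:
  - "ragas:faithfulness"
  - "custom:tool_eval"
default: true
"""

cfg = QualityScoreConfig.model_validate(yaml.safe_load(block))
print(cfg.metrics)  # ['ragas:faithfulness', 'custom:tool_eval']
print(cfg.default)  # True

# extra="forbid" rejects unknown keys, e.g. a misspelled "metric":
try:
    QualityScoreConfig.model_validate({"metric": []})
except Exception as exc:  # pydantic.ValidationError
    print(f"rejected: {exc}")
```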
72 changes: 71 additions & 1 deletion src/lightspeed_evaluation/core/output/generator.py
@@ -22,6 +22,7 @@
EvaluationSummary,
MetricStats,
OverallStats,
    StreamingStats,
    SystemReport,
TagStats,
)
@@ -69,6 +70,14 @@ def generate_reports(
results: List of evaluation results.
evaluation_data: Optional evaluation data for API token calculation.
"""
# Get quality_score_metrics from system config if available
quality_score_metrics = None
if (
self.system_config is not None
and self.system_config.quality_score is not None
):
quality_score_metrics = self.system_config.quality_score.metrics

# Build EvaluationSummary once, use it everywhere.
# CLI path computes confidence intervals by default (when sample size > 1).
summary = EvaluationSummary.from_results(
@@ -77,6 +86,14 @@
compute_confidence_intervals=True,
)

# Generate SystemReport separately if quality score metrics are configured
system_report = None
if quality_score_metrics:
system_report = SystemReport.from_results(
summary.by_metric,
quality_score_metrics,
)

# Prepare timestamped base filename
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
base_filename = f"{self.base_filename}_{timestamp}"
@@ -92,7 +109,7 @@

# Generate individual reports based on configuration
self._generate_individual_reports(
-            results, base_filename, enabled_outputs, summary
+            results, base_filename, enabled_outputs, summary, system_report
)

# Generate graphs if enabled
@@ -161,6 +178,7 @@ def _generate_individual_reports(
base_filename: str,
enabled_outputs: list[str],
summary: EvaluationSummary,
system_report: Optional[SystemReport] = None,
) -> None:
"""Generate reports based on enabled outputs."""
if "csv" in enabled_outputs:
@@ -170,6 +188,12 @@
if "json" in enabled_outputs:
json_file = self._generate_json_summary_from_model(summary, base_filename)
logger.info("JSON: %s", json_file)
# Generate system_report.json if quality score is configured
if system_report is not None:
system_report_file = self._generate_quality_score_report(
system_report, base_filename
)
logger.info("JSON: %s", system_report_file)

if "txt" in enabled_outputs:
txt_file = self._generate_text_summary_from_model(summary, base_filename)
@@ -289,6 +313,52 @@ def _generate_json_summary_from_model(

return json_file

def _generate_quality_score_report(
self,
system_report: SystemReport,
base_filename: str,
target_dir: Optional[Path] = None,
) -> Path:
"""Generate quality score JSON report.

Args:
system_report: The SystemReport model instance.
base_filename: Base filename for the output file.
target_dir: Optional directory override for output file location.

Returns:
            Path to the generated system report JSON file.
"""
out = target_dir if target_dir is not None else self.output_dir
quality_score_file = out / f"{base_filename}_system_report.json"

output = {
"timestamp": datetime.now().isoformat(),
"aggregated_quality_score": system_report.aggregated_quality_score,
"quality_metrics": {
metric_id: {
"mean": stats.mean,
"count": stats.count,
"weight": stats.weight,
}
for metric_id, stats in system_report.quality_metrics.items()
},
"extra_metrics": {
metric_id: {
"mean": stats.mean,
"count": stats.count,
}
for metric_id, stats in system_report.extra_metrics.items()
},
"api_latency": system_report.api_latency,
"api_tokens": system_report.api_tokens,
}

with open(quality_score_file, "w", encoding="utf-8") as f:
json.dump(output, f, indent=2)

return quality_score_file

def _generate_text_summary_from_model(
self,
summary: EvaluationSummary,
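For reference, the emitted `{base_filename}_system_report.json` (the base filename already carries the run timestamp) would look roughly like this, with hypothetical scores; `api_latency` and `api_tokens` stay at their placeholder defaults for now:

```json
{
  "timestamp": "2025-01-15T12:34:56.789012",
  "aggregated_quality_score": 0.82,
  "quality_metrics": {
    "ragas:faithfulness": { "mean": 0.9, "count": 120, "weight": 0.6 },
    "custom:tool_eval": { "mean": 0.7, "count": 80, "weight": 0.4 }
  },
  "extra_metrics": {
    "ragas:response_relevancy": { "mean": 0.88, "count": 120 }
  },
  "api_latency": 0.0,
  "api_tokens": 0
}
```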