From 3f59a580e300eb8ac8552a359b626e34332f270b Mon Sep 17 00:00:00 2001
From: Eva Micankova
Date: Tue, 28 Apr 2026 17:58:03 +0200
Subject: [PATCH] LEADS-349-calculate-aggregated-score-from-key-metrics

---
 config/system.yaml | 10 ++
 .../core/models/summary.py | 110 ++++++++++++++++++
 .../core/models/system.py | 20 ++++
 .../core/output/generator.py | 72 +++++++++++-
 .../core/system/loader.py | 85 +++++++++++++-
 5 files changed, 291 insertions(+), 6 deletions(-)

diff --git a/config/system.yaml b/config/system.yaml
index 6110e503..7187a018 100644
--- a/config/system.yaml
+++ b/config/system.yaml
@@ -104,6 +104,16 @@ api:
 # Legacy authentication (fallback when mcp_headers is not configured or disabled)
 # Authentication via API_KEY environment variable only for MCP server (without Server name)
 
+# Quality Score Configuration
+# Aggregated score from selected metrics for overall system quality assessment
+quality_score:
+  metrics:
+    - "ragas:faithfulness"
+    - "ragas:context_precision_with_reference"
+    - "custom:tool_eval"
+    - "custom:answer_correctness"
+  default: true # If true, all metrics in this list get default: true
+
 # Default metrics metadata
 metrics_metadata:
   # Turn-level metrics metadata
diff --git a/src/lightspeed_evaluation/core/models/summary.py b/src/lightspeed_evaluation/core/models/summary.py
index 34fe7a20..f5db2242 100644
--- a/src/lightspeed_evaluation/core/models/summary.py
+++ b/src/lightspeed_evaluation/core/models/summary.py
@@ -75,6 +75,116 @@ class MetricStats(OverallStats):
     )
 
 
+class QualityScoreStatistics(ScoreStatistics):
+    """Score statistics with weight for quality score calculation."""
+
+    weight: float = Field(
+        default=0.0,
+        description="Weight proportion (sample_size / total_samples) used in weighted average",
+    )
+
+
+class SystemReport(BaseModel):
+    """Aggregated quality score from selected metrics."""
+
+    aggregated_quality_score: float = Field(
+        default=0.0, description="Weighted average of quality score metrics"
+    )
+    quality_metrics: dict[str, QualityScoreStatistics] = Field(
+        default_factory=dict,
+        description="Individual metrics used in quality score calculation",
+    )
+    extra_metrics: dict[str, ScoreStatistics] = Field(
+        default_factory=dict,
+        description="Other evaluated metrics, not used in the quality score calculation",
+    )
+    api_latency: float = Field(
+        default=0.0, description="[Placeholder] Average API response time in seconds"
+    )
+    api_tokens: int = Field(
+        default=0,
+        description="[Placeholder] Total number of tokens consumed across all API calls",
+    )
+
+    @staticmethod
+    def from_results(
+        by_metric: dict[str, MetricStats],
+        quality_score_metrics: list[str],
+    ) -> Optional["SystemReport"]:
+        """Compute aggregated quality score from specified metrics.
+
+        Calculates a weighted average (by sample size) of the mean scores
+        for each specified metric using pre-computed metric statistics.
+
+        Args:
+            by_metric: Dictionary of metric statistics (from _compute_metric_stats).
+            quality_score_metrics: List of metric identifiers to include in quality score.
+
+        Returns:
+            SystemReport instance with aggregated score and individual metric stats,
+            or None if the selected metrics have no samples.
+        """
+        # Validate all quality score metrics exist in by_metric
+        missing_metrics = [m for m in quality_score_metrics if m not in by_metric]
+        if missing_metrics:
+            raise ValueError(
+                f"Quality score metrics not found in evaluation results: {missing_metrics}. "
+                f"Available metrics: {list(by_metric.keys())}"
+            )
+
+        # Calculate total samples from quality score metrics only
+        total_samples = sum(
+            by_metric[metric_id].score_statistics.count
+            for metric_id in quality_score_metrics
+        )
+
+        if total_samples == 0:
+            return None
+
+        quality_metrics: dict[str, QualityScoreStatistics] = {}
+        extra_metrics: dict[str, ScoreStatistics] = {}
+
+        # Separate quality metrics from extra metrics
+        for metric_id in by_metric:
+            if metric_id in quality_score_metrics:
+                score_stats = by_metric[metric_id].score_statistics
+                sample_size = score_stats.count
+                weight = sample_size / total_samples
+
+                quality_metrics[metric_id] = QualityScoreStatistics(
+                    **score_stats.model_dump(),
+                    weight=weight,
+                )
+            else:
+                extra_metrics[metric_id] = by_metric[metric_id].score_statistics
+
+        # Calculate aggregated quality score
+        aggregated_score = SystemReport._calculate_quality_score(quality_metrics)
+
+        return SystemReport(
+            aggregated_quality_score=aggregated_score,
+            quality_metrics=quality_metrics,
+            extra_metrics=extra_metrics,
+        )
+
+    @staticmethod
+    def _calculate_quality_score(
+        quality_metrics: dict[str, QualityScoreStatistics],
+    ) -> float:
+        """Calculate weighted average quality score from quality metrics.
+
+        Args:
+            quality_metrics: Dictionary of quality score statistics with weights.
+
+        Returns:
+            Weighted average quality score.
+        """
+        weighted_sum = 0.0
+        for stats in quality_metrics.values():
+            weighted_sum += stats.mean * stats.weight
+        return weighted_sum
+
+
 class ConversationStats(OverallStats):
     """Statistics for a specific conversation group."""
 
diff --git a/src/lightspeed_evaluation/core/models/system.py b/src/lightspeed_evaluation/core/models/system.py
index 5067d3b2..5e6f596a 100644
--- a/src/lightspeed_evaluation/core/models/system.py
+++ b/src/lightspeed_evaluation/core/models/system.py
@@ -804,6 +804,21 @@ def from_metadata(cls, raw: dict[str, Any]) -> "GEvalConfig":
         return cls.model_validate(data)
 
 
+class QualityScoreConfig(BaseModel):
+    """Quality score configuration."""
+
+    model_config = ConfigDict(extra="forbid")
+
+    metrics: list[str] = Field(
+        default_factory=list,
+        description="List of metric identifiers to use for quality score computation",
+    )
+    default: bool = Field(
+        default=False,
+        description="If true, set default: true for all metrics in the list",
+    )
+
+
 class SystemConfig(BaseModel):
     """System configuration using individual config models."""
 
@@ -848,6 +863,11 @@ class SystemConfig(BaseModel):
         default_factory=VisualizationConfig, description="Visualization configuration"
     )
 
+    # Quality score configuration
+    quality_score: Optional[QualityScoreConfig] = Field(
+        default=None, description="Quality score configuration"
+    )
+
     # Default metrics metadata from system config
     default_turn_metrics_metadata: dict[str, dict[str, Any]] = Field(
         default_factory=dict, description="Default turn metrics metadata"
diff --git a/src/lightspeed_evaluation/core/output/generator.py b/src/lightspeed_evaluation/core/output/generator.py
index 3553ab81..a5e86a47 100644
--- a/src/lightspeed_evaluation/core/output/generator.py
+++ b/src/lightspeed_evaluation/core/output/generator.py
@@ -22,6 +22,7 @@
     EvaluationSummary,
     MetricStats,
     OverallStats,
+    SystemReport,
     StreamingStats,
     TagStats,
 )
@@ -69,6 +70,14 @@ def generate_reports(
             results: List of evaluation results.
            evaluation_data: Optional evaluation data for API token calculation.
        """
+        # Get quality_score_metrics from system config if available
+        quality_score_metrics = None
+        if (
+            self.system_config is not None
+            and self.system_config.quality_score is not None
+        ):
+            quality_score_metrics = self.system_config.quality_score.metrics
+
         # Build EvaluationSummary once, use it everywhere.
         # CLI path computes confidence intervals by default (when sample size > 1).
         summary = EvaluationSummary.from_results(
@@ -77,6 +86,14 @@ def generate_reports(
             compute_confidence_intervals=True,
         )
 
+        # Generate SystemReport separately if quality score metrics are configured
+        system_report = None
+        if quality_score_metrics:
+            system_report = SystemReport.from_results(
+                summary.by_metric,
+                quality_score_metrics,
+            )
+
         # Prepare timestamped base filename
         timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
         base_filename = f"{self.base_filename}_{timestamp}"
@@ -92,7 +109,7 @@
 
         # Generate individual reports based on configuration
         self._generate_individual_reports(
-            results, base_filename, enabled_outputs, summary
+            results, base_filename, enabled_outputs, summary, system_report
         )
 
         # Generate graphs if enabled
@@ -161,6 +178,7 @@ def _generate_individual_reports(
         base_filename: str,
         enabled_outputs: list[str],
         summary: EvaluationSummary,
+        system_report: Optional[SystemReport] = None,
     ) -> None:
         """Generate reports based on enabled outputs."""
         if "csv" in enabled_outputs:
@@ -170,6 +188,12 @@
         if "json" in enabled_outputs:
             json_file = self._generate_json_summary_from_model(summary, base_filename)
             logger.info("JSON: %s", json_file)
+            # Generate system_report.json if quality score is configured
+            if system_report is not None:
+                system_report_file = self._generate_quality_score_report(
+                    system_report, base_filename
+                )
+                logger.info("JSON: %s", system_report_file)
 
         if "txt" in enabled_outputs:
             txt_file = self._generate_text_summary_from_model(summary, base_filename)
@@ -289,6 +313,52 @@ def _generate_json_summary_from_model(
 
         return json_file
 
+    def _generate_quality_score_report(
+        self,
+        system_report: SystemReport,
+        base_filename: str,
+        target_dir: Optional[Path] = None,
+    ) -> Path:
+        """Generate quality score JSON report.
+
+        Args:
+            system_report: The SystemReport model instance.
+            base_filename: Base filename for the output file.
+            target_dir: Optional directory override for output file location.
+
+        Returns:
+            Path to the generated system report JSON file.
+        """
+        out = target_dir if target_dir is not None else self.output_dir
+        quality_score_file = out / f"{base_filename}_system_report.json"
+
+        output = {
+            "timestamp": datetime.now().isoformat(),
+            "aggregated_quality_score": system_report.aggregated_quality_score,
+            "quality_metrics": {
+                metric_id: {
+                    "mean": stats.mean,
+                    "count": stats.count,
+                    "weight": stats.weight,
+                }
+                for metric_id, stats in system_report.quality_metrics.items()
+            },
+            "extra_metrics": {
+                metric_id: {
+                    "mean": stats.mean,
+                    "count": stats.count,
+                }
+                for metric_id, stats in system_report.extra_metrics.items()
+            },
+            "api_latency": system_report.api_latency,
+            "api_tokens": system_report.api_tokens,
+        }
+
+        with open(quality_score_file, "w", encoding="utf-8") as f:
+            json.dump(output, f, indent=2)
+
+        return quality_score_file
+
     def _generate_text_summary_from_model(
         self,
         summary: EvaluationSummary,
diff --git a/src/lightspeed_evaluation/core/system/loader.py b/src/lightspeed_evaluation/core/system/loader.py
index 9b3e055d..90e54c7f 100644
--- a/src/lightspeed_evaluation/core/system/loader.py
+++ b/src/lightspeed_evaluation/core/system/loader.py
@@ -18,6 +18,7 @@
 from lightspeed_evaluation.core.models.system import (
     JudgePanelConfig,
     LLMPoolConfig,
+    QualityScoreConfig,
 )
 from lightspeed_evaluation.core.storage.config import (
     DatabaseBackendConfig,
@@ -121,6 +122,23 @@ def load_system_config(self, config_path: str) -> SystemConfig:
     def _create_system_config(self, config_data: dict[str, Any]) -> SystemConfig:
         """Create SystemConfig object from validated configuration data."""
         metrics_metadata = config_data.get("metrics_metadata", {})
+        quality_score_data = config_data.get("quality_score")
+
+        # Process quality_score defaults before creating SystemConfig
+        turn_level_metadata = metrics_metadata.get("turn_level", {})
+        conversation_level_metadata = metrics_metadata.get("conversation_level", {})
+
+        if quality_score_data:
+            self._process_quality_score_defaults(
+                quality_score_data,
+                turn_level_metadata,
+                conversation_level_metadata,
+            )
+
+        # Parse quality_score config if present
+        quality_score_config = (
+            QualityScoreConfig(**quality_score_data) if quality_score_data else None
+        )
 
         # Parse llm_pool and judge_panel if present (Optional sections)
         llm_pool_data = config_data.get("llm_pool")
@@ -129,7 +147,7 @@
         judge_panel_data = config_data.get("judge_panel")
         judge_panel = JudgePanelConfig(**judge_panel_data) if judge_panel_data else None
 
-        # Parse storage backends with backward compatibility for legacy 'output' section
+        # Parse storage backends with backward compatibility
        storage_data = self._get_storage_config_with_backward_compat(config_data)
         storage_backends = self._parse_storage_config(storage_data)
 
@@ -143,12 +161,69 @@
             visualization=VisualizationConfig(**config_data.get("visualization", {})),
             llm_pool=llm_pool,
             judge_panel=judge_panel,
-            default_turn_metrics_metadata=metrics_metadata.get("turn_level", {}),
-            default_conversation_metrics_metadata=metrics_metadata.get(
-                "conversation_level", {}
-            ),
+            quality_score=quality_score_config,
+            default_turn_metrics_metadata=turn_level_metadata,
+            default_conversation_metrics_metadata=conversation_level_metadata,
         )
 
+    def _process_quality_score_defaults(
+        self,
+        quality_score_config: dict[str, Any],
+        turn_level_metadata: dict[str, dict[str, Any]],
+        conversation_level_metadata: dict[str, dict[str, Any]],
+    ) -> None:
+        """Process quality_score.default to set defaults for quality score metrics.
+
+        If quality_score.default is true, sets default: true for all metrics
+        listed in quality_score.metrics. Raises error if a metric is not defined
+        in turn_level or conversation_level metadata.
+
+        Args:
+            quality_score_config: The quality_score configuration dict
+            turn_level_metadata: Turn-level metrics metadata (modified in-place)
+            conversation_level_metadata: Conversation-level metrics metadata (modified in-place)
+
+        Raises:
+            ConfigurationError: If quality_score.default is true but a metric is not
+                defined in metadata, or if metrics list is empty/missing.
+        """
+        if not quality_score_config:
+            return
+
+        # Check if default flag is set to true
+        default_flag = quality_score_config.get("default", False)
+        if not default_flag:
+            return
+
+        # Get the list of metrics for quality score
+        quality_score_metrics = quality_score_config.get("metrics", [])
+        if not quality_score_metrics:
+            raise ConfigurationError(
+                "quality_score.default is true but quality_score.metrics is empty or missing. "
+                "Please specify at least one metric in quality_score.metrics."
+            )
+
+        # Process each metric
+        for metric_id in quality_score_metrics:
+            # Check if metric exists in turn_level or conversation_level
+            if metric_id in turn_level_metadata:
+                # Set default: true for this metric
+                if not isinstance(turn_level_metadata[metric_id], dict):
+                    turn_level_metadata[metric_id] = {}
+                turn_level_metadata[metric_id]["default"] = True
+            elif metric_id in conversation_level_metadata:
+                # Set default: true for this metric
+                if not isinstance(conversation_level_metadata[metric_id], dict):
+                    conversation_level_metadata[metric_id] = {}
+                conversation_level_metadata[metric_id]["default"] = True
+            else:
+                # Metric not found - raise error
+                raise ConfigurationError(
+                    f"Metric '{metric_id}' is listed in quality_score.metrics but not defined "
+                    f"in metrics_metadata.turn_level or metrics_metadata.conversation_level. "
+                    f"Please add metadata configuration for this metric before using it in quality_score."
+                )
+
     def _get_storage_config_with_backward_compat(
         self, config_data: dict[str, Any]
     ) -> list[dict[str, Any]]:
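
Note on the aggregation: SystemReport.from_results together with _calculate_quality_score computes a sample-size-weighted mean of the per-metric means, i.e. weight_i = count_i / total_count and aggregated_quality_score = sum(mean_i * weight_i). A minimal standalone sketch of that arithmetic, using metric names from the config above but made-up means and counts (not taken from any real run):

    # Weighted-average quality score, mirroring SystemReport.from_results.
    # The stats below are illustrative examples only.
    metric_stats = {
        "ragas:faithfulness": {"mean": 0.90, "count": 40},
        "custom:answer_correctness": {"mean": 0.75, "count": 10},
    }

    total = sum(s["count"] for s in metric_stats.values())  # 50 samples in total
    aggregated = sum(
        s["mean"] * (s["count"] / total)  # weight = count / total_samples
        for s in metric_stats.values()
    )
    print(round(aggregated, 2))  # 0.87  (0.90 * 0.8 + 0.75 * 0.2)

Metrics with more evaluated samples therefore pull the aggregated score harder than sparsely evaluated ones, which is the point of weighting by count instead of averaging the means directly.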
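
When the "json" output is enabled and quality_score is configured, a run writes an additional <base_filename>_<timestamp>_system_report.json next to the regular summary. Its layout follows the output dict built in _generate_quality_score_report; the metric names and values below are illustrative only, reusing the numbers from the sketch above:

    {
      "timestamp": "2026-04-28T18:02:11.512345",
      "aggregated_quality_score": 0.87,
      "quality_metrics": {
        "ragas:faithfulness": {"mean": 0.90, "count": 40, "weight": 0.8},
        "custom:answer_correctness": {"mean": 0.75, "count": 10, "weight": 0.2}
      },
      "extra_metrics": {
        "ragas:response_relevancy": {"mean": 0.81, "count": 25}
      },
      "api_latency": 0.0,
      "api_tokens": 0
    }

api_latency and api_tokens are the placeholder fields from SystemReport and currently always default to 0.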