diff --git a/src/unitxt/dataset.py b/src/unitxt/dataset.py index 94529f42ff..77020f9fbd 100644 --- a/src/unitxt/dataset.py +++ b/src/unitxt/dataset.py @@ -29,6 +29,7 @@ from .inference import __file__ as _ from .instructions import __file__ as _ from .llm_as_judge import __file__ as _ +from .llm_as_judge_base import __file__ as _ from .llm_as_judge_chat_templates import __file__ as _ from .llm_as_judge_constants import __file__ as _ from .llm_as_judge_from_template import __file__ as _ diff --git a/src/unitxt/inference.py b/src/unitxt/inference.py index 87488d1da7..cd65f1ba45 100644 --- a/src/unitxt/inference.py +++ b/src/unitxt/inference.py @@ -1339,7 +1339,10 @@ def _infer( ) -> Union[List[str], List[TextGenerationInferenceOutput]]: return [ self.get_return_object( - self.default_inference_value, instance, return_meta_data + self.default_inference_value, + self.default_inference_value, + instance, + return_meta_data, ) for instance in dataset ] @@ -1351,7 +1354,10 @@ def _infer_log_probs( ) -> Union[List[Dict], List[TextGenerationInferenceOutput]]: return [ self.get_return_object( - self.default_inference_value_logprob, instance, return_meta_data + self.default_inference_value_logprob, + self.default_inference_value_logprob, + instance, + return_meta_data, ) for instance in dataset ] diff --git a/src/unitxt/llm_as_judge.py b/src/unitxt/llm_as_judge.py index 35904d3b37..98b92d59d0 100644 --- a/src/unitxt/llm_as_judge.py +++ b/src/unitxt/llm_as_judge.py @@ -9,6 +9,7 @@ from .inference import ( InferenceEngine, ) +from .llm_as_judge_base import BaseLLMJudge from .llm_as_judge_chat_templates import direct_template_dict, pairwise_template_dict from .llm_as_judge_constants import ( DIRECT_CRITERIA, @@ -44,48 +45,49 @@ ) from .logging_utils import get_logger from .metric_utils import EmptyPrediction -from .metrics import BulkInstanceMetric +from .metrics import MapReduceMetric from .task import Task from .templates import Template logger = get_logger(__name__) -class LLMJudge(BulkInstanceMetric): +class LLMJudge(BaseLLMJudge): """A metric class to evaluate instances using LLM as a Judge. - Evaluations are performed in two steps. First, the LLM is asked to generate an assessment following a CoT approach based on the criteria. Then, the same LLM is asked to select one of the available options. A summary of the general assessment can be generated for easy consumption by end users. + Evaluations are performed in two steps. First, the LLM is asked to generate an assessment + following a CoT approach based on the criteria. Then, the same LLM is asked to select one + of the available options. A summary of the general assessment can be generated for easy + consumption by end users. + + Args: + inference_engine: The engine used for generating predictions in the different evaluation steps. + evaluator_name: The name of the evaluator. It is used for score naming. If not provided, + `self.inference_engine.get_engine_id()` is used. + check_positional_bias: Flag to check for positional bias. Detecting for positional bias + duplicates the amount of inference calls. Defaults to True. + context_fields: Fields to be used as context. If a dict is provided, the keys are used as + the final names in the prompts, while the values are used to access the context variable + values in the `task_data` object. Defaults to ["context"]. + generate_summaries: Flag to generate summaries of the assessments. Defaults to False. + format: The format used for the inference. Defaults to "formats.chat_api" (only allowed value). 
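As a standalone illustration of the two-step flow described above (the prompt texts below are placeholders, not the templates shipped in the unitxt catalog):

.. code-block:: python

    from typing import Dict, List

    def build_assessment_messages(context: str, response: str, criteria_description: str) -> List[Dict[str, str]]:
        # Step 1: ask the judge for a free-text, chain-of-thought assessment.
        return [{
            "role": "user",
            "content": f"Context:\n{context}\n\nResponse:\n{response}\n\n"
                       f"Assess the response against this criterion:\n{criteria_description}",
        }]

    def build_option_selection_messages(
        assessment_messages: List[Dict[str, str]], assessment_text: str, options: List[str]
    ) -> List[Dict[str, str]]:
        # Step 2: the assessment becomes a previous assistant turn, and the judge is
        # asked to commit to exactly one of the predefined options.
        return [
            assessment_messages[0],
            {"role": "assistant", "content": assessment_text},
            {"role": "user", "content": f"Now choose exactly one option: {', '.join(options)}"},
        ]

    messages = build_assessment_messages("Q: capital of France?", "Paris", "Is the answer relevant?")
    print(build_option_selection_messages(messages, "The response answers the question directly.", ["Yes", "No"]))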
+ include_prompts_in_result: Flag to include prompts in the result. Defaults to True. + criteria_field: The field specifying the evaluation criteria in the `task_data` object. + If the `criteria` is provided, it will take precedence. Defaults to None. + criteria: The criteria used for evaluation. Defaults to None. """ inference_engine: InferenceEngine - """The engine used for generating predictions in the different evaluation steps.""" - - evaluator_name: EvaluatorNameEnum = None - """The name of the evaluator. It is used for score naming. If not provided `self.inference_engine.get_engine_id()` is used.""" - + evaluator_name: Optional[EvaluatorNameEnum] = None check_positional_bias: bool = True - """Flag to check for positional bias. Detecting for positional bias duplicates the amount of inference calls.""" - context_fields: Union[str, List[str], Dict[str, str]] = ["context"] - """Fields to be used as context. If a dict is provided, the keys are used as the final names in the prompts, while the values are used to access the context variable values in the `task_data` object (it is recommended to provide the context_fields in the Criteria `context_fields` field as this field will be deprecated in the future).""" - generate_summaries: bool = False - """Flag to generate summaries of the assessments. Defaults to `False`.""" - format: str = "formats.chat_api" - """The format used for the inference. Defaults to `formats.chat_api` (only allowed value).""" - include_prompts_in_result: bool = True - """Flag to include prompts in the result. Defaults to `True`.""" - - criteria_field: str = None - """The field specifying the evaluation criteria in the `task_data` object. If the `criteria` is provided, it will take precedence.""" - + criteria_field: Optional[str] = None criteria: Criteria = None - """The criteria used for evaluation.""" def prepare(self): - """Prepares the `LLMJudge` instance by setting up context fields and evaluator name.""" super().prepare() self.context_fields = self.get_context_fields_as_dict(self.context_fields) @@ -93,11 +95,6 @@ def prepare(self): self.evaluator_name = self.inference_engine.get_engine_id() def before_process_multi_stream(self): - """Checks the criteria-related fields correctness before processing multiple streams. - - Raises: - UnitxtError: If both 'criteria' and 'criteria_field' are not set. - """ super().before_process_multi_stream() # We check the criteria here and not in verify(), because we want catalog # may contain a partially initialized object, and verify() method @@ -119,15 +116,6 @@ def get_context_fields_as_dict(self, context_fields: Union[str, List, Dict]): def get_contexts( self, task_data: List[Dict[str, Any]], criteria: List[Criteria] ) -> List[Dict[str, str]]: - """Extracts and parses context fields from task data. - - Args: - task_data (List[Dict[str, Any]]): The task data containing context information. - criteria ( List[Criteria]): The criteria list from which to take the context fields if they weren't provided in the self.context_fields field - - Returns: - List[Dict[str, str]]: A list of parsed context dictionaries. - """ parsed_contexts = [] for i, td in enumerate(task_data): context_fields_for_td = self.context_fields @@ -153,17 +141,6 @@ def perform_evaluation_step( template: Template, previous_messages: Optional[List[Dict[str, str]]] = None, ): - """Performs an evaluation step by generating predictions for the given instances. - - Args: - instances (list): The list of instances to evaluate. 
- task (Task): The task associated with the instances. - template (Template): The template used for generating predictions. - previous_messages (Optional[List[Dict[str, str]]]): Previous messages for context. - - Returns: - Tuple[List[str], List[str], List[str]]: A tuple containing prompts, raw predictions, and processed predictions. Raw predictions differ from processed predictions only in the completion step, where the processors.match_closest_option is used. - """ outputs_dataset = infer( instances, task=task, @@ -183,14 +160,6 @@ def perform_evaluation_step( return (prompts, raw_predictions, predictions) def clean_results(self, results: Union[dict, list]): - """Cleans the results by removing `None` values and empty lists and dictionaries. - - Args: - results (Union[dict, list]): The results to clean. - - Returns: - Union[dict, list]: The cleaned results. - """ if isinstance(results, list): return [self.clean_results(x) for x in results] cleaned = { @@ -198,7 +167,6 @@ def clean_results(self, results: Union[dict, list]): for k, v in results.items() if v is not None and not (isinstance(v, (list, dict)) and len(v) == 0) } - # Remove the dictionary itself if it becomes empty return { k: v for k, v in cleaned.items() @@ -206,18 +174,6 @@ def clean_results(self, results: Union[dict, list]): } def get_criteria(self, task_data, eval_count) -> List[Criteria]: - """Retrieves the evaluation criteria from the `criteria_field` or from `self`. - - Args: - task_data (List[Dict[str, Any]]): The task data containing criteria information. - eval_count (int): The number of evaluations to perform. - - Returns: - List[Criteria]: A list of criteria for evaluation. - - Raises: - UnitxtError: If the criteria field is not found in the task data. - """ if self.criteria is None: if self.criteria_field not in task_data[0]: raise UnitxtError( @@ -270,22 +226,32 @@ def get_predictions( return predictions + def _prepare_common_instance_data(self, prediction, references, task_data): + criteria = self.get_criteria([task_data], 1)[0] + pred = self.get_predictions([task_data], [criteria], [prediction])[0] + context = self.get_contexts([task_data], [criteria])[0] + + return criteria, pred, context + class LLMJudgeDirect(LLMJudge): - """LLMJudgeDirect is a specialized evaluation metric that performs Direct Assessment using an LLM to score responses based on a predefined evaluation criteria. + """Specialized evaluation metric that performs Direct Assessment using an LLM. Direct Assessment is an evaluation paradigm in which the LLM selects one of a predefined set of options based on an assessment criterion. This approach can be used for Likert-scale scoring (e.g., 1-5) or selecting from semantically conditioned literals (e.g., Yes/No, Pass/Fail). + + Attributes: + criteria: The evaluation criteria, including a name, description, a predefined + set of options and option_map. Defaults to None. + main_score: The primary score name used in the results. By default, it will + take the value of the criteria name (if only one criteria is being used + for evaluation) or "llm_as_judge" otherwise. Defaults to "llm_as_judge". """ criteria: CriteriaWithOptions = None - """The evaluation criteria, including a name, description, a predefined set of options and and option_map.""" main_score = "llm_as_judge" - """The primary score name used in the results. 
By default, it will take the value of the criteria name (if only one criteria is being used for evaluation) or "llm_as_judge" otherwise.""" - reduction_map = {"mean": ["llm_as_judge"]} - """A mapping used for score aggregation. By default, it will take the value of ``{'mean': []}`` .""" def prepare(self): super().prepare() @@ -324,7 +290,6 @@ def prepare(self): ) def before_process_multi_stream(self): - """Ensures that the criteria is of type `CriteriaWithOptions`, raising an exception otherwise.""" super().before_process_multi_stream() if self.criteria is not None and not isinstance( self.criteria, CriteriaWithOptions @@ -335,18 +300,6 @@ def before_process_multi_stream(self): return def __get_parsed_criteria(self, criteria: CriteriaWithOptions): - """Extracts key information from the given criteria. - - Args: - criteria (CriteriaWithOptions): The evaluation criteria. - - Returns: - Tuple[str, List[str], str, str]: - - Criteria description. - - List of option names. - - Formatted instruction for displaying options. - - Instruction for scoring options. - """ criteria_description = criteria.description criteria_option_names = [o.name for o in criteria.options] @@ -367,7 +320,7 @@ def __set_main_score(self, criterias: List[CriteriaWithOptions]): unique_criteria_names = list({criteria.name for criteria in criterias}) if len(unique_criteria_names) == 1 and criterias[0].name != "": self.main_score = "_".join(criterias[0].name.lower().split(" ")) - self.reduction_map = {"mean": [self.main_score]} + self.ci_score_names = ["score"] def __get_results( self, @@ -448,185 +401,129 @@ def __get_results( # add main_score to each result return [ { - f"{self.main_score}_{k}" if k != self.main_score else self.main_score: v + # Special handling for prompts field - should use criteria name instead of main_score + f"{criterias[i].name}_{k}" + if k == "prompts" + else ( + f"{self.main_score}_{k}" + if k != self.main_score + else self.main_score + ): v for k, v in r.items() } - for r in results + for i, r in enumerate(results) ] - def compute( - self, - references: List[List[str]], - predictions: List[str], - task_data: List[Dict[str, Any]], - ) -> List[Dict]: - r"""Performs direct assessment evaluation on the given predictions and references. - - This method evaluates the quality of of the predictions by calculating scores for each instance based on a criterion. - - Returns: - -------- - List[Dict] - A list of dictionaries containing the evaluation results for each instance. The results include the computed scores for each prediction. Each result will have the `score_name` as a prefix, which may be the criterion name if only one used, or "llm_as_judge" if several criteria were used. - - Explanation of fields: - - - `score`: a float representing the evaluation score for the response. The value is calculated from criteria.option_map[selected_option]. - - `using_`: Equal to score. - - `positional_bias`: Boolean indicating whether the assessment detected positional bias. Its final value is selected_option != positional_bias_selected_option - - `selected_option`: The criteria option that the evaluator chose (e.g., "Could be Improved"). It is calculated by processing `option_selection_completion` using `processors.match_closest_option` - - `positional_bias_selected_option`: The criteria option that the evaluator chose when checking positional bias. - - `assessment`: The inference engine's generated text using the `prompts.assessment` prompt. 
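A standalone sketch of how these score fields relate to one another; the option names and the option_map values are illustrative, not a fixed unitxt catalog:

.. code-block:: python

    from typing import Dict, Optional

    def score_direct_selection(
        option_map: Dict[str, float],
        selected_option: str,
        positional_bias_selected_option: Optional[str] = None,
    ) -> Dict[str, object]:
        # The numeric score is looked up from the criteria's option_map.
        result = {"score": option_map[selected_option], "selected_option": selected_option}
        if positional_bias_selected_option is not None:
            # Positional bias is flagged when reversing the option order changes the verdict.
            result["positional_bias"] = selected_option != positional_bias_selected_option
            result["positional_bias_selected_option"] = positional_bias_selected_option
        return result

    option_map = {"Excellent": 1.0, "Could be Improved": 0.5, "Bad": 0.0}
    print(score_direct_selection(option_map, "Could be Improved", "Could be Improved"))
    # {'score': 0.5, 'selected_option': 'Could be Improved', 'positional_bias': False, ...}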
- - `positional_bias_assessment`: The inference engine's generated text using the `prompts.positional_bias_assessment` prompt. - - `summary`: An LLM-generated summary of the assessment. - - `positional_bias_summary`: A LLM-generated summary of the positional bias assessment. - - `prompts`: A dictionary of prompts used in different stages of evaluation. - - `assessment`: The prompt used to instruct the model on how to assess the response. - - `positional_bias_assessment`: The prompt used to instruct the model on how to assess the response in the positional bias check. - - `summarization`: The prompt used to generate summary of the assessment. - - `option_selection`: The prompt used to generate a final judgement. - - `positional_bias_option_selection`: The prompt used to generate a final judgement in the positional bias check. - - `option_selection_completion`: The inference engine's generated text using `prompts.option_selection`. - - `positional_bias_option_selection_completion`: The inference engine's generated text using `prompts.positional_bias_option_selection`. - - `criteria`: A JSON-like string representing the evaluation criteria's artifact. - - Result example: - - .. code-block:: python - - [ - { - "answer_relevance": 1, - "answer_relevance_using_granite3.0-2b_litellm": 1, - "answer_relevance_positional_bias": false, - "answer_relevance_selected_option": "Could be Improved", - "answer_relevance_positional_bias_selected_option": "Could be Improved", - "answer_relevance_assessment": "To assess the quality of the response, l...", - "answer_relevance_positional_bias_assessment": "To assess the quality of the response, l...", - "answer_relevance_summary": "A response about apprenticeships during ...", - "answer_relevance_positional_bias_summary": "A response about apprenticeships during ...", - "answer_relevance_prompts": { - "assessment": [ - { - "role": "user", - "content": "You are presented with a response gener..." - } - ], - "positional_bias_assessment": [ - { - "role": "user", - "content": "You are presented with a response gener..." - } - ], - "summarization": [ - { - "role": "user", - "content": "Transform the following assessment into ..." - } - ], - "option_selection": [ - { - "content": "You are presented with a response gener...", - "role": "user" - }, - { - "content": "To assess the quality of the response, l...", - "role": "assistant" - }, - { - "content": "Now consider the evaluation criteria and...", - "role": "user" - } - ], - "posional_bias_option_selection": [ - { - "content": "You are presented with a response gener...", - "role": "user" - }, - { - "content": "To assess the quality of the response, l...", - "role": "assistant" - }, - { - "content": "Now consider the evaluation criteria and...", - "role": "user" - } - ] - }, - "answer_relevance_option_selection_completion": "Could be Improved", - "answer_relevance_positional_bias_option_selection_completion": "Could be Improved", - "answer_relevance_criteria": "{ \"__type__\": \"criteria_with_options..." 
- } - ] - """ - logger.info( - f'Starting evaluation with evaluator "{self.evaluator_name}" and provider "{self.inference_engine.get_pretty_print_name()}' + def _prepare_instance_for_inference(self, prediction, references, task_data): + criteria, pred, context = self._prepare_common_instance_data( + prediction, references, task_data ) - evaluations_count = len(task_data) - # TODO: find out how to serialize and deserialize enums - criteria_list = self.get_criteria(task_data, evaluations_count) - predictions = self.get_predictions(task_data, criteria_list, predictions) - contexts = self.get_contexts(task_data, criteria_list) - self.__set_main_score(criteria_list) - if self.check_positional_bias: - criteria_list += [ - CriteriaWithOptions( - name=criteria.name, - description=criteria.description, - option_map=criteria.option_map, - options=list(reversed(criteria.options)), - ) - for criteria in criteria_list - ] - contexts += contexts - predictions += predictions + ( + criteria_description, + criteria_option_names, + display_options_instruction, + ) = self.__get_parsed_criteria(criteria) - parsed_criterias = [ - self.__get_parsed_criteria(criteria) for criteria in criteria_list - ] + return { + "prediction": pred, + "context": context, + "criteria": criteria, + "criteria_description": criteria_description, + "criteria_option_names": criteria_option_names, + "display_options_instruction": display_options_instruction, + } - ( - criteria_description_list, - criteria_option_names_list, - display_options_instruction_list, - ) = zip(*parsed_criterias) + def _run_inference_on_all(self, prepared_instances): + if prepared_instances: + criteria_list = [prepared_instances[0]["criteria"]] + self.__set_main_score(criteria_list) - assessment_for_summaries_slice = slice(0, evaluations_count) + # Prepare all assessment instances + assessment_instances = [] + instance_metadata = [] - assessment_instances = [ - { - "context_variables": context, - "response": prediction, - "display_options_instruction": display_options_instruction, - "criteria_description": criteria_description, - "data_classification_policy": ["public"], - } - for context, prediction, criteria_description, display_options_instruction in zip( - contexts, - predictions, - criteria_description_list, - display_options_instruction_list, + for i, prep in enumerate(prepared_instances): + # Store metadata for later use + instance_metadata.append( + { + "criteria": prep["criteria"], + "criteria_description": prep["criteria_description"], + "criteria_option_names": prep["criteria_option_names"], + "display_options_instruction": prep["display_options_instruction"], + "original_index": i, + } ) - ] + + # Create assessment instance + assessment_instances.append( + { + "context_variables": prep["context"], + "response": prep["prediction"], + "display_options_instruction": prep["display_options_instruction"], + "criteria_description": prep["criteria_description"], + "data_classification_policy": ["public"], + } + ) + + # If checking positional bias, add reversed version + if self.check_positional_bias: + reversed_criteria = CriteriaWithOptions( + name=prep["criteria"].name, + description=prep["criteria"].description, + option_map=prep["criteria"].option_map, + options=list(reversed(prep["criteria"].options)), + ) + ( + rev_criteria_description, + rev_criteria_option_names, + rev_display_options_instruction, + ) = self.__get_parsed_criteria(reversed_criteria) + + # Store reversed metadata + instance_metadata.append( + { + "criteria": reversed_criteria, + 
"criteria_description": rev_criteria_description, + "criteria_option_names": rev_criteria_option_names, + "display_options_instruction": rev_display_options_instruction, + "original_index": i, + "is_positional_bias": True, + } + ) + + # Add reversed assessment instance + assessment_instances.append( + { + "context_variables": prep["context"], + "response": prep["prediction"], + "display_options_instruction": rev_display_options_instruction, + "criteria_description": rev_criteria_description, + "data_classification_policy": ["public"], + } + ) + + # Perform assessment step on all instances at once assessment_prompts, assessment_outputs, _ = self.perform_evaluation_step( assessment_instances, self.assessment_task, self.assessment_template ) logger.info("The assessment was generated successfully.") + # Summarization step (if enabled) summarization_prompts = None summarization_outputs = None if self.generate_summaries: - # Summarisation Stage + evaluations_count = len(prepared_instances) summarization_instances = [ { "assessment": assessment_output, "data_classification_policy": ["public"], } for assessment_output in assessment_outputs[ - assessment_for_summaries_slice - ] + :evaluations_count + ] # Only original assessments, not positional bias ] + ( summarization_prompts, summarization_outputs, @@ -638,23 +535,19 @@ def compute( ) logger.info("The summary was generated successfully.") - option_selection_instances = [ - { - "criteria_description": criteria_description, - "display_options_instruction": display_options_instruction, - "options": criteria_option_names, - "data_classification_policy": ["public"], - } - for ( - criteria_description, - display_options_instruction, - criteria_option_names, - ) in zip( - criteria_description_list, - display_options_instruction_list, - criteria_option_names_list, + # Option selection step + option_selection_instances = [] + for metadata in instance_metadata: + option_selection_instances.append( + { + "criteria_description": metadata["criteria_description"], + "display_options_instruction": metadata[ + "display_options_instruction" + ], + "options": metadata["criteria_option_names"], + "data_classification_policy": ["public"], + } ) - ] previous_messages = [ [assessment_prompt[0], {"role": "assistant", "content": assessment_output}] @@ -662,6 +555,7 @@ def compute( assessment_prompts, assessment_outputs ) ] + ( option_selection_prompts, option_selection_outputs, @@ -674,6 +568,10 @@ def compute( ) logger.info("The selections were calculated successfully.") + # Process results for each original instance + evaluations_count = len(prepared_instances) + criteria_list = [meta["criteria"] for meta in instance_metadata] + results = self.__get_results( assessment_prompts, assessment_outputs, @@ -690,15 +588,20 @@ def compute( class LLMJudgePairwise(LLMJudge): - """A judge for pairwise comparison evaluations, where two or more responses are compared to determine which one is preferred based on a criterion.""" + """Judge for pairwise comparison evaluations using an LLM. + + This class performs pairwise comparison evaluations, where two or more responses + are compared to determine which one is preferred based on a criterion. It computes + win rates and rankings for different systems or responses. + + Attributes: + main_score: The main score metric for pairwise evaluation. By default, its value + is "1_winrate", representing the win rate of the first system. Defaults to "1_winrate". + """ main_score = "1_winrate" - """The main score metric for pairwise evaluation. 
By default, its value is `1_winrate`, and will take the value of the winrate of the first system.""" - reduction_map = {"mean": ["score"]} - """A mapping specifying how scores should be reduced. By default, it will be ``{'main': ['score']}`` .""" def prepare(self): - """Prepares the pairwise comparison by initializing the necessary templates and tasks. These tasks will be used to assess, summarize, and select options from candidate responses.""" super().prepare() self.assessment_template = pairwise_template_dict["assessment"] self.summarization_template = pairwise_template_dict["summarization"] @@ -737,7 +640,6 @@ def prepare(self): ) def before_process_multi_stream(self): - """Verifies that the criteria is of the correct type before processing the multi-stream data.""" super().before_process_multi_stream() if self.criteria is not None and not isinstance(self.criteria, Criteria): raise Exception( @@ -759,24 +661,6 @@ def __get_instance_results( combination_indexes, criterion: Criteria, ): - """Computes the results for each instance by comparing the responses and calculating metrics such as winrate, ranking, and the responses overall performance. This method processes assessment, summarization, and option selection outputs to track contest results, positional bias, and winrate. - - Args: - instance_predictions (Dict[str, str]): The predictions for each response. - assessment_prompts (List[str]): The prompts for the assessment task. - assessment_outputs (List[str]): The results from the assessment task. - summarization_prompts (List[str]): The prompts for the summarization task. - summarization_outputs (List[str]): The results from the summarization task. - option_selection_prompts (List[str]): The prompts for the option selection task. - option_selection_outputs (List[str]): The results from the option selection task. - selections (List[str]): The selections made during the pairwise comparison. - contests_count (int): The total number of contests that were run. - combination_indexes (List[Tuple[int, int]]): The indexes of the response pairs that were compared. - criterion (Criteria): The criterion used to assess the responses. - - Returns: - dict: A dictionary containing the results for each response, including winrate, ranking, and other metrics. - """ response_names = list(instance_predictions.keys()) per_response_results = { response_key: { @@ -939,14 +823,6 @@ def __get_instance_results( return self.clean_results(all_results) def __parse_prediction_to_dict(self, predictions: Union[Dict[str, str], List[str]]): - """Converts a list or dictionary of predictions into a dictionary format. - - Args: - predictions (Union[Dict[str, str], List[str]]): The prediction data to convert. - - Returns: - dict: The prediction data in dictionary format. - """ if isinstance(predictions, list): return {f"{key + 1}": value for key, value in enumerate(predictions)} if isinstance(predictions, dict): @@ -955,583 +831,65 @@ def __parse_prediction_to_dict(self, predictions: Union[Dict[str, str], List[str f"Prediction may be a list or a dict. Instead got type {type(predictions)}" ) - def __convert_predictions_to_dicts( - self, predictions: Union[List[Dict[str, str]], List[str]] - ): - """Converts a list of predictions into a list of dictionaries. - - Args: - predictions (Union[List[Dict[str, str]], List[str]]): The predictions to convert. - - Returns: - List[dict]: A list of predictions in dictionary format. 
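A standalone sketch of this normalization, with illustrative system outputs:

.. code-block:: python

    from typing import Dict, List, Union

    def parse_prediction_to_dict(predictions: Union[Dict[str, str], List[str]]) -> Dict[str, str]:
        # Lists become systems named "1", "2", ...; dicts keep their own system names.
        if isinstance(predictions, list):
            return {f"{i + 1}": value for i, value in enumerate(predictions)}
        if isinstance(predictions, dict):
            return predictions
        raise ValueError(f"Prediction may be a list or a dict. Instead got type {type(predictions)}")

    print(parse_prediction_to_dict(["answer A", "answer B"]))
    # {'1': 'answer A', '2': 'answer B'}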
- """ - return [ - self.__parse_prediction_to_dict(prediction) for prediction in predictions - ] - def __set_main_score(self, predictions: List[Dict[str, str]]): self.main_score = f"{next(iter(predictions[0].keys()))}_winrate" - def compute( - self, - references: List[List[str]], - predictions: List[str], - task_data: List[Dict[str, str]], - ) -> List[Dict]: - r"""Executes the pairwise comparison evaluation, including assessment, summarization, and option selection. It computes the winrate and ranking for the responses. - - Args: - references (List[List[str]]): A list of reference responses for comparison. - predictions (List[str]): A list of predicted responses. - task_data (List[Dict[str, str]]): Task data to be used for evaluation. - - Returns: - -------- - List[Dict[str,Dict]] - The results of the evaluation, including winrate, ranking, and other metrics. - - For each instance result, the following metrics are included per response/system. Each of the metrics will have appended the systems name, if predictions were provided as a list of dicts, or their index, starting from 1, if predictions were provided as a list of lists. - - All the fields are arrays with length equal to `len(systems) - 1`. For any result at index `i`: `response_name[i]`'s contest against `compared_to[i]`'s result is `contest_results[i]`. - - Explanation of fields: - - - `summaries`: A list of LLM-generated summaries explaining the comparison results for each response. - - `contest_results`: A list of boolean values indicating whether the response won in each comparison. - - `selections`: A list of the selected system names, representing the preferred response in each comparison. - - `compared_to`: A list of system names that were compared against the given response. - - `assessments`: A list of LLM-generated assessments explaining the reasoning behind the evaluation results. - - `positional_bias_assessments`: A list of LLM-generated assessments focused on detecting positional bias in the evaluation. - - `option_selection_outputs`: A list of response names selected as the best choice based on the evaluation. - - `positional_bias`: A list of boolean values indicating whether positional bias was detected in the contest. - - `positional_bias_selection`: A list of response names representing the selected option when considering positional bias. - - `prompts`: A dictionary of prompts used in different stages of evaluation. - - `assessment`: The prompt used to instruct the model on how to assess the responses. - - `positional_bias_assessment`: The prompt used to instruct the model on how to assess positional bias. - - `option_selection`: The prompt used to guide the model in selecting the best response. - - `positional_bias_option_selection`: The prompt used for selecting the best response while checking for positional bias. - - `summary`: The prompt used to generate a summary of the assessment. - - `winrate`: A float representing the proportion of comparisons the response won. - - `llm_as_judge`: Equal to `winrate`. - - `ranking`: An integer representing the ranking position of the response based on the evaluation results. Best is 1. - - `response_name`: A string identifying the response in the evaluation. - - Result example: - - .. 
code-block:: python - - [ - { - "system1_contest_results": [ - true, - true - ], - "system1_selections": [ - "system1", - "system1" - ], - "system1_compared_to": [ - "system2", - "system3" - ], - "system1_assessments": [ - "To determine the better response accordi...", - "To determine the better response accordi..." - ], - "system1_positional_bias_assessments": [ - "To determine the better response accordi...", - "To determine the better response accordi..." - ], - "system1_option_selection_outputs": [ - "system1", - "system1" - ], - "system1_positional_bias": [ - false, - false - ], - "system1_positional_bias_selection": [ - "system1", - "system1" - ], - "system1_prompts": { - "assessment": [ - [ - { - "role": "user", - "content": "You are provided a pair of responses (Re..." - } - ], - [ - { - "role": "user", - "content": "You are provided a pair of responses (Re..." - } - ] - ], - "positional_bias_assessment": [ - [ - { - "role": "user", - "content": "You are provided a pair of responses (Re..." - } - ], - [ - { - "role": "user", - "content": "You are provided a pair of responses (Re..." - } - ] - ], - "option_selection": [ - [ - { - "content": "You are provided a pair of responses (Re...", - "role": "user" - }, - { - "content": "To determine the better response accordi...", - "role": "assistant" - }, - { - "content": "Now considering the evaluation criteria,...", - "role": "user" - } - ], - [ - { - "content": "You are provided a pair of responses (Re...", - "role": "user" - }, - { - "content": "To determine the better response accordi...", - "role": "assistant" - }, - { - "content": "Now considering the evaluation criteria,...", - "role": "user" - } - ] - ], - "positional_bias_option_selection": [ - [ - { - "content": "You are provided a pair of responses (Re...", - "role": "user" - }, - { - "content": "To determine the better response accordi...", - "role": "assistant" - }, - { - "content": "Now considering the evaluation criteria,...", - "role": "user" - } - ], - [ - { - "content": "You are provided a pair of responses (Re...", - "role": "user" - }, - { - "content": "To determine the better response accordi...", - "role": "assistant" - }, - { - "content": "Now considering the evaluation criteria,...", - "role": "user" - } - ] - ] - }, - "system1_winrate": 1.0, - "system1_llm_as_judge": 1.0, - "system1_ranking": 1, - "system1_response_name": "system1", - "system2_contest_results": [ - false, - true - ], - "system2_selections": [ - "system1", - "system2" - ], - "system2_compared_to": [ - "system1", - "system3" - ], - "system2_assessments": [ - "To determine the better response accordi...", - "To determine the better response accordi..." - ], - "system2_positional_bias_assessments": [ - "To determine the better response accordi...", - "To determine the better response accordi..." - ], - "system2_option_selection_outputs": [ - "system1", - "system2" - ], - "system2_positional_bias": [ - false, - false - ], - "system2_positional_bias_selection": [ - "system1", - "system2" - ], - "system2_prompts": { - "assessment": [ - [ - { - "role": "user", - "content": "You are provided a pair of responses (Re..." - } - ], - [ - { - "role": "user", - "content": "You are provided a pair of responses (Re..." - } - ] - ], - "positional_bias_assessment": [ - [ - { - "role": "user", - "content": "You are provided a pair of responses (Re..." - } - ], - [ - { - "role": "user", - "content": "You are provided a pair of responses (Re..." 
- } - ] - ], - "option_selection": [ - [ - { - "content": "You are provided a pair of responses (Re...", - "role": "user" - }, - { - "content": "To determine the better response accordi...", - "role": "assistant" - }, - { - "content": "Now considering the evaluation criteria,...", - "role": "user" - } - ], - [ - { - "content": "You are provided a pair of responses (Re...", - "role": "user" - }, - { - "content": "To determine the better response accordi...", - "role": "assistant" - }, - { - "content": "Now considering the evaluation criteria,...", - "role": "user" - } - ] - ], - "positional_bias_option_selection": [ - [ - { - "content": "You are provided a pair of responses (Re...", - "role": "user" - }, - { - "content": "To determine the better response accordi...", - "role": "assistant" - }, - { - "content": "Now considering the evaluation criteria,...", - "role": "user" - } - ], - [ - { - "content": "You are provided a pair of responses (Re...", - "role": "user" - }, - { - "content": "To determine the better response accordi...", - "role": "assistant" - }, - { - "content": "Now considering the evaluation criteria,...", - "role": "user" - } - ] - ] - }, - "system2_winrate": 0.5, - "system2_llm_as_judge": 0.5, - "system2_ranking": 2, - "system2_response_name": "system2", - "system3_contest_results": [ - false, - false - ], - "system3_selections": [ - "system1", - "system2" - ], - "system3_compared_to": [ - "system1", - "system2" - ], - "system3_assessments": [ - "To determine the better response accordi...", - "To determine the better response accordi..." - ], - "system3_positional_bias_assessments": [ - "To determine the better response accordi...", - "To determine the better response accordi..." - ], - "system3_option_selection_outputs": [ - "system1", - "system2" - ], - "system3_positional_bias": [ - false, - false - ], - "system3_positional_bias_selection": [ - "system1", - "system2" - ], - "system3_prompts": { - "assessment": [ - [ - { - "role": "user", - "content": "You are provided a pair of responses (Re..." - } - ], - [ - { - "role": "user", - "content": "You are provided a pair of responses (Re..." - } - ] - ], - "positional_bias_assessment": [ - [ - { - "role": "user", - "content": "You are provided a pair of responses (Re..." - } - ], - [ - { - "role": "user", - "content": "You are provided a pair of responses (Re..." 
- } - ] - ], - "option_selection": [ - [ - { - "content": "You are provided a pair of responses (Re...", - "role": "user" - }, - { - "content": "To determine the better response accordi...", - "role": "assistant" - }, - { - "content": "Now considering the evaluation criteria,...", - "role": "user" - } - ], - [ - { - "content": "You are provided a pair of responses (Re...", - "role": "user" - }, - { - "content": "To determine the better response accordi...", - "role": "assistant" - }, - { - "content": "Now considering the evaluation criteria,...", - "role": "user" - } - ] - ], - "positional_bias_option_selection": [ - [ - { - "content": "You are provided a pair of responses (Re...", - "role": "user" - }, - { - "content": "To determine the better response accordi...", - "role": "assistant" - }, - { - "content": "Now considering the evaluation criteria,...", - "role": "user" - } - ], - [ - { - "content": "You are provided a pair of responses (Re...", - "role": "user" - }, - { - "content": "To determine the better response accordi...", - "role": "assistant" - }, - { - "content": "Now considering the evaluation criteria,...", - "role": "user" - } - ] - ] - }, - "system3_winrate": 0.0, - "system3_llm_as_judge": 0.0, - "system3_ranking": 3, - "system3_response_name": "system3", - "criteria": "{ \"__type__\": \"criteria\", \"name\"..." - } - ] - """ - logger.info( - f'Starting evaluation with evaluator "{self.evaluator_name}" and provider {self.inference_engine.get_pretty_print_name()}' + def _prepare_instance_for_inference(self, prediction, references, task_data): + criteria, pred, context = self._prepare_common_instance_data( + prediction, references, task_data ) - instances_count = len(predictions) - criteria_list = self.get_criteria(task_data, instances_count) - contexts = self.get_contexts(task_data, criteria_list) - predictions = self.get_predictions(task_data, criteria_list, predictions) - predictions = self.__convert_predictions_to_dicts(predictions) - self.__set_main_score(predictions) - self.reduction_map = {"mean": ["score"]} - self.reduction_map["mean"].extend( - [f"{key}_winrate" for key in predictions[0].keys()] - ) + pred_dict = self.__parse_prediction_to_dict(pred) - predictions_count_list = [len(prediction) for prediction in predictions] - combination_indexes_list = [ - list(itertools.combinations(range(evaluations_count), 2)) - for evaluations_count in predictions_count_list - ] - contests_count_list = [ - len(combination_indexes) for combination_indexes in combination_indexes_list - ] + return { + "prediction_dict": pred_dict, + "context": context, + "criteria": criteria, + "task_data": task_data, + } - logger.info( - f"The evaluation will perform {sum(contests_count_list) * [1, 2][self.check_positional_bias]} ({' + '.join([f'{c * [1, 2][self.check_positional_bias]}' for c in contests_count_list])}) pairwise comparisons" - ) + def _run_inference_on_all(self, prepared_instances): + if not prepared_instances: + return [] - response_pairs_list: List[List[List[str]]] = [] - option_pairs_list: List[List[List[str]]] = [] - predictions_names = set(predictions[0].keys()) - for i, combination_indexes in enumerate(combination_indexes_list): - instance_predictions = predictions[i] - instance_predictions_names = list(instance_predictions.keys()) - if set(instance_predictions_names) != predictions_names: - raise Exception( - f"The set of prediction names is different between instance 0 and instance {i}. In prediction 0, it is {sorted(predictions_names)}. 
In prediction {i}, it is {sorted(instance_predictions_names)}. Make sure the same number of predictions is passed for all instances." - ) + # Set main score based on first instance + first_pred_dict = prepared_instances[0]["prediction_dict"] + self.__set_main_score([first_pred_dict]) + self._system_keys = list(first_pred_dict.keys()) - response_pairs: List[List[str]] = [] - option_pairs: List[List[str]] = [] - for combination in combination_indexes: - (idx_1, idx_2) = combination - response_name_1 = instance_predictions_names[idx_1] - response_name_2 = instance_predictions_names[idx_2] - response_pairs.append( - [ - instance_predictions[response_name_1], - instance_predictions[response_name_2], - ] - ) - option_pairs.append([response_name_1, response_name_2]) - response_pairs_list.append(response_pairs) - option_pairs_list.append(option_pairs) + # Prepare all assessment instances without keeping heavy data aggregated + all_assessment_instances = [] + all_instance_metadata = [] - if self.check_positional_bias: - criteria_list.extend(criteria_list) - contexts.extend(contexts) - for response_pairs, option_pairs in zip( - response_pairs_list, option_pairs_list - ): - response_pairs += [ - list(reversed(response_pair)) for response_pair in response_pairs - ] - option_pairs += [ - list(reversed(option_pair)) for option_pair in option_pairs - ] + for prep_instance in prepared_instances: + ( + assessment_instances, + instance_metadata, + ) = self._prepare_assessment_instances(prep_instance) + all_assessment_instances.extend(assessment_instances) + all_instance_metadata.append(instance_metadata) - assessment_instances = [ - { - "context_variables": contexts[i], - "response_a": response_pair[0], - "response_b": response_pair[1], - "option_a": option_pair[0], - "option_b": option_pair[1], - "criteria_name": criteria_list[i].name, - "criteria_description": criteria_list[i].description, - "data_classification_policy": ["public"], - } - for i, (response_pairs, option_pairs) in enumerate( - zip(response_pairs_list, option_pairs_list) - ) - for response_pair, option_pair in zip(response_pairs, option_pairs) - ] + # Perform assessment step on all instances at once assessment_prompts, assessment_outputs, _ = self.perform_evaluation_step( - assessment_instances, self.assessment_task, self.assessment_template + all_assessment_instances, self.assessment_task, self.assessment_template ) logger.info("The assessment was generated successfully.") - # the slices used to get the assessment for each summary generation instance - # it will grab the whole assessment for a particular instance or half of it depending on the value of check_positional_bias - incremental_contests_count_list = [ - sum(contests_count_list[: i + 1]) for i in range(len(contests_count_list)) - ] - - # Summarisation Stage + # Summarization step (if enabled) summarization_prompts = None summarization_outputs = None if self.generate_summaries: - incremental_contests_count_with_positional_bias_list = [ - incremental_contests_count * [1, 2][self.check_positional_bias] - for incremental_contests_count in incremental_contests_count_list - ] - assessment_for_summaries_slice_list = [ - slice( - incremental_contests_count_with_positional_bias_list[i - 1] - if i > 0 - else 0, - ( - incremental_contests_count_with_positional_bias_list[i - 1] - if i > 0 - else 0 + summarization_instances = [] + for metadata in all_instance_metadata: + for i in range(metadata["contests_count"]): + summarization_instances.append( + { + "assessment": assessment_outputs[ + 
metadata["assessment_start_idx"] + i + ], + "data_classification_policy": ["public"], + } ) - + contests_count_list[i], - ) - for i in range(len(contests_count_list)) - ] - summarization_instances = [ - { - "assessment": assessment_output, - "data_classification_policy": ["public"], - } - for assessment_for_summaries_slice in assessment_for_summaries_slice_list - for assessment_output in assessment_outputs[ - assessment_for_summaries_slice - ] - ] ( summarization_prompts, @@ -1544,32 +902,21 @@ def compute( ) logger.info("The summary was generated successfully.") - score_option_instruction_list = [ - "".join( - [ - f'Choose "{option}" if Response {option} is better quality.\n' - for option in option_pair - ] - ) - for option_pairs in option_pairs_list - for option_pair in option_pairs - ] - - option_selection_instances = [ - { - "options": [f"Response {option}" for option in option_pair], - "score_option_instruction": score_option_instruction, - "data_classification_policy": ["public"], - } - for option_pair, score_option_instruction in zip( - [ - option_pair - for option_pairs in option_pairs_list - for option_pair in option_pairs - ], - score_option_instruction_list, + # Option selection step + option_selection_instances = [] + for assessment_instance in all_assessment_instances: + option_selection_instances.append( + { + "options": [ + f"Response {option}" + for option in assessment_instance["option_pair"] + ], + "score_option_instruction": assessment_instance[ + "score_option_instruction" + ], + "data_classification_policy": ["public"], + } ) - ] previous_messages = [ [assessment_prompt[0], {"role": "assistant", "content": assessment_output}] @@ -1588,23 +935,29 @@ def compute( self.option_selection_template, previous_messages, ) - # Selections are of the form 'Response n', so we just keep n selections = [selection.split(" ")[-1] for selection in selections] logger.info("The selections were calculated successfully.") + + # Process results for each instance results = [] - slice_start = 0 - for i, incremental_contests_count in enumerate(incremental_contests_count_list): - slice_end = slice_start + contests_count_list[i] + assessment_idx = 0 + for prep_instance, metadata in zip(prepared_instances, all_instance_metadata): + contests_count = metadata["contests_count"] + combination_indexes = metadata["combination_indexes"] + + slice_end = assessment_idx + contests_count if self.check_positional_bias: - slice_end += contests_count_list[i] - sli = slice(slice_start, slice_end) - sli_summarization = slice( - (incremental_contests_count_list[i - 1] if i > 0 else 0), - (incremental_contests_count_list[i - 1] if i > 0 else 0) - + incremental_contests_count, + slice_end += contests_count + + sli = slice(assessment_idx, slice_end) + sli_summarization = ( + slice(assessment_idx, assessment_idx + contests_count) + if self.generate_summaries + else None ) + instance_results = self.__get_instance_results( - predictions[i], + prep_instance["prediction_dict"], assessment_prompts[sli], assessment_outputs[sli], summarization_prompts[sli_summarization] @@ -1616,11 +969,147 @@ def compute( option_selection_prompts[sli], option_selection_outputs[sli], selections[sli], - contests_count_list[i], - combination_indexes_list[i], - criteria_list[i], + contests_count, + combination_indexes, + prep_instance["criteria"], ) results.append(instance_results) - slice_start = slice_end + assessment_idx = slice_end return results + + def _prepare_assessment_instances(self, prep_instance): + pred_dict = 
prep_instance["prediction_dict"] + context = prep_instance["context"] + criteria = prep_instance["criteria"] + + # Calculate combinations and prepare pairs + prediction_names = list(pred_dict.keys()) + combination_indexes = list( + itertools.combinations(range(len(prediction_names)), 2) + ) + contests_count = len(combination_indexes) + + # Prepare response pairs and option pairs + response_pairs = [] + option_pairs = [] + for combination in combination_indexes: + (idx_1, idx_2) = combination + response_name_1 = prediction_names[idx_1] + response_name_2 = prediction_names[idx_2] + response_pairs.append( + [pred_dict[response_name_1], pred_dict[response_name_2]] + ) + option_pairs.append([response_name_1, response_name_2]) + + # If checking positional bias, add reversed pairs + if self.check_positional_bias: + response_pairs += [list(reversed(pair)) for pair in response_pairs] + option_pairs += [list(reversed(pair)) for pair in option_pairs] + + # Create assessment instances + assessment_instances = [] + for response_pair, option_pair in zip(response_pairs, option_pairs): + score_option_instruction = "".join( + [ + f'Choose "{option}" if Response {option} is better quality.\n' + for option in option_pair + ] + ) + + assessment_instances.append( + { + "context_variables": context, + "response_a": response_pair[0], + "response_b": response_pair[1], + "option_a": option_pair[0], + "option_b": option_pair[1], + "criteria_name": criteria.name, + "criteria_description": criteria.description, + "data_classification_policy": ["public"], + "option_pair": option_pair, # Store for later use + "score_option_instruction": score_option_instruction, + } + ) + + metadata = { + "contests_count": contests_count, + "combination_indexes": combination_indexes, + "assessment_start_idx": 0, # Will be set correctly when called + } + + return assessment_instances, metadata + + def reduce(self, intermediates: List[Dict[str, Any]]) -> Dict[str, Any]: + if not intermediates: + return {} + + aggregated = {} + + fields_to_aggregate = ["score"] + if hasattr(self, "_system_keys"): + fields_to_aggregate.extend([f"{key}_winrate" for key in self._system_keys]) + + for field_name in fields_to_aggregate: + values = [] + for result in intermediates: + if field_name in result and isinstance( + result[field_name], (int, float) + ): + values.append(result[field_name]) + + if values: + aggregated[field_name] = sum(values) / len(values) + + if ( + hasattr(self, "main_score") + and self.main_score + and self.main_score in aggregated + ): + aggregated["score"] = aggregated[self.main_score] + aggregated["score_name"] = self.main_score + + return aggregated + + +__all__ = [ + # llm_as_judge_chat_templates + "direct_template_dict", + "pairwise_template_dict", + # llm_as_judge_constants + "DIRECT_CRITERIA", + "EVALUATOR_TO_MODEL_ID", + "EVALUATORS_METADATA", + "PAIRWISE_CRITERIA", + "Criteria", + "CriteriaOption", + "CriteriaWithOptions", + "DirectCriteriaCatalogEnum", + "EvaluatorMetadata", + "EvaluatorNameEnum", + "EvaluatorTypeEnum", + "ModelProviderEnum", + "PairwiseCriteriaCatalogEnum", + # llm_as_judge_from_template + "LLMAsJudge", + "LLMAsJudgeBase", + "TaskBasedLLMasJudge", + # llm_as_judge_operators + "CreateCriteriaFromDict", + "CreateCriteriaFromJson", + "CreateCriteriaFromString", + "CreateCriteriaWithOptionsFromDict", + "CreateCriteriaWithOptionsFromJson", + "CreateYesNoCriteriaFromString", + "CreateYesNoPartiallyCriteriaFromString", + "LoadCriteria", + "LoadCriteriaWithOptions", + # llm_as_judge_utils + 
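As a standalone sketch of the pairwise bookkeeping above, independent of the unitxt classes (system names and contest outcomes are illustrative):

.. code-block:: python

    import itertools
    from typing import Dict, List, Tuple

    def pairwise_winrates(systems: List[str], winners: Dict[Tuple[str, str], str]) -> Dict[str, float]:
        # winners[(a, b)] is the system the judge preferred in the (a, b) contest.
        pairs = list(itertools.combinations(systems, 2))
        wins = {name: 0 for name in systems}
        for pair in pairs:
            wins[winners[pair]] += 1
        comparisons_per_system = len(systems) - 1
        return {name: wins[name] / comparisons_per_system for name in systems}

    winrates = pairwise_winrates(
        ["system1", "system2", "system3"],
        {
            ("system1", "system2"): "system1",
            ("system1", "system3"): "system1",
            ("system2", "system3"): "system2",
        },
    )
    ranking = sorted(winrates, key=winrates.get, reverse=True)  # best system first
    print(winrates, ranking)  # system1 -> 1.0, system2 -> 0.5, system3 -> 0.0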
"get_evaluator_metadata", + "get_parsed_context", + "rank_indexes", + # judges + "LLMJudge", + "LLMJudgePairwise", + "LLMJudgeDirect", +] diff --git a/src/unitxt/llm_as_judge_base.py b/src/unitxt/llm_as_judge_base.py new file mode 100644 index 0000000000..2b8b7c2f29 --- /dev/null +++ b/src/unitxt/llm_as_judge_base.py @@ -0,0 +1,109 @@ +from abc import abstractmethod +from typing import Any, Dict, List, Optional + +from .logging_utils import get_logger +from .metrics import MapReduceMetric + +logger = get_logger(__name__) + + +class BaseLLMJudge(MapReduceMetric[Any, Dict[str, Any]]): + """Base class for all LLM-as-Judge implementations with shared functionality. + + This class provides common map-reduce patterns, score aggregation, and confidence interval handling + for all LLM judge implementations. It defines the standard evaluation workflow using a two-step + process: instance preparation followed by batch inference execution. + + Args: + ci_score_names: Names of scores for which confidence intervals should be computed. + Defaults to None, which means no confidence intervals are calculated. + """ + + ci_score_names: Optional[List[str]] = None + + def map( + self, prediction: Any, references: List[Any], task_data: Dict[str, Any] + ) -> Dict[str, Any]: + """Single instance processing - redirects to map_stream for batch efficiency.""" + raise NotImplementedError( + "LLM judge metrics should override map_stream for efficient batch processing, not map" + ) + + def map_stream(self, evaluation_inputs_stream): + """Common map_stream implementation for all LLM judge subclasses.""" + logger.info( + f'Starting evaluation with {self.__class__.__name__} using "{self._get_engine_id()}"' + ) + + # Prepare all instances for inference without aggregating heavy data + prepared_instances = [] + for prediction, references, task_data in evaluation_inputs_stream: + prepared_instance = self._prepare_instance_for_inference( + prediction, references, task_data + ) + prepared_instances.append(prepared_instance) + + # Run all inference steps on the prepared instances + return self._run_inference_on_all(prepared_instances) + + @abstractmethod + def _prepare_instance_for_inference(self, prediction, references, task_data): + """Prepare a single instance for inference without keeping heavy data. + + This method should be implemented by each judge subclass to prepare + an individual instance for batch inference processing. + """ + pass + + @abstractmethod + def _run_inference_on_all(self, prepared_instances): + """Run inference on all prepared instances efficiently. + + This method should be implemented by each judge subclass to execute + inference on the batch of prepared instances and return results. 
+ """ + pass + + def reduce(self, intermediates: List[Dict[str, Any]]) -> Dict[str, Any]: + """Aggregate individual instance results into global scores.""" + if not intermediates: + return {} + + aggregated = {} + + # For LLM judges, only aggregate the main score field (like original BulkInstanceMetric behavior) + if hasattr(self, "main_score") and self.main_score: + # Collect values only for the main score field + values = [] + for result in intermediates: + if self.main_score in result and isinstance( + result[self.main_score], (int, float) + ): + values.append(result[self.main_score]) + + if values: + aggregated[self.main_score] = sum(values) / len(values) + # Set the score alias + aggregated["score"] = aggregated[self.main_score] + aggregated["score_name"] = self.main_score + + return aggregated + + def reduce_one(self, intermediate: Dict[str, Any]) -> Dict[str, Any]: + """Return individual instance scores.""" + result = dict(intermediate) + if ( + hasattr(self, "main_score") + and self.main_score + and self.main_score in result + ): + result["score"] = result[self.main_score] + result["score_name"] = self.main_score + return result + + def _get_engine_id(self): + if hasattr(self, "inference_engine"): + return self.inference_engine.get_engine_id() + if hasattr(self, "inference_model"): + return self.inference_model.get_engine_id() + return "unknown_engine" diff --git a/src/unitxt/llm_as_judge_from_template.py b/src/unitxt/llm_as_judge_from_template.py index df2d5abab8..ce26edfc97 100644 --- a/src/unitxt/llm_as_judge_from_template.py +++ b/src/unitxt/llm_as_judge_from_template.py @@ -6,7 +6,8 @@ from .dataclass import Field from .formats import ChatAPIFormat, Format, SystemFormat from .inference import InferenceEngine, LogProbInferenceEngine, OpenAiInferenceEngine -from .metrics import BulkInstanceMetric +from .llm_as_judge_base import BaseLLMJudge +from .logging_utils import get_logger from .operator import SequentialOperator from .operators import ArtifactFetcherMixin from .settings_utils import get_settings @@ -14,6 +15,7 @@ from .templates import Template settings = get_settings() +logger = get_logger(__name__) def get_task_data_dict(task_data): @@ -24,20 +26,23 @@ def get_task_data_dict(task_data): return json.loads(task_data) if isinstance(task_data, str) else task_data -class LLMAsJudgeBase(BulkInstanceMetric, ArtifactFetcherMixin): - """LLM-as-judge-base metric class for evaluating correctness of generated predictions. +class LLMAsJudgeBase(BaseLLMJudge, ArtifactFetcherMixin): + """Base class for LLM-as-judge metrics that use templates for evaluation. - Attributes: - main_score (str): The main score label used for evaluation. - task (str): The type of task the llm as judge runs. This defines the output and input - format of the judge model. - template (Template): The template used when generating inputs for the judge llm. - format (Format): The format used when generating inputs for judge llm. - system_prompt (SystemPrompt): The system prompt used when generating inputs for judge llm. - inference_model (InferenceEngine): The module that creates the inference of the judge llm. - reduction_map (dict): A dictionary specifying the reduction method for the metric. - batch_size (int): The size of the bulk. + This class provides the foundation for template-based LLM judge implementations + that evaluate correctness of generated predictions using configurable tasks and templates. + Attributes: + main_score: The main score label used for evaluation. Defaults to "llm_as_judge". 
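The BaseLLMJudge contract above amounts to: prepare every instance, run one batched inference pass, then reduce per-instance scores to a mean. A minimal standalone sketch of that end-to-end flow (function and field names are illustrative, not the unitxt API):

.. code-block:: python

    from typing import Any, Dict, Iterable, List, Tuple

    def fake_batch_infer(prompts: List[str]) -> List[float]:
        # Stand-in for a single batched call to the judge model.
        return [1.0 if "Paris" in p else 0.0 for p in prompts]

    def prepare_instance(prediction: str, references: List[str], task_data: Dict[str, Any]) -> Dict[str, Any]:
        # Analogue of _prepare_instance_for_inference: keep only what inference needs.
        return {"prompt": f"{task_data.get('question', '')} -> {prediction}"}

    def run_inference_on_all(prepared: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        # Analogue of _run_inference_on_all: one batched call instead of per-instance calls.
        scores = fake_batch_infer([p["prompt"] for p in prepared])
        return [{"answer_relevance": s} for s in scores]

    def map_stream(stream: Iterable[Tuple[str, List[str], Dict[str, Any]]]) -> List[Dict[str, Any]]:
        prepared = [prepare_instance(pred, refs, td) for pred, refs, td in stream]
        return run_inference_on_all(prepared)

    def reduce_main_score(intermediates: List[Dict[str, Any]], main_score: str) -> Dict[str, Any]:
        # Analogue of reduce: average the main score and expose "score"/"score_name" aliases.
        values = [r[main_score] for r in intermediates if isinstance(r.get(main_score), (int, float))]
        if not values:
            return {}
        aggregated = {main_score: sum(values) / len(values)}
        aggregated["score"] = aggregated[main_score]
        aggregated["score_name"] = main_score
        return aggregated

    stream = [
        ("Paris", ["Paris"], {"question": "Capital of France?"}),
        ("Lyon", ["Paris"], {"question": "Capital of France?"}),
    ]
    print(reduce_main_score(map_stream(stream), "answer_relevance"))
    # {'answer_relevance': 0.5, 'score': 0.5, 'score_name': 'answer_relevance'}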
+ task: The type of task the LLM as judge runs. This defines the output and input + format of the judge model. + template: The template used when generating inputs for the judge LLM. + system_prompt: The system prompt used when generating inputs for judge LLM. + Defaults to EmptySystemPrompt. + format: The format used when generating inputs for judge LLM. + Defaults to SystemFormat. + inference_model: The module that creates the inference of the judge LLM. + batch_size: The size of the batch for bulk processing. """ main_score: str = "llm_as_judge" @@ -46,7 +51,6 @@ class LLMAsJudgeBase(BulkInstanceMetric, ArtifactFetcherMixin): system_prompt: SystemPrompt = Field(default_factory=EmptySystemPrompt) format: Format = Field(default_factory=SystemFormat) inference_model: InferenceEngine - reduction_map: Optional[Dict[str, List[str]]] = None batch_size: int = 32 prediction_type = Any # Because handled with multiple tasks single_reference_per_prediction: bool = True @@ -90,15 +94,22 @@ def verify(self): def get_full_task_name(self): pass - def compute( - self, - references: List[List[Any]], - predictions: List[Any], - task_data: List[Dict], - ) -> List[Dict[str, Any]]: - instances = self.prepare_instances(references, predictions, task_data) - outputs = self.infer_instances(instances) - return self.get_metric_results_from_prediction_outputs(outputs) + def annotate_scores(self, scores): + return scores + + def _prepare_template_instance_for_inference( + self, prediction, references, task_data + ): + instances = self.prepare_instances([references], [prediction], [task_data]) + return instances[0] if instances else {} + + def _run_template_inference_on_all(self, prepared_instances): + if not prepared_instances: + return [] + + inference_outputs = self.infer_instances(prepared_instances) + + return self.get_metric_results_from_prediction_outputs(inference_outputs) @abstractmethod def prepare_instances( @@ -138,26 +149,12 @@ class LLMAsJudge(LLMAsJudgeBase): pairwise_comparative_rating.single_turn). Attributes: - main_score (str): The main score label used for evaluation. - - task (Literal["rating.single_turn","rating.single_turn_with_reference", - "pairwise_comparative_rating.single_turn"]): The type of task the llm as judge runs. - This defines the output and input format of the judge model. - - template (Template): The template used when generating inputs for the judge llm. - - format (Format): The format used when generating inputs for judge llm. - - system_prompt (SystemPrompt): The system prompt used when generating inputs for judge llm. - - strip_system_prompt_and_format_from_inputs (bool): Whether to strip the system prompt and formatting from the - inputs that the models that is being judges received, when they are inserted to the llm-as-judge prompt. - - inference_model (InferenceEngine): The module that creates the inference of the judge llm. - - reduction_map (dict): A dictionary specifying the reduction method for the metric. - - batch_size (int): The size of the bulk. + task: The type of task the LLM as judge runs. This defines the output and input format + of the judge model. Must be one of "rating.single_turn", "rating.single_turn_with_reference", + or "pairwise_comparative_rating.single_turn". + strip_system_prompt_and_format_from_inputs: Whether to strip the system prompt and + formatting from the inputs that the model being judged received, when they are + inserted to the LLM-as-judge prompt. Defaults to True. 
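+
+    Example:
+        A minimal sketch (``my_engine`` stands for any configured InferenceEngine;
+        the catalog names must exist in your catalog)::
+
+            judge = LLMAsJudge(
+                inference_model=my_engine,
+                task="rating.single_turn",
+                template="templates.response_assessment.rating.mt_bench_single_turn",
+                format="formats.chat_api",
+                main_score="mt_bench_rating",
+            )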
""" task: Literal[ @@ -254,10 +251,6 @@ def _get_instance_for_judge_model( def prepare(self): super().prepare() - if self.task == "pairwise_comparative_rating.single_turn": - self.reduction_map = {"weighted_win_rate": [self.main_score]} - if self.reduction_map is None: - self.reduction_map = {"mean": [self.main_score]} def verify(self): super().verify() @@ -315,56 +308,45 @@ def prepare_instances(self, references, predictions, task_data): instances = self._get_instance_for_judge_model( input_instances, predictions, references ) - # Copy the data classification policy from the original instance for instance, single_task_data in zip(instances, task_data): instance["data_classification_policy"] = single_task_data.get( "metadata", {} ).get("data_classification_policy") return instances + def _prepare_instance_for_inference(self, prediction, references, task_data): + return self._prepare_template_instance_for_inference( + prediction, references, task_data + ) + + def _run_inference_on_all(self, prepared_instances): + return self._run_template_inference_on_all(prepared_instances) + + def annotate_scores(self, scores): + from .metrics import MapReduceMetric + + return MapReduceMetric.annotate_scores(self, scores) + class TaskBasedLLMasJudge(LLMAsJudgeBase): """LLM-as-judge-based metric class for evaluating correctness of generated predictions. This class can use any task and matching template to evaluate the predictions. All task/templates field are taken from the instance's task_data. - The instances sent to the judge can either be: 1.a unitxt dataset, in which case the predictions are + The instances sent to the judge can either be: 1. a unitxt dataset, in which case the predictions are copied to a specified field of the task. 2. dictionaries with the fields required by the task and template. - Args: - main_score (str): - The main score label used for evaluation. - task (str): - The type of task the llm as judge runs. - This defines the output and input format of the judge model. - template (Template): - The template used when generating inputs for the judge llm. - format (Format): - The format used when generating inputs for judge llm. - system_prompt (SystemPrompt): - The system prompt used when generating inputs for judge llm. - strip_system_prompt_and_format_from_inputs (bool): - Whether to strip the system prompt and formatting from the - inputs that the models that is being judges received, - when they are inserted to the llm-as-judge prompt. - inference_model (InferenceEngine): - The module that creates the inference of the judge llm. - reduction_map (dict): - A dictionary specifying the reduction method for the metric. - batch_size (int): - The size of the bulk. - infer_log_probs(bool): - whether to perform the inference using logprobs. - If true, the template's post-processing must support the logprobs output. - judge_to_generator_fields_mapping (Dict[str, str]): - optional mapping between the names of the fields in the generator task and the - judge task. For example, if the generator task uses "reference_answers" and the judge task expect "ground_truth", - include {"ground_truth": "reference_answers"} in this dictionary. - prediction_field (str): - if indicated, and prediction exist, copy prediction to this field name in task_data. - include_meta_data (bool): - whether to include the inference per-instance metadata in the returned results. - + Attributes: + infer_log_probs: Whether to perform the inference using logprobs. 
If True, the template's + post-processing must support the logprobs output. Defaults to False. + judge_to_generator_fields_mapping: Optional mapping between the names of the fields in the + generator task and the judge task. For example, if the generator task uses "reference_answers" + and the judge task expects "ground_truth", include {"ground_truth": "reference_answers"} + in this dictionary. Defaults to empty dict. + prediction_field: If indicated, and prediction exists, copy prediction to this field name + in task_data. Defaults to None. + include_meta_data: Whether to include the inference per-instance metadata in the returned + results. Defaults to True. """ infer_log_probs: bool = False @@ -384,6 +366,32 @@ def preprocess_instance(self, instance): instance["references"] = [""] return instance + def _prepare_instance_for_inference(self, prediction, references, task_data): + return self._prepare_template_instance_for_inference( + prediction, references, task_data + ) + + def _run_inference_on_all(self, prepared_instances): + return self._run_template_inference_on_all(prepared_instances) + + def _instance_to_evaluation_input(self, instance): + instance = self.preprocess_instance(instance) + + # For TaskBasedLLMasJudge, the task_data contains the actual data fields + # while prediction and references are separate + task_data = instance.get("task_data", {}) + prediction = instance.get("prediction") + references = instance.get("references", [""]) + + # Return the evaluation input with the correct task_data + from .metrics import EvaluationInput + + return EvaluationInput( + prediction=prediction, + references=references, + task_data=task_data, # Pass the actual task_data, not the full instance + ) + def verify(self): super().verify() if self.infer_log_probs and not isinstance( @@ -405,7 +413,6 @@ def verify(self): def prepare(self): super().prepare() - self.reduction_map = {"mean": [self.main_score]} self.score_prefix = f"{self.inference_model.get_engine_id()}_" if not self.format: self.set_format_for_inference_engine() @@ -430,56 +437,113 @@ def get_full_task_name(self): def get_metric_results_from_prediction_outputs(self, outputs): results = [] for instance in outputs: + # Use the score_prefix to create engine-prefixed score names + prefixed_main_score = f"{self.score_prefix}{self.main_score}" result = { + # Keep the unprefixed main score for the base class self.main_score: instance["prediction"], - f"{self.main_score}_judge_raw_output": instance["raw_prediction"], - f"{self.main_score}_judge_raw_input": instance["source"], + # Add the prefixed versions for test compatibility + prefixed_main_score: instance["prediction"], + f"{prefixed_main_score}_judge_raw_output": instance["raw_prediction"], + f"{prefixed_main_score}_judge_raw_input": instance["source"], } if self.include_meta_data: meta_data = { - f"{self.main_score}_{k}": v + f"{prefixed_main_score}_{k}": v for k, v in instance["infer_meta_data"].items() } result.update(meta_data) results.append(result) return results - def prepare_instances(self, references, predictions, task_data): - from . 
import get_from_catalog - - instances = [] - judge_task = get_from_catalog(self.get_full_task_name()) + def _map_input_fields(self, input_instance, prediction, judge_task_input_fields): + """Map generator fields to judge fields using self.judge_to_generator_fields_mapping.""" + instance_task_data = {} + for judge_task_input_field in judge_task_input_fields: + orig_task_field_name = self.judge_to_generator_fields_mapping.get( + judge_task_input_field, judge_task_input_field + ) + new_val = input_instance.get(orig_task_field_name) + if new_val is None and isinstance(prediction, dict): + new_val = prediction.get(orig_task_field_name) + if new_val is not None: + instance_task_data[judge_task_input_field] = new_val + return instance_task_data + + def _apply_prediction( + self, instance_task_data, input_instance, prediction, judge_task_input_fields + ): + """Populate the prediction value according to self.prediction_field and fallbacks.""" + if self.prediction_field: + # explicit field path + if prediction is not None: + if isinstance(prediction, dict): + prediction = prediction[ + self.prediction_field + ] # keep KeyError behavior + instance_task_data[self.prediction_field] = prediction + else: + # try to fetch from input_instance when prediction is None + pred_value = input_instance.get(self.prediction_field) + if pred_value is not None: + instance_task_data[self.prediction_field] = pred_value + elif prediction is None: + # infer into common candidates when no explicit field and prediction is None + prediction_field_candidates = ["answer", "prediction", "response", "output"] + for candidate in prediction_field_candidates: + if candidate in judge_task_input_fields and candidate in input_instance: + instance_task_data[candidate] = input_instance[candidate] + break + # else: prediction provided but no prediction_field → the mapping stage already handled it + return instance_task_data + + def _fill_missing_defaults(self, instance_task_data, judge_task_input_fields): + """Ensure all required judge fields exist, with defaults matching the original behavior.""" + for field_name in judge_task_input_fields: + if field_name not in instance_task_data: + if field_name in ["choices", "contexts", "ground_truths"]: + instance_task_data[field_name] = ["-"] + elif field_name in ["answer", "question"]: + instance_task_data[field_name] = "-" + return instance_task_data + + def _finalize_instance(self, instance_task_data, input_instance, judge_task): + """Run judge_task.process and attach metadata.""" + instance_task_data = judge_task.process(instance_task_data)["input_fields"] + data_classification_policy = input_instance.get("metadata", {}).get( + "data_classification_policy" + ) + instance_task_data["data_classification_policy"] = data_classification_policy + return instance_task_data + + def _prepare_single_instance( + self, input_instance, prediction, reference, judge_task + ): + """Orchestrates the per-instance preparation without changing behavior.""" + input_instance = get_task_data_dict(input_instance) judge_task_input_fields = judge_task.input_fields - for input_instance, prediction, _ in zip(task_data, predictions, references): - input_instance = get_task_data_dict(input_instance) - - instance_task_data = {} - for judge_task_input_field in judge_task_input_fields: - orig_task_field_name = self.judge_to_generator_fields_mapping.get( - judge_task_input_field, judge_task_input_field - ) - new_val = input_instance.get(orig_task_field_name) - if new_val is None and isinstance(prediction, dict): - new_val = 
prediction.get(orig_task_field_name) - if new_val is not None: - instance_task_data[judge_task_input_field] = new_val + data = self._map_input_fields( + input_instance, prediction, judge_task_input_fields + ) + data = self._apply_prediction( + data, input_instance, prediction, judge_task_input_fields + ) + data = self._fill_missing_defaults(data, judge_task_input_fields) + return self._finalize_instance(data, input_instance, judge_task) - if self.prediction_field and prediction is not None: - if isinstance(prediction, dict): - prediction = prediction[self.prediction_field] - instance_task_data[self.prediction_field] = prediction - instance_task_data = judge_task.process(instance_task_data)["input_fields"] + def prepare_instances(self, references, predictions, task_data): + from . import get_from_catalog - data_classification_policy = input_instance.get("metadata", {}).get( - "data_classification_policy" + judge_task = get_from_catalog(self.get_full_task_name()) + return [ + self._prepare_single_instance( + input_instance, prediction, reference, judge_task ) - instance_task_data[ - "data_classification_policy" - ] = data_classification_policy - instances.append(instance_task_data) - - return instances + for input_instance, prediction, reference in zip( + task_data, predictions, references + ) + ] def infer_instances(self, instances): return infer( @@ -493,3 +557,24 @@ def infer_instances(self, instances): return_log_probs=self.infer_log_probs, return_meta_data=self.include_meta_data, ) + + def annotate_scores(self, scores): + result = dict(scores) + + prefixed_main_score = f"{self.score_prefix}{self.main_score}" + + if self.main_score in result: + result[prefixed_main_score] = result[self.main_score] + result["score"] = result[self.main_score] + result["score_name"] = prefixed_main_score + del result[self.main_score] + + keys_to_remove = [] + for key in result.keys(): + if key.endswith("_ci_high") or key.endswith("_ci_low"): + keys_to_remove.append(key) + + for key in keys_to_remove: + del result[key] + + return result diff --git a/src/unitxt/metric.py b/src/unitxt/metric.py index 822340fbcd..11bcb0a4ea 100644 --- a/src/unitxt/metric.py +++ b/src/unitxt/metric.py @@ -28,6 +28,7 @@ from .inference import __file__ as _ from .instructions import __file__ as _ from .llm_as_judge import __file__ as _ +from .llm_as_judge_base import __file__ as _ from .llm_as_judge_chat_templates import __file__ as _ from .llm_as_judge_constants import __file__ as _ from .llm_as_judge_from_template import __file__ as _ diff --git a/src/unitxt/metrics.py b/src/unitxt/metrics.py index de05034250..5e1aa6812d 100644 --- a/src/unitxt/metrics.py +++ b/src/unitxt/metrics.py @@ -349,13 +349,29 @@ def _instance_to_evaluation_input( if self.reference_field == "references": references = instance["references"] else: - references = task_data[self.reference_field] + # Check both task_data and instance top level for the reference field + if self.reference_field in task_data: + references = task_data[self.reference_field] + elif self.reference_field in instance: + references = instance[self.reference_field] + else: + raise KeyError( + f"Reference field '{self.reference_field}' not found in task_data or instance" + ) if not isinstance(references, list): references = [references] if self.prediction_field == "prediction": prediction = instance["prediction"] else: - prediction = task_data[self.prediction_field] + # Check both task_data and instance top level for the prediction field + if self.prediction_field in task_data: + 
prediction = task_data[self.prediction_field] + elif self.prediction_field in instance: + prediction = instance[self.prediction_field] + else: + raise KeyError( + f"Prediction field '{self.prediction_field}' not found in task_data or instance" + ) self._validate_prediction(prediction) self._validate_reference(references) diff --git a/tests/library/test_metrics.py b/tests/library/test_metrics.py index 874acff626..6d8a9b6817 100644 --- a/tests/library/test_metrics.py +++ b/tests/library/test_metrics.py @@ -2114,389 +2114,171 @@ def test_text2sql_accuracy_incorrect_query(self): ) self.assertEqual(0.0, non_execution_outputs["score"]) + def test_task_based_llm_as_judge_metric(self): + model_id = "meta-llama/llama-3-8b-instruct" + format = "formats.llama3_instruct" + task = "tasks.rag_eval.answer_correctness.binary" + template = "templates.rag_eval.answer_correctness.judge_loose_match_no_context" -class TestConfidenceIntervals(UnitxtTestCase): - def test_confidence_interval_off(self): - """Test that when metric.n_resamples is set to None, no confidence intervals are computed.""" - # Test one GlobalMetric and one InstanceMetric - for metric in [Accuracy(), F1Macro()]: - metric.set_confidence_interval_calculation(return_confidence_interval=False) - outputs = apply_metric(metric=metric, predictions=["A"], references=[["A"]]) - - global_result = outputs[0]["score"]["global"] - # Check there are no confidence intervals in the result - for key in global_result: - self.assertTrue("ci_low" not in key) - self.assertTrue("ci_high" not in key) - - def test_instance_metric_confidence_interval(self): - """Test the calculation of confidence intervals for an instance metric (Accuracy is used as an instance of an InstanceMetric).""" - self._test_confidence_interval( - metric=Accuracy(), - expected_ci_low=0.71, - expected_ci_high=0.87, - ) - - def test_map_reduce_metric_confidence_interval(self): - """Test the calculation of confidence intervals for an instance metric (Accuracy is used as an instance of an InstanceMetric).""" - self._test_confidence_interval( - metric=AccuracyFast(), - expected_ci_low=0.71, - expected_ci_high=0.87, - ) - - def test_f1_micro_confidence_interval(self): - """Test the calculation of confidence intervals for an instance metric (Accuracy is used as an instance of an InstanceMetric).""" - self._test_confidence_interval( - metric=F1Micro(n_resamples=1000), - expected_ci_low=0.83, - expected_ci_high=0.93, - ) - - def test_f1_micro_fast_confidence_interval(self): - """Test the calculation of confidence intervals for an instance metric (Accuracy is used as an instance of an InstanceMetric).""" - self._test_confidence_interval( - metric=F1Fast(main_score="f1_micro", averages=["micro"]), - expected_ci_low=0.83, - expected_ci_high=0.93, - ) - - def test_instance_metric_with_multiple_scores_confidence_interval(self): - self._test_confidence_interval( - metric=TokenOverlap(), - expected_ci_low=0.71, - expected_ci_high=0.87, - ) - - def test_global_metric_confidence_interval(self): - """Test the calculation of confidence intervals for global metrics (F1Macro and F1Micro are used as instances of a GlobalMetric).""" - f1_macro_low, f1_macro_high = 0.8809213119223925, 0.9439681645177271 - self._test_confidence_interval( - metric=F1Macro(), - expected_ci_low=f1_macro_low, - expected_ci_high=f1_macro_high, - ) - f1_micro_low, f1_micro_high = 0.8439306358381503, 0.9223675337263242 - self._test_confidence_interval( - metric=F1Micro(), - expected_ci_low=f1_micro_low, - expected_ci_high=f1_micro_high, - 
) - - # Now reverse the order and check things don't change - self._test_confidence_interval( - metric=F1Micro(), - expected_ci_low=f1_micro_low, - expected_ci_high=f1_micro_high, + inference_model = MockInferenceEngine( + model_name=model_id, default_inference_value="no" ) - self._test_confidence_interval( - metric=F1Macro(), - expected_ci_low=f1_macro_low, - expected_ci_high=f1_macro_high, + model_label = inference_model.get_engine_id() + template_label = template.split(".")[-1] + metric_label = f"answer_correctness_{template_label}" + metric = TaskBasedLLMasJudge( + inference_model=inference_model, + template=template, + task=task, + format=format, + main_score=metric_label, + infer_log_probs=False, + include_meta_data=False, ) - def _test_confidence_interval(self, metric, expected_ci_low, expected_ci_high): - """Test the calculation of confidence intervals for a given metric.""" - predictions = ["A", "B", "C", "D", "E"] * 20 # 100 predictions - references = [["B"], ["B"], ["C"], ["D"], ["E"]] * 20 # 80% are correct (4/5) + predictions = [None, None] + references = [[""], [""]] + task_data = [ + { + "question": "What foundation models are available in watsonx.ai ?", + "answer": "Watsonx.ai supports no foundation models", + "ground_truths": [ + "Many Large Language Models are supported by Watsonx.ai" + ], + "contexts": ["Many Large Language Models are supported by Watsonx.ai"], + } + ] * 2 outputs = apply_metric( - metric=metric, predictions=predictions, references=references + metric=metric, + predictions=predictions, + references=references, + task_data=task_data, ) + actual_scores = [output["score"] for output in outputs] + main_score = f"{model_label}_{metric_label}" + instance_targets = [ + { + main_score: 0.0, + "score": 0.0, + "score_name": main_score, + main_score + "_judge_raw_output": "no", + main_score + "_judge_raw_input": """<|begin_of_text|><|start_header_id|>system<|end_header_id|> - expected_global_result = { - f"{metric.main_score}_ci_low": expected_ci_low, - f"{metric.main_score}_ci_high": expected_ci_high, - "score_ci_low": expected_ci_low, - "score_ci_high": expected_ci_high, - } +You are given a question, the corresponding ground-truth answer and a prediction from a model. Compare the "Ground-truth answer" and the "Prediction" to determine whether the prediction correctly answers the question. +There should be no contradicting statements in the prediction. The prediction may contain extra information. If the prediction states something as a possibility, treat it as a definitive answer. +The prediction must contain all the important information presented in the ground truths, but doesn't have to fully match it. +Answer with only yes/no. +<|eot_id|><|start_header_id|>user<|end_header_id|> - global_result = outputs[0]["score"]["global"].copy() - logger.info(global_result) - for score_name, score_value in global_result.items(): - if score_name in expected_global_result: - # Verify that the output value is as the expected value - self.assertAlmostEqual( - score_value, expected_global_result[score_name], places=3 - ) - else: - # An output score that is not expected - # This is ok if the score_name is not related to confidence intervals - # Otherwise, there was some confidence interval calculation that was not supposed to occur. 
- self.assertTrue( - ("ci_low" not in score_name and "ci_high" not in score_name) - or score_name not in metric.ci_scores, - msg=f"Unexpected confidence interval score '{score_name}'.", - ) +Question: What foundation models are available in watsonx.ai ? - def test_grouped_instance_metric_confidence_interval(self): - """Test the calculation of confidence intervals for grouped instance metrics (sub-types of InstanceMetric with group_mean reduction).""" - self._test_grouped_instance_confidence_interval( - metric=FixedGroupMeanAccuracy(), - expected_ci_low=0.1, - expected_ci_high=0.48178555627359004, - ) +Ground-truth answer: Many Large Language Models are supported by Watsonx.ai - self._test_grouped_instance_confidence_interval( - metric=GroupMeanAccuracy(), - expected_ci_low=0.025, - expected_ci_high=0.4407250456645065, - ) +Prediction: Watsonx.ai supports no foundation models +<|eot_id|><|start_header_id|>assistant<|end_header_id|> - self._test_grouped_instance_confidence_interval( - metric=FixedGroupMeanStringContainment(), - expected_ci_low=0.0, - expected_ci_high=0.675, - ) +Answer: """, + } + ] * 2 + global_target = { + main_score: 0.0, + "score": 0.0, + "score_name": main_score, + "num_of_instances": 2, + } - self._test_grouped_instance_confidence_interval( - metric=GroupMeanStringContainment(), - expected_ci_low=0.15627449950197503, - expected_ci_high=0.7080527276705952, - ) + expected_scores = [ + { + "global": global_target, + "instance": instance_target, + } + for instance_target in instance_targets + ] - self._test_grouped_instance_confidence_interval( - metric=FixedGroupMeanBaselineAccuracy(), - expected_ci_low=0.0, - expected_ci_high=1.0, - ) + self.assertListEqual(actual_scores, expected_scores) - self._test_grouped_instance_confidence_interval( - metric=FixedGroupMeanParaphraseAccuracy(), - expected_ci_low=0.0, - expected_ci_high=0.3333333333333333, - ) + def test_llm_as_judge_metric(self): + model_id = "meta-llama/llama-3-8b-instruct" + format = "formats.llama3_instruct" + task = "rating.single_turn" + template = "templates.response_assessment.rating.mt_bench_single_turn" - self._test_grouped_instance_confidence_interval( - metric=FixedGroupMeanBaselineStringContainment(), - expected_ci_low=0.25, - expected_ci_high=1.0, + inference_model = MockInferenceEngine(model_name=model_id) + model_label = model_id.split("/")[1].replace("-", "_") + model_label = f"{model_label}_wml" + template_label = template.split(".")[-1] + metric_label = f"{model_label}_template_{template_label}" + metric = LLMAsJudge( + inference_model=inference_model, + template=template, + task=task, + format=format, + main_score=metric_label, ) - self._test_grouped_instance_confidence_interval( - metric=FixedGroupMeanParaphraseStringContainment(), - expected_ci_low=0.5, - expected_ci_high=0.6666666666666666, - ) + predictions = ["[[10]]"] * 3 + references = [["[[10]]"], ["[[10]]"], ["[[10]]"]] + task_data = [ + { + "input": "input", + "type_of_input": "type", + "output": "output", + "type_of_output": "type", + "source": "input", + "metadata": { + "template": "templates.generation.default", + "data_classification_policy": ["public"], + }, + } + ] * 3 - self._test_grouped_instance_confidence_interval( - metric=FixedGroupNormCohensHParaphraseAccuracy(), - expected_ci_low=-1.0, - expected_ci_high=0.33333333333333337, - ) - - # note, this metric has an issue where the ci_high on PCs on Travis slightly diverges from the local results - # hence this test may fail on a PC - 
self._test_grouped_instance_confidence_interval( - metric=FixedGroupNormCohensHParaphraseStringContainment(), - expected_ci_low=-0.49999999999999994, - expected_ci_high=-0.39182655203060723, - ) - - self._test_grouped_instance_confidence_interval( - metric=FixedGroupPDRParaphraseAccuracy(), - expected_ci_low=0.6666666666666666, - expected_ci_high=1.0, - ) - - self._test_grouped_instance_confidence_interval( - metric=FixedGroupPDRParaphraseStringContainment(), - expected_ci_low=0.3333333333333333, - expected_ci_high=0.5, - ) - - self._test_grouped_instance_confidence_interval( - metric=FixedGroupNormHedgesGParaphraseAccuracy(), - expected_ci_low=-1.0, - expected_ci_high=0.01892225367237965, - ) - - self._test_grouped_instance_confidence_interval( - metric=FixedGroupNormHedgesGParaphraseStringContainment(), - expected_ci_low=-0.09757387538180902, - expected_ci_high=-0.046656947481584346, - ) - - # absolute value of Hedges' g and Cohen's h - self._test_grouped_instance_confidence_interval( - metric=FixedGroupAbsvalNormCohensHParaphraseAccuracy(), - expected_ci_low=0.33333333333333337, - expected_ci_high=1.0, - ) - - self._test_grouped_instance_confidence_interval( - metric=FixedGroupAbsvalNormCohensHParaphraseStringContainment(), - expected_ci_low=0.39182655203060723, - expected_ci_high=0.49999999999999994, - ) - - self._test_grouped_instance_confidence_interval( - metric=FixedGroupAbsvalNormHedgesGParaphraseAccuracy(), - expected_ci_low=0.05633430321756243, - expected_ci_high=1.0, - ) - - self._test_grouped_instance_confidence_interval( - metric=FixedGroupAbsvalNormHedgesGParaphraseStringContainment(), - expected_ci_low=0.046656947481584346, - expected_ci_high=0.09757387538180902, - ) - - # pass global dict because there are additional fields other than the main score - for score_prefix in ["my_", ""]: - self._test_grouped_instance_confidence_interval( - metric=GroupMeanTokenOverlap(), - expected_global_result={ - f"group_mean_{score_prefix}recall": 0.525, - f"group_mean_{score_prefix}f1": 0.5083333333333333, - "score": 0.5083333333333333, - "score_name": f"group_mean_{score_prefix}f1", - f"group_mean_{score_prefix}precision": 0.5, - f"group_mean_{score_prefix}recall_ci_low": 0.25, - f"group_mean_{score_prefix}recall_ci_high": 0.7083333333333334, - f"group_mean_{score_prefix}f1_ci_low": 0.22302503471948287, - f"group_mean_{score_prefix}f1_ci_high": 0.6805555555555555, - "score_ci_low": 0.22302503471948287, - "score_ci_high": 0.6805555555555555, - f"group_mean_{score_prefix}precision_ci_low": 0.2095091529536007, - f"group_mean_{score_prefix}precision_ci_high": 0.6666666666666666, - }, - input_score_prefixes=[score_prefix], - ) - - def _test_grouped_instance_confidence_interval( - self, - metric, - expected_ci_low=0.0, - expected_ci_high=1.0, - expected_global_result=None, - input_score_prefixes=None, - ): - """Test the calculation of confidence intervals for a given metric with group_mean reduction.""" - input_expected_global_result_is_none = expected_global_result is None - # to remember between score_prefixes - - for score_prefix in ( - ["my_", ""] if input_score_prefixes is None else input_score_prefixes - ): - metric.score_prefix = score_prefix - outputs = apply_metric( - metric=metric, - predictions=GROUPED_INSTANCE_PREDICTIONS, - references=GROUPED_INSTANCE_REFERENCES, - task_data=GROUPED_INSTANCE_ADDL_INPUTS, - ) - # get first element of reduction_map values - reduction_params = next(iter(metric.reduction_map.values())) - prefix = "fixed_group" if reduction_params["agg_func"][2] else 
"group" - group_score_name = "_".join( - [ - prefix, - metric.reduction_map["group_mean"]["agg_func"][0], - score_prefix, - metric.main_score, - ] - ).replace("__", "_") # for the case of empty score_prefix - - if input_expected_global_result_is_none: - expected_global_result = { - f"{group_score_name}_ci_low": expected_ci_low, - f"{group_score_name}_ci_high": expected_ci_high, - "score_ci_low": expected_ci_low, - "score_ci_high": expected_ci_high, - } - - global_result = outputs[0]["score"]["global"].copy() - logger.info(global_result) - for score_name, score_value in global_result.items(): - if score_name in expected_global_result: - self.assertAlmostEqual( - expected_global_result[score_name], - score_value, - places=5, - msg=f"{score_name} score mismatch for {metric.__class__.__name__}, expected {expected_global_result[score_name]} but got {score_value}", - ) - else: - # An output score that is not expected - # This is ok if the score_name is not related to confidence intervals - # Otherwise, there was some confidence interval calculation that was not supposed to occur. - self.assertTrue( - "ci_low" not in score_name and "ci_high" not in score_name, - msg=f"Unexpected confidence interval score '{score_name}'.", - ) - - def test_task_based_llm_as_judge_metric(self): - model_id = "meta-llama/llama-3-8b-instruct" - format = "formats.llama3_instruct" - task = "tasks.rag_eval.answer_correctness.binary" - template = "templates.rag_eval.answer_correctness.judge_loose_match_no_context" - - inference_model = MockInferenceEngine( - model_name=model_id, default_inference_value="no" - ) - model_label = inference_model.get_engine_id() - template_label = template.split(".")[-1] - metric_label = f"answer_correctness_{template_label}" - metric = TaskBasedLLMasJudge( - inference_model=inference_model, - template=template, - task=task, - format=format, - main_score=metric_label, - infer_log_probs=False, - include_meta_data=False, - ) - - predictions = [None, None] - references = [[""], [""]] - task_data = [ - { - "question": "What foundation models are available in watsonx.ai ?", - "answer": "Watsonx.ai supports no foundation models", - "ground_truths": [ - "Many Large Language Models are supported by Watsonx.ai" - ], - "contexts": ["Many Large Language Models are supported by Watsonx.ai"], - } - ] * 2 - - outputs = apply_metric( - metric=metric, - predictions=predictions, - references=references, - task_data=task_data, + outputs = apply_metric( + metric=metric, + predictions=predictions, + references=references, + task_data=task_data, ) actual_scores = [output["score"] for output in outputs] - main_score = f"{model_label}_{metric_label}" instance_targets = [ { - main_score: 0.0, - "score": 0.0, - "score_name": main_score, - main_score + "_judge_raw_output": "no", - main_score + "_judge_raw_input": """<|begin_of_text|><|start_header_id|>system<|end_header_id|> - -You are given a question, the corresponding ground-truth answer and a prediction from a model. Compare the "Ground-truth answer" and the "Prediction" to determine whether the prediction correctly answers the question. -There should be no contradicting statements in the prediction. The prediction may contain extra information. If the prediction states something as a possibility, treat it as a definitive answer. -The prediction must contain all the important information presented in the ground truths, but doesn't have to fully match it. -Answer with only yes/no. 
-<|eot_id|><|start_header_id|>user<|end_header_id|> - -Question: What foundation models are available in watsonx.ai ? - -Ground-truth answer: Many Large Language Models are supported by Watsonx.ai - -Prediction: Watsonx.ai supports no foundation models -<|eot_id|><|start_header_id|>assistant<|end_header_id|> - -Answer: """, + metric_label: 1.0, + "score_name": metric_label, + "score": 1.0, + f"{metric_label}_judge_raw_input": "<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\n" + "Please act as an impartial judge and " + "evaluate the quality of the response " + "provided by an AI assistant to the user " + "question displayed below. Your evaluation " + "should consider factors such as the " + "helpfulness, relevance, accuracy, depth, " + "creativity, and level of detail of the " + "response. Begin your evaluation by " + "providing a short explanation. Be as " + "objective as possible. After providing your " + "explanation, you must rate the response on " + "a scale of 1 to 10 by strictly following " + 'this format: "[[rating]]", for example: ' + '"Rating: [[5]]".\n\n' + "<|eot_id|><|start_header_id|>user<|end_header_id|>\n\n" + "[Question]\n" + "Given the following type, generate the corresponding type. type: input\n\n\n" + "[The Start of Assistant's Answer]\n" + "[[10]]\n" + "[The End of Assistant's " + "Answer]<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n", + f"{metric_label}_judge_raw_output": "[[10]]", } - ] * 2 + ] * 3 global_target = { - main_score: 0.0, - "score": 0.0, - "score_name": main_score, - "num_of_instances": 2, + metric_label: 1.0, + "score": 1.0, + "score_name": metric_label, + f"{metric_label}_ci_high": 1.0, + f"{metric_label}_ci_low": 1.0, + "score_ci_high": 1.0, + "score_ci_low": 1.0, + "num_of_instances": 3, } expected_scores = [ @@ -2509,99 +2291,10 @@ def test_task_based_llm_as_judge_metric(self): self.assertListEqual(actual_scores, expected_scores) - def test_llm_as_judge_metric(self): + def test_llm_as_judge_metric_with_chat_api(self): model_id = "meta-llama/llama-3-8b-instruct" - format = "formats.llama3_instruct" - task = "rating.single_turn" - template = "templates.response_assessment.rating.mt_bench_single_turn" - - inference_model = MockInferenceEngine(model_name=model_id) - model_label = model_id.split("/")[1].replace("-", "_") - model_label = f"{model_label}_wml" - template_label = template.split(".")[-1] - metric_label = f"{model_label}_template_{template_label}" - metric = LLMAsJudge( - inference_model=inference_model, - template=template, - task=task, - format=format, - main_score=metric_label, - ) - - predictions = ["[[10]]"] * 3 - references = [["[[10]]"], ["[[10]]"], ["[[10]]"]] - task_data = [ - { - "input": "input", - "type_of_input": "type", - "output": "output", - "type_of_output": "type", - "source": "input", - "metadata": { - "template": "templates.generation.default", - "data_classification_policy": ["public"], - }, - } - ] * 3 - - outputs = apply_metric( - metric=metric, - predictions=predictions, - references=references, - task_data=task_data, - ) - actual_scores = [output["score"] for output in outputs] - instance_targets = [ - { - metric_label: 1.0, - "score_name": metric_label, - "score": 1.0, - f"{metric_label}_judge_raw_input": "<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\n" - "Please act as an impartial judge and " - "evaluate the quality of the response " - "provided by an AI assistant to the user " - "question displayed below. 
Your evaluation " - "should consider factors such as the " - "helpfulness, relevance, accuracy, depth, " - "creativity, and level of detail of the " - "response. Begin your evaluation by " - "providing a short explanation. Be as " - "objective as possible. After providing your " - "explanation, you must rate the response on " - "a scale of 1 to 10 by strictly following " - 'this format: "[[rating]]", for example: ' - '"Rating: [[5]]".\n\n' - "<|eot_id|><|start_header_id|>user<|end_header_id|>\n\n" - "[Question]\n" - "Given the following type, generate the corresponding type. type: input\n\n\n" - "[The Start of Assistant's Answer]\n" - "[[10]]\n" - "[The End of Assistant's " - "Answer]<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n", - f"{metric_label}_judge_raw_output": "[[10]]", - } - ] * 3 - global_target = { - metric_label: 1.0, - "score": 1.0, - "score_name": metric_label, - "num_of_instances": 3, - } - - expected_scores = [ - { - "global": global_target, - "instance": instance_target, - } - for instance_target in instance_targets - ] - - self.assertListEqual(actual_scores, expected_scores) - - def test_llm_as_judge_metric_with_chat_api(self): - model_id = "meta-llama/llama-3-8b-instruct" - format = "formats.chat_api" - # format = "formats.llama3_instruct" + format = "formats.chat_api" + # format = "formats.llama3_instruct" task = "rating.single_turn" template = "templates.response_assessment.rating.mt_bench_single_turn" @@ -2663,6 +2356,10 @@ def test_llm_as_judge_metric_with_chat_api(self): metric_label: 1.0, "score": 1.0, "score_name": metric_label, + f"{metric_label}_ci_high": 1.0, + f"{metric_label}_ci_low": 1.0, + "score_ci_high": 1.0, + "score_ci_low": 1.0, "num_of_instances": 3, } @@ -3412,6 +3109,319 @@ def test_llm_as_judge(self): "num_of_instances": 3, "answer_relevance": 0.5, "score": 0.5, + "score_ci_high": 0.5, + "score_ci_low": 0.5, "score_name": "answer_relevance", }, ) + + +class TestConfidenceIntervals(UnitxtTestCase): + def test_confidence_interval_off(self): + """Test that when metric.n_resamples is set to None, no confidence intervals are computed.""" + # Test one GlobalMetric and one InstanceMetric + for metric in [Accuracy(), F1Macro()]: + metric.set_confidence_interval_calculation(return_confidence_interval=False) + outputs = apply_metric(metric=metric, predictions=["A"], references=[["A"]]) + + global_result = outputs[0]["score"]["global"] + # Check there are no confidence intervals in the result + for key in global_result: + self.assertTrue("ci_low" not in key) + self.assertTrue("ci_high" not in key) + + def test_instance_metric_confidence_interval(self): + """Test the calculation of confidence intervals for an instance metric (Accuracy is used as an instance of an InstanceMetric).""" + self._test_confidence_interval( + metric=Accuracy(), + expected_ci_low=0.71, + expected_ci_high=0.87, + ) + + def test_map_reduce_metric_confidence_interval(self): + """Test the calculation of confidence intervals for an instance metric (Accuracy is used as an instance of an InstanceMetric).""" + self._test_confidence_interval( + metric=AccuracyFast(), + expected_ci_low=0.71, + expected_ci_high=0.87, + ) + + def test_f1_micro_confidence_interval(self): + """Test the calculation of confidence intervals for an instance metric (Accuracy is used as an instance of an InstanceMetric).""" + self._test_confidence_interval( + metric=F1Micro(n_resamples=1000), + expected_ci_low=0.83, + expected_ci_high=0.93, + ) + + def test_f1_micro_fast_confidence_interval(self): + """Test 
the calculation of confidence intervals for an instance metric (Accuracy is used as an instance of an InstanceMetric).""" + self._test_confidence_interval( + metric=F1Fast(main_score="f1_micro", averages=["micro"]), + expected_ci_low=0.83, + expected_ci_high=0.93, + ) + + def test_instance_metric_with_multiple_scores_confidence_interval(self): + self._test_confidence_interval( + metric=TokenOverlap(), + expected_ci_low=0.71, + expected_ci_high=0.87, + ) + + def test_global_metric_confidence_interval(self): + """Test the calculation of confidence intervals for global metrics (F1Macro and F1Micro are used as instances of a GlobalMetric).""" + f1_macro_low, f1_macro_high = 0.8809213119223925, 0.9439681645177271 + self._test_confidence_interval( + metric=F1Macro(), + expected_ci_low=f1_macro_low, + expected_ci_high=f1_macro_high, + ) + f1_micro_low, f1_micro_high = 0.8439306358381503, 0.9223675337263242 + self._test_confidence_interval( + metric=F1Micro(), + expected_ci_low=f1_micro_low, + expected_ci_high=f1_micro_high, + ) + + # Now reverse the order and check things don't change + self._test_confidence_interval( + metric=F1Micro(), + expected_ci_low=f1_micro_low, + expected_ci_high=f1_micro_high, + ) + self._test_confidence_interval( + metric=F1Macro(), + expected_ci_low=f1_macro_low, + expected_ci_high=f1_macro_high, + ) + + def _test_confidence_interval(self, metric, expected_ci_low, expected_ci_high): + """Test the calculation of confidence intervals for a given metric.""" + predictions = ["A", "B", "C", "D", "E"] * 20 # 100 predictions + references = [["B"], ["B"], ["C"], ["D"], ["E"]] * 20 # 80% are correct (4/5) + + outputs = apply_metric( + metric=metric, predictions=predictions, references=references + ) + + expected_global_result = { + f"{metric.main_score}_ci_low": expected_ci_low, + f"{metric.main_score}_ci_high": expected_ci_high, + "score_ci_low": expected_ci_low, + "score_ci_high": expected_ci_high, + } + + global_result = outputs[0]["score"]["global"].copy() + logger.info(global_result) + for score_name, score_value in global_result.items(): + if score_name in expected_global_result: + # Verify that the output value is as the expected value + self.assertAlmostEqual( + score_value, expected_global_result[score_name], places=3 + ) + else: + # An output score that is not expected + # This is ok if the score_name is not related to confidence intervals + # Otherwise, there was some confidence interval calculation that was not supposed to occur. 
+ self.assertTrue( + ("ci_low" not in score_name and "ci_high" not in score_name) + or score_name not in metric.ci_scores, + msg=f"Unexpected confidence interval score '{score_name}'.", + ) + + def test_grouped_instance_metric_confidence_interval(self): + """Test the calculation of confidence intervals for grouped instance metrics (sub-types of InstanceMetric with group_mean reduction).""" + self._test_grouped_instance_confidence_interval( + metric=FixedGroupMeanAccuracy(), + expected_ci_low=0.1, + expected_ci_high=0.48178555627359004, + ) + + self._test_grouped_instance_confidence_interval( + metric=GroupMeanAccuracy(), + expected_ci_low=0.025, + expected_ci_high=0.4407250456645065, + ) + + self._test_grouped_instance_confidence_interval( + metric=FixedGroupMeanStringContainment(), + expected_ci_low=0.0, + expected_ci_high=0.675, + ) + + self._test_grouped_instance_confidence_interval( + metric=GroupMeanStringContainment(), + expected_ci_low=0.15627449950197503, + expected_ci_high=0.7080527276705952, + ) + + self._test_grouped_instance_confidence_interval( + metric=FixedGroupMeanBaselineAccuracy(), + expected_ci_low=0.0, + expected_ci_high=1.0, + ) + + self._test_grouped_instance_confidence_interval( + metric=FixedGroupMeanParaphraseAccuracy(), + expected_ci_low=0.0, + expected_ci_high=0.3333333333333333, + ) + + self._test_grouped_instance_confidence_interval( + metric=FixedGroupMeanBaselineStringContainment(), + expected_ci_low=0.25, + expected_ci_high=1.0, + ) + + self._test_grouped_instance_confidence_interval( + metric=FixedGroupMeanParaphraseStringContainment(), + expected_ci_low=0.5, + expected_ci_high=0.6666666666666666, + ) + + self._test_grouped_instance_confidence_interval( + metric=FixedGroupNormCohensHParaphraseAccuracy(), + expected_ci_low=-1.0, + expected_ci_high=0.33333333333333337, + ) + + # note, this metric has an issue where the ci_high on PCs on Travis slightly diverges from the local results + # hence this test may fail on a PC + self._test_grouped_instance_confidence_interval( + metric=FixedGroupNormCohensHParaphraseStringContainment(), + expected_ci_low=-0.49999999999999994, + expected_ci_high=-0.39182655203060723, + ) + + self._test_grouped_instance_confidence_interval( + metric=FixedGroupPDRParaphraseAccuracy(), + expected_ci_low=0.6666666666666666, + expected_ci_high=1.0, + ) + + self._test_grouped_instance_confidence_interval( + metric=FixedGroupPDRParaphraseStringContainment(), + expected_ci_low=0.3333333333333333, + expected_ci_high=0.5, + ) + + self._test_grouped_instance_confidence_interval( + metric=FixedGroupNormHedgesGParaphraseAccuracy(), + expected_ci_low=-1.0, + expected_ci_high=0.01892225367237965, + ) + + self._test_grouped_instance_confidence_interval( + metric=FixedGroupNormHedgesGParaphraseStringContainment(), + expected_ci_low=-0.09757387538180902, + expected_ci_high=-0.046656947481584346, + ) + + # absolute value of Hedges' g and Cohen's h + self._test_grouped_instance_confidence_interval( + metric=FixedGroupAbsvalNormCohensHParaphraseAccuracy(), + expected_ci_low=0.33333333333333337, + expected_ci_high=1.0, + ) + + self._test_grouped_instance_confidence_interval( + metric=FixedGroupAbsvalNormCohensHParaphraseStringContainment(), + expected_ci_low=0.39182655203060723, + expected_ci_high=0.49999999999999994, + ) + + self._test_grouped_instance_confidence_interval( + metric=FixedGroupAbsvalNormHedgesGParaphraseAccuracy(), + expected_ci_low=0.05633430321756243, + expected_ci_high=1.0, + ) + + self._test_grouped_instance_confidence_interval( + 
metric=FixedGroupAbsvalNormHedgesGParaphraseStringContainment(), + expected_ci_low=0.046656947481584346, + expected_ci_high=0.09757387538180902, + ) + + # pass global dict because there are additional fields other than the main score + for score_prefix in ["my_", ""]: + self._test_grouped_instance_confidence_interval( + metric=GroupMeanTokenOverlap(), + expected_global_result={ + f"group_mean_{score_prefix}recall": 0.525, + f"group_mean_{score_prefix}f1": 0.5083333333333333, + "score": 0.5083333333333333, + "score_name": f"group_mean_{score_prefix}f1", + f"group_mean_{score_prefix}precision": 0.5, + f"group_mean_{score_prefix}recall_ci_low": 0.25, + f"group_mean_{score_prefix}recall_ci_high": 0.7083333333333334, + f"group_mean_{score_prefix}f1_ci_low": 0.22302503471948287, + f"group_mean_{score_prefix}f1_ci_high": 0.6805555555555555, + "score_ci_low": 0.22302503471948287, + "score_ci_high": 0.6805555555555555, + f"group_mean_{score_prefix}precision_ci_low": 0.2095091529536007, + f"group_mean_{score_prefix}precision_ci_high": 0.6666666666666666, + }, + input_score_prefixes=[score_prefix], + ) + + def _test_grouped_instance_confidence_interval( + self, + metric, + expected_ci_low=0.0, + expected_ci_high=1.0, + expected_global_result=None, + input_score_prefixes=None, + ): + """Test the calculation of confidence intervals for a given metric with group_mean reduction.""" + input_expected_global_result_is_none = expected_global_result is None + # to remember between score_prefixes + + for score_prefix in ( + ["my_", ""] if input_score_prefixes is None else input_score_prefixes + ): + metric.score_prefix = score_prefix + outputs = apply_metric( + metric=metric, + predictions=GROUPED_INSTANCE_PREDICTIONS, + references=GROUPED_INSTANCE_REFERENCES, + task_data=GROUPED_INSTANCE_ADDL_INPUTS, + ) + # get first element of reduction_map values + reduction_params = next(iter(metric.reduction_map.values())) + prefix = "fixed_group" if reduction_params["agg_func"][2] else "group" + group_score_name = "_".join( + [ + prefix, + metric.reduction_map["group_mean"]["agg_func"][0], + score_prefix, + metric.main_score, + ] + ).replace("__", "_") # for the case of empty score_prefix + + if input_expected_global_result_is_none: + expected_global_result = { + f"{group_score_name}_ci_low": expected_ci_low, + f"{group_score_name}_ci_high": expected_ci_high, + "score_ci_low": expected_ci_low, + "score_ci_high": expected_ci_high, + } + + global_result = outputs[0]["score"]["global"].copy() + logger.info(global_result) + for score_name, score_value in global_result.items(): + if score_name in expected_global_result: + self.assertAlmostEqual( + expected_global_result[score_name], + score_value, + places=5, + msg=f"{score_name} score mismatch for {metric.__class__.__name__}, expected {expected_global_result[score_name]} but got {score_value}", + ) + else: + # An output score that is not expected + # This is ok if the score_name is not related to confidence intervals + # Otherwise, there was some confidence interval calculation that was not supposed to occur. + self.assertTrue( + "ci_low" not in score_name and "ci_high" not in score_name, + msg=f"Unexpected confidence interval score '{score_name}'.", + )