# MIT License

# Copyright (c) 2024 The HuggingFace Team

# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:

# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.

# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.


import logging
import re

import numpy as np
from aenum import extend_enum

from lighteval.metrics.metrics import Metrics
from lighteval.metrics.metrics_sample import JudgeLLM
from lighteval.metrics.utils.metric_utils import (
    CorpusLevelMetricGrouping,
    MetricCategory,
    MetricUseCase,
)
from lighteval.tasks.lighteval_task import LightevalTaskConfig
from lighteval.tasks.requests import Doc


logger = logging.getLogger(__name__)

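# System prompt for the LLM judge. Given a document summary, the source chunk, the
# question, the gold answer, and a model answer, the judge must reason step by step
# inside the tagged sections and emit a 0/1 verdict inside <final_answer> tags.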
JUDGE_ANSWER_SYSTEM_PROMPT = """You will be provided with the summary of a document, a piece of text, a question generated from that text, and the correct or "gold" answer to the question. Additionally, you will receive a model answer. Your task is to determine whether the model answer is correct using the provided "gold" answer as a reference.

# Steps

1. **Document Understanding**:
   - Analyze the provided document summary to grasp the context and main themes.

2. **Chunk Understanding**:
   - Examine the provided text (chunk) to understand its content.

3. **Question Understanding**:
   - Interpret the given question to fully comprehend what is being asked.

4. **Ground Truth Answer Understanding**:
   - Understand the provided ground truth answer, identifying its key points.

5. **Model Answer Understanding**:
   - Examine the model answer, identifying key points and assessing accuracy and factuality.

6. **Final Answer**:
   - 0 or 1 (0 if the model answer is incorrect, 1 if it is correct).

# Output Format

- Provide your final evaluation of whether the answer is correct within `<final_answer>` XML tags.
- Include a detailed analysis for each part within the designated XML tags: `<document_understanding>`, `<chunk_understanding>`, `<question_understanding>`, `<ground_truth_answer_understanding>`, `<model_answer_understanding>`, and `<final_answer>`.

# Examples

**Input**:
```xml
<document_summary>
[Summary]
</document_summary>

<piece_of_text>
[Text]
</piece_of_text>

<question>
[Question]
</question>

<gold_answer>
[Gold Answer]
</gold_answer>

<model_answer>
[Model Answer]
</model_answer>
```
**Output**:
```xml

<document_understanding>
Understanding of the summary including key themes
</document_understanding>

<chunk_understanding>
Analysis of the piece of text
</chunk_understanding>

<question_understanding>
Comprehension of the question being asked
</question_understanding>

<ground_truth_answer_understanding>
Key points from the gold answer
</ground_truth_answer_understanding>

<model_answer_understanding>
Key points and accuracy of the model answer
</model_answer_understanding>

<final_answer>
1 or 0 (1 if the model answer is correct, 0 if it is incorrect)
</final_answer>
```

# Notes

- Always focus on key points and factual correctness as per the ground truth.
- Avoid any biases and rely solely on the evidence presented.
- Enclose all evaluations and analyses in the specified XML tags for clarity and structure."""

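# Per-sample user prompt for the judge; filled in by get_judge_prompt() below.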
JUDGE_ANSWER_USER_PROMPT = """<document_summary>
{summary}
</document_summary>

<piece_of_text>
{chunk}
</piece_of_text>

<question>
{question}
</question>

<gold_answer>
{oracle_answer}
</gold_answer>

<model_answer>
{model_answer}
</model_answer>"""

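# Builds the judge conversation: the system prompt carries the grading rubric, and the
# user prompt carries the document summary, source chunk, question, gold answer, and
# the model answer to be graded.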
def get_judge_prompt(question: str, answer: str, gold: str, **kwargs):
    chunk = kwargs.get("chunks", "")
    summary = kwargs.get("documents", "")

    return [
        {"role": "system", "content": JUDGE_ANSWER_SYSTEM_PROMPT},
        {
            "role": "user",
            "content": JUDGE_ANSWER_USER_PROMPT.format(
                summary=summary, chunk=chunk, question=question, oracle_answer=gold, model_answer=answer
            ),
        },
    ]

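# Parses the judge's raw response: pulls the 0/1 verdict out of the <final_answer>
# tags and falls back to 0 (incorrect) if the response is malformed.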
def process_judge_response_yourbench(response):
    # extract the final answer using regex from the response xml
    try:
        answer = re.search(r"<final_answer>(.*?)</final_answer>", response, re.DOTALL).group(1)
        return int(answer)
    except Exception as e:
        logger.error(f"Error processing judge response: {e}")
        return 0

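# LLM-as-judge metric: grades each model answer against the gold answer using
# gpt-4o-2024-08-06 through lighteval's OpenAI judge backend.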
class JudgeLLMYourBench(JudgeLLM):
    def __init__(self):
        super().__init__(
            judge_model_name="gpt-4o-2024-08-06",
            template=get_judge_prompt,
            process_judge_response=process_judge_response_yourbench,
            judge_backend="openai",
            short_judge_name="yourbench_judge",
        )

    def compute(self, sample_ids: list[str], responses: list, formatted_docs: list[Doc]) -> list[dict[str, float]]:
        # The formatted docs must expose the question, chunks and document in their `specific` field
        questions = [formatted_doc.specific["question"] for formatted_doc in formatted_docs]
        golds = [formatted_doc.get_golds()[0] for formatted_doc in formatted_docs]
        predictions = [response[0].result[0] for response in responses]
        options = [None] * len(questions)
        chunks = [formatted_doc.specific["chunks"][0] for formatted_doc in formatted_docs]
        documents = [formatted_doc.specific["document"] for formatted_doc in formatted_docs]

        score, _, _ = self.judge.evaluate_answer_batch(
            questions, predictions, options, golds, chunks=chunks, documents=documents
        )

        metrics = []
        for i in range(len(sample_ids)):
            metrics.append(
                {
                    "accuracy": score[i],
                }
            )

        return metrics

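# Zero-shot prompt shown to the evaluated model: answer the question and wrap the
# full answer in <answer> tags.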
ZEROSHOT_QA_USER_PROMPT = """Answer the following question:

<question>
{question}
</question>

Enclose your full answer in <answer> XML tags. For example:

<answer>
[your answer here]
</answer>"""

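# Converts one row of the YourBench single-shot question dataset into a lighteval Doc.
# The gold answer is the only "choice", and the metadata the judge needs (chunks,
# question, full document) travels in the `specific` dict.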
def yourbench_prompt(line, task_name: str = ""):
    return Doc(
        task_name=task_name,
        query=ZEROSHOT_QA_USER_PROMPT.format(question=line["question"]),
        choices=[line["ground_truth_answer"]],
        gold_index=0,
        specific={
            "question_category": line["question_category"],
            "kind": line["kind"],
            "estimated_difficulty": line["estimated_difficulty"],
            "document_id": line["document_id"],
            "question_generating_model": line["question_generating_model"],
            "chunks": line["chunks"],
            "question": line["question"],
            "document": line["document"],
        },
    )

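# Register the judge-based accuracy metric: per-sample 0/1 scores are averaged at the
# corpus level, and extend_enum makes it addressable as Metrics.yourbench_metrics.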
yourbench_metrics = CorpusLevelMetricGrouping(
    metric_name=["accuracy"],
    higher_is_better={"accuracy": True},
    category=MetricCategory.LLM_AS_JUDGE,
    use_case=MetricUseCase.ACCURACY,
    sample_level_fn=JudgeLLMYourBench().compute,
    corpus_level_fn={"accuracy": np.mean},
)
extend_enum(Metrics, "yourbench_metrics", yourbench_metrics)

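# NOTE: HF_TASK_NAME and HF_DATASET_NAME are intentionally left undefined in this file
# (hence the `noqa: F821` markers); they must be provided before lighteval imports the
# module, e.g. substituted or injected by the surrounding YourBench tooling.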
yourbench = LightevalTaskConfig(
    name=HF_TASK_NAME,  # noqa: F821
    suite=["custom"],
    prompt_function=yourbench_prompt,
    hf_repo=HF_DATASET_NAME,  # noqa: F821
    hf_subset="lighteval_single_shot_questions",
    hf_avail_splits=["train"],
    evaluation_splits=["train"],
    few_shots_split=None,
    few_shots_select=None,
    generation_size=8192,
    metric=[Metrics.yourbench_metrics],
    stop_sequence=[],
    trust_dataset=True,
    version=0,
)


TASKS_TABLE = [yourbench]
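# Usage sketch (assumption: exact CLI flags vary across lighteval releases, so treat
# this as illustrative and check `lighteval --help` for your installed version):
#
#   lighteval accelerate "model_name=<your_model>" "custom|<HF_TASK_NAME>|0|0" \
#       --custom-tasks path/to/this_file.py
#
# The "custom" suite and the task name must match the values set in the
# LightevalTaskConfig above.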