diff --git a/docs/tasks/CommonsenseQAMC_OLMES.md b/docs/tasks/CommonsenseQAMC_OLMES.md index 1abd8d77..a91bb0a8 100644 --- a/docs/tasks/CommonsenseQAMC_OLMES.md +++ b/docs/tasks/CommonsenseQAMC_OLMES.md @@ -3,8 +3,8 @@ ```` NAME = CommonsenseQAMC_OLMES DATASET_PATH = tau/commonsense_qa -SAMPLE_SPLIT = validation -FEWSHOT_SPLIT = validation +SAMPLE_SPLIT = train +FEWSHOT_SPLIT = train RESPONSE_TYPE = LOGLIKELIHOODS METRICS = [AccuracyLoglikelihood, AccuracyNormLoglikelihood] SUBJECTS = ['no_subject'] diff --git a/docs/tasks/DropCompletion_OLMES.md b/docs/tasks/DropCompletion_OLMES.md new file mode 100644 index 00000000..a746d834 --- /dev/null +++ b/docs/tasks/DropCompletion_OLMES.md @@ -0,0 +1,20 @@ +# DropCompletion_OLMES + +```` +NAME = DropCompletion_OLMES +DATASET_PATH = EleutherAI/drop +SAMPLE_SPLIT = validation +FEWSHOT_SPLIT = train +RESPONSE_TYPE = COMPLETION +METRICS = [DropF1ExactMatch] +SUBJECTS = ['no_subject'] +LANGUAGE = +```` + +- Module: `eval_framework.tasks.benchmarks.drop` + +- File: [src/eval_framework/tasks/benchmarks/drop.py](../../src/eval_framework/tasks/benchmarks/drop.py) | [View on GitHub](https://github.com/Aleph-Alpha-Research/eval-framework/blob/main/src/eval_framework/tasks/benchmarks/drop.py) + +- Link to dataset: [https://huggingface.co/datasets/EleutherAI/drop](https://huggingface.co/datasets/EleutherAI/drop) + +More detailed documentation, with prompt examples and ground truth completions, can be generated with `uv run -m eval_framework.utils.generate_task_docs --add-prompt-examples --only-tasks "DropCompletion_OLMES"`. diff --git a/docs/tasks/MedQAMC_OLMES.md b/docs/tasks/MedQAMC_OLMES.md index b6108997..575efac1 100644 --- a/docs/tasks/MedQAMC_OLMES.md +++ b/docs/tasks/MedQAMC_OLMES.md @@ -4,7 +4,7 @@ NAME = MedQAMC_OLMES DATASET_PATH = davidheineman/medqa-en SAMPLE_SPLIT = test -FEWSHOT_SPLIT = dev +FEWSHOT_SPLIT = train RESPONSE_TYPE = LOGLIKELIHOODS METRICS = [AccuracyLoglikelihood, AccuracyNormLoglikelihood, BitsPerByteLoglikelihood] SUBJECTS = ['no_subject'] diff --git a/docs/tasks/PIQA_OLMES.md b/docs/tasks/PIQA_OLMES.md index ca7610c0..2fb7294c 100644 --- a/docs/tasks/PIQA_OLMES.md +++ b/docs/tasks/PIQA_OLMES.md @@ -3,8 +3,8 @@ ```` NAME = PIQA_OLMES DATASET_PATH = ybisk/piqa -SAMPLE_SPLIT = validation -FEWSHOT_SPLIT = test +SAMPLE_SPLIT = train +FEWSHOT_SPLIT = train RESPONSE_TYPE = LOGLIKELIHOODS METRICS = [AccuracyLoglikelihood, AccuracyNormLoglikelihood, BitsPerByteLoglikelihood] SUBJECTS = ['no_subject'] diff --git a/docs/tasks/README.md b/docs/tasks/README.md index 38ec0d69..052988ad 100644 --- a/docs/tasks/README.md +++ b/docs/tasks/README.md @@ -2,7 +2,7 @@ This directory contains the generated documentation for all benchmark tasks available in the package. -**Total number of tasks: 157** +**Total number of tasks: 158** The documentation can be generated or updated with `uv run -m eval_framework.utils.generate_task_docs`. @@ -42,6 +42,7 @@ NOTE: This is an automatically generated file. Any manual modifications will not - [DUC_EXTRACTIVE](DUC_EXTRACTIVE.md) - [DropCloze](DropCloze.md) - [DropCompletion](DropCompletion.md) +- [DropCompletion_OLMES](DropCompletion_OLMES.md) - [DropMC](DropMC.md) - [DropMC_OLMES](DropMC_OLMES.md) - [Flores200](Flores200.md) diff --git a/docs/tasks/SCIQ_OLMES.md b/docs/tasks/SCIQ_OLMES.md index 4fe2a1d9..9cae57d9 100644 --- a/docs/tasks/SCIQ_OLMES.md +++ b/docs/tasks/SCIQ_OLMES.md @@ -3,8 +3,8 @@ ```` NAME = SCIQ_OLMES DATASET_PATH = allenai/sciq -SAMPLE_SPLIT = validation -FEWSHOT_SPLIT = test +SAMPLE_SPLIT = train +FEWSHOT_SPLIT = train RESPONSE_TYPE = LOGLIKELIHOODS METRICS = [AccuracyLoglikelihood, AccuracyNormLoglikelihood, BitsPerByteLoglikelihood] SUBJECTS = ['no_subject'] diff --git a/docs/tasks/SocialIQAMC_OLMES.md b/docs/tasks/SocialIQAMC_OLMES.md index 5f4f7e7b..ae4a269f 100644 --- a/docs/tasks/SocialIQAMC_OLMES.md +++ b/docs/tasks/SocialIQAMC_OLMES.md @@ -3,7 +3,7 @@ ```` NAME = SocialIQAMC_OLMES DATASET_PATH = allenai/social_i_qa -SAMPLE_SPLIT = validation +SAMPLE_SPLIT = train FEWSHOT_SPLIT = train RESPONSE_TYPE = LOGLIKELIHOODS METRICS = [AccuracyLoglikelihood, AccuracyNormLoglikelihood, BitsPerByteLoglikelihood] diff --git a/src/eval_framework/metrics/loglikelihood/bits_per_byte.py b/src/eval_framework/metrics/loglikelihood/bits_per_byte.py index adcc29fb..4a2909f4 100644 --- a/src/eval_framework/metrics/loglikelihood/bits_per_byte.py +++ b/src/eval_framework/metrics/loglikelihood/bits_per_byte.py @@ -1,7 +1,7 @@ import math from eval_framework.metrics.base import BaseMetric, MetricResult -from eval_framework.shared.types import Loglikelihood +from eval_framework.shared.types import Error, Loglikelihood class BitsPerByteLoglikelihood(BaseMetric[Loglikelihood]): @@ -37,7 +37,12 @@ def calculate(self, response: Loglikelihood) -> list[MetricResult]: metric_name=self.NAME, value=None, higher_is_better=False, - error=response.error or "No ground-truth answer found in loglikelihoods", + error=response.error + or Error( + error_class="ValueError", + message="No ground-truth answer found in loglikelihoods", + traceback="", + ), ) ] @@ -48,7 +53,12 @@ def calculate(self, response: Loglikelihood) -> list[MetricResult]: metric_name=self.NAME, value=None, higher_is_better=False, - error=response.error or "Ground-truth answer has zero UTF-8 bytes", + error=response.error + or Error( + error_class="ValueError", + message="Ground-truth answer has zero UTF-8 bytes", + traceback="", + ), ) ] diff --git a/src/eval_framework/tasks/benchmarks/csqa.py b/src/eval_framework/tasks/benchmarks/csqa.py index a2a18d93..eb47f8f5 100644 --- a/src/eval_framework/tasks/benchmarks/csqa.py +++ b/src/eval_framework/tasks/benchmarks/csqa.py @@ -82,6 +82,8 @@ class CommonsenseQAMC_OLMES(CommonsenseQAMC): """ NAME = "CommonsenseQAMC_OLMES" + SAMPLE_SPLIT = "train" # Use train split (largest) to best match OLMES, which evaluates all splits + FEWSHOT_SPLIT = "train" def _get_instruction_text(self, item: dict[str, Any]) -> str: question = item["question"] diff --git a/src/eval_framework/tasks/benchmarks/drop.py b/src/eval_framework/tasks/benchmarks/drop.py index 4f8d6aee..bdda87d3 100644 --- a/src/eval_framework/tasks/benchmarks/drop.py +++ b/src/eval_framework/tasks/benchmarks/drop.py @@ -86,14 +86,21 @@ def __init__(self, num_fewshot: int = 0) -> None: def _load_dataset(self, subject: str) -> None: hf_dataset = self._load_hf_dataset(path=self.DATASET_PATH) - validation = list(hf_dataset.get(self.SAMPLE_SPLIT, [])) - processed = [] - for doc in validation: - parsed = _get_answers(doc) - if not parsed: - continue - processed.append({**doc, "parsed_answers": parsed}) - self.dataset = self._shuffle_splits(hf_dataset={self.SAMPLE_SPLIT: processed, self.FEWSHOT_SPLIT: processed}) + + def process(docs: list[dict[str, Any]]) -> list[dict[str, Any]]: + result = [] + for doc in docs: + parsed = _get_answers(doc) + if not parsed: + continue + result.append({**doc, "parsed_answers": parsed}) + return result + + sample_split = process(hf_dataset.get(self.SAMPLE_SPLIT, [])) + fewshot_split = process(hf_dataset.get(self.FEWSHOT_SPLIT, [])) + self.dataset = self._shuffle_splits( + hf_dataset={self.SAMPLE_SPLIT: sample_split, self.FEWSHOT_SPLIT: fewshot_split} + ) def _get_instruction_text(self, item: dict[str, Any]) -> str: passage = (item.get("passage") or "").strip() @@ -116,6 +123,17 @@ def _get_context(self, item: dict[str, Any]) -> DropMetricContext | None: return DropMetricContext(answer_tuples=[list(a) for a in answers]) +class DropCompletion_OLMES(DropCompletion): + """DropCompletion matching OLMES, using train split for fewshot and max tokens 100.""" + + NAME = "DropCompletion_OLMES" + FEWSHOT_SPLIT = "train" + + def __init__(self, num_fewshot: int = 0) -> None: + super().__init__(num_fewshot) + self.max_tokens = 100 + + class DropMC(BaseTask[str]): """Multiple-choice variant using allenai/drop-gen2mc (passage_original, question_original, choices, answerKey).""" @@ -151,6 +169,9 @@ def _get_possible_completions(self, item: dict[str, Any]) -> list[str] | None: labels = item.get("choices", {}).get("label", []) return [f" {label}" for label in labels] + def _get_cue_text(self, item: dict[str, Any]) -> str: + return "Answer:" + class DropMC_OLMES(DropMC): """ diff --git a/src/eval_framework/tasks/benchmarks/medqa.py b/src/eval_framework/tasks/benchmarks/medqa.py index 38e00087..c5818095 100644 --- a/src/eval_framework/tasks/benchmarks/medqa.py +++ b/src/eval_framework/tasks/benchmarks/medqa.py @@ -75,6 +75,7 @@ class MedQAMC_OLMES(MedQAMC): """ NAME = "MedQAMC_OLMES" + FEWSHOT_SPLIT = "train" def _get_instruction_text(self, item: dict[str, Any]) -> str: question = item["question"] diff --git a/src/eval_framework/tasks/benchmarks/piqa.py b/src/eval_framework/tasks/benchmarks/piqa.py index 5f4f6fdb..279adeba 100644 --- a/src/eval_framework/tasks/benchmarks/piqa.py +++ b/src/eval_framework/tasks/benchmarks/piqa.py @@ -52,6 +52,8 @@ class PIQA_OLMES(PIQA): """ NAME = "PIQA_OLMES" + SAMPLE_SPLIT = "train" # Use train split (largest) to best match OLMES, which evaluates all splits + FEWSHOT_SPLIT = "train" def __init__(self, num_fewshot: int = 0) -> None: super().__init__(num_fewshot) @@ -61,7 +63,7 @@ def _get_instruction_text(self, item: dict[str, Any]) -> str: goal = item["goal"] choices = [item["sol1"], item["sol2"]] options = "\n".join(f" {key}. {choice}" for key, choice in zip(self.keys, choices)) - return f"Question: {goal}\n{options}\n" + return f"Goal: {goal}\n{options}\n" def _get_ground_truth(self, item: dict[str, Any]) -> str | None: idx = 0 if item["label"] == 0 else 1 diff --git a/src/eval_framework/tasks/benchmarks/sciq.py b/src/eval_framework/tasks/benchmarks/sciq.py index b438d094..256bf39a 100644 --- a/src/eval_framework/tasks/benchmarks/sciq.py +++ b/src/eval_framework/tasks/benchmarks/sciq.py @@ -70,6 +70,8 @@ class SCIQ_OLMES(SCIQ): """ NAME = "SciQ_OLMES" + SAMPLE_SPLIT = "train" # Use train split (largest) to best match OLMES, which evaluates all splits + FEWSHOT_SPLIT = "train" def __init__(self, num_fewshot: int = 0) -> None: super().__init__(num_fewshot) diff --git a/src/eval_framework/tasks/benchmarks/social_iqa.py b/src/eval_framework/tasks/benchmarks/social_iqa.py index ce489d81..074de084 100644 --- a/src/eval_framework/tasks/benchmarks/social_iqa.py +++ b/src/eval_framework/tasks/benchmarks/social_iqa.py @@ -155,7 +155,11 @@ class SocialIQACloze(BaseTask[str]): SAMPLE_SPLIT = "validation" FEWSHOT_SPLIT = "train" RESPONSE_TYPE = ResponseType.LOGLIKELIHOODS - METRICS = [AccuracyLoglikelihood, AccuracyNormLoglikelihood, BitsPerByteLoglikelihood] + METRICS = [ + AccuracyLoglikelihood, + AccuracyNormLoglikelihood, + BitsPerByteLoglikelihood, + ] SUBJECTS = [NO_SUBJECT] PERTURBATION_UNMODIFIABLE_WORDS = ["Question"] LANGUAGE = Language.ENG @@ -174,6 +178,11 @@ def _get_ground_truth(self, item: dict[str, Any]) -> str | None: choices = [item["answerA"], item["answerB"], item["answerC"]] return f" {choices[idx]}" + def _get_fewshot_target_text(self, item: dict[str, Any]) -> str: + ground_truth = self._get_ground_truth(item) + assert ground_truth is not None + return f"{self._get_cue_text(item)}{ground_truth}" + def _get_cue_text(self, item: dict[str, Any]) -> str: return "Answer:" @@ -189,6 +198,7 @@ class SocialIQAMC_OLMES(SocialIQACloze): """ NAME = "SocialIQAMC_OLMES" + SAMPLE_SPLIT = "train" # Use train split (largest) to best match OLMES, which evaluates all splits def _get_instruction_text(self, item: dict[str, Any]) -> str: query = _social_iqa_context_question(item) @@ -214,6 +224,7 @@ class SocialIQAMC(SocialIQAMC_OLMES): """ NAME = "SocialIQAMC" + SAMPLE_SPLIT = "validation" def _get_instruction_text(self, item: dict[str, Any]) -> str: query = _social_iqa_context_question(item) diff --git a/src/eval_framework/tasks/task_names.py b/src/eval_framework/tasks/task_names.py index 0dd11e16..c9778d5e 100644 --- a/src/eval_framework/tasks/task_names.py +++ b/src/eval_framework/tasks/task_names.py @@ -158,6 +158,7 @@ def register_all_tasks() -> None: register_lazy_task("eval_framework.tasks.benchmarks.csqa.CommonsenseQAMC") register_lazy_task("eval_framework.tasks.benchmarks.csqa.CommonsenseQAMC_OLMES") register_lazy_task("eval_framework.tasks.benchmarks.drop.DropCompletion") + register_lazy_task("eval_framework.tasks.benchmarks.drop.DropCompletion_OLMES") register_lazy_task("eval_framework.tasks.benchmarks.drop.DropMC") register_lazy_task("eval_framework.tasks.benchmarks.drop.DropMC_OLMES") register_lazy_task("eval_framework.tasks.benchmarks.drop.DropCloze") diff --git a/tests/tests_eval_framework/tasks/task-prompts-hashes.json b/tests/tests_eval_framework/tasks/task-prompts-hashes.json index f953fddd..4f5fb4c8 100644 --- a/tests/tests_eval_framework/tasks/task-prompts-hashes.json +++ b/tests/tests_eval_framework/tasks/task-prompts-hashes.json @@ -53,8 +53,8 @@ "CommonsenseQAFullTextCloze.Llama3Formatter": "7433c7dfb21e0da6ff0566c886bdd29c", "CommonsenseQAMC.ConcatFormatter": "b49e21abf30a6c6b6f2d5665914fae6e", "CommonsenseQAMC.Llama3Formatter": "1b97fe5fecd8a27e3f587693318bf17a", - "CommonsenseQAMC_OLMES.ConcatFormatter": "b49e21abf30a6c6b6f2d5665914fae6e", - "CommonsenseQAMC_OLMES.Llama3Formatter": "1b97fe5fecd8a27e3f587693318bf17a", + "CommonsenseQAMC_OLMES.ConcatFormatter": "ca6acb05cfe4bf09a0f2321c5561d293", + "CommonsenseQAMC_OLMES.Llama3Formatter": "f41e71ef8389525e730c5f8b41026cf5", "DUC_ABSTRACTIVE.ConcatFormatter": "ab9a49f844aae7cd40470a9c8c1012ad", "DUC_ABSTRACTIVE.Llama3Formatter": "b425e6e924f57ebbaeaa7a333d565d05", "DUC_EXTRACTIVE.ConcatFormatter": "d773f051727e563369af29e183b2046b", @@ -63,10 +63,12 @@ "DropCloze.Llama3Formatter": "3085d2db0aee52171eca445f2a710ed2", "DropCompletion.ConcatFormatter": "5efb46197ee12236e2e674a5e9b93976", "DropCompletion.Llama3Formatter": "a7c7ef7e39aefdef7d6227c46ccc6f05", - "DropMC.ConcatFormatter": "eb61c2c3e928559645a950d1f38b2bd8", - "DropMC.Llama3Formatter": "9ef3c50c1e2977a8c365b297e72e6077", - "DropMC_OLMES.ConcatFormatter": "581072d364a18d07164e7c36351b1992", - "DropMC_OLMES.Llama3Formatter": "cc33eace09b710fc53b51ba3851d52c9", + "DropCompletion_OLMES.ConcatFormatter": "388e4b193e8a87fc4207cff336687684", + "DropCompletion_OLMES.Llama3Formatter": "e285c67c3d807d9e667af0a56aeb95ec", + "DropMC.ConcatFormatter": "f50d96ea6246b9838ea0f20cec7f6167", + "DropMC.Llama3Formatter": "4a6849c53fdf51f8e6743f99716f9b24", + "DropMC_OLMES.ConcatFormatter": "798cdb2ca51bcb8c95cb7d086f2ca6cf", + "DropMC_OLMES.Llama3Formatter": "abf0fa6d3b668f00df6cd8fbe6dcf611", "DropRC.ConcatFormatter": "b4fc7afed1c5d3fc7b185abfee10a94b", "DropRC.Llama3Formatter": "9da70cae44e38ea15e4144f50395b4a4", "Flores200.ConcatFormatter": "f9cc5b93d0611404471d88dd64aa6c19", @@ -200,8 +202,8 @@ "MedQACloze.Llama3Formatter": "1308150c38062e8c34785f12e89b8c63", "MedQAMC.ConcatFormatter": "786270f80b7b122fa0098ffbbfdba446", "MedQAMC.Llama3Formatter": "fac0fd8a8d5d70e54686dea58f57fcb3", - "MedQAMC_OLMES.ConcatFormatter": "b3c29f76fa80e85d027d1ba3d6750478", - "MedQAMC_OLMES.Llama3Formatter": "c99d9ddb97aa1562705e810b1aff2e32", + "MedQAMC_OLMES.ConcatFormatter": "d5bac9bda6fc752ecc751be2cc23474b", + "MedQAMC_OLMES.Llama3Formatter": "40f759deb287df3adb3247b07c857efd", "NaturalQsOpen.ConcatFormatter": "90a895244f9cd20ca3200808ff5ddbab", "NaturalQsOpen.Llama3Formatter": "0955486e04308ca936b2c5267e5e9be9", "NaturalQsOpenCloze.ConcatFormatter": "49042482667eeaa9363c6247927f642f", @@ -226,8 +228,8 @@ "PIQA.Llama3Formatter": "9d33895f4024a9a1e61a00626e312313", "PIQA_IDK.ConcatFormatter": "4c07fad4f1409a22bc9835eb848feecb", "PIQA_IDK.Llama3Formatter": "94061c00140859fb99aa88a1e63b1b23", - "PIQA_OLMES.ConcatFormatter": "b5dc0e23810e4bb4801d07fdd54e6407", - "PIQA_OLMES.Llama3Formatter": "087eb23ebf6a54d5d286f59713669ad8", + "PIQA_OLMES.ConcatFormatter": "dd8a928404989320ce7b712d149bf5f5", + "PIQA_OLMES.Llama3Formatter": "e605ca9fb312e9309206e5aeba259169", "QUALITY.ConcatFormatter": "49e990c091cf8b46b3afd1ae8db84797", "QUALITY.Llama3Formatter": "408165ab35d5153c9f144021cbd27544", "RenderableStructEval.ConcatFormatter": "40bd2d2bdd9e663bf1cbc147eceafced", @@ -240,8 +242,8 @@ "SCIQEvalHarness_IDK.Llama3Formatter": "3e8e56b3b7662cff6ea2943e1bb88ba0", "SCIQ_IDK.ConcatFormatter": "ffc25631a289d4dd536a5b6660cfb17c", "SCIQ_IDK.Llama3Formatter": "bffb5e3d9a9f1b18a96fd74659c43c27", - "SCIQ_OLMES.ConcatFormatter": "a3f16028e92d9690c58522246032fee8", - "SCIQ_OLMES.Llama3Formatter": "7d5edb428b339cefb272907c3a368621", + "SCIQ_OLMES.ConcatFormatter": "820f12c83c89dfe8bbe19e062c37d259", + "SCIQ_OLMES.Llama3Formatter": "4172e30c45aa2105e1db0cd49dcf9cb7", "SPHYR.ConcatFormatter": "f6b6f1044fda6325b744281256a81ba9", "SPHYR.Llama3Formatter": "e67de80f6465104b6e511c5d339b9e45", "SQUAD.ConcatFormatter": "88bf0d58404c024dd885217d19466f0d", @@ -250,12 +252,12 @@ "SQUAD2.Llama3Formatter": "118f3757859029bee7da4bdfdb9fdf0b", "SQUAD2BPB.ConcatFormatter": "a1ae38489334319a2d0ef3a8c503ec4c", "SQUAD2BPB.Llama3Formatter": "097cea2ecf27f0d24b52633fa3ce363a", - "SocialIQACloze.ConcatFormatter": "9bd18f84263ae1dd28ea1099d1392b32", - "SocialIQACloze.Llama3Formatter": "10b3c20fe7a78b803377caef818fb31f", - "SocialIQAMC.ConcatFormatter": "1659f621b1dc4f410536d55f2e81c932", - "SocialIQAMC.Llama3Formatter": "c5ed5cd29b021fa0c71061905c48dfc1", - "SocialIQAMC_OLMES.ConcatFormatter": "7d3017ef98a2fe64ae8439555c3c1440", - "SocialIQAMC_OLMES.Llama3Formatter": "a49d524f98ea43d1deafc7b833c34529", + "SocialIQACloze.ConcatFormatter": "e20a2c81592da55f2d6e6c88d3a7104f", + "SocialIQACloze.Llama3Formatter": "1d4ddde3cc8857a459bd37f7b90ed9db", + "SocialIQAMC.ConcatFormatter": "96f91645eace7a45a93d37e49451a64d", + "SocialIQAMC.Llama3Formatter": "e0d30544bde34934a4695bc20698c882", + "SocialIQAMC_OLMES.ConcatFormatter": "e5e5dac8348978b6b8e0311c66fa1877", + "SocialIQAMC_OLMES.Llama3Formatter": "ac1a7164ab0a1b4eb6d6a5ed055b4be0", "StructEval.ConcatFormatter": "280bb4a5e01945c4d25688bbb17468b5", "StructEval.Llama3Formatter": "3441d8806336ce590e09c9681d8c8f6f", "TRIVIAQA.ConcatFormatter": "4ee69bbf9d5ade2e7ed61e7455b06b0b",