From e06a7582a78492d818ca64d0995ac785bc543b8e Mon Sep 17 00:00:00 2001 From: Frank Schneider Date: Tue, 24 Feb 2026 15:48:07 +0000 Subject: [PATCH 01/16] fix: CSQA in OLMES uses train for fewshots --- src/eval_framework/tasks/benchmarks/csqa.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/eval_framework/tasks/benchmarks/csqa.py b/src/eval_framework/tasks/benchmarks/csqa.py index a2a18d93..57c328ea 100644 --- a/src/eval_framework/tasks/benchmarks/csqa.py +++ b/src/eval_framework/tasks/benchmarks/csqa.py @@ -82,6 +82,7 @@ class CommonsenseQAMC_OLMES(CommonsenseQAMC): """ NAME = "CommonsenseQAMC_OLMES" + FEWSHOT_SPLIT = "train" def _get_instruction_text(self, item: dict[str, Any]) -> str: question = item["question"] From e9bb06a631146926ec5282d6e23751d14af39c25 Mon Sep 17 00:00:00 2001 From: Frank Schneider Date: Tue, 24 Feb 2026 15:48:45 +0000 Subject: [PATCH 02/16] fix: MedQAMC in OLMES uses train for fewshots --- src/eval_framework/tasks/benchmarks/medqa.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/eval_framework/tasks/benchmarks/medqa.py b/src/eval_framework/tasks/benchmarks/medqa.py index 38e00087..c5818095 100644 --- a/src/eval_framework/tasks/benchmarks/medqa.py +++ b/src/eval_framework/tasks/benchmarks/medqa.py @@ -75,6 +75,7 @@ class MedQAMC_OLMES(MedQAMC): """ NAME = "MedQAMC_OLMES" + FEWSHOT_SPLIT = "train" def _get_instruction_text(self, item: dict[str, Any]) -> str: question = item["question"] From 16f135e87b401d00215f0f9ce456b359c218b06e Mon Sep 17 00:00:00 2001 From: Frank Schneider Date: Tue, 24 Feb 2026 15:49:03 +0000 Subject: [PATCH 03/16] fix: SciQ in OLMES uses train for fewshots --- src/eval_framework/tasks/benchmarks/sciq.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/eval_framework/tasks/benchmarks/sciq.py b/src/eval_framework/tasks/benchmarks/sciq.py index b438d094..2280e87a 100644 --- a/src/eval_framework/tasks/benchmarks/sciq.py +++ b/src/eval_framework/tasks/benchmarks/sciq.py @@ -70,6 +70,7 @@ class 
SCIQ_OLMES(SCIQ): """ NAME = "SciQ_OLMES" + FEWSHOT_SPLIT = "train" def __init__(self, num_fewshot: int = 0) -> None: super().__init__(num_fewshot) From 74926d82fb5fa5f7b6f3f4cd30cc3b6a841969ee Mon Sep 17 00:00:00 2001 From: Frank Schneider Date: Tue, 24 Feb 2026 15:49:44 +0000 Subject: [PATCH 04/16] fix: PIQA in OLMES uses train for fewshots; Use "Goal" instead of "Question" as prefix. --- src/eval_framework/tasks/benchmarks/piqa.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/eval_framework/tasks/benchmarks/piqa.py b/src/eval_framework/tasks/benchmarks/piqa.py index 5f4f6fdb..128912a7 100644 --- a/src/eval_framework/tasks/benchmarks/piqa.py +++ b/src/eval_framework/tasks/benchmarks/piqa.py @@ -52,6 +52,7 @@ class PIQA_OLMES(PIQA): """ NAME = "PIQA_OLMES" + FEWSHOT_SPLIT = "train" def __init__(self, num_fewshot: int = 0) -> None: super().__init__(num_fewshot) @@ -61,7 +62,7 @@ def _get_instruction_text(self, item: dict[str, Any]) -> str: goal = item["goal"] choices = [item["sol1"], item["sol2"]] options = "\n".join(f" {key}. 
{choice}" for key, choice in zip(self.keys, choices)) - return f"Question: {goal}\n{options}\n" + return f"Goal: {goal}\n{options}\n" def _get_ground_truth(self, item: dict[str, Any]) -> str | None: idx = 0 if item["label"] == 0 else 1 From 313754343052e4374afba61defa2c5682acbc329 Mon Sep 17 00:00:00 2001 From: Frank Schneider Date: Tue, 24 Feb 2026 15:51:15 +0000 Subject: [PATCH 05/16] fix: Add cue text for DropMC; DropCompletion in OLMES uses max tokens=100 and train for fewshots; --- src/eval_framework/tasks/benchmarks/drop.py | 37 ++++++++++++++++----- src/eval_framework/tasks/task_names.py | 1 + 2 files changed, 30 insertions(+), 8 deletions(-) diff --git a/src/eval_framework/tasks/benchmarks/drop.py b/src/eval_framework/tasks/benchmarks/drop.py index 4f8d6aee..a0d2b947 100644 --- a/src/eval_framework/tasks/benchmarks/drop.py +++ b/src/eval_framework/tasks/benchmarks/drop.py @@ -86,14 +86,21 @@ def __init__(self, num_fewshot: int = 0) -> None: def _load_dataset(self, subject: str) -> None: hf_dataset = self._load_hf_dataset(path=self.DATASET_PATH) - validation = list(hf_dataset.get(self.SAMPLE_SPLIT, [])) - processed = [] - for doc in validation: - parsed = _get_answers(doc) - if not parsed: - continue - processed.append({**doc, "parsed_answers": parsed}) - self.dataset = self._shuffle_splits(hf_dataset={self.SAMPLE_SPLIT: processed, self.FEWSHOT_SPLIT: processed}) + + def process(docs): + result = [] + for doc in docs: + parsed = _get_answers(doc) + if not parsed: + continue + result.append({**doc, "parsed_answers": parsed}) + return result + + sample_split = process(hf_dataset.get(self.SAMPLE_SPLIT, [])) + fewshot_split = process(hf_dataset.get(self.FEWSHOT_SPLIT, [])) + self.dataset = self._shuffle_splits( + hf_dataset={self.SAMPLE_SPLIT: sample_split, self.FEWSHOT_SPLIT: fewshot_split} + ) def _get_instruction_text(self, item: dict[str, Any]) -> str: passage = (item.get("passage") or "").strip() @@ -116,6 +123,17 @@ def _get_context(self, item: dict[str, 
Any]) -> DropMetricContext | None: return DropMetricContext(answer_tuples=[list(a) for a in answers]) +class DropCompletion_OLMES(DropCompletion): + """DropCompletion matching OLMES, using train split for fewshot and max tokens 100.""" + + NAME = "DropCompletion_OLMES" + FEWSHOT_SPLIT = "train" + + def __init__(self, num_fewshot: int = 0) -> None: + super().__init__(num_fewshot) + self.max_tokens = 100 + + class DropMC(BaseTask[str]): """Multiple-choice variant using allenai/drop-gen2mc (passage_original, question_original, choices, answerKey).""" @@ -151,6 +169,9 @@ def _get_possible_completions(self, item: dict[str, Any]) -> list[str] | None: labels = item.get("choices", {}).get("label", []) return [f" {label}" for label in labels] + def _get_cue_text(self, item: dict[str, Any]) -> str: + return "Answer:" + class DropMC_OLMES(DropMC): """ diff --git a/src/eval_framework/tasks/task_names.py b/src/eval_framework/tasks/task_names.py index 0dd11e16..c9778d5e 100644 --- a/src/eval_framework/tasks/task_names.py +++ b/src/eval_framework/tasks/task_names.py @@ -158,6 +158,7 @@ def register_all_tasks() -> None: register_lazy_task("eval_framework.tasks.benchmarks.csqa.CommonsenseQAMC") register_lazy_task("eval_framework.tasks.benchmarks.csqa.CommonsenseQAMC_OLMES") register_lazy_task("eval_framework.tasks.benchmarks.drop.DropCompletion") + register_lazy_task("eval_framework.tasks.benchmarks.drop.DropCompletion_OLMES") register_lazy_task("eval_framework.tasks.benchmarks.drop.DropMC") register_lazy_task("eval_framework.tasks.benchmarks.drop.DropMC_OLMES") register_lazy_task("eval_framework.tasks.benchmarks.drop.DropCloze") From 57b5c4e2664de8b9373756bbe294acd3c5758025 Mon Sep 17 00:00:00 2001 From: Frank Schneider Date: Tue, 24 Feb 2026 15:52:11 +0000 Subject: [PATCH 06/16] test: Update hashes with new tasks and different fewshot sources. 
--- .../tasks/task-prompts-hashes.json | 26 ++++++++++--------- 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/tests/tests_eval_framework/tasks/task-prompts-hashes.json b/tests/tests_eval_framework/tasks/task-prompts-hashes.json index f953fddd..a5cae757 100644 --- a/tests/tests_eval_framework/tasks/task-prompts-hashes.json +++ b/tests/tests_eval_framework/tasks/task-prompts-hashes.json @@ -53,8 +53,8 @@ "CommonsenseQAFullTextCloze.Llama3Formatter": "7433c7dfb21e0da6ff0566c886bdd29c", "CommonsenseQAMC.ConcatFormatter": "b49e21abf30a6c6b6f2d5665914fae6e", "CommonsenseQAMC.Llama3Formatter": "1b97fe5fecd8a27e3f587693318bf17a", - "CommonsenseQAMC_OLMES.ConcatFormatter": "b49e21abf30a6c6b6f2d5665914fae6e", - "CommonsenseQAMC_OLMES.Llama3Formatter": "1b97fe5fecd8a27e3f587693318bf17a", + "CommonsenseQAMC_OLMES.ConcatFormatter": "3ab1a269213eab2a684fa186320b8f00", + "CommonsenseQAMC_OLMES.Llama3Formatter": "0509c4c4ba589a35600fdbdec2ed2f9c", "DUC_ABSTRACTIVE.ConcatFormatter": "ab9a49f844aae7cd40470a9c8c1012ad", "DUC_ABSTRACTIVE.Llama3Formatter": "b425e6e924f57ebbaeaa7a333d565d05", "DUC_EXTRACTIVE.ConcatFormatter": "d773f051727e563369af29e183b2046b", @@ -63,10 +63,12 @@ "DropCloze.Llama3Formatter": "3085d2db0aee52171eca445f2a710ed2", "DropCompletion.ConcatFormatter": "5efb46197ee12236e2e674a5e9b93976", "DropCompletion.Llama3Formatter": "a7c7ef7e39aefdef7d6227c46ccc6f05", - "DropMC.ConcatFormatter": "eb61c2c3e928559645a950d1f38b2bd8", - "DropMC.Llama3Formatter": "9ef3c50c1e2977a8c365b297e72e6077", - "DropMC_OLMES.ConcatFormatter": "581072d364a18d07164e7c36351b1992", - "DropMC_OLMES.Llama3Formatter": "cc33eace09b710fc53b51ba3851d52c9", + "DropCompletion_OLMES.ConcatFormatter": "388e4b193e8a87fc4207cff336687684", + "DropCompletion_OLMES.Llama3Formatter": "e285c67c3d807d9e667af0a56aeb95ec", + "DropMC.ConcatFormatter": "f50d96ea6246b9838ea0f20cec7f6167", + "DropMC.Llama3Formatter": "4a6849c53fdf51f8e6743f99716f9b24", + "DropMC_OLMES.ConcatFormatter": 
"798cdb2ca51bcb8c95cb7d086f2ca6cf", + "DropMC_OLMES.Llama3Formatter": "abf0fa6d3b668f00df6cd8fbe6dcf611", "DropRC.ConcatFormatter": "b4fc7afed1c5d3fc7b185abfee10a94b", "DropRC.Llama3Formatter": "9da70cae44e38ea15e4144f50395b4a4", "Flores200.ConcatFormatter": "f9cc5b93d0611404471d88dd64aa6c19", @@ -200,8 +202,8 @@ "MedQACloze.Llama3Formatter": "1308150c38062e8c34785f12e89b8c63", "MedQAMC.ConcatFormatter": "786270f80b7b122fa0098ffbbfdba446", "MedQAMC.Llama3Formatter": "fac0fd8a8d5d70e54686dea58f57fcb3", - "MedQAMC_OLMES.ConcatFormatter": "b3c29f76fa80e85d027d1ba3d6750478", - "MedQAMC_OLMES.Llama3Formatter": "c99d9ddb97aa1562705e810b1aff2e32", + "MedQAMC_OLMES.ConcatFormatter": "d5bac9bda6fc752ecc751be2cc23474b", + "MedQAMC_OLMES.Llama3Formatter": "40f759deb287df3adb3247b07c857efd", "NaturalQsOpen.ConcatFormatter": "90a895244f9cd20ca3200808ff5ddbab", "NaturalQsOpen.Llama3Formatter": "0955486e04308ca936b2c5267e5e9be9", "NaturalQsOpenCloze.ConcatFormatter": "49042482667eeaa9363c6247927f642f", @@ -226,8 +228,8 @@ "PIQA.Llama3Formatter": "9d33895f4024a9a1e61a00626e312313", "PIQA_IDK.ConcatFormatter": "4c07fad4f1409a22bc9835eb848feecb", "PIQA_IDK.Llama3Formatter": "94061c00140859fb99aa88a1e63b1b23", - "PIQA_OLMES.ConcatFormatter": "b5dc0e23810e4bb4801d07fdd54e6407", - "PIQA_OLMES.Llama3Formatter": "087eb23ebf6a54d5d286f59713669ad8", + "PIQA_OLMES.ConcatFormatter": "50fe249617e4bb36af4b114cd5d5023a", + "PIQA_OLMES.Llama3Formatter": "74aa663615a57e0184dddc8392b8189a", "QUALITY.ConcatFormatter": "49e990c091cf8b46b3afd1ae8db84797", "QUALITY.Llama3Formatter": "408165ab35d5153c9f144021cbd27544", "RenderableStructEval.ConcatFormatter": "40bd2d2bdd9e663bf1cbc147eceafced", @@ -240,8 +242,8 @@ "SCIQEvalHarness_IDK.Llama3Formatter": "3e8e56b3b7662cff6ea2943e1bb88ba0", "SCIQ_IDK.ConcatFormatter": "ffc25631a289d4dd536a5b6660cfb17c", "SCIQ_IDK.Llama3Formatter": "bffb5e3d9a9f1b18a96fd74659c43c27", - "SCIQ_OLMES.ConcatFormatter": "a3f16028e92d9690c58522246032fee8", - 
"SCIQ_OLMES.Llama3Formatter": "7d5edb428b339cefb272907c3a368621", + "SCIQ_OLMES.ConcatFormatter": "5fe7025f625b54eddf9dc41a0958374a", + "SCIQ_OLMES.Llama3Formatter": "b8551b6a33192e9746307a7f5ada0c77", "SPHYR.ConcatFormatter": "f6b6f1044fda6325b744281256a81ba9", "SPHYR.Llama3Formatter": "e67de80f6465104b6e511c5d339b9e45", "SQUAD.ConcatFormatter": "88bf0d58404c024dd885217d19466f0d", From 4318bd5e3c24156712e3667cffc43aad54bf72b3 Mon Sep 17 00:00:00 2001 From: Frank Schneider Date: Tue, 24 Feb 2026 15:55:33 +0000 Subject: [PATCH 07/16] docs: Update docs with changes --- docs/tasks/CommonsenseQAMC_OLMES.md | 2 +- docs/tasks/DropCompletion_OLMES.md | 20 ++++++++++++++++++++ docs/tasks/MedQAMC_OLMES.md | 2 +- docs/tasks/PIQA_OLMES.md | 2 +- docs/tasks/README.md | 3 ++- docs/tasks/SCIQ_OLMES.md | 2 +- 6 files changed, 26 insertions(+), 5 deletions(-) create mode 100644 docs/tasks/DropCompletion_OLMES.md diff --git a/docs/tasks/CommonsenseQAMC_OLMES.md b/docs/tasks/CommonsenseQAMC_OLMES.md index 1abd8d77..2b4de843 100644 --- a/docs/tasks/CommonsenseQAMC_OLMES.md +++ b/docs/tasks/CommonsenseQAMC_OLMES.md @@ -4,7 +4,7 @@ NAME = CommonsenseQAMC_OLMES DATASET_PATH = tau/commonsense_qa SAMPLE_SPLIT = validation -FEWSHOT_SPLIT = validation +FEWSHOT_SPLIT = train RESPONSE_TYPE = LOGLIKELIHOODS METRICS = [AccuracyLoglikelihood, AccuracyNormLoglikelihood] SUBJECTS = ['no_subject'] diff --git a/docs/tasks/DropCompletion_OLMES.md b/docs/tasks/DropCompletion_OLMES.md new file mode 100644 index 00000000..a746d834 --- /dev/null +++ b/docs/tasks/DropCompletion_OLMES.md @@ -0,0 +1,20 @@ +# DropCompletion_OLMES + +```` +NAME = DropCompletion_OLMES +DATASET_PATH = EleutherAI/drop +SAMPLE_SPLIT = validation +FEWSHOT_SPLIT = train +RESPONSE_TYPE = COMPLETION +METRICS = [DropF1ExactMatch] +SUBJECTS = ['no_subject'] +LANGUAGE = +```` + +- Module: `eval_framework.tasks.benchmarks.drop` + +- File: [src/eval_framework/tasks/benchmarks/drop.py](../../src/eval_framework/tasks/benchmarks/drop.py) | 
[View on GitHub](https://github.com/Aleph-Alpha-Research/eval-framework/blob/main/src/eval_framework/tasks/benchmarks/drop.py) + +- Link to dataset: [https://huggingface.co/datasets/EleutherAI/drop](https://huggingface.co/datasets/EleutherAI/drop) + +More detailed documentation, with prompt examples and ground truth completions, can be generated with `uv run -m eval_framework.utils.generate_task_docs --add-prompt-examples --only-tasks "DropCompletion_OLMES"`. diff --git a/docs/tasks/MedQAMC_OLMES.md b/docs/tasks/MedQAMC_OLMES.md index b6108997..575efac1 100644 --- a/docs/tasks/MedQAMC_OLMES.md +++ b/docs/tasks/MedQAMC_OLMES.md @@ -4,7 +4,7 @@ NAME = MedQAMC_OLMES DATASET_PATH = davidheineman/medqa-en SAMPLE_SPLIT = test -FEWSHOT_SPLIT = dev +FEWSHOT_SPLIT = train RESPONSE_TYPE = LOGLIKELIHOODS METRICS = [AccuracyLoglikelihood, AccuracyNormLoglikelihood, BitsPerByteLoglikelihood] SUBJECTS = ['no_subject'] diff --git a/docs/tasks/PIQA_OLMES.md b/docs/tasks/PIQA_OLMES.md index ca7610c0..4cffc9e2 100644 --- a/docs/tasks/PIQA_OLMES.md +++ b/docs/tasks/PIQA_OLMES.md @@ -4,7 +4,7 @@ NAME = PIQA_OLMES DATASET_PATH = ybisk/piqa SAMPLE_SPLIT = validation -FEWSHOT_SPLIT = test +FEWSHOT_SPLIT = train RESPONSE_TYPE = LOGLIKELIHOODS METRICS = [AccuracyLoglikelihood, AccuracyNormLoglikelihood, BitsPerByteLoglikelihood] SUBJECTS = ['no_subject'] diff --git a/docs/tasks/README.md b/docs/tasks/README.md index 38ec0d69..052988ad 100644 --- a/docs/tasks/README.md +++ b/docs/tasks/README.md @@ -2,7 +2,7 @@ This directory contains the generated documentation for all benchmark tasks available in the package. -**Total number of tasks: 157** +**Total number of tasks: 158** The documentation can be generated or updated with `uv run -m eval_framework.utils.generate_task_docs`. @@ -42,6 +42,7 @@ NOTE: This is an automatically generated file. 
Any manual modifications will not - [DUC_EXTRACTIVE](DUC_EXTRACTIVE.md) - [DropCloze](DropCloze.md) - [DropCompletion](DropCompletion.md) +- [DropCompletion_OLMES](DropCompletion_OLMES.md) - [DropMC](DropMC.md) - [DropMC_OLMES](DropMC_OLMES.md) - [Flores200](Flores200.md) diff --git a/docs/tasks/SCIQ_OLMES.md b/docs/tasks/SCIQ_OLMES.md index 4fe2a1d9..512bceb0 100644 --- a/docs/tasks/SCIQ_OLMES.md +++ b/docs/tasks/SCIQ_OLMES.md @@ -4,7 +4,7 @@ NAME = SCIQ_OLMES DATASET_PATH = allenai/sciq SAMPLE_SPLIT = validation -FEWSHOT_SPLIT = test +FEWSHOT_SPLIT = train RESPONSE_TYPE = LOGLIKELIHOODS METRICS = [AccuracyLoglikelihood, AccuracyNormLoglikelihood, BitsPerByteLoglikelihood] SUBJECTS = ['no_subject'] From ad7a672cd25cdc8c08f329392480c6b3be1059cc Mon Sep 17 00:00:00 2001 From: Frank Schneider Date: Wed, 25 Feb 2026 15:58:30 +0000 Subject: [PATCH 08/16] fix: Add missing "Answer:" prefix in fewshots --- src/eval_framework/tasks/benchmarks/social_iqa.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/src/eval_framework/tasks/benchmarks/social_iqa.py b/src/eval_framework/tasks/benchmarks/social_iqa.py index ce489d81..05effef0 100644 --- a/src/eval_framework/tasks/benchmarks/social_iqa.py +++ b/src/eval_framework/tasks/benchmarks/social_iqa.py @@ -155,7 +155,11 @@ class SocialIQACloze(BaseTask[str]): SAMPLE_SPLIT = "validation" FEWSHOT_SPLIT = "train" RESPONSE_TYPE = ResponseType.LOGLIKELIHOODS - METRICS = [AccuracyLoglikelihood, AccuracyNormLoglikelihood, BitsPerByteLoglikelihood] + METRICS = [ + AccuracyLoglikelihood, + AccuracyNormLoglikelihood, + BitsPerByteLoglikelihood, + ] SUBJECTS = [NO_SUBJECT] PERTURBATION_UNMODIFIABLE_WORDS = ["Question"] LANGUAGE = Language.ENG @@ -174,6 +178,11 @@ def _get_ground_truth(self, item: dict[str, Any]) -> str | None: choices = [item["answerA"], item["answerB"], item["answerC"]] return f" {choices[idx]}" + def _get_fewshot_target_text(self, item: dict[str, Any]) -> str: + ground_truth = 
self._get_ground_truth(item) + assert ground_truth is not None + return f"{self._get_cue_text(item)}{ground_truth}" + def _get_cue_text(self, item: dict[str, Any]) -> str: return "Answer:" From 49ed606adfaa47652eff7c3ec7b657b10224132e Mon Sep 17 00:00:00 2001 From: Frank Schneider Date: Thu, 26 Feb 2026 15:44:13 +0000 Subject: [PATCH 09/16] fix: If OLMES runs on all splits, select the largest of those --- src/eval_framework/tasks/benchmarks/csqa.py | 1 + src/eval_framework/tasks/benchmarks/piqa.py | 1 + src/eval_framework/tasks/benchmarks/sciq.py | 1 + src/eval_framework/tasks/benchmarks/social_iqa.py | 1 + 4 files changed, 4 insertions(+) diff --git a/src/eval_framework/tasks/benchmarks/csqa.py b/src/eval_framework/tasks/benchmarks/csqa.py index 57c328ea..eb47f8f5 100644 --- a/src/eval_framework/tasks/benchmarks/csqa.py +++ b/src/eval_framework/tasks/benchmarks/csqa.py @@ -82,6 +82,7 @@ class CommonsenseQAMC_OLMES(CommonsenseQAMC): """ NAME = "CommonsenseQAMC_OLMES" + SAMPLE_SPLIT = "train" # Use train split (largest) to best match OLMES, which evaluates all splits FEWSHOT_SPLIT = "train" def _get_instruction_text(self, item: dict[str, Any]) -> str: diff --git a/src/eval_framework/tasks/benchmarks/piqa.py b/src/eval_framework/tasks/benchmarks/piqa.py index 128912a7..279adeba 100644 --- a/src/eval_framework/tasks/benchmarks/piqa.py +++ b/src/eval_framework/tasks/benchmarks/piqa.py @@ -52,6 +52,7 @@ class PIQA_OLMES(PIQA): """ NAME = "PIQA_OLMES" + SAMPLE_SPLIT = "train" # Use train split (largest) to best match OLMES, which evaluates all splits FEWSHOT_SPLIT = "train" def __init__(self, num_fewshot: int = 0) -> None: diff --git a/src/eval_framework/tasks/benchmarks/sciq.py b/src/eval_framework/tasks/benchmarks/sciq.py index 2280e87a..256bf39a 100644 --- a/src/eval_framework/tasks/benchmarks/sciq.py +++ b/src/eval_framework/tasks/benchmarks/sciq.py @@ -70,6 +70,7 @@ class SCIQ_OLMES(SCIQ): """ NAME = "SciQ_OLMES" + SAMPLE_SPLIT = "train" # Use train split (largest) 
to best match OLMES, which evaluates all splits FEWSHOT_SPLIT = "train" def __init__(self, num_fewshot: int = 0) -> None: diff --git a/src/eval_framework/tasks/benchmarks/social_iqa.py b/src/eval_framework/tasks/benchmarks/social_iqa.py index 05effef0..5bf627e2 100644 --- a/src/eval_framework/tasks/benchmarks/social_iqa.py +++ b/src/eval_framework/tasks/benchmarks/social_iqa.py @@ -198,6 +198,7 @@ class SocialIQAMC_OLMES(SocialIQACloze): """ NAME = "SocialIQAMC_OLMES" + SAMPLE_SPLIT = "train" # Use train split (largest) to best match OLMES, which evaluates all splits def _get_instruction_text(self, item: dict[str, Any]) -> str: query = _social_iqa_context_question(item) From d7f9c5041bfac68ee8d93e6cae1b655622421d48 Mon Sep 17 00:00:00 2001 From: Frank Schneider Date: Fri, 27 Feb 2026 08:17:58 +0000 Subject: [PATCH 10/16] fix: MetricResults expects error to be Error or None not string. --- .../metrics/loglikelihood/bits_per_byte.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/src/eval_framework/metrics/loglikelihood/bits_per_byte.py b/src/eval_framework/metrics/loglikelihood/bits_per_byte.py index adcc29fb..0bff8168 100644 --- a/src/eval_framework/metrics/loglikelihood/bits_per_byte.py +++ b/src/eval_framework/metrics/loglikelihood/bits_per_byte.py @@ -1,7 +1,7 @@ import math from eval_framework.metrics.base import BaseMetric, MetricResult -from eval_framework.shared.types import Loglikelihood +from eval_framework.shared.types import Error, Loglikelihood class BitsPerByteLoglikelihood(BaseMetric[Loglikelihood]): @@ -37,7 +37,11 @@ def calculate(self, response: Loglikelihood) -> list[MetricResult]: metric_name=self.NAME, value=None, higher_is_better=False, - error=response.error or "No ground-truth answer found in loglikelihoods", + error=response.error + or Error( + error_class="ValueError", + message="No ground-truth answer found in loglikelihoods", + ), ) ] @@ -48,7 +52,11 @@ def calculate(self, response: Loglikelihood) -> 
list[MetricResult]: metric_name=self.NAME, value=None, higher_is_better=False, - error=response.error or "Ground-truth answer has zero UTF-8 bytes", + error=response.error + or Error( + error_class="ValueError", + message="Ground-truth answer has zero UTF-8 bytes", + ), ) ] From e432cbf1224afe05fd0fe31e0768a3124caeda7b Mon Sep 17 00:00:00 2001 From: Frank Schneider Date: Fri, 27 Feb 2026 11:58:47 +0000 Subject: [PATCH 11/16] fix: MyPy errors --- src/eval_framework/tasks/benchmarks/drop.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/eval_framework/tasks/benchmarks/drop.py b/src/eval_framework/tasks/benchmarks/drop.py index a0d2b947..bdda87d3 100644 --- a/src/eval_framework/tasks/benchmarks/drop.py +++ b/src/eval_framework/tasks/benchmarks/drop.py @@ -87,7 +87,7 @@ def __init__(self, num_fewshot: int = 0) -> None: def _load_dataset(self, subject: str) -> None: hf_dataset = self._load_hf_dataset(path=self.DATASET_PATH) - def process(docs): + def process(docs: list[dict[str, Any]]) -> list[dict[str, Any]]: result = [] for doc in docs: parsed = _get_answers(doc) From 1150f414ecc12dd97aef4ee1f33edd9ba5fdd6c2 Mon Sep 17 00:00:00 2001 From: Frank Schneider Date: Fri, 27 Feb 2026 11:59:04 +0000 Subject: [PATCH 12/16] fix: MyPy errors --- src/eval_framework/metrics/loglikelihood/bits_per_byte.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/eval_framework/metrics/loglikelihood/bits_per_byte.py b/src/eval_framework/metrics/loglikelihood/bits_per_byte.py index 0bff8168..4a2909f4 100644 --- a/src/eval_framework/metrics/loglikelihood/bits_per_byte.py +++ b/src/eval_framework/metrics/loglikelihood/bits_per_byte.py @@ -41,6 +41,7 @@ def calculate(self, response: Loglikelihood) -> list[MetricResult]: or Error( error_class="ValueError", message="No ground-truth answer found in loglikelihoods", + traceback="", ), ) ] @@ -56,6 +57,7 @@ def calculate(self, response: Loglikelihood) -> list[MetricResult]: or Error( error_class="ValueError", 
message="Ground-truth answer has zero UTF-8 bytes", + traceback="", ), ) ] From 41c03b4f5613ec14bd5e28ce05c31a162d5f1f3b Mon Sep 17 00:00:00 2001 From: Frank Schneider Date: Fri, 27 Feb 2026 12:34:34 +0000 Subject: [PATCH 13/16] test: Update hashes after split change --- .../tasks/task-prompts-hashes.json | 24 +++++++++---------- 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/tests/tests_eval_framework/tasks/task-prompts-hashes.json b/tests/tests_eval_framework/tasks/task-prompts-hashes.json index a5cae757..d6ded017 100644 --- a/tests/tests_eval_framework/tasks/task-prompts-hashes.json +++ b/tests/tests_eval_framework/tasks/task-prompts-hashes.json @@ -53,8 +53,8 @@ "CommonsenseQAFullTextCloze.Llama3Formatter": "7433c7dfb21e0da6ff0566c886bdd29c", "CommonsenseQAMC.ConcatFormatter": "b49e21abf30a6c6b6f2d5665914fae6e", "CommonsenseQAMC.Llama3Formatter": "1b97fe5fecd8a27e3f587693318bf17a", - "CommonsenseQAMC_OLMES.ConcatFormatter": "3ab1a269213eab2a684fa186320b8f00", - "CommonsenseQAMC_OLMES.Llama3Formatter": "0509c4c4ba589a35600fdbdec2ed2f9c", + "CommonsenseQAMC_OLMES.ConcatFormatter": "ca6acb05cfe4bf09a0f2321c5561d293", + "CommonsenseQAMC_OLMES.Llama3Formatter": "f41e71ef8389525e730c5f8b41026cf5", "DUC_ABSTRACTIVE.ConcatFormatter": "ab9a49f844aae7cd40470a9c8c1012ad", "DUC_ABSTRACTIVE.Llama3Formatter": "b425e6e924f57ebbaeaa7a333d565d05", "DUC_EXTRACTIVE.ConcatFormatter": "d773f051727e563369af29e183b2046b", @@ -228,8 +228,8 @@ "PIQA.Llama3Formatter": "9d33895f4024a9a1e61a00626e312313", "PIQA_IDK.ConcatFormatter": "4c07fad4f1409a22bc9835eb848feecb", "PIQA_IDK.Llama3Formatter": "94061c00140859fb99aa88a1e63b1b23", - "PIQA_OLMES.ConcatFormatter": "50fe249617e4bb36af4b114cd5d5023a", - "PIQA_OLMES.Llama3Formatter": "74aa663615a57e0184dddc8392b8189a", + "PIQA_OLMES.ConcatFormatter": "dd8a928404989320ce7b712d149bf5f5", + "PIQA_OLMES.Llama3Formatter": "e605ca9fb312e9309206e5aeba259169", "QUALITY.ConcatFormatter": "49e990c091cf8b46b3afd1ae8db84797", 
"QUALITY.Llama3Formatter": "408165ab35d5153c9f144021cbd27544", "RenderableStructEval.ConcatFormatter": "40bd2d2bdd9e663bf1cbc147eceafced", @@ -242,8 +242,8 @@ "SCIQEvalHarness_IDK.Llama3Formatter": "3e8e56b3b7662cff6ea2943e1bb88ba0", "SCIQ_IDK.ConcatFormatter": "ffc25631a289d4dd536a5b6660cfb17c", "SCIQ_IDK.Llama3Formatter": "bffb5e3d9a9f1b18a96fd74659c43c27", - "SCIQ_OLMES.ConcatFormatter": "5fe7025f625b54eddf9dc41a0958374a", - "SCIQ_OLMES.Llama3Formatter": "b8551b6a33192e9746307a7f5ada0c77", + "SCIQ_OLMES.ConcatFormatter": "820f12c83c89dfe8bbe19e062c37d259", + "SCIQ_OLMES.Llama3Formatter": "4172e30c45aa2105e1db0cd49dcf9cb7", "SPHYR.ConcatFormatter": "f6b6f1044fda6325b744281256a81ba9", "SPHYR.Llama3Formatter": "e67de80f6465104b6e511c5d339b9e45", "SQUAD.ConcatFormatter": "88bf0d58404c024dd885217d19466f0d", @@ -252,12 +252,12 @@ "SQUAD2.Llama3Formatter": "118f3757859029bee7da4bdfdb9fdf0b", "SQUAD2BPB.ConcatFormatter": "a1ae38489334319a2d0ef3a8c503ec4c", "SQUAD2BPB.Llama3Formatter": "097cea2ecf27f0d24b52633fa3ce363a", - "SocialIQACloze.ConcatFormatter": "9bd18f84263ae1dd28ea1099d1392b32", - "SocialIQACloze.Llama3Formatter": "10b3c20fe7a78b803377caef818fb31f", - "SocialIQAMC.ConcatFormatter": "1659f621b1dc4f410536d55f2e81c932", - "SocialIQAMC.Llama3Formatter": "c5ed5cd29b021fa0c71061905c48dfc1", - "SocialIQAMC_OLMES.ConcatFormatter": "7d3017ef98a2fe64ae8439555c3c1440", - "SocialIQAMC_OLMES.Llama3Formatter": "a49d524f98ea43d1deafc7b833c34529", + "SocialIQACloze.ConcatFormatter": "e20a2c81592da55f2d6e6c88d3a7104f", + "SocialIQACloze.Llama3Formatter": "1d4ddde3cc8857a459bd37f7b90ed9db", + "SocialIQAMC.ConcatFormatter": "bf1c17f0da9502617b279fde6c7c7c87", + "SocialIQAMC.Llama3Formatter": "73a0463313b86edfd902c061f9b85037", + "SocialIQAMC_OLMES.ConcatFormatter": "e5e5dac8348978b6b8e0311c66fa1877", + "SocialIQAMC_OLMES.Llama3Formatter": "ac1a7164ab0a1b4eb6d6a5ed055b4be0", "StructEval.ConcatFormatter": "280bb4a5e01945c4d25688bbb17468b5", "StructEval.Llama3Formatter": 
"3441d8806336ce590e09c9681d8c8f6f", "TRIVIAQA.ConcatFormatter": "4ee69bbf9d5ade2e7ed61e7455b06b0b", From f5cfe5a1423396131d929d0ec1b6dc658a450a72 Mon Sep 17 00:00:00 2001 From: Frank Schneider Date: Fri, 27 Feb 2026 12:38:37 +0000 Subject: [PATCH 14/16] fix: Revert non_OLMES variant back to validation split --- src/eval_framework/tasks/benchmarks/social_iqa.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/eval_framework/tasks/benchmarks/social_iqa.py b/src/eval_framework/tasks/benchmarks/social_iqa.py index 5bf627e2..074de084 100644 --- a/src/eval_framework/tasks/benchmarks/social_iqa.py +++ b/src/eval_framework/tasks/benchmarks/social_iqa.py @@ -224,6 +224,7 @@ class SocialIQAMC(SocialIQAMC_OLMES): """ NAME = "SocialIQAMC" + SAMPLE_SPLIT = "validation" def _get_instruction_text(self, item: dict[str, Any]) -> str: query = _social_iqa_context_question(item) From aac71f1675f715148e58944e56e8dbdb51d7c1ad Mon Sep 17 00:00:00 2001 From: Frank Schneider Date: Fri, 27 Feb 2026 12:45:22 +0000 Subject: [PATCH 15/16] test: Update hashes after split revert --- tests/tests_eval_framework/tasks/task-prompts-hashes.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/tests_eval_framework/tasks/task-prompts-hashes.json b/tests/tests_eval_framework/tasks/task-prompts-hashes.json index d6ded017..4f5fb4c8 100644 --- a/tests/tests_eval_framework/tasks/task-prompts-hashes.json +++ b/tests/tests_eval_framework/tasks/task-prompts-hashes.json @@ -254,8 +254,8 @@ "SQUAD2BPB.Llama3Formatter": "097cea2ecf27f0d24b52633fa3ce363a", "SocialIQACloze.ConcatFormatter": "e20a2c81592da55f2d6e6c88d3a7104f", "SocialIQACloze.Llama3Formatter": "1d4ddde3cc8857a459bd37f7b90ed9db", - "SocialIQAMC.ConcatFormatter": "bf1c17f0da9502617b279fde6c7c7c87", - "SocialIQAMC.Llama3Formatter": "73a0463313b86edfd902c061f9b85037", + "SocialIQAMC.ConcatFormatter": "96f91645eace7a45a93d37e49451a64d", + "SocialIQAMC.Llama3Formatter": "e0d30544bde34934a4695bc20698c882", 
"SocialIQAMC_OLMES.ConcatFormatter": "e5e5dac8348978b6b8e0311c66fa1877", "SocialIQAMC_OLMES.Llama3Formatter": "ac1a7164ab0a1b4eb6d6a5ed055b4be0", "StructEval.ConcatFormatter": "280bb4a5e01945c4d25688bbb17468b5", From 3e8fed0edf720138ad5195b67f9969f84d17b256 Mon Sep 17 00:00:00 2001 From: Frank Schneider Date: Fri, 27 Feb 2026 12:47:10 +0000 Subject: [PATCH 16/16] docs: Updated splits in docs --- docs/tasks/CommonsenseQAMC_OLMES.md | 2 +- docs/tasks/PIQA_OLMES.md | 2 +- docs/tasks/SCIQ_OLMES.md | 2 +- docs/tasks/SocialIQAMC_OLMES.md | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/docs/tasks/CommonsenseQAMC_OLMES.md b/docs/tasks/CommonsenseQAMC_OLMES.md index 2b4de843..a91bb0a8 100644 --- a/docs/tasks/CommonsenseQAMC_OLMES.md +++ b/docs/tasks/CommonsenseQAMC_OLMES.md @@ -3,7 +3,7 @@ ```` NAME = CommonsenseQAMC_OLMES DATASET_PATH = tau/commonsense_qa -SAMPLE_SPLIT = validation +SAMPLE_SPLIT = train FEWSHOT_SPLIT = train RESPONSE_TYPE = LOGLIKELIHOODS METRICS = [AccuracyLoglikelihood, AccuracyNormLoglikelihood] diff --git a/docs/tasks/PIQA_OLMES.md b/docs/tasks/PIQA_OLMES.md index 4cffc9e2..2fb7294c 100644 --- a/docs/tasks/PIQA_OLMES.md +++ b/docs/tasks/PIQA_OLMES.md @@ -3,7 +3,7 @@ ```` NAME = PIQA_OLMES DATASET_PATH = ybisk/piqa -SAMPLE_SPLIT = validation +SAMPLE_SPLIT = train FEWSHOT_SPLIT = train RESPONSE_TYPE = LOGLIKELIHOODS METRICS = [AccuracyLoglikelihood, AccuracyNormLoglikelihood, BitsPerByteLoglikelihood] diff --git a/docs/tasks/SCIQ_OLMES.md b/docs/tasks/SCIQ_OLMES.md index 512bceb0..9cae57d9 100644 --- a/docs/tasks/SCIQ_OLMES.md +++ b/docs/tasks/SCIQ_OLMES.md @@ -3,7 +3,7 @@ ```` NAME = SCIQ_OLMES DATASET_PATH = allenai/sciq -SAMPLE_SPLIT = validation +SAMPLE_SPLIT = train FEWSHOT_SPLIT = train RESPONSE_TYPE = LOGLIKELIHOODS METRICS = [AccuracyLoglikelihood, AccuracyNormLoglikelihood, BitsPerByteLoglikelihood] diff --git a/docs/tasks/SocialIQAMC_OLMES.md b/docs/tasks/SocialIQAMC_OLMES.md index 5f4f7e7b..ae4a269f 100644 --- 
a/docs/tasks/SocialIQAMC_OLMES.md +++ b/docs/tasks/SocialIQAMC_OLMES.md @@ -3,7 +3,7 @@ ```` NAME = SocialIQAMC_OLMES DATASET_PATH = allenai/social_i_qa -SAMPLE_SPLIT = validation +SAMPLE_SPLIT = train FEWSHOT_SPLIT = train RESPONSE_TYPE = LOGLIKELIHOODS METRICS = [AccuracyLoglikelihood, AccuracyNormLoglikelihood, BitsPerByteLoglikelihood]