From c5e4f6325a170efdd6538a3269a90065323411a1 Mon Sep 17 00:00:00 2001
From: Tom Burns
Date: Thu, 26 Feb 2026 12:27:27 +0000
Subject: [PATCH 1/3] feat: add humaneval OLMES variant

---
 docs/tasks/HumanEval_OLMES.md | 20 +++++++++
 docs/tasks/README.md | 3 +-
 .../tasks/benchmarks/humaneval.py | 22 +++++++++
 src/eval_framework/tasks/task_names.py | 1 +
 .../tasks/task-prompts-hashes.json | 4 +-
 .../tasks/test_all_formatters.py | 1 +
 .../tasks/test_humaneval.py | 45 ++++++++++++++++++-
 7 files changed, 93 insertions(+), 3 deletions(-)
 create mode 100644 docs/tasks/HumanEval_OLMES.md

diff --git a/docs/tasks/HumanEval_OLMES.md b/docs/tasks/HumanEval_OLMES.md
new file mode 100644
index 00000000..1b5254be
--- /dev/null
+++ b/docs/tasks/HumanEval_OLMES.md
@@ -0,0 +1,20 @@
+# HumanEval_OLMES
+
+````
+NAME = HumanEval_OLMES
+DATASET_PATH = openai/openai_humaneval
+SAMPLE_SPLIT = test
+FEWSHOT_SPLIT = test
+RESPONSE_TYPE = COMPLETION
+METRICS = [CodeCompletionAssertion]
+SUBJECTS = ['no_subject']
+LANGUAGE =
+````
+
+- Module: `eval_framework.tasks.benchmarks.humaneval`
+
+- File: [src/eval_framework/tasks/benchmarks/humaneval.py](../../src/eval_framework/tasks/benchmarks/humaneval.py) | [View on GitHub](https://github.com/Aleph-Alpha-Research/eval-framework/blob/main/src/eval_framework/tasks/benchmarks/humaneval.py)
+
+- Link to dataset: [https://huggingface.co/datasets/openai/openai_humaneval](https://huggingface.co/datasets/openai/openai_humaneval)
+
+More detailed documentation, with prompt examples and ground truth completions, can be generated with `uv run -m eval_framework.utils.generate_task_docs --add-prompt-examples --only-tasks "HumanEval_OLMES"`.
diff --git a/docs/tasks/README.md b/docs/tasks/README.md
index 38ec0d69..1a300b74 100644
--- a/docs/tasks/README.md
+++ b/docs/tasks/README.md
@@ -2,7 +2,7 @@
 
 This directory contains the generated documentation for all benchmark tasks available in the package.
 
-**Total number of tasks: 157**
+**Total number of tasks: 161**
 
 The documentation can be generated or updated with `uv run -m eval_framework.utils.generate_task_docs`.
 
@@ -67,6 +67,7 @@ NOTE: This is an automatically generated file. Any manual modifications will not
 - [HumanEval](HumanEval.md)
 - [HumanEvalBPB](HumanEvalBPB.md)
 - [HumanEvalInstruct](HumanEvalInstruct.md)
+- [HumanEval_OLMES](HumanEval_OLMES.md)
 - [IFEval](IFEval.md)
 - [IFEvalDe](IFEvalDe.md)
 - [IFEvalFiSv](IFEvalFiSv.md)
diff --git a/src/eval_framework/tasks/benchmarks/humaneval.py b/src/eval_framework/tasks/benchmarks/humaneval.py
index 4a51b261..163d6a8d 100644
--- a/src/eval_framework/tasks/benchmarks/humaneval.py
+++ b/src/eval_framework/tasks/benchmarks/humaneval.py
@@ -99,6 +99,28 @@ def _get_possible_completions(self, item: dict[str, Any]) -> list[str] | None:
         return [gt] if gt else None
 
 
+class HumanEval_OLMES(HumanEval):
+    """HumanEval OLMES variant replicating codex_humaneval:3shot::olmo3:n32:v2 from oe_eval.
+
+    Recommended EvalConfig settings for full replication:
+        repeats: 32
+        llm_args: {sampling_params: {temperature: 0.6, top_p: 0.6}}
+    """
+
+    NAME = "Human Eval OLMES"
+
+    def __init__(self, num_fewshot: int = 3) -> None:
+        super().__init__(num_fewshot)
+        self.stop_sequences = ["\nclass", "\nif", "\nprint", "\n#", "\n```", "\n```\n\n", "<|eot_id|>"]
+        self.max_tokens = 1024
+
+    def _get_instruction_text(self, item: dict[str, Any]) -> str:
+        return "```python\n" + item["prompt"]
+
+    def _get_fewshot_target_text(self, item: dict[str, Any]) -> str:
+        return item["canonical_solution"] + "```"
+
+
 class HumanEvalInstruct(HumanEval):
     # See https://github.com/EleutherAI/lm-evaluation-harness/blob/main/lm_eval/tasks/humaneval/humaneval_instruct.yaml
     NAME = "Human Eval Instruct"
diff --git a/src/eval_framework/tasks/task_names.py b/src/eval_framework/tasks/task_names.py
index 0dd11e16..28c89c10 100644
--- a/src/eval_framework/tasks/task_names.py
+++ b/src/eval_framework/tasks/task_names.py
@@ -61,6 +61,7 @@ def register_all_tasks() -> None:
     register_lazy_task("eval_framework.tasks.benchmarks.opengptx_eu20.HELLASWAG_EU20_FR")
     register_lazy_task("eval_framework.tasks.benchmarks.humaneval.HumanEval")
     register_lazy_task("eval_framework.tasks.benchmarks.humaneval.HumanEvalBPB")
+    register_lazy_task("eval_framework.tasks.benchmarks.humaneval.HumanEval_OLMES")
     register_lazy_task("eval_framework.tasks.benchmarks.humaneval.HumanEvalInstruct")
     register_lazy_task("eval_framework.tasks.benchmarks.ifeval.IFEval")
     register_lazy_task("eval_framework.tasks.benchmarks.ifeval.IFEvalDe")
diff --git a/tests/tests_eval_framework/tasks/task-prompts-hashes.json b/tests/tests_eval_framework/tasks/task-prompts-hashes.json
index f953fddd..2bc0149b 100644
--- a/tests/tests_eval_framework/tasks/task-prompts-hashes.json
+++ b/tests/tests_eval_framework/tasks/task-prompts-hashes.json
@@ -114,6 +114,8 @@
     "HumanEvalBPB.Llama3Formatter": "57f5a23f0cf320ea2c675c178e9f5bad",
     "HumanEvalInstruct.ConcatFormatter": "100e994d25219d93daa3ace8a8beb730",
     "HumanEvalInstruct.Llama3Formatter": "0c1c4e07c9ecd0445257118bc5cecc09",
+    "HumanEval_OLMES.ConcatFormatter": "43d5d2b350304df54708165cf30e4009",
+    "HumanEval_OLMES.Llama3Formatter": "0595b2b93a403ceb3cfef74baa4dd7bf",
     "IFEval.ConcatFormatter": "b517d9d281cd8d5db2ea72b47bd44314",
     "IFEval.Llama3Formatter": "7739c2862af662f2b146f4eae61ac208",
     "IFEvalDe.ConcatFormatter": "798e567efa346a45b42deff904a40b22",
@@ -314,4 +316,4 @@
     "ZERO_SCROLLS_SPACE_DIGEST.Llama3Formatter": "4de6a7c03ae98629b8d031af1d3e5075",
     "ZERO_SCROLLS_SQUALITY.ConcatFormatter": "040577e1e0d0ae2ed79e43fb81ee9256",
     "ZERO_SCROLLS_SQUALITY.Llama3Formatter": "f822411e0b6267dde4cd74e089a4237f"
-}
+}
\ No newline at end of file
diff --git a/tests/tests_eval_framework/tasks/test_all_formatters.py b/tests/tests_eval_framework/tasks/test_all_formatters.py
index caa1f082..9ccfe323 100644
--- a/tests/tests_eval_framework/tasks/test_all_formatters.py
+++ b/tests/tests_eval_framework/tasks/test_all_formatters.py
@@ -54,6 +54,7 @@
     "HELLASWAG_DE": {"num_fewshot": 1},
     "HELLASWAG_EU20_DE": {"num_fewshot": 1},
     "HELLASWAG_EU20_FR": {"num_fewshot": 1},
+    "HumanEval_OLMES": {"num_fewshot": 3},
     "InfiniteBench_CodeDebug": {"num_fewshot": 0},
     "InfiniteBench_CodeRun": {"num_fewshot": 0},
     "InfiniteBench_EnDia": {"num_fewshot": 0},
diff --git a/tests/tests_eval_framework/tasks/test_humaneval.py b/tests/tests_eval_framework/tasks/test_humaneval.py
index 64b5e39c..56b36cee 100644
--- a/tests/tests_eval_framework/tasks/test_humaneval.py
+++ b/tests/tests_eval_framework/tasks/test_humaneval.py
@@ -1,6 +1,6 @@
 import pytest
 
-from eval_framework.tasks.benchmarks.humaneval import HumanEval, HumanEvalInstruct
+from eval_framework.tasks.benchmarks.humaneval import HumanEval, HumanEval_OLMES, HumanEvalInstruct
 from eval_framework.tasks.utils import run_python_code
 from tests.tests_eval_framework.utils import DatasetPatcher
 
@@ -24,6 +24,49 @@ def test_code_is_executed(self, human_eval_task: HumanEval) -> None:
         assert i == 9
 
 
+class TestHumanEvalOLMES:
+    @pytest.fixture
+    def human_eval_olmes_task(self) -> HumanEval_OLMES:
+        with DatasetPatcher(HumanEval_OLMES, num_fewshot=3) as patched_task:
+            return patched_task
+
+    def test_code_is_executed(self, human_eval_olmes_task: HumanEval_OLMES) -> None:
+        assert len(human_eval_olmes_task.SUBJECTS) > 0
+        human_eval_olmes_task._load_dataset(human_eval_olmes_task.SUBJECTS[0])
+        i = 0
+        for i, item in enumerate(human_eval_olmes_task.dataset[human_eval_olmes_task.SAMPLE_SPLIT][:10]):
+            sample = human_eval_olmes_task._create_samples(item, i, human_eval_olmes_task.SUBJECTS[0])[0]
+            formatted_code = human_eval_olmes_task.post_process_generated_completion(
+                item["canonical_solution"], sample
+            )
+            assert run_python_code(formatted_code).endswith("True")
+            formatted_code = human_eval_olmes_task.post_process_generated_completion("", sample)
+            assert not run_python_code(formatted_code).endswith("True")
+        assert i == 9
+
+    def test_olmes_settings(self, human_eval_olmes_task: HumanEval_OLMES) -> None:
+        assert human_eval_olmes_task.num_fewshot == 3
+        assert human_eval_olmes_task.max_tokens == 1024
+        assert "\nclass" in human_eval_olmes_task.stop_sequences
+        assert "\nif" in human_eval_olmes_task.stop_sequences
+        assert "\nprint" in human_eval_olmes_task.stop_sequences
+        assert "\n#" in human_eval_olmes_task.stop_sequences
+        assert "\n```" in human_eval_olmes_task.stop_sequences
+        assert human_eval_olmes_task.SAMPLE_SPLIT == "test"
+        assert human_eval_olmes_task.FEWSHOT_SPLIT == "test"
+
+    def test_olmes_prompt_format(self, human_eval_olmes_task: HumanEval_OLMES) -> None:
+        human_eval_olmes_task._load_dataset(human_eval_olmes_task.SUBJECTS[0])
+        item = human_eval_olmes_task.dataset[human_eval_olmes_task.SAMPLE_SPLIT][0]
+        instruction = human_eval_olmes_task._get_instruction_text(item)
+        assert instruction.startswith("```python\n")
+        assert instruction == "```python\n" + item["prompt"]
+
+        fewshot_target = human_eval_olmes_task._get_fewshot_target_text(item)
+        assert fewshot_target.endswith("```")
+        assert fewshot_target == item["canonical_solution"] + "```"
+
+
 class TestHumanEvalInstructCode:
     @pytest.fixture
     def human_eval_task_inst(self) -> HumanEvalInstruct:

From 55ea30106a74c8db548bfba261d2588832cd152b Mon Sep 17 00:00:00 2001
From: Tom Burns
Date: Thu, 26 Feb 2026 12:32:19 +0000
Subject: [PATCH 2/3] fix: linting of tests

---
 tests/tests_eval_framework/tasks/task-prompts-hashes.json | 2 +-
 tests/tests_eval_framework/tasks/test_humaneval.py | 4 +---
 2 files changed, 2 insertions(+), 4 deletions(-)

diff --git a/tests/tests_eval_framework/tasks/task-prompts-hashes.json b/tests/tests_eval_framework/tasks/task-prompts-hashes.json
index 2bc0149b..223df5d2 100644
--- a/tests/tests_eval_framework/tasks/task-prompts-hashes.json
+++ b/tests/tests_eval_framework/tasks/task-prompts-hashes.json
@@ -316,4 +316,4 @@
     "ZERO_SCROLLS_SPACE_DIGEST.Llama3Formatter": "4de6a7c03ae98629b8d031af1d3e5075",
     "ZERO_SCROLLS_SQUALITY.ConcatFormatter": "040577e1e0d0ae2ed79e43fb81ee9256",
     "ZERO_SCROLLS_SQUALITY.Llama3Formatter": "f822411e0b6267dde4cd74e089a4237f"
-}
\ No newline at end of file
+}
diff --git a/tests/tests_eval_framework/tasks/test_humaneval.py b/tests/tests_eval_framework/tasks/test_humaneval.py
index 56b36cee..80156951 100644
--- a/tests/tests_eval_framework/tasks/test_humaneval.py
+++ b/tests/tests_eval_framework/tasks/test_humaneval.py
@@ -36,9 +36,7 @@ def test_code_is_executed(self, human_eval_olmes_task: HumanEval_OLMES) -> None:
         i = 0
         for i, item in enumerate(human_eval_olmes_task.dataset[human_eval_olmes_task.SAMPLE_SPLIT][:10]):
             sample = human_eval_olmes_task._create_samples(item, i, human_eval_olmes_task.SUBJECTS[0])[0]
-            formatted_code = human_eval_olmes_task.post_process_generated_completion(
-                item["canonical_solution"], sample
-            )
+            formatted_code = human_eval_olmes_task.post_process_generated_completion(item["canonical_solution"], sample)
             assert run_python_code(formatted_code).endswith("True")
             formatted_code = human_eval_olmes_task.post_process_generated_completion("", sample)
             assert not run_python_code(formatted_code).endswith("True")

From f42177304d952cfe9c29e18939a36e7a78bcb6b2 Mon Sep 17 00:00:00 2001
From: Tom Burns
Date: Thu, 26 Feb 2026 14:04:43 +0000
Subject: [PATCH 3/3] fix: humaneval tests

---
 docs/tasks/README.md | 2 +-
 tests/tests_eval_framework/tasks/test_humaneval.py | 6 ++++--
 2 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/docs/tasks/README.md b/docs/tasks/README.md
index 1a300b74..1ab31f04 100644
--- a/docs/tasks/README.md
+++ b/docs/tasks/README.md
@@ -2,7 +2,7 @@
 
 This directory contains the generated documentation for all benchmark tasks available in the package.
 
-**Total number of tasks: 161**
+**Total number of tasks: 158**
 
 The documentation can be generated or updated with `uv run -m eval_framework.utils.generate_task_docs`.
 
diff --git a/tests/tests_eval_framework/tasks/test_humaneval.py b/tests/tests_eval_framework/tasks/test_humaneval.py
index 80156951..b6b5ec60 100644
--- a/tests/tests_eval_framework/tasks/test_humaneval.py
+++ b/tests/tests_eval_framework/tasks/test_humaneval.py
@@ -32,10 +32,12 @@ def human_eval_olmes_task(self) -> HumanEval_OLMES:
 
     def test_code_is_executed(self, human_eval_olmes_task: HumanEval_OLMES) -> None:
         assert len(human_eval_olmes_task.SUBJECTS) > 0
-        human_eval_olmes_task._load_dataset(human_eval_olmes_task.SUBJECTS[0])
+        subject = human_eval_olmes_task.SUBJECTS[0]
+        human_eval_olmes_task._load_dataset(subject)
         i = 0
         for i, item in enumerate(human_eval_olmes_task.dataset[human_eval_olmes_task.SAMPLE_SPLIT][:10]):
-            sample = human_eval_olmes_task._create_samples(item, i, human_eval_olmes_task.SUBJECTS[0])[0]
+            item["subject"] = subject
+            sample = human_eval_olmes_task._create_samples(item, i, subject)[0]
             formatted_code = human_eval_olmes_task.post_process_generated_completion(item["canonical_solution"], sample)
             assert run_python_code(formatted_code).endswith("True")
             formatted_code = human_eval_olmes_task.post_process_generated_completion("", sample)