diff --git a/docs/tasks/HumanEval_OLMES.md b/docs/tasks/HumanEval_OLMES.md new file mode 100644 index 00000000..1b5254be --- /dev/null +++ b/docs/tasks/HumanEval_OLMES.md @@ -0,0 +1,20 @@ +# HumanEval_OLMES + +```` +NAME = HumanEval_OLMES +DATASET_PATH = openai/openai_humaneval +SAMPLE_SPLIT = test +FEWSHOT_SPLIT = test +RESPONSE_TYPE = COMPLETION +METRICS = [CodeCompletionAssertion] +SUBJECTS = ['no_subject'] +LANGUAGE = +```` + +- Module: `eval_framework.tasks.benchmarks.humaneval` + +- File: [src/eval_framework/tasks/benchmarks/humaneval.py](../../src/eval_framework/tasks/benchmarks/humaneval.py) | [View on GitHub](https://github.com/Aleph-Alpha-Research/eval-framework/blob/main/src/eval_framework/tasks/benchmarks/humaneval.py) + +- Link to dataset: [https://huggingface.co/datasets/openai/openai_humaneval](https://huggingface.co/datasets/openai/openai_humaneval) + +More detailed documentation, with prompt examples and ground truth completions, can be generated with `uv run -m eval_framework.utils.generate_task_docs --add-prompt-examples --only-tasks "HumanEval_OLMES"`. diff --git a/docs/tasks/README.md b/docs/tasks/README.md index 38ec0d69..1ab31f04 100644 --- a/docs/tasks/README.md +++ b/docs/tasks/README.md @@ -2,7 +2,7 @@ This directory contains the generated documentation for all benchmark tasks available in the package. -**Total number of tasks: 157** +**Total number of tasks: 158** The documentation can be generated or updated with `uv run -m eval_framework.utils.generate_task_docs`. @@ -67,6 +67,7 @@ NOTE: This is an automatically generated file. 
class HumanEval_OLMES(HumanEval):
    """OLMES flavour of HumanEval.

    Replicates ``codex_humaneval:3shot::olmo3:n32:v2`` from oe_eval: every
    problem is presented inside a fenced ``python`` code block and few-shot
    targets close that fence again.

    Recommended EvalConfig settings for full replication:
        repeats: 32
        llm_args: {sampling_params: {temperature: 0.6, top_p: 0.6}}
    """

    NAME = "Human Eval OLMES"

    def __init__(self, num_fewshot: int = 3) -> None:
        """Set up the task with the OLMES stop sequences and token budget.

        Args:
            num_fewshot: Number of few-shot examples, forwarded to the parent
                constructor; defaults to 3.
        """
        super().__init__(num_fewshot)
        # Assigned after the parent constructor so these values override
        # whatever the base task configured.
        self.stop_sequences = [
            "\nclass",
            "\nif",
            "\nprint",
            "\n#",
            "\n```",
            "\n```\n\n",
            "<|eot_id|>",
        ]
        self.max_tokens = 1024

    def _get_instruction_text(self, item: dict[str, Any]) -> str:
        """Return the item's prompt preceded by an opening ``python`` fence."""
        return "".join(("```python\n", item["prompt"]))

    def _get_fewshot_target_text(self, item: dict[str, Any]) -> str:
        """Return the canonical solution followed by a closing code fence."""
        return "".join((item["canonical_solution"], "```"))
register_lazy_task("eval_framework.tasks.benchmarks.humaneval.HumanEvalBPB") + register_lazy_task("eval_framework.tasks.benchmarks.humaneval.HumanEval_OLMES") register_lazy_task("eval_framework.tasks.benchmarks.humaneval.HumanEvalInstruct") register_lazy_task("eval_framework.tasks.benchmarks.ifeval.IFEval") register_lazy_task("eval_framework.tasks.benchmarks.ifeval.IFEvalDe") diff --git a/tests/tests_eval_framework/tasks/task-prompts-hashes.json b/tests/tests_eval_framework/tasks/task-prompts-hashes.json index f953fddd..223df5d2 100644 --- a/tests/tests_eval_framework/tasks/task-prompts-hashes.json +++ b/tests/tests_eval_framework/tasks/task-prompts-hashes.json @@ -114,6 +114,8 @@ "HumanEvalBPB.Llama3Formatter": "57f5a23f0cf320ea2c675c178e9f5bad", "HumanEvalInstruct.ConcatFormatter": "100e994d25219d93daa3ace8a8beb730", "HumanEvalInstruct.Llama3Formatter": "0c1c4e07c9ecd0445257118bc5cecc09", + "HumanEval_OLMES.ConcatFormatter": "43d5d2b350304df54708165cf30e4009", + "HumanEval_OLMES.Llama3Formatter": "0595b2b93a403ceb3cfef74baa4dd7bf", "IFEval.ConcatFormatter": "b517d9d281cd8d5db2ea72b47bd44314", "IFEval.Llama3Formatter": "7739c2862af662f2b146f4eae61ac208", "IFEvalDe.ConcatFormatter": "798e567efa346a45b42deff904a40b22", diff --git a/tests/tests_eval_framework/tasks/test_all_formatters.py b/tests/tests_eval_framework/tasks/test_all_formatters.py index caa1f082..9ccfe323 100644 --- a/tests/tests_eval_framework/tasks/test_all_formatters.py +++ b/tests/tests_eval_framework/tasks/test_all_formatters.py @@ -54,6 +54,7 @@ "HELLASWAG_DE": {"num_fewshot": 1}, "HELLASWAG_EU20_DE": {"num_fewshot": 1}, "HELLASWAG_EU20_FR": {"num_fewshot": 1}, + "HumanEval_OLMES": {"num_fewshot": 3}, "InfiniteBench_CodeDebug": {"num_fewshot": 0}, "InfiniteBench_CodeRun": {"num_fewshot": 0}, "InfiniteBench_EnDia": {"num_fewshot": 0}, diff --git a/tests/tests_eval_framework/tasks/test_humaneval.py b/tests/tests_eval_framework/tasks/test_humaneval.py index 64b5e39c..b6b5ec60 100644 --- 
class TestHumanEvalOLMES:
    """Tests for the OLMES variant of the HumanEval task."""

    @pytest.fixture
    def human_eval_olmes_task(self) -> HumanEval_OLMES:
        """Provide a 3-shot ``HumanEval_OLMES`` task built via ``DatasetPatcher``."""
        # NOTE(review): returning from inside the ``with`` block exits the
        # DatasetPatcher context before the test body runs — confirm that the
        # patch is not required to stay active during the test.
        with DatasetPatcher(HumanEval_OLMES, num_fewshot=3) as patched_task:
            return patched_task

    def test_code_is_executed(self, human_eval_olmes_task: HumanEval_OLMES) -> None:
        """The canonical solution must pass; an empty completion must not."""
        task = human_eval_olmes_task
        assert len(task.SUBJECTS) > 0
        subject = task.SUBJECTS[0]
        task._load_dataset(subject)

        last_index = 0
        first_ten = task.dataset[task.SAMPLE_SPLIT][:10]
        for last_index, item in enumerate(first_ten):
            item["subject"] = subject
            sample = task._create_samples(item, last_index, subject)[0]

            solved = task.post_process_generated_completion(item["canonical_solution"], sample)
            assert run_python_code(solved).endswith("True")

            unsolved = task.post_process_generated_completion("", sample)
            assert not run_python_code(unsolved).endswith("True")

        # Ten items were iterated, so the final index is 9.
        assert last_index == 9

    def test_olmes_settings(self, human_eval_olmes_task: HumanEval_OLMES) -> None:
        """The task carries the OLMES few-shot count, token budget and stops."""
        task = human_eval_olmes_task
        assert task.num_fewshot == 3
        assert task.max_tokens == 1024
        for expected_stop in ("\nclass", "\nif", "\nprint", "\n#", "\n```"):
            assert expected_stop in task.stop_sequences
        assert task.SAMPLE_SPLIT == "test"
        assert task.FEWSHOT_SPLIT == "test"

    def test_olmes_prompt_format(self, human_eval_olmes_task: HumanEval_OLMES) -> None:
        """Instructions open a ``python`` fence; few-shot targets close it."""
        task = human_eval_olmes_task
        task._load_dataset(task.SUBJECTS[0])
        item = task.dataset[task.SAMPLE_SPLIT][0]

        instruction = task._get_instruction_text(item)
        assert instruction.startswith("```python\n")
        assert instruction == "```python\n" + item["prompt"]

        fewshot_target = task._get_fewshot_target_text(item)
        assert fewshot_target.endswith("```")
        assert fewshot_target == item["canonical_solution"] + "```"