Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 20 additions & 0 deletions docs/tasks/HumanEval_OLMES.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
# HumanEval_OLMES

````
NAME = HumanEval_OLMES
DATASET_PATH = openai/openai_humaneval
SAMPLE_SPLIT = test
FEWSHOT_SPLIT = test
RESPONSE_TYPE = COMPLETION
METRICS = [CodeCompletionAssertion]
SUBJECTS = ['no_subject']
LANGUAGE = <Language.ENG: 'English'>
````

- Module: `eval_framework.tasks.benchmarks.humaneval`

- File: [src/eval_framework/tasks/benchmarks/humaneval.py](../../src/eval_framework/tasks/benchmarks/humaneval.py) | [View on GitHub](https://github.com/Aleph-Alpha-Research/eval-framework/blob/main/src/eval_framework/tasks/benchmarks/humaneval.py)

- Link to dataset: [https://huggingface.co/datasets/openai/openai_humaneval](https://huggingface.co/datasets/openai/openai_humaneval)

More detailed documentation, with prompt examples and ground truth completions, can be generated with `uv run -m eval_framework.utils.generate_task_docs --add-prompt-examples --only-tasks "HumanEval_OLMES"`.
3 changes: 2 additions & 1 deletion docs/tasks/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

This directory contains the generated documentation for all benchmark tasks available in the package.

**Total number of tasks: 157**
**Total number of tasks: 158**

The documentation can be generated or updated with `uv run -m eval_framework.utils.generate_task_docs`.

Expand Down Expand Up @@ -67,6 +67,7 @@ NOTE: This is an automatically generated file. Any manual modifications will not
- [HumanEval](HumanEval.md)
- [HumanEvalBPB](HumanEvalBPB.md)
- [HumanEvalInstruct](HumanEvalInstruct.md)
- [HumanEval_OLMES](HumanEval_OLMES.md)
- [IFEval](IFEval.md)
- [IFEvalDe](IFEvalDe.md)
- [IFEvalFiSv](IFEvalFiSv.md)
Expand Down
22 changes: 22 additions & 0 deletions src/eval_framework/tasks/benchmarks/humaneval.py
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,28 @@ def _get_possible_completions(self, item: dict[str, Any]) -> list[str] | None:
return [gt] if gt else None


class HumanEval_OLMES(HumanEval):
    """HumanEval OLMES variant replicating codex_humaneval:3shot::olmo3:n32:v2 from oe_eval.

    Recommended EvalConfig settings for full replication:
        repeats: 32
        llm_args: {sampling_params: {temperature: 0.6, top_p: 0.6}}

    NOTE(review): the settings above are not enforced by the task itself; callers
    must supply them in their EvalConfig to fully replicate the oe_eval setup.
    """

    NAME = "Human Eval OLMES"

    def __init__(self, num_fewshot: int = 3) -> None:
        """Initialize with OLMES defaults: 3-shot prompting, 1024 max tokens, and
        stop sequences that terminate generation at the end of a function body."""
        super().__init__(num_fewshot)
        # NOTE(review): "\n```\n\n" can never fire because its prefix "\n```" is
        # also listed; kept verbatim to mirror the oe_eval configuration.
        self.stop_sequences = ["\nclass", "\nif", "\nprint", "\n#", "\n```", "\n```\n\n", "<|eot_id|>"]
        self.max_tokens = 1024

    def _get_instruction_text(self, item: dict[str, Any]) -> str:
        # OLMES prefixes the raw HumanEval prompt with an opening python code fence.
        return "```python\n" + item["prompt"]

    def _get_fewshot_target_text(self, item: dict[str, Any]) -> str:
        # Few-shot targets close the code fence opened in the instruction text.
        return item["canonical_solution"] + "```"


class HumanEvalInstruct(HumanEval):
# See https://github.com/EleutherAI/lm-evaluation-harness/blob/main/lm_eval/tasks/humaneval/humaneval_instruct.yaml
NAME = "Human Eval Instruct"
Expand Down
1 change: 1 addition & 0 deletions src/eval_framework/tasks/task_names.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,7 @@ def register_all_tasks() -> None:
register_lazy_task("eval_framework.tasks.benchmarks.opengptx_eu20.HELLASWAG_EU20_FR")
register_lazy_task("eval_framework.tasks.benchmarks.humaneval.HumanEval")
register_lazy_task("eval_framework.tasks.benchmarks.humaneval.HumanEvalBPB")
register_lazy_task("eval_framework.tasks.benchmarks.humaneval.HumanEval_OLMES")
register_lazy_task("eval_framework.tasks.benchmarks.humaneval.HumanEvalInstruct")
register_lazy_task("eval_framework.tasks.benchmarks.ifeval.IFEval")
register_lazy_task("eval_framework.tasks.benchmarks.ifeval.IFEvalDe")
Expand Down
2 changes: 2 additions & 0 deletions tests/tests_eval_framework/tasks/task-prompts-hashes.json
Original file line number Diff line number Diff line change
Expand Up @@ -114,6 +114,8 @@
"HumanEvalBPB.Llama3Formatter": "57f5a23f0cf320ea2c675c178e9f5bad",
"HumanEvalInstruct.ConcatFormatter": "100e994d25219d93daa3ace8a8beb730",
"HumanEvalInstruct.Llama3Formatter": "0c1c4e07c9ecd0445257118bc5cecc09",
"HumanEval_OLMES.ConcatFormatter": "43d5d2b350304df54708165cf30e4009",
"HumanEval_OLMES.Llama3Formatter": "0595b2b93a403ceb3cfef74baa4dd7bf",
"IFEval.ConcatFormatter": "b517d9d281cd8d5db2ea72b47bd44314",
"IFEval.Llama3Formatter": "7739c2862af662f2b146f4eae61ac208",
"IFEvalDe.ConcatFormatter": "798e567efa346a45b42deff904a40b22",
Expand Down
1 change: 1 addition & 0 deletions tests/tests_eval_framework/tasks/test_all_formatters.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,7 @@
"HELLASWAG_DE": {"num_fewshot": 1},
"HELLASWAG_EU20_DE": {"num_fewshot": 1},
"HELLASWAG_EU20_FR": {"num_fewshot": 1},
"HumanEval_OLMES": {"num_fewshot": 3},
"InfiniteBench_CodeDebug": {"num_fewshot": 0},
"InfiniteBench_CodeRun": {"num_fewshot": 0},
"InfiniteBench_EnDia": {"num_fewshot": 0},
Expand Down
45 changes: 44 additions & 1 deletion tests/tests_eval_framework/tasks/test_humaneval.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import pytest

from eval_framework.tasks.benchmarks.humaneval import HumanEval, HumanEvalInstruct
from eval_framework.tasks.benchmarks.humaneval import HumanEval, HumanEval_OLMES, HumanEvalInstruct
from eval_framework.tasks.utils import run_python_code
from tests.tests_eval_framework.utils import DatasetPatcher

Expand All @@ -24,6 +24,49 @@ def test_code_is_executed(self, human_eval_task: HumanEval) -> None:
assert i == 9


class TestHumanEvalOLMES:
    """Tests for the HumanEval_OLMES task variant (3-shot OLMES replication)."""

    @pytest.fixture
    def olmes_task(self) -> HumanEval_OLMES:
        # Patch the dataset loader so tests run against the bundled fixture data.
        with DatasetPatcher(HumanEval_OLMES, num_fewshot=3) as task:
            return task

    def test_code_is_executed(self, olmes_task: HumanEval_OLMES) -> None:
        assert len(olmes_task.SUBJECTS) > 0
        subject = olmes_task.SUBJECTS[0]
        olmes_task._load_dataset(subject)
        last_index = -1
        for last_index, item in enumerate(olmes_task.dataset[olmes_task.SAMPLE_SPLIT][:10]):
            item["subject"] = subject
            sample = olmes_task._create_samples(item, last_index, subject)[0]
            # The canonical solution must pass its assertions...
            solution_code = olmes_task.post_process_generated_completion(item["canonical_solution"], sample)
            assert run_python_code(solution_code).endswith("True")
            # ...while an empty completion must not.
            empty_code = olmes_task.post_process_generated_completion("", sample)
            assert not run_python_code(empty_code).endswith("True")
        # Exactly ten items must have been processed.
        assert last_index == 9

    def test_olmes_settings(self, olmes_task: HumanEval_OLMES) -> None:
        assert olmes_task.num_fewshot == 3
        assert olmes_task.max_tokens == 1024
        required_stops = {"\nclass", "\nif", "\nprint", "\n#", "\n```"}
        assert required_stops.issubset(set(olmes_task.stop_sequences))
        assert olmes_task.SAMPLE_SPLIT == "test"
        assert olmes_task.FEWSHOT_SPLIT == "test"

    def test_olmes_prompt_format(self, olmes_task: HumanEval_OLMES) -> None:
        olmes_task._load_dataset(olmes_task.SUBJECTS[0])
        first_item = olmes_task.dataset[olmes_task.SAMPLE_SPLIT][0]

        instruction = olmes_task._get_instruction_text(first_item)
        assert instruction == "```python\n" + first_item["prompt"]
        assert instruction.startswith("```python\n")

        target = olmes_task._get_fewshot_target_text(first_item)
        assert target == first_item["canonical_solution"] + "```"
        assert target.endswith("```")


class TestHumanEvalInstructCode:
@pytest.fixture
def human_eval_task_inst(self) -> HumanEvalInstruct:
Expand Down