Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 20 additions & 0 deletions docs/tasks/MBPP_OLMES.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
# MBPP_OLMES

````
NAME = MBPP_OLMES
DATASET_PATH = google-research-datasets/mbpp
SAMPLE_SPLIT = test
FEWSHOT_SPLIT = test
RESPONSE_TYPE = COMPLETION
METRICS = [CodeCompletionAssertion]
SUBJECTS = ['full']
LANGUAGE = <Language.ENG: 'English'>
````

- Module: `eval_framework.tasks.benchmarks.mbpp`

- File: [src/eval_framework/tasks/benchmarks/mbpp.py](../../src/eval_framework/tasks/benchmarks/mbpp.py) | [View on GitHub](https://github.com/Aleph-Alpha-Research/eval-framework/blob/main/src/eval_framework/tasks/benchmarks/mbpp.py)

- Link to dataset: [https://huggingface.co/datasets/google-research-datasets/mbpp](https://huggingface.co/datasets/google-research-datasets/mbpp)

More detailed documentation, with prompt examples and ground truth completions, can be generated with `uv run -m eval_framework.utils.generate_task_docs --add-prompt-examples --only-tasks "MBPP_OLMES"`.
3 changes: 2 additions & 1 deletion docs/tasks/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

This directory contains the generated documentation for all benchmark tasks available in the package.

**Total number of tasks: 157**
**Total number of tasks: 158**

The documentation can be generated or updated with `uv run -m eval_framework.utils.generate_task_docs`.

Expand Down Expand Up @@ -92,6 +92,7 @@ NOTE: This is an automatically generated file. Any manual modifications will not
- [MATHMinervaEvalHarness](MATHMinervaEvalHarness.md)
- [MBPP](MBPP.md)
- [MBPPBPB](MBPPBPB.md)
- [MBPP_OLMES](MBPP_OLMES.md)
- [MBPP_PROMPT_WITHOUT_TESTS](MBPP_PROMPT_WITHOUT_TESTS.md)
- [MBPP_PROMPT_WITHOUT_TESTS_SANITIZED](MBPP_PROMPT_WITHOUT_TESTS_SANITIZED.md)
- [MBPP_SANITIZED](MBPP_SANITIZED.md)
Expand Down
100 changes: 100 additions & 0 deletions src/eval_framework/tasks/benchmarks/mbpp.py
Original file line number Diff line number Diff line change
Expand Up @@ -212,3 +212,103 @@ def post_process_generated_completion(self, completion_text: str, sample: Sample
class MBPP_PROMPT_WITHOUT_TESTS_SANITIZED(MBPP_PROMPT_WITHOUT_TESTS):
    """MBPP "prompt without tests" variant restricted to the sanitized subject subset."""

    NAME = "MBPP_PROMPT_WITHOUT_TESTS_SANITIZED"
    SUBJECTS = ["sanitized"]


# The three hardcoded fewshot examples used by MBPP_OLMES, taken from the
# original MBPP "prompt" split in oe_eval's ordering (see MBPP_OLMES docstring).
# Each entry mirrors a dataset row: "text" (problem statement), "code"
# (canonical solution, executed verbatim by the tests) and "test_list" (asserts).
# NOTE(review): the indentation inside the "code" literals appears collapsed to
# single spaces in this view — confirm the stored literals carry valid Python
# indentation, since this code is executed as-is.
_OLMES_FEWSHOT_EXAMPLES: list[dict[str, Any]] = [
    {
        "text": "Write a function to find the similar elements from the given two tuple lists.",
        "code": (
            "def similar_elements(test_tup1, test_tup2):\n"
            " res = tuple(set(test_tup1) & set(test_tup2))\n return (res)"
        ),
        "test_list": [
            "assert similar_elements((3, 4, 5, 6),(5, 7, 4, 10)) == (4, 5)",
            "assert similar_elements((1, 2, 3, 4),(5, 4, 3, 7)) == (3, 4)",
            "assert similar_elements((11, 12, 14, 13),(17, 15, 14, 13)) == (13, 14)",
        ],
    },
    {
        "text": "Write a python function to identify non-prime numbers.",
        "code": (
            "import math\ndef is_not_prime(n):\n result = False\n"
            " for i in range(2,int(math.sqrt(n)) + 1):\n"
            " if n % i == 0:\n result = True\n return result"
        ),
        "test_list": [
            "assert is_not_prime(2) == False",
            "assert is_not_prime(10) == True",
            "assert is_not_prime(35) == True",
        ],
    },
    {
        "text": (
            "Write a function to find the largest integers from a given list of numbers using heap queue algorithm."
        ),
        "code": (
            "import heapq as hq\ndef heap_queue_largest(nums,n):\n"
            " largest_nums = hq.nlargest(n, nums)\n return largest_nums"
        ),
        "test_list": [
            "assert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],3)==[85, 75, 65] ",
            "assert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],2)==[85, 75] ",
            "assert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],5)==[85, 75, 65, 58, 35]",
        ],
    },
]


class MBPP_OLMES(MBPP):
    """
    MBPP OLMES variant replicating oe_eval's ``mbpp:3shot::olmo3:n32:v2``.

    Uses the EvalPlus prompt format with 3 hardcoded fewshot examples from the
    original MBPP "prompt" split (matching oe_eval's ordering). Each prompt
    shows one test case (the first) instead of all.

    Recommended EvalConfig settings for full replication::

        split: test
        num_fewshot: 3 (hardcoded, prompt split)
        metric: pass_at_1
        temperature: 0.6
        top_p: 0.6
        repeats: 32
    """

    NAME = "MBPP_OLMES"
    FEWSHOT_SPLIT = "test"

    def __init__(self, num_fewshot: int = 3) -> None:
        # Validate before the parent does any setup work; the test suite pins
        # this exact AssertionError message, so keep it an assert.
        assert num_fewshot == 3, "MBPP_OLMES requires exactly 3 fewshot examples"
        super().__init__(num_fewshot)
        # Generation is cut at the first of these: end of the markdown code
        # block, or the start of a docstring / assert / comment line.
        self.stop_sequences = ["```", '\n"""', "\nassert", "\n#"]

    def _get_instruction_text(self, item: dict[str, Any]) -> str:
        """Build the EvalPlus-style instruction: problem text plus only the first test."""
        # Hardcoded fewshot examples carry "text"; items may instead carry "prompt".
        text = item["text"] if "text" in item else item["prompt"]
        test = item["test_list"][0]
        return (
            "Please provide a self-contained Python script that solves the following problem"
            f" in a markdown code block:\n```\n{text.strip()}\n{test}\n```\n"
        )

    def _get_cue_text(self, item: dict[str, Any]) -> str:
        """Cue opening the markdown code block the model is expected to complete."""
        return "Here is the completed function:\n\n```python\n"

    def _get_fewshot_target_text(self, item: dict[str, Any]) -> str:
        """Fewshot target is the canonical solution code followed by a newline."""
        return item["code"] + "\n"

    def _sample_fewshot_examples(self, item: dict[str, Any]) -> list[dict]:
        """Return the 3 hardcoded OLMES examples — deterministic, independent of ``item``."""
        return list(_OLMES_FEWSHOT_EXAMPLES)

    def post_process_generated_completion(self, completion_text: str, sample: Sample | None = None) -> str:
        """Truncate the completion at any stop sequence and expand it with the sample's test asserts.

        Raises:
            ValueError: if ``sample`` is None; the ground truth is required here.
        """
        # The signature accepts ``sample=None`` only for base-class compatibility;
        # raise explicitly instead of asserting (asserts are stripped under -O).
        if sample is None:
            raise ValueError("MBPP_OLMES.post_process_generated_completion requires a sample")
        assert self.stop_sequences is not None  # set in __init__; narrows Optional for mypy

        # The engine normally truncates at stop sequences already; do it again
        # here in case the raw completion still contains one.
        for stop_seq in self.stop_sequences:
            # partition()[0] is the text before the first occurrence, or the
            # whole string when the sequence is absent — same as split()[0].
            completion_text = completion_text.partition(stop_seq)[0]

        extracted_code = completion_text + "\n"
        mbpp_ground_truth = str(sample.ground_truth)
        return self._code_expander(extracted_code, mbpp_ground_truth)
1 change: 1 addition & 0 deletions src/eval_framework/tasks/task_names.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,7 @@ def register_all_tasks() -> None:
register_lazy_task("eval_framework.tasks.benchmarks.mbpp.MBPP_SANITIZED")
register_lazy_task("eval_framework.tasks.benchmarks.mbpp.MBPP_PROMPT_WITHOUT_TESTS")
register_lazy_task("eval_framework.tasks.benchmarks.mbpp.MBPP_PROMPT_WITHOUT_TESTS_SANITIZED")
register_lazy_task("eval_framework.tasks.benchmarks.mbpp.MBPP_OLMES")
register_lazy_task("eval_framework.tasks.benchmarks.mmlu.MMLU")
register_lazy_task("eval_framework.tasks.benchmarks.mmlu.MMLU_IDK")
register_lazy_task("eval_framework.tasks.benchmarks.mmlu.MMLU_OLMES")
Expand Down
10 changes: 7 additions & 3 deletions src/eval_framework/utils/generate_task_docs.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,9 +78,13 @@ def generate_docs_for_task(
try:
num_fewshot = 0
task = task_class(num_fewshot=num_fewshot)
except Exception as e:
print(f"Failed to instantiate task {task_name}: {e}")
return
except Exception:
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

any chance we can make it more specific?

try:
task = task_class()
num_fewshot = task.num_fewshot
except Exception as e:
print(f"Failed to instantiate task {task_name}: {e}")
return

with open(f"{output_docs_directory}/{task_name}.md", "w") as f:
f.write(f"# {task_name}\n\n")
Expand Down
1 change: 1 addition & 0 deletions tests/tests_eval_framework/tasks/test_all_formatters.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,7 @@
"MBPP_SANITIZED": {"num_fewshot": 1},
"MBPP_PROMPT_WITHOUT_TESTS": {"num_fewshot": 1},
"MBPP_PROMPT_WITHOUT_TESTS_SANITIZED": {"num_fewshot": 1},
"MBPP_OLMES": {"num_fewshot": 3},
"MMLU": {"num_fewshot": 1},
"FullTextMMLU": {"num_fewshot": 1},
"MMLU_EU20_DE": {"num_fewshot": 1},
Expand Down
104 changes: 104 additions & 0 deletions tests/tests_eval_framework/tasks/test_mbpp_olmes.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,104 @@
from collections.abc import Iterator

import pytest

from eval_framework.tasks.benchmarks.mbpp import _OLMES_FEWSHOT_EXAMPLES, MBPP_OLMES
from eval_framework.tasks.utils import run_python_code
from template_formatting.formatter import ConcatFormatter
from tests.tests_eval_framework.utils import DatasetPatcher


class TestMBPP_OLMES:
    """Tests for the MBPP_OLMES task (prompt format, fewshots, stop sequences, execution)."""

    @pytest.fixture
    def task(self) -> Iterator[MBPP_OLMES]:
        # Yield (rather than return) so DatasetPatcher stays active while the
        # test body runs: a ``return`` inside the ``with`` exits the context
        # manager — undoing the patch — before tests call ``_load_dataset``.
        with DatasetPatcher(MBPP_OLMES, num_fewshot=3, num_samples=10) as patched_task:
            yield patched_task

    def test_num_fewshot_must_be_3(self) -> None:
        with pytest.raises(AssertionError, match="MBPP_OLMES requires exactly 3 fewshot examples"):
            MBPP_OLMES(num_fewshot=1)

    def test_stop_sequences(self) -> None:
        task = MBPP_OLMES(num_fewshot=3)
        assert task.stop_sequences == ["```", '\n"""', "\nassert", "\n#"]

    def test_instruction_uses_evalplus_format(self, task: MBPP_OLMES) -> None:
        task._load_dataset(task.SUBJECTS[0])
        item = task.dataset[task.SAMPLE_SPLIT][0]
        item["subject"] = task.SUBJECTS[0]
        instruction = task._get_instruction_text(item)

        expected_prefix = (
            "Please provide a self-contained Python script that solves the following problem"
            " in a markdown code block:\n```\n"
        )
        assert instruction.startswith(expected_prefix)
        assert instruction.endswith("\n```\n")
        assert item["test_list"][0] in instruction

    def test_instruction_contains_only_one_test(self, task: MBPP_OLMES) -> None:
        task._load_dataset(task.SUBJECTS[0])
        item = task.dataset[task.SAMPLE_SPLIT][0]
        item["subject"] = task.SUBJECTS[0]
        instruction = task._get_instruction_text(item)

        # Only the first test may appear in the prompt (oe_eval behaviour).
        for test in item["test_list"][1:]:
            assert test not in instruction

    def test_cue_text(self, task: MBPP_OLMES) -> None:
        task._load_dataset(task.SUBJECTS[0])
        item = task.dataset[task.SAMPLE_SPLIT][0]
        cue = task._get_cue_text(item)
        assert cue == "Here is the completed function:\n\n```python\n"

    def test_fewshot_examples_are_hardcoded(self, task: MBPP_OLMES) -> None:
        task._load_dataset(task.SUBJECTS[0])
        item = task.dataset[task.SAMPLE_SPLIT][0]

        examples = task._sample_fewshot_examples(item)
        assert len(examples) == 3
        # Order must match oe_eval's hardcoded fewshot ordering.
        for got, expected in zip(examples, _OLMES_FEWSHOT_EXAMPLES):
            assert got["text"] == expected["text"]

    def test_fewshot_examples_are_deterministic(self, task: MBPP_OLMES) -> None:
        task._load_dataset(task.SUBJECTS[0])
        item = task.dataset[task.SAMPLE_SPLIT][0]

        examples_1 = task._sample_fewshot_examples(item)
        examples_2 = task._sample_fewshot_examples(item)
        assert examples_1 == examples_2

    def test_fewshot_target_is_code_with_newline(self) -> None:
        task = MBPP_OLMES(num_fewshot=3)
        for example in _OLMES_FEWSHOT_EXAMPLES:
            target = task._get_fewshot_target_text(example)
            assert target == example["code"] + "\n"
            assert "```" not in target

    def test_code_execution_with_canonical_solution(self, task: MBPP_OLMES) -> None:
        task._load_dataset(task.SUBJECTS[0])
        # NOTE(review): assumes the patched split slices to a list of row dicts —
        # confirm DatasetPatcher does not return a columnar (HF-style) slice.
        for i, item in enumerate(task.dataset[task.SAMPLE_SPLIT][:5]):
            # Verify that canonical code + test asserts execute correctly.
            # We call _code_expander directly because in real usage the LLM
            # engine truncates output at stop sequences (e.g. \n#) before it
            # reaches post_process, and canonical solutions may contain those
            # sequences (comments, asserts) that would corrupt the test.
            code = MBPP_OLMES._code_expander(item["code"] + "\n", str(item["test_list"]))
            result = run_python_code(code)
            assert result.endswith("True"), f"Item {i} failed: {result}"

    def test_prompt_format_matches_oe_eval(self, task: MBPP_OLMES) -> None:
        """Verify the assembled prompt has the expected structure with ConcatFormatter."""
        task._load_dataset(task.SUBJECTS[0])
        item = task.dataset[task.SAMPLE_SPLIT][0]
        item["subject"] = task.SUBJECTS[0]
        sample = task._create_samples(item, 0, task.SUBJECTS[0])[0]

        formatter = ConcatFormatter()
        formatted = formatter.format(sample.messages, output_mode="string")

        assert "Please provide a self-contained Python script" in formatted
        assert "Here is the completed function:" in formatted
        assert "```python" in formatted

        fewshot_count = formatted.count("Please provide a self-contained Python script")
        assert fewshot_count == 4, f"Expected 4 occurrences (3 fewshot + 1 eval), got {fewshot_count}"
Loading