diff --git a/docs/tasks/MBPP_OLMES.md b/docs/tasks/MBPP_OLMES.md new file mode 100644 index 00000000..8bb6ecf3 --- /dev/null +++ b/docs/tasks/MBPP_OLMES.md @@ -0,0 +1,20 @@ +# MBPP_OLMES + +```` +NAME = MBPP_OLMES +DATASET_PATH = google-research-datasets/mbpp +SAMPLE_SPLIT = test +FEWSHOT_SPLIT = test +RESPONSE_TYPE = COMPLETION +METRICS = [CodeCompletionAssertion] +SUBJECTS = ['full'] +LANGUAGE = +```` + +- Module: `eval_framework.tasks.benchmarks.mbpp` + +- File: [src/eval_framework/tasks/benchmarks/mbpp.py](../../src/eval_framework/tasks/benchmarks/mbpp.py) | [View on GitHub](https://github.com/Aleph-Alpha-Research/eval-framework/blob/main/src/eval_framework/tasks/benchmarks/mbpp.py) + +- Link to dataset: [https://huggingface.co/datasets/google-research-datasets/mbpp](https://huggingface.co/datasets/google-research-datasets/mbpp) + +More detailed documentation, with prompt examples and ground truth completions, can be generated with `uv run -m eval_framework.utils.generate_task_docs --add-prompt-examples --only-tasks "MBPP_OLMES"`. diff --git a/docs/tasks/README.md b/docs/tasks/README.md index 38ec0d69..9782c6d4 100644 --- a/docs/tasks/README.md +++ b/docs/tasks/README.md @@ -2,7 +2,7 @@ This directory contains the generated documentation for all benchmark tasks available in the package. -**Total number of tasks: 157** +**Total number of tasks: 158** The documentation can be generated or updated with `uv run -m eval_framework.utils.generate_task_docs`. @@ -92,6 +92,7 @@ NOTE: This is an automatically generated file. 
Any manual modifications will not
 - [MATHMinervaEvalHarness](MATHMinervaEvalHarness.md)
 - [MBPP](MBPP.md)
 - [MBPPBPB](MBPPBPB.md)
+- [MBPP_OLMES](MBPP_OLMES.md)
 - [MBPP_PROMPT_WITHOUT_TESTS](MBPP_PROMPT_WITHOUT_TESTS.md)
 - [MBPP_PROMPT_WITHOUT_TESTS_SANITIZED](MBPP_PROMPT_WITHOUT_TESTS_SANITIZED.md)
 - [MBPP_SANITIZED](MBPP_SANITIZED.md)
diff --git a/src/eval_framework/tasks/benchmarks/mbpp.py b/src/eval_framework/tasks/benchmarks/mbpp.py
index df31df02..72796914 100644
--- a/src/eval_framework/tasks/benchmarks/mbpp.py
+++ b/src/eval_framework/tasks/benchmarks/mbpp.py
@@ -212,3 +212,124 @@ def post_process_generated_completion(self, completion_text: str, sample: Sample
 class MBPP_PROMPT_WITHOUT_TESTS_SANITIZED(MBPP_PROMPT_WITHOUT_TESTS):
     NAME = "MBPP_PROMPT_WITHOUT_TESTS_SANITIZED"
     SUBJECTS = ["sanitized"]
+
+
+# Hardcoded few-shot demonstrations for MBPP_OLMES. Each "code" string is emitted verbatim
+# as a few-shot target, so the indentation embedded in it must be valid Python.
+# NOTE(review): indentation inside the strings was restored (2/4/2 spaces) - confirm against oe_eval.
+_OLMES_FEWSHOT_EXAMPLES: list[dict[str, Any]] = [
+    {
+        "text": "Write a function to find the similar elements from the given two tuple lists.",
+        "code": (
+            "def similar_elements(test_tup1, test_tup2):\n"
+            "  res = tuple(set(test_tup1) & set(test_tup2))\n  return (res)"
+        ),
+        "test_list": [
+            "assert similar_elements((3, 4, 5, 6),(5, 7, 4, 10)) == (4, 5)",
+            "assert similar_elements((1, 2, 3, 4),(5, 4, 3, 7)) == (3, 4)",
+            "assert similar_elements((11, 12, 14, 13),(17, 15, 14, 13)) == (13, 14)",
+        ],
+    },
+    {
+        "text": "Write a python function to identify non-prime numbers.",
+        "code": (
+            "import math\ndef is_not_prime(n):\n    result = False\n"
+            "    for i in range(2,int(math.sqrt(n)) + 1):\n"
+            "        if n % i == 0:\n            result = True\n    return result"
+        ),
+        "test_list": [
+            "assert is_not_prime(2) == False",
+            "assert is_not_prime(10) == True",
+            "assert is_not_prime(35) == True",
+        ],
+    },
+    {
+        "text": (
+            "Write a function to find the largest integers from a given list of numbers using heap queue algorithm."
+        ),
+        "code": (
+            "import heapq as hq\ndef heap_queue_largest(nums,n):\n"
+            "  largest_nums = hq.nlargest(n, nums)\n  return largest_nums"
+        ),
+        "test_list": [
+            "assert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],3)==[85, 75, 65] ",
+            "assert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],2)==[85, 75] ",
+            "assert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],5)==[85, 75, 65, 58, 35]",
+        ],
+    },
+]
+
+
+class MBPP_OLMES(MBPP):
+    """
+    MBPP OLMES variant replicating oe_eval's ``mbpp:3shot::olmo3:n32:v2``.
+
+    Uses the EvalPlus prompt format with 3 hardcoded fewshot examples from the
+    original MBPP "prompt" split (matching oe_eval's ordering). Each prompt
+    shows one test case (the first) instead of all.
+
+    Recommended EvalConfig settings for full replication::
+
+        split: test
+        num_fewshot: 3 (hardcoded, prompt split)
+        metric: pass_at_1
+        temperature: 0.6
+        top_p: 0.6
+        repeats: 32
+    """
+
+    NAME = "MBPP_OLMES"
+    # Few-shot examples come from _OLMES_FEWSHOT_EXAMPLES, not from this split
+    # (see _sample_fewshot_examples).
+    FEWSHOT_SPLIT = "test"
+
+    def __init__(self, num_fewshot: int = 3) -> None:
+        """Initialise the task and set its stop sequences.
+
+        Raises:
+            AssertionError: If ``num_fewshot`` is not 3.
+        """
+        # Raised explicitly (instead of ``assert``) so the check survives ``python -O``;
+        # the exception type and message are unchanged for existing callers.
+        if num_fewshot != 3:
+            raise AssertionError("MBPP_OLMES requires exactly 3 fewshot examples")
+        super().__init__(num_fewshot)
+        self.stop_sequences = ["```", '\n"""', "\nassert", "\n#"]
+
+    def _get_instruction_text(self, item: dict[str, Any]) -> str:
+        """Build the instruction from the task description and only the first test case."""
+        text = item["text"] if "text" in item else item["prompt"]
+        test = item["test_list"][0]
+        return (
+            "Please provide a self-contained Python script that solves the following problem"
+            f" in a markdown code block:\n```\n{text.strip()}\n{test}\n```\n"
+        )
+
+    def _get_cue_text(self, item: dict[str, Any]) -> str:
+        """Return the fixed cue text, which ends by opening a python code fence."""
+        return "Here is the completed function:\n\n```python\n"
+
+    def _get_fewshot_target_text(self, item: dict[str, Any]) -> str:
+        """Return the example's ``code`` followed by a newline."""
+        return item["code"] + "\n"
+
+    def _sample_fewshot_examples(self, item: dict[str, Any]) -> list[dict]:
+        """Return the hardcoded examples in fixed order; ``item`` is ignored.
+
+        Fresh dicts and ``test_list`` lists are returned so that a caller mutating
+        an example cannot alter the module-level constant.
+        """
+        return [{**example, "test_list": list(example["test_list"])} for example in _OLMES_FEWSHOT_EXAMPLES]
+
+    def post_process_generated_completion(self, completion_text: str, sample: Sample | None = None) -> str:
+        """Cut the completion at the earliest stop sequence and hand it to ``_code_expander``."""
+        assert sample is not None
+        assert self.stop_sequences is not None
+
+        # Cutting at each stop sequence in turn leaves the text before the earliest one.
+        for stop_seq in self.stop_sequences:
+            completion_text = completion_text.split(stop_seq, 1)[0]
+
+        extracted_code = completion_text + "\n"
+        mbpp_ground_truth = str(sample.ground_truth)
+        return self._code_expander(extracted_code, mbpp_ground_truth)
diff --git a/src/eval_framework/tasks/task_names.py b/src/eval_framework/tasks/task_names.py
index 0dd11e16..20ef3988 100644
--- a/src/eval_framework/tasks/task_names.py
+++ b/src/eval_framework/tasks/task_names.py
@@ -87,6 +87,7 @@ def register_all_tasks() -> None:
     register_lazy_task("eval_framework.tasks.benchmarks.mbpp.MBPP_SANITIZED")
     register_lazy_task("eval_framework.tasks.benchmarks.mbpp.MBPP_PROMPT_WITHOUT_TESTS")
     register_lazy_task("eval_framework.tasks.benchmarks.mbpp.MBPP_PROMPT_WITHOUT_TESTS_SANITIZED")
+    register_lazy_task("eval_framework.tasks.benchmarks.mbpp.MBPP_OLMES")
     register_lazy_task("eval_framework.tasks.benchmarks.mmlu.MMLU")
     register_lazy_task("eval_framework.tasks.benchmarks.mmlu.MMLU_IDK")
     register_lazy_task("eval_framework.tasks.benchmarks.mmlu.MMLU_OLMES")
diff --git a/src/eval_framework/utils/generate_task_docs.py b/src/eval_framework/utils/generate_task_docs.py
index 6236500d..9784e78c 100644
--- a/src/eval_framework/utils/generate_task_docs.py
+++ b/src/eval_framework/utils/generate_task_docs.py
@@ -78,9 +78,13 @@ def generate_docs_for_task(
     try:
         num_fewshot = 0
         task = task_class(num_fewshot=num_fewshot)
-    except Exception as e:
-        print(f"Failed to instantiate task {task_name}: {e}")
-        return
+    except Exception:
+        try:
+            task = task_class()
+            num_fewshot = task.num_fewshot
+        except Exception as e:
+            print(f"Failed to instantiate task {task_name}: {e}")
+            return
 
     with open(f"{output_docs_directory}/{task_name}.md", "w") as f:
         f.write(f"# {task_name}\n\n")
diff --git a/tests/tests_eval_framework/tasks/test_all_formatters.py b/tests/tests_eval_framework/tasks/test_all_formatters.py
index caa1f082..7b99b728
100644
--- a/tests/tests_eval_framework/tasks/test_all_formatters.py
+++ b/tests/tests_eval_framework/tasks/test_all_formatters.py
@@ -73,6 +73,7 @@
     "MBPP_SANITIZED": {"num_fewshot": 1},
     "MBPP_PROMPT_WITHOUT_TESTS": {"num_fewshot": 1},
     "MBPP_PROMPT_WITHOUT_TESTS_SANITIZED": {"num_fewshot": 1},
+    "MBPP_OLMES": {"num_fewshot": 3},
     "MMLU": {"num_fewshot": 1},
     "FullTextMMLU": {"num_fewshot": 1},
     "MMLU_EU20_DE": {"num_fewshot": 1},
diff --git a/tests/tests_eval_framework/tasks/test_mbpp_olmes.py b/tests/tests_eval_framework/tasks/test_mbpp_olmes.py
new file mode 100644
index 00000000..bb31a3eb
--- /dev/null
+++ b/tests/tests_eval_framework/tasks/test_mbpp_olmes.py
@@ -0,0 +1,107 @@
+from collections.abc import Iterator
+
+import pytest
+
+from eval_framework.tasks.benchmarks.mbpp import _OLMES_FEWSHOT_EXAMPLES, MBPP_OLMES
+from eval_framework.tasks.utils import run_python_code
+from template_formatting.formatter import ConcatFormatter
+from tests.tests_eval_framework.utils import DatasetPatcher
+
+
+class TestMBPP_OLMES:
+    @pytest.fixture
+    def task(self) -> Iterator[MBPP_OLMES]:
+        # Yield (not return) so the DatasetPatcher context stays open while the test body runs.
+        with DatasetPatcher(MBPP_OLMES, num_fewshot=3, num_samples=10) as patched_task:
+            yield patched_task
+
+    def test_num_fewshot_must_be_3(self) -> None:
+        with pytest.raises(AssertionError, match="MBPP_OLMES requires exactly 3 fewshot examples"):
+            MBPP_OLMES(num_fewshot=1)
+
+    def test_stop_sequences(self) -> None:
+        task = MBPP_OLMES(num_fewshot=3)
+        assert task.stop_sequences == ["```", '\n"""', "\nassert", "\n#"]
+
+    def test_instruction_uses_evalplus_format(self, task: MBPP_OLMES) -> None:
+        task._load_dataset(task.SUBJECTS[0])
+        item = task.dataset[task.SAMPLE_SPLIT][0]
+        item["subject"] = task.SUBJECTS[0]
+        instruction = task._get_instruction_text(item)
+
+        expected_prefix = (
+            "Please provide a self-contained Python script that solves the following problem"
+            " in a markdown code block:\n```\n"
+        )
+        assert instruction.startswith(expected_prefix)
+        assert instruction.endswith("\n```\n")
+        assert item["test_list"][0] in instruction
+
+    def test_instruction_contains_only_one_test(self, task: MBPP_OLMES) -> None:
+        task._load_dataset(task.SUBJECTS[0])
+        item = task.dataset[task.SAMPLE_SPLIT][0]
+        item["subject"] = task.SUBJECTS[0]
+        instruction = task._get_instruction_text(item)
+
+        for test in item["test_list"][1:]:
+            assert test not in instruction
+
+    def test_cue_text(self, task: MBPP_OLMES) -> None:
+        task._load_dataset(task.SUBJECTS[0])
+        item = task.dataset[task.SAMPLE_SPLIT][0]
+        cue = task._get_cue_text(item)
+        assert cue == "Here is the completed function:\n\n```python\n"
+
+    def test_fewshot_examples_are_hardcoded(self, task: MBPP_OLMES) -> None:
+        task._load_dataset(task.SUBJECTS[0])
+        item = task.dataset[task.SAMPLE_SPLIT][0]
+
+        examples = task._sample_fewshot_examples(item)
+        assert len(examples) == 3
+        assert examples[0]["text"] == _OLMES_FEWSHOT_EXAMPLES[0]["text"]
+        assert examples[1]["text"] == _OLMES_FEWSHOT_EXAMPLES[1]["text"]
+        assert examples[2]["text"] == _OLMES_FEWSHOT_EXAMPLES[2]["text"]
+
+    def test_fewshot_examples_are_deterministic(self, task: MBPP_OLMES) -> None:
+        task._load_dataset(task.SUBJECTS[0])
+        item = task.dataset[task.SAMPLE_SPLIT][0]
+
+        examples_1 = task._sample_fewshot_examples(item)
+        examples_2 = task._sample_fewshot_examples(item)
+        assert examples_1 == examples_2
+
+    def test_fewshot_target_is_code_with_newline(self) -> None:
+        task = MBPP_OLMES(num_fewshot=3)
+        for example in _OLMES_FEWSHOT_EXAMPLES:
+            target = task._get_fewshot_target_text(example)
+            assert target == example["code"] + "\n"
+            assert "```" not in target
+
+    def test_code_execution_with_canonical_solution(self, task: MBPP_OLMES) -> None:
+        task._load_dataset(task.SUBJECTS[0])
+        for i, item in enumerate(task.dataset[task.SAMPLE_SPLIT][:5]):
+            # Verify that canonical code + test asserts execute correctly.
+            # We call _code_expander directly because in real usage the LLM
+            # engine truncates output at stop sequences (e.g. \n#) before it
+            # reaches post_process, and canonical solutions may contain those
+            # sequences (comments, asserts) that would corrupt the test.
+            code = MBPP_OLMES._code_expander(item["code"] + "\n", str(item["test_list"]))
+            result = run_python_code(code)
+            assert result.endswith("True"), f"Item {i} failed: {result}"
+
+    def test_prompt_format_matches_oe_eval(self, task: MBPP_OLMES) -> None:
+        """Verify the assembled prompt has the expected structure with ConcatFormatter."""
+        task._load_dataset(task.SUBJECTS[0])
+        item = task.dataset[task.SAMPLE_SPLIT][0]
+        item["subject"] = task.SUBJECTS[0]
+        sample = task._create_samples(item, 0, task.SUBJECTS[0])[0]
+
+        formatter = ConcatFormatter()
+        formatted = formatter.format(sample.messages, output_mode="string")
+
+        assert "Please provide a self-contained Python script" in formatted
+        assert "Here is the completed function:" in formatted
+        assert "```python" in formatted
+
+        fewshot_count = formatted.count("Please provide a self-contained Python script")
+        assert fewshot_count == 4, f"Expected 4 occurrences (3 fewshot + 1 eval), got {fewshot_count}"