Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 20 additions & 0 deletions docs/tasks/MBPP_OLMES.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
# MBPP_OLMES

````
NAME = MBPP_OLMES
DATASET_PATH = google-research-datasets/mbpp
SAMPLE_SPLIT = test
FEWSHOT_SPLIT = test
RESPONSE_TYPE = COMPLETION
METRICS = [CodeCompletionAssertion]
SUBJECTS = ['full']
LANGUAGE = <Language.ENG: 'English'>
````

- Module: `eval_framework.tasks.benchmarks.mbpp`

- File: [src/eval_framework/tasks/benchmarks/mbpp.py](../../src/eval_framework/tasks/benchmarks/mbpp.py) | [View on GitHub](https://github.com/Aleph-Alpha-Research/eval-framework/blob/main/src/eval_framework/tasks/benchmarks/mbpp.py)

- Link to dataset: [https://huggingface.co/datasets/google-research-datasets/mbpp](https://huggingface.co/datasets/google-research-datasets/mbpp)

More detailed documentation, with prompt examples and ground truth completions, can be generated with `uv run -m eval_framework.utils.generate_task_docs --add-prompt-examples --only-tasks "MBPP_OLMES"`.
3 changes: 2 additions & 1 deletion docs/tasks/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

This directory contains the generated documentation for all benchmark tasks available in the package.

**Total number of tasks: 157**
**Total number of tasks: 158**

The documentation can be generated or updated with `uv run -m eval_framework.utils.generate_task_docs`.

Expand Down Expand Up @@ -92,6 +92,7 @@ NOTE: This is an automatically generated file. Any manual modifications will not
- [MATHMinervaEvalHarness](MATHMinervaEvalHarness.md)
- [MBPP](MBPP.md)
- [MBPPBPB](MBPPBPB.md)
- [MBPP_OLMES](MBPP_OLMES.md)
- [MBPP_PROMPT_WITHOUT_TESTS](MBPP_PROMPT_WITHOUT_TESTS.md)
- [MBPP_PROMPT_WITHOUT_TESTS_SANITIZED](MBPP_PROMPT_WITHOUT_TESTS_SANITIZED.md)
- [MBPP_SANITIZED](MBPP_SANITIZED.md)
Expand Down
100 changes: 100 additions & 0 deletions src/eval_framework/tasks/benchmarks/mbpp.py
Original file line number Diff line number Diff line change
Expand Up @@ -212,3 +212,103 @@ def post_process_generated_completion(self, completion_text: str, sample: Sample
class MBPP_PROMPT_WITHOUT_TESTS_SANITIZED(MBPP_PROMPT_WITHOUT_TESTS):
    """MBPP "prompt without tests" variant restricted to the sanitized subject subset."""

    NAME = "MBPP_PROMPT_WITHOUT_TESTS_SANITIZED"
    SUBJECTS = ["sanitized"]


# The three hardcoded fewshot examples used by MBPP_OLMES, taken from the
# original MBPP "prompt" split in oe_eval's ordering (see MBPP_OLMES docstring).
# Each entry mirrors a dataset row: "text" (problem statement), "code"
# (canonical solution, executed verbatim by the tests) and "test_list" (asserts).
# NOTE(review): the indentation inside the "code" literals appears collapsed to
# single spaces in this view — confirm the stored literals carry valid Python
# indentation, since this code is executed as-is.
_OLMES_FEWSHOT_EXAMPLES: list[dict[str, Any]] = [
    {
        "text": "Write a function to find the similar elements from the given two tuple lists.",
        "code": (
            "def similar_elements(test_tup1, test_tup2):\n"
            " res = tuple(set(test_tup1) & set(test_tup2))\n return (res)"
        ),
        "test_list": [
            "assert similar_elements((3, 4, 5, 6),(5, 7, 4, 10)) == (4, 5)",
            "assert similar_elements((1, 2, 3, 4),(5, 4, 3, 7)) == (3, 4)",
            "assert similar_elements((11, 12, 14, 13),(17, 15, 14, 13)) == (13, 14)",
        ],
    },
    {
        "text": "Write a python function to identify non-prime numbers.",
        "code": (
            "import math\ndef is_not_prime(n):\n result = False\n"
            " for i in range(2,int(math.sqrt(n)) + 1):\n"
            " if n % i == 0:\n result = True\n return result"
        ),
        "test_list": [
            "assert is_not_prime(2) == False",
            "assert is_not_prime(10) == True",
            "assert is_not_prime(35) == True",
        ],
    },
    {
        "text": (
            "Write a function to find the largest integers from a given list of numbers using heap queue algorithm."
        ),
        "code": (
            "import heapq as hq\ndef heap_queue_largest(nums,n):\n"
            " largest_nums = hq.nlargest(n, nums)\n return largest_nums"
        ),
        "test_list": [
            "assert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],3)==[85, 75, 65] ",
            "assert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],2)==[85, 75] ",
            "assert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],5)==[85, 75, 65, 58, 35]",
        ],
    },
]


class MBPP_OLMES(MBPP):
    """
    MBPP OLMES variant replicating oe_eval's ``mbpp:3shot::olmo3:n32:v2``.

    Uses the EvalPlus prompt format with 3 hardcoded fewshot examples from the
    original MBPP "prompt" split (matching oe_eval's ordering). Each prompt
    shows one test case (the first) instead of all.

    Recommended EvalConfig settings for full replication::

        split: test
        num_fewshot: 3 (hardcoded, prompt split)
        metric: pass_at_1
        temperature: 0.6
        top_p: 0.6
        repeats: 32
    """

    NAME = "MBPP_OLMES"
    FEWSHOT_SPLIT = "test"

    def __init__(self, num_fewshot: int = 3) -> None:
        # Validate before the parent does any setup work; the test suite pins
        # this exact AssertionError message, so keep it an assert.
        assert num_fewshot == 3, "MBPP_OLMES requires exactly 3 fewshot examples"
        super().__init__(num_fewshot)
        # Generation is cut at the first of these: end of the markdown code
        # block, or the start of a docstring / assert / comment line.
        self.stop_sequences = ["```", '\n"""', "\nassert", "\n#"]

    def _get_instruction_text(self, item: dict[str, Any]) -> str:
        """Build the EvalPlus-style instruction: problem text plus only the first test."""
        # Hardcoded fewshot examples carry "text"; items may instead carry "prompt".
        text = item["text"] if "text" in item else item["prompt"]
        test = item["test_list"][0]
        return (
            "Please provide a self-contained Python script that solves the following problem"
            f" in a markdown code block:\n```\n{text.strip()}\n{test}\n```\n"
        )

    def _get_cue_text(self, item: dict[str, Any]) -> str:
        """Cue opening the markdown code block the model is expected to complete."""
        return "Here is the completed function:\n\n```python\n"

    def _get_fewshot_target_text(self, item: dict[str, Any]) -> str:
        """Fewshot target is the canonical solution code followed by a newline."""
        return item["code"] + "\n"

    def _sample_fewshot_examples(self, item: dict[str, Any]) -> list[dict]:
        """Return the 3 hardcoded OLMES examples — deterministic, independent of ``item``."""
        return list(_OLMES_FEWSHOT_EXAMPLES)

    def post_process_generated_completion(self, completion_text: str, sample: Sample | None = None) -> str:
        """Truncate the completion at any stop sequence and expand it with the sample's test asserts.

        Raises:
            ValueError: if ``sample`` is None; the ground truth is required here.
        """
        # The signature accepts ``sample=None`` only for base-class compatibility;
        # raise explicitly instead of asserting (asserts are stripped under -O).
        if sample is None:
            raise ValueError("MBPP_OLMES.post_process_generated_completion requires a sample")
        assert self.stop_sequences is not None  # set in __init__; narrows Optional for mypy

        # The engine normally truncates at stop sequences already; do it again
        # here in case the raw completion still contains one.
        for stop_seq in self.stop_sequences:
            # partition()[0] is the text before the first occurrence, or the
            # whole string when the sequence is absent — same as split()[0].
            completion_text = completion_text.partition(stop_seq)[0]

        extracted_code = completion_text + "\n"
        mbpp_ground_truth = str(sample.ground_truth)
        return self._code_expander(extracted_code, mbpp_ground_truth)
1 change: 1 addition & 0 deletions src/eval_framework/tasks/task_names.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,7 @@ def register_all_tasks() -> None:
register_lazy_task("eval_framework.tasks.benchmarks.mbpp.MBPP_SANITIZED")
register_lazy_task("eval_framework.tasks.benchmarks.mbpp.MBPP_PROMPT_WITHOUT_TESTS")
register_lazy_task("eval_framework.tasks.benchmarks.mbpp.MBPP_PROMPT_WITHOUT_TESTS_SANITIZED")
register_lazy_task("eval_framework.tasks.benchmarks.mbpp.MBPP_OLMES")
register_lazy_task("eval_framework.tasks.benchmarks.mmlu.MMLU")
register_lazy_task("eval_framework.tasks.benchmarks.mmlu.MMLU_IDK")
register_lazy_task("eval_framework.tasks.benchmarks.mmlu.MMLU_OLMES")
Expand Down
10 changes: 7 additions & 3 deletions src/eval_framework/utils/generate_task_docs.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,9 +78,13 @@ def generate_docs_for_task(
try:
num_fewshot = 0
task = task_class(num_fewshot=num_fewshot)
except Exception as e:
print(f"Failed to instantiate task {task_name}: {e}")
return
except Exception:
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

any chance we can make it more specific?

try:
task = task_class()
num_fewshot = task.num_fewshot
except Exception as e:
print(f"Failed to instantiate task {task_name}: {e}")
return

with open(f"{output_docs_directory}/{task_name}.md", "w") as f:
f.write(f"# {task_name}\n\n")
Expand Down
1 change: 1 addition & 0 deletions tests/tests_eval_framework/tasks/test_all_formatters.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,7 @@
"MBPP_SANITIZED": {"num_fewshot": 1},
"MBPP_PROMPT_WITHOUT_TESTS": {"num_fewshot": 1},
"MBPP_PROMPT_WITHOUT_TESTS_SANITIZED": {"num_fewshot": 1},
"MBPP_OLMES": {"num_fewshot": 3},
"MMLU": {"num_fewshot": 1},
"FullTextMMLU": {"num_fewshot": 1},
"MMLU_EU20_DE": {"num_fewshot": 1},
Expand Down
104 changes: 104 additions & 0 deletions tests/tests_eval_framework/tasks/test_mbpp_olmes.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,104 @@
from collections.abc import Iterator

import pytest

from eval_framework.tasks.benchmarks.mbpp import _OLMES_FEWSHOT_EXAMPLES, MBPP_OLMES
from eval_framework.tasks.utils import run_python_code
from template_formatting.formatter import ConcatFormatter
from tests.tests_eval_framework.utils import DatasetPatcher


class TestMBPP_OLMES:
    """Tests for the MBPP_OLMES task (prompt format, fewshots, stop sequences, execution)."""

    @pytest.fixture
    def task(self) -> Iterator[MBPP_OLMES]:
        # Yield (rather than return) so DatasetPatcher stays active while the
        # test body runs: a ``return`` inside the ``with`` exits the context
        # manager — undoing the patch — before tests call ``_load_dataset``.
        with DatasetPatcher(MBPP_OLMES, num_fewshot=3, num_samples=10) as patched_task:
            yield patched_task

    def test_num_fewshot_must_be_3(self) -> None:
        with pytest.raises(AssertionError, match="MBPP_OLMES requires exactly 3 fewshot examples"):
            MBPP_OLMES(num_fewshot=1)

    def test_stop_sequences(self) -> None:
        task = MBPP_OLMES(num_fewshot=3)
        assert task.stop_sequences == ["```", '\n"""', "\nassert", "\n#"]

    def test_instruction_uses_evalplus_format(self, task: MBPP_OLMES) -> None:
        task._load_dataset(task.SUBJECTS[0])
        item = task.dataset[task.SAMPLE_SPLIT][0]
        item["subject"] = task.SUBJECTS[0]
        instruction = task._get_instruction_text(item)

        expected_prefix = (
            "Please provide a self-contained Python script that solves the following problem"
            " in a markdown code block:\n```\n"
        )
        assert instruction.startswith(expected_prefix)
        assert instruction.endswith("\n```\n")
        assert item["test_list"][0] in instruction

    def test_instruction_contains_only_one_test(self, task: MBPP_OLMES) -> None:
        task._load_dataset(task.SUBJECTS[0])
        item = task.dataset[task.SAMPLE_SPLIT][0]
        item["subject"] = task.SUBJECTS[0]
        instruction = task._get_instruction_text(item)

        # Only the first test may appear in the prompt (oe_eval behaviour).
        for test in item["test_list"][1:]:
            assert test not in instruction

    def test_cue_text(self, task: MBPP_OLMES) -> None:
        task._load_dataset(task.SUBJECTS[0])
        item = task.dataset[task.SAMPLE_SPLIT][0]
        cue = task._get_cue_text(item)
        assert cue == "Here is the completed function:\n\n```python\n"

    def test_fewshot_examples_are_hardcoded(self, task: MBPP_OLMES) -> None:
        task._load_dataset(task.SUBJECTS[0])
        item = task.dataset[task.SAMPLE_SPLIT][0]

        examples = task._sample_fewshot_examples(item)
        assert len(examples) == 3
        # Order must match oe_eval's hardcoded fewshot ordering.
        for got, expected in zip(examples, _OLMES_FEWSHOT_EXAMPLES):
            assert got["text"] == expected["text"]

    def test_fewshot_examples_are_deterministic(self, task: MBPP_OLMES) -> None:
        task._load_dataset(task.SUBJECTS[0])
        item = task.dataset[task.SAMPLE_SPLIT][0]

        examples_1 = task._sample_fewshot_examples(item)
        examples_2 = task._sample_fewshot_examples(item)
        assert examples_1 == examples_2

    def test_fewshot_target_is_code_with_newline(self) -> None:
        task = MBPP_OLMES(num_fewshot=3)
        for example in _OLMES_FEWSHOT_EXAMPLES:
            target = task._get_fewshot_target_text(example)
            assert target == example["code"] + "\n"
            assert "```" not in target

    def test_code_execution_with_canonical_solution(self, task: MBPP_OLMES) -> None:
        task._load_dataset(task.SUBJECTS[0])
        # NOTE(review): assumes the patched split slices to a list of row dicts —
        # confirm DatasetPatcher does not return a columnar (HF-style) slice.
        for i, item in enumerate(task.dataset[task.SAMPLE_SPLIT][:5]):
            # Verify that canonical code + test asserts execute correctly.
            # We call _code_expander directly because in real usage the LLM
            # engine truncates output at stop sequences (e.g. \n#) before it
            # reaches post_process, and canonical solutions may contain those
            # sequences (comments, asserts) that would corrupt the test.
            code = MBPP_OLMES._code_expander(item["code"] + "\n", str(item["test_list"]))
            result = run_python_code(code)
            assert result.endswith("True"), f"Item {i} failed: {result}"

    def test_prompt_format_matches_oe_eval(self, task: MBPP_OLMES) -> None:
        """Verify the assembled prompt has the expected structure with ConcatFormatter."""
        task._load_dataset(task.SUBJECTS[0])
        item = task.dataset[task.SAMPLE_SPLIT][0]
        item["subject"] = task.SUBJECTS[0]
        sample = task._create_samples(item, 0, task.SUBJECTS[0])[0]

        formatter = ConcatFormatter()
        formatted = formatter.format(sample.messages, output_mode="string")

        assert "Please provide a self-contained Python script" in formatted
        assert "Here is the completed function:" in formatted
        assert "```python" in formatted

        fewshot_count = formatted.count("Please provide a self-contained Python script")
        assert fewshot_count == 4, f"Expected 4 occurrences (3 fewshot + 1 eval), got {fewshot_count}"
Loading