From 0d59c8d5d1315c0859ab8e9ca1189f081762c101 Mon Sep 17 00:00:00 2001
From: lduignan <liam.duignan@cea.fr>
Date: Tue, 17 Feb 2026 13:58:34 +0100
Subject: [PATCH 1/6] Add MathAlea benchmark for French math multiple-choice
 evaluation

---
 community_tasks/mathalea.py | 80 +++++++++++++++++++++++++++++++++++++
 1 file changed, 80 insertions(+)
 create mode 100644 community_tasks/mathalea.py

diff --git a/community_tasks/mathalea.py b/community_tasks/mathalea.py
new file mode 100644
index 000000000..823c21f67
--- /dev/null
+++ b/community_tasks/mathalea.py
@@ -0,0 +1,80 @@
+"""
+MathAlea French math multiple-choice benchmark for lighteval.
+
+Evaluates LLMs on French secondary school math problems across 5 grade levels:
+cinquième, quatrième, troisième, première, terminale.
+
+Dataset: OpenLLM-BPI/MathAleaMCQ
+"""
+
+from lighteval.metrics.metrics import Metrics
+from lighteval.tasks.default_prompts import LETTER_INDICES
+from lighteval.tasks.lighteval_task import LightevalTaskConfig
+from lighteval.tasks.requests import Doc
+
+
+GRADE_LEVELS = {
+    "cinquième": "cinquieme",
+    "quatrième": "quatrieme",
+    "troisième": "troisieme",
+    "première": "premiere",
+    "terminale": "terminale",
+}
+
+
+def prompt_mathalea(line, task_name: str = None):
+    """Build a multiple-choice prompt from a MathAlea dataset line."""
+    choices = line["choices"]
+    query = f"{line['question'].strip()}\n"
+    query += "".join(
+        f"{letter}. {choice}\n"
+        for letter, choice in zip(LETTER_INDICES, choices)
+    )
+    query += "Réponse :"
+
+    gold_index = LETTER_INDICES.index(line["answerKey"])
+
+    return Doc(
+        task_name=task_name,
+        query=query,
+        choices=[f" {LETTER_INDICES[i]}" for i in range(len(choices))],
+        gold_index=gold_index,
+    )
+
+
+TASKS_TABLE = [
+    # Combined task: all grade levels at once
+    LightevalTaskConfig(
+        name="mathalea:all",
+        prompt_function=prompt_mathalea,
+        suite=["community"],
+        hf_repo="OpenLLM-BPI/MathAleaMCQ",
+        hf_subset="all",
+        hf_avail_splits=["dev", "test"],
+        evaluation_splits=["test"],
+        few_shots_split="dev",
+        few_shots_select="sequential",
+        generation_size=1,
+        metrics=[Metrics.loglikelihood_acc],
+        stop_sequence=["\n"],
+        version=0,
+    ),
+] + [
+    # Per-grade tasks
+    LightevalTaskConfig(
+        name=f"mathalea:{alias}",
+        prompt_function=prompt_mathalea,
+        suite=["community"],
+        hf_repo="OpenLLM-BPI/MathAleaMCQ",
+        hf_subset=subset,
+        hf_avail_splits=["dev", "test"],
+        evaluation_splits=["test"],
+        few_shots_split="dev",
+        few_shots_select="sequential",
+        generation_size=1,
+        metrics=[Metrics.loglikelihood_acc],
+        stop_sequence=["\n"],
+        version=0,
+    )
+    for subset, alias in GRADE_LEVELS.items()
+]

From 78599934d6ee8cf424fd8a142d53f8fef7d26788 Mon Sep 17 00:00:00 2001
From: lduignan <liam.duignan@cea.fr>
Date: Wed, 18 Feb 2026 19:42:00 +0100
Subject: [PATCH 2/6] Fix gold index retrieval in prompt_mathalea function

---
 community_tasks/mathalea.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/community_tasks/mathalea.py b/community_tasks/mathalea.py
index 823c21f67..f6e4d9fbf 100644
--- a/community_tasks/mathalea.py
+++ b/community_tasks/mathalea.py
@@ -32,7 +32,7 @@ def prompt_mathalea(line, task_name: str = None):
     )
     query += "Réponse :"
 
-    gold_index = LETTER_INDICES.index(line["answerKey"])
+    gold_index = int(line["answerKey"])
 
     return Doc(
         task_name=task_name,

From 335454193d2940e458eaa689b2deb7c58d6e3086 Mon Sep 17 00:00:00 2001
From: lduignan <liam.duignan@cea.fr>
Date: Fri, 6 Mar 2026 13:42:38 +0100
Subject: [PATCH 3/6] Update MathAlea metadata with detailed description,
 language, and tags

---
 community_tasks/mathalea.py | 21 +++++++++++++++++----
 1 file changed, 17 insertions(+), 4 deletions(-)

diff --git a/community_tasks/mathalea.py b/community_tasks/mathalea.py
index f6e4d9fbf..773b73745 100644
--- a/community_tasks/mathalea.py
+++ b/community_tasks/mathalea.py
@@ -1,10 +1,23 @@
 """
-MathAlea French math multiple-choice benchmark for lighteval.
+name:
+MathAlea
 
-Evaluates LLMs on French secondary school math problems across 5 grade levels:
-cinquième, quatrième, troisième, première, terminale.
+dataset:
+OpenLLM-France/MathAleaMCQ
+
+abstract:
+MathAlea is a dataset of multiple-choice math questions for French middle and high school students.
+It covers a range of topics and difficulty levels, making it a valuable resource for evaluating the 
+mathematical reasoning capabilities of language models in the context of education.
+
+languages:
+french
+
+tags:
+math, question-answering, multiple-choice
+
+paper:
 
-Dataset: OpenLLM-BPI/MathAleaMCQ
 """
 
 from lighteval.metrics.metrics import Metrics

From e372a0f11f98ff623850f5fb591987dd486cebec Mon Sep 17 00:00:00 2001
From: lduignan <liam.duignan@cea.fr>
Date: Fri, 6 Mar 2026 14:07:10 +0100
Subject: [PATCH 4/6] Fix dataset reference in MathAlea metadata

---
 community_tasks/mathalea.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/community_tasks/mathalea.py b/community_tasks/mathalea.py
index 773b73745..c4eef8667 100644
--- a/community_tasks/mathalea.py
+++ b/community_tasks/mathalea.py
@@ -3,7 +3,7 @@
 MathAlea
 
 dataset:
-OpenLLM-France/MathAleaMCQ
+OpenLLM-BPI/MathAleaMCQ
 
 abstract:
 MathAlea is a dataset of multiple-choice math questions for French middle and high school students.

From d42f5fd426e8c3c62ada14012375a9a7f6198bf9 Mon Sep 17 00:00:00 2001
From: lduignan <liam.duignan@cea.fr>
Date: Wed, 11 Mar 2026 17:02:09 +0100
Subject: [PATCH 5/6] Refactor MathAlea dataset configuration and prompt
 generation functions

---
 community_tasks/mathalea.py | 107 ++++++++++++++++++------------------
 1 file changed, 55 insertions(+), 52 deletions(-)

diff --git a/community_tasks/mathalea.py b/community_tasks/mathalea.py
index c4eef8667..5260858e8 100644
--- a/community_tasks/mathalea.py
+++ b/community_tasks/mathalea.py
@@ -7,7 +7,7 @@
 
 abstract:
 MathAlea is a dataset of multiple-choice math questions for French middle and high school students.
-It covers a range of topics and difficulty levels, making it a valuable resource for evaluating the 
+It covers a range of topics and difficulty levels, making it a valuable resource for evaluating the
 mathematical reasoning capabilities of language models in the context of education.
 
 languages:
@@ -20,63 +20,55 @@
 
 """
 
-from lighteval.metrics.metrics import Metrics
-from lighteval.tasks.default_prompts import LETTER_INDICES
+import unicodedata
+
+from lighteval.metrics.dynamic_metrics import LogLikelihoodAccMetric
+from lighteval.metrics.normalizations import LogProbCharNorm, LogProbTokenNorm
 from lighteval.tasks.lighteval_task import LightevalTaskConfig
-from lighteval.tasks.requests import Doc
+from lighteval.tasks.multilingual.utils.task_utils import get_metrics_for_formulation
+from lighteval.tasks.templates.multichoice import get_mcq_prompt_function
+from lighteval.tasks.templates.utils.formulation import (
+    CFFormulation,
+    HybridFormulation,
+    MCFFormulation,
+)
+from lighteval.utils.language import Language
 
 
-GRADE_LEVELS = {
-    "cinquième": "cinquieme",
-    "quatrième": "quatrieme",
-    "troisième": "troisieme",
-    "première": "premiere",
-    "terminale": "terminale",
-}
+GRADE_LEVELS = ["cinquième", "quatrième", "troisième", "première", "terminale"]
 
 
-def prompt_mathalea(line, task_name: str = None):
-    """Build a multiple-choice prompt from a MathAlea dataset line."""
-    choices = line["choices"]
-    query = f"{line['question'].strip()}\n"
-    query += "".join(
-        f"{letter}. {choice}\n"
-        for letter, choice in zip(LETTER_INDICES, choices)
-    )
-    query += "Réponse :"
+def remove_accents(text: str) -> str:
+    return "".join(c for c in unicodedata.normalize("NFD", text) if unicodedata.category(c) != "Mn")
 
-    gold_index = int(line["answerKey"])
+FORMULATIONS = [MCFFormulation(), CFFormulation(), HybridFormulation()]
 
-    return Doc(
-        task_name=task_name,
-        query=query,
-        choices=[f" {LETTER_INDICES[i]}" for i in range(len(choices))],
-        gold_index=gold_index,
-    )
 
+def format_choice(choice):
+    if isinstance(choice, str):
+        if choice.endswith("\qquad"):
+            choice = choice[:-6].strip()
+        return choice.strip()
+    if isinstance(choice, list):
+        return [format_choice(c) for c in choice]
+    raise ValueError(f"Unsupported choice type: {type(choice)}")
 
-TASKS_TABLE = [
-    # Combined task: all grade levels at once
-    LightevalTaskConfig(
-        name="mathalea:all",
-        prompt_function=prompt_mathalea,
-        suite=["community"],
-        hf_repo="OpenLLM-BPI/MathAleaMCQ",
-        hf_subset="all",
-        hf_avail_splits=["dev", "test"],
-        evaluation_splits=["test"],
-        few_shots_split="dev",
-        few_shots_select="sequential",
-        generation_size=1,
-        metrics=[Metrics.loglikelihood_acc],
-        stop_sequence=["\n"],
-        version=0,
-    ),
-] + [
-    # Per-grade tasks
-    LightevalTaskConfig(
-        name=f"mathalea:{alias}",
-        prompt_function=prompt_mathalea,
+def format_question(question):
+    return question.replace("\\", "\n").strip()
+
+
+def _make_tasks(subset, alias, formulation):
+    return LightevalTaskConfig(
+        name=f"mathalea_{formulation.name.lower()}:{alias}",
+        prompt_function=get_mcq_prompt_function(
+            Language.FRENCH,
+            lambda line: {
+                "question": format_question(line["question"]),
+                "choices": format_choice(line["choices"]),
+                "gold_idx": int(line["answerKey"]),
+            },
+            formulation=formulation,
+        ),
         suite=["community"],
         hf_repo="OpenLLM-BPI/MathAleaMCQ",
         hf_subset=subset,
@@ -84,10 +76,21 @@ def prompt_mathalea(line, task_name: str = None):
         evaluation_splits=["test"],
         few_shots_split="dev",
         few_shots_select="sequential",
-        generation_size=1,
-        metrics=[Metrics.loglikelihood_acc],
+        generation_size=-1,
+        metrics=get_metrics_for_formulation(
+            formulation,
+            [
+                LogLikelihoodAccMetric(normalization=LogProbTokenNorm()),
+                LogLikelihoodAccMetric(normalization=LogProbCharNorm()),
+            ],
+        ),
         stop_sequence=["\n"],
         version=0,
     )
-    for subset, alias in GRADE_LEVELS.items()
+
+
+TASKS_TABLE = [
+    _make_tasks(subset, remove_accents(subset), formulation)
+    for subset in ["all"] + GRADE_LEVELS
+    for formulation in FORMULATIONS
 ]

From ce6848f24ea1688935c9bd850680d38e69af40b8 Mon Sep 17 00:00:00 2001
From: lduignan <liam.duignan@cea.fr>
Date: Mon, 23 Mar 2026 16:25:10 +0100
Subject: [PATCH 6/6] Add system prompts in French and English

---
 community_tasks/mathalea.py | 49 +++++++++++++++++++++++--------------
 1 file changed, 31 insertions(+), 18 deletions(-)

diff --git a/community_tasks/mathalea.py b/community_tasks/mathalea.py
index 5260858e8..792b76625 100644
--- a/community_tasks/mathalea.py
+++ b/community_tasks/mathalea.py
@@ -44,28 +44,40 @@ def remove_accents(text: str) -> str:
 FORMULATIONS = [MCFFormulation(), CFFormulation(), HybridFormulation()]
 
 
-def format_choice(choice):
-    if isinstance(choice, str):
-        if choice.endswith("\qquad"):
-            choice = choice[:-6].strip()
-        return choice.strip()
-    if isinstance(choice, list):
-        return [format_choice(c) for c in choice]
-    raise ValueError(f"Unsupported choice type: {type(choice)}")
+PROMPT_CONFIGS = {
+    "frprompt": {
+        "all": "Vous êtes un assistant mathématique pour les élèves du secondaire français.\n\n",
+        "grade": "Vous êtes un assistant mathématique pour les élèves de {subset}.\n\n",
+    },
+    "enprompt": {
+        "all": "You are a helpful math assistant for French secondary school students.\n\n",
+        "grade": "You are a helpful math assistant for French students in grade {subset}.\n\n",
+    },
+    "noprompt": None,
+}
+
+
+def _get_instruction(prompt_key, subset):
+    prompt_cfg = PROMPT_CONFIGS[prompt_key]
+    if prompt_cfg is None:
+        return None
+    if subset == "all":
+        return prompt_cfg["all"]
+    return prompt_cfg["grade"].format(subset=subset)
+
+
+def _make_tasks(subset, alias, formulation, prompt_key):
+    instruction = _get_instruction(prompt_key, subset)
 
-def format_question(question):
-    return question.replace("\\", "\n").strip()
-
-
-def _make_tasks(subset, alias, formulation):
     return LightevalTaskConfig(
-        name=f"mathalea_{formulation.name.lower()}:{alias}",
+        name=f"mathalea_{formulation.name.lower()}_{prompt_key}:{alias}",
         prompt_function=get_mcq_prompt_function(
             Language.FRENCH,
-            lambda line: {
-                "question": format_question(line["question"]),
-                "choices": format_choice(line["choices"]),
+            lambda line, instr=instruction: {
+                "question": line["question"],
+                "choices": line["choices"],
                 "gold_idx": int(line["answerKey"]),
+                **({"instruction": instr} if instr else {}),
             },
             formulation=formulation,
         ),
@@ -90,7 +102,8 @@ def _make_tasks(subset, alias, formulation):
 
 
 TASKS_TABLE = [
-    _make_tasks(subset, remove_accents(subset), formulation)
+    _make_tasks(subset, remove_accents(subset), formulation, prompt_key)
     for subset in ["all"] + GRADE_LEVELS
     for formulation in FORMULATIONS
+    for prompt_key in PROMPT_CONFIGS
 ]