From 0d59c8d5d1315c0859ab8e9ca1189f081762c101 Mon Sep 17 00:00:00 2001
From: lduignan
Date: Tue, 17 Feb 2026 13:58:34 +0100
Subject: [PATCH 1/6] Add MathAlea benchmark for French math multiple-choice evaluation

---
 community_tasks/mathalea.py | 80 +++++++++++++++++++++++++++++++++++++
 1 file changed, 80 insertions(+)
 create mode 100644 community_tasks/mathalea.py

diff --git a/community_tasks/mathalea.py b/community_tasks/mathalea.py
new file mode 100644
index 000000000..823c21f67
--- /dev/null
+++ b/community_tasks/mathalea.py
@@ -0,0 +1,80 @@
+"""
+MathAlea French math multiple-choice benchmark for lighteval.
+
+Evaluates LLMs on French secondary school math problems across 5 grade levels:
+cinquième, quatrième, troisième, première, terminale.
+
+Dataset: OpenLLM-BPI/MathAleaMCQ
+"""
+
+from lighteval.metrics.metrics import Metrics
+from lighteval.tasks.default_prompts import LETTER_INDICES
+from lighteval.tasks.lighteval_task import LightevalTaskConfig
+from lighteval.tasks.requests import Doc
+
+
+GRADE_LEVELS = {
+    "cinquième": "cinquieme",
+    "quatrième": "quatrieme",
+    "troisième": "troisieme",
+    "première": "premiere",
+    "terminale": "terminale",
+}
+
+
+def prompt_mathalea(line, task_name: str = None):
+    """Build a multiple-choice prompt from a MathAlea dataset line."""
+    choices = line["choices"]
+    query = f"{line['question'].strip()}\n"
+    query += "".join(
+        f"{letter}. {choice}\n"
+        for letter, choice in zip(LETTER_INDICES, choices)
+    )
+    query += "Réponse :"
+
+    gold_index = LETTER_INDICES.index(line["answerKey"])
+
+    return Doc(
+        task_name=task_name,
+        query=query,
+        choices=[f" {LETTER_INDICES[i]}" for i in range(len(choices))],
+        gold_index=gold_index,
+    )
+
+
+TASKS_TABLE = [
+    # Combined task: all grade levels at once
+    LightevalTaskConfig(
+        name="mathalea:all",
+        prompt_function=prompt_mathalea,
+        suite=["community"],
+        hf_repo="OpenLLM-BPI/MathAleaMCQ",
+        hf_subset="all",
+        hf_avail_splits=["dev", "test"],
+        evaluation_splits=["test"],
+        few_shots_split="dev",
+        few_shots_select="sequential",
+        generation_size=1,
+        metrics=[Metrics.loglikelihood_acc],
+        stop_sequence=["\n"],
+        version=0,
+    ),
+] + [
+    # Per-grade tasks
+    LightevalTaskConfig(
+        name=f"mathalea:{alias}",
+        prompt_function=prompt_mathalea,
+        suite=["community"],
+        hf_repo="OpenLLM-BPI/MathAleaMCQ",
+        hf_subset=subset,
+        hf_avail_splits=["dev", "test"],
+        evaluation_splits=["test"],
+        few_shots_split="dev",
+        few_shots_select="sequential",
+        generation_size=1,
+        metrics=[Metrics.loglikelihood_acc],
+        stop_sequence=["\n"],
+        version=0,
+    )
+    for subset, alias in GRADE_LEVELS.items()
+]

From 78599934d6ee8cf424fd8a142d53f8fef7d26788 Mon Sep 17 00:00:00 2001
From: lduignan
Date: Wed, 18 Feb 2026 19:42:00 +0100
Subject: [PATCH 2/6] Fix gold index retrieval in prompt_mathalea function

---
 community_tasks/mathalea.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/community_tasks/mathalea.py b/community_tasks/mathalea.py
index 823c21f67..f6e4d9fbf 100644
--- a/community_tasks/mathalea.py
+++ b/community_tasks/mathalea.py
@@ -32,7 +32,7 @@ def prompt_mathalea(line, task_name: str = None):
     )
     query += "Réponse :"
 
-    gold_index = LETTER_INDICES.index(line["answerKey"])
+    gold_index = int(line["answerKey"])
 
     return Doc(
         task_name=task_name,

From 335454193d2940e458eaa689b2deb7c58d6e3086 Mon Sep 17 00:00:00 2001
From: lduignan
Date: Fri, 6 Mar 2026 13:42:38 +0100
Subject: [PATCH 3/6] Update MathAlea metadata with detailed description, language, and tags

---
 community_tasks/mathalea.py | 21 +++++++++++++++++----
 1 file changed, 17 insertions(+), 4 deletions(-)

diff --git a/community_tasks/mathalea.py b/community_tasks/mathalea.py
index f6e4d9fbf..773b73745 100644
--- a/community_tasks/mathalea.py
+++ b/community_tasks/mathalea.py
@@ -1,10 +1,23 @@
 """
-MathAlea French math multiple-choice benchmark for lighteval.
+name:
+MathAlea
 
-Evaluates LLMs on French secondary school math problems across 5 grade levels:
-cinquième, quatrième, troisième, première, terminale.
+dataset:
+OpenLLM-France/MathAleaMCQ
+
+abstract:
+MathAlea is a dataset of multiple-choice math questions for French middle and high school students.
+It covers a range of topics and difficulty levels, making it a valuable resource for evaluating the 
+mathematical reasoning capabilities of language models in the context of education.
+
+languages:
+french
+
+tags:
+math, question-answering, multiple-choice
+
+paper:
 
-Dataset: OpenLLM-BPI/MathAleaMCQ
 """
 
 from lighteval.metrics.metrics import Metrics

From e372a0f11f98ff623850f5fb591987dd486cebec Mon Sep 17 00:00:00 2001
From: lduignan
Date: Fri, 6 Mar 2026 14:07:10 +0100
Subject: [PATCH 4/6] Fix dataset reference in MathAlea metadata

---
 community_tasks/mathalea.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/community_tasks/mathalea.py b/community_tasks/mathalea.py
index 773b73745..c4eef8667 100644
--- a/community_tasks/mathalea.py
+++ b/community_tasks/mathalea.py
@@ -3,7 +3,7 @@
 MathAlea
 
 dataset:
-OpenLLM-France/MathAleaMCQ
+OpenLLM-BPI/MathAleaMCQ
 
 abstract:
 MathAlea is a dataset of multiple-choice math questions for French middle and high school students.

From d42f5fd426e8c3c62ada14012375a9a7f6198bf9 Mon Sep 17 00:00:00 2001
From: lduignan
Date: Wed, 11 Mar 2026 17:02:09 +0100
Subject: [PATCH 5/6] Refactor MathAlea dataset configuration and prompt generation functions

---
 community_tasks/mathalea.py | 107 ++++++++++++++++++------------------
 1 file changed, 55 insertions(+), 52 deletions(-)

diff --git a/community_tasks/mathalea.py b/community_tasks/mathalea.py
index c4eef8667..5260858e8 100644
--- a/community_tasks/mathalea.py
+++ b/community_tasks/mathalea.py
@@ -7,7 +7,7 @@
 
 abstract:
 MathAlea is a dataset of multiple-choice math questions for French middle and high school students.
-It covers a range of topics and difficulty levels, making it a valuable resource for evaluating the 
+It covers a range of topics and difficulty levels, making it a valuable resource for evaluating the
 mathematical reasoning capabilities of language models in the context of education.
 
 languages:
@@ -20,63 +20,55 @@
 
 """
 
-from lighteval.metrics.metrics import Metrics
-from lighteval.tasks.default_prompts import LETTER_INDICES
+import unicodedata
+
+from lighteval.metrics.dynamic_metrics import LogLikelihoodAccMetric
+from lighteval.metrics.normalizations import LogProbCharNorm, LogProbTokenNorm
 from lighteval.tasks.lighteval_task import LightevalTaskConfig
-from lighteval.tasks.requests import Doc
+from lighteval.tasks.multilingual.utils.task_utils import get_metrics_for_formulation
+from lighteval.tasks.templates.multichoice import get_mcq_prompt_function
+from lighteval.tasks.templates.utils.formulation import (
+    CFFormulation,
+    HybridFormulation,
+    MCFFormulation,
+)
+from lighteval.utils.language import Language
 
 
-GRADE_LEVELS = {
-    "cinquième": "cinquieme",
-    "quatrième": "quatrieme",
-    "troisième": "troisieme",
-    "première": "premiere",
-    "terminale": "terminale",
-}
+GRADE_LEVELS = ["cinquième", "quatrième", "troisième", "première", "terminale"]
 
 
-def prompt_mathalea(line, task_name: str = None):
-    """Build a multiple-choice prompt from a MathAlea dataset line."""
-    choices = line["choices"]
-    query = f"{line['question'].strip()}\n"
-    query += "".join(
-        f"{letter}. {choice}\n"
-        for letter, choice in zip(LETTER_INDICES, choices)
-    )
-    query += "Réponse :"
+def remove_accents(text: str) -> str:
+    return "".join(c for c in unicodedata.normalize("NFD", text) if unicodedata.category(c) != "Mn")
 
-    gold_index = int(line["answerKey"])
+FORMULATIONS = [MCFFormulation(), CFFormulation(), HybridFormulation()]
 
-    return Doc(
-        task_name=task_name,
-        query=query,
-        choices=[f" {LETTER_INDICES[i]}" for i in range(len(choices))],
-        gold_index=gold_index,
-    )
 
+def format_choice(choice):
+    if isinstance(choice, str):
+        if choice.endswith("\qquad"):
+            choice = choice[:-6].strip()
+        return choice.strip()
+    if isinstance(choice, list):
+        return [format_choice(c) for c in choice]
+    raise ValueError(f"Unsupported choice type: {type(choice)}")
 
-TASKS_TABLE = [
-    # Combined task: all grade levels at once
-    LightevalTaskConfig(
-        name="mathalea:all",
-        prompt_function=prompt_mathalea,
-        suite=["community"],
-        hf_repo="OpenLLM-BPI/MathAleaMCQ",
-        hf_subset="all",
-        hf_avail_splits=["dev", "test"],
-        evaluation_splits=["test"],
-        few_shots_split="dev",
-        few_shots_select="sequential",
-        generation_size=1,
-        metrics=[Metrics.loglikelihood_acc],
-        stop_sequence=["\n"],
-        version=0,
-    ),
-] + [
-    # Per-grade tasks
-    LightevalTaskConfig(
-        name=f"mathalea:{alias}",
-        prompt_function=prompt_mathalea,
+def format_question(question):
+    return question.replace("\\", "\n").strip()
+
+
+def _make_tasks(subset, alias, formulation):
+    return LightevalTaskConfig(
+        name=f"mathalea_{formulation.name.lower()}:{alias}",
+        prompt_function=get_mcq_prompt_function(
+            Language.FRENCH,
+            lambda line: {
+                "question": format_question(line["question"]),
+                "choices": format_choice(line["choices"]),
+                "gold_idx": int(line["answerKey"]),
+            },
+            formulation=formulation,
+        ),
         suite=["community"],
         hf_repo="OpenLLM-BPI/MathAleaMCQ",
         hf_subset=subset,
@@ -84,10 +76,21 @@ def prompt_mathalea(line, task_name: str = None):
         evaluation_splits=["test"],
         few_shots_split="dev",
         few_shots_select="sequential",
-        generation_size=1,
-        metrics=[Metrics.loglikelihood_acc],
+        generation_size=-1,
+        metrics=get_metrics_for_formulation(
+            formulation,
+            [
+                LogLikelihoodAccMetric(normalization=LogProbTokenNorm()),
+                LogLikelihoodAccMetric(normalization=LogProbCharNorm()),
+            ],
+        ),
         stop_sequence=["\n"],
         version=0,
     )
-    for subset, alias in GRADE_LEVELS.items()
+
+
+TASKS_TABLE = [
+    _make_tasks(subset, remove_accents(subset), formulation)
+    for subset in ["all"] + GRADE_LEVELS
+    for formulation in FORMULATIONS
 ]

From ce6848f24ea1688935c9bd850680d38e69af40b8 Mon Sep 17 00:00:00 2001
From: lduignan
Date: Mon, 23 Mar 2026 16:25:10 +0100
Subject: [PATCH 6/6] add system prompts in french and english

---
 community_tasks/mathalea.py | 49 +++++++++++++++++++++++--------------
 1 file changed, 31 insertions(+), 18 deletions(-)

diff --git a/community_tasks/mathalea.py b/community_tasks/mathalea.py
index 5260858e8..792b76625 100644
--- a/community_tasks/mathalea.py
+++ b/community_tasks/mathalea.py
@@ -44,28 +44,40 @@ def remove_accents(text: str) -> str:
 FORMULATIONS = [MCFFormulation(), CFFormulation(), HybridFormulation()]
 
 
-def format_choice(choice):
-    if isinstance(choice, str):
-        if choice.endswith("\qquad"):
-            choice = choice[:-6].strip()
-        return choice.strip()
-    if isinstance(choice, list):
-        return [format_choice(c) for c in choice]
-    raise ValueError(f"Unsupported choice type: {type(choice)}")
+PROMPT_CONFIGS = {
+    "frprompt": {
+        "all": "Vous êtes un assistant mathématique pour les élèves du secondaire français.\n\n",
+        "grade": "Vous êtes un assistant mathématique pour les élèves de {subset}.\n\n",
+    },
+    "enprompt": {
+        "all": "You are a helpful math assistant for French secondary school students.\n\n",
+        "grade": "You are a helpful math assistant for French students in grade {subset}.\n\n",
+    },
+    "noprompt": None,
+}
+
+
+def _get_instruction(prompt_key, subset):
+    prompt_cfg = PROMPT_CONFIGS[prompt_key]
+    if prompt_cfg is None:
+        return None
+    if subset == "all":
+        return prompt_cfg["all"]
+    return prompt_cfg["grade"].format(subset=subset)
+
+
+def _make_tasks(subset, alias, formulation, prompt_key):
+    instruction = _get_instruction(prompt_key, subset)
 
-def format_question(question):
-    return question.replace("\\", "\n").strip()
-
-
-def _make_tasks(subset, alias, formulation):
     return LightevalTaskConfig(
-        name=f"mathalea_{formulation.name.lower()}:{alias}",
+        name=f"mathalea_{formulation.name.lower()}_{prompt_key}:{alias}",
         prompt_function=get_mcq_prompt_function(
             Language.FRENCH,
-            lambda line: {
-                "question": format_question(line["question"]),
-                "choices": format_choice(line["choices"]),
+            lambda line, instr=instruction: {
+                "question": line["question"],
+                "choices": line["choices"],
                 "gold_idx": int(line["answerKey"]),
+                **({"instruction": instr} if instr else {}),
             },
             formulation=formulation,
         ),
@@ -90,7 +102,8 @@ def _make_tasks(subset, alias, formulation):
 
 
 TASKS_TABLE = [
-    _make_tasks(subset, remove_accents(subset), formulation)
+    _make_tasks(subset, remove_accents(subset), formulation, prompt_key)
     for subset in ["all"] + GRADE_LEVELS
     for formulation in FORMULATIONS
+    for prompt_key in PROMPT_CONFIGS
 ]