Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions docs/tasks/CommonsenseQAMC_OLMES.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,8 @@
````
NAME = CommonsenseQAMC_OLMES
DATASET_PATH = tau/commonsense_qa
SAMPLE_SPLIT = validation
FEWSHOT_SPLIT = validation
SAMPLE_SPLIT = train
FEWSHOT_SPLIT = train
RESPONSE_TYPE = LOGLIKELIHOODS
METRICS = [AccuracyLoglikelihood, AccuracyNormLoglikelihood]
SUBJECTS = ['no_subject']
Expand Down
20 changes: 20 additions & 0 deletions docs/tasks/DropCompletion_OLMES.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
# DropCompletion_OLMES

````
NAME = DropCompletion_OLMES
DATASET_PATH = EleutherAI/drop
SAMPLE_SPLIT = validation
FEWSHOT_SPLIT = train
RESPONSE_TYPE = COMPLETION
METRICS = [DropF1ExactMatch]
SUBJECTS = ['no_subject']
LANGUAGE = <Language.ENG: 'English'>
````

- Module: `eval_framework.tasks.benchmarks.drop`

- File: [src/eval_framework/tasks/benchmarks/drop.py](../../src/eval_framework/tasks/benchmarks/drop.py) | [View on GitHub](https://github.com/Aleph-Alpha-Research/eval-framework/blob/main/src/eval_framework/tasks/benchmarks/drop.py)

- Link to dataset: [https://huggingface.co/datasets/EleutherAI/drop](https://huggingface.co/datasets/EleutherAI/drop)

More detailed documentation, with prompt examples and ground truth completions, can be generated with `uv run -m eval_framework.utils.generate_task_docs --add-prompt-examples --only-tasks "DropCompletion_OLMES"`.
2 changes: 1 addition & 1 deletion docs/tasks/MedQAMC_OLMES.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
NAME = MedQAMC_OLMES
DATASET_PATH = davidheineman/medqa-en
SAMPLE_SPLIT = test
FEWSHOT_SPLIT = dev
FEWSHOT_SPLIT = train
RESPONSE_TYPE = LOGLIKELIHOODS
METRICS = [AccuracyLoglikelihood, AccuracyNormLoglikelihood, BitsPerByteLoglikelihood]
SUBJECTS = ['no_subject']
Expand Down
4 changes: 2 additions & 2 deletions docs/tasks/PIQA_OLMES.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,8 @@
````
NAME = PIQA_OLMES
DATASET_PATH = ybisk/piqa
SAMPLE_SPLIT = validation
FEWSHOT_SPLIT = test
SAMPLE_SPLIT = train
FEWSHOT_SPLIT = train
RESPONSE_TYPE = LOGLIKELIHOODS
METRICS = [AccuracyLoglikelihood, AccuracyNormLoglikelihood, BitsPerByteLoglikelihood]
SUBJECTS = ['no_subject']
Expand Down
3 changes: 2 additions & 1 deletion docs/tasks/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

This directory contains the generated documentation for all benchmark tasks available in the package.

**Total number of tasks: 157**
**Total number of tasks: 158**

The documentation can be generated or updated with `uv run -m eval_framework.utils.generate_task_docs`.

Expand Down Expand Up @@ -42,6 +42,7 @@ NOTE: This is an automatically generated file. Any manual modifications will not
- [DUC_EXTRACTIVE](DUC_EXTRACTIVE.md)
- [DropCloze](DropCloze.md)
- [DropCompletion](DropCompletion.md)
- [DropCompletion_OLMES](DropCompletion_OLMES.md)
- [DropMC](DropMC.md)
- [DropMC_OLMES](DropMC_OLMES.md)
- [Flores200](Flores200.md)
Expand Down
4 changes: 2 additions & 2 deletions docs/tasks/SCIQ_OLMES.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,8 @@
````
NAME = SCIQ_OLMES
DATASET_PATH = allenai/sciq
SAMPLE_SPLIT = validation
FEWSHOT_SPLIT = test
SAMPLE_SPLIT = train
FEWSHOT_SPLIT = train
RESPONSE_TYPE = LOGLIKELIHOODS
METRICS = [AccuracyLoglikelihood, AccuracyNormLoglikelihood, BitsPerByteLoglikelihood]
SUBJECTS = ['no_subject']
Expand Down
2 changes: 1 addition & 1 deletion docs/tasks/SocialIQAMC_OLMES.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
````
NAME = SocialIQAMC_OLMES
DATASET_PATH = allenai/social_i_qa
SAMPLE_SPLIT = validation
SAMPLE_SPLIT = train
FEWSHOT_SPLIT = train
RESPONSE_TYPE = LOGLIKELIHOODS
METRICS = [AccuracyLoglikelihood, AccuracyNormLoglikelihood, BitsPerByteLoglikelihood]
Expand Down
16 changes: 13 additions & 3 deletions src/eval_framework/metrics/loglikelihood/bits_per_byte.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import math

from eval_framework.metrics.base import BaseMetric, MetricResult
from eval_framework.shared.types import Loglikelihood
from eval_framework.shared.types import Error, Loglikelihood


class BitsPerByteLoglikelihood(BaseMetric[Loglikelihood]):
Expand Down Expand Up @@ -37,7 +37,12 @@ def calculate(self, response: Loglikelihood) -> list[MetricResult]:
metric_name=self.NAME,
value=None,
higher_is_better=False,
error=response.error or "No ground-truth answer found in loglikelihoods",
error=response.error
or Error(
error_class="ValueError",
message="No ground-truth answer found in loglikelihoods",
traceback="",
),
)
]

Expand All @@ -48,7 +53,12 @@ def calculate(self, response: Loglikelihood) -> list[MetricResult]:
metric_name=self.NAME,
value=None,
higher_is_better=False,
error=response.error or "Ground-truth answer has zero UTF-8 bytes",
error=response.error
or Error(
error_class="ValueError",
message="Ground-truth answer has zero UTF-8 bytes",
traceback="",
),
)
]

Expand Down
2 changes: 2 additions & 0 deletions src/eval_framework/tasks/benchmarks/csqa.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,8 @@ class CommonsenseQAMC_OLMES(CommonsenseQAMC):
"""

NAME = "CommonsenseQAMC_OLMES"
SAMPLE_SPLIT = "train" # Use train split (largest) to best match OLMES, which evaluates all splits
FEWSHOT_SPLIT = "train"

def _get_instruction_text(self, item: dict[str, Any]) -> str:
question = item["question"]
Expand Down
37 changes: 29 additions & 8 deletions src/eval_framework/tasks/benchmarks/drop.py
Original file line number Diff line number Diff line change
Expand Up @@ -86,14 +86,21 @@ def __init__(self, num_fewshot: int = 0) -> None:

def _load_dataset(self, subject: str) -> None:
hf_dataset = self._load_hf_dataset(path=self.DATASET_PATH)
validation = list(hf_dataset.get(self.SAMPLE_SPLIT, []))
processed = []
for doc in validation:
parsed = _get_answers(doc)
if not parsed:
continue
processed.append({**doc, "parsed_answers": parsed})
self.dataset = self._shuffle_splits(hf_dataset={self.SAMPLE_SPLIT: processed, self.FEWSHOT_SPLIT: processed})

def process(docs: list[dict[str, Any]]) -> list[dict[str, Any]]:
    """Drop docs whose answers cannot be parsed; attach parsed answers to the rest.

    Each kept doc is a shallow copy of the original with an added
    ``parsed_answers`` key holding the output of ``_get_answers``.
    """
    kept: list[dict[str, Any]] = []
    for doc in docs:
        parsed_answers = _get_answers(doc)
        if parsed_answers:
            kept.append({**doc, "parsed_answers": parsed_answers})
    return kept

sample_split = process(hf_dataset.get(self.SAMPLE_SPLIT, []))
fewshot_split = process(hf_dataset.get(self.FEWSHOT_SPLIT, []))
self.dataset = self._shuffle_splits(
hf_dataset={self.SAMPLE_SPLIT: sample_split, self.FEWSHOT_SPLIT: fewshot_split}
)

def _get_instruction_text(self, item: dict[str, Any]) -> str:
passage = (item.get("passage") or "").strip()
Expand All @@ -116,6 +123,17 @@ def _get_context(self, item: dict[str, Any]) -> DropMetricContext | None:
return DropMetricContext(answer_tuples=[list(a) for a in answers])


class DropCompletion_OLMES(DropCompletion):
    """DropCompletion matching OLMES, using train split for fewshot and max tokens 100."""

    # Task identifier under which this variant is registered.
    NAME = "DropCompletion_OLMES"
    # Few-shot examples are drawn from the train split, matching the OLMES setup.
    FEWSHOT_SPLIT = "train"

    def __init__(self, num_fewshot: int = 0) -> None:
        super().__init__(num_fewshot)
        # OLMES caps generated completions at 100 tokens; set after the parent
        # __init__ so it overrides any default the base class establishes.
        self.max_tokens = 100


class DropMC(BaseTask[str]):
"""Multiple-choice variant using allenai/drop-gen2mc (passage_original, question_original, choices, answerKey)."""

Expand Down Expand Up @@ -151,6 +169,9 @@ def _get_possible_completions(self, item: dict[str, Any]) -> list[str] | None:
labels = item.get("choices", {}).get("label", [])
return [f" {label}" for label in labels]

def _get_cue_text(self, item: dict[str, Any]) -> str:
    """Return the fixed cue that prefixes the model's answer in the prompt."""
    cue_text = "Answer:"
    return cue_text


class DropMC_OLMES(DropMC):
"""
Expand Down
1 change: 1 addition & 0 deletions src/eval_framework/tasks/benchmarks/medqa.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,7 @@ class MedQAMC_OLMES(MedQAMC):
"""

NAME = "MedQAMC_OLMES"
FEWSHOT_SPLIT = "train"

def _get_instruction_text(self, item: dict[str, Any]) -> str:
question = item["question"]
Expand Down
4 changes: 3 additions & 1 deletion src/eval_framework/tasks/benchmarks/piqa.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,8 @@ class PIQA_OLMES(PIQA):
"""

NAME = "PIQA_OLMES"
SAMPLE_SPLIT = "train" # Use train split (largest) to best match OLMES, which evaluates all splits
FEWSHOT_SPLIT = "train"

def __init__(self, num_fewshot: int = 0) -> None:
super().__init__(num_fewshot)
Expand All @@ -61,7 +63,7 @@ def _get_instruction_text(self, item: dict[str, Any]) -> str:
goal = item["goal"]
choices = [item["sol1"], item["sol2"]]
options = "\n".join(f" {key}. {choice}" for key, choice in zip(self.keys, choices))
return f"Question: {goal}\n{options}\n"
return f"Goal: {goal}\n{options}\n"

def _get_ground_truth(self, item: dict[str, Any]) -> str | None:
idx = 0 if item["label"] == 0 else 1
Expand Down
2 changes: 2 additions & 0 deletions src/eval_framework/tasks/benchmarks/sciq.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,8 @@ class SCIQ_OLMES(SCIQ):
"""

NAME = "SciQ_OLMES"
SAMPLE_SPLIT = "train" # Use train split (largest) to best match OLMES, which evaluates all splits
FEWSHOT_SPLIT = "train"

def __init__(self, num_fewshot: int = 0) -> None:
super().__init__(num_fewshot)
Expand Down
13 changes: 12 additions & 1 deletion src/eval_framework/tasks/benchmarks/social_iqa.py
Original file line number Diff line number Diff line change
Expand Up @@ -155,7 +155,11 @@ class SocialIQACloze(BaseTask[str]):
SAMPLE_SPLIT = "validation"
FEWSHOT_SPLIT = "train"
RESPONSE_TYPE = ResponseType.LOGLIKELIHOODS
METRICS = [AccuracyLoglikelihood, AccuracyNormLoglikelihood, BitsPerByteLoglikelihood]
METRICS = [
AccuracyLoglikelihood,
AccuracyNormLoglikelihood,
BitsPerByteLoglikelihood,
]
SUBJECTS = [NO_SUBJECT]
PERTURBATION_UNMODIFIABLE_WORDS = ["Question"]
LANGUAGE = Language.ENG
Expand All @@ -174,6 +178,11 @@ def _get_ground_truth(self, item: dict[str, Any]) -> str | None:
choices = [item["answerA"], item["answerB"], item["answerC"]]
return f" {choices[idx]}"

def _get_fewshot_target_text(self, item: dict[str, Any]) -> str:
    """Build the few-shot target: the cue text immediately followed by the gold answer."""
    answer = self._get_ground_truth(item)
    # Few-shot items must always carry a ground-truth answer.
    assert answer is not None
    return self._get_cue_text(item) + answer

def _get_cue_text(self, item: dict[str, Any]) -> str:
    """Cue string placed before each answer in the prompt."""
    cue = "Answer:"
    return cue

Expand All @@ -189,6 +198,7 @@ class SocialIQAMC_OLMES(SocialIQACloze):
"""

NAME = "SocialIQAMC_OLMES"
SAMPLE_SPLIT = "train" # Use train split (largest) to best match OLMES, which evaluates all splits

def _get_instruction_text(self, item: dict[str, Any]) -> str:
query = _social_iqa_context_question(item)
Expand All @@ -214,6 +224,7 @@ class SocialIQAMC(SocialIQAMC_OLMES):
"""

NAME = "SocialIQAMC"
SAMPLE_SPLIT = "validation"

def _get_instruction_text(self, item: dict[str, Any]) -> str:
query = _social_iqa_context_question(item)
Expand Down
1 change: 1 addition & 0 deletions src/eval_framework/tasks/task_names.py
Original file line number Diff line number Diff line change
Expand Up @@ -158,6 +158,7 @@ def register_all_tasks() -> None:
register_lazy_task("eval_framework.tasks.benchmarks.csqa.CommonsenseQAMC")
register_lazy_task("eval_framework.tasks.benchmarks.csqa.CommonsenseQAMC_OLMES")
register_lazy_task("eval_framework.tasks.benchmarks.drop.DropCompletion")
register_lazy_task("eval_framework.tasks.benchmarks.drop.DropCompletion_OLMES")
register_lazy_task("eval_framework.tasks.benchmarks.drop.DropMC")
register_lazy_task("eval_framework.tasks.benchmarks.drop.DropMC_OLMES")
register_lazy_task("eval_framework.tasks.benchmarks.drop.DropCloze")
Expand Down
38 changes: 20 additions & 18 deletions tests/tests_eval_framework/tasks/task-prompts-hashes.json
Original file line number Diff line number Diff line change
Expand Up @@ -53,8 +53,8 @@
"CommonsenseQAFullTextCloze.Llama3Formatter": "7433c7dfb21e0da6ff0566c886bdd29c",
"CommonsenseQAMC.ConcatFormatter": "b49e21abf30a6c6b6f2d5665914fae6e",
"CommonsenseQAMC.Llama3Formatter": "1b97fe5fecd8a27e3f587693318bf17a",
"CommonsenseQAMC_OLMES.ConcatFormatter": "b49e21abf30a6c6b6f2d5665914fae6e",
"CommonsenseQAMC_OLMES.Llama3Formatter": "1b97fe5fecd8a27e3f587693318bf17a",
"CommonsenseQAMC_OLMES.ConcatFormatter": "ca6acb05cfe4bf09a0f2321c5561d293",
"CommonsenseQAMC_OLMES.Llama3Formatter": "f41e71ef8389525e730c5f8b41026cf5",
"DUC_ABSTRACTIVE.ConcatFormatter": "ab9a49f844aae7cd40470a9c8c1012ad",
"DUC_ABSTRACTIVE.Llama3Formatter": "b425e6e924f57ebbaeaa7a333d565d05",
"DUC_EXTRACTIVE.ConcatFormatter": "d773f051727e563369af29e183b2046b",
Expand All @@ -63,10 +63,12 @@
"DropCloze.Llama3Formatter": "3085d2db0aee52171eca445f2a710ed2",
"DropCompletion.ConcatFormatter": "5efb46197ee12236e2e674a5e9b93976",
"DropCompletion.Llama3Formatter": "a7c7ef7e39aefdef7d6227c46ccc6f05",
"DropMC.ConcatFormatter": "eb61c2c3e928559645a950d1f38b2bd8",
"DropMC.Llama3Formatter": "9ef3c50c1e2977a8c365b297e72e6077",
"DropMC_OLMES.ConcatFormatter": "581072d364a18d07164e7c36351b1992",
"DropMC_OLMES.Llama3Formatter": "cc33eace09b710fc53b51ba3851d52c9",
"DropCompletion_OLMES.ConcatFormatter": "388e4b193e8a87fc4207cff336687684",
"DropCompletion_OLMES.Llama3Formatter": "e285c67c3d807d9e667af0a56aeb95ec",
"DropMC.ConcatFormatter": "f50d96ea6246b9838ea0f20cec7f6167",
"DropMC.Llama3Formatter": "4a6849c53fdf51f8e6743f99716f9b24",
"DropMC_OLMES.ConcatFormatter": "798cdb2ca51bcb8c95cb7d086f2ca6cf",
"DropMC_OLMES.Llama3Formatter": "abf0fa6d3b668f00df6cd8fbe6dcf611",
"DropRC.ConcatFormatter": "b4fc7afed1c5d3fc7b185abfee10a94b",
"DropRC.Llama3Formatter": "9da70cae44e38ea15e4144f50395b4a4",
"Flores200.ConcatFormatter": "f9cc5b93d0611404471d88dd64aa6c19",
Expand Down Expand Up @@ -200,8 +202,8 @@
"MedQACloze.Llama3Formatter": "1308150c38062e8c34785f12e89b8c63",
"MedQAMC.ConcatFormatter": "786270f80b7b122fa0098ffbbfdba446",
"MedQAMC.Llama3Formatter": "fac0fd8a8d5d70e54686dea58f57fcb3",
"MedQAMC_OLMES.ConcatFormatter": "b3c29f76fa80e85d027d1ba3d6750478",
"MedQAMC_OLMES.Llama3Formatter": "c99d9ddb97aa1562705e810b1aff2e32",
"MedQAMC_OLMES.ConcatFormatter": "d5bac9bda6fc752ecc751be2cc23474b",
"MedQAMC_OLMES.Llama3Formatter": "40f759deb287df3adb3247b07c857efd",
"NaturalQsOpen.ConcatFormatter": "90a895244f9cd20ca3200808ff5ddbab",
"NaturalQsOpen.Llama3Formatter": "0955486e04308ca936b2c5267e5e9be9",
"NaturalQsOpenCloze.ConcatFormatter": "49042482667eeaa9363c6247927f642f",
Expand All @@ -226,8 +228,8 @@
"PIQA.Llama3Formatter": "9d33895f4024a9a1e61a00626e312313",
"PIQA_IDK.ConcatFormatter": "4c07fad4f1409a22bc9835eb848feecb",
"PIQA_IDK.Llama3Formatter": "94061c00140859fb99aa88a1e63b1b23",
"PIQA_OLMES.ConcatFormatter": "b5dc0e23810e4bb4801d07fdd54e6407",
"PIQA_OLMES.Llama3Formatter": "087eb23ebf6a54d5d286f59713669ad8",
"PIQA_OLMES.ConcatFormatter": "dd8a928404989320ce7b712d149bf5f5",
"PIQA_OLMES.Llama3Formatter": "e605ca9fb312e9309206e5aeba259169",
"QUALITY.ConcatFormatter": "49e990c091cf8b46b3afd1ae8db84797",
"QUALITY.Llama3Formatter": "408165ab35d5153c9f144021cbd27544",
"RenderableStructEval.ConcatFormatter": "40bd2d2bdd9e663bf1cbc147eceafced",
Expand All @@ -240,8 +242,8 @@
"SCIQEvalHarness_IDK.Llama3Formatter": "3e8e56b3b7662cff6ea2943e1bb88ba0",
"SCIQ_IDK.ConcatFormatter": "ffc25631a289d4dd536a5b6660cfb17c",
"SCIQ_IDK.Llama3Formatter": "bffb5e3d9a9f1b18a96fd74659c43c27",
"SCIQ_OLMES.ConcatFormatter": "a3f16028e92d9690c58522246032fee8",
"SCIQ_OLMES.Llama3Formatter": "7d5edb428b339cefb272907c3a368621",
"SCIQ_OLMES.ConcatFormatter": "820f12c83c89dfe8bbe19e062c37d259",
"SCIQ_OLMES.Llama3Formatter": "4172e30c45aa2105e1db0cd49dcf9cb7",
"SPHYR.ConcatFormatter": "f6b6f1044fda6325b744281256a81ba9",
"SPHYR.Llama3Formatter": "e67de80f6465104b6e511c5d339b9e45",
"SQUAD.ConcatFormatter": "88bf0d58404c024dd885217d19466f0d",
Expand All @@ -250,12 +252,12 @@
"SQUAD2.Llama3Formatter": "118f3757859029bee7da4bdfdb9fdf0b",
"SQUAD2BPB.ConcatFormatter": "a1ae38489334319a2d0ef3a8c503ec4c",
"SQUAD2BPB.Llama3Formatter": "097cea2ecf27f0d24b52633fa3ce363a",
"SocialIQACloze.ConcatFormatter": "9bd18f84263ae1dd28ea1099d1392b32",
"SocialIQACloze.Llama3Formatter": "10b3c20fe7a78b803377caef818fb31f",
"SocialIQAMC.ConcatFormatter": "1659f621b1dc4f410536d55f2e81c932",
"SocialIQAMC.Llama3Formatter": "c5ed5cd29b021fa0c71061905c48dfc1",
"SocialIQAMC_OLMES.ConcatFormatter": "7d3017ef98a2fe64ae8439555c3c1440",
"SocialIQAMC_OLMES.Llama3Formatter": "a49d524f98ea43d1deafc7b833c34529",
"SocialIQACloze.ConcatFormatter": "e20a2c81592da55f2d6e6c88d3a7104f",
"SocialIQACloze.Llama3Formatter": "1d4ddde3cc8857a459bd37f7b90ed9db",
"SocialIQAMC.ConcatFormatter": "96f91645eace7a45a93d37e49451a64d",
"SocialIQAMC.Llama3Formatter": "e0d30544bde34934a4695bc20698c882",
"SocialIQAMC_OLMES.ConcatFormatter": "e5e5dac8348978b6b8e0311c66fa1877",
"SocialIQAMC_OLMES.Llama3Formatter": "ac1a7164ab0a1b4eb6d6a5ed055b4be0",
"StructEval.ConcatFormatter": "280bb4a5e01945c4d25688bbb17468b5",
"StructEval.Llama3Formatter": "3441d8806336ce590e09c9681d8c8f6f",
"TRIVIAQA.ConcatFormatter": "4ee69bbf9d5ade2e7ed61e7455b06b0b",
Expand Down