Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 5 additions & 4 deletions examples/custom_tasks/benchmark_template.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@

from eval_framework.metrics.completion_metrics import AccuracyCompletion # Import your metrics
from eval_framework.tasks.base import BaseTask, ResponseType, Sample
from eval_framework.tasks.dataloader import Dataloader


class YourBenchmarkTask(BaseTask[str]): # Replace with your class name
Expand All @@ -30,8 +31,8 @@ class YourBenchmarkTask(BaseTask[str]): # Replace with your class name
METRICS = [AccuracyCompletion] # List your metric classes
SUBJECTS = ["subject1", "subject2"] # Define your subjects/categories

def __init__(self, subjects: list[str] | None = None, num_fewshot: int = 0) -> None:
super().__init__(num_fewshot)
def __init__(self, dataloader: Dataloader, subjects: list[str] | None = None, num_fewshot: int = 0) -> None:
super().__init__(num_fewshot=num_fewshot, dataloader=dataloader)
self.subjects = subjects or self.SUBJECTS

def _get_instruction_text(self, item: dict[str, Any]) -> str:
Expand Down Expand Up @@ -94,8 +95,8 @@ class GeographyQATask(BaseTask[str]):
METRICS = [AccuracyCompletion]
SUBJECTS = ["Europe", "Asia"]

def __init__(self, subjects: list[str] | None = None, num_fewshot: int = 0) -> None:
super().__init__(num_fewshot)
def __init__(self, dataloader: Dataloader, subjects: list[str] | None = None, num_fewshot: int = 0) -> None:
super().__init__(num_fewshot=num_fewshot, dataloader=dataloader)
self.subjects = subjects or self.SUBJECTS

def _get_instruction_text(self, item: dict[str, Any]) -> str:
Expand Down
7 changes: 5 additions & 2 deletions src/eval_framework/response_generator.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@
from functools import partial
from typing import Any, Callable, List

from eval_framework.tasks.dataloader import HFDataloader

try:
from determined import get_cluster_info
except ImportError:
Expand Down Expand Up @@ -52,16 +54,17 @@ def __init__(self, llm: BaseLLM, config: EvalConfig, result_processor: ResultsFi
self.result_processor = result_processor
self.num_samples = config.num_samples
self.save_intermediate_results = config.save_intermediate_results
self.dataloader = HFDataloader()
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is the idea that we keep this hard-coded for now, and only refactor so that a dataloader can be injected into the ResponseGenerator once new dataloaders are actually needed?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm wondering about the requirements: should a single task be loadable from HF as well as from some other source using the same path (just by switching the dataloader), or are certain tasks to be loaded from HF and other tasks from some other source?


task_class = config.task_name.value
task_class.SUBJECTS = self._filter_task_subjects()
task_class.HF_REVISION = self._set_hf_revision()

if config.perturbation_config is not None:
perturbation_task_class = create_perturbation_class(task_class, config.perturbation_config)
self.task = perturbation_task_class(self.few_shot)
self.task = perturbation_task_class(num_fewshot=self.few_shot, dataloader=self.dataloader)
else:
self.task = task_class(self.few_shot)
self.task = task_class(num_fewshot=self.few_shot, dataloader=self.dataloader)

self.response_type = task_class.RESPONSE_TYPE

Expand Down
3 changes: 2 additions & 1 deletion src/eval_framework/task_names.py
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,7 @@
ZERO_SCROLLS_SPACE_DIGEST,
ZERO_SCROLLS_SQUALITY,
)
from eval_framework.tasks.dataloader import HFDataloader

logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -224,7 +225,7 @@ def _check_no_duplicate_names(cls) -> None:

def make_sure_all_hf_datasets_are_in_cache() -> None:
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This was not part of this PR, but I realized this function is only used in the CI. Maybe a comment noting that would be appropriate. What do you think?

for task_name in TaskName:
task = task_name.value()
task = task_name.value(dataloader=HFDataloader())
for attempt in range(100):
try:
for _ in task.iterate_samples(num_samples=1):
Expand Down
46 changes: 10 additions & 36 deletions src/eval_framework/tasks/base.py
Original file line number Diff line number Diff line change
@@ -1,17 +1,14 @@
import os
import random
from abc import ABC, abstractmethod
from enum import Enum
from pathlib import Path
from typing import TYPE_CHECKING, Any, Generic, Iterable, TypeVar
from typing import TYPE_CHECKING, Any, Generic, Iterable, TypeVar, cast

import iso639
from datasets import DownloadConfig, load_dataset
from huggingface_hub import HfApi
from huggingface_hub.errors import RevisionNotFoundError
from datasets import Features
from pydantic import BaseModel, ConfigDict

from eval_framework.shared.types import BaseMetricContext
from eval_framework.tasks.dataloader import Dataloader
from template_formatting.formatter import Message, Role

if TYPE_CHECKING:
Expand Down Expand Up @@ -81,6 +78,7 @@ class BaseTask(ABC, Generic[SubjectType]):
METRICS: list[type["BaseMetric"]]
SUBJECTS: list[SubjectType]
HF_REVISION: str | None = None # tag name, or branch name, or commit hash to ensure reproducibility
FEATURES: Features | None = None

# Words in _get_instruction_text() not to be perturbed. List of words is case insensitive. No special characters
# or whitespace should be included.
Expand All @@ -89,41 +87,17 @@ class BaseTask(ABC, Generic[SubjectType]):
# language by subtopic, or `None` (for tasks not specific to a single language).
LANGUAGE: Language | dict[str, Language] | dict[str, tuple[Language, Language]] | None

def __init__(self, num_fewshot: int = 0) -> None:
def __init__(self, dataloader: Dataloader, num_fewshot: int = 0) -> None:
self.num_fewshot = num_fewshot
self.stop_sequences: list[str] | None = None
self.max_tokens: int | None = None

def _load_hf_dataset(self, **kwargs: Any) -> Any:
# Check if the HF_REVISION is valid before loading the dataset
if self.HF_REVISION:
try:
_ = HfApi().dataset_info(repo_id=kwargs["path"], revision=self.HF_REVISION, timeout=100.0)
except Exception as e:
if isinstance(e, RevisionNotFoundError):
raise e

cache_dir: str = os.environ.get("HF_DATASET_CACHE_DIR", f"{Path.home()}/.cache/huggingface/datasets")
download_config = DownloadConfig(cache_dir=cache_dir, max_retries=5)
try:
return load_dataset(
**kwargs,
revision=self.HF_REVISION,
trust_remote_code=True,
cache_dir=cache_dir,
download_config=download_config,
)
except Exception:
return load_dataset(
**kwargs,
revision=self.HF_REVISION,
trust_remote_code=True,
cache_dir=f"{Path.home()}/.cache/eval-framework",
)
self.dataloader = dataloader

def _load_dataset(self, subject: SubjectType) -> None:
name = subject if subject != NO_SUBJECT else None
hf_dataset = self._load_hf_dataset(path=self.DATASET_PATH, name=name)
name = cast(str, subject) if subject != NO_SUBJECT else None
hf_dataset = self.dataloader.load(
path=self.DATASET_PATH, name=name, revision=self.HF_REVISION, features=self.FEATURES
)
self.dataset = {}
self.rnd = random.Random(RANDOM_SEED)

Expand Down
5 changes: 3 additions & 2 deletions src/eval_framework/tasks/benchmarks/arc.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
AccuracyNormLoglikelihood,
)
from eval_framework.tasks.base import BaseTask, Language, ResponseType
from eval_framework.tasks.dataloader import Dataloader
from eval_framework.tasks.utils import get_n_letters


Expand All @@ -21,8 +22,8 @@ class ARC(BaseTask[str]):
PERTURBATION_UNMODIFIABLE_WORDS = ["Question"] + get_n_letters(5)
LANGUAGE = Language.ENG

def __init__(self, num_fewshot: int = 0) -> None:
super().__init__(num_fewshot)
def __init__(self, dataloader: Dataloader, num_fewshot: int = 0) -> None:
super().__init__(num_fewshot=num_fewshot, dataloader=dataloader)

self.keys = get_n_letters(5) # needs to be 5 because there is one sample with 5 answer possibilities
self.num_to_letter = {str(i): letter for i, letter in enumerate(self.keys, start=1)}
Expand Down
5 changes: 3 additions & 2 deletions src/eval_framework/tasks/benchmarks/arc_de.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
AccuracyNormLoglikelihood,
)
from eval_framework.tasks.base import NO_SUBJECT, BaseTask, Language, ResponseType
from eval_framework.tasks.dataloader import Dataloader
from eval_framework.tasks.utils import get_n_letters


Expand All @@ -21,8 +22,8 @@ class ARC_DE(BaseTask[str]):
PERTURBATION_UNMODIFIABLE_WORDS = ["Frage"] + get_n_letters(5)
LANGUAGE = Language.DEU

def __init__(self, num_fewshot: int = 0) -> None:
super().__init__(num_fewshot)
def __init__(self, dataloader: Dataloader, num_fewshot: int = 0) -> None:
super().__init__(num_fewshot=num_fewshot, dataloader=dataloader)

self.keys = get_n_letters(5) # needs to be 5 because there is one sample with 5 answer possibilities
self.num_to_letter = {str(i): letter for i, letter in enumerate(self.keys, start=1)}
Expand Down
5 changes: 3 additions & 2 deletions src/eval_framework/tasks/benchmarks/arc_fi.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
AccuracyNormLoglikelihood,
)
from eval_framework.tasks.base import BaseTask, Language, ResponseType
from eval_framework.tasks.dataloader import Dataloader
from eval_framework.tasks.utils import get_n_letters


Expand All @@ -21,8 +22,8 @@ class ARC_FI(BaseTask[str]):
PERTURBATION_UNMODIFIABLE_WORDS = ["Question"] + get_n_letters(5)
LANGUAGE = Language.FIN

def __init__(self, num_fewshot: int = 0) -> None:
super().__init__(num_fewshot)
def __init__(self, dataloader: Dataloader, num_fewshot: int = 0) -> None:
super().__init__(num_fewshot=num_fewshot, dataloader=dataloader)

self.keys = get_n_letters(5) # needs to be 5 because there is one sample with 5 answer possibilities
self.num_to_letter = {str(i): letter for i, letter in enumerate(self.keys, start=1)}
Expand Down
6 changes: 3 additions & 3 deletions src/eval_framework/tasks/benchmarks/belebele.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
AccuracyNormLoglikelihood,
)
from eval_framework.tasks.base import BaseTask, Language, ResponseType
from eval_framework.tasks.dataloader import Dataloader
from eval_framework.tasks.utils import get_n_letters


Expand All @@ -23,9 +24,8 @@ class BELEBELE(BaseTask[str]):
PERTURBATION_UNMODIFIABLE_WORDS = ["Question", "Answer"] + get_n_letters(4)
LANGUAGE = Language.ENG

def __init__(self, num_fewshot: int = 0) -> None:
super().__init__(num_fewshot)

def __init__(self, dataloader: Dataloader, num_fewshot: int = 0) -> None:
super().__init__(num_fewshot=num_fewshot, dataloader=dataloader)
self.keys = get_n_letters(4)
self.num_to_letter = {str(i): letter for i, letter in enumerate(self.keys, start=1)}

Expand Down
7 changes: 4 additions & 3 deletions src/eval_framework/tasks/benchmarks/bigcodebench.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
Sample,
SubjectType,
)
from eval_framework.tasks.dataloader import Dataloader

PROMPT_INSTRUCTION = (
"Please provide a self-contained Python script, without tests or example usage, that solves the following "
Expand All @@ -39,12 +40,12 @@ class BigCodeBench(BaseTask[str]):
SUBJECTS = ["original", "calibrated"]
LANGUAGE = Language.ENG

def __init__(self, num_fewshot: int = 0) -> None:
def __init__(self, dataloader: Dataloader, num_fewshot: int = 0) -> None:
assert num_fewshot == 0, "Fewshot is not supported for BigCodeBench"
super().__init__(num_fewshot)
super().__init__(num_fewshot=num_fewshot, dataloader=dataloader)

def _load_dataset(self, subject: SubjectType) -> None:
hf_dataset = self._load_hf_dataset(path=self.DATASET_PATH, name=None)
hf_dataset = self.dataloader.load(path=self.DATASET_PATH, name=None)
self.dataset = {}

self.rnd = random.Random(RANDOM_SEED)
Expand Down
2 changes: 1 addition & 1 deletion src/eval_framework/tasks/benchmarks/casehold.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ class CASEHOLD(BaseTask[str]):
def _load_dataset(self, subject: str) -> None:
name = subject if subject != NO_SUBJECT else None

hf_dataset = self._load_hf_dataset(path=self.DATASET_PATH, name=name)
hf_dataset = self.dataloader.load(path=self.DATASET_PATH, name=name)
self.dataset = {}

self.rnd = random.Random(RANDOM_SEED)
Expand Down
6 changes: 3 additions & 3 deletions src/eval_framework/tasks/benchmarks/chembench.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
AccuracyNormLoglikelihood,
)
from eval_framework.tasks.base import BaseTask, Language, ResponseType
from eval_framework.tasks.dataloader import Dataloader
from eval_framework.tasks.utils import get_n_letters

CHEMBENCH_SUBJECTS = [
Expand Down Expand Up @@ -33,10 +34,9 @@ class ChemBenchMultipleChoice(BaseTask[str]):
SUBJECTS = CHEMBENCH_SUBJECTS
LANGUAGE = Language.ENG

def __init__(self, num_fewshot: int = 0) -> None:
def __init__(self, dataloader: Dataloader, num_fewshot: int = 0) -> None:
assert num_fewshot == 0, "Fewshot is not supported for ChemBench"
super().__init__(num_fewshot)

super().__init__(num_fewshot=num_fewshot, dataloader=dataloader)
self.keys = get_n_letters(16)

def _load_dataset(self, subject: str) -> None:
Expand Down
11 changes: 4 additions & 7 deletions src/eval_framework/tasks/benchmarks/duc.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
from eval_framework.metrics.base import BaseMetric
from eval_framework.metrics.completion_metrics.accuracy_completion import AccuracyCompletion
from eval_framework.tasks.base import RANDOM_SEED, BaseTask, Language, ResponseType, Sample
from eval_framework.tasks.dataloader import Dataloader


class DUC(BaseTask[str], ABC):
Expand All @@ -20,9 +21,8 @@ class DUC(BaseTask[str], ABC):
PERTURBATION_UNMODIFIABLE_WORDS = ["Text", "Keyphrase"]
LANGUAGE = Language.ENG

def __init__(self, num_fewshot: int = 0) -> None:
super().__init__(num_fewshot)

def __init__(self, dataloader: Dataloader, num_fewshot: int = 0) -> None:
super().__init__(num_fewshot=num_fewshot, dataloader=dataloader)
self.stop_sequences: list[str] = ["Text:"]
self.max_tokens = 50 # longest keyphrase is less than 50 characters long

Expand Down Expand Up @@ -68,16 +68,13 @@ def _get_ground_truth(self, item: dict[str, Any]) -> list[str]:

def _load_dataset(self, subject: str) -> None:
# not all samples have abstractive keyphrases
hf_dataset = self._load_hf_dataset(path=self.DATASET_PATH, name=subject)
hf_dataset = self.dataloader.load(path=self.DATASET_PATH, name=subject)
self.dataset = {}

for split, data in hf_dataset.items():
data_list = list(filter(lambda x: len(x["abstractive_keyphrases"]) > 0, data))

if split == self.SAMPLE_SPLIT:
self.rnd = random.Random(RANDOM_SEED)
self.rnd.shuffle(data_list)

if split in [self.SAMPLE_SPLIT, self.FEWSHOT_SPLIT]:
self.dataset[split] = data_list

Expand Down
5 changes: 3 additions & 2 deletions src/eval_framework/tasks/benchmarks/flores200.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

from eval_framework.metrics.completion_metrics.bleu import BLEU
from eval_framework.tasks.base import BaseTask, Language, ResponseType, Sample
from eval_framework.tasks.dataloader import Dataloader

FLORES_LANGUAGES = [
"deu_Latn",
Expand Down Expand Up @@ -33,8 +34,8 @@ class Flores200(BaseTask[str]):
"nld_Latn": Language.NLD,
}

def __init__(self, num_fewshot: int = 0) -> None:
super().__init__(num_fewshot)
def __init__(self, dataloader: Dataloader, num_fewshot: int = 0) -> None:
super().__init__(num_fewshot=num_fewshot, dataloader=dataloader)
self.stop_sequences = ["\n"]

def _get_instruction_text(self, item: dict[str, Any]) -> str:
Expand Down
10 changes: 5 additions & 5 deletions src/eval_framework/tasks/benchmarks/flores_plus.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
from eval_framework.metrics.completion_metrics.COMET import COMET
from eval_framework.shared.types import BaseMetricContext, UntemplatedPrompt
from eval_framework.tasks.base import BaseTask, Language, ResponseType, Sample
from eval_framework.tasks.dataloader import Dataloader

LANG_MAP = {
"deu_Latn": "German",
Expand Down Expand Up @@ -44,16 +45,15 @@ class FloresPlus(BaseTask[str]):
"ukr_Cyrl": Language.UKR,
}

def __init__(self, num_fewshot: int = 0) -> None:
super().__init__(num_fewshot)
def __init__(self, dataloader: Dataloader, num_fewshot: int = 0) -> None:
super().__init__(num_fewshot=num_fewshot, dataloader=dataloader)
self.stop_sequences = ["\n"]

def _load_dataset(self, subject: str) -> None:
hf_dataset_src = self._load_hf_dataset(path=self.DATASET_PATH, name=subject.split("-")[0])
hf_dataset_tgt = self._load_hf_dataset(path=self.DATASET_PATH, name=subject.split("-")[1])
hf_dataset_src = self.dataloader.load(path=self.DATASET_PATH, name=subject.split("-")[0])
hf_dataset_tgt = self.dataloader.load(path=self.DATASET_PATH, name=subject.split("-")[1])
self.dataset = {}
self.rnd = random.Random(42)

for split in [self.SAMPLE_SPLIT, self.FEWSHOT_SPLIT]:
data_src = hf_dataset_src[split]
data_tgt = hf_dataset_tgt[split]
Expand Down
Loading
Loading