diff --git a/examples/custom_tasks/benchmark_template.py b/examples/custom_tasks/benchmark_template.py
index 74a44d18..f802980c 100644
--- a/examples/custom_tasks/benchmark_template.py
+++ b/examples/custom_tasks/benchmark_template.py
@@ -18,6 +18,7 @@
 from eval_framework.metrics.completion_metrics import AccuracyCompletion  # Import your metrics
 from eval_framework.tasks.base import BaseTask, ResponseType, Sample
+from eval_framework.tasks.dataloader import Dataloader
 
 
 class YourBenchmarkTask(BaseTask[str]):  # Replace with your class name
@@ -30,8 +31,8 @@ class YourBenchmarkTask(BaseTask[str]):  # Replace with your class name
     METRICS = [AccuracyCompletion]  # List your metric classes
     SUBJECTS = ["subject1", "subject2"]  # Define your subjects/categories
 
-    def __init__(self, subjects: list[str] | None = None, num_fewshot: int = 0) -> None:
-        super().__init__(num_fewshot)
+    def __init__(self, dataloader: Dataloader, subjects: list[str] | None = None, num_fewshot: int = 0) -> None:
+        super().__init__(num_fewshot=num_fewshot, dataloader=dataloader)
         self.subjects = subjects or self.SUBJECTS
 
     def _get_instruction_text(self, item: dict[str, Any]) -> str:
@@ -94,8 +95,8 @@ class GeographyQATask(BaseTask[str]):
     METRICS = [AccuracyCompletion]
     SUBJECTS = ["Europe", "Asia"]
 
-    def __init__(self, subjects: list[str] | None = None, num_fewshot: int = 0) -> None:
-        super().__init__(num_fewshot)
+    def __init__(self, dataloader: Dataloader, subjects: list[str] | None = None, num_fewshot: int = 0) -> None:
+        super().__init__(num_fewshot=num_fewshot, dataloader=dataloader)
         self.subjects = subjects or self.SUBJECTS
 
     def _get_instruction_text(self, item: dict[str, Any]) -> str:
diff --git a/src/eval_framework/response_generator.py b/src/eval_framework/response_generator.py
index 87d91c7f..4dbf98ac 100644
--- a/src/eval_framework/response_generator.py
+++ b/src/eval_framework/response_generator.py
@@ -4,6 +4,8 @@
 from functools import partial
 from typing import Any, Callable, List
 
+from eval_framework.tasks.dataloader import HFDataloader
+
 try:
     from determined import get_cluster_info
 except ImportError:
@@ -52,6 +54,7 @@ def __init__(self, llm: BaseLLM, config: EvalConfig, result_processor: ResultsFi
         self.result_processor = result_processor
         self.num_samples = config.num_samples
         self.save_intermediate_results = config.save_intermediate_results
+        self.dataloader = HFDataloader()
 
         task_class = config.task_name.value
         task_class.SUBJECTS = self._filter_task_subjects()
@@ -59,9 +62,9 @@ def __init__(self, llm: BaseLLM, config: EvalConfig, result_processor: ResultsFi
 
         if config.perturbation_config is not None:
             perturbation_task_class = create_perturbation_class(task_class, config.perturbation_config)
-            self.task = perturbation_task_class(self.few_shot)
+            self.task = perturbation_task_class(num_fewshot=self.few_shot, dataloader=self.dataloader)
         else:
-            self.task = task_class(self.few_shot)
+            self.task = task_class(num_fewshot=self.few_shot, dataloader=self.dataloader)
 
         self.response_type = task_class.RESPONSE_TYPE
diff --git a/src/eval_framework/task_names.py b/src/eval_framework/task_names.py
index 8923a575..7c61d911 100644
--- a/src/eval_framework/task_names.py
+++ b/src/eval_framework/task_names.py
@@ -91,6 +91,7 @@
     ZERO_SCROLLS_SPACE_DIGEST,
     ZERO_SCROLLS_SQUALITY,
 )
+from eval_framework.tasks.dataloader import HFDataloader
 
 logger = logging.getLogger(__name__)
 
@@ -224,7 +225,7 @@ def _check_no_duplicate_names(cls) -> None:
 
 def make_sure_all_hf_datasets_are_in_cache() -> None:
     for task_name in TaskName:
-        task = task_name.value()
+        task = task_name.value(dataloader=HFDataloader())
         for attempt in range(100):
             try:
                 for _ in task.iterate_samples(num_samples=1):
diff --git a/src/eval_framework/tasks/base.py b/src/eval_framework/tasks/base.py
index 440da156..a886ad8f 100644
--- a/src/eval_framework/tasks/base.py
+++ b/src/eval_framework/tasks/base.py
@@ -1,17 +1,14 @@
-import os
 import random
 from abc import ABC, abstractmethod
 from enum import Enum
-from pathlib import Path
-from typing import TYPE_CHECKING, Any, Generic, Iterable, TypeVar
+from typing import TYPE_CHECKING, Any, Generic, Iterable, TypeVar, cast
 
 import iso639
-from datasets import DownloadConfig, load_dataset
-from huggingface_hub import HfApi
-from huggingface_hub.errors import RevisionNotFoundError
+from datasets import Features
 from pydantic import BaseModel, ConfigDict
 
 from eval_framework.shared.types import BaseMetricContext
+from eval_framework.tasks.dataloader import Dataloader
 from template_formatting.formatter import Message, Role
 
 if TYPE_CHECKING:
@@ -81,6 +78,7 @@ class BaseTask(ABC, Generic[SubjectType]):
     METRICS: list[type["BaseMetric"]]
     SUBJECTS: list[SubjectType]
     HF_REVISION: str | None = None  # tag name, or branch name, or commit hash to ensure reproducibility
+    FEATURES: Features | None = None
 
     # Words in _get_instruction_text() not to be perturbed. List of words is case insensitive. No special characters
    # or whitespace should be included.
@@ -89,41 +87,17 @@ class BaseTask(ABC, Generic[SubjectType]):
     # language by subtopic, or `None` (for tasks not specific to a single language).
     LANGUAGE: Language | dict[str, Language] | dict[str, tuple[Language, Language]] | None
 
-    def __init__(self, num_fewshot: int = 0) -> None:
+    def __init__(self, dataloader: Dataloader, num_fewshot: int = 0) -> None:
         self.num_fewshot = num_fewshot
         self.stop_sequences: list[str] | None = None
         self.max_tokens: int | None = None
-
-    def _load_hf_dataset(self, **kwargs: Any) -> Any:
-        # Check if the HF_REVISION is valid before loading the dataset
-        if self.HF_REVISION:
-            try:
-                _ = HfApi().dataset_info(repo_id=kwargs["path"], revision=self.HF_REVISION, timeout=100.0)
-            except Exception as e:
-                if isinstance(e, RevisionNotFoundError):
-                    raise e
-
-        cache_dir: str = os.environ.get("HF_DATASET_CACHE_DIR", f"{Path.home()}/.cache/huggingface/datasets")
-        download_config = DownloadConfig(cache_dir=cache_dir, max_retries=5)
-        try:
-            return load_dataset(
-                **kwargs,
-                revision=self.HF_REVISION,
-                trust_remote_code=True,
-                cache_dir=cache_dir,
-                download_config=download_config,
-            )
-        except Exception:
-            return load_dataset(
-                **kwargs,
-                revision=self.HF_REVISION,
-                trust_remote_code=True,
-                cache_dir=f"{Path.home()}/.cache/eval-framework",
-            )
+        self.dataloader = dataloader
 
     def _load_dataset(self, subject: SubjectType) -> None:
-        name = subject if subject != NO_SUBJECT else None
-        hf_dataset = self._load_hf_dataset(path=self.DATASET_PATH, name=name)
+        name = cast(str, subject) if subject != NO_SUBJECT else None
+        hf_dataset = self.dataloader.load(
+            path=self.DATASET_PATH, name=name, revision=self.HF_REVISION, features=self.FEATURES
+        )
 
         self.dataset = {}
         self.rnd = random.Random(RANDOM_SEED)
diff --git a/src/eval_framework/tasks/benchmarks/arc.py b/src/eval_framework/tasks/benchmarks/arc.py
index 09a41c75..ff4b6189 100644
--- a/src/eval_framework/tasks/benchmarks/arc.py
+++ b/src/eval_framework/tasks/benchmarks/arc.py
@@ -5,6 +5,7 @@
     AccuracyNormLoglikelihood,
 )
 from eval_framework.tasks.base import BaseTask, Language, ResponseType
+from eval_framework.tasks.dataloader import Dataloader
 from eval_framework.tasks.utils import get_n_letters
 
 
@@ -21,8 +22,8 @@ class ARC(BaseTask[str]):
     PERTURBATION_UNMODIFIABLE_WORDS = ["Question"] + get_n_letters(5)
     LANGUAGE = Language.ENG
 
-    def __init__(self, num_fewshot: int = 0) -> None:
-        super().__init__(num_fewshot)
+    def __init__(self, dataloader: Dataloader, num_fewshot: int = 0) -> None:
+        super().__init__(num_fewshot=num_fewshot, dataloader=dataloader)
         self.keys = get_n_letters(5)  # needs to be 5 because there is one sample with 5 answer possibilities
         self.num_to_letter = {str(i): letter for i, letter in enumerate(self.keys, start=1)}
 
diff --git a/src/eval_framework/tasks/benchmarks/arc_de.py b/src/eval_framework/tasks/benchmarks/arc_de.py
index e4c9dce7..53dc7de6 100644
--- a/src/eval_framework/tasks/benchmarks/arc_de.py
+++ b/src/eval_framework/tasks/benchmarks/arc_de.py
@@ -5,6 +5,7 @@
     AccuracyNormLoglikelihood,
 )
 from eval_framework.tasks.base import NO_SUBJECT, BaseTask, Language, ResponseType
+from eval_framework.tasks.dataloader import Dataloader
 from eval_framework.tasks.utils import get_n_letters
 
 
@@ -21,8 +22,8 @@ class ARC_DE(BaseTask[str]):
     PERTURBATION_UNMODIFIABLE_WORDS = ["Frage"] + get_n_letters(5)
     LANGUAGE = Language.DEU
 
-    def __init__(self, num_fewshot: int = 0) -> None:
-        super().__init__(num_fewshot)
+    def __init__(self, dataloader: Dataloader, num_fewshot: int = 0) -> None:
+        super().__init__(num_fewshot=num_fewshot, dataloader=dataloader)
         self.keys = get_n_letters(5)  # needs to be 5 because there is one sample with 5 answer possibilities
         self.num_to_letter = {str(i): letter for i, letter in enumerate(self.keys, start=1)}
 
diff --git a/src/eval_framework/tasks/benchmarks/arc_fi.py b/src/eval_framework/tasks/benchmarks/arc_fi.py
index c1fd71d4..2c28603c 100644
--- a/src/eval_framework/tasks/benchmarks/arc_fi.py
+++ b/src/eval_framework/tasks/benchmarks/arc_fi.py
@@ -5,6 +5,7 @@
     AccuracyNormLoglikelihood,
 )
 from eval_framework.tasks.base import BaseTask, Language, ResponseType
+from eval_framework.tasks.dataloader import Dataloader
 from eval_framework.tasks.utils import get_n_letters
 
 
@@ -21,8 +22,8 @@ class ARC_FI(BaseTask[str]):
     PERTURBATION_UNMODIFIABLE_WORDS = ["Question"] + get_n_letters(5)
     LANGUAGE = Language.FIN
 
-    def __init__(self, num_fewshot: int = 0) -> None:
-        super().__init__(num_fewshot)
+    def __init__(self, dataloader: Dataloader, num_fewshot: int = 0) -> None:
+        super().__init__(num_fewshot=num_fewshot, dataloader=dataloader)
         self.keys = get_n_letters(5)  # needs to be 5 because there is one sample with 5 answer possibilities
         self.num_to_letter = {str(i): letter for i, letter in enumerate(self.keys, start=1)}
 
diff --git a/src/eval_framework/tasks/benchmarks/belebele.py b/src/eval_framework/tasks/benchmarks/belebele.py
index 2b6bee61..9bba8592 100644
--- a/src/eval_framework/tasks/benchmarks/belebele.py
+++ b/src/eval_framework/tasks/benchmarks/belebele.py
@@ -5,6 +5,7 @@
     AccuracyNormLoglikelihood,
 )
 from eval_framework.tasks.base import BaseTask, Language, ResponseType
+from eval_framework.tasks.dataloader import Dataloader
 from eval_framework.tasks.utils import get_n_letters
 
 
@@ -23,9 +24,9 @@ class BELEBELE(BaseTask[str]):
     PERTURBATION_UNMODIFIABLE_WORDS = ["Question", "Answer"] + get_n_letters(4)
     LANGUAGE = Language.ENG
 
-    def __init__(self, num_fewshot: int = 0) -> None:
-        super().__init__(num_fewshot)
-
+    def __init__(self, dataloader: Dataloader, num_fewshot: int = 0) -> None:
+        super().__init__(num_fewshot=num_fewshot, dataloader=dataloader)
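+        # the four answer options are keyed by the first four letters, A-D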
         self.keys = get_n_letters(4)
         self.num_to_letter = {str(i): letter for i, letter in enumerate(self.keys, start=1)}
 
diff --git a/src/eval_framework/tasks/benchmarks/bigcodebench.py b/src/eval_framework/tasks/benchmarks/bigcodebench.py
index a0330ed0..ca0300c9 100644
--- a/src/eval_framework/tasks/benchmarks/bigcodebench.py
+++ b/src/eval_framework/tasks/benchmarks/bigcodebench.py
@@ -14,6 +14,7 @@
     Sample,
     SubjectType,
 )
+from eval_framework.tasks.dataloader import Dataloader
 
 PROMPT_INSTRUCTION = (
     "Please provide a self-contained Python script, without tests or example usage, that solves the following "
@@ -39,12 +40,12 @@ class BigCodeBench(BaseTask[str]):
     SUBJECTS = ["original", "calibrated"]
     LANGUAGE = Language.ENG
 
-    def __init__(self, num_fewshot: int = 0) -> None:
+    def __init__(self, dataloader: Dataloader, num_fewshot: int = 0) -> None:
         assert num_fewshot == 0, "Fewshot is not supported for BigCodeBench"
-        super().__init__(num_fewshot)
+        super().__init__(num_fewshot=num_fewshot, dataloader=dataloader)
 
     def _load_dataset(self, subject: SubjectType) -> None:
-        hf_dataset = self._load_hf_dataset(path=self.DATASET_PATH, name=None)
+        hf_dataset = self.dataloader.load(path=self.DATASET_PATH, name=None, revision=self.HF_REVISION)
 
         self.dataset = {}
         self.rnd = random.Random(RANDOM_SEED)
diff --git a/src/eval_framework/tasks/benchmarks/casehold.py b/src/eval_framework/tasks/benchmarks/casehold.py
index 00bb977c..69d37217 100644
--- a/src/eval_framework/tasks/benchmarks/casehold.py
+++ b/src/eval_framework/tasks/benchmarks/casehold.py
@@ -21,7 +21,7 @@ class CASEHOLD(BaseTask[str]):
 
     def _load_dataset(self, subject: str) -> None:
         name = subject if subject != NO_SUBJECT else None
-        hf_dataset = self._load_hf_dataset(path=self.DATASET_PATH, name=name)
+        hf_dataset = self.dataloader.load(path=self.DATASET_PATH, name=name, revision=self.HF_REVISION)
 
         self.dataset = {}
         self.rnd = random.Random(RANDOM_SEED)
diff --git a/src/eval_framework/tasks/benchmarks/chembench.py b/src/eval_framework/tasks/benchmarks/chembench.py
index 27a7e730..bb5c0026 100644
--- a/src/eval_framework/tasks/benchmarks/chembench.py
+++ b/src/eval_framework/tasks/benchmarks/chembench.py
@@ -6,6 +6,7 @@
     AccuracyNormLoglikelihood,
 )
 from eval_framework.tasks.base import BaseTask, Language, ResponseType
+from eval_framework.tasks.dataloader import Dataloader
 from eval_framework.tasks.utils import get_n_letters
 
 CHEMBENCH_SUBJECTS = [
@@ -33,10 +34,9 @@ class ChemBenchMultipleChoice(BaseTask[str]):
     SUBJECTS = CHEMBENCH_SUBJECTS
     LANGUAGE = Language.ENG
 
-    def __init__(self, num_fewshot: int = 0) -> None:
+    def __init__(self, dataloader: Dataloader, num_fewshot: int = 0) -> None:
         assert num_fewshot == 0, "Fewshot is not supported for ChemBench"
-        super().__init__(num_fewshot)
-
+        super().__init__(num_fewshot=num_fewshot, dataloader=dataloader)
         self.keys = get_n_letters(16)
 
     def _load_dataset(self, subject: str) -> None:
diff --git a/src/eval_framework/tasks/benchmarks/duc.py b/src/eval_framework/tasks/benchmarks/duc.py
index 896e67bf..151e2acb 100644
--- a/src/eval_framework/tasks/benchmarks/duc.py
+++ b/src/eval_framework/tasks/benchmarks/duc.py
@@ -6,6 +6,7 @@
 from eval_framework.metrics.base import BaseMetric
 from eval_framework.metrics.completion_metrics.accuracy_completion import AccuracyCompletion
 from eval_framework.tasks.base import RANDOM_SEED, BaseTask, Language, ResponseType, Sample
+from eval_framework.tasks.dataloader import Dataloader
 
 
 class DUC(BaseTask[str], ABC):
@@ -20,9 +21,8 @@ class DUC(BaseTask[str], ABC):
     PERTURBATION_UNMODIFIABLE_WORDS = ["Text", "Keyphrase"]
     LANGUAGE = Language.ENG
 
-    def __init__(self, num_fewshot: int = 0) -> None:
-        super().__init__(num_fewshot)
-
+    def __init__(self, dataloader: Dataloader, num_fewshot: int = 0) -> None:
+        super().__init__(num_fewshot=num_fewshot, dataloader=dataloader)
         self.stop_sequences: list[str] = ["Text:"]
         self.max_tokens = 50  # longest keyphrase is less than 50 characters long
 
@@ -68,16 +68,13 @@ def _get_ground_truth(self, item: dict[str, Any]) -> list[str]:
 
     def _load_dataset(self, subject: str) -> None:
         # not all samples have abstractive keyphrases
-        hf_dataset = self._load_hf_dataset(path=self.DATASET_PATH, name=subject)
+        hf_dataset = self.dataloader.load(path=self.DATASET_PATH, name=subject, revision=self.HF_REVISION)
 
         self.dataset = {}
-
         for split, data in hf_dataset.items():
             data_list = list(filter(lambda x: len(x["abstractive_keyphrases"]) > 0, data))
-
             if split == self.SAMPLE_SPLIT:
                 self.rnd = random.Random(RANDOM_SEED)
                 self.rnd.shuffle(data_list)
-
             if split in [self.SAMPLE_SPLIT, self.FEWSHOT_SPLIT]:
                 self.dataset[split] = data_list
 
diff --git a/src/eval_framework/tasks/benchmarks/flores200.py b/src/eval_framework/tasks/benchmarks/flores200.py
index 5c0eec45..e322c984 100644
--- a/src/eval_framework/tasks/benchmarks/flores200.py
+++ b/src/eval_framework/tasks/benchmarks/flores200.py
@@ -4,6 +4,7 @@
 
 from eval_framework.metrics.completion_metrics.bleu import BLEU
 from eval_framework.tasks.base import BaseTask, Language, ResponseType, Sample
+from eval_framework.tasks.dataloader import Dataloader
 
 FLORES_LANGUAGES = [
     "deu_Latn",
@@ -33,8 +34,8 @@ class Flores200(BaseTask[str]):
         "nld_Latn": Language.NLD,
     }
 
-    def __init__(self, num_fewshot: int = 0) -> None:
-        super().__init__(num_fewshot)
+    def __init__(self, dataloader: Dataloader, num_fewshot: int = 0) -> None:
+        super().__init__(num_fewshot=num_fewshot, dataloader=dataloader)
         self.stop_sequences = ["\n"]
 
     def _get_instruction_text(self, item: dict[str, Any]) -> str:
diff --git a/src/eval_framework/tasks/benchmarks/flores_plus.py b/src/eval_framework/tasks/benchmarks/flores_plus.py
index f105cc20..38676b49 100644
--- a/src/eval_framework/tasks/benchmarks/flores_plus.py
+++ b/src/eval_framework/tasks/benchmarks/flores_plus.py
@@ -7,6 +7,7 @@
 from eval_framework.metrics.completion_metrics.COMET import COMET
 from eval_framework.shared.types import BaseMetricContext, UntemplatedPrompt
 from eval_framework.tasks.base import BaseTask, Language, ResponseType, Sample
+from eval_framework.tasks.dataloader import Dataloader
 
 LANG_MAP = {
     "deu_Latn": "German",
@@ -44,16 +45,19 @@ class FloresPlus(BaseTask[str]):
         "ukr_Cyrl": Language.UKR,
     }
 
-    def __init__(self, num_fewshot: int = 0) -> None:
-        super().__init__(num_fewshot)
+    def __init__(self, dataloader: Dataloader, num_fewshot: int = 0) -> None:
+        super().__init__(num_fewshot=num_fewshot, dataloader=dataloader)
         self.stop_sequences = ["\n"]
 
     def _load_dataset(self, subject: str) -> None:
-        hf_dataset_src = self._load_hf_dataset(path=self.DATASET_PATH, name=subject.split("-")[0])
-        hf_dataset_tgt = self._load_hf_dataset(path=self.DATASET_PATH, name=subject.split("-")[1])
+        hf_dataset_src = self.dataloader.load(
+            path=self.DATASET_PATH, name=subject.split("-")[0], revision=self.HF_REVISION
+        )
+        hf_dataset_tgt = self.dataloader.load(
+            path=self.DATASET_PATH, name=subject.split("-")[1], revision=self.HF_REVISION
+        )
 
         self.dataset = {}
         self.rnd = random.Random(42)
-
         for split in [self.SAMPLE_SPLIT, self.FEWSHOT_SPLIT]:
             data_src = hf_dataset_src[split]
             data_tgt = hf_dataset_tgt[split]
diff --git a/src/eval_framework/tasks/benchmarks/gpqa.py b/src/eval_framework/tasks/benchmarks/gpqa.py
index a984a8ae..a5421a8d 100644
--- a/src/eval_framework/tasks/benchmarks/gpqa.py
+++ b/src/eval_framework/tasks/benchmarks/gpqa.py
@@ -9,7 +9,8 @@
     AccuracyLoglikelihood,
     AccuracyNormLoglikelihood,
 )
-from eval_framework.tasks.base import NO_SUBJECT, RANDOM_SEED, BaseTask, Language, ResponseType, Sample, SubjectType
+from eval_framework.tasks.base import NO_SUBJECT, RANDOM_SEED, BaseTask, Language, ResponseType, Sample
+from eval_framework.tasks.dataloader import Dataloader
 from eval_framework.tasks.utils import get_n_letters
 
 logger = logging.getLogger(__name__)
@@ -28,17 +29,17 @@ class GPQA(BaseTask[str]):
     PERTURBATION_UNMODIFIABLE_WORDS = ["Question"] + get_n_letters(4)
     LANGUAGE = Language.ENG
 
-    def __init__(self, num_fewshot: int = 0) -> None:
-        super().__init__(num_fewshot)
+    def __init__(self, dataloader: Dataloader, num_fewshot: int = 0) -> None:
+        super().__init__(num_fewshot=num_fewshot, dataloader=dataloader)
         self.stop_sequences = ["Question:"]
         self.keys = get_n_letters(4)
         self.num_to_letter = {str(i): letter for i, letter in enumerate(self.keys, start=1)}
         self.rnd_choice_shuffle = random.Random(RANDOM_SEED)
 
-    def _load_dataset(self, subject: SubjectType) -> None:
+    def _load_dataset(self, subject: str) -> None:
         name = subject if subject != NO_SUBJECT else None
-        hf_dataset = self._load_hf_dataset(path=self.DATASET_PATH, name=name)
+        hf_dataset = self.dataloader.load(path=self.DATASET_PATH, name=name, revision=self.HF_REVISION)
 
         self.dataset = {}
         self.rnd = random.Random(RANDOM_SEED)
@@ -126,9 +127,9 @@ class GPQA_COT(GPQA):
     )
     ANS_RE = re.compile(r"Therefore, the answer is \(([ABCDEFGHIJ])\)")
 
-    def __init__(self, num_fewshot: int = 0) -> None:
+    def __init__(self, dataloader: Dataloader, num_fewshot: int = 0) -> None:
         assert num_fewshot == 0, "Fewshot is not supported for GPQA_COT"
-        super().__init__(num_fewshot)
+        super().__init__(num_fewshot=num_fewshot, dataloader=dataloader)
         self.stop_sequences: list[str] = ["Question:"]
         self.keys = get_n_letters(4)
         self.num_to_letter = {str(i): letter for i, letter in enumerate(self.keys, start=1)}
diff --git a/src/eval_framework/tasks/benchmarks/gsm8k.py b/src/eval_framework/tasks/benchmarks/gsm8k.py
index 3cd60ac3..0cef0651 100644
--- a/src/eval_framework/tasks/benchmarks/gsm8k.py
+++ b/src/eval_framework/tasks/benchmarks/gsm8k.py
@@ -3,6 +3,7 @@
 
 from eval_framework.metrics.completion_metrics.accuracy_completion import AccuracyCompletion
 from eval_framework.tasks.base import BaseTask, Language, ResponseType, Sample
+from eval_framework.tasks.dataloader import Dataloader
 
 ANS_RE = re.compile(r"#### (\-?[0-9\.\,]+)")
 
@@ -98,8 +99,8 @@ class GSM8K(BaseTask[str]):
     PERTURBATION_UNMODIFIABLE_WORDS = ["Question", "Answer"]
     LANGUAGE = Language.ENG
 
-    def __init__(self, num_fewshot: int = 0) -> None:
-        super().__init__(num_fewshot)
+    def __init__(self, dataloader: Dataloader, num_fewshot: int = 0) -> None:
+        super().__init__(num_fewshot=num_fewshot, dataloader=dataloader)
         # until: https://github.com/EleutherAI/lm-evaluation-harness/blob/main/lm_eval/tasks/gsm8k/gsm8k.yaml
         self.stop_sequences: list[str] = ["Question:"]
 
@@ -134,9 +135,9 @@ class GSM8KLlamaVersion(GSM8K):
     NAME = "GSM8K Llama Version"
     FEWSHOT_SPLIT = ""  # Changed to empty string since we're using predefined examples
 
-    def __init__(self, num_fewshot: int = 0) -> None:
+    def __init__(self, dataloader: Dataloader, num_fewshot: int = 0) -> None:
         assert num_fewshot <= len(FEWSHOT_ITEMS), f"Fewshot larger than {len(FEWSHOT_ITEMS)} is not supported for GSM8K"
-        super().__init__(num_fewshot)
+        super().__init__(num_fewshot=num_fewshot, dataloader=dataloader)
 
     def _get_instruction_text(self, item: dict[str, Any]) -> str:
         # Remove the bracketed computations from the question
diff --git a/src/eval_framework/tasks/benchmarks/humaneval.py b/src/eval_framework/tasks/benchmarks/humaneval.py
index b07c2b2b..a1312b74 100644
--- a/src/eval_framework/tasks/benchmarks/humaneval.py
+++ b/src/eval_framework/tasks/benchmarks/humaneval.py
@@ -3,6 +3,7 @@
 from eval_framework.metrics.completion_metrics.code_assertion import CodeCompletionAssertion
 from eval_framework.shared.types import BaseMetricContext
 from eval_framework.tasks.base import NO_SUBJECT, BaseTask, Language, ResponseType, Sample
+from eval_framework.tasks.dataloader import Dataloader
 
 CODE_TO_EXECUTE = """
 {start_of_code}
@@ -35,8 +36,8 @@ class HumanEval(BaseTask[str]):
     SUBJECTS = [NO_SUBJECT]
     LANGUAGE = Language.ENG
 
-    def __init__(self, num_fewshot: int = 0) -> None:
-        super().__init__(num_fewshot)
+    def __init__(self, dataloader: Dataloader, num_fewshot: int = 0) -> None:
+        super().__init__(num_fewshot=num_fewshot, dataloader=dataloader)
         self.stop_sequences: list[str] = ["```"]
 
     def _get_instruction_text(self, item: dict[str, Any]) -> str:
@@ -82,9 +83,9 @@ class HumanEvalInstruct(HumanEval):
     NAME = "Human Eval Instruct"
     CUE_PREFIX = "Here is the completed function:\n```python\n"
 
-    def __init__(self, num_fewshot: int = 0) -> None:
+    def __init__(self, dataloader: Dataloader, num_fewshot: int = 0) -> None:
         assert num_fewshot == 0, "Fewshot is not supported for Human Eval Instruct"
-        super().__init__(num_fewshot)
+        super().__init__(num_fewshot=num_fewshot, dataloader=dataloader)
 
     def _get_instruction_text(self, item: dict[str, Any]) -> str:
         instruction_text = (
diff --git a/src/eval_framework/tasks/benchmarks/ifeval.py b/src/eval_framework/tasks/benchmarks/ifeval.py
index 4247d8df..dd14d253 100644
--- a/src/eval_framework/tasks/benchmarks/ifeval.py
+++ b/src/eval_framework/tasks/benchmarks/ifeval.py
@@ -2,6 +2,7 @@
 
 from eval_framework.metrics.completion_metrics.ifeval import IFEvalMetric, IFEvalMetricContext
 from eval_framework.tasks.base import NO_SUBJECT, BaseTask, Language, ResponseType
+from eval_framework.tasks.dataloader import Dataloader
 
 
 class IFEval(BaseTask[str]):
@@ -16,9 +17,10 @@ class IFEval(BaseTask[str]):
     SUBJECTS = [NO_SUBJECT]
     LANGUAGE = {NO_SUBJECT: Language.ENG}
 
-    def __init__(self, num_fewshot: int = 0) -> None:
-        super().__init__(num_fewshot)
+    def __init__(self, dataloader: Dataloader, num_fewshot: int = 0) -> None:
         assert num_fewshot == 0, "IFEval does not support few-shot prompting."
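+        # shared setup (fewshot count, stop sequences, dataloader handle) lives in BaseTask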
+        super().__init__(num_fewshot=num_fewshot, dataloader=dataloader)
 
     def _get_instruction_text(self, item: dict[str, Any]) -> str:
         return item["prompt"]
diff --git a/src/eval_framework/tasks/benchmarks/include.py b/src/eval_framework/tasks/benchmarks/include.py
index f59ddc96..bb63a34f 100644
--- a/src/eval_framework/tasks/benchmarks/include.py
+++ b/src/eval_framework/tasks/benchmarks/include.py
@@ -5,6 +5,7 @@
     AccuracyNormLoglikelihood,
 )
 from eval_framework.tasks.base import BaseTask, Language, ResponseType
+from eval_framework.tasks.dataloader import Dataloader
 from eval_framework.tasks.utils import get_n_letters
 
 INCLUDE_SUBJECTS = [
@@ -80,8 +81,8 @@ class INCLUDE(BaseTask[str]):
     SUBJECTS = INCLUDE_SUBJECTS
     LANGUAGE = {lang: subject_to_language(lang) for lang in INCLUDE_SUBJECTS}
 
-    def __init__(self, num_fewshot: int = 0) -> None:
-        super().__init__(num_fewshot)
+    def __init__(self, dataloader: Dataloader, num_fewshot: int = 0) -> None:
+        super().__init__(num_fewshot=num_fewshot, dataloader=dataloader)
         self.keys = get_n_letters(4)
 
     def _get_initial_prompt_text(self, item: dict[str, Any]) -> str:
diff --git a/src/eval_framework/tasks/benchmarks/infinitebench.py b/src/eval_framework/tasks/benchmarks/infinitebench.py
index a3109ab4..3089524f 100644
--- a/src/eval_framework/tasks/benchmarks/infinitebench.py
+++ b/src/eval_framework/tasks/benchmarks/infinitebench.py
@@ -1,14 +1,13 @@
-import os
 import re
 from abc import ABC
-from pathlib import Path
 from typing import Any
 
-from datasets import DownloadConfig, Features, Sequence, Value, load_dataset
+from datasets import Features, Sequence, Value
 
 from eval_framework.metrics.completion_metrics.accuracy_completion import AccuracyCompletion
 from eval_framework.metrics.loglikelihood_metrics.accuracy_loglikelihood import AccuracyLoglikelihood
 from eval_framework.tasks.base import BaseTask, Language, ResponseType, Sample
+from eval_framework.tasks.dataloader import Dataloader
 
 
 class InfiniteBench(BaseTask[str], ABC):
@@ -21,34 +20,19 @@ class InfiniteBench(BaseTask[str], ABC):
     SUBJECTS = ["default"]
     LANGUAGE = Language.ENG
     PERTURBATION_UNMODIFIABLE_WORDS = None
-
-    def __init__(self, num_fewshot: int = 0) -> None:
+    FEATURES = Features(
+        {
+            "id": Value("int64"),
+            "context": Value("string"),
+            "input": Value("string"),
+            "answer": Sequence(Value("string")),
+            "options": Sequence(Value("string")),
+        }
+    )
+
+    def __init__(self, dataloader: Dataloader, num_fewshot: int = 0) -> None:
         assert num_fewshot == 0, "Few-shots are not supported for long-context InfiniteBench tasks"
-        super().__init__(num_fewshot)
-
-    def _load_hf_dataset(self, **kwargs: Any) -> Any:
-        cache_dir: str = os.environ.get("HF_DATASET_CACHE_DIR", f"{Path.home()}/.cache/huggingface/datasets")
-        download_config = DownloadConfig(cache_dir=cache_dir, max_retries=5)
-        ft = Features(
-            {
-                "id": Value("int64"),
-                "context": Value("string"),
-                "input": Value("string"),
-                "answer": Sequence(Value("string")),
-                "options": Sequence(Value("string")),
-            }
-        )
-        try:
-            return load_dataset(
-                **kwargs, trust_remote_code=True, cache_dir=cache_dir, download_config=download_config, features=ft
-            )
-        except Exception:
-            return load_dataset(
-                **kwargs,
-                trust_remote_code=True,
-                cache_dir=f"{Path.home()}/.cache/eval-framework",
-                features=ft,
-            )
+        super().__init__(num_fewshot=num_fewshot, dataloader=dataloader)
 
 
 class InfiniteBenchLoglikelihood(InfiniteBench, ABC):
@@ -104,8 +88,8 @@ class InfiniteBench_CodeRun(InfiniteBenchCompletion):
     SAMPLE_SPLIT = "code_run"
     FEWSHOT_SPLIT = SAMPLE_SPLIT
 
-    def __init__(self, num_fewshot: int = 0) -> None:
-        super().__init__(num_fewshot)
+    def __init__(self, dataloader: Dataloader, num_fewshot: int = 0) -> None:
+        super().__init__(num_fewshot=num_fewshot, dataloader=dataloader)
         self.stop_sequences: list[str] = ["\n"]
         self.max_tokens = 30  # Avg Output Tokens: 1.3
 
@@ -130,8 +114,8 @@ class InfiniteBench_EnDia(InfiniteBenchCompletion):
     SAMPLE_SPLIT = "longdialogue_qa_eng"
     FEWSHOT_SPLIT = SAMPLE_SPLIT
 
-    def __init__(self, num_fewshot: int = 0) -> None:
-        super().__init__(num_fewshot)
+    def __init__(self, dataloader: Dataloader, num_fewshot: int = 0) -> None:
+        super().__init__(num_fewshot=num_fewshot, dataloader=dataloader)
         self.stop_sequences: list[str] = ["\n"]
         self.max_tokens = 30  # Avg Output Tokens: 3.4
 
@@ -159,8 +143,8 @@ class InfiniteBench_EnQA(InfiniteBenchCompletion):
     SAMPLE_SPLIT = "longbook_qa_eng"
     FEWSHOT_SPLIT = SAMPLE_SPLIT
 
-    def __init__(self, num_fewshot: int = 0) -> None:
-        super().__init__(num_fewshot)
+    def __init__(self, dataloader: Dataloader, num_fewshot: int = 0) -> None:
+        super().__init__(num_fewshot=num_fewshot, dataloader=dataloader)
         self.stop_sequences: list[str] = ["\n"]
         self.max_tokens = 30  # Avg Output Tokens: 4.8
 
@@ -185,8 +169,8 @@ class InfiniteBench_MathFind(InfiniteBenchCompletion):
     SAMPLE_SPLIT = "math_find"
     FEWSHOT_SPLIT = SAMPLE_SPLIT
 
-    def __init__(self, num_fewshot: int = 0) -> None:
-        super().__init__(num_fewshot)
+    def __init__(self, dataloader: Dataloader, num_fewshot: int = 0) -> None:
+        super().__init__(num_fewshot=num_fewshot, dataloader=dataloader)
         self.stop_sequences: list[str] = ["\n"]
         self.max_tokens = 30  # Avg Output Tokens: 1.3
 
@@ -211,8 +195,8 @@ class InfiniteBench_RetrieveKV2(InfiniteBenchCompletion):
     SAMPLE_SPLIT = "kv_retrieval"
     FEWSHOT_SPLIT = SAMPLE_SPLIT
 
-    def __init__(self, num_fewshot: int = 0) -> None:
-        super().__init__(num_fewshot)
+    def __init__(self, dataloader: Dataloader, num_fewshot: int = 0) -> None:
+        super().__init__(num_fewshot=num_fewshot, dataloader=dataloader)
         self.stop_sequences: list[str] = ["\n"]
         self.max_tokens = 40  # Avg Output Tokens: 22.7 (all answers are 36 chars)
 
@@ -240,8 +224,8 @@ class InfiniteBench_RetrieveNumber(InfiniteBenchCompletion):
     SAMPLE_SPLIT = "number_string"
     FEWSHOT_SPLIT = SAMPLE_SPLIT
 
-    def __init__(self, num_fewshot: int = 0) -> None:
-        super().__init__(num_fewshot)
+    def __init__(self, dataloader: Dataloader, num_fewshot: int = 0) -> None:
+        super().__init__(num_fewshot=num_fewshot, dataloader=dataloader)
         self.stop_sequences: list[str] = ["\n"]
         self.max_tokens = 12  # Avg Output Tokens: 4.0 (all answers are 10 digits integers)
 
@@ -272,8 +256,8 @@ class InfiniteBench_RetrievePassKey1(InfiniteBenchCompletion):
     SAMPLE_SPLIT = "passkey"
     FEWSHOT_SPLIT = SAMPLE_SPLIT
 
-    def __init__(self, num_fewshot: int = 0) -> None:
-        super().__init__(num_fewshot)
+    def __init__(self, dataloader: Dataloader, num_fewshot: int = 0) -> None:
+        super().__init__(num_fewshot=num_fewshot, dataloader=dataloader)
         self.stop_sequences: list[str] = ["\n"]
         self.max_tokens = 8  # Avg Output Tokens: 2.0 (all answers are 5 digits integers)
 
diff --git a/src/eval_framework/tasks/benchmarks/math_reasoning.py b/src/eval_framework/tasks/benchmarks/math_reasoning.py
index 0d332315..eef21487 100644
--- a/src/eval_framework/tasks/benchmarks/math_reasoning.py
+++ b/src/eval_framework/tasks/benchmarks/math_reasoning.py
@@ -6,6 +6,7 @@
 from eval_framework.metrics.completion_metrics.language_checker import LanguageRawConsistencyChecker
 from eval_framework.metrics.completion_metrics.math_reasoning_completion import MathReasoningCompletion
 from eval_framework.tasks.base import NO_SUBJECT, RANDOM_SEED, BaseTask, Language, ResponseType, Sample
+from eval_framework.tasks.dataloader import Dataloader
 
 SubjectType = TypeVar("SubjectType")
 
@@ -26,8 +27,8 @@ class MATHReasoning(BaseTask[str]):
     ANSWER_PATTERN = r"(?i)Answer\s*:\s*(.*)"
     LANGUAGE = Language.ENG
 
-    def __init__(self, num_fewshot: int = 0) -> None:
-        super().__init__(num_fewshot)
+    def __init__(self, dataloader: Dataloader, num_fewshot: int = 0) -> None:
+        super().__init__(num_fewshot=num_fewshot, dataloader=dataloader)
         # Max tokens are going to be determined by the model.
         # however GPT paper and results used 1024 tokens, s1 used 2048
 
@@ -346,9 +347,9 @@ class AIME2024(MATHReasoning):
 Problem: {Question}"""  # noqa: E501
     ANSWER_PATTERN = r"Therefore, the final answer is:(.*?). I hope it is correct."
 
-    def __init__(self, num_fewshot: int = 0) -> None:
+    def __init__(self, dataloader: Dataloader, num_fewshot: int = 0) -> None:
         assert num_fewshot == 0, "AIME evaluation does not include few shot"
-        super().__init__(num_fewshot)
+        super().__init__(num_fewshot=num_fewshot, dataloader=dataloader)
 
     def _get_instruction_text(self, item: dict[str, Any]) -> str:
         return self.QUERY_TEMPLATE.format(Question=item["problem"])
@@ -388,9 +389,9 @@ class MATH500(MATHReasoning):
 where [answer] is just the final number or expression that solves the problem.
 """.strip()  # noqa: E501
 
-    def __init__(self, num_fewshot: int = 0) -> None:
+    def __init__(self, dataloader: Dataloader, num_fewshot: int = 0) -> None:
         assert num_fewshot == 0, "MATH-500 evaluation does not include few shot"
-        super().__init__(num_fewshot)
+        super().__init__(num_fewshot=num_fewshot, dataloader=dataloader)
 
     def post_process_generated_completion(self, completion_text: str, sample: Sample | None = None) -> str:
         extracted_answer_boxed = self._extract_answer(completion_text)
@@ -432,8 +433,8 @@ class MATH(MATHReasoning):
     ]
     LANGUAGE = Language.ENG
 
-    def __init__(self, num_fewshot: int = 0) -> None:
-        super().__init__(num_fewshot)
+    def __init__(self, dataloader: Dataloader, num_fewshot: int = 0) -> None:
+        super().__init__(num_fewshot=num_fewshot, dataloader=dataloader)
         self.stop_sequences = ["\nProblem:", "\nProblem", "\n\nProblem:", "\n\nProblem"]
 
     def extract_last_two_dollar_text(self, s: str) -> str:
@@ -483,10 +484,10 @@ def _get_ground_truth(self, item: dict[str, Any]) -> str | None | list[str]:
 class MATHLvl5(MATH):
     NAME = "Math Lvl 5"
 
-    def _load_dataset(self, subject: SubjectType) -> None:
+    def _load_dataset(self, subject: str) -> None:
         name = subject if subject != NO_SUBJECT else None
 
-        hf_dataset = self._load_hf_dataset(path=self.DATASET_PATH, name=name)
+        hf_dataset = self.dataloader.load(path=self.DATASET_PATH, name=name, revision=self.HF_REVISION)
 
         self.dataset = {}
         self.rnd = random.Random(RANDOM_SEED)
@@ -530,9 +531,9 @@ class GSM8KReasoning(MATHReasoning):
 
 Answer:"""
 
-    def __init__(self, num_fewshot: int = 0) -> None:
+    def __init__(self, dataloader: Dataloader, num_fewshot: int = 0) -> None:
         assert num_fewshot == 0, "GSM8K Reasoning is designed for zero-shot evaluation only"
-        super().__init__(num_fewshot)
+        super().__init__(num_fewshot=num_fewshot, dataloader=dataloader)
         self.stop_sequences: list[str] = []
 
     def post_process_generated_completion(self, completion_text: str, sample: Sample | None = None) -> str:
diff --git a/src/eval_framework/tasks/benchmarks/mbpp.py b/src/eval_framework/tasks/benchmarks/mbpp.py
index e2fc409a..9dec4c49 100644
--- a/src/eval_framework/tasks/benchmarks/mbpp.py
+++ b/src/eval_framework/tasks/benchmarks/mbpp.py
@@ -8,6 +8,7 @@
 )
 from eval_framework.shared.types import BaseMetricContext
 from eval_framework.tasks.base import BaseTask, Language, ResponseType, Sample
+from eval_framework.tasks.dataloader import Dataloader
 
 logger = logging.getLogger(__name__)
 
@@ -38,8 +39,8 @@ class MBPP(BaseTask[str]):
     SUBJECTS = ["full"]  # , "sanitized"]  # these are HF dataset SUBSETS!
     LANGUAGE = Language.ENG
 
-    def __init__(self, num_fewshot: int = 0) -> None:
-        super().__init__(num_fewshot)
+    def __init__(self, dataloader: Dataloader, num_fewshot: int = 0) -> None:
+        super().__init__(num_fewshot=num_fewshot, dataloader=dataloader)
         self.stop_sequences = [END]
 
     @staticmethod
diff --git a/src/eval_framework/tasks/benchmarks/mmlu.py b/src/eval_framework/tasks/benchmarks/mmlu.py
index 3ca680bb..bfeed56e 100644
--- a/src/eval_framework/tasks/benchmarks/mmlu.py
+++ b/src/eval_framework/tasks/benchmarks/mmlu.py
@@ -7,6 +7,7 @@
     AccuracyNormLoglikelihood,
 )
 from eval_framework.tasks.base import BaseTask, Language, ResponseType, Sample
+from eval_framework.tasks.dataloader import Dataloader
 from eval_framework.tasks.utils import get_n_letters
 
 MMLU_SUBJECTS = [
@@ -83,9 +84,8 @@ class MMLU(BaseTask[str]):
     PERTURBATION_UNMODIFIABLE_WORDS = ["Question", "Answer"] + get_n_letters(4)
     LANGUAGE = Language.ENG
 
-    def __init__(self, num_fewshot: int = 0) -> None:
-        super().__init__(num_fewshot)
-
+    def __init__(self, dataloader: Dataloader, num_fewshot: int = 0) -> None:
+        super().__init__(num_fewshot=num_fewshot, dataloader=dataloader)
         self.keys = get_n_letters(4)
 
     def _get_subject_name(self, item: dict[str, Any]) -> str:
@@ -152,9 +152,9 @@ class MMLU_COT(MMLU):
 
     ANS_RE = re.compile(r"Therefore, the answer is: ([ABCD])")
 
-    def __init__(self, num_fewshot: int = 0) -> None:
+    def __init__(self, dataloader: Dataloader, num_fewshot: int = 0) -> None:
         assert num_fewshot == 0, "Fewshot is not supported for MMLU_COT"
-        super().__init__(num_fewshot)
+        super().__init__(num_fewshot=num_fewshot, dataloader=dataloader)
         self.stop_sequences: list[str] = ["Question:"]
 
     def _extract_answer(self, completion: str) -> str:
diff --git a/src/eval_framework/tasks/benchmarks/mmlu_de.py b/src/eval_framework/tasks/benchmarks/mmlu_de.py
index 8dd88df3..5b1ab6ac 100644
--- a/src/eval_framework/tasks/benchmarks/mmlu_de.py
+++ b/src/eval_framework/tasks/benchmarks/mmlu_de.py
@@ -5,6 +5,7 @@
     AccuracyNormLoglikelihood,
 )
 from eval_framework.tasks.base import BaseTask, Language, ResponseType
+from eval_framework.tasks.dataloader import Dataloader
 from eval_framework.tasks.utils import get_n_letters
 
 MMLU_SUBJECTS_TRANSLATION = {
@@ -81,8 +82,8 @@ class MMLU_DE(BaseTask[str]):
     PERTURBATION_UNMODIFIABLE_WORDS = ["Frage"] + get_n_letters(4)
     LANGUAGE = Language.DEU
 
-    def __init__(self, num_fewshot: int = 0) -> None:
-        super().__init__(num_fewshot)
+    def __init__(self, dataloader: Dataloader, num_fewshot: int = 0) -> None:
+        super().__init__(num_fewshot=num_fewshot, dataloader=dataloader)
 
         self.keys = get_n_letters(4)
 
diff --git a/src/eval_framework/tasks/benchmarks/mmlu_pro.py b/src/eval_framework/tasks/benchmarks/mmlu_pro.py
index a8ae5c62..06f8550f 100644
--- a/src/eval_framework/tasks/benchmarks/mmlu_pro.py
+++ b/src/eval_framework/tasks/benchmarks/mmlu_pro.py
@@ -8,6 +8,7 @@
     AccuracyNormLoglikelihood,
 )
 from eval_framework.tasks.base import NO_SUBJECT, RANDOM_SEED, BaseTask, Language, ResponseType, Sample
+from eval_framework.tasks.dataloader import Dataloader
 from eval_framework.tasks.utils import get_n_letters
 
 MMLU_PRO_SUBJECTS = [
@@ -41,15 +42,14 @@ class MMLU_PRO(BaseTask[str]):
     PERTURBATION_UNMODIFIABLE_WORDS = get_n_letters(10)
     LANGUAGE = Language.ENG
 
-    def __init__(self, num_fewshot: int = 0) -> None:
-        super().__init__(num_fewshot)
-
+    def __init__(self, dataloader: Dataloader, num_fewshot: int = 0) -> None:
+        super().__init__(num_fewshot=num_fewshot, dataloader=dataloader)
         self.keys = get_n_letters(10)
 
     def _load_dataset(self, subject: str) -> None:
         name = subject if subject != NO_SUBJECT else None
 
-        hf_dataset = self._load_hf_dataset(path=self.DATASET_PATH)
+        hf_dataset = self.dataloader.load(path=self.DATASET_PATH, revision=self.HF_REVISION)
 
         hf_dataset = hf_dataset.filter(lambda example: example["category"] == name)
 
@@ -97,9 +97,9 @@ class MMLU_PRO_COT(MMLU_PRO):
     )
     ANS_RE = re.compile(r"Therefore, the answer is \(([ABCDEFGHIJ])\)")
 
-    def __init__(self, num_fewshot: int = 0) -> None:
+    def __init__(self, dataloader: Dataloader, num_fewshot: int = 0) -> None:
         assert num_fewshot == 0, "Fewshot is not supported for MMLU_PRO_COT"
-        super().__init__(num_fewshot)
+        super().__init__(num_fewshot=num_fewshot, dataloader=dataloader)
         self.stop_sequences: list[str] = ["Question:"]
 
     def _extract_answer(self, completion: str) -> str:
diff --git a/src/eval_framework/tasks/benchmarks/mmmlu.py b/src/eval_framework/tasks/benchmarks/mmmlu.py
index e15aff52..caf45335 100644
--- a/src/eval_framework/tasks/benchmarks/mmmlu.py
+++ b/src/eval_framework/tasks/benchmarks/mmmlu.py
@@ -11,6 +11,7 @@
 )
 from eval_framework.tasks.base import RANDOM_SEED, BaseTask, Language, ResponseType, Sample
 from eval_framework.tasks.benchmarks.mmlu import MMLU_SUBJECTS
+from eval_framework.tasks.dataloader import Dataloader
 from eval_framework.tasks.utils import get_n_letters
 
 MMMLU_LANGS = ["FR_FR", "DE_DE", "ES_LA", "IT_IT", "PT_BR", "AR_XY"]
@@ -432,13 +433,13 @@ class MMMLU(BaseTask[tuple[str, str]]):
         for subject in subjects
     }
 
-    def __init__(self, num_fewshot: int = 0) -> None:
-        super().__init__(num_fewshot)
+    def __init__(self, dataloader: Dataloader, num_fewshot: int = 0) -> None:
+        super().__init__(num_fewshot=num_fewshot, dataloader=dataloader)
         self.keys = get_n_letters(4)
 
     def _load_dataset(self, subject: tuple[str, str]) -> None:
         lang, current_subject = subject
-        hf_dataset = self._load_hf_dataset(path=self.DATASET_PATH, name=lang)
+        hf_dataset = self.dataloader.load(path=self.DATASET_PATH, name=lang, revision=self.HF_REVISION)
 
         self.dataset = {}
         self.rnd = random.Random(RANDOM_SEED)
@@ -490,9 +491,9 @@ class MMMLU_GERMAN_COT(MMMLU):
 
     ANS_RE = re.compile(r"Daher lautet die Antwort: ([ABCD])")
 
-    def __init__(self, num_fewshot: int = 0) -> None:
+    def __init__(self, dataloader: Dataloader, num_fewshot: int = 0) -> None:
         assert num_fewshot == 0, "Fewshot is not supported for MMMLU_GERMAN_COT"
-        super().__init__(num_fewshot)
+        super().__init__(num_fewshot=num_fewshot, dataloader=dataloader)
         self.stop_sequences: list[str] = ["Frage:", "Question:"]
 
     def _extract_answer(self, completion: str) -> str:
diff --git a/src/eval_framework/tasks/benchmarks/openbookqa.py b/src/eval_framework/tasks/benchmarks/openbookqa.py
index 27da64cf..9fe6b004 100644
--- a/src/eval_framework/tasks/benchmarks/openbookqa.py
+++ b/src/eval_framework/tasks/benchmarks/openbookqa.py
@@ -5,6 +5,7 @@
     AccuracyNormLoglikelihood,
 )
 from eval_framework.tasks.base import BaseTask, Language, ResponseType
+from eval_framework.tasks.dataloader import Dataloader
 from eval_framework.tasks.utils import get_n_letters
 
 
@@ -21,8 +22,8 @@ class OPENBOOKQA(BaseTask[str]):
     PERTURBATION_UNMODIFIABLE_WORDS = get_n_letters(4)
     LANGUAGE = Language.ENG
 
-    def __init__(self, num_fewshot: int = 0) -> None:
-        super().__init__(num_fewshot)
+    def __init__(self, dataloader: Dataloader, num_fewshot: int = 0) -> None:
+        super().__init__(num_fewshot=num_fewshot, dataloader=dataloader)
         self.keys = get_n_letters(4)
         self.num_to_letter = {str(i): letter for i, letter in enumerate(self.keys, start=1)}
 
diff --git a/src/eval_framework/tasks/benchmarks/opengptx_eu20.py b/src/eval_framework/tasks/benchmarks/opengptx_eu20.py
index 97fcf38d..b6fbebe5 100644
--- a/src/eval_framework/tasks/benchmarks/opengptx_eu20.py
+++ b/src/eval_framework/tasks/benchmarks/opengptx_eu20.py
@@ -140,7 +140,7 @@ def _load_dataset(self, subject: SubjectType) -> None:
         subject names to huggingface."""
         self.target_identifier = f"{str(subject)}_targets"
 
-        hf_dataset = self._load_hf_dataset(path=self.DATASET_PATH, name="mc_DE")
+        hf_dataset = self.dataloader.load(path=self.DATASET_PATH, name="mc_DE", revision=self.HF_REVISION)
 
         self.dataset = {}
         self.rnd = random.Random(RANDOM_SEED)
@@ -186,7 +186,7 @@ class TRUTHFULQA_EU20_FR(TRUTHFULQA):
 
     def _load_dataset(self, subject: SubjectType) -> None:
         self.target_identifier = f"{str(subject)}_targets"
 
-        hf_dataset = self._load_hf_dataset(path=self.DATASET_PATH, name="mc_FR")
+        hf_dataset = self.dataloader.load(path=self.DATASET_PATH, name="mc_FR", revision=self.HF_REVISION)
 
         self.dataset = {}
         self.rnd = random.Random(RANDOM_SEED)
@@ -220,10 +220,10 @@ class MMLU_EU20_DE(MMLU):
     PERTURBATION_UNMODIFIABLE_WORDS = MMLU.PERTURBATION_UNMODIFIABLE_WORDS + ["Frage"]
     LANGUAGE = Language.DEU
 
-    def _load_dataset(self, subject: SubjectType) -> None:
+    def _load_dataset(self, subject: str) -> None:
         name = subject if subject != NO_SUBJECT else None
 
-        hf_dataset = self._load_hf_dataset(path=self.DATASET_PATH, name=name)
+        hf_dataset = self.dataloader.load(path=self.DATASET_PATH, name=name, revision=self.HF_REVISION)
 
         self.dataset = {}
         self.rnd = random.Random(RANDOM_SEED)
@@ -326,10 +326,10 @@ class MMLU_EU20_FR(MMLU):
     SUBJECTS = [i + "_FR" for i in MMLU_SUBJECTS]
     LANGUAGE = Language.FRA
 
-    def _load_dataset(self, subject: SubjectType) -> None:
+    def _load_dataset(self, subject: str) -> None:
         name = subject if subject != NO_SUBJECT else None
 
-        hf_dataset = self._load_hf_dataset(path=self.DATASET_PATH, name=name)
+        hf_dataset = self.dataloader.load(path=self.DATASET_PATH, name=name, revision=self.HF_REVISION)
 
         self.dataset = {}
         self.rnd = random.Random(RANDOM_SEED)
diff --git a/src/eval_framework/tasks/benchmarks/pawsx.py b/src/eval_framework/tasks/benchmarks/pawsx.py
index 629d3614..59a33c5a 100644
--- a/src/eval_framework/tasks/benchmarks/pawsx.py
+++ b/src/eval_framework/tasks/benchmarks/pawsx.py
@@ -18,9 +18,6 @@ class PAWSX(BaseTask[str]):
     PERTURBATION_UNMODIFIABLE_WORDS = ["Ja", "Nein", "Paraphrasen", "Yes", "No", "paraphrases"]
     LANGUAGE = {"en": Language.ENG, "de": Language.DEU}
 
-    def __init__(self, num_fewshot: int = 0) -> None:
-        self.num_fewshot = num_fewshot
-
     def _get_instruction_text(self, item: dict[str, Any]) -> str:
         # PARAPHRASUS seems to use English prompt for all languages but that's a bit weird, let's do it properly.
match item["subject"]: diff --git a/src/eval_framework/tasks/benchmarks/quality.py b/src/eval_framework/tasks/benchmarks/quality.py index ed231657..43ceac64 100644 --- a/src/eval_framework/tasks/benchmarks/quality.py +++ b/src/eval_framework/tasks/benchmarks/quality.py @@ -6,6 +6,7 @@ AccuracyNormLoglikelihood, ) from eval_framework.tasks.base import RANDOM_SEED, BaseTask, Language, ResponseType, SubjectType +from eval_framework.tasks.dataloader import Dataloader class QUALITY(BaseTask[str]): @@ -20,12 +21,12 @@ class QUALITY(BaseTask[str]): PERTURBATION_UNMODIFIABLE_WORDS = ["Article", "Question", "Answer"] LANGUAGE = Language.ENG - def __init__(self, num_fewshot: int = 0) -> None: + def __init__(self, dataloader: Dataloader, num_fewshot: int = 0) -> None: assert num_fewshot == 0, "QuALITY only supports zero fewshot examples" - super().__init__(num_fewshot) + super().__init__(num_fewshot=num_fewshot, dataloader=dataloader) def _load_dataset(self, subject: SubjectType) -> None: - hf_dataset = self._load_hf_dataset(path=self.DATASET_PATH) + hf_dataset = self.dataloader.load(path=self.DATASET_PATH) self.dataset = {} self.rnd = random.Random(RANDOM_SEED) diff --git a/src/eval_framework/tasks/benchmarks/sphyr.py b/src/eval_framework/tasks/benchmarks/sphyr.py index 2bebd01b..74a9d82d 100644 --- a/src/eval_framework/tasks/benchmarks/sphyr.py +++ b/src/eval_framework/tasks/benchmarks/sphyr.py @@ -2,6 +2,7 @@ from eval_framework.metrics.completion_metrics.grid_difference import GridDifference from eval_framework.tasks.base import BaseTask, Language, ResponseType +from eval_framework.tasks.dataloader import Dataloader SUBJECTS = [ "1_random_cell_easy", @@ -59,9 +60,9 @@ class SPHYR(BaseTask[str]): PERTURBATION_UNMODIFIABLE_WORDS = None LANGUAGE = Language.ENG - def __init__(self, num_fewshot: int = 0) -> None: + def __init__(self, dataloader: Dataloader, num_fewshot: int = 0) -> None: assert num_fewshot == 0, "Fewshot is not supported for SPHYR" - super().__init__(num_fewshot) + super().__init__(num_fewshot=num_fewshot, dataloader=dataloader) def _get_system_prompt_text(self, item: dict[str, Any]) -> str | None: FILL_INSTRUCTION = EASY_FILL_INSTRUCTION if "easy" in item["subject"] else HARD_FILL_INSTRUCTION diff --git a/src/eval_framework/tasks/benchmarks/squad.py b/src/eval_framework/tasks/benchmarks/squad.py index e25c77b7..0ea1b746 100644 --- a/src/eval_framework/tasks/benchmarks/squad.py +++ b/src/eval_framework/tasks/benchmarks/squad.py @@ -3,7 +3,8 @@ from eval_framework.metrics.completion_metrics.accuracy_completion import AccuracyCompletion from eval_framework.metrics.completion_metrics.f1 import F1 -from eval_framework.tasks.base import NO_SUBJECT, RANDOM_SEED, BaseTask, Language, ResponseType, SubjectType +from eval_framework.tasks.base import NO_SUBJECT, RANDOM_SEED, BaseTask, Language, ResponseType +from eval_framework.tasks.dataloader import Dataloader class SQUAD2(BaseTask[str]): @@ -20,16 +21,16 @@ class SQUAD2(BaseTask[str]): PERTURBATION_UNMODIFIABLE_WORDS = ["Question", "Answer", "Context", "unanswerable"] LANGUAGE = Language.ENG - def __init__(self, num_fewshot: int = 0) -> None: - super().__init__(num_fewshot) + def __init__(self, dataloader: Dataloader, num_fewshot: int = 0) -> None: + super().__init__(num_fewshot=num_fewshot, dataloader=dataloader) self.stop_sequences = [".\n"] self.max_tokens = 300 # the max length of the ground truth is 160 characters while the average is ~19 self.rnd_choice_shuffle = random.Random() - def _load_dataset(self, subject: SubjectType) -> None: + 
def _load_dataset(self, subject: str) -> None: name = subject if subject != NO_SUBJECT else None - hf_dataset = self._load_hf_dataset(path=self.DATASET_PATH, name=name) + hf_dataset = self.dataloader.load(path=self.DATASET_PATH, name=name) self.dataset = {} self.rnd = random.Random(RANDOM_SEED) diff --git a/src/eval_framework/tasks/benchmarks/struct_eval.py b/src/eval_framework/tasks/benchmarks/struct_eval.py index 8a35d904..da86325a 100644 --- a/src/eval_framework/tasks/benchmarks/struct_eval.py +++ b/src/eval_framework/tasks/benchmarks/struct_eval.py @@ -7,6 +7,7 @@ from eval_framework.metrics.completion_metrics.struct_eval_metrics import RenderableStructMetric, StructMetric from eval_framework.tasks.base import RANDOM_SEED, BaseTask, Language, ResponseType, Sample +from eval_framework.tasks.dataloader import Dataloader StructEvalSubjects = [ "CSV to YAML", @@ -43,13 +44,13 @@ class StructEval(BaseTask[str]): SUBJECTS = StructEvalSubjects LANGUAGE = Language.ENG - def __init__(self, num_fewshot: int = 0) -> None: + def __init__(self, dataloader: Dataloader, num_fewshot: int = 0) -> None: if num_fewshot > 0: raise ValueError("StructEval only supports zero-shot evaluation.") - super().__init__(num_fewshot) + super().__init__(num_fewshot=num_fewshot, dataloader=dataloader) def _load_dataset(self, subject: str) -> None: - hf_dataset = self._load_hf_dataset(path=self.DATASET_PATH) + hf_dataset = self.dataloader.load(path=self.DATASET_PATH, revision=self.HF_REVISION) assert isinstance(hf_dataset, DatasetDict), "Expected a Hugging Face Dataset object." hf_dataset = hf_dataset.filter(lambda item: item["task_name"] == subject, num_proc=os.cpu_count()) self.dataset = {} diff --git a/src/eval_framework/tasks/benchmarks/tablebench.py b/src/eval_framework/tasks/benchmarks/tablebench.py index 7b9b5d1a..a9e06fe3 100644 --- a/src/eval_framework/tasks/benchmarks/tablebench.py +++ b/src/eval_framework/tasks/benchmarks/tablebench.py @@ -9,6 +9,7 @@ from eval_framework.exceptions import LogicError from eval_framework.metrics.completion_metrics.rouge_l import ROUGE_L from eval_framework.tasks.base import RANDOM_SEED, BaseTask, Language, ResponseType, Sample +from eval_framework.tasks.dataloader import Dataloader from eval_framework.tasks.utils import run_python_code from template_formatting.formatter import Role @@ -41,13 +42,13 @@ class TableBench(BaseTask[tuple[str, str]]): SUBJECTS = list(product(TABLE_BENCH_INSTRUCTION_TYPES, TABLE_BENCH_SUBJECTS)) LANGUAGE = Language.ENG - def __init__(self, num_fewshot: int = 0) -> None: + def __init__(self, dataloader: Dataloader, num_fewshot: int = 0) -> None: assert num_fewshot == 0, "Fewshot is not supported for TableBench" - super().__init__(num_fewshot) + super().__init__(num_fewshot=num_fewshot, dataloader=dataloader) def _load_dataset(self, subject: tuple[str, str]) -> None: instruction_type, qtype = subject - hf_dataset = self._load_hf_dataset(path=self.DATASET_PATH, name=None) + hf_dataset = self.dataloader.load(path=self.DATASET_PATH, name=None, revision=self.HF_REVISION) self.dataset = {} self.rnd = random.Random(RANDOM_SEED) diff --git a/src/eval_framework/tasks/benchmarks/triviaqa.py b/src/eval_framework/tasks/benchmarks/triviaqa.py index 4996735e..80263004 100644 --- a/src/eval_framework/tasks/benchmarks/triviaqa.py +++ b/src/eval_framework/tasks/benchmarks/triviaqa.py @@ -4,6 +4,7 @@ from eval_framework.metrics.completion_metrics.accuracy_completion import AccuracyCompletion from eval_framework.metrics.completion_metrics.f1 import F1 from 
eval_framework.tasks.base import BaseTask, Language, ResponseType, Sample +from eval_framework.tasks.dataloader import Dataloader class TRIVIAQA(BaseTask[str]): @@ -19,8 +20,8 @@ class TRIVIAQA(BaseTask[str]): PERTURBATION_UNMODIFIABLE_WORDS = ["Question", "Answer"] LANGUAGE = Language.ENG - def __init__(self, num_fewshot: int = 0) -> None: - super().__init__(num_fewshot) + def __init__(self, dataloader: Dataloader, num_fewshot: int = 0) -> None: + super().__init__(num_fewshot=num_fewshot, dataloader=dataloader) self.stop_sequences = ["\n"] self.max_tokens = 400 # the max length of the ground truth is 282 characters while the average is ~16 self.rnd_choice_shuffle = random.Random() diff --git a/src/eval_framework/tasks/benchmarks/truthfulqa.py b/src/eval_framework/tasks/benchmarks/truthfulqa.py index 5f2f468c..8498ac12 100644 --- a/src/eval_framework/tasks/benchmarks/truthfulqa.py +++ b/src/eval_framework/tasks/benchmarks/truthfulqa.py @@ -7,6 +7,7 @@ ) from eval_framework.metrics.loglikelihood_metrics.probability_mass import ProbabilityMass, ProbabilityMassNorm from eval_framework.tasks.base import RANDOM_SEED, BaseTask, Language, ResponseType, SubjectType +from eval_framework.tasks.dataloader import Dataloader # fewshot examples from Appendix E in https://arxiv.org/pdf/2109.07958 FEWSHOT_ITEMS = [ @@ -45,9 +46,9 @@ class TRUTHFULQA(BaseTask[str]): FEWSHOT_ITEMS = FEWSHOT_ITEMS LANGUAGE = Language.ENG - def __init__(self, num_fewshot: int = 0) -> None: + def __init__(self, dataloader: Dataloader, num_fewshot: int = 0) -> None: assert num_fewshot <= 6, f"Fewshot larger than 6 is not supported for {self.NAME}" - super().__init__(num_fewshot) + super().__init__(num_fewshot=num_fewshot, dataloader=dataloader) def _load_dataset(self, subject: SubjectType) -> None: """The original dataset only provides one subject 'multiple_choice', but with multiple target columns @@ -56,7 +57,7 @@ def _load_dataset(self, subject: SubjectType) -> None: subject names to huggingface.""" self.target_identifier = f"{subject}_targets" - hf_dataset = self._load_hf_dataset(path=self.DATASET_PATH, name="multiple_choice") + hf_dataset = self.dataloader.load(path=self.DATASET_PATH, name="multiple_choice") self.dataset = {} self.rnd = random.Random(RANDOM_SEED) diff --git a/src/eval_framework/tasks/benchmarks/wmt.py b/src/eval_framework/tasks/benchmarks/wmt.py index 98daa760..65c8f2f9 100644 --- a/src/eval_framework/tasks/benchmarks/wmt.py +++ b/src/eval_framework/tasks/benchmarks/wmt.py @@ -9,6 +9,7 @@ from eval_framework.metrics.completion_metrics.chrf import LINEWISE_CHRF from eval_framework.metrics.completion_metrics.ter import LINEWISE_TER from eval_framework.tasks.base import RANDOM_SEED, BaseTask, Language, ResponseType, Sample +from eval_framework.tasks.dataloader import Dataloader class WMT(BaseTask[str], ABC): @@ -22,8 +23,8 @@ class WMT(BaseTask[str], ABC): METRICS = [LINEWISE_BLEU, LINEWISE_CHRF, LINEWISE_TER] PERTURBATION_UNMODIFIABLE_WORDS = ["phrase"] - def __init__(self, num_fewshot: int = 0) -> None: - super().__init__(num_fewshot) + def __init__(self, dataloader: Dataloader, num_fewshot: int = 0) -> None: + super().__init__(num_fewshot=num_fewshot, dataloader=dataloader) self.stop_sequences: list[str] = [".\n", " phrase: ", "phrase:", "phrase: ", " phrase:", "\n\n"] def _load_dataset(self, subject: str | None) -> None: @@ -103,8 +104,8 @@ class WMT_INSTRUCT(WMT): PERTURBATION_UNMODIFIABLE_WORDS = ["Please", "translate"] COMPLETION_PREFIX = "This is the translation:" - def __init__(self, num_fewshot: int 
= 0) -> None: - super().__init__(num_fewshot) + def __init__(self, dataloader: Dataloader, num_fewshot: int = 0) -> None: + super().__init__(num_fewshot=num_fewshot, dataloader=dataloader) self.stop_sequences: list[str] = ["Please translate"] def _get_instruction_text(self, item: dict[str, Any]) -> str: diff --git a/src/eval_framework/tasks/benchmarks/zero_scrolls.py b/src/eval_framework/tasks/benchmarks/zero_scrolls.py index 12984483..a9a99cdd 100644 --- a/src/eval_framework/tasks/benchmarks/zero_scrolls.py +++ b/src/eval_framework/tasks/benchmarks/zero_scrolls.py @@ -8,6 +8,7 @@ AccuracyLoglikelihood, ) from eval_framework.tasks.base import BaseTask, Language, ResponseType, Sample +from eval_framework.tasks.dataloader import Dataloader from eval_framework.tasks.utils import get_n_letters @@ -25,9 +26,9 @@ class ZERO_SCROLLS_QUALITY(BaseTask[str]): PERTURBATION_UNMODIFIABLE_WORDS = ["Answer"] LANGUAGE = Language.ENG - def __init__(self, num_fewshot: int = 0) -> None: + def __init__(self, dataloader: Dataloader, num_fewshot: int = 0) -> None: assert num_fewshot == 0, "ZeroSCROLLS QuALITY only supports zero fewshot examples" - super().__init__(num_fewshot) + super().__init__(num_fewshot=num_fewshot, dataloader=dataloader) self.keys = get_n_letters(4) def _get_instruction_text(self, item: dict[str, Any]) -> str: @@ -62,9 +63,9 @@ class ZERO_SCROLLS_GOV_REPORT(ZERO_SCROLLS_COMPLETION): SUBJECTS = ["gov_report"] PERTURBATION_UNMODIFIABLE_WORDS = ["Summary"] - def __init__(self, num_fewshot: int = 0) -> None: + def __init__(self, dataloader: Dataloader, num_fewshot: int = 0) -> None: assert num_fewshot == 0, "ZeroSCROLLS GovReport only supports zero fewshot examples" - super().__init__(num_fewshot) + super().__init__(num_fewshot=num_fewshot, dataloader=dataloader) def _get_instruction_text(self, item: dict[str, Any]) -> str: query_end_index = item["query_end_index"] @@ -77,9 +78,9 @@ class ZERO_SCROLLS_QMSUM(ZERO_SCROLLS_COMPLETION): SUBJECTS = ["qmsum"] PERTURBATION_UNMODIFIABLE_WORDS = ["Answer"] - def __init__(self, num_fewshot: int = 0) -> None: + def __init__(self, dataloader: Dataloader, num_fewshot: int = 0) -> None: assert num_fewshot == 0, "ZeroSCROLLS QMSum only supports zero fewshot examples" - super().__init__(num_fewshot) + super().__init__(num_fewshot=num_fewshot, dataloader=dataloader) def _get_instruction_text(self, item: dict[str, Any]) -> str: query_end_index = item["query_end_index"] @@ -92,9 +93,9 @@ class ZERO_SCROLLS_SQUALITY(ZERO_SCROLLS_COMPLETION): SUBJECTS = ["squality"] PERTURBATION_UNMODIFIABLE_WORDS = ["Answer"] - def __init__(self, num_fewshot: int = 0) -> None: + def __init__(self, dataloader: Dataloader, num_fewshot: int = 0) -> None: assert num_fewshot == 0, "ZeroSCROLLS SQuALITY only supports zero fewshot examples" - super().__init__(num_fewshot) + super().__init__(num_fewshot=num_fewshot, dataloader=dataloader) def _get_instruction_text(self, item: dict[str, Any]) -> str: query_end_index = item["query_end_index"] @@ -107,9 +108,9 @@ class ZERO_SCROLLS_QASPER(ZERO_SCROLLS_COMPLETION): SUBJECTS = ["qasper"] PERTURBATION_UNMODIFIABLE_WORDS = ["Answer"] - def __init__(self, num_fewshot: int = 0) -> None: + def __init__(self, dataloader: Dataloader, num_fewshot: int = 0) -> None: assert num_fewshot == 0, "ZeroSCROLLS Qasper only supports zero fewshot examples" - super().__init__(num_fewshot) + super().__init__(num_fewshot=num_fewshot, dataloader=dataloader) def _get_instruction_text(self, item: dict[str, Any]) -> str: query_end_index = item["query_end_index"] @@ 
diff --git a/src/eval_framework/tasks/dataloader.py b/src/eval_framework/tasks/dataloader.py
new file mode 100644
index 00000000..a787ab1c
--- /dev/null
+++ b/src/eval_framework/tasks/dataloader.py
@@ -0,0 +1,63 @@
+import os
+from abc import ABC, abstractmethod
+from pathlib import Path
+from typing import Union
+
+from datasets import Dataset, DatasetDict, DownloadConfig, Features, IterableDataset, IterableDatasetDict, load_dataset
+from huggingface_hub import HfApi
+from huggingface_hub.errors import RevisionNotFoundError
+
+
+class Dataloader(ABC):
+    @abstractmethod
+    def load(
+        self,
+        path: str,
+        name: str | None = None,
+        revision: str | None = None,
+        features: Features | None = None,
+        streaming: bool = False,
+    ) -> Union[DatasetDict, Dataset, IterableDatasetDict, IterableDataset]:
+        pass
+
+
+class HFDataloader(Dataloader):
+    def load(
+        self,
+        path: str,
+        name: str | None = None,
+        revision: str | None = None,
+        features: Features | None = None,
+        streaming: bool = False,
+    ) -> Union[DatasetDict, Dataset, IterableDatasetDict, IterableDataset]:
+        # Check that the requested revision exists before loading the dataset
+        if revision:
+            try:
+                _ = HfApi().dataset_info(repo_id=path, revision=revision, timeout=100.0)
+            except Exception as e:
+                if isinstance(e, RevisionNotFoundError):
+                    raise e
+
+        cache_dir = os.environ.get("HF_DATASET_CACHE_DIR", f"{Path.home()}/.cache/huggingface/datasets")
+        download_config = DownloadConfig(cache_dir=cache_dir, max_retries=5)
+        try:
+            return load_dataset(
+                path=path,
+                name=name,
+                revision=revision,
+                trust_remote_code=True,
+                cache_dir=cache_dir,
+                download_config=download_config,
+                features=features,
+                streaming=streaming,
+            )
+        except Exception:
+            return load_dataset(
+                path=path,
+                name=name,
+                revision=revision,
+                trust_remote_code=True,
+                cache_dir=f"{Path.home()}/.cache/eval-framework",
+                features=features,
+                streaming=streaming,
+            )
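Two behaviors of HFDataloader.load are easy to miss in the hunk above: a pinned revision is verified against the Hub before any download, but only RevisionNotFoundError aborts (transient API errors fall through to load_dataset), and any failure with the primary cache triggers a second attempt against ~/.cache/eval-framework. Calling it directly looks like this (sketch; "truthful_qa" stands in for a task's DATASET_PATH, which this diff does not show):

from eval_framework.tasks.dataloader import HFDataloader

loader = HFDataloader()
# Equivalent to what TRUTHFULQA._load_dataset now does via self.dataloader.
dataset = loader.load(path="truthful_qa", name="multiple_choice")
first_split = next(iter(dataset.values()))  # a DatasetDict maps split names to Datasets
print(first_split[0])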
diff --git a/tests/tasks/test_infinitebench.py b/tests/tasks/test_infinitebench.py
index 36d9142a..e2271dfe 100644
--- a/tests/tasks/test_infinitebench.py
+++ b/tests/tasks/test_infinitebench.py
@@ -9,13 +9,14 @@
     InfiniteBench_RetrieveNumber,
     InfiniteBench_RetrievePassKey1,
 )
+from eval_framework.tasks.dataloader import HFDataloader
 from tests.utils import DatasetPatcher


 class Test_InfiniteBench_CodeRun:
     @pytest.fixture
     def task(self) -> InfiniteBench_CodeRun:
-        return InfiniteBench_CodeRun(0)
+        return InfiniteBench_CodeRun(num_fewshot=0, dataloader=HFDataloader())

     def test_InfiniteBench_CodeRun_postprocessing(self, task: InfiniteBench_CodeRun) -> None:
         assert (
diff --git a/tests/tasks/test_struct_eval.py b/tests/tasks/test_struct_eval.py
index 3a4525d5..e14d49a3 100644
--- a/tests/tasks/test_struct_eval.py
+++ b/tests/tasks/test_struct_eval.py
@@ -1,12 +1,13 @@
 import pytest

 from eval_framework.tasks.benchmarks.struct_eval import StructEval
+from eval_framework.tasks.dataloader import HFDataloader


 class TestStructEval:
     @pytest.fixture
     def struct_eval_task(self) -> StructEval:
-        return StructEval(0)
+        return StructEval(num_fewshot=0, dataloader=HFDataloader())

     @pytest.mark.parametrize("subject", StructEval.SUBJECTS)
     def test_struct_eval_task_loads_dataset_for_subjects(self, struct_eval_task: StructEval, subject: str) -> None:
diff --git a/tests/tasks/test_zero_scrolls_space_digest.py b/tests/tasks/test_zero_scrolls_space_digest.py
index 3fb9f9b5..d43e6fca 100644
--- a/tests/tasks/test_zero_scrolls_space_digest.py
+++ b/tests/tasks/test_zero_scrolls_space_digest.py
@@ -3,6 +3,7 @@
 from eval_framework.tasks.benchmarks.zero_scrolls import (
     ZERO_SCROLLS_SPACE_DIGEST,
 )
+from eval_framework.tasks.dataloader import HFDataloader


 @pytest.mark.parametrize(
@@ -39,6 +40,6 @@
     ],
 )
 def test_post_process_generated_completion(completion_text: str, expected_result: str) -> None:
-    task = ZERO_SCROLLS_SPACE_DIGEST(num_fewshot=0)
+    task = ZERO_SCROLLS_SPACE_DIGEST(num_fewshot=0, dataloader=HFDataloader())
     result = task.post_process_generated_completion(completion_text)
     assert result == expected_result
diff --git a/tests/test_response_generator.py b/tests/test_response_generator.py
index 60b992c5..cba39558 100644
--- a/tests/test_response_generator.py
+++ b/tests/test_response_generator.py
@@ -11,6 +11,7 @@
 from eval_framework.shared.types import Completion, RawCompletion
 from eval_framework.task_names import TaskName
 from eval_framework.tasks.base import Sample
+from eval_framework.tasks.dataloader import HFDataloader
 from eval_framework.tasks.eval_config import EvalConfig
 from eval_framework.tasks.perturbation import PerturbationConfig, PerturbationType
 from template_formatting.formatter import Message, Role
@@ -433,7 +434,7 @@ def test_perturbed_response_differs(tmp_path: Path, perturbation_type: Perturbat
         save_intermediate_results=False,
     )

-    task = perturbed_eval_config.task_name.value()
+    task = perturbed_eval_config.task_name.value(HFDataloader())
     perturbed_response_generator = ResponseGenerator(MockLLM(), perturbed_eval_config, Mock(spec=ResultsFileProcessor))

     task._load_dataset(task.SUBJECTS[0])
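Each test above builds its own HFDataloader inline; if more tests migrate to the new constructor signature, a shared fixture would cut the repetition. A hypothetical sketch (hf_dataloader is not a fixture this patch adds):

import pytest

from eval_framework.tasks.dataloader import HFDataloader


@pytest.fixture(scope="session")
def hf_dataloader() -> HFDataloader:
    # HFDataloader keeps no per-call state, so one instance can serve the whole session.
    return HFDataloader()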
diff --git a/tests/utils.py b/tests/utils.py
index fdd883df..a0c8e9d1 100644
--- a/tests/utils.py
+++ b/tests/utils.py
@@ -13,6 +13,7 @@
 from eval_framework.constants import RED, RESET
 from eval_framework.result_processors.base import Result
 from eval_framework.tasks.base import BaseTask, SubjectType
+from eval_framework.tasks.dataloader import HFDataloader

 T = TypeVar("T", bound=BaseTask)

@@ -101,22 +102,21 @@ def __init__(self, task_class: type[T], num_samples: int = 2, num_fewshot: int =
         self.patch_obj = None

     def __enter__(self) -> T:
-        task = self.task_class(num_fewshot=self.num_fewshot)
+        dataloader = HFDataloader()
+        task = self.task_class(num_fewshot=self.num_fewshot, dataloader=dataloader)

-        # First, we record what arguments are passed to _load_hf_dataset for each subject
-        # We do this by patching the _load_hf_dataset method and recording the keyword arguments
-        # (_load_hf_dataset is used only with kwargs)
+        # First, we record what arguments are passed to dataloader.load for each subject
        captured_kwargs = []

         def mock_get_arguments(**kwargs: Any) -> None:
             captured_kwargs.append(kwargs)

-        with patch.object(task, "_load_hf_dataset", side_effect=mock_get_arguments):
+        with patch.object(dataloader, "load", side_effect=mock_get_arguments):
             for subject in task.SUBJECTS:
                 # We expect it to error out because mock_get_arguments returns None
                 try:
                     task._load_dataset(subject)
-                    raise Exception("_load_hf_dataset should have errored out")
+                    raise Exception("dataloader.load should have errored out")
                 except Exception:
                     pass

@@ -133,7 +133,7 @@ def mock_get_arguments(**kwargs: Any) -> None:

             # Patch HF_DATASET_CACHE_DIR environment variable when loading the dataset
             with patch.dict("os.environ", {"HF_DATASET_CACHE_DIR": str(self.tmp_cache_dir)}):
-                hf_dataset = task._load_hf_dataset(**kwargs_copy)
+                hf_dataset = dataloader.load(**kwargs_copy)

             extracted_items = {}

@@ -156,9 +156,9 @@ def mock_get_arguments(**kwargs: Any) -> None:
         if self.tmp_cache_dir.exists():
             shutil.rmtree(self.tmp_cache_dir)

-        # Then, we patch the _load_hf_dataset method to the data subsets we extracted
+        # Patch the dataloader's load method to return the data subsets we extracted
         mock_load_hf_dataset = create_mock_load_hf_dataset(task.SUBJECTS, captured_kwargs)
-        self.patch_obj = patch("eval_framework.tasks.base.BaseTask._load_hf_dataset", mock_load_hf_dataset)  # type: ignore[assignment]
+        self.patch_obj = patch.object(dataloader, "load", mock_load_hf_dataset)  # type: ignore[assignment]

         assert self.patch_obj is not None
         self.patch_obj.__enter__()
diff --git a/utils/generate-task-docs.py b/utils/generate-task-docs.py
index a8f7a6d9..7fabedfc 100644
--- a/utils/generate-task-docs.py
+++ b/utils/generate-task-docs.py
@@ -7,6 +7,7 @@

 from eval_framework.task_loader import load_extra_tasks
 from eval_framework.task_names import TaskName
+from eval_framework.tasks.dataloader import HFDataloader
 from template_formatting.formatter import BaseFormatter, ConcatFormatter, Llama3Formatter

 OUTPUT_DOCS_DIRECTORY = "docs/tasks"
@@ -71,11 +72,11 @@ def generate_docs_for_task(task_name: str, formatters: list[BaseFormatter], add_
     try:
         num_fewshot = 1
-        task = task_class(num_fewshot)
+        task = task_class(num_fewshot=num_fewshot, dataloader=HFDataloader())
     except Exception:
         try:
             num_fewshot = 0
-            task = task_class(num_fewshot)
+            task = task_class(num_fewshot=num_fewshot, dataloader=HFDataloader())
         except Exception as e:
             print(f"Failed to instantiate task {t}: {e}")
             return
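One last observation: the streaming flag threaded through Dataloader.load has no caller in this patch; it simply forwards to datasets.load_dataset. A sketch of what it enables (illustrative dataset id; assumes the dataset exposes a "validation" split):

from eval_framework.tasks.dataloader import HFDataloader

loader = HFDataloader()
# streaming=True yields an IterableDatasetDict; rows are fetched lazily instead of downloaded up front.
stream = loader.load(path="truthful_qa", name="multiple_choice", streaming=True)
first_row = next(iter(stream["validation"]))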