diff --git a/src/lighteval/logging/evaluation_tracker.py b/src/lighteval/logging/evaluation_tracker.py index 976b21c86..e58cce64c 100644 --- a/src/lighteval/logging/evaluation_tracker.py +++ b/src/lighteval/logging/evaluation_tracker.py @@ -44,7 +44,7 @@ VersionsLogger, ) from lighteval.utils.imports import is_package_available, not_installed_error_message -from lighteval.utils.utils import obj_to_markdown +from lighteval.utils.utils import obj_to_markdown, sanitize_filename logger = logging.getLogger(__name__) @@ -336,16 +336,21 @@ def load_details_datasets(self, date_id: str, task_names: list[str]) -> dict[str logger.info(f"Loading details from {output_dir_details_sub_folder}") date_id = output_dir_details_sub_folder.name # Overwrite date_id in case of latest details_datasets = {} + sanitized_task_names = {sanitize_filename(tn): tn for tn in task_names} for file in self.fs.glob(str(output_dir_details_sub_folder / f"details_*_{date_id}.parquet")): - task_name = Path(file).stem.replace("details_", "").replace(f"_{date_id}", "") - if "|".join(task_name.split("|")[:-1]) not in task_names: - logger.info(f"Skipping {task_name} because it is not in the task_names list") - continue + sanitized_task_name = Path(file).stem.replace("details_", "").replace(f"_{date_id}", "") + if sanitized_task_name in sanitized_task_names: + task_name = sanitized_task_names[sanitized_task_name] + else: + task_name = sanitized_task_name.replace("__", "|") + if "|".join(task_name.split("|")[:-1]) not in task_names: + logger.info(f"Skipping {task_name} because it is not in the task_names list") + continue dataset = load_dataset("parquet", data_files=file, split="train") details_datasets[task_name] = dataset for task_name in task_names: - if not any(task_name.startswith(task_name) for task_name in details_datasets.keys()): + if not any(task_name.startswith(tn) for tn in details_datasets.keys()): raise ValueError( f"Task {task_name} not found in details datasets. 
Check the tasks to be evaluated or the date_id used to load the details ({date_id})." ) @@ -356,7 +361,8 @@ def save_details(self, date_id: str, details_datasets: dict[str, Dataset]): self.fs.mkdirs(output_dir_details_sub_folder, exist_ok=True) logger.info(f"Saving details to {output_dir_details_sub_folder}") for task_name, dataset in details_datasets.items(): - output_file_details = output_dir_details_sub_folder / f"details_{task_name}_{date_id}.parquet" + sanitized_task_name = sanitize_filename(task_name) + output_file_details = output_dir_details_sub_folder / f"details_{sanitized_task_name}_{date_id}.parquet" with self.fs.open(str(output_file_details), "wb") as f: dataset.to_parquet(f) @@ -421,7 +427,8 @@ def push_to_hub( results_dataset.to_parquet(f"{fsspec_repo_uri}/{result_file_base_name}.parquet") for task_name, dataset in details.items(): - output_file_details = Path(date_id) / f"details_{task_name}_{date_id}.parquet" + sanitized_task_name = sanitize_filename(task_name) + output_file_details = Path(date_id) / f"details_{sanitized_task_name}_{date_id}.parquet" dataset.to_parquet(f"{fsspec_repo_uri}/{output_file_details}") self.recreate_metadata_card(repo_id) @@ -474,11 +481,15 @@ def recreate_metadata_card(self, repo_id: str) -> None: # noqa: C901 # subfile have this general format: # `2023-09-03T10-57-04.203304/details_harness|hendrycksTest-us_foreign_policy|5_2023-09-03T10-57-04.203304.parquet` + # or with sanitized names: `2023-09-03T10-57-04.203304/details_harness_hendrycksTest-us_foreign_policy__5_2023-09-03T10-57-04.203304.parquet` # in the iso date, the `:` are replaced by `-` because windows does not allow `:` in their filenames - task_name = ( + sanitized_task_name = ( details_file_regex.match(os.path.basename(sub_file)).group("task_name") # type: ignore ) - # task_name is then equal to `leaderboard|mmlu:us_foreign_policy|5` + # Reconstruct original task name by replacing underscores with pipes + # This handles both old format (with pipes) and new 
format (sanitized with underscores) + task_name = sanitized_task_name.replace("__", "|") + # task_name is then equal to `leaderboard|mmlu:us_foreign_policy|5` (or sanitized equivalent) # to be able to parse the filename as iso dates, we need to re-replace the `-` with `:` # iso_date[13] = iso_date[16] = ':' @@ -514,7 +525,9 @@ def recreate_metadata_card(self, repo_id: str) -> None: # noqa: C901 task_name_match = details_file_regex.match(filename) # type: ignore if not task_name_match: raise ValueError(f"Could not parse task name from filename: {filename}") - task_name = task_name_match.group("task_name") + sanitized_task_name = task_name_match.group("task_name") + # Reconstruct original task name by replacing underscores with pipes + task_name = sanitized_task_name.replace("__", "|") eval_date = task_name_match.group("date") sanitized_task = re.sub(r"\W", "_", task_name) diff --git a/src/lighteval/utils/cache_management.py b/src/lighteval/utils/cache_management.py index 962f8b083..de6d58e5b 100644 --- a/src/lighteval/utils/cache_management.py +++ b/src/lighteval/utils/cache_management.py @@ -37,7 +37,7 @@ from lighteval.tasks.lighteval_task import LightevalTaskConfig from lighteval.tasks.registry import Registry from lighteval.tasks.requests import Doc, SamplingMethod -from lighteval.utils.utils import as_list +from lighteval.utils.utils import as_list, sanitize_filename logger = logging.getLogger(__name__) @@ -114,7 +114,10 @@ def _load_cached_indices(self) -> dict: try: # cache_file.parts gives all the subfolders of the url, up to the file name # last 3 are task_name/task_hash/file_name.parquet, so we take -3 and -2 - task_name, task_hash = cache_file.parts[-3:-1] + sanitized_task_name, task_hash = cache_file.parts[-3:-1] + # Reconstruct original task name by replacing underscores with pipes + # This works because task names use "|" as separators and we sanitize by replacing "|" with "__" + task_name = sanitized_task_name.replace("__", "|") sampling_method = 
SamplingMethod[cache_file.stem] # removes the file extension task_id = TaskID(task_name, task_hash, sampling_method) @@ -191,7 +194,8 @@ def get_cache_path(self, task_id: TaskID) -> Path: Returns: Path: Path to the cache file for the given task and sample type """ - return self.cache_dir / task_id.task_name / task_id.task_hash / f"{task_id.sampling_method.name}.parquet" + sanitized_task_name = sanitize_filename(task_id.task_name) + return self.cache_dir / sanitized_task_name / task_id.task_hash / f"{task_id.sampling_method.name}.parquet" def get_task_id(self, task_name: str, sampling_method: SamplingMethod) -> TaskID: """Returns a unique task indentifier. Depends on the task name, diff --git a/src/lighteval/utils/utils.py b/src/lighteval/utils/utils.py index 3ab5976d8..c4be9667c 100644 --- a/src/lighteval/utils/utils.py +++ b/src/lighteval/utils/utils.py @@ -315,3 +315,17 @@ def remove_reasoning_tags(text: str, tag_pairs: list[tuple[str, str]]) -> str: break return result + + +def sanitize_filename(name: str) -> str: + """Sanitizes a filename by replacing forbidden characters. + + Replaces characters that are not allowed in filenames on various operating systems: + - Windows: < > : " / \\ | ? 
* + - Linux/Mac: / and \0 + """ + forbidden_chars = ["|", ":", "/", "\\", "<", ">", '"', "?", "*"] + sanitized = name + for char in forbidden_chars: + sanitized = sanitized.replace(char, "__") + return sanitized diff --git a/tests/reference_details/Qwen2.5-VL-3B-Instruct-vlm/details_mmmu_pro:standard-4|0_2025-11-05T15-23-34.026089.parquet b/tests/reference_details/Qwen2.5-VL-3B-Instruct-vlm/details_mmmu_pro__standard-4__0_2025-11-05T15-23-34.026089.parquet similarity index 100% rename from tests/reference_details/Qwen2.5-VL-3B-Instruct-vlm/details_mmmu_pro:standard-4|0_2025-11-05T15-23-34.026089.parquet rename to tests/reference_details/Qwen2.5-VL-3B-Instruct-vlm/details_mmmu_pro__standard-4__0_2025-11-05T15-23-34.026089.parquet diff --git a/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_agieval:aqua-rat|0_2025-11-05T14-43-47.148527.parquet b/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_agieval__aqua-rat__0_2025-11-05T14-43-47.148527.parquet similarity index 100% rename from tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_agieval:aqua-rat|0_2025-11-05T14-43-47.148527.parquet rename to tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_agieval__aqua-rat__0_2025-11-05T14-43-47.148527.parquet diff --git a/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_agieval:logiqa-en|0_2025-11-05T14-43-47.148527.parquet b/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_agieval__logiqa-en__0_2025-11-05T14-43-47.148527.parquet similarity index 100% rename from tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_agieval:logiqa-en|0_2025-11-05T14-43-47.148527.parquet rename to tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_agieval__logiqa-en__0_2025-11-05T14-43-47.148527.parquet diff --git a/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_agieval:lsat-ar|0_2025-11-05T14-43-47.148527.parquet 
b/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_agieval__lsat-ar__0_2025-11-05T14-43-47.148527.parquet similarity index 100% rename from tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_agieval:lsat-ar|0_2025-11-05T14-43-47.148527.parquet rename to tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_agieval__lsat-ar__0_2025-11-05T14-43-47.148527.parquet diff --git a/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_agieval:lsat-lr|0_2025-11-05T14-43-47.148527.parquet b/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_agieval__lsat-lr__0_2025-11-05T14-43-47.148527.parquet similarity index 100% rename from tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_agieval:lsat-lr|0_2025-11-05T14-43-47.148527.parquet rename to tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_agieval__lsat-lr__0_2025-11-05T14-43-47.148527.parquet diff --git a/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_agieval:lsat-rc|0_2025-11-05T14-43-47.148527.parquet b/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_agieval__lsat-rc__0_2025-11-05T14-43-47.148527.parquet similarity index 100% rename from tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_agieval:lsat-rc|0_2025-11-05T14-43-47.148527.parquet rename to tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_agieval__lsat-rc__0_2025-11-05T14-43-47.148527.parquet diff --git a/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_agieval:sat-en-without-passage|0_2025-11-05T14-43-47.148527.parquet b/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_agieval__sat-en-without-passage__0_2025-11-05T14-43-47.148527.parquet similarity index 100% rename from tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_agieval:sat-en-without-passage|0_2025-11-05T14-43-47.148527.parquet rename to 
tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_agieval__sat-en-without-passage__0_2025-11-05T14-43-47.148527.parquet diff --git a/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_agieval:sat-en|0_2025-11-05T14-43-47.148527.parquet b/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_agieval__sat-en__0_2025-11-05T14-43-47.148527.parquet similarity index 100% rename from tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_agieval:sat-en|0_2025-11-05T14-43-47.148527.parquet rename to tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_agieval__sat-en__0_2025-11-05T14-43-47.148527.parquet diff --git a/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_arc:challenge|25_2025-11-05T14-43-47.148527.parquet b/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_arc__challenge__25_2025-11-05T14-43-47.148527.parquet similarity index 100% rename from tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_arc:challenge|25_2025-11-05T14-43-47.148527.parquet rename to tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_arc__challenge__25_2025-11-05T14-43-47.148527.parquet diff --git a/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_bigbench_hard:causal_judgment|3_2025-11-05T14-43-47.148527.parquet b/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_bigbench_hard__causal_judgment__3_2025-11-05T14-43-47.148527.parquet similarity index 100% rename from tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_bigbench_hard:causal_judgment|3_2025-11-05T14-43-47.148527.parquet rename to tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_bigbench_hard__causal_judgment__3_2025-11-05T14-43-47.148527.parquet diff --git a/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_bigbench_hard:date_understanding|3_2025-11-05T14-43-47.148527.parquet 
b/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_bigbench_hard__date_understanding__3_2025-11-05T14-43-47.148527.parquet similarity index 100% rename from tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_bigbench_hard:date_understanding|3_2025-11-05T14-43-47.148527.parquet rename to tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_bigbench_hard__date_understanding__3_2025-11-05T14-43-47.148527.parquet diff --git a/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_bigbench_hard:disambiguation_qa|3_2025-11-05T14-43-47.148527.parquet b/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_bigbench_hard__disambiguation_qa__3_2025-11-05T14-43-47.148527.parquet similarity index 100% rename from tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_bigbench_hard:disambiguation_qa|3_2025-11-05T14-43-47.148527.parquet rename to tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_bigbench_hard__disambiguation_qa__3_2025-11-05T14-43-47.148527.parquet diff --git a/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_bigbench_hard:geometric_shapes|3_2025-11-05T14-43-47.148527.parquet b/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_bigbench_hard__geometric_shapes__3_2025-11-05T14-43-47.148527.parquet similarity index 100% rename from tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_bigbench_hard:geometric_shapes|3_2025-11-05T14-43-47.148527.parquet rename to tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_bigbench_hard__geometric_shapes__3_2025-11-05T14-43-47.148527.parquet diff --git a/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_bigbench_hard:logical_deduction_five_objects|3_2025-11-05T14-43-47.148527.parquet b/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_bigbench_hard__logical_deduction_five_objects__3_2025-11-05T14-43-47.148527.parquet similarity 
index 100% rename from tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_bigbench_hard:logical_deduction_five_objects|3_2025-11-05T14-43-47.148527.parquet rename to tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_bigbench_hard__logical_deduction_five_objects__3_2025-11-05T14-43-47.148527.parquet diff --git a/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_bigbench_hard:logical_deduction_seven_objects|3_2025-11-05T14-43-47.148527.parquet b/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_bigbench_hard__logical_deduction_seven_objects__3_2025-11-05T14-43-47.148527.parquet similarity index 100% rename from tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_bigbench_hard:logical_deduction_seven_objects|3_2025-11-05T14-43-47.148527.parquet rename to tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_bigbench_hard__logical_deduction_seven_objects__3_2025-11-05T14-43-47.148527.parquet diff --git a/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_bigbench_hard:movie_recommendation|3_2025-11-05T14-43-47.148527.parquet b/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_bigbench_hard__movie_recommendation__3_2025-11-05T14-43-47.148527.parquet similarity index 100% rename from tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_bigbench_hard:movie_recommendation|3_2025-11-05T14-43-47.148527.parquet rename to tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_bigbench_hard__movie_recommendation__3_2025-11-05T14-43-47.148527.parquet diff --git a/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_bigbench_hard:navigate|3_2025-11-05T14-43-47.148527.parquet b/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_bigbench_hard__navigate__3_2025-11-05T14-43-47.148527.parquet similarity index 100% rename from 
tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_bigbench_hard:navigate|3_2025-11-05T14-43-47.148527.parquet rename to tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_bigbench_hard__navigate__3_2025-11-05T14-43-47.148527.parquet diff --git a/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_bigbench_hard:ruin_names|3_2025-11-05T14-43-47.148527.parquet b/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_bigbench_hard__ruin_names__3_2025-11-05T14-43-47.148527.parquet similarity index 100% rename from tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_bigbench_hard:ruin_names|3_2025-11-05T14-43-47.148527.parquet rename to tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_bigbench_hard__ruin_names__3_2025-11-05T14-43-47.148527.parquet diff --git a/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_bigbench_hard:salient_translation_error_detection|3_2025-11-05T14-43-47.148527.parquet b/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_bigbench_hard__salient_translation_error_detection__3_2025-11-05T14-43-47.148527.parquet similarity index 100% rename from tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_bigbench_hard:salient_translation_error_detection|3_2025-11-05T14-43-47.148527.parquet rename to tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_bigbench_hard__salient_translation_error_detection__3_2025-11-05T14-43-47.148527.parquet diff --git a/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_bigbench_hard:snarks|3_2025-11-05T14-43-47.148527.parquet b/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_bigbench_hard__snarks__3_2025-11-05T14-43-47.148527.parquet similarity index 100% rename from tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_bigbench_hard:snarks|3_2025-11-05T14-43-47.148527.parquet rename to 
tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_bigbench_hard__snarks__3_2025-11-05T14-43-47.148527.parquet diff --git a/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_bigbench_hard:temporal_sequences|3_2025-11-05T14-43-47.148527.parquet b/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_bigbench_hard__temporal_sequences__3_2025-11-05T14-43-47.148527.parquet similarity index 100% rename from tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_bigbench_hard:temporal_sequences|3_2025-11-05T14-43-47.148527.parquet rename to tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_bigbench_hard__temporal_sequences__3_2025-11-05T14-43-47.148527.parquet diff --git a/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_bigbench_hard:tracking_shuffled_objects_five_objects|3_2025-11-05T14-43-47.148527.parquet b/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_bigbench_hard__tracking_shuffled_objects_five_objects__3_2025-11-05T14-43-47.148527.parquet similarity index 100% rename from tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_bigbench_hard:tracking_shuffled_objects_five_objects|3_2025-11-05T14-43-47.148527.parquet rename to tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_bigbench_hard__tracking_shuffled_objects_five_objects__3_2025-11-05T14-43-47.148527.parquet diff --git a/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_bigbench_hard:tracking_shuffled_objects_seven_objects|3_2025-11-05T14-43-47.148527.parquet b/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_bigbench_hard__tracking_shuffled_objects_seven_objects__3_2025-11-05T14-43-47.148527.parquet similarity index 100% rename from tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_bigbench_hard:tracking_shuffled_objects_seven_objects|3_2025-11-05T14-43-47.148527.parquet rename to 
tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_bigbench_hard__tracking_shuffled_objects_seven_objects__3_2025-11-05T14-43-47.148527.parquet diff --git a/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_gsm8k_test|0_2025-11-05T14-43-47.148527.parquet b/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_gsm8k_test__0_2025-11-05T14-43-47.148527.parquet similarity index 100% rename from tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_gsm8k_test|0_2025-11-05T14-43-47.148527.parquet rename to tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_gsm8k_test__0_2025-11-05T14-43-47.148527.parquet diff --git a/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_hellaswag|10_2025-11-05T14-43-47.148527.parquet b/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_hellaswag__10_2025-11-05T14-43-47.148527.parquet similarity index 100% rename from tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_hellaswag|10_2025-11-05T14-43-47.148527.parquet rename to tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_hellaswag__10_2025-11-05T14-43-47.148527.parquet diff --git a/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_mmlu:college_chemistry|5_2025-11-05T14-43-47.148527.parquet b/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_mmlu__college_chemistry__5_2025-11-05T14-43-47.148527.parquet similarity index 100% rename from tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_mmlu:college_chemistry|5_2025-11-05T14-43-47.148527.parquet rename to tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_mmlu__college_chemistry__5_2025-11-05T14-43-47.148527.parquet diff --git a/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_mmlu:us_foreign_policy|5_2025-11-05T14-43-47.148527.parquet 
b/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_mmlu__us_foreign_policy__5_2025-11-05T14-43-47.148527.parquet similarity index 100% rename from tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_mmlu:us_foreign_policy|5_2025-11-05T14-43-47.148527.parquet rename to tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_mmlu__us_foreign_policy__5_2025-11-05T14-43-47.148527.parquet diff --git a/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_truthfulqa:mc|0_2025-11-05T14-43-47.148527.parquet b/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_truthfulqa__mc__0_2025-11-05T14-43-47.148527.parquet similarity index 100% rename from tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_truthfulqa:mc|0_2025-11-05T14-43-47.148527.parquet rename to tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_truthfulqa__mc__0_2025-11-05T14-43-47.148527.parquet diff --git a/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_agieval:aqua-rat|0_2025-11-05T14-52-08.352779.parquet b/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_agieval__aqua-rat__0_2025-11-05T14-52-08.352779.parquet similarity index 100% rename from tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_agieval:aqua-rat|0_2025-11-05T14-52-08.352779.parquet rename to tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_agieval__aqua-rat__0_2025-11-05T14-52-08.352779.parquet diff --git a/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_agieval:logiqa-en|0_2025-11-05T14-52-08.352779.parquet b/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_agieval__logiqa-en__0_2025-11-05T14-52-08.352779.parquet similarity index 100% rename from tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_agieval:logiqa-en|0_2025-11-05T14-52-08.352779.parquet rename to tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_agieval__logiqa-en__0_2025-11-05T14-52-08.352779.parquet 
diff --git a/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_agieval:lsat-ar|0_2025-11-05T14-52-08.352779.parquet b/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_agieval__lsat-ar__0_2025-11-05T14-52-08.352779.parquet similarity index 100% rename from tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_agieval:lsat-ar|0_2025-11-05T14-52-08.352779.parquet rename to tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_agieval__lsat-ar__0_2025-11-05T14-52-08.352779.parquet diff --git a/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_agieval:lsat-lr|0_2025-11-05T14-52-08.352779.parquet b/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_agieval__lsat-lr__0_2025-11-05T14-52-08.352779.parquet similarity index 100% rename from tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_agieval:lsat-lr|0_2025-11-05T14-52-08.352779.parquet rename to tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_agieval__lsat-lr__0_2025-11-05T14-52-08.352779.parquet diff --git a/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_agieval:lsat-rc|0_2025-11-05T14-52-08.352779.parquet b/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_agieval__lsat-rc__0_2025-11-05T14-52-08.352779.parquet similarity index 100% rename from tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_agieval:lsat-rc|0_2025-11-05T14-52-08.352779.parquet rename to tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_agieval__lsat-rc__0_2025-11-05T14-52-08.352779.parquet diff --git a/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_agieval:sat-en-without-passage|0_2025-11-05T14-52-08.352779.parquet b/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_agieval__sat-en-without-passage__0_2025-11-05T14-52-08.352779.parquet similarity index 100% rename from tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_agieval:sat-en-without-passage|0_2025-11-05T14-52-08.352779.parquet rename to 
tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_agieval__sat-en-without-passage__0_2025-11-05T14-52-08.352779.parquet diff --git a/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_agieval:sat-en|0_2025-11-05T14-52-08.352779.parquet b/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_agieval__sat-en__0_2025-11-05T14-52-08.352779.parquet similarity index 100% rename from tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_agieval:sat-en|0_2025-11-05T14-52-08.352779.parquet rename to tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_agieval__sat-en__0_2025-11-05T14-52-08.352779.parquet diff --git a/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_arc:challenge|25_2025-11-05T14-52-08.352779.parquet b/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_arc__challenge__25_2025-11-05T14-52-08.352779.parquet similarity index 100% rename from tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_arc:challenge|25_2025-11-05T14-52-08.352779.parquet rename to tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_arc__challenge__25_2025-11-05T14-52-08.352779.parquet diff --git a/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_bigbench_hard:causal_judgment|3_2025-11-05T14-52-08.352779.parquet b/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_bigbench_hard__causal_judgment__3_2025-11-05T14-52-08.352779.parquet similarity index 100% rename from tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_bigbench_hard:causal_judgment|3_2025-11-05T14-52-08.352779.parquet rename to tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_bigbench_hard__causal_judgment__3_2025-11-05T14-52-08.352779.parquet diff --git a/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_bigbench_hard:date_understanding|3_2025-11-05T14-52-08.352779.parquet b/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_bigbench_hard__date_understanding__3_2025-11-05T14-52-08.352779.parquet similarity 
index 100% rename from tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_bigbench_hard:date_understanding|3_2025-11-05T14-52-08.352779.parquet rename to tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_bigbench_hard__date_understanding__3_2025-11-05T14-52-08.352779.parquet diff --git a/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_bigbench_hard:disambiguation_qa|3_2025-11-05T14-52-08.352779.parquet b/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_bigbench_hard__disambiguation_qa__3_2025-11-05T14-52-08.352779.parquet similarity index 100% rename from tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_bigbench_hard:disambiguation_qa|3_2025-11-05T14-52-08.352779.parquet rename to tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_bigbench_hard__disambiguation_qa__3_2025-11-05T14-52-08.352779.parquet diff --git a/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_bigbench_hard:geometric_shapes|3_2025-11-05T14-52-08.352779.parquet b/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_bigbench_hard__geometric_shapes__3_2025-11-05T14-52-08.352779.parquet similarity index 100% rename from tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_bigbench_hard:geometric_shapes|3_2025-11-05T14-52-08.352779.parquet rename to tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_bigbench_hard__geometric_shapes__3_2025-11-05T14-52-08.352779.parquet diff --git a/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_bigbench_hard:logical_deduction_five_objects|3_2025-11-05T14-52-08.352779.parquet b/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_bigbench_hard__logical_deduction_five_objects__3_2025-11-05T14-52-08.352779.parquet similarity index 100% rename from tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_bigbench_hard:logical_deduction_five_objects|3_2025-11-05T14-52-08.352779.parquet rename to 
tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_bigbench_hard__logical_deduction_five_objects__3_2025-11-05T14-52-08.352779.parquet diff --git a/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_bigbench_hard:logical_deduction_seven_objects|3_2025-11-05T14-52-08.352779.parquet b/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_bigbench_hard__logical_deduction_seven_objects__3_2025-11-05T14-52-08.352779.parquet similarity index 100% rename from tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_bigbench_hard:logical_deduction_seven_objects|3_2025-11-05T14-52-08.352779.parquet rename to tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_bigbench_hard__logical_deduction_seven_objects__3_2025-11-05T14-52-08.352779.parquet diff --git a/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_bigbench_hard:movie_recommendation|3_2025-11-05T14-52-08.352779.parquet b/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_bigbench_hard__movie_recommendation__3_2025-11-05T14-52-08.352779.parquet similarity index 100% rename from tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_bigbench_hard:movie_recommendation|3_2025-11-05T14-52-08.352779.parquet rename to tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_bigbench_hard__movie_recommendation__3_2025-11-05T14-52-08.352779.parquet diff --git a/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_bigbench_hard:navigate|3_2025-11-05T14-52-08.352779.parquet b/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_bigbench_hard__navigate__3_2025-11-05T14-52-08.352779.parquet similarity index 100% rename from tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_bigbench_hard:navigate|3_2025-11-05T14-52-08.352779.parquet rename to tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_bigbench_hard__navigate__3_2025-11-05T14-52-08.352779.parquet diff --git 
a/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_bigbench_hard:ruin_names|3_2025-11-05T14-52-08.352779.parquet b/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_bigbench_hard__ruin_names__3_2025-11-05T14-52-08.352779.parquet similarity index 100% rename from tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_bigbench_hard:ruin_names|3_2025-11-05T14-52-08.352779.parquet rename to tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_bigbench_hard__ruin_names__3_2025-11-05T14-52-08.352779.parquet diff --git a/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_bigbench_hard:salient_translation_error_detection|3_2025-11-05T14-52-08.352779.parquet b/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_bigbench_hard__salient_translation_error_detection__3_2025-11-05T14-52-08.352779.parquet similarity index 100% rename from tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_bigbench_hard:salient_translation_error_detection|3_2025-11-05T14-52-08.352779.parquet rename to tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_bigbench_hard__salient_translation_error_detection__3_2025-11-05T14-52-08.352779.parquet diff --git a/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_bigbench_hard:snarks|3_2025-11-05T14-52-08.352779.parquet b/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_bigbench_hard__snarks__3_2025-11-05T14-52-08.352779.parquet similarity index 100% rename from tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_bigbench_hard:snarks|3_2025-11-05T14-52-08.352779.parquet rename to tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_bigbench_hard__snarks__3_2025-11-05T14-52-08.352779.parquet diff --git a/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_bigbench_hard:temporal_sequences|3_2025-11-05T14-52-08.352779.parquet b/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_bigbench_hard__temporal_sequences__3_2025-11-05T14-52-08.352779.parquet similarity 
index 100% rename from tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_bigbench_hard:temporal_sequences|3_2025-11-05T14-52-08.352779.parquet rename to tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_bigbench_hard__temporal_sequences__3_2025-11-05T14-52-08.352779.parquet diff --git a/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_bigbench_hard:tracking_shuffled_objects_five_objects|3_2025-11-05T14-52-08.352779.parquet b/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_bigbench_hard__tracking_shuffled_objects_five_objects__3_2025-11-05T14-52-08.352779.parquet similarity index 100% rename from tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_bigbench_hard:tracking_shuffled_objects_five_objects|3_2025-11-05T14-52-08.352779.parquet rename to tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_bigbench_hard__tracking_shuffled_objects_five_objects__3_2025-11-05T14-52-08.352779.parquet diff --git a/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_bigbench_hard:tracking_shuffled_objects_seven_objects|3_2025-11-05T14-52-08.352779.parquet b/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_bigbench_hard__tracking_shuffled_objects_seven_objects__3_2025-11-05T14-52-08.352779.parquet similarity index 100% rename from tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_bigbench_hard:tracking_shuffled_objects_seven_objects|3_2025-11-05T14-52-08.352779.parquet rename to tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_bigbench_hard__tracking_shuffled_objects_seven_objects__3_2025-11-05T14-52-08.352779.parquet diff --git a/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_gsm8k_test|0_2025-11-05T14-52-08.352779.parquet b/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_gsm8k_test__0_2025-11-05T14-52-08.352779.parquet similarity index 100% rename from tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_gsm8k_test|0_2025-11-05T14-52-08.352779.parquet rename to 
tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_gsm8k_test__0_2025-11-05T14-52-08.352779.parquet diff --git a/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_hellaswag|10_2025-11-05T14-52-08.352779.parquet b/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_hellaswag__10_2025-11-05T14-52-08.352779.parquet similarity index 100% rename from tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_hellaswag|10_2025-11-05T14-52-08.352779.parquet rename to tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_hellaswag__10_2025-11-05T14-52-08.352779.parquet diff --git a/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_mmlu:college_chemistry|5_2025-11-05T14-52-08.352779.parquet b/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_mmlu__college_chemistry__5_2025-11-05T14-52-08.352779.parquet similarity index 100% rename from tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_mmlu:college_chemistry|5_2025-11-05T14-52-08.352779.parquet rename to tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_mmlu__college_chemistry__5_2025-11-05T14-52-08.352779.parquet diff --git a/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_mmlu:us_foreign_policy|5_2025-11-05T14-52-08.352779.parquet b/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_mmlu__us_foreign_policy__5_2025-11-05T14-52-08.352779.parquet similarity index 100% rename from tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_mmlu:us_foreign_policy|5_2025-11-05T14-52-08.352779.parquet rename to tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_mmlu__us_foreign_policy__5_2025-11-05T14-52-08.352779.parquet diff --git a/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_truthfulqa:mc|0_2025-11-05T14-52-08.352779.parquet b/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_truthfulqa__mc__0_2025-11-05T14-52-08.352779.parquet similarity index 100% rename from 
tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_truthfulqa:mc|0_2025-11-05T14-52-08.352779.parquet rename to tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_truthfulqa__mc__0_2025-11-05T14-52-08.352779.parquet diff --git a/tests/slow_tests/sample_comparison.py b/tests/slow_tests/sample_comparison.py index 393d8943f..29429a92f 100644 --- a/tests/slow_tests/sample_comparison.py +++ b/tests/slow_tests/sample_comparison.py @@ -46,10 +46,13 @@ def load_sample_details(details_dir: str): return details for parquet_file in details_path.glob("details_*.parquet"): - # Extract task name from parquet filename, keeping the full task path with "|" separators - task_name = parquet_file.stem.replace("details_", "").rsplit("_", 1)[ + # Extract task name from parquet filename + # Handle both sanitized format (with underscores) and old format (with pipes) + sanitized_task_name = parquet_file.stem.replace("details_", "").rsplit("_", 1)[ 0 - ] # Split from right to preserve task name with "|" + ] # Split from right to get task name (before date_id) + # Approximate the original task name by mapping every "__" back to "|" (a sanitized ":" also became "__", so it is restored as "|" rather than ":") + task_name = sanitized_task_name.replace("__", "|") dataset = Dataset.from_parquet(str(parquet_file)) details[task_name] = list(dataset)