Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
35 changes: 24 additions & 11 deletions src/lighteval/logging/evaluation_tracker.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@
VersionsLogger,
)
from lighteval.utils.imports import is_package_available, not_installed_error_message
from lighteval.utils.utils import obj_to_markdown
from lighteval.utils.utils import obj_to_markdown, sanitize_filename


logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -336,16 +336,21 @@ def load_details_datasets(self, date_id: str, task_names: list[str]) -> dict[str
logger.info(f"Loading details from {output_dir_details_sub_folder}")
date_id = output_dir_details_sub_folder.name # Overwrite date_id in case of latest
details_datasets = {}
sanitized_task_names = {sanitize_filename(tn): tn for tn in task_names}
for file in self.fs.glob(str(output_dir_details_sub_folder / f"details_*_{date_id}.parquet")):
task_name = Path(file).stem.replace("details_", "").replace(f"_{date_id}", "")
if "|".join(task_name.split("|")[:-1]) not in task_names:
logger.info(f"Skipping {task_name} because it is not in the task_names list")
continue
sanitized_task_name = Path(file).stem.replace("details_", "").replace(f"_{date_id}", "")
if sanitized_task_name in sanitized_task_names:
task_name = sanitized_task_names[sanitized_task_name]
else:
task_name = sanitized_task_name.replace("__", "|")
if "|".join(task_name.split("|")[:-1]) not in task_names:
logger.info(f"Skipping {task_name} because it is not in the task_names list")
continue
dataset = load_dataset("parquet", data_files=file, split="train")
details_datasets[task_name] = dataset

for task_name in task_names:
if not any(task_name.startswith(task_name) for task_name in details_datasets.keys()):
if not any(task_name.startswith(tn) for tn in details_datasets.keys()):
raise ValueError(
f"Task {task_name} not found in details datasets. Check the tasks to be evaluated or the date_id used to load the details ({date_id})."
)
Expand All @@ -356,7 +361,8 @@ def save_details(self, date_id: str, details_datasets: dict[str, Dataset]):
self.fs.mkdirs(output_dir_details_sub_folder, exist_ok=True)
logger.info(f"Saving details to {output_dir_details_sub_folder}")
for task_name, dataset in details_datasets.items():
output_file_details = output_dir_details_sub_folder / f"details_{task_name}_{date_id}.parquet"
sanitized_task_name = sanitize_filename(task_name)
output_file_details = output_dir_details_sub_folder / f"details_{sanitized_task_name}_{date_id}.parquet"
with self.fs.open(str(output_file_details), "wb") as f:
dataset.to_parquet(f)

Expand Down Expand Up @@ -421,7 +427,8 @@ def push_to_hub(
results_dataset.to_parquet(f"{fsspec_repo_uri}/{result_file_base_name}.parquet")

for task_name, dataset in details.items():
output_file_details = Path(date_id) / f"details_{task_name}_{date_id}.parquet"
sanitized_task_name = sanitize_filename(task_name)
output_file_details = Path(date_id) / f"details_{sanitized_task_name}_{date_id}.parquet"
dataset.to_parquet(f"{fsspec_repo_uri}/{output_file_details}")

self.recreate_metadata_card(repo_id)
Expand Down Expand Up @@ -474,11 +481,15 @@ def recreate_metadata_card(self, repo_id: str) -> None: # noqa: C901

# subfile have this general format:
# `2023-09-03T10-57-04.203304/details_harness|hendrycksTest-us_foreign_policy|5_2023-09-03T10-57-04.203304.parquet`
# or with sanitized names: `2023-09-03T10-57-04.203304/details_harness_hendrycksTest-us_foreign_policy__5_2023-09-03T10-57-04.203304.parquet`
# in the iso date, the `:` are replaced by `-` because windows does not allow `:` in their filenames
task_name = (
sanitized_task_name = (
details_file_regex.match(os.path.basename(sub_file)).group("task_name") # type: ignore
)
# task_name is then equal to `leaderboard|mmlu:us_foreign_policy|5`
# Reconstruct original task name by replacing underscores with pipes
# This handles both old format (with pipes) and new format (sanitized with underscores)
task_name = sanitized_task_name.replace("__", "|")
# task_name is then equal to `leaderboard|mmlu:us_foreign_policy|5` (or sanitized equivalent)

# to be able to parse the filename as iso dates, we need to re-replace the `-` with `:`
# iso_date[13] = iso_date[16] = ':'
Expand Down Expand Up @@ -514,7 +525,9 @@ def recreate_metadata_card(self, repo_id: str) -> None: # noqa: C901
task_name_match = details_file_regex.match(filename) # type: ignore
if not task_name_match:
raise ValueError(f"Could not parse task name from filename: {filename}")
task_name = task_name_match.group("task_name")
sanitized_task_name = task_name_match.group("task_name")
# Reconstruct original task name by replacing underscores with pipes
task_name = sanitized_task_name.replace("__", "|")
eval_date = task_name_match.group("date")

sanitized_task = re.sub(r"\W", "_", task_name)
Expand Down
10 changes: 7 additions & 3 deletions src/lighteval/utils/cache_management.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@
from lighteval.tasks.lighteval_task import LightevalTaskConfig
from lighteval.tasks.registry import Registry
from lighteval.tasks.requests import Doc, SamplingMethod
from lighteval.utils.utils import as_list
from lighteval.utils.utils import as_list, sanitize_filename


logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -114,7 +114,10 @@ def _load_cached_indices(self) -> dict:
try:
# cache_file.parts gives all the subfolders of the url, up to the file name
# last 3 are task_name/task_hash/file_name.parquet, so we take -3 and -2
task_name, task_hash = cache_file.parts[-3:-1]
sanitized_task_name, task_hash = cache_file.parts[-3:-1]
# Reconstruct the original task name by replacing each double underscore ("__") with a pipe ("|")
# This works because task names use "|" as separators and we sanitize by replacing "|" with "__"
task_name = sanitized_task_name.replace("__", "|")
sampling_method = SamplingMethod[cache_file.stem] # removes the file extension
task_id = TaskID(task_name, task_hash, sampling_method)

Expand Down Expand Up @@ -191,7 +194,8 @@ def get_cache_path(self, task_id: TaskID) -> Path:
Returns:
Path: Path to the cache file for the given task and sample type
"""
return self.cache_dir / task_id.task_name / task_id.task_hash / f"{task_id.sampling_method.name}.parquet"
sanitized_task_name = sanitize_filename(task_id.task_name)
return self.cache_dir / sanitized_task_name / task_id.task_hash / f"{task_id.sampling_method.name}.parquet"

def get_task_id(self, task_name: str, sampling_method: SamplingMethod) -> TaskID:
"""Returns a unique task identifier. Depends on the task name,
Expand Down
14 changes: 14 additions & 0 deletions src/lighteval/utils/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -315,3 +315,17 @@ def remove_reasoning_tags(text: str, tag_pairs: list[tuple[str, str]]) -> str:
break

return result


def sanitize_filename(name: str) -> str:
    r"""Return *name* with filesystem-forbidden characters replaced by ``__``.

    Every occurrence of each of the following characters becomes a double
    underscore:

    - Windows: < > : " / \ | ? *
    - Linux/Mac: / and the NUL character (``\0``)

    The mapping is many-to-one (for example ``|`` and ``:`` both become
    ``__``), so the original name cannot in general be recovered from the
    sanitized result alone.

    Args:
        name: The string to sanitize, e.g. a task name such as
            ``suite|task:subset|5``.

    Returns:
        The sanitized string; identical to ``name`` when it contains none of
        the forbidden characters.
    """
    forbidden_chars = '|:/\\<>"?*\0'
    # Single pass over the string. The replacement text contains no forbidden
    # character, so the result matches chained ``str.replace`` calls.
    replacements = str.maketrans({char: "__" for char in forbidden_chars})
    return name.translate(replacements)
9 changes: 6 additions & 3 deletions tests/slow_tests/sample_comparison.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,10 +46,13 @@ def load_sample_details(details_dir: str):
return details

for parquet_file in details_path.glob("details_*.parquet"):
# Extract task name from parquet filename, keeping the full task path with "|" separators
task_name = parquet_file.stem.replace("details_", "").rsplit("_", 1)[
# Extract task name from parquet filename
# Handle both sanitized format (with underscores) and old format (with pipes)
sanitized_task_name = parquet_file.stem.replace("details_", "").rsplit("_", 1)[
0
] # Split from right to preserve task name with "|"
] # Split from right to get task name (before date_id)
# Reconstruct the original task name by replacing each double underscore ("__") with a pipe ("|")
task_name = sanitized_task_name.replace("__", "|")
dataset = Dataset.from_parquet(str(parquet_file))
details[task_name] = list(dataset)

Expand Down
Loading