diff --git a/src/kagglehub/datasets.py b/src/kagglehub/datasets.py index 51787f1..609da7f 100755 --- a/src/kagglehub/datasets.py +++ b/src/kagglehub/datasets.py @@ -49,6 +49,8 @@ def dataset_upload( local_dataset_dir: str, version_notes: str = "", ignore_patterns: Optional[Union[list[str], str]] = None, + *, + follow_links: bool = False, ) -> None: """Upload dataset files. Args: @@ -62,6 +64,7 @@ def dataset_upload( https://docs.python.org/3/library/fnmatch.html. Use a pattern ending with "/" to ignore the whole dir, e.g., ".git/" is equivalent to ".git/*". + follow_links: (bool) Whether to follow symbolic link directories. """ h = parse_dataset_handle(handle) logger.info(f"Uploading Dataset {h.to_url()} ...") @@ -73,6 +76,7 @@ def dataset_upload( local_dataset_dir, item_type="dataset", ignore_patterns=normalize_patterns(default=DEFAULT_IGNORE_PATTERNS, additional=ignore_patterns), + follow_links=follow_links, ) create_dataset_or_version(h, tokens, version_notes) diff --git a/src/kagglehub/gcs_upload.py b/src/kagglehub/gcs_upload.py index 2172acc..18efad0 100644 --- a/src/kagglehub/gcs_upload.py +++ b/src/kagglehub/gcs_upload.py @@ -69,7 +69,9 @@ def get_size(size: float, precision: int = 0) -> str: return f"{size:.{precision}f}{suffixes[suffix_index]}" -def filtered_walk(*, base_dir: str, ignore_patterns: Sequence[str]) -> Iterable[tuple[str, list[str], list[str]]]: +def filtered_walk( + *, base_dir: str, ignore_patterns: Sequence[str], follow_links: bool = False, max_warning: int = 0 +) -> Iterable[tuple[str, list[str], list[str]]]: """An `os.walk` like directory tree generator with filtering. This method filters out files matching any ignore pattern. @@ -78,12 +80,27 @@ def filtered_walk(*, base_dir: str, ignore_patterns: Sequence[str]) -> Iterable[ base_dir (str): The base dir to walk in. ignore_patterns (Sequence[str]): The patterns for ignored files. These are standard wildcards relative to base_dir. 
+ follow_links (bool): If `True`, follows symbolic linked directories. + If links that point to already visited directories are detected, skip them. + max_warning (int): Maximum number of warnings to be displayed. Yields: Iterable[tuple[str, list[str], list[str]]]: (base_dir_path, list[dir_names], list[filtered_file_names]) """ - for dir_path, dir_names, file_names in os.walk(base_dir): + visited = set() + num_warning = 0 + for dir_path, dir_names, file_names in os.walk(base_dir, followlinks=follow_links): dir_p = pathlib.Path(dir_path) + + resolved_dir_p = dir_p.resolve() + if resolved_dir_p in visited: + # Avoid inclusion of circular links or duplicate directories in the dataset when follow_links=True. + if num_warning < max_warning: + logger.warning(f"Skip duplicated symlinks pointing to the same directory: {dir_p}({resolved_dir_p})") + num_warning += 1 + continue + visited.add(resolved_dir_p) + + filtered_files = [] for file_name in file_names: rel_file_p = (dir_p / file_name).relative_to(base_dir) @@ -188,17 +205,44 @@ def _upload_blob(file_path: str, item_type: str) -> str: return response["token"] + +def _check_uploadable_files(folder: str, *, ignore_patterns: Sequence[str], follow_links: bool) -> int: + file_count = 0 + symlinked_dirs = set() + for root, dirs, files in filtered_walk( + base_dir=folder, ignore_patterns=ignore_patterns, follow_links=follow_links, max_warning=5 + ): + root_path = pathlib.Path(root) + for d in dirs: + dir_p = root_path / d + if dir_p.is_symlink(): + symlinked_dirs.add(dir_p) + file_count += len(files) + + n_links = len(symlinked_dirs) + if not follow_links and n_links > 0: + logger.warning( + f"Skip {n_links} symbolic link directories." + " If you want to include these directories, set `follow_links=True`." + ) + + if file_count == 0 and os.path.isdir(folder): + no_uploadable_exception = "No uploadable files are found. At least one file is needed." 
+ raise ValueError(no_uploadable_exception) + + return file_count + + def upload_files_and_directories( folder: str, *, ignore_patterns: Sequence[str], item_type: str, quiet: bool = False, + follow_links: bool = False, ) -> UploadDirectoryInfo: + # Count the total number of files - file_count = 0 - for _, _, files in filtered_walk(base_dir=folder, ignore_patterns=ignore_patterns): - file_count += len(files) + file_count = _check_uploadable_files(folder, ignore_patterns=ignore_patterns, follow_links=follow_links) if file_count > MAX_FILES_TO_UPLOAD: if not quiet: @@ -207,7 +251,9 @@ def upload_files_and_directories( with TemporaryDirectory() as temp_dir: zip_path = os.path.join(temp_dir, TEMP_ARCHIVE_FILE) with zipfile.ZipFile(zip_path, "w") as zipf: - for root, _, files in filtered_walk(base_dir=folder, ignore_patterns=ignore_patterns): + for root, _, files in filtered_walk( + base_dir=folder, ignore_patterns=ignore_patterns, follow_links=follow_links + ): for file in files: file_path = os.path.join(root, file) zipf.write(file_path, os.path.relpath(file_path, folder)) @@ -226,10 +272,11 @@ def upload_files_and_directories( if token: root_dict.files.append(token) else: - for root, _, files in filtered_walk(base_dir=folder, ignore_patterns=ignore_patterns): + for root, _, files in filtered_walk( + base_dir=folder, ignore_patterns=ignore_patterns, follow_links=follow_links + ): # Path of the current folder relative to the base folder path = os.path.relpath(root, folder) - # Navigate or create the dictionary path to the current folder current_dict = root_dict if path != ".": diff --git a/tests/test_gcs_upload.py b/tests/test_gcs_upload.py index 0bcda1f..6dbd6d9 100644 --- a/tests/test_gcs_upload.py +++ b/tests/test_gcs_upload.py @@ -61,3 +61,69 @@ def test_filtered_walk(self) -> None: for file_name in file_names: walked_files.append(pathlib.Path(dir_path) / file_name) self.assertEqual(set(walked_files), expected_files) + + def _setup_link_dir(self, tmp_dir_p: 
pathlib.Path) -> None: + """setup the following structure + tmp_dir/ + root/ + link_dir -> tmp_dir/extern/ + real_dir/ + a.txt + extern/ + loop_dir -> tmp_dir/extern/ + b.txt + """ + extern_dir = tmp_dir_p / "extern" + extern_dir.mkdir() + (extern_dir / "b.txt").touch() + (extern_dir / "loop_dir").symlink_to(extern_dir, target_is_directory=True) + + root_dir = tmp_dir_p / "root" + (root_dir / "real_dir").mkdir(parents=True) + (root_dir / "real_dir" / "a.txt").touch() + (root_dir / "link_dir").symlink_to(extern_dir, target_is_directory=True) + + def test_follow_link_dir(self) -> None: + with tempfile.TemporaryDirectory() as tmp_dir: + tmp_dir_p = pathlib.Path(tmp_dir) + + try: + self._setup_link_dir(tmp_dir_p) + except Exception: + self.skipTest("failed to setup linked dir") + + follow_links = True + root_dir = tmp_dir_p / "root" + expected_files = { + root_dir / "real_dir" / "a.txt", + root_dir / "link_dir" / "b.txt", + } + walked_files = [] + for dir_path, _, file_names in filtered_walk( + base_dir=root_dir, ignore_patterns=[], follow_links=follow_links + ): + for file_name in file_names: + walked_files.append(pathlib.Path(dir_path) / file_name) + self.assertEqual(set(walked_files), expected_files) + + def test_skip_link_dir(self) -> None: + with tempfile.TemporaryDirectory() as tmp_dir: + tmp_dir_p = pathlib.Path(tmp_dir) + + try: + self._setup_link_dir(tmp_dir_p) + except Exception: + self.skipTest("failed to setup linked dir") + + follow_links = False + root_dir = tmp_dir_p / "root" + expected_files = { + root_dir / "real_dir" / "a.txt", + } + walked_files = [] + for dir_path, _, file_names in filtered_walk( + base_dir=root_dir, ignore_patterns=[], follow_links=follow_links + ): + for file_name in file_names: + walked_files.append(pathlib.Path(dir_path) / file_name) + self.assertEqual(set(walked_files), expected_files)