From d8d3892b41d8bdf2399002b500839fcd660ec81f Mon Sep 17 00:00:00 2001 From: akiyuki ishikawa Date: Sun, 18 May 2025 19:56:39 +0900 Subject: [PATCH 1/5] add follow_links flag and related log message --- src/kagglehub/datasets.py | 3 ++ src/kagglehub/gcs_upload.py | 50 ++++++++++++++++++++++++----- tests/test_gcs_upload.py | 63 +++++++++++++++++++++++++++++++++++++ 3 files changed, 108 insertions(+), 8 deletions(-) diff --git a/src/kagglehub/datasets.py b/src/kagglehub/datasets.py index 51787f11..a5236b49 100755 --- a/src/kagglehub/datasets.py +++ b/src/kagglehub/datasets.py @@ -49,6 +49,7 @@ def dataset_upload( local_dataset_dir: str, version_notes: str = "", ignore_patterns: Optional[Union[list[str], str]] = None, + follow_links: bool = False, ) -> None: """Upload dataset files. Args: @@ -62,6 +63,7 @@ def dataset_upload( https://docs.python.org/3/library/fnmatch.html. Use a pattern ending with "/" to ignore the whole dir, e.g., ".git/" is equivalent to ".git/*". + follow_links: (bool) Enable to follow symbolic linked directories. 
""" h = parse_dataset_handle(handle) logger.info(f"Uploading Dataset {h.to_url()} ...") @@ -73,6 +75,7 @@ def dataset_upload( local_dataset_dir, item_type="dataset", ignore_patterns=normalize_patterns(default=DEFAULT_IGNORE_PATTERNS, additional=ignore_patterns), + follow_links=follow_links, ) create_dataset_or_version(h, tokens, version_notes) diff --git a/src/kagglehub/gcs_upload.py b/src/kagglehub/gcs_upload.py index 2172accd..5dc0d292 100644 --- a/src/kagglehub/gcs_upload.py +++ b/src/kagglehub/gcs_upload.py @@ -69,7 +69,7 @@ def get_size(size: float, precision: int = 0) -> str: return f"{size:.{precision}f}{suffixes[suffix_index]}" -def filtered_walk(*, base_dir: str, ignore_patterns: Sequence[str]) -> Iterable[tuple[str, list[str], list[str]]]: +def filtered_walk(*, base_dir: str, ignore_patterns: Sequence[str], follow_links: bool = False) -> Iterable[tuple[str, list[str], list[str]]]: """An `os.walk` like directory tree generator with filtering. This method filters out files matching any ignore pattern. @@ -78,12 +78,27 @@ def filtered_walk(*, base_dir: str, ignore_patterns: Sequence[str]) -> Iterable[ base_dir (str): The base dir to walk in. ignore_patterns (Sequence[str]): The patterns for ignored files. These are standard wildcards relative to base_dir. + follow_links (bool): If `True`, follows symbolic linked directories. If links that point to already visited directories are detected, skip them. + Yields: Iterable[tuple[str, list[str], list[str]]]: (base_dir_path, list[dir_names], list[filtered_file_names]) """ - for dir_path, dir_names, file_names in os.walk(base_dir): + visited = set() + max_warning = 5 + num_warning = 0 + for dir_path, dir_names, file_names in os.walk(base_dir, followlinks=follow_links): dir_p = pathlib.Path(dir_path) + + resolved_dir_p = dir_p.resolve() + if resolved_dir_p in visited: + # Avoid inclusion of circular links or duplicate directories in the dataset when follow_links=True. 
+ if num_warning <= max_warning: + logger.warning(f"Skip duplicated symlinks pointing to the same directory: {dir_p}({resolved_dir_p})") + num_warning += 1 + continue + visited.add(resolved_dir_p) + filtered_files = [] for file_name in file_names: rel_file_p = (dir_p / file_name).relative_to(base_dir) @@ -188,17 +203,37 @@ def _upload_blob(file_path: str, item_type: str) -> str: return response["token"] +def _check_uploadable_files(folder, ignore_patterns, follow_links): + file_count = 0 + symlinked_dirs = set() + for root, dirs, files in filtered_walk(base_dir=folder, ignore_patterns=ignore_patterns, follow_links=follow_links): + root_path = pathlib.Path(root) + for d in dirs: + dir_p = root_path / d + if dir_p.is_symlink(): + symlinked_dirs.add(dir_p) + file_count += len(files) + + if file_count == 0: + raise ValueError("No uploadable files are found. At least one file is needed.") + + n_links = len(symlinked_dirs) + if not follow_links and n_links > 0: + logger.warning(f"Skip {n_links} symbolic link directories. 
If you want to include these directores, set `follow_links=True`.") + return file_count + + def upload_files_and_directories( folder: str, *, ignore_patterns: Sequence[str], item_type: str, quiet: bool = False, + follow_links: bool = False, ) -> UploadDirectoryInfo: + # Count the total number of files - file_count = 0 - for _, _, files in filtered_walk(base_dir=folder, ignore_patterns=ignore_patterns): - file_count += len(files) + file_count = _check_uploadable_files(folder, ignore_patterns, follow_links) if file_count > MAX_FILES_TO_UPLOAD: if not quiet: @@ -207,7 +242,7 @@ def upload_files_and_directories( with TemporaryDirectory() as temp_dir: zip_path = os.path.join(temp_dir, TEMP_ARCHIVE_FILE) with zipfile.ZipFile(zip_path, "w") as zipf: - for root, _, files in filtered_walk(base_dir=folder, ignore_patterns=ignore_patterns): + for root, _, files in filtered_walk(base_dir=folder, ignore_patterns=ignore_patterns, follow_links=follow_links): for file in files: file_path = os.path.join(root, file) zipf.write(file_path, os.path.relpath(file_path, folder)) @@ -226,10 +261,9 @@ def upload_files_and_directories( if token: root_dict.files.append(token) else: - for root, _, files in filtered_walk(base_dir=folder, ignore_patterns=ignore_patterns): + for root, _, files in filtered_walk(base_dir=folder, ignore_patterns=ignore_patterns, follow_links=follow_links): # Path of the current folder relative to the base folder path = os.path.relpath(root, folder) - # Navigate or create the dictionary path to the current folder current_dict = root_dict if path != ".": diff --git a/tests/test_gcs_upload.py b/tests/test_gcs_upload.py index 0bcda1f6..6b70c5c9 100644 --- a/tests/test_gcs_upload.py +++ b/tests/test_gcs_upload.py @@ -61,3 +61,66 @@ def test_filtered_walk(self) -> None: for file_name in file_names: walked_files.append(pathlib.Path(dir_path) / file_name) self.assertEqual(set(walked_files), expected_files) + + + def _setup_link_dir(self, tmp_dir_p: pathlib.Path): + """ 
setup the following structure + tmp_dir/ + root/ + link_dir -> tmp_dir/extern/ + real_dir/ + a.txt + extern/ + loop_dir -> tmp_dir/extern/ + b.txt + """ + extern_dir = tmp_dir_p / "extern" + extern_dir.mkdir() + (extern_dir / "b.txt") .touch() + (extern_dir / "loop_dir").symlink_to(extern_dir, target_is_directory=True) + + root_dir = tmp_dir_p / "root" + (root_dir / "real_dir").mkdir(parents=True) + (root_dir / "real_dir" / "a.txt").touch() + (root_dir / "link_dir").symlink_to(extern_dir, target_is_directory=True) + + def test_follow_link_dir(self) -> None: + with tempfile.TemporaryDirectory() as tmp_dir: + tmp_dir_p = pathlib.Path(tmp_dir) + + try: + self._setup_link_dir(tmp_dir_p) + except Exception: + self.skipTest("failed to setup linked dir") + + follow_links = True + root_dir = tmp_dir_p / "root" + expected_files = { + root_dir / "real_dir" / "a.txt", + root_dir / "link_dir" / "b.txt", + } + walked_files = [] + for dir_path, _, file_names in filtered_walk(base_dir=root_dir, ignore_patterns=[], follow_links=follow_links): + for file_name in file_names: + walked_files.append(pathlib.Path(dir_path) / file_name) + self.assertEqual(set(walked_files), expected_files) + + def test_skip_link_dir(self) -> None: + with tempfile.TemporaryDirectory() as tmp_dir: + tmp_dir_p = pathlib.Path(tmp_dir) + + try: + self._setup_link_dir(tmp_dir_p) + except Exception: + self.skipTest("failed to setup linked dir") + + follow_links = False + root_dir = tmp_dir_p / "root" + expected_files = { + root_dir / "real_dir" / "a.txt", + } + walked_files = [] + for dir_path, _, file_names in filtered_walk(base_dir=root_dir, ignore_patterns=[], follow_links=follow_links): + for file_name in file_names: + walked_files.append(pathlib.Path(dir_path) / file_name) + self.assertEqual(set(walked_files), expected_files) From 3750ca84df6863eed84e640b76493cc677c2d6f5 Mon Sep 17 00:00:00 2001 From: akiyuki ishikawa Date: Sun, 18 May 2025 21:29:51 +0900 Subject: [PATCH 2/5] fix error condition and 
max_warning argument --- src/kagglehub/gcs_upload.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/src/kagglehub/gcs_upload.py b/src/kagglehub/gcs_upload.py index 5dc0d292..06f02c85 100644 --- a/src/kagglehub/gcs_upload.py +++ b/src/kagglehub/gcs_upload.py @@ -69,7 +69,7 @@ def get_size(size: float, precision: int = 0) -> str: return f"{size:.{precision}f}{suffixes[suffix_index]}" -def filtered_walk(*, base_dir: str, ignore_patterns: Sequence[str], follow_links: bool = False) -> Iterable[tuple[str, list[str], list[str]]]: +def filtered_walk(*, base_dir: str, ignore_patterns: Sequence[str], follow_links: bool = False, max_warning: int = 0) -> Iterable[tuple[str, list[str], list[str]]]: """An `os.walk` like directory tree generator with filtering. This method filters out files matching any ignore pattern. @@ -85,7 +85,6 @@ def filtered_walk(*, base_dir: str, ignore_patterns: Sequence[str], follow_links Iterable[tuple[str, list[str], list[str]]]: (base_dir_path, list[dir_names], list[filtered_file_names]) """ visited = set() - max_warning = 5 num_warning = 0 for dir_path, dir_names, file_names in os.walk(base_dir, followlinks=follow_links): dir_p = pathlib.Path(dir_path) @@ -206,7 +205,7 @@ def _upload_blob(file_path: str, item_type: str) -> str: def _check_uploadable_files(folder, ignore_patterns, follow_links): file_count = 0 symlinked_dirs = set() - for root, dirs, files in filtered_walk(base_dir=folder, ignore_patterns=ignore_patterns, follow_links=follow_links): + for root, dirs, files in filtered_walk(base_dir=folder, ignore_patterns=ignore_patterns, follow_links=follow_links, max_warning=5): root_path = pathlib.Path(root) for d in dirs: dir_p = root_path / d @@ -214,7 +213,7 @@ def _check_uploadable_files(folder, ignore_patterns, follow_links): symlinked_dirs.add(dir_p) file_count += len(files) - if file_count == 0: + if file_count == 0 and os.path.isdir(folder): raise ValueError("No uploadable files are found. 
At least one file is needed.") n_links = len(symlinked_dirs) From 53e5762a6cff2d125b6ee2f6f73934370dac40ba Mon Sep 17 00:00:00 2001 From: akiyuki ishikawa Date: Sun, 18 May 2025 22:16:43 +0900 Subject: [PATCH 3/5] Fix linter errors --- src/kagglehub/datasets.py | 3 ++- src/kagglehub/gcs_upload.py | 35 ++++++++++++++++++++++++----------- tests/test_gcs_upload.py | 15 +++++++++------ 3 files changed, 35 insertions(+), 18 deletions(-) diff --git a/src/kagglehub/datasets.py b/src/kagglehub/datasets.py index a5236b49..609da7f4 100755 --- a/src/kagglehub/datasets.py +++ b/src/kagglehub/datasets.py @@ -49,6 +49,7 @@ def dataset_upload( local_dataset_dir: str, version_notes: str = "", ignore_patterns: Optional[Union[list[str], str]] = None, + *, follow_links: bool = False, ) -> None: """Upload dataset files. @@ -63,7 +64,7 @@ def dataset_upload( https://docs.python.org/3/library/fnmatch.html. Use a pattern ending with "/" to ignore the whole dir, e.g., ".git/" is equivalent to ".git/*". - follow_links: (bool) Enable to follow symbolic linked directories. + follow_links: (bool) Enable to follow symbolic link directories. """ h = parse_dataset_handle(handle) logger.info(f"Uploading Dataset {h.to_url()} ...") diff --git a/src/kagglehub/gcs_upload.py b/src/kagglehub/gcs_upload.py index 06f02c85..00446a81 100644 --- a/src/kagglehub/gcs_upload.py +++ b/src/kagglehub/gcs_upload.py @@ -69,7 +69,9 @@ def get_size(size: float, precision: int = 0) -> str: return f"{size:.{precision}f}{suffixes[suffix_index]}" -def filtered_walk(*, base_dir: str, ignore_patterns: Sequence[str], follow_links: bool = False, max_warning: int = 0) -> Iterable[tuple[str, list[str], list[str]]]: +def filtered_walk( + *, base_dir: str, ignore_patterns: Sequence[str], follow_links: bool = False, max_warning: int = 0 +) -> Iterable[tuple[str, list[str], list[str]]]: """An `os.walk` like directory tree generator with filtering. This method filters out files matching any ignore pattern. 
@@ -78,8 +80,9 @@ def filtered_walk(*, base_dir: str, ignore_patterns: Sequence[str], follow_links base_dir (str): The base dir to walk in. ignore_patterns (Sequence[str]): The patterns for ignored files. These are standard wildcards relative to base_dir. - follow_links (bool): If `True`, follows symbolic linked directories. If links that point to already visited directories are detected, skip them. - + follow_links (bool): If `True`, follows symbolic linked directories. + If links that point to already visited directories are detected, skip them. + max_warning (int): Maxmum number of warning to be displayed. Yields: Iterable[tuple[str, list[str], list[str]]]: (base_dir_path, list[dir_names], list[filtered_file_names]) @@ -92,7 +95,7 @@ def filtered_walk(*, base_dir: str, ignore_patterns: Sequence[str], follow_links resolved_dir_p = dir_p.resolve() if resolved_dir_p in visited: # Avoid inclusion of circular links or duplicate directories in the dataset when follow_links=True. - if num_warning <= max_warning: + if num_warning < max_warning: logger.warning(f"Skip duplicated symlinks pointing to the same directory: {dir_p}({resolved_dir_p})") num_warning += 1 continue @@ -202,10 +205,12 @@ def _upload_blob(file_path: str, item_type: str) -> str: return response["token"] -def _check_uploadable_files(folder, ignore_patterns, follow_links): +def _check_uploadable_files(folder: str, *, ignore_patterns: Sequence[str], follow_links: bool) -> int: file_count = 0 symlinked_dirs = set() - for root, dirs, files in filtered_walk(base_dir=folder, ignore_patterns=ignore_patterns, follow_links=follow_links, max_warning=5): + for root, dirs, files in filtered_walk( + base_dir=folder, ignore_patterns=ignore_patterns, follow_links=follow_links, max_warning=5 + ): root_path = pathlib.Path(root) for d in dirs: dir_p = root_path / d @@ -214,11 +219,15 @@ def _check_uploadable_files(folder, ignore_patterns, follow_links): file_count += len(files) if file_count == 0 and 
os.path.isdir(folder): - raise ValueError("No uploadable files are found. At least one file is needed.") + no_uploadable_exception = "No uploadable files are found. At least one file is needed." + raise ValueError(no_uploadable_exception) n_links = len(symlinked_dirs) if not follow_links and n_links > 0: - logger.warning(f"Skip {n_links} symbolic link directories. If you want to include these directores, set `follow_links=True`.") + logger.warning( + f"Skip {n_links} symbolic link directories." + " If you want to include these directores, set `follow_links=True`." + ) return file_count @@ -232,7 +241,7 @@ def upload_files_and_directories( ) -> UploadDirectoryInfo: # Count the total number of files - file_count = _check_uploadable_files(folder, ignore_patterns, follow_links) + file_count = _check_uploadable_files(folder, ignore_patterns=ignore_patterns, follow_links=follow_links) if file_count > MAX_FILES_TO_UPLOAD: if not quiet: @@ -241,7 +250,9 @@ def upload_files_and_directories( with TemporaryDirectory() as temp_dir: zip_path = os.path.join(temp_dir, TEMP_ARCHIVE_FILE) with zipfile.ZipFile(zip_path, "w") as zipf: - for root, _, files in filtered_walk(base_dir=folder, ignore_patterns=ignore_patterns, follow_links=follow_links): + for root, _, files in filtered_walk( + base_dir=folder, ignore_patterns=ignore_patterns, follow_links=follow_links + ): for file in files: file_path = os.path.join(root, file) zipf.write(file_path, os.path.relpath(file_path, folder)) @@ -260,7 +271,9 @@ def upload_files_and_directories( if token: root_dict.files.append(token) else: - for root, _, files in filtered_walk(base_dir=folder, ignore_patterns=ignore_patterns, follow_links=follow_links): + for root, _, files in filtered_walk( + base_dir=folder, ignore_patterns=ignore_patterns, follow_links=follow_links + ): # Path of the current folder relative to the base folder path = os.path.relpath(root, folder) # Navigate or create the dictionary path to the current folder diff --git 
a/tests/test_gcs_upload.py b/tests/test_gcs_upload.py index 6b70c5c9..6dbd6d99 100644 --- a/tests/test_gcs_upload.py +++ b/tests/test_gcs_upload.py @@ -62,9 +62,8 @@ def test_filtered_walk(self) -> None: walked_files.append(pathlib.Path(dir_path) / file_name) self.assertEqual(set(walked_files), expected_files) - - def _setup_link_dir(self, tmp_dir_p: pathlib.Path): - """ setup the following structure + def _setup_link_dir(self, tmp_dir_p: pathlib.Path) -> None: + """setup the following structure tmp_dir/ root/ link_dir -> tmp_dir/extern/ @@ -76,7 +75,7 @@ def _setup_link_dir(self, tmp_dir_p: pathlib.Path): """ extern_dir = tmp_dir_p / "extern" extern_dir.mkdir() - (extern_dir / "b.txt") .touch() + (extern_dir / "b.txt").touch() (extern_dir / "loop_dir").symlink_to(extern_dir, target_is_directory=True) root_dir = tmp_dir_p / "root" @@ -100,7 +99,9 @@ def test_follow_link_dir(self) -> None: root_dir / "link_dir" / "b.txt", } walked_files = [] - for dir_path, _, file_names in filtered_walk(base_dir=root_dir, ignore_patterns=[], follow_links=follow_links): + for dir_path, _, file_names in filtered_walk( + base_dir=root_dir, ignore_patterns=[], follow_links=follow_links + ): for file_name in file_names: walked_files.append(pathlib.Path(dir_path) / file_name) self.assertEqual(set(walked_files), expected_files) @@ -120,7 +121,9 @@ def test_skip_link_dir(self) -> None: root_dir / "real_dir" / "a.txt", } walked_files = [] - for dir_path, _, file_names in filtered_walk(base_dir=root_dir, ignore_patterns=[], follow_links=follow_links): + for dir_path, _, file_names in filtered_walk( + base_dir=root_dir, ignore_patterns=[], follow_links=follow_links + ): for file_name in file_names: walked_files.append(pathlib.Path(dir_path) / file_name) self.assertEqual(set(walked_files), expected_files) From eb49609d6bb86e369d008856db9af2ef8a61cfe2 Mon Sep 17 00:00:00 2001 From: akiyuki ishikawa Date: Sun, 18 May 2025 22:31:15 +0900 Subject: [PATCH 4/5] show warning before error --- 
src/kagglehub/gcs_upload.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/src/kagglehub/gcs_upload.py b/src/kagglehub/gcs_upload.py index 00446a81..45ddda42 100644 --- a/src/kagglehub/gcs_upload.py +++ b/src/kagglehub/gcs_upload.py @@ -218,16 +218,17 @@ def _check_uploadable_files(folder: str, *, ignore_patterns: Sequence[str], foll symlinked_dirs.add(dir_p) file_count += len(files) - if file_count == 0 and os.path.isdir(folder): - no_uploadable_exception = "No uploadable files are found. At least one file is needed." - raise ValueError(no_uploadable_exception) - n_links = len(symlinked_dirs) if not follow_links and n_links > 0: logger.warning( f"Skip {n_links} symbolic link directories." " If you want to include these directores, set `follow_links=True`." ) + + if file_count == 0 and os.path.isdir(folder): + no_uploadable_exception = "No uploadable files are found. At least one file is needed." + raise ValueError(no_uploadable_exception) + return file_count From aa9dbb222ef1b662b635f42f38f87ee3418aa3fd Mon Sep 17 00:00:00 2001 From: akiyuki ishikawa Date: Sun, 18 May 2025 22:47:09 +0900 Subject: [PATCH 5/5] fix typo --- src/kagglehub/gcs_upload.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/kagglehub/gcs_upload.py b/src/kagglehub/gcs_upload.py index 45ddda42..18efad0c 100644 --- a/src/kagglehub/gcs_upload.py +++ b/src/kagglehub/gcs_upload.py @@ -82,7 +82,7 @@ def filtered_walk( The patterns for ignored files. These are standard wildcards relative to base_dir. follow_links (bool): If `True`, follows symbolic linked directories. If links that point to already visited directories are detected, skip them. - max_warning (int): Maxmum number of warning to be displayed. + max_warning (int): Maximum number of warning to be displayed. Yields: Iterable[tuple[str, list[str], list[str]]]: (base_dir_path, list[dir_names], list[filtered_file_names])