From 5a8bcac4fda846121affd4a01259016929a215d3 Mon Sep 17 00:00:00 2001 From: kongtiaowang Date: Mon, 9 Mar 2026 16:00:44 -0400 Subject: [PATCH 1/2] fix --- scripts/Crawlers/OSFCrawler.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/Crawlers/OSFCrawler.py b/scripts/Crawlers/OSFCrawler.py index 470814dd6..3408dfcd3 100644 --- a/scripts/Crawlers/OSFCrawler.py +++ b/scripts/Crawlers/OSFCrawler.py @@ -598,4 +598,4 @@ def _setup_private_dataset( return False def _is_private_dataset(self, files_url) -> bool: - return True if requests.get(files_url).status_code == 401 else False + return True if requests.get(files_url).status_code == 401 else False From c21484f61ea147a0fb884f273c38578bedb06fb4 Mon Sep 17 00:00:00 2001 From: kongtiaowang Date: Tue, 10 Mar 2026 08:12:39 -0400 Subject: [PATCH 2/2] fix --- scripts/Crawlers/OSFCrawler.py | 43 ++++++++++++++++++++++++---------- 1 file changed, 30 insertions(+), 13 deletions(-) diff --git a/scripts/Crawlers/OSFCrawler.py b/scripts/Crawlers/OSFCrawler.py index 3408dfcd3..eb9bf1ea2 100644 --- a/scripts/Crawlers/OSFCrawler.py +++ b/scripts/Crawlers/OSFCrawler.py @@ -130,19 +130,36 @@ def _download_files( else: filename = file["attributes"]["name"] url = file["links"]["download"] - - # Handle zip files: only register URL, don't download - if filename.endswith(".zip"): - target_path = os.path.join(inner_path, filename) - if self.verbose: - print("Registering zip (no download):", target_path) - annex("addurl", "--fast", url, "--file", target_path) + file_size = file["attributes"]["size"] # in bytes + threshold = 1048576 # 1 MB + + if file_size < threshold: + # Original behaviour: download zip with archive=True, others normally + if filename.endswith(".zip"): + d.download_url( + url, + path=os.path.join(inner_path, ""), + archive=True, + ) + else: + d.download_url( + url, + path=os.path.join(inner_path, ""), + archive=False, + ) else: - d.download_url( - url, - path=os.path.join(inner_path, ""), - archive=False, - ) + # New behaviour: for zip files only register the URL, others download normally + if filename.endswith(".zip"): + target_path = os.path.join(inner_path, filename) + if self.verbose: + print("Registering zip (no download):", target_path) + annex("addurl", "--fast", url, "--file", target_path) + else: + d.download_url( + url, + path=os.path.join(inner_path, ""), + archive=False, + ) # append the size of the downloaded file to the sizes array file_size = file["attributes"]["size"] @@ -598,4 +615,4 @@ def _setup_private_dataset( return False def _is_private_dataset(self, files_url) -> bool: - return True if requests.get(files_url).status_code == 401 else False + return True if requests.get(files_url).status_code == 401 else False