CONP-PCNO · kongtiaowang · Mar 9, 2026 · Mar 10, 2026
diff --git a/scripts/Crawlers/OSFCrawler.py b/scripts/Crawlers/OSFCrawler.py
@@ -130,19 +130,36 @@ def _download_files(
                 else:
                     filename = file["attributes"]["name"]
                     url = file["links"]["download"]
-
-                    # Handle zip files: only register URL, don't download
-                    if filename.endswith(".zip"):
-                        target_path = os.path.join(inner_path, filename)
-                        if self.verbose:
-                            print("Registering zip (no download):", target_path)
-                        annex("addurl", "--fast", url, "--file", target_path)
+                    file_size = file["attributes"]["size"]  # in bytes
+                    threshold = 1048576  # 1 MiB (1024 * 1024 bytes)
+
+                    if file_size < threshold:
+                        # Below the threshold: download the file; zip files are fetched with archive=True
+                        if filename.endswith(".zip"):
+                            d.download_url(
+                                url,
+                                path=os.path.join(inner_path, ""),
+                                archive=True,
+                            )
+                        else:
+                            d.download_url(
+                                url,
+                                path=os.path.join(inner_path, ""),
+                                archive=False,
+                            )
                     else:
-                        d.download_url(
-                            url,
-                            path=os.path.join(inner_path, ""),
-                            archive=False,
-                        )
+                        # At or above the threshold: register zip files by URL without downloading them; download other files normally
+                        if filename.endswith(".zip"):
+                            target_path = os.path.join(inner_path, filename)
+                            if self.verbose:
+                                print("Registering zip (no download):", target_path)
+                            annex("addurl", "--fast", url, "--file", target_path)
+                        else:
+                            d.download_url(
+                                url,
+                                path=os.path.join(inner_path, ""),
+                                archive=False,
+                            )
 
                 # append the size of the downloaded file to the sizes array
                 file_size = file["attributes"]["size"]
@@ -598,4 +615,4 @@ def _setup_private_dataset(
         return False
 
     def _is_private_dataset(self, files_url) -> bool:
-        return True if requests.get(files_url).status_code == 401 else False  
+        return requests.get(files_url).status_code == 401  # HTTP 401 from files_url is treated as private