From 75ea9121247ceef5db652f725f09d257d84640cb Mon Sep 17 00:00:00 2001 From: kongtiaowang Date: Mon, 23 Mar 2026 10:22:12 -0400 Subject: [PATCH 1/2] BaseCrawler --- scripts/Crawlers/BaseCrawler.py | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/scripts/Crawlers/BaseCrawler.py b/scripts/Crawlers/BaseCrawler.py index 0d6f565fc..1fe76b92a 100644 --- a/scripts/Crawlers/BaseCrawler.py +++ b/scripts/Crawlers/BaseCrawler.py @@ -326,14 +326,29 @@ def run(self): modified = True commit_msg = "Created " + dataset_description["title"] else: # Dataset already existing locally - self.repo.git.checkout("-f", branch_name) + try: + # Try normal checkout first + self.repo.git.checkout("-f", branch_name) + except git.exc.GitCommandError as e: + if "filter-process" in str(e): + self.repo.git.checkout("-c", "filter.process=", "-f", branch_name) + else: + raise + try: self.repo.git.merge("-n", "--no-verify", "master") except Exception as e: print(f"Error while merging master into {branch_name}: {e}") print("Skipping this dataset") self.repo.git.merge("--abort") - self.repo.git.checkout("-f", "master") + try: + # Use the same safe checkout to go back to master + self.repo.git.checkout("-f", "master") + except git.exc.GitCommandError as e: + if "filter-process" in str(e): + self.repo.git.checkout("-c", "filter.process=", "-f", "master") + else: + raise continue modified = self.update_if_necessary( From 06210f4d8f9335e0dabc0d6c3d9148d5f02a6018 Mon Sep 17 00:00:00 2001 From: kongtiaowang Date: Mon, 23 Mar 2026 11:39:03 -0400 Subject: [PATCH 2/2] fix block --- scripts/Crawlers/BaseCrawler.py | 34 +++++++++++++++++++++++++++++---- 1 file changed, 30 insertions(+), 4 deletions(-) diff --git a/scripts/Crawlers/BaseCrawler.py b/scripts/Crawlers/BaseCrawler.py index 1fe76b92a..7ea524dc1 100644 --- a/scripts/Crawlers/BaseCrawler.py +++ b/scripts/Crawlers/BaseCrawler.py @@ -331,7 +331,16 @@ def run(self): self.repo.git.checkout("-f", branch_name) except git.exc.GitCommandError as e: if "filter-process" in str(e): - self.repo.git.checkout("-c", "filter.process=", "-f", branch_name) + self.repo.git.execute( + [ + "git", + "-c", + "filter.annex.process=", + "checkout", + "-f", + branch_name, + ] + ) else: raise @@ -346,7 +355,16 @@ def run(self): self.repo.git.checkout("-f", "master") except git.exc.GitCommandError as e: if "filter-process" in str(e): - self.repo.git.checkout("-c", "filter.process=", "-f", "master") + self.repo.git.execute( + [ + "git", + "-c", + "filter.annex.process=", + "checkout", + "-f", + "master", + ] + ) else: raise continue @@ -394,7 +412,7 @@ def run(self): ) d.save() d.publish(to="origin") - commit_msg = "Updated " + dataset_description["title"] + commit_msg = "Updated " + dataset_description["title"] # If modification detected in dataset, push to branch and create PR if modified: @@ -407,7 +425,15 @@ def run(self): print(e) # Go back to master - self.repo.git.checkout("master") + try: + self.repo.git.checkout("master") + except git.exc.GitCommandError as e: + if "filter-process" in str(e): + self.repo.git.execute( + ["git", "-c", "filter.annex.process=", "checkout", "master"] + ) + else: + raise def _add_github_repo_description(self, repo_title, dataset_description): url = "https://api.github.com/repos/{}/{}".format(