From 7d66adc2fe7ecb8935b6849806b99bf8b1b17e34 Mon Sep 17 00:00:00 2001 From: devs6186 Date: Thu, 19 Feb 2026 02:11:03 +0530 Subject: [PATCH] [db/models] Deduplicate repos by repo_src_id on CLI add Fixes #3056 - is_valid_github_repo now returns repo_src_id (GitHub numeric ID) alongside repo_type; the numeric ID is stable across renames - add_cli_repo passes repo_src_id through to insert_github_repo and insert_gitlab_repo - insert_github_repo checks for an existing Repo row with the same repo_src_id before inserting; if found, returns its repo_id so the renamed repo is not ingested twice under a different URL Signed-off-by: devs6186 --- augur/application/db/models/augur_data.py | 11 ++++++++++- augur/util/repo_load_controller.py | 13 ++++++++++--- 2 files changed, 20 insertions(+), 4 deletions(-) diff --git a/augur/application/db/models/augur_data.py b/augur/application/db/models/augur_data.py index 78c2ce7151..54a3b7e1d2 100644 --- a/augur/application/db/models/augur_data.py +++ b/augur/application/db/models/augur_data.py @@ -954,7 +954,7 @@ def is_valid_github_repo(gh_session, url: str) -> bool: return False, {"status": f"Github Error: {data['message']}"} - return True, {"status": "Valid repo", "repo_type": data["owner"]["type"]} + return True, {"status": "Valid repo", "repo_type": data["owner"]["type"], "repo_src_id": data["id"]} return False, {"status": "Failed to validate repo after multiple attempts"} @@ -1145,6 +1145,15 @@ def insert_github_repo(session, url: str, repo_group_id: int, tool_source, repo_ if not owner or not repo: return None + # If we know the numeric source ID, check whether this repo is already + # tracked under a different URL (e.g. after a GitHub rename/transfer). + # This prevents duplicate rows when the same repo is added twice with + # different URLs (issue #3056). + if repo_src_id is not None: + existing = session.query(Repo).filter(Repo.repo_src_id == repo_src_id).first() + if existing: + return existing.repo_id + repo_data = { "repo_group_id": repo_group_id, "repo_git": url, diff --git a/augur/util/repo_load_controller.py b/augur/util/repo_load_controller.py index af46ce3260..d180c4982a 100644 --- a/augur/util/repo_load_controller.py +++ b/augur/util/repo_load_controller.py @@ -49,6 +49,8 @@ def add_cli_repo(self, repo_data: Dict[str, Any], from_org_list=False, repo_type url = repo_data["url"] repo_group_id = repo_data["repo_group_id"] + repo_src_id = None + # if it is from not from an org list then we need to check its validity, and get the repo type if not from_org_list: if "gitlab" in url: @@ -57,19 +59,24 @@ def add_cli_repo(self, repo_data: Dict[str, Any], from_org_list=False, repo_type result = Repo.is_valid_github_repo(self.session, url) if not result[0]: return False, {"status": result[1]["status"], "repo_url": url} - + try: repo_type = result[1]["repo_type"] except KeyError: print("Skipping repo type...") + # Capture the source ID so we can deduplicate by it (issue #3056). + # Repos that were renamed keep the same numeric src ID even though + # their URL changes, so using src_id prevents double-ingestion. + repo_src_id = result[1].get("repo_src_id") + # if the repo doesn't exist it adds it if "gitlab" in url: - repo_id = Repo.insert_gitlab_repo(self.session, url, repo_group_id, "CLI") + repo_id = Repo.insert_gitlab_repo(self.session, url, repo_group_id, "CLI", repo_src_id=repo_src_id) CollectionStatus.insert(self.session, logger, repo_id) else: - repo_id = Repo.insert_github_repo(self.session, url, repo_group_id, "CLI", repo_type) + repo_id = Repo.insert_github_repo(self.session, url, repo_group_id, "CLI", repo_type, repo_src_id=repo_src_id) CollectionStatus.insert(self.session, logger, repo_id) if not repo_id: