From 4bf75029638c10a94a8b72f59eb76a6bd9974a50 Mon Sep 17 00:00:00 2001 From: Adrian Edwards Date: Thu, 12 Feb 2026 13:08:24 -0500 Subject: [PATCH 01/13] start moving repo issue endpoints into the GithubDataAccess class Signed-off-by: Adrian Edwards --- augur/tasks/github/events.py | 12 ++++++------ augur/tasks/github/issues.py | 8 ++++---- augur/tasks/github/messages.py | 6 +++--- augur/tasks/github/util/github_data_access.py | 9 +++++++++ 4 files changed, 22 insertions(+), 13 deletions(-) diff --git a/augur/tasks/github/events.py b/augur/tasks/github/events.py index a2a8736c8a..34ff40fdf1 100644 --- a/augur/tasks/github/events.py +++ b/augur/tasks/github/events.py @@ -48,10 +48,10 @@ def collect_events(repo_git: str, full_collection: bool): def bulk_events_collection_endpoint_contains_all_data(key_auth, logger, owner, repo): - url = f"https://api.github.com/repos/{owner}/{repo}/issues/events?per_page=100" - github_data_access = GithubDataAccess(key_auth, logger) + url = github_data_access.issues_endpoint_url(owner, repo) + "events?per_page=100" + page_count = github_data_access.get_resource_page_count(url) if page_count > 300: @@ -133,11 +133,11 @@ def collect(self, repo_git, key_auth, since): def _collect_events(self, repo_git: str, key_auth, since): owner, repo = get_owner_repo(repo_git) - - url = f"https://api.github.com/repos/{owner}/{repo}/issues/events" github_data_access = GithubDataAccess(key_auth, self._logger) + url = github_data_access.issues_endpoint_url(owner, repo) + "events" + for event in github_data_access.paginate_resource(url): yield event @@ -314,7 +314,7 @@ def _collect_and_process_issue_events(self, owner, repo, repo_id, key_auth, sinc issue_number = issue["issue_number"] - event_url = f"https://api.github.com/repos/{owner}/{repo}/issues/{issue_number}/events" + event_url = github_data_access.issues_endpoint_url(owner, repo) + f"{issue_number}/events" try: @@ -377,7 +377,7 @@ def _collect_and_process_pr_events(self, owner, repo, repo_id, 
key_auth, since): pr_number = pr["gh_pr_number"] - event_url = f"https://api.github.com/repos/{owner}/{repo}/issues/{pr_number}/events" + event_url = github_data_access.issues_endpoint_url(owner, repo) + f"{pr_number}/events" try: diff --git a/augur/tasks/github/issues.py b/augur/tasks/github/issues.py index 6b7b3dd8b7..55aacc0cb8 100644 --- a/augur/tasks/github/issues.py +++ b/augur/tasks/github/issues.py @@ -103,13 +103,13 @@ def retrieve_all_issue_data(repo_git: str, logger: logging.Logger, key_auth: Git logger.info(f"Collecting issues for {owner}/{repo}") - url = f"https://api.github.com/repos/{owner}/{repo}/issues?state=all" + github_data_access = GithubDataAccess(key_auth, logger) + + url = github_data_access.issues_endpoint_url(owner, repo, trailing_slash=False) + "?state=all" if since: url += f"&since={since.isoformat()}" - - github_data_access = GithubDataAccess(key_auth, logger) - + num_pages = github_data_access.get_resource_page_count(url) logger.info(f"{owner}/{repo}: Retrieving {num_pages} pages of issues") diff --git a/augur/tasks/github/messages.py b/augur/tasks/github/messages.py index e8453a18df..32974b6b2a 100644 --- a/augur/tasks/github/messages.py +++ b/augur/tasks/github/messages.py @@ -64,8 +64,10 @@ def fast_retrieve_all_pr_and_issue_messages(repo_git: str, logger, key_auth, tas owner, repo = get_owner_repo(repo_git) + github_data_access = GithubDataAccess(key_auth, logger) + # url to get issue and pull request comments - url = f"https://api.github.com/repos/{owner}/{repo}/issues/comments" + url = github_data_access.issues_endpoint_url(owner, repo) + "comments" if since: url += f"?since={since.isoformat()}" @@ -73,8 +75,6 @@ def fast_retrieve_all_pr_and_issue_messages(repo_git: str, logger, key_auth, tas # define logger for task logger.info(f"Collecting github comments for {owner}/{repo}") - github_data_access = GithubDataAccess(key_auth, logger) - message_count = github_data_access.get_resource_count(url) logger.info(f"{task_name}: 
Collecting {message_count} github messages") diff --git a/augur/tasks/github/util/github_data_access.py b/augur/tasks/github/util/github_data_access.py index 6df216dfda..2de084b03c 100644 --- a/augur/tasks/github/util/github_data_access.py +++ b/augur/tasks/github/util/github_data_access.py @@ -36,6 +36,15 @@ def __init__(self, message="Resource returned HTTP 410 Gone. It is likely intent class GithubDataAccess: + def _base_url(self): + return "https://api.github.com/" + + def issues_endpoint_url(self, owner, repo, trailing_slash = True): + """ https://api.github.com/repos/{owner}/{repo}/issues/ """ + return f"https://api.github.com/repos/{owner}/{repo}/issues" + "/" if trailing_slash else "" + + + def __init__(self, key_manager, logger: logging.Logger, feature="rest"): self.logger = logger From 2572914548ff82bfa9d440ec1619e0146d51cc1c Mon Sep 17 00:00:00 2001 From: Adrian Edwards Date: Thu, 12 Feb 2026 13:15:53 -0500 Subject: [PATCH 02/13] add doc comment to main class Signed-off-by: Adrian Edwards --- augur/tasks/github/util/github_data_access.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/augur/tasks/github/util/github_data_access.py b/augur/tasks/github/util/github_data_access.py index 2de084b03c..507734fa2f 100644 --- a/augur/tasks/github/util/github_data_access.py +++ b/augur/tasks/github/util/github_data_access.py @@ -35,6 +35,8 @@ def __init__(self, message="Resource returned HTTP 410 Gone. 
It is likely intent super().__init__(message) class GithubDataAccess: + """Utilities for accessing the GitHub REST API + """ def _base_url(self): return "https://api.github.com/" From f47065bcef19842138565afddd63f8443e5fb41b Mon Sep 17 00:00:00 2001 From: Adrian Edwards <17362949+MoralCode@users.noreply.github.com> Date: Fri, 13 Feb 2026 09:03:24 -0500 Subject: [PATCH 03/13] fix operator precedence issue Signed-off-by: Adrian Edwards <17362949+MoralCode@users.noreply.github.com> --- augur/tasks/github/util/github_data_access.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/augur/tasks/github/util/github_data_access.py b/augur/tasks/github/util/github_data_access.py index 507734fa2f..08b6ee9644 100644 --- a/augur/tasks/github/util/github_data_access.py +++ b/augur/tasks/github/util/github_data_access.py @@ -43,7 +43,7 @@ def _base_url(self): def issues_endpoint_url(self, owner, repo, trailing_slash = True): """ https://api.github.com/repos/{owner}/{repo}/issues/ """ - return f"https://api.github.com/repos/{owner}/{repo}/issues" + "/" if trailing_slash else "" + return f"https://api.github.com/repos/{owner}/{repo}/issues" + ("/" if trailing_slash else "") From 7279df336e33076b55b15fd418747c81b7b081e2 Mon Sep 17 00:00:00 2001 From: Adrian Edwards Date: Fri, 13 Feb 2026 09:08:36 -0500 Subject: [PATCH 04/13] actually use base URL Signed-off-by: Adrian Edwards --- augur/tasks/github/util/github_data_access.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/augur/tasks/github/util/github_data_access.py b/augur/tasks/github/util/github_data_access.py index 08b6ee9644..e141ed23d7 100644 --- a/augur/tasks/github/util/github_data_access.py +++ b/augur/tasks/github/util/github_data_access.py @@ -43,7 +43,7 @@ def _base_url(self): def issues_endpoint_url(self, owner, repo, trailing_slash = True): """ https://api.github.com/repos/{owner}/{repo}/issues/ """ - return f"https://api.github.com/repos/{owner}/{repo}/issues" + ("/" if 
trailing_slash else "") + return f"{self._base_url()}repos/{owner}/{repo}/issues" + ("/" if trailing_slash else "") From e8df436c5c00aaaf755c465327ede185a0551b3d Mon Sep 17 00:00:00 2001 From: Adrian Edwards Date: Fri, 13 Feb 2026 09:08:46 -0500 Subject: [PATCH 05/13] doc comments and python types Signed-off-by: Adrian Edwards --- augur/tasks/github/util/github_data_access.py | 21 ++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) diff --git a/augur/tasks/github/util/github_data_access.py b/augur/tasks/github/util/github_data_access.py index e141ed23d7..ce60254dcf 100644 --- a/augur/tasks/github/util/github_data_access.py +++ b/augur/tasks/github/util/github_data_access.py @@ -38,11 +38,26 @@ class GithubDataAccess: """Utilities for accessing the GitHub REST API """ - def _base_url(self): + def _base_url(self) -> str: + """the github base URL with HTTP scheme and trailing slash, suitable for building specific API urls. + + Returns: + str: the base url + """ return "https://api.github.com/" - def issues_endpoint_url(self, owner, repo, trailing_slash = True): - """ https://api.github.com/repos/{owner}/{repo}/issues/ """ + def issues_endpoint_url(self, owner:str, repo:str, trailing_slash = True) -> str: + """the github REST API url for the issues endpoint + + Args: + owner (str): the owner/org of the repo + repo (str): the repo name + trailing_slash (bool, optional): Whether to include the trailing slash or not. Defaults to True. + + Returns: + str: the assembled URL with values filled in. 
Example: https://api.github.com/repos/owner/repo/issues/ + """ + return f"{self._base_url()}repos/{owner}/{repo}/issues" + ("/" if trailing_slash else "") From 34f3a28967ffad4a4204b3845e8d5360a1c9c402 Mon Sep 17 00:00:00 2001 From: Adrian Edwards Date: Thu, 26 Feb 2026 14:48:16 -0500 Subject: [PATCH 06/13] refactor github contributors url Signed-off-by: Adrian Edwards --- augur/tasks/github/facade_github/core.py | 9 +++------ augur/tasks/github/util/github_data_access.py | 12 ++++++++++++ 2 files changed, 15 insertions(+), 6 deletions(-) diff --git a/augur/tasks/github/facade_github/core.py b/augur/tasks/github/facade_github/core.py index d60c0c18d8..d8d3db5ba5 100644 --- a/augur/tasks/github/facade_github/core.py +++ b/augur/tasks/github/facade_github/core.py @@ -26,12 +26,6 @@ def query_github_contributors(logger, key_auth, github_url, tool_source:str, too logger.error(f"Encountered bad url: {github_url}") raise e - # Set the base of the url and place to hold contributors to insert - contributors_url = ( - f"https://api.github.com/repos/{owner}/{name}/" + - "contributors?state=all" - ) - # Get contributors that we already have stored # Set our duplicate and update column map keys (something other than PK) to # check dupicates/needed column updates with @@ -42,6 +36,9 @@ def query_github_contributors(logger, key_auth, github_url, tool_source:str, too github_data_access = GithubDataAccess(key_auth, logger) + # Set the base of the url and place to hold contributors to insert + contributors_url = github_data_access.contributors_endpoint_url(owner, repo) + "?state=all" + contributor_count = github_data_access.get_resource_count(contributors_url) logger.info("Count of contributors needing insertion: " + str(contributor_count) + "\n") diff --git a/augur/tasks/github/util/github_data_access.py b/augur/tasks/github/util/github_data_access.py index ce60254dcf..79245cc9b9 100644 --- a/augur/tasks/github/util/github_data_access.py +++ 
b/augur/tasks/github/util/github_data_access.py @@ -59,8 +59,20 @@ def issues_endpoint_url(self, owner:str, repo:str, trailing_slash = True) -> str """ return f"{self._base_url()}repos/{owner}/{repo}/issues" + ("/" if trailing_slash else "") + + def contributors_endpoint_url(self, owner:str, repo:str, trailing_slash = True) -> str: + """the github REST API url for the contributors endpoint + Args: + owner (str): the owner/org of the repo + repo (str): the repo name + trailing_slash (bool, optional): Whether to include the trailing slash or not. Defaults to True. + Returns: + str: the assembled URL with values filled in. Example: https://api.github.com/repos/owner/repo/contributors/ + """ + + return f"{self._base_url()}repos/{owner}/{repo}/contributors" + ("/" if trailing_slash else "") def __init__(self, key_manager, logger: logging.Logger, feature="rest"): From a4e27f9ed3cdcc84074c35315a099f304209bc4b Mon Sep 17 00:00:00 2001 From: Adrian Edwards Date: Thu, 26 Feb 2026 14:51:50 -0500 Subject: [PATCH 07/13] refactor user endpoint url Signed-off-by: Adrian Edwards --- augur/tasks/github/facade_github/tasks.py | 2 +- augur/tasks/github/util/github_data_access.py | 13 +++++++++++++ 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/augur/tasks/github/facade_github/tasks.py b/augur/tasks/github/facade_github/tasks.py index 823201ad38..6864ae0a19 100644 --- a/augur/tasks/github/facade_github/tasks.py +++ b/augur/tasks/github/facade_github/tasks.py @@ -87,7 +87,7 @@ def process_commit_metadata(logger, auth, contributorQueue, repo_id, platform_id # move on to the next contributor continue - url = ("https://api.github.com/users/" + login) + url = github_data_access.user_endpoint_url(login) try: user_data = github_data_access.get_resource(url) diff --git a/augur/tasks/github/util/github_data_access.py b/augur/tasks/github/util/github_data_access.py index 79245cc9b9..d01355a290 100644 --- a/augur/tasks/github/util/github_data_access.py +++ 
b/augur/tasks/github/util/github_data_access.py @@ -73,6 +73,19 @@ def contributors_endpoint_url(self, owner:str, repo:str, trailing_slash = True) """ return f"{self._base_url()}repos/{owner}/{repo}/contributors" + ("/" if trailing_slash else "") + + def user_endpoint_url(self, username:str, trailing_slash = True) -> str: + """the github REST API url for the users endpoint + + Args: + username (str): the github username to query + trailing_slash (bool, optional): Whether to include the trailing slash or not. Defaults to True. + + Returns: + str: the assembled URL with values filled in. Example: https://api.github.com/users/username/ + """ + + return f"{self._base_url()}users/{username}" + ("/" if trailing_slash else "") def __init__(self, key_manager, logger: logging.Logger, feature="rest"): From ae071374b3fb20ec11c14fc6aa5189bdf009a718 Mon Sep 17 00:00:00 2001 From: Adrian Edwards Date: Thu, 26 Feb 2026 15:16:08 -0500 Subject: [PATCH 08/13] tiny refactor to just consolidate github data access calls Signed-off-by: Adrian Edwards --- augur/tasks/github/facade_github/tasks.py | 6 ++---- augur/tasks/github/util/github_data_access.py | 6 ++++++ 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/augur/tasks/github/facade_github/tasks.py b/augur/tasks/github/facade_github/tasks.py index 6864ae0a19..e6140b92ec 100644 --- a/augur/tasks/github/facade_github/tasks.py +++ b/augur/tasks/github/facade_github/tasks.py @@ -86,11 +86,9 @@ def process_commit_metadata(logger, auth, contributorQueue, repo_id, platform_id f"Could not create new unresolved email {email}. Error: {e}") # move on to the next contributor continue - - url = github_data_access.user_endpoint_url(login) - + try: - user_data = github_data_access.get_resource(url) + user_data = github_data_access.get_user(login) except UrlNotFoundException as e: logger.warning(f"User of {login} not found on github. 
Skipping...") continue diff --git a/augur/tasks/github/util/github_data_access.py b/augur/tasks/github/util/github_data_access.py index d01355a290..7eaf82a67a 100644 --- a/augur/tasks/github/util/github_data_access.py +++ b/augur/tasks/github/util/github_data_access.py @@ -95,6 +95,12 @@ def __init__(self, key_manager, logger: logging.Logger, feature="rest"): self.key = None self.expired_keys_for_request = [] + + def get_user(self, username:str): + url = self.user_endpoint_url(username) + + return self.get_resource(url) + def get_resource_count(self, url): # set per_page to 100 explicitly so we know each page is 100 long From 593c895f5a825e4a840a2019470d82538dbabb70 Mon Sep 17 00:00:00 2001 From: Adrian Edwards Date: Thu, 26 Feb 2026 16:14:29 -0500 Subject: [PATCH 09/13] rebuild github endpoint urls in the data access class Signed-off-by: Adrian Edwards --- augur/tasks/github/util/github_data_access.py | 26 +++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/augur/tasks/github/util/github_data_access.py b/augur/tasks/github/util/github_data_access.py index 7eaf82a67a..777c6e4579 100644 --- a/augur/tasks/github/util/github_data_access.py +++ b/augur/tasks/github/util/github_data_access.py @@ -86,6 +86,32 @@ def user_endpoint_url(self, username:str, trailing_slash = True) -> str: """ return f"{self._base_url()}users/{username}" + ("/" if trailing_slash else "") + + def user_endpoint_urls(self, username:str) -> dict: + """the github REST API urls beneath the users endpoint, in dict form. + Intended to enable the recreation of a subset of what is returned by the github API + + Args: + username (str): the github username to query + + Returns: + dict: a dict of various user sub urls like would be returned by github's API. 
+ """ + user_url = self.user_endpoint_url(username, trailing_slash=False) + return { + "url": user_url, + "html_url": f"https://github.com/{username}", + "followers_url": f"{user_url}/followers", + "following_url": user_url + "/following{/other_user}", + "gists_url": user_url + "/gists{/gist_id}", + "starred_url": user_url + "/starred{/owner}{/repo}", + "subscriptions_url": f"{user_url}/subscriptions", + "organizations_url": f"{user_url}/orgs", + "repos_url": f"{user_url}/repos", + "events_url": user_url + "/events{/privacy}", + "received_events_url": f"{user_url}/received_events", + } + def __init__(self, key_manager, logger: logging.Logger, feature="rest"): From 386cd96d7890722262085468bc1eac8d2434119d Mon Sep 17 00:00:00 2001 From: Adrian Edwards Date: Thu, 26 Feb 2026 17:02:58 -0500 Subject: [PATCH 10/13] factor out base domain Signed-off-by: Adrian Edwards --- augur/tasks/github/util/github_data_access.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/augur/tasks/github/util/github_data_access.py b/augur/tasks/github/util/github_data_access.py index 777c6e4579..6cbffd6c5d 100644 --- a/augur/tasks/github/util/github_data_access.py +++ b/augur/tasks/github/util/github_data_access.py @@ -38,13 +38,21 @@ class GithubDataAccess: """Utilities for accessing the GitHub REST API """ + def _base_domain(self) -> str: + """the base domain against which api calls are assembled + + Returns: + str: the base domain by itself + """ + return "github.com" + def _base_url(self) -> str: """the github base URL with HTTP scheme and trailing slash, suitable for building specific API urls. 
Returns: str: the base url """ - return "https://api.github.com/" + return f"https://api.{self._base_domain()}/" def issues_endpoint_url(self, owner:str, repo:str, trailing_slash = True) -> str: """the github REST API url for the issues endpoint From 503c14f83301e1f4ff5c00f7d2a4723c451db804 Mon Sep 17 00:00:00 2001 From: Adrian Edwards Date: Wed, 4 Mar 2026 19:33:17 -0500 Subject: [PATCH 11/13] create a function for crafting search endpoints on github Signed-off-by: Adrian Edwards --- augur/tasks/github/util/github_data_access.py | 23 ++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) diff --git a/augur/tasks/github/util/github_data_access.py b/augur/tasks/github/util/github_data_access.py index 6cbffd6c5d..25f385dc50 100644 --- a/augur/tasks/github/util/github_data_access.py +++ b/augur/tasks/github/util/github_data_access.py @@ -5,6 +5,7 @@ from urllib.parse import urlparse, parse_qs, urlencode from keyman.KeyClient import KeyClient from augur.util.keys import mask_key +import urllib.parse GITHUB_RATELIMIT_REMAINING_CAP = 50 @@ -120,7 +121,27 @@ def user_endpoint_urls(self, username:str) -> dict: "received_events_url": f"{user_url}/received_events", } - + + def search_endpoint(self, topic: str, query: str) -> str: + """construct a github API call to perform a search + + Args: + topic (str): the topic to search. Valid options are: users, code, commits, issues, labels, repositories, topics. + query (str): the query string to search as you'd type it into GitHub's search bar. 
Example: "email@example.com in:email type:user" + + Raises: + ValueError: if an invalid topic is provided + + Returns: + str: a URL that can be queried to perform the search + """ + topic = topic.lower() + if topic not in ["users", "code", "commits", "issues", "labels", "repositories", "topics" ]: + raise ValueError(f"Invalid topic '{topic}' provided for searching github.") + + return f"{self._base_url()}search/{topic}?q={urllib.parse.quote(query)}" + + def __init__(self, key_manager, logger: logging.Logger, feature="rest"): self.logger = logger From e11df002866888eef071de1afc30c8ae5dc4d7a0 Mon Sep 17 00:00:00 2001 From: Adrian Edwards Date: Fri, 27 Feb 2026 16:45:26 -0500 Subject: [PATCH 12/13] create a function that performs searches Signed-off-by: Adrian Edwards --- augur/tasks/github/util/github_data_access.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/augur/tasks/github/util/github_data_access.py b/augur/tasks/github/util/github_data_access.py index 25f385dc50..b2c2e00f87 100644 --- a/augur/tasks/github/util/github_data_access.py +++ b/augur/tasks/github/util/github_data_access.py @@ -156,6 +156,10 @@ def get_user(self, username:str): return self.get_resource(url) + def perform_search(self, topic: str, query: str): + url = self.search_endpoint(topic, query) + return self.get_resource(url) + def get_resource_count(self, url): # set per_page to 100 explicitly so we know each page is 100 long From 5b6e5f8965a8196377ff9668ac22e66ca4d72324 Mon Sep 17 00:00:00 2001 From: Adrian Edwards Date: Fri, 27 Feb 2026 16:45:36 -0500 Subject: [PATCH 13/13] github data access usage docs Signed-off-by: Adrian Edwards --- augur/tasks/github/util/github_data_access.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/augur/tasks/github/util/github_data_access.py b/augur/tasks/github/util/github_data_access.py index b2c2e00f87..1c17960826 100644 --- a/augur/tasks/github/util/github_data_access.py +++ b/augur/tasks/github/util/github_data_access.py @@ -37,6 +37,9 @@ 
def __init__(self, message="Resource returned HTTP 410 Gone. It is likely intent class GithubDataAccess: """Utilities for accessing the GitHub REST API + + Public-facing functions in this class should refrain from returning data in a structure + that is derived from GitHub's API responses to keep all platform-specific parsing here. """ def _base_domain(self) -> str: