From 71dbabb89ba4745b353b951cd1144600ca611b2f Mon Sep 17 00:00:00 2001 From: Adrian Edwards Date: Thu, 19 Mar 2026 14:28:05 -0400 Subject: [PATCH 1/5] add a function in GitHubDataAccess to query pull request availability Signed-off-by: Adrian Edwards --- augur/tasks/github/pull_requests/tasks.py | 4 ++++ augur/tasks/github/util/github_data_access.py | 16 ++++++++++++++++ 2 files changed, 20 insertions(+) diff --git a/augur/tasks/github/pull_requests/tasks.py b/augur/tasks/github/pull_requests/tasks.py index 60cc9e1e66..d09c4339fd 100644 --- a/augur/tasks/github/pull_requests/tasks.py +++ b/augur/tasks/github/pull_requests/tasks.py @@ -79,6 +79,10 @@ def retrieve_all_pr_data(repo_git: str, logger, key_auth, since): #-> Generator[ github_data_access = GithubDataAccess(key_auth, logger) + if not github_data_access.check_prs_enabled(owner, repo): + logger.info(f"{owner}/{repo}: Pull requests appear to be disabled for this repo. Skipping.") + return + num_pages = github_data_access.get_resource_page_count(url) logger.debug(f"{owner}/{repo}: Retrieving {num_pages} pages of pull requests") diff --git a/augur/tasks/github/util/github_data_access.py b/augur/tasks/github/util/github_data_access.py index 6df216dfda..e1959d0df3 100644 --- a/augur/tasks/github/util/github_data_access.py +++ b/augur/tasks/github/util/github_data_access.py @@ -60,6 +60,22 @@ def get_resource_count(self, url): return (100 * (num_pages -1)) + len(data) + def check_prs_enabled(self, owner: str, repo: str,) -> bool: + """ + Checks whether pull requests are enabled for a repository. + Returns False if PRs are disabled (404 on /pulls) and true if there are PRs. + """ + + url = f"https://api.github.com/repos/{owner}/{repo}/pulls?per_page=1" + + try: + self.get_resource_page_count(url) + return True + except UrlNotFoundException: + self.logger.info(f"{owner}/{repo}: Pull requests are disabled. Skipping PR collection.") + return False + + def paginate_resource(self, url): response = self.make_request_with_retries(url) From 9640afe8ce2043054e4445028e494c6e53f1ed1d Mon Sep 17 00:00:00 2001 From: Adrian Edwards Date: Fri, 20 Mar 2026 09:33:38 -0400 Subject: [PATCH 2/5] factor URL creation into a function in GithubDataAccess for better handling of future query param encoding needs and sharing responsibility for url creation (GhDA owns the domain/base url, the caller owns the path and query params they want to use) Signed-off-by: Adrian Edwards --- augur/tasks/github/util/github_data_access.py | 34 +++++++++++++++++-- 1 file changed, 31 insertions(+), 3 deletions(-) diff --git a/augur/tasks/github/util/github_data_access.py b/augur/tasks/github/util/github_data_access.py index e1959d0df3..5b29acbf24 100644 --- a/augur/tasks/github/util/github_data_access.py +++ b/augur/tasks/github/util/github_data_access.py @@ -5,6 +5,7 @@ from urllib.parse import urlparse, parse_qs, urlencode from keyman.KeyClient import KeyClient from augur.util.keys import mask_key +import urllib.parse GITHUB_RATELIMIT_REMAINING_CAP = 50 @@ -44,6 +45,35 @@ def __init__(self, key_manager, logger: logging.Logger, feature="rest"): self.key = None self.expired_keys_for_request = [] + def endpoint_url(self, path: str, params: dict = None) -> str: + """Build a URL for a github endpoint using the specified path and query parameters + + Args: + path (str): the path to use (i.e. "/users/MoralCode") + params (dict): optional query parameters to add to the url, as a dict + + Returns: + str: the full URL to the specified resource. + """ + # using pythons url processing library this way helps handle accidental + # inclusion of query parameters in the path string, ensuring all query + # parameters are properly encoded and escaped + + input_url_parts = urllib.parse.urlsplit(path) + final_query_parameters = dict() + + if input_url_parts.query != '': + final_query_parameters.update( + parse_qs(input_url_parts.query) + ) + + if params != None: + final_query_parameters.update(params) + + return urllib.parse.urlunsplit( + ('https', 'api.github.com', input_url_parts.path, urllib.parse.urlencode(final_query_parameters), '') + ) + def get_resource_count(self, url): # set per_page to 100 explicitly so we know each page is 100 long @@ -65,10 +95,8 @@ def check_prs_enabled(self, owner: str, repo: str,) -> bool: Checks whether pull requests are enabled for a repository. Returns False if PRs are disabled (404 on /pulls) and true if there are PRs. """ - - url = f"https://api.github.com/repos/{owner}/{repo}/pulls?per_page=1" - try: + url = github_data_access.endpoint_url(f"repos/{owner}/{repo}/pulls", {"per_page": "1"}) self.get_resource_page_count(url) return True except UrlNotFoundException: From 9a4e57ef94a334170cb1e46bba6f73827cb018fb Mon Sep 17 00:00:00 2001 From: Adrian Edwards Date: Fri, 20 Mar 2026 09:34:08 -0400 Subject: [PATCH 3/5] perform pull requests lookup with the new endpoint url builder Signed-off-by: Adrian Edwards --- augur/tasks/github/pull_requests/tasks.py | 5 +++-- augur/tasks/github/util/github_data_access.py | 2 +- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/augur/tasks/github/pull_requests/tasks.py b/augur/tasks/github/pull_requests/tasks.py index d09c4339fd..96343b2886 100644 --- a/augur/tasks/github/pull_requests/tasks.py +++ b/augur/tasks/github/pull_requests/tasks.py @@ -75,10 +75,11 @@ def retrieve_all_pr_data(repo_git: str, logger, key_auth, since): #-> Generator[ logger.debug(f"Collecting pull requests for {owner}/{repo}") - url = f"https://api.github.com/repos/{owner}/{repo}/pulls?state=all&direction=desc&sort=updated" - github_data_access = GithubDataAccess(key_auth, logger) + search_args = {"state": "all", "direction": "desc", "sort": "updated"} + url = github_data_access.endpoint_url(f"repos/{owner}/{repo}/pulls", search_args) + if not github_data_access.check_prs_enabled(owner, repo): logger.info(f"{owner}/{repo}: Pull requests appear to be disabled for this repo. Skipping.") return diff --git a/augur/tasks/github/util/github_data_access.py b/augur/tasks/github/util/github_data_access.py index 5b29acbf24..00dbd2cbfb 100644 --- a/augur/tasks/github/util/github_data_access.py +++ b/augur/tasks/github/util/github_data_access.py @@ -96,7 +96,7 @@ def check_prs_enabled(self, owner: str, repo: str,) -> bool: Returns False if PRs are disabled (404 on /pulls) and true if there are PRs. """ try: - url = github_data_access.endpoint_url(f"repos/{owner}/{repo}/pulls", {"per_page": "1"}) + url = self.endpoint_url(f"repos/{owner}/{repo}/pulls", {"per_page": "1"}) self.get_resource_page_count(url) return True except UrlNotFoundException: From 9b267a941a9e27e383401485bf6dec8c1dcb1c43 Mon Sep 17 00:00:00 2001 From: Adrian Edwards Date: Fri, 20 Mar 2026 19:29:31 -0400 Subject: [PATCH 4/5] add some deprecation warnings to functions that should be replaced by this in the future Signed-off-by: Adrian Edwards --- .../contributor_interfaceable/contributor_interface.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/augur/tasks/github/facade_github/contributor_interfaceable/contributor_interface.py b/augur/tasks/github/facade_github/contributor_interfaceable/contributor_interface.py index ff65da319b..3bd018aec3 100644 --- a/augur/tasks/github/facade_github/contributor_interfaceable/contributor_interface.py +++ b/augur/tasks/github/facade_github/contributor_interfaceable/contributor_interface.py @@ -8,7 +8,7 @@ # Debugger from augur.tasks.github.util.github_paginator import GithubApiResult from augur.application.db.lib import get_repo_by_repo_id, bulk_insert_dicts, execute_sql, get_contributors_by_github_user_id - +from typing_extensions import deprecated ##TODO: maybe have a TaskSession class that holds information about the database, logger, config, etc. @@ -107,7 +107,7 @@ def request_dict_from_endpoint(logger, session, url, timeout_wait=10): return response_data - +@deprecated("Please use GithubDataAcess.endpoint_url() instead") def create_endpoint_from_email(email): # Note: I added "+type:user" to avoid having user owned organizations be returned # Also stopped splitting per note above. @@ -117,7 +117,7 @@ def create_endpoint_from_email(email): return url - +@deprecated("Please use GithubDataAcess.endpoint_url() instead") def create_endpoint_from_commit_sha(logger, commit_sha, repo_id): logger.debug( f"Trying to create endpoint from commit hash: {commit_sha}") From 1cea61af63393c5ec5cb8f266a88783db2719fb5 Mon Sep 17 00:00:00 2001 From: Adrian Edwards Date: Fri, 27 Mar 2026 08:21:26 -0400 Subject: [PATCH 5/5] refactor to reuse an existing internal function, rather than rewriting what it already does Signed-off-by: Adrian Edwards --- augur/tasks/github/util/github_data_access.py | 19 +++++-------------- 1 file changed, 5 insertions(+), 14 deletions(-) diff --git a/augur/tasks/github/util/github_data_access.py b/augur/tasks/github/util/github_data_access.py index 00dbd2cbfb..99496a9627 100644 --- a/augur/tasks/github/util/github_data_access.py +++ b/augur/tasks/github/util/github_data_access.py @@ -5,7 +5,6 @@ from urllib.parse import urlparse, parse_qs, urlencode from keyman.KeyClient import KeyClient from augur.util.keys import mask_key -import urllib.parse GITHUB_RATELIMIT_REMAINING_CAP = 50 @@ -55,24 +54,16 @@ def endpoint_url(self, path: str, params: dict = None) -> str: Returns: str: the full URL to the specified resource. """ - # using pythons url processing library this way helps handle accidental + # using pythons url processing library helps handle accidental # inclusion of query parameters in the path string, ensuring all query # parameters are properly encoded and escaped - input_url_parts = urllib.parse.urlsplit(path) - final_query_parameters = dict() + if not path.startswith("/"): + path = "/" + path - if input_url_parts.query != '': - final_query_parameters.update( - parse_qs(input_url_parts.query) - ) - - if params != None: - final_query_parameters.update(params) + url = "https://api.github.com" + path - return urllib.parse.urlunsplit( - ('https', 'api.github.com', input_url_parts.path, urllib.parse.urlencode(final_query_parameters), '') - ) + return self.__add_query_params(url, params or {}) def get_resource_count(self, url):