diff --git a/augur/tasks/github/events.py b/augur/tasks/github/events.py index a2a8736c8a..34ff40fdf1 100644 --- a/augur/tasks/github/events.py +++ b/augur/tasks/github/events.py @@ -48,10 +48,10 @@ def collect_events(repo_git: str, full_collection: bool): def bulk_events_collection_endpoint_contains_all_data(key_auth, logger, owner, repo): - url = f"https://api.github.com/repos/{owner}/{repo}/issues/events?per_page=100" - github_data_access = GithubDataAccess(key_auth, logger) + url = github_data_access.issues_endpoint_url(owner, repo) + "events?per_page=100" + page_count = github_data_access.get_resource_page_count(url) if page_count > 300: @@ -133,11 +133,11 @@ def collect(self, repo_git, key_auth, since): def _collect_events(self, repo_git: str, key_auth, since): owner, repo = get_owner_repo(repo_git) - - url = f"https://api.github.com/repos/{owner}/{repo}/issues/events" github_data_access = GithubDataAccess(key_auth, self._logger) + url = github_data_access.issues_endpoint_url(owner, repo) + "events" + for event in github_data_access.paginate_resource(url): yield event @@ -314,7 +314,7 @@ def _collect_and_process_issue_events(self, owner, repo, repo_id, key_auth, sinc issue_number = issue["issue_number"] - event_url = f"https://api.github.com/repos/{owner}/{repo}/issues/{issue_number}/events" + event_url = github_data_access.issues_endpoint_url(owner, repo) + f"{issue_number}/events" try: @@ -377,7 +377,7 @@ def _collect_and_process_pr_events(self, owner, repo, repo_id, key_auth, since): pr_number = pr["gh_pr_number"] - event_url = f"https://api.github.com/repos/{owner}/{repo}/issues/{pr_number}/events" + event_url = github_data_access.issues_endpoint_url(owner, repo) + f"{pr_number}/events" try: diff --git a/augur/tasks/github/facade_github/core.py b/augur/tasks/github/facade_github/core.py index d60c0c18d8..d8d3db5ba5 100644 --- a/augur/tasks/github/facade_github/core.py +++ b/augur/tasks/github/facade_github/core.py @@ -26,12 +26,6 @@ def 
query_github_contributors(logger, key_auth, github_url, tool_source:str, too logger.error(f"Encountered bad url: {github_url}") raise e - # Set the base of the url and place to hold contributors to insert - contributors_url = ( - f"https://api.github.com/repos/{owner}/{name}/" + - "contributors?state=all" - ) - # Get contributors that we already have stored # Set our duplicate and update column map keys (something other than PK) to # check dupicates/needed column updates with @@ -42,6 +36,9 @@ def query_github_contributors(logger, key_auth, github_url, tool_source:str, too github_data_access = GithubDataAccess(key_auth, logger) + # Set the base of the url and place to hold contributors to insert + contributors_url = github_data_access.contributors_endpoint_url(owner, name, trailing_slash=False) + "?state=all" + contributor_count = github_data_access.get_resource_count(contributors_url) logger.info("Count of contributors needing insertion: " + str(contributor_count) + "\n") diff --git a/augur/tasks/github/facade_github/tasks.py b/augur/tasks/github/facade_github/tasks.py index 823201ad38..e6140b92ec 100644 --- a/augur/tasks/github/facade_github/tasks.py +++ b/augur/tasks/github/facade_github/tasks.py @@ -86,11 +86,9 @@ def process_commit_metadata(logger, auth, contributorQueue, repo_id, platform_id f"Could not create new unresolved email {email}. Error: {e}") # move on to the next contributor continue - - url = ("https://api.github.com/users/" + login) - + try: - user_data = github_data_access.get_resource(url) + user_data = github_data_access.get_user(login) except UrlNotFoundException as e: logger.warning(f"User of {login} not found on github. 
Skipping...") continue diff --git a/augur/tasks/github/issues.py b/augur/tasks/github/issues.py index 6b7b3dd8b7..55aacc0cb8 100644 --- a/augur/tasks/github/issues.py +++ b/augur/tasks/github/issues.py @@ -103,13 +103,13 @@ def retrieve_all_issue_data(repo_git: str, logger: logging.Logger, key_auth: Git logger.info(f"Collecting issues for {owner}/{repo}") - url = f"https://api.github.com/repos/{owner}/{repo}/issues?state=all" + github_data_access = GithubDataAccess(key_auth, logger) + + url = github_data_access.issues_endpoint_url(owner, repo, trailing_slash=False) + "?state=all" if since: url += f"&since={since.isoformat()}" - - github_data_access = GithubDataAccess(key_auth, logger) - + num_pages = github_data_access.get_resource_page_count(url) logger.info(f"{owner}/{repo}: Retrieving {num_pages} pages of issues") diff --git a/augur/tasks/github/messages.py b/augur/tasks/github/messages.py index e8453a18df..32974b6b2a 100644 --- a/augur/tasks/github/messages.py +++ b/augur/tasks/github/messages.py @@ -64,8 +64,10 @@ def fast_retrieve_all_pr_and_issue_messages(repo_git: str, logger, key_auth, tas owner, repo = get_owner_repo(repo_git) + github_data_access = GithubDataAccess(key_auth, logger) + # url to get issue and pull request comments - url = f"https://api.github.com/repos/{owner}/{repo}/issues/comments" + url = github_data_access.issues_endpoint_url(owner, repo) + "comments" if since: url += f"?since={since.isoformat()}" @@ -73,8 +75,6 @@ def fast_retrieve_all_pr_and_issue_messages(repo_git: str, logger, key_auth, tas # define logger for task logger.info(f"Collecting github comments for {owner}/{repo}") - github_data_access = GithubDataAccess(key_auth, logger) - message_count = github_data_access.get_resource_count(url) logger.info(f"{task_name}: Collecting {message_count} github messages") diff --git a/augur/tasks/github/util/github_data_access.py b/augur/tasks/github/util/github_data_access.py index 6df216dfda..1c17960826 100644 --- 
a/augur/tasks/github/util/github_data_access.py +++ b/augur/tasks/github/util/github_data_access.py @@ -5,6 +5,7 @@ from urllib.parse import urlparse, parse_qs, urlencode from keyman.KeyClient import KeyClient from augur.util.keys import mask_key +import urllib.parse GITHUB_RATELIMIT_REMAINING_CAP = 50 @@ -35,6 +36,114 @@ def __init__(self, message="Resource returned HTTP 410 Gone. It is likely intent super().__init__(message) class GithubDataAccess: + """Utilities for accessing the GitHub REST API + + Public facing functions in this class should refrain from returning data in a structure + that is derived from githubs API responses to keep all platform-specific parsing here. + """ + + def _base_domain(self) -> str: + """the base domain against which api calls are assembled + + Returns: + str: the base domain by itself + """ + return "github.com" + + def _base_url(self) -> str: + """the github base URL with HTTP scheme and trailing slash, suitable for building specific API urls. + + Returns: + str: the base url + """ + return f"https://api.{self._base_domain()}/" + + def issues_endpoint_url(self, owner:str, repo:str, trailing_slash = True) -> str: + """the github REST API url for the issues endpoint + + Args: + owner (str): the owner/org of the repo + repo (str): the repo name + trailing_slash (bool, optional): Whether to include the trailing slash or not. Defaults to True. + + Returns: + str: the assembled URL with values filled in. Example: https://api.github.com/repos/owner/repo/issues/ + """ + + return f"{self._base_url()}repos/{owner}/{repo}/issues" + ("/" if trailing_slash else "") + + def contributors_endpoint_url(self, owner:str, repo:str, trailing_slash = True) -> str: + """the github REST API url for the contributors endpoint + + Args: + owner (str): the owner/org of the repo + repo (str): the repo name + trailing_slash (bool, optional): Whether to include the trailing slash or not. Defaults to True. 
+ + Returns: + str: the assembled URL with values filled in. Example: https://api.github.com/repos/owner/repo/contributors/ + """ + + return f"{self._base_url()}repos/{owner}/{repo}/contributors" + ("/" if trailing_slash else "") + + def user_endpoint_url(self, username:str, trailing_slash = True) -> str: + """the github REST API url for the users endpoint + + Args: + username (str): the github username to query + trailing_slash (bool, optional): Whether to include the trailing slash or not. Defaults to True. + + Returns: + str: the assembled URL with values filled in. Example: https://api.github.com/users/username/ + """ + + return f"{self._base_url()}users/{username}" + ("/" if trailing_slash else "") + + def user_endpoint_urls(self, username:str) -> dict: + """the github REST API urls beneath the users endpoint, in dict form. + Intended to enable the recreation of a subset of what is returned by the github API + + Args: + username (str): the github username to query + + Returns: + dict: a dict of various user sub urls like would be returned by github's API. + """ + user_url = self.user_endpoint_url(username, trailing_slash=False) + return { + "url": user_url, + "html_url": f"https://github.com/{username}", + "followers_url": f"{user_url}/followers", + "following_url": user_url + "/following{/other_user}", + "gists_url": user_url + "/gists{/gist_id}", + "starred_url": user_url + "/starred{/owner}{/repo}", + "subscriptions_url": f"{user_url}/subscriptions", + "organizations_url": f"{user_url}/orgs", + "repos_url": f"{user_url}/repos", + "events_url": user_url + "/events{/privacy}", + "received_events_url": f"{user_url}/received_events", + } + + + def search_endpoint(self, topic: str, query: str) -> str: + """construct a github API call to perform a search + + Args: + topic (str): the topic to search. Valid options are: users, code, commits, issues, labels, repositories, topics. 
+ query (str): the query string to search as you'd type it into GitHub's search bar. Example: "email@example.com in:email type:user" + + Raises: + ValueError: if an invalid topic is provided + + Returns: + str: a URL that can be queried to perform the search + """ + topic = topic.lower() + if topic not in ["users", "code", "commits", "issues", "labels", "repositories", "topics" ]: + raise ValueError(f"Invalid topic '{topic}' provided for searching github.") + + return f"{self._base_url()}search/{topic}?q={urllib.parse.quote(query)}" + def __init__(self, key_manager, logger: logging.Logger, feature="rest"): @@ -44,6 +153,16 @@ def __init__(self, key_manager, logger: logging.Logger, feature="rest"): self.key = None self.expired_keys_for_request = [] + + def get_user(self, username:str): + url = self.user_endpoint_url(username, trailing_slash=False) # request /users/{username} with no trailing slash + + return self.get_resource(url) + + def perform_search(self, topic: str, query: str): + url = self.search_endpoint(topic, query) + return self.get_resource(url) + def get_resource_count(self, url): # set per_page to 100 explicitly so we know each page is 100 long