diff --git a/news/gh-extractor-command.rst b/news/gh-extractor-command.rst new file mode 100644 index 000000000..c71839a4b --- /dev/null +++ b/news/gh-extractor-command.rst @@ -0,0 +1,23 @@ +**Added:** + +* Add command line utility to GitHub Repos metadata extractor to build `releaselist` + +**Changed:** + +* + +**Deprecated:** + +* + +**Removed:** + +* + +**Fixed:** + +* + +**Security:** + +* diff --git a/src/regolith/GHextractor.py b/src/regolith/GHextractor.py index e9f3d2076..30bc0dd58 100644 --- a/src/regolith/GHextractor.py +++ b/src/regolith/GHextractor.py @@ -1,6 +1,5 @@ #!/usr/bin/env python3 import base64 -import os import re import tomllib from datetime import datetime # noqa @@ -13,6 +12,17 @@ class GitHubRepoExtractor: def __init__(self, owner: str, repo: str, token: Optional[str] = None): + """Constructor for the GitHubRepoExtractor. + + Parameters + ---------- + owner: str + The name of the owner of the package. + repo: str + The name of the repository. + token: str, optional + The private token for GitHub. + """ self.owner = owner self.repo = repo self.session = requests.Session() @@ -29,15 +39,45 @@ def _get(self, path: str) -> Any: return r.json() def get_repo_metadata(self) -> Dict[str, Any]: + """Get the metadata of the repository. + + Returns + ------- + The dictionary of the metadata for the repository. + """ return self._get(f"/repos/{self.owner}/{self.repo}") def get_contributors(self) -> List[Dict[str, Any]]: + """Get the contributors of the repository. + + Returns + ------- + The list of contributor records for the repository. + """ return self._get(f"/repos/{self.owner}/{self.repo}/contributors") def get_releases(self) -> List[Dict[str, Any]]: + """Get the summaries of each release of the repository. + + Returns + ------- + The list of releases for the repository. + """ + return self._get(f"/repos/{self.owner}/{self.repo}/releases") def get_file(self, path: str) -> Optional[str]: + """Get the corresponding file based on path given. 
+ + Parameters + ---------- + path: str + The path of the file within the repository. + + Returns + ------- + The decoded file based on the path. + """ try: data = self._get(f"/repos/{self.owner}/{self.repo}/contents/{path}") content = base64.b64decode(data["content"]) @@ -50,6 +90,17 @@ def get_file(self, path: str) -> Optional[str]: VERSION_RE = re.compile(r"^(\d+)\.(\d+)\.(\d+)(?:-rc\.(\d+))?$") def parse_version(self, tag: str) -> Optional[Dict[str, Any]]: + """Parse the version of the repository with a given tag. + + Parameters + ---------- + tag: str + The tag/version of the package. The default format is <*.*.*> + + Returns + ------- + The parsed version of the tag, or None if the tag does not match the expected format. + """ match = self.VERSION_RE.match(tag) if not match: return None @@ -69,6 +120,16 @@ def parse_version(self, tag: str) -> Optional[Dict[str, Any]]: } def parse_release(self, release: Dict[str, Any]) -> Optional[Dict[str, Any]]: + """Parse a single release of the repository. + + Parameters + ---------- + release: The dictionary of a single release of the given repository. + + Returns + ------- + The parsed dictionary of the release, or None if its tag cannot be parsed. + """ version = self.parse_version(release["tag_name"]) if not version: return None @@ -85,10 +146,22 @@ def parse_release(self, release: Dict[str, Any]) -> Optional[Dict[str, Any]]: } def extract_authors(self) -> List[str]: + """Extract the authors of the repository. + + Returns + ------- + The list of login names of the contributors of the repository. + """ contributors = self.get_contributors() return [contributor["login"] for contributor in contributors] def extract_releases(self) -> List[Dict[str, Any]]: + """Extract the release history of the repository. + + Returns + ------- + The parsed releases of the repository. 
+ """ releases = self.get_releases() parsed = [] for release in releases: @@ -98,6 +171,12 @@ def extract_releases(self) -> List[Dict[str, Any]]: return parsed def extract(self) -> Dict[str, Any]: + """Wrapper of extractor for all metadata of a given repository. + + Returns + ------- + The dictionary of metadata of the repository. + """ repo = self.get_repo_metadata() pyproject = self.get_file("pyproject.toml") @@ -111,29 +190,105 @@ def extract(self) -> Dict[str, Any]: "program_description": ( tomllib.loads(pyproject)["project"]["description"] if pyproject else repo.get("description") ), + "grants": "all", "release": self.extract_releases(), } return data + def get_owner_type(self) -> str: + """Detect whether the owner is a user or an organization. + + Returns + ------- + str + "org" or "user" + """ + data = self._get(f"/users/{self.owner}") + return "org" if data.get("type") == "Organization" else "user" + + def get_active_repositories_for_owner(self) -> List[Dict[str, Any]]: + """Get all active (neither archived nor disabled) repositories for the owner. + + Returns + ------- + List of repository dictionaries. + """ + owner_type = self.get_owner_type() + page = 1 + repos: List[Dict[str, Any]] = [] + + while True: + if owner_type == "org": + path = f"/orgs/{self.owner}/repos" + else: + path = f"/users/{self.owner}/repos" + + response = self._get(f"{path}?per_page=100&page={page}") + if not response: + break + for repo in response: + if not repo.get("archived") and not repo.get("disabled"): + repos.append(repo) + page += 1 + return repos + + def extract_all_active_repositories(self) -> List[Dict[str, Any]]: + """Extract metadata for all active repositories under the owner. + + Returns + ------- + List of extracted repository metadata dictionaries. 
+ """ + repos = self.get_active_repositories_for_owner() + results = [] -def main(): - import argparse - import json + for repo in repos: + repo_name = repo["name"] + extractor = GitHubRepoExtractor(self.owner, repo_name) + extractor.session = self.session + try: + results.append(extractor.extract()) + except Exception as exc: + print(f"Skipping {self.owner}/{repo_name}: {exc}") + return results - parser = argparse.ArgumentParser(description="Extract GitHub repository metadata") - parser.add_argument("repo", help="Repository in form owner/name") - parser.add_argument("--token", help="GitHub token (or set GITHUB_TOKEN)") - args = parser.parse_args() - owner, name = args.repo.split("/", 1) - token = args.token or os.getenv("GITHUB_TOKEN") +def extract_github( + owner: str, + repo: Optional[str] = None, + *, + all_repos: bool = False, + token: Optional[str] = None, +) -> List[Dict[str, Any]]: + """Programmatic entry point for Regolith.""" + if all_repos: + extractor = GitHubRepoExtractor(owner, "", token) + return extractor.extract_all_active_repositories() + else: + if not repo: + raise ValueError("repo must be provided unless --all is set") + extractor = GitHubRepoExtractor(owner, repo, token) + return [extractor.extract()] - extractor = GitHubRepoExtractor(owner, name, token) - data = extractor.extract() - print(json.dumps(data, indent=4)) +def to_software_yaml(data): + """Convert a list of software records into a YAML-ready dictionary + keyed by software ID. + Parameters + ---------- + data: list of dict + The list of dicts of software metadata. -if __name__ == "__main__": - main() + Returns + ------- + The dictionary for the YAML file of all software. 
+ """ + yaml_data = {} + for entry in data: + full_id = entry["_id"] + software_id = full_id.split(".", 1)[1] + content = {k: v for k, v in entry.items() if k != "_id"} + yaml_data[software_id] = content + return yaml_data diff --git a/src/regolith/commands.py b/src/regolith/commands.py index 8669bf26e..aa2aa9988 100644 --- a/src/regolith/commands.py +++ b/src/regolith/commands.py @@ -7,10 +7,13 @@ from copy import copy from pprint import pprint +from ruamel.yaml import YAML + from regolith import storage from regolith.builder import BUILDERS, builder from regolith.deploy import deploy as dploy from regolith.emailer import emailer +from regolith.GHextractor import extract_github, to_software_yaml from regolith.helper import FAST_UPDATER_WHITELIST, HELPERS, UPDATER_HELPERS, helpr from regolith.runcontrol import RunControl from regolith.tools import string_types @@ -258,12 +261,36 @@ def validate(rc): # sys.exit(f"Validation failed on some records") +def ghextractor(rc): + """Extract GitHub repository metadata and write software YAML.""" + owner = rc.owner + repo = getattr(rc, "repo", None) + all_repos = getattr(rc, "all", False) + token = getattr(rc, "token", None) or os.getenv("GITHUB_TOKEN") + data = extract_github( + owner, + repo=repo, + all_repos=all_repos, + token=token, + ) + yaml_dict = to_software_yaml(data) + output = getattr(rc, "output", "software.yml") + yaml = YAML() + yaml.default_flow_style = False + yaml.indent(mapping=2, sequence=4, offset=2) + yaml.allow_unicode = True + with open(output, "w", encoding="utf-8") as f: + yaml.dump(yaml_dict, f) + print(f"Wrote {output}") + + DISCONNECTED_COMMANDS = { "rc": lambda rc: print(rc._pformat()), "deploy": deploy, "store": storage.main, "json-to-yaml": json_to_yaml, "yaml-to-json": yaml_to_json, + "gh-extractor": ghextractor, } CONNECTED_COMMANDS = { diff --git a/src/regolith/main.py b/src/regolith/main.py index d8a6e0120..de98c4811 100644 --- a/src/regolith/main.py +++ b/src/regolith/main.py @@ -254,6 
+254,16 @@ def create_parser(): default=None, ) + # GitHub extractor subparser + ghe = subp.add_parser( + "gh-extractor", + help="Extract GitHub repository metadata and write software YAML", + ) + ghe.add_argument("owner", help="GitHub owner or organization") + ghe.add_argument("--repo", help="Single repository name") + ghe.add_argument("--all", action="store_true", help="Extract all active repositories") + ghe.add_argument("--token", help="GitHub token (or set GITHUB_TOKEN)") + # Validator val = subp.add_parser("validate", help="Validates db") val.add_argument(