Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 23 additions & 0 deletions news/gh-extractor-command.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
**Added:**

* Add command line utility to Github Repos metadata extractor to build `releaselist`

**Changed:**

* <news item>

**Deprecated:**

* <news item>

**Removed:**

* <news item>

**Fixed:**

* <news item>

**Security:**

* <news item>
185 changes: 170 additions & 15 deletions src/regolith/GHextractor.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
#!/usr/bin/env python3
import base64
import os
import re
import tomllib
from datetime import datetime # noqa
Expand All @@ -13,6 +12,17 @@

class GitHubRepoExtractor:
def __init__(self, owner: str, repo: str, token: Optional[str] = None):
"""Constructor for the GitHubRepoExtractor.

Parameters
----------
owner: str
The name of the owner of the package.
repo: str
The name of the repository.
token: str
The private token for GitHub.
"""
self.owner = owner
self.repo = repo
self.session = requests.Session()
Expand All @@ -29,15 +39,45 @@ def _get(self, path: str) -> Any:
return r.json()

def get_repo_metadata(self) -> Dict[str, Any]:
"""Get the metadata of the repository.

Returns
-------
The dictionary of the metadata for the repository.
"""
return self._get(f"/repos/{self.owner}/{self.repo}")

def get_contributors(self) -> List[Dict[str, Any]]:
"""Get the contributors of the repository.

Returns
-------
The list of names of the contributors for the repository.
"""
return self._get(f"/repos/{self.owner}/{self.repo}/contributors")

def get_releases(self) -> List[Dict[str, Any]]:
"""Get the summaries of each release of the repository.

Returns
-------
The dictionary of releases for the repository.
"""

return self._get(f"/repos/{self.owner}/{self.repo}/releases")

def get_file(self, path: str) -> Optional[str]:
"""Get the corresponding file based on path given.

Parameters
----------
path: str
The absolute/relative path of the file.

Returns
-------
The decoded file based on the path.
"""
try:
data = self._get(f"/repos/{self.owner}/{self.repo}/contents/{path}")
content = base64.b64decode(data["content"])
Expand All @@ -50,6 +90,17 @@ def get_file(self, path: str) -> Optional[str]:
VERSION_RE = re.compile(r"^(\d+)\.(\d+)\.(\d+)(?:-rc\.(\d+))?$")

def parse_version(self, tag: str) -> Optional[Dict[str, Any]]:
"""Parse the version of the repository with a given tag.

Parameters
----------
tag: str
The tag/version of the package. The default format is <*.*.*>

Returns
-------
The parsed version of the tag
"""
match = self.VERSION_RE.match(tag)
if not match:
return None
Expand All @@ -69,6 +120,16 @@ def parse_version(self, tag: str) -> Optional[Dict[str, Any]]:
}

def parse_release(self, release: Dict[str, Any]) -> Optional[Dict[str, Any]]:
"""Parse the releases of the repository.

Parameters
----------
release: The dictionary of all releases for the given repository.

Returns
-------
The parsed dictionary of each release for the given repository.
"""
version = self.parse_version(release["tag_name"])
if not version:
return None
Expand All @@ -85,10 +146,22 @@ def parse_release(self, release: Dict[str, Any]) -> Optional[Dict[str, Any]]:
}

def extract_authors(self) -> List[str]:
"""Extract the author of the repository.

Returns
-------
The list of names who are (co)authors of the repository.
"""
contributors = self.get_contributors()
return [contributor["login"] for contributor in contributors]

def extract_releases(self) -> List[Dict[str, Any]]:
"""Extract releases history of the repository.

Returns
-------
The parsed releases of the repository.
"""
releases = self.get_releases()
parsed = []
for release in releases:
Expand All @@ -98,6 +171,12 @@ def extract_releases(self) -> List[Dict[str, Any]]:
return parsed

def extract(self) -> Dict[str, Any]:
"""Wrapper of extractor for all metadata of a given repository.

Returns
-------
The dictionary of metadata of the repository.e
"""
repo = self.get_repo_metadata()
pyproject = self.get_file("pyproject.toml")

Expand All @@ -111,29 +190,105 @@ def extract(self) -> Dict[str, Any]:
"program_description": (
tomllib.loads(pyproject)["project"]["description"] if pyproject else repo.get("description")
),
"grants": "all",
"release": self.extract_releases(),
}

return data

def get_owner_type(self) -> str:
"""Detect whether the owner is a user or an organization.

Returns
-------
str
"org" or "user"
"""
data = self._get(f"/users/{self.owner}")
return "org" if data.get("type") == "Organization" else "user"

def get_active_repositories_for_owner(self) -> List[Dict[str, Any]]:
"""Get all active repositories for the owner.

Returns
-------
List of repository dictionaries.
"""
owner_type = self.get_owner_type()
page = 1
repos: List[Dict[str, Any]] = []

while True:
if owner_type == "org":
path = f"/orgs/{self.owner}/repos"
else:
path = f"/users/{self.owner}/repos"

response = self._get(f"{path}?per_page=100&page={page}")
if not response:
break
for repo in response:
if not repo.get("archived") and not repo.get("disabled"):
repos.append(repo)
page += 1
return repos

def extract_all_active_repositories(self) -> List[Dict[str, Any]]:
"""Extract metadata for all active repositories under the owner.

Returns
-------
List of extracted repository metadata dictionaries.
"""
repos = self.get_active_repositories_for_owner()
results = []

def main():
import argparse
import json
for repo in repos:
repo_name = repo["name"]
extractor = GitHubRepoExtractor(self.owner, repo_name)
extractor.session = self.session
try:
results.append(extractor.extract())
except Exception as exc:
print(f"Skipping {self.owner}/{repo_name}: {exc}")
return results

parser = argparse.ArgumentParser(description="Extract GitHub repository metadata")
parser.add_argument("repo", help="Repository in form owner/name")
parser.add_argument("--token", help="GitHub token (or set GITHUB_TOKEN)")
args = parser.parse_args()

owner, name = args.repo.split("/", 1)
token = args.token or os.getenv("GITHUB_TOKEN")
def extract_github(
owner: str,
repo: Optional[str] = None,
*,
all_repos: bool = False,
token: Optional[str] = None,
) -> List[Dict[str, Any]]:
"""Programmatic entry point for Regolith."""
if all_repos:
extractor = GitHubRepoExtractor(owner, "", token)
return extractor.extract_all_active_repositories()
else:
if not repo:
raise ValueError("repo must be provided unless --all is set")
extractor = GitHubRepoExtractor(owner, repo, token)
return [extractor.extract()]

extractor = GitHubRepoExtractor(owner, name, token)
data = extractor.extract()

print(json.dumps(data, indent=4))
def to_software_yaml(data):
"""Convert a list of software records into a YAML-ready dictionary
keyed by software ID.

Parameters
----------
data: dict
The list of dicts of software metadata.

if __name__ == "__main__":
main()
Returns
-------
The dicttionary for yaml file of all software.
"""
yaml_data = {}
for entry in data:
full_id = entry["_id"]
software_id = full_id.split(".", 1)[1]
content = {k: v for k, v in entry.items() if k != "_id"}
yaml_data[software_id] = content
return yaml_data
27 changes: 27 additions & 0 deletions src/regolith/commands.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,10 +7,13 @@
from copy import copy
from pprint import pprint

from ruamel.yaml import YAML

from regolith import storage
from regolith.builder import BUILDERS, builder
from regolith.deploy import deploy as dploy
from regolith.emailer import emailer
from regolith.GHextractor import extract_github, to_software_yaml
from regolith.helper import FAST_UPDATER_WHITELIST, HELPERS, UPDATER_HELPERS, helpr
from regolith.runcontrol import RunControl
from regolith.tools import string_types
Expand Down Expand Up @@ -258,12 +261,36 @@ def validate(rc):
# sys.exit(f"Validation failed on some records")


def ghextractor(rc):
"""Extract GitHub repository metadata and write software YAML."""
owner = rc.owner
repo = getattr(rc, "repo", None)
all_repos = getattr(rc, "all", False)
token = getattr(rc, "token", None) or os.getenv("GITHUB_TOKEN")
data = extract_github(
owner,
repo=repo,
all_repos=all_repos,
token=token,
)
yaml_dict = to_software_yaml(data)
output = getattr(rc, "output", "software.yml")
yaml = YAML()
yaml.default_flow_style = False
yaml.indent(mapping=2, sequence=4, offset=2)
yaml.allow_unicode = True
with open(output, "w", encoding="utf-8") as f:
yaml.dump(yaml_dict, f)
print(f"Wrote {output}")


DISCONNECTED_COMMANDS = {
"rc": lambda rc: print(rc._pformat()),
"deploy": deploy,
"store": storage.main,
"json-to-yaml": json_to_yaml,
"yaml-to-json": yaml_to_json,
"gh-extractor": ghextractor,
}

CONNECTED_COMMANDS = {
Expand Down
10 changes: 10 additions & 0 deletions src/regolith/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -254,6 +254,16 @@ def create_parser():
default=None,
)

# GitHub extractor subparser
ghe = subp.add_parser(
"gh-extractor",
help="Extract GitHub repository metadata and write software YAML",
)
ghe.add_argument("owner", help="GitHub owner or organization")
ghe.add_argument("--repo", help="Single repository name")
ghe.add_argument("--all", action="store_true", help="Extract all active repositories")
ghe.add_argument("--token", help="GitHub token (or set GITHUB_TOKEN)")

# Validator
val = subp.add_parser("validate", help="Validates db")
val.add_argument(
Expand Down
Loading