From 564b0160c77015c46cb3c3ebe1b3e4038a1b6bfc Mon Sep 17 00:00:00 2001 From: herr kaste Date: Mon, 23 Mar 2026 14:41:21 +0100 Subject: [PATCH 1/5] Add artifacts metadata for the status page Enhance enrich_logs.py to attach artifact metadata to log entries, including id, name, size, and a stable GitHub web URL derived from the resolved repository and run id. Introduce fetch_logs_metadata.py, which orchestrates the GitHub API calls using `gh` for convenience. It also absorbs the ad-hoc `gh api` usage previously inlined in publish.yml. Introduce refresh_logs.py as a one-shot local helper. --- .github/workflows/publish.yml | 16 +- README.md | 27 +++ scripts/enrich_logs.py | 104 +++++++- scripts/fetch_logs_metadata.py | 378 ++++++++++++++++++++++++++++++ scripts/refresh_logs.py | 205 ++++++++++++++++ tests/test_enrich_logs.py | 75 ++++++ tests/test_fetch_logs_metadata.py | 225 ++++++++++++++++++ 7 files changed, 1006 insertions(+), 24 deletions(-) create mode 100644 scripts/fetch_logs_metadata.py create mode 100644 scripts/refresh_logs.py create mode 100644 tests/test_fetch_logs_metadata.py diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml index 385728918..b09558a9d 100644 --- a/.github/workflows/publish.yml +++ b/.github/workflows/publish.yml @@ -276,18 +276,11 @@ jobs: restore-keys: | wrk-cache- - - name: Fetch recent workflow runs + - name: Fetch workflow metadata for logs enrichment run: | - SINCE=$(date -u -d '24 hours ago' +%Y-%m-%dT%H:%M:%SZ) - gh api \ - repos/${{ github.repository }}/actions/workflows/${{ env.WORKFLOW_ID }}/runs \ - --method GET \ - -f per_page=100 \ - -f status=completed \ - -f created=">=$SINCE" \ - --paginate \ - --jq '[.workflow_runs[] | {id, conclusion, run_started_at}]' \ - > "$RUNNER_TEMP/workflow_runs.json" + uv run -m scripts.fetch_logs_metadata \ + --runs-output "$RUNNER_TEMP/workflow_runs.json" \ + --artifacts-output "$RUNNER_TEMP/workflow_artifacts.json" - name: Enrich logs.json run: | @@ -299,6 +292,7 @@ jobs: uv run -m scripts.enrich_logs \ --runs "$RUNNER_TEMP/workflow_runs.json" \ + --artifacts "$RUNNER_TEMP/workflow_artifacts.json" \ -i ./wrk/logs.json \ -o ./wrk/logs.json diff --git a/README.md b/README.md index ea9086299..bb6643228 100644 --- a/README.md +++ b/README.md @@ -174,6 +174,33 @@ The command above reuses the same layout as [CI](https://github.com/packagecontr --- +### `refresh_logs.py` (plus lower-level helpers) + +Use this when you want to reproduce/update `logs.json` locally with GitHub Actions metadata. +`gh` is required for the ad-hoc API queries these scripts make. + +```bash +# one-shot: download logs (if missing), fetch metadata, enrich logs +uv run -m scripts.refresh_logs --pretty +``` + +Defaults are tuned for local use: +- repo: inferred from `GITHUB_REPOSITORY` or local `git origin` +- workflow id: inferred from `crawl.yml` if not set +- since window: `--since-hours 24` +- metadata files: `./workflow_runs.json`, `./workflow_artifacts.json` +- artifact scan cap: `--artifacts-max-pages 10` +- logs path: `./logs.json` + +If you want explicit control, run the two low-level commands; that's what we do in publish.yml: + +```bash +uv run -m scripts.fetch_logs_metadata --pretty +uv run -m scripts.enrich_logs -i ./logs.json -o ./logs.json --pretty +``` + +--- + ### `snapshot_test.py` Creates a compact, single-file snapshot for regression testing (`registry + channel`) from a reduced package set.
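The result of the enrichment is easiest to see by example. Below is a minimal sketch of a `logs.json` entry after enrich_logs has attached run and artifact metadata; the field names and the URL pattern follow the scripts, while the repository, ids, sizes, and dates are made-up illustrative values.

```python
# Illustrative sketch only: field names and the artifact URL pattern mirror
# scripts/enrich_logs.py and scripts/fetch_logs_metadata.py; every value here
# (repository, run id, artifact id, size, dates) is an assumption for the example.
repo = "packagecontrol/thecrawl"
run_id = "12345678901"

enriched_entry = {
    "date": "2026-03-23T08:22:00Z",   # run_started_at of the workflow run
    "run_id": run_id,
    "conclusion": "success",
    "artifacts": [                    # sorted by name (case-insensitive), then id
        {
            "id": 101,
            "name": "crawl-backup",
            "size": 1234,             # size_in_bytes from the GitHub API
            "url": f"https://github.com/{repo}/actions/runs/{run_id}/artifacts/101",
        },
    ],
}
```

Entries that already exist keep their other fields (for example `notes`); enrichment only fills a missing `conclusion`, attaches the `artifacts` list, and appends entries for runs not yet present.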
diff --git a/scripts/enrich_logs.py b/scripts/enrich_logs.py index 9405b42a0..53d25e7b6 100644 --- a/scripts/enrich_logs.py +++ b/scripts/enrich_logs.py @@ -2,9 +2,10 @@ import argparse import json +from collections import defaultdict from dataclasses import dataclass from pathlib import Path -from typing import Any +from typing import Any, TypedDict from ._utils import write_json @@ -14,9 +15,28 @@ class Args: input: str output: str runs: str + artifacts: str | None pretty: bool +type RunId = str + + +class RuntimeArtifact(TypedDict): + run_id: RunId + id: int + name: str + size: int + url: str + + +class ArtifactMetadata(TypedDict): + id: int + name: str + size: int + url: str + + def main(): args = parse_args() update_logs(args) @@ -24,7 +44,10 @@ def main(): def parse_args() -> Args: parser = argparse.ArgumentParser( - description="Enrich logs.json with workflow run metadata and fill missing runs." + description=( + "Enrich logs.json with workflow run metadata, artifacts metadata, " + "and fill missing runs." + ) ) parser.add_argument( "-i", @@ -41,8 +64,19 @@ def parse_args() -> Args: parser.add_argument( "-r", "--runs", - required=True, - help="Path to workflow_runs.json fetched from the GitHub API.", + default="./workflow_runs.json", + help=( + "Path to workflow_runs.json fetched from the GitHub API " + "(default: ./workflow_runs.json)." + ), + ) + parser.add_argument( + "--artifacts", + default="./workflow_artifacts.json", + help=( + "Optional path to workflow_artifacts.json fetched from the GitHub API " + "(default: ./workflow_artifacts.json)." + ), ) parser.add_argument( "--pretty", @@ -54,6 +88,7 @@ def parse_args() -> Args: input=ns.input, output=ns.output, runs=ns.runs, + artifacts=ns.artifacts, pretty=ns.pretty, ) @@ -61,50 +96,93 @@ def parse_args() -> Args: def update_logs(args: Args): entries = load_json(Path(args.input)) or [] runs = load_json(Path(args.runs)) or [] + artifacts: list[RuntimeArtifact] = [] + if args.artifacts: + artifacts = load_json(Path(args.artifacts)) or [] + + if not isinstance(entries, list): + raise SystemExit(f"enrich_logs: input must be a JSON array: {args.input}") + if not isinstance(runs, list): + raise SystemExit(f"enrich_logs: runs must be a JSON array: {args.runs}") + if not isinstance(artifacts, list): + raise SystemExit(f"enrich_logs: artifacts must be a JSON array: {args.artifacts}") enriched = 0 created = 0 + artifacts_attached = 0 runs_index = { - str(run.get("id")): { + run_id: { "conclusion": run.get("conclusion"), "run_started_at": run.get("run_started_at"), } for run in runs - if "id" in run + if (run_id := str(run.get("id", ""))) } + artifacts_index = build_artifacts_index(artifacts) seen = set() for entry in entries: - run_id = str(entry.get("run_id", "")) - if not run_id: - continue + run_id = entry["run_id"] seen.add(run_id) + info = runs_index.get(run_id) - if not info: - continue - if not entry.get("conclusion") and info.get("conclusion"): + if info and not entry.get("conclusion") and info.get("conclusion"): entry["conclusion"] = info["conclusion"] enriched += 1 + run_artifacts = artifacts_index.get(run_id) + if run_artifacts and entry.get("artifacts") != run_artifacts: + entry["artifacts"] = run_artifacts + artifacts_attached += 1 + for run_id, info in runs_index.items(): if run_id in seen: continue + date = info.get("run_started_at") if not date: continue + new_entry = { "date": date, "run_id": run_id, } if conclusion := info.get("conclusion"): new_entry["conclusion"] = conclusion + if run_artifacts := 
artifacts_index.get(run_id): + new_entry["artifacts"] = run_artifacts + artifacts_attached += 1 + entries.append(new_entry) created += 1 entries.sort(key=lambda entry: entry.get("date", ""), reverse=True) write_json(args.output, entries, pretty=args.pretty, ensure_ascii=True) - print(f"Enriched entries: {enriched}, added missing runs: {created}") + print( + "Enriched entries: " + f"{enriched}, " + f"added missing runs: {created}, " + f"attached artifacts on entries: {artifacts_attached}" + ) + + +def build_artifacts_index(artifacts: list[RuntimeArtifact]) -> dict[RunId, list[ArtifactMetadata]]: + artifacts_by_run: defaultdict[RunId, list[ArtifactMetadata]] = defaultdict(list) + + for artifact in artifacts: + run_id = artifact["run_id"] + artifacts_by_run[run_id].append({ + "id": artifact["id"], + "name": artifact["name"], + "size": artifact["size"], + "url": artifact["url"], + }) + + for run_artifacts in artifacts_by_run.values(): + run_artifacts.sort(key=lambda item: (item["name"].casefold(), str(item["id"]))) + + return dict(artifacts_by_run) def load_json(path: Path) -> Any: diff --git a/scripts/fetch_logs_metadata.py b/scripts/fetch_logs_metadata.py new file mode 100644 index 000000000..e28d7697f --- /dev/null +++ b/scripts/fetch_logs_metadata.py @@ -0,0 +1,378 @@ +from __future__ import annotations + +import argparse +import json +import os +import re +import subprocess +import sys +from dataclasses import dataclass +from datetime import datetime, timedelta, timezone +from typing import Any, Iterator + +from ._utils import write_json + + +DEFAULT_RUNS_OUTPUT = "./workflow_runs.json" +DEFAULT_ARTIFACTS_OUTPUT = "./workflow_artifacts.json" +DEFAULT_WORKFLOW_FILE = "crawl.yml" + + +@dataclass +class Args: + repo: str + workflow_id: str + runs_output: str + artifacts_output: str + since: str + artifacts_max_pages: int + pretty: bool + + +def main(): + args = parse_args() + fetch_logs_metadata(args) + + +def parse_args() -> Args: + parser = argparse.ArgumentParser( + description=( + "Fetch workflow runs and artifacts metadata via gh api for logs enrichment." + ) + ) + parser.add_argument( + "--repo", + default=None, + help=( + "GitHub repository in owner/name format. Defaults to GITHUB_REPOSITORY " + "or the local git origin remote." + ), + ) + parser.add_argument( + "--workflow-id", + default=None, + help=( + "Workflow ID to query runs for. Defaults to WORKFLOW_ID. " + "If missing, it is resolved from --workflow-file." + ), + ) + parser.add_argument( + "--workflow-file", + default=DEFAULT_WORKFLOW_FILE, + help=( + "Workflow filename used to resolve --workflow-id when it is not set " + f"(default: {DEFAULT_WORKFLOW_FILE})." + ), + ) + parser.add_argument( + "--since", + default=None, + help=( + "ISO8601 timestamp for the lower bound of runs to fetch, for example " + "2026-03-23T00:00:00Z." + ), + ) + parser.add_argument( + "--since-hours", + type=int, + default=24, + help="Fetch runs started within the last N hours when --since is not set (default: 24).", + ) + parser.add_argument( + "--runs-output", + default=DEFAULT_RUNS_OUTPUT, + help=f"Output path for workflow runs JSON (default: {DEFAULT_RUNS_OUTPUT}).", + ) + parser.add_argument( + "--artifacts-output", + default=DEFAULT_ARTIFACTS_OUTPUT, + help=( + "Output path for workflow artifacts JSON " + f"(default: {DEFAULT_ARTIFACTS_OUTPUT})." + ), + ) + parser.add_argument( + "--artifacts-max-pages", + type=int, + default=10, + help=( + "Maximum number of repository artifacts pages to scan " + "(default: 10)." 
+ ), + ) + parser.add_argument( + "--pretty", + action="store_true", + help="Pretty-print JSON output (indent=2).", + ) + ns = parser.parse_args() + + if ns.artifacts_max_pages < 1: + raise SystemExit("fetch_logs_metadata: --artifacts-max-pages must be >= 1") + + repo = resolve_repo(ns.repo) + workflow_id = resolve_workflow_id(ns.workflow_id, repo, ns.workflow_file) + since = ns.since or iso_utc_hours_ago(ns.since_hours) + + return Args( + repo=repo, + workflow_id=workflow_id, + runs_output=ns.runs_output, + artifacts_output=ns.artifacts_output, + since=since, + artifacts_max_pages=ns.artifacts_max_pages, + pretty=ns.pretty, + ) + + +def fetch_logs_metadata(args: Args): + runs = fetch_runs(args.repo, args.workflow_id, args.since) + run_ids = { + str(run.get("id")) + for run in runs + if isinstance(run, dict) and run.get("id") is not None + } + + artifacts = fetch_artifacts( + args.repo, + run_ids, + max_pages=args.artifacts_max_pages, + ) + write_json(args.runs_output, runs, pretty=args.pretty, ensure_ascii=True) + write_json(args.artifacts_output, artifacts, pretty=args.pretty, ensure_ascii=True) + + print( + "Fetched metadata: " + f"repo={args.repo}, " + f"workflow_id={args.workflow_id}, " + f"runs={len(runs)}, " + f"artifacts_kept={len(artifacts)}, " + f"since={args.since}" + ) + + +def fetch_runs(repo: str, workflow_id: str, since: str) -> list[dict[str, Any]]: + runs: list[dict[str, Any]] = [] + + for run in run_gh_paginated( + "workflow_runs", + [ + f"repos/{repo}/actions/workflows/{workflow_id}/runs", + "--method", + "GET", + "-f", + "status=completed", + "-f", + f"created=>={since}", + ], + max_pages=None, + ): + if not isinstance(run, dict): + continue + run_id = run.get("id") + if run_id is None: + continue + + runs.append({ + "id": run_id, + "conclusion": run.get("conclusion"), + "run_started_at": run.get("run_started_at"), + }) + + return runs + + +def fetch_artifacts( + repo: str, + run_ids: set[str], + *, + max_pages: int, +) -> list[dict[str, Any]]: + if not run_ids: + return [] + + remaining_run_ids = set(run_ids) + artifacts: list[dict[str, Any]] = [] + + for artifact in run_gh_paginated( + "artifacts", + [ + f"repos/{repo}/actions/artifacts", + "--method", + "GET", + ], + max_pages=max_pages, + ): + if not isinstance(artifact, dict): + continue + + workflow_run = artifact.get("workflow_run") + workflow_run_id = None + if isinstance(workflow_run, dict): + workflow_run_id = workflow_run.get("id") + + if workflow_run_id is None: + continue + + run_id = str(workflow_run_id) + if run_id in run_ids: + # Mark run_id as seen immediately. Otherwise malformed artifacts could prevent + # remaining_run_ids from emptying and keep pagination running unnecessarily. + remaining_run_ids.discard(run_id) + + artifact_id = artifact.get("id") + artifact_name = artifact.get("name") + artifact_size = artifact.get("size_in_bytes") + if artifact_id is None or artifact_name is None or artifact_size is None: + continue + + artifacts.append({ + "run_id": run_id, + "id": artifact_id, + "name": str(artifact_name), + "size": int(artifact_size), + "url": f"https://github.com/{repo}/actions/runs/{run_id}/artifacts/{artifact_id}", + }) + # We only know whether we're done when we see the *next* item. Keep scanning + # because multiple artifacts for an interesting run_id can appear consecutively. + continue + + if not remaining_run_ids: + # Assumes API items are ordered: once all interesting run_ids were seen and we + # hit a non-interesting run_id, there are no more relevant artifacts to collect. 
+ break + + artifacts.sort(key=lambda item: (item["run_id"], item["name"].casefold(), str(item["id"]))) + return artifacts + + +def resolve_repo(repo: str | None) -> str: + if repo: + return repo + + if env_repo := os.environ.get("GITHUB_REPOSITORY"): + return env_repo + + if origin_url := read_git_origin_url(): + if parsed_repo := parse_repo_from_remote(origin_url): + return parsed_repo + + raise SystemExit( + "fetch_logs_metadata: unable to resolve repo. " + "Set --repo, GITHUB_REPOSITORY, or configure git origin." + ) + + +def resolve_workflow_id(workflow_id: str | None, repo: str, workflow_file: str) -> str: + if workflow_id: + return str(workflow_id) + + if env_workflow_id := os.environ.get("WORKFLOW_ID"): + return env_workflow_id + + resolved = run_gh_json([ + f"repos/{repo}/actions/workflows/{workflow_file}", + "--method", + "GET", + "--jq", + ".id", + ]) + if resolved is None: + raise SystemExit( + "fetch_logs_metadata: failed to resolve workflow id " + f"for {repo}/{workflow_file}" + ) + + return str(resolved) + + +def iso_utc_hours_ago(hours: int) -> str: + timestamp = datetime.now(timezone.utc) - timedelta(hours=hours) + return timestamp.strftime("%Y-%m-%dT%H:%M:%SZ") + + +def read_git_origin_url() -> str | None: + process = subprocess.run( + ["git", "config", "--get", "remote.origin.url"], + capture_output=True, + text=True, + check=False, + ) + if process.returncode != 0: + return None + + origin = process.stdout.strip() + return origin or None + + +def parse_repo_from_remote(remote_url: str) -> str | None: + # https://github.com/owner/repo(.git) + if match := re.search(r"github\.com[:/]([^/]+)/([^/]+?)(?:\.git)?$", remote_url): + owner, repo = match.group(1), match.group(2) + return f"{owner}/{repo}" + return None + + +def run_gh_paginated( + key: str, + cmd: list[str], + *, + max_pages: int | None, + per_page: int = 100, +) -> Iterator[Any]: + page = 1 + while max_pages is None or page <= max_pages: + payload = run_gh_json([ + *cmd, + "-f", + f"per_page={per_page}", + "-f", + f"page={page}", + ]) + if not isinstance(payload, dict): + return + + page_items = payload.get(key) + if not isinstance(page_items, list) or not page_items: + return + + yield from page_items + + if len(page_items) < per_page: + return + + page += 1 + + +def run_gh_json(args: list[str]) -> Any: + cmd = ["gh", "api", *args] + command_str = " ".join(cmd) + print(f"fetch_logs_metadata: {command_str}", file=sys.stderr) + + process = subprocess.run(cmd, capture_output=True, text=True, check=False) + if process.returncode != 0: + stderr = process.stderr.strip() + raise SystemExit( + "fetch_logs_metadata: gh command failed\n" + f"command: {command_str}\n" + f"stderr: {stderr}" + ) + + output = process.stdout.strip() + if not output: + return [] + + try: + return json.loads(output) + except json.JSONDecodeError as exc: + excerpt = output[:260] + raise SystemExit( + "fetch_logs_metadata: invalid JSON from gh command\n" + f"command: {command_str}\n" + f"error: {exc}\n" + f"context: {excerpt}" + ) from exc + + +if __name__ == "__main__": + main() diff --git a/scripts/refresh_logs.py b/scripts/refresh_logs.py new file mode 100644 index 000000000..fe6963634 --- /dev/null +++ b/scripts/refresh_logs.py @@ -0,0 +1,205 @@ +from __future__ import annotations + +import argparse +import subprocess +from dataclasses import dataclass +from pathlib import Path + +from . 
import enrich_logs, fetch_logs_metadata + + +DEFAULT_LOGS_PATH = "./logs.json" +DEFAULT_RELEASE_TAG = "crawler-status" + + +@dataclass +class Args: + logs: str + repo: str | None + workflow_id: str | None + workflow_file: str + runs_output: str + artifacts_output: str + artifacts_max_pages: int + since: str | None + since_hours: int + release_tag: str + skip_download: bool + pretty: bool + + +def main(): + args = parse_args() + refresh_logs(args) + + +def parse_args() -> Args: + parser = argparse.ArgumentParser( + description=( + "Refresh logs.json locally by downloading current logs (optional), " + "fetching run/artifact metadata in batch, and enriching the file." + ) + ) + parser.add_argument( + "--logs", + default=DEFAULT_LOGS_PATH, + help=f"Path to logs.json to refresh (default: {DEFAULT_LOGS_PATH}).", + ) + parser.add_argument( + "--repo", + default=None, + help=( + "GitHub repository in owner/name format. Defaults to GITHUB_REPOSITORY " + "or local git origin." + ), + ) + parser.add_argument( + "--workflow-id", + default=None, + help="Workflow ID. If omitted, resolved from --workflow-file.", + ) + parser.add_argument( + "--workflow-file", + default=fetch_logs_metadata.DEFAULT_WORKFLOW_FILE, + help=( + "Workflow filename used when resolving workflow id " + f"(default: {fetch_logs_metadata.DEFAULT_WORKFLOW_FILE})." + ), + ) + parser.add_argument( + "--runs-output", + default=fetch_logs_metadata.DEFAULT_RUNS_OUTPUT, + help=( + "Output path for workflow runs JSON " + f"(default: {fetch_logs_metadata.DEFAULT_RUNS_OUTPUT})." + ), + ) + parser.add_argument( + "--artifacts-output", + default=fetch_logs_metadata.DEFAULT_ARTIFACTS_OUTPUT, + help=( + "Output path for workflow artifacts JSON " + f"(default: {fetch_logs_metadata.DEFAULT_ARTIFACTS_OUTPUT})." 
+ ), + ) + parser.add_argument( + "--artifacts-max-pages", + type=int, + default=10, + help="Maximum artifact pages to scan (default: 10).", + ) + parser.add_argument( + "--since", + default=None, + help="ISO8601 lower bound; overrides --since-hours.", + ) + parser.add_argument( + "--since-hours", + type=int, + default=24, + help="Lower bound in hours when --since is not provided (default: 24).", + ) + parser.add_argument( + "--release-tag", + default=DEFAULT_RELEASE_TAG, + help=f"Release tag used for logs download (default: {DEFAULT_RELEASE_TAG}).", + ) + parser.add_argument( + "--skip-download", + action="store_true", + help="Do not attempt to download logs.json from release if missing.", + ) + parser.add_argument( + "--pretty", + action="store_true", + help="Pretty-print generated JSON files.", + ) + ns = parser.parse_args() + if ns.artifacts_max_pages < 1: + raise SystemExit("refresh_logs: --artifacts-max-pages must be >= 1") + + return Args( + logs=ns.logs, + repo=ns.repo, + workflow_id=ns.workflow_id, + workflow_file=ns.workflow_file, + runs_output=ns.runs_output, + artifacts_output=ns.artifacts_output, + artifacts_max_pages=ns.artifacts_max_pages, + since=ns.since, + since_hours=ns.since_hours, + release_tag=ns.release_tag, + skip_download=ns.skip_download, + pretty=ns.pretty, + ) + + +def refresh_logs(args: Args): + repo = fetch_logs_metadata.resolve_repo(args.repo) + workflow_id = fetch_logs_metadata.resolve_workflow_id( + args.workflow_id, + repo, + args.workflow_file, + ) + + logs_path = Path(args.logs) + ensure_logs_file(logs_path, repo, args.release_tag, args.skip_download) + + fetch_args = fetch_logs_metadata.Args( + repo=repo, + workflow_id=workflow_id, + runs_output=args.runs_output, + artifacts_output=args.artifacts_output, + artifacts_max_pages=args.artifacts_max_pages, + since=args.since or fetch_logs_metadata.iso_utc_hours_ago(args.since_hours), + pretty=args.pretty, + ) + fetch_logs_metadata.fetch_logs_metadata(fetch_args) + + enrich_args = enrich_logs.Args( + input=str(logs_path), + output=str(logs_path), + runs=args.runs_output, + artifacts=args.artifacts_output, + pretty=args.pretty, + ) + enrich_logs.update_logs(enrich_args) + + print(f"Refreshed logs file: {logs_path}") + + +def ensure_logs_file(logs_path: Path, repo: str, release_tag: str, skip_download: bool): + logs_path.parent.mkdir(parents=True, exist_ok=True) + if logs_path.exists(): + return + + if skip_download: + raise SystemExit( + f"refresh_logs: logs file not found and --skip-download was set: {logs_path}" + ) + + cmd = [ + "gh", + "release", + "download", + release_tag, + "--repo", + repo, + "--pattern", + "logs.json", + "--output", + str(logs_path), + "--clobber", + ] + process = subprocess.run(cmd, capture_output=True, text=True, check=False) + if process.returncode != 0: + raise SystemExit( + "refresh_logs: failed to download logs.json from release\n" + f"repo={repo}\n" + f"tag={release_tag}\n" + f"stderr={process.stderr.strip()}" + ) + + +if __name__ == "__main__": + main() diff --git a/tests/test_enrich_logs.py b/tests/test_enrich_logs.py index 0b34fb932..379e5e236 100644 --- a/tests/test_enrich_logs.py +++ b/tests/test_enrich_logs.py @@ -28,6 +28,7 @@ def test_enrich_updates_existing_and_adds_missing(tmp_path): input=str(logs_path), output=str(logs_path), runs=str(runs_path), + artifacts=None, pretty=True, ) enrich_logs.update_logs(args) @@ -41,3 +42,77 @@ def test_enrich_updates_existing_and_adds_missing(tmp_path): second = next(entry for entry in data if entry["run_id"] == "2") assert 
second["conclusion"] == "failure" assert second["date"] == "2024-10-06T09:00:00Z" + + +def test_enrich_attaches_artifacts_by_run_id(tmp_path): + logs_path = tmp_path / "logs.json" + runs_path = tmp_path / "workflow_runs.json" + artifacts_path = tmp_path / "workflow_artifacts.json" + + logs_path.write_text(json.dumps([ + {"run_id": "1", "notes": "has notes", "date": "2024-10-05T10:00:00Z"}, + ]), encoding="utf-8") + + runs_path.write_text(json.dumps([ + { + "id": 1, + "conclusion": "success", + "run_started_at": "2024-10-05T10:00:00Z", + }, + { + "id": 2, + "conclusion": "failure", + "run_started_at": "2024-10-06T09:00:00Z", + }, + ]), encoding="utf-8") + + artifacts_path.write_text(json.dumps([ + { + "run_id": "1", + "id": 101, + "name": "crawl-backup", + "size": 1234, + "url": "https://github.com/owner/repo/actions/runs/1/artifacts/101", + }, + { + "run_id": "2", + "id": 202, + "name": "stats-backup", + "size": 4321, + "url": "https://github.com/owner/repo/actions/runs/2/artifacts/202", + }, + { + "run_id": "999", + "id": 303, + "name": "ignored", + "size": 999, + "url": "https://github.com/owner/repo/actions/runs/999/artifacts/303", + }, + ]), encoding="utf-8") + + args = enrich_logs.Args( + input=str(logs_path), + output=str(logs_path), + runs=str(runs_path), + artifacts=str(artifacts_path), + pretty=True, + ) + enrich_logs.update_logs(args) + + data = json.loads(logs_path.read_text(encoding="utf-8")) + + first = next(entry for entry in data if entry["run_id"] == "1") + assert first["artifacts"] == [{ + "id": 101, + "name": "crawl-backup", + "size": 1234, + "url": "https://github.com/owner/repo/actions/runs/1/artifacts/101", + }] + + second = next(entry for entry in data if entry["run_id"] == "2") + assert second["artifacts"] == [{ + "id": 202, + "name": "stats-backup", + "size": 4321, + "url": "https://github.com/owner/repo/actions/runs/2/artifacts/202", + }] diff --git a/tests/test_fetch_logs_metadata.py b/tests/test_fetch_logs_metadata.py new file mode 100644 index 000000000..64996e411 --- /dev/null +++ b/tests/test_fetch_logs_metadata.py @@ -0,0 +1,225 @@ +import scripts.fetch_logs_metadata as fetch_logs_metadata + + +def test_fetch_logs_metadata_writes_fetch_outputs(monkeypatch): + def fake_fetch_runs(_repo, _workflow_id, _since): + return [{"id": 2}, {"id": 1}] + + artifact_data = [ + { + "run_id": "1", + "id": 100, + "name": "crawl-backup", + "size": 200, + "url": "https://github.com/owner/repo/actions/runs/1/artifacts/100", + }, + { + "run_id": "2", + "id": 101, + "name": "stats-backup", + "size": 300, + "url": "https://github.com/owner/repo/actions/runs/2/artifacts/101", + }, + ] + + def fake_fetch_artifacts(_repo, run_ids, *, max_pages): + assert run_ids == {"1", "2"} + assert max_pages == 10 + return artifact_data + + writes = [] + + def fake_write_json(path, data, *, pretty, ensure_ascii): + writes.append((path, data, pretty, ensure_ascii)) + + monkeypatch.setattr(fetch_logs_metadata, "fetch_runs", fake_fetch_runs) + monkeypatch.setattr(fetch_logs_metadata, "fetch_artifacts", fake_fetch_artifacts) + monkeypatch.setattr(fetch_logs_metadata, "write_json", fake_write_json) + + args = fetch_logs_metadata.Args( + repo="owner/repo", + workflow_id="123", + runs_output="runs.json", + artifacts_output="artifacts.json", + since="2026-01-01T00:00:00Z", + artifacts_max_pages=10, + pretty=True, + ) + + fetch_logs_metadata.fetch_logs_metadata(args) + + assert writes[0] == ("runs.json", [{"id": 2}, {"id": 1}], True, True) + assert writes[1] == ("artifacts.json", artifact_data, True, True) 
+ + +def test_parse_repo_from_remote_supports_https_and_ssh(): + https_url = "https://github.com/packagecontrol/thecrawl.git" + ssh_url = "git@github.com:packagecontrol/thecrawl.git" + + assert fetch_logs_metadata.parse_repo_from_remote(https_url) == "packagecontrol/thecrawl" + assert fetch_logs_metadata.parse_repo_from_remote(ssh_url) == "packagecontrol/thecrawl" + + +def test_resolve_workflow_id_falls_back_to_workflow_filename(monkeypatch): + monkeypatch.delenv("WORKFLOW_ID", raising=False) + + called_args = [] + + def fake_run_gh_json(args): + called_args.append(args) + return 123456 + + monkeypatch.setattr(fetch_logs_metadata, "run_gh_json", fake_run_gh_json) + + resolved = fetch_logs_metadata.resolve_workflow_id(None, "owner/repo", "crawl.yml") + + assert resolved == "123456" + assert called_args == [[ + "repos/owner/repo/actions/workflows/crawl.yml", + "--method", + "GET", + "--jq", + ".id", + ]] + + +def test_run_gh_json_raises_on_invalid_json(monkeypatch): + class Process: + returncode = 0 + stdout = "{invalid-json}" + stderr = "" + + def fake_run(*_args, **_kwargs): + return Process() + + monkeypatch.setattr(fetch_logs_metadata.subprocess, "run", fake_run) + + try: + fetch_logs_metadata.run_gh_json(["repos/owner/repo/actions/runs"]) + assert False, "Expected SystemExit for invalid JSON" + except SystemExit as exc: + message = str(exc) + assert "invalid JSON" in message + assert "gh api repos/owner/repo/actions/runs" in message + + +def test_fetch_runs_pages_until_short_page(monkeypatch): + calls = [] + + def fake_run_gh_json(args): + calls.append(args) + page_arg = next(item for item in args if item.startswith("page=")) + page = int(page_arg.split("=", 1)[1]) + + if page == 1: + return { + "workflow_runs": [ + { + "id": run_id, + "conclusion": "success", + "run_started_at": f"2026-01-01T00:{run_id:02d}:00Z", + } + for run_id in range(1, 101) + ] + } + + return { + "workflow_runs": [ + { + "id": 101, + "conclusion": "failure", + "run_started_at": "2026-01-01T01:41:00Z", + }, + ] + } + + monkeypatch.setattr(fetch_logs_metadata, "run_gh_json", fake_run_gh_json) + + runs = fetch_logs_metadata.fetch_runs( + "owner/repo", + "123", + "2026-01-01T00:00:00Z", + ) + + assert len(runs) == 101 + assert [ + next(item for item in call if item.startswith("page=")) + for call in calls + ] == ["page=1", "page=2"] + assert runs[0] == { + "id": 1, + "conclusion": "success", + "run_started_at": "2026-01-01T00:01:00Z", + } + assert runs[-1] == { + "id": 101, + "conclusion": "failure", + "run_started_at": "2026-01-01T01:41:00Z", + } + + +def test_fetch_artifacts_keeps_collecting_matching_items_after_remaining_empty(monkeypatch): + def fake_run_gh_paginated(_key, _cmd, **_kwargs): + yield {"id": 1, "name": "a", "size_in_bytes": 11, "workflow_run": {"id": 42}} + yield {"id": 2, "name": "b", "size_in_bytes": 22, "workflow_run": {"id": 42}} + yield {"id": 3, "name": "c", "size_in_bytes": 33, "workflow_run": {"id": 999}} + yield {"id": 4, "name": "d", "size_in_bytes": 44, "workflow_run": {"id": 42}} + + monkeypatch.setattr(fetch_logs_metadata, "run_gh_paginated", fake_run_gh_paginated) + + artifacts = fetch_logs_metadata.fetch_artifacts( + "owner/repo", + {"42"}, + max_pages=10, + ) + + assert artifacts == [ + { + "run_id": "42", + "id": 1, + "name": "a", + "size": 11, + "url": "https://github.com/owner/repo/actions/runs/42/artifacts/1", + }, + { + "run_id": "42", + "id": 2, + "name": "b", + "size": 22, + "url": "https://github.com/owner/repo/actions/runs/42/artifacts/2", + }, + ] + + +def 
test_fetch_artifacts_stops_after_matching_run_ids(monkeypatch): + calls = [] + + def fake_run_gh_json(args): + calls.append(args) + return { + "artifacts": [ + { + "id": 1, + "name": "crawl-backup", + "size_in_bytes": 11, + "workflow_run": {"id": 42}, + }, + ] + } + + monkeypatch.setattr(fetch_logs_metadata, "run_gh_json", fake_run_gh_json) + + artifacts = fetch_logs_metadata.fetch_artifacts( + "owner/repo", + {"42"}, + max_pages=10, + ) + + assert len(calls) == 1 + assert artifacts == [{ + "run_id": "42", + "id": 1, + "name": "crawl-backup", + "size": 11, + "url": "https://github.com/owner/repo/actions/runs/42/artifacts/1", + }] From 4a12765f1826423cc04c8e6a4adbc8ac86f7b063 Mon Sep 17 00:00:00 2001 From: herr kaste Date: Mon, 23 Mar 2026 17:44:27 +0100 Subject: [PATCH 2/5] Simplify enrich_logs CLI input flow Make enrich_logs use a positional input argument instead of --input. The positional defaults to logs.json, and --output now defaults to the same path as input when omitted. --- .github/workflows/publish.yml | 3 +-- README.md | 2 +- scripts/enrich_logs.py | 10 +++++----- tests/test_enrich_logs.py | 19 +++++++++++++++++++ 4 files changed, 26 insertions(+), 8 deletions(-) diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml index b09558a9d..b1eb9ea78 100644 --- a/.github/workflows/publish.yml +++ b/.github/workflows/publish.yml @@ -293,8 +293,7 @@ jobs: uv run -m scripts.enrich_logs \ --runs "$RUNNER_TEMP/workflow_runs.json" \ --artifacts "$RUNNER_TEMP/workflow_artifacts.json" \ - -i ./wrk/logs.json \ - -o ./wrk/logs.json + ./wrk/logs.json retry_5() { local n=1 diff --git a/README.md b/README.md index bb6643228..2e44f3173 100644 --- a/README.md +++ b/README.md @@ -196,7 +196,7 @@ If you want explicit control, run the two low-level commands, that's what we do ```bash uv run -m scripts.fetch_logs_metadata --pretty -uv run -m scripts.enrich_logs -i ./logs.json -o ./logs.json --pretty +uv run -m scripts.enrich_logs --pretty ``` --- diff --git a/scripts/enrich_logs.py b/scripts/enrich_logs.py index 53d25e7b6..a80aad7b2 100644 --- a/scripts/enrich_logs.py +++ b/scripts/enrich_logs.py @@ -50,16 +50,16 @@ def parse_args() -> Args: ) ) parser.add_argument( - "-i", - "--input", + "input", + nargs="?", default="logs.json", help="Input logs file (default: logs.json).", ) parser.add_argument( "-o", "--output", - default="logs.json", - help="Output logs file (default: logs.json).", + default=None, + help="Output logs file (defaults to INPUT).", ) parser.add_argument( "-r", @@ -86,7 +86,7 @@ def parse_args() -> Args: ns = parser.parse_args() return Args( input=ns.input, - output=ns.output, + output=ns.output or ns.input, runs=ns.runs, artifacts=ns.artifacts, pretty=ns.pretty, diff --git a/tests/test_enrich_logs.py b/tests/test_enrich_logs.py index 379e5e236..6e53446ba 100644 --- a/tests/test_enrich_logs.py +++ b/tests/test_enrich_logs.py @@ -1,4 +1,5 @@ import json +import sys import scripts.enrich_logs as enrich_logs @@ -116,3 +117,21 @@ def test_enrich_attaches_artifacts_by_run_id(tmp_path): "size": 4321, "url": "https://github.com/owner/repo/actions/runs/2/artifacts/202", }] + + +def test_parse_args_defaults_output_to_input(monkeypatch): + monkeypatch.setattr( + sys, + "argv", + [ + "enrich_logs.py", + "./wrk/logs.json", + "--runs", + "./workflow_runs.json", + ], + ) + + args = enrich_logs.parse_args() + + assert args.input == "./wrk/logs.json" + assert args.output == "./wrk/logs.json" From fbb389ac03bdbff43034c853ebd00b25941d74b0 Mon Sep 17 00:00:00 2001 From: herr kaste Date: 
Tue, 24 Mar 2026 09:36:19 +0100 Subject: [PATCH 3/5] Run nightly Pages publish on tag pushes Add a tag push trigger to publish.yml and update the gh-pages-nightly job condition to allow both scheduled runs and tag pushes. Restrict channel build jobs to workflow_run and workflow_dispatch so tag pushes do not trigger channel rebuilds. --- .github/workflows/publish.yml | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml index b1eb9ea78..ba42fe342 100644 --- a/.github/workflows/publish.yml +++ b/.github/workflows/publish.yml @@ -10,6 +10,9 @@ on: types: - completed workflow_dispatch: + push: + tags: + - '*' schedule: - cron: '22 8 * * *' @@ -21,7 +24,7 @@ permissions: jobs: build-st4-channel: runs-on: ubuntu-latest - if: github.event_name != 'schedule' + if: github.event_name == 'workflow_run' || github.event_name == 'workflow_dispatch' outputs: update_needed: ${{ steps.check.outputs.update_needed }} env: @@ -133,7 +136,7 @@ jobs: build-st3-channel: runs-on: ubuntu-latest - if: github.event_name != 'schedule' + if: github.event_name == 'workflow_run' || github.event_name == 'workflow_dispatch' env: RELEASE_TAG: the-st3-channel GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} @@ -348,7 +351,7 @@ jobs: gh-pages-nightly: runs-on: ubuntu-latest needs: enrich_logs - if: github.event_name == 'schedule' + if: github.event_name == 'schedule' || (github.event_name == 'push' && startsWith(github.ref, 'refs/tags/')) environment: name: github-pages url: ${{ steps.deployment.outputs.page_url }} From 6ea07a75e9edca30677b5dcb0996f616235ecfb2 Mon Sep 17 00:00:00 2001 From: herr kaste Date: Tue, 24 Mar 2026 09:45:54 +0100 Subject: [PATCH 4/5] Group failing report by reason class Sort the "Currently failing" section in three groups: 1. reasons starting with a three-digit status code 2. all remaining reasons 3. reasons starting with "fatal: 404" Within each group, entries are still sorted by package name. Status code entries are additionally ordered numerically by code. 
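Concretely, the grouping boils down to a `(group, status_code, lowercased_name)` sort key. An illustrative mapping (the reasons below are examples, not taken from real packages):

```python
# Illustrative reasons only: sorting these (group, status_code, name) tuples
# yields the order described above.
examples = [
    ("403 Forbidden",        (0, 403, "alpha status")),  # leading 3-digit status code
    ("502 Bad Gateway",      (0, 502, "zeta status")),   # ordered numerically by code
    ("denied: blocked",      (1, 0,   "beta other")),    # all remaining reasons
    ("fatal: 404 Not Found", (2, 0,   "alpha 404")),     # "fatal: 404" reasons go last
]
```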
--- scripts/generate_channel.py | 16 ++++- .../generate_channel/test_generate_channel.py | 62 +++++++++++++++++++ 2 files changed, 77 insertions(+), 1 deletion(-) diff --git a/scripts/generate_channel.py b/scripts/generate_channel.py index 99a47352b..894c9e693 100644 --- a/scripts/generate_channel.py +++ b/scripts/generate_channel.py @@ -6,6 +6,7 @@ import json import sys import os +import re from typing import Generator, Literal, NotRequired, TypedDict from ._utils import flatten, pick, pl, write_json, parse_version @@ -218,11 +219,24 @@ def main(registry_path, workspace_path, channel_path, berlin: bool, pretty: bool failing_info = "\n".join( f"- **{pkg['name']}** [{failing_since(pkg, berlin)}]\n" f" {pkg['fail_reason'].strip().replace('\n', '\n ')}" - for pkg in sorted(failing, key=lambda p: p['name'].lower()) + for pkg in sorted(failing, key=failing_sort_key) ) print(f"\n#### Currently failing\n{failing_info}") +def failing_sort_key(pkg) -> tuple[int, int, str]: + reason = pkg["fail_reason"].strip() + name = pkg["name"].lower() + + if status_match := re.match(r"^(\d{3})\s", reason): + return (0, int(status_match.group(1)), name) + + if reason.startswith("fatal: 404"): + return (2, 0, name) + + return (1, 0, name) + + def normalize_package(pkg) -> Package | None: name = pkg.get("name") if not name: diff --git a/tests/generate_channel/test_generate_channel.py b/tests/generate_channel/test_generate_channel.py index b54fe56e7..b94fcb620 100644 --- a/tests/generate_channel/test_generate_channel.py +++ b/tests/generate_channel/test_generate_channel.py @@ -119,6 +119,68 @@ def test_generate_channel_filters_removed_and_dropped_libraries(tmp_path): ] +def test_generate_channel_sorts_failing_report_in_groups(tmp_path, capsys): + registry = {"repositories": []} + + def failing_pkg(name: str, fail_reason: str): + return { + "name": name, + "author": ["Ada"], + "last_modified": "2026-03-24T10:00:00Z", + "source": "https://repo.one", + "failing_since": "2026-03-24T10:00:00Z", + "fail_reason": fail_reason, + "releases": [ + { + "sublime_text": "4100", + "platforms": ["*"], + "version": "1.0.0", + "url": "https://repo.one/pkg.zip", + "date": "2026-03-24T10:00:00Z", + } + ], + } + + workspace = { + "packages": { + "status_502_z": failing_pkg("Zeta Status", "502 Bad Gateway"), + "status_403_a": failing_pkg("Alpha Status", "403 Forbidden"), + "status_502_b": failing_pkg("Beta Status", "502 Bad Gateway"), + "other_b": failing_pkg("Beta Other", "denied: blocked by rule"), + "other_a": failing_pkg("Alpha Other", "fatal: 500 Server Error"), + "fatal_404_b": failing_pkg("Beta 404", "fatal: 404 Not Found"), + "fatal_404_a": failing_pkg("Alpha 404", "fatal: 404 Not Found"), + }, + "libraries": {}, + } + + registry_path = tmp_path / "registry.json" + workspace_path = tmp_path / "workspace.json" + output_path = tmp_path / "channel.json" + + registry_path.write_text(json.dumps(registry), encoding="utf-8") + workspace_path.write_text(json.dumps(workspace), encoding="utf-8") + + main(str(registry_path), str(workspace_path), str(output_path), False, False) + + stdout = capsys.readouterr().out + names = [ + line.split("**")[1] + for line in stdout.splitlines() + if line.startswith("- **") + ] + + assert names == [ + "Alpha Status", + "Beta Status", + "Zeta Status", + "Alpha Other", + "Beta Other", + "Alpha 404", + "Beta 404", + ] + + def test_normalize_package_formats_fields_and_defaults(): pkg = { "name": "Example", From deb54e3b9254a0b1248ed1b055d69768e4cacd71 Mon Sep 17 00:00:00 2001 From: herr kaste Date: Tue, 24 
Mar 2026 12:32:03 +0100 Subject: [PATCH 5/5] Add `collect_logs` section to the README --- README.md | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 2e44f3173..4eabe6fcf 100644 --- a/README.md +++ b/README.md @@ -174,7 +174,23 @@ The command above reuses the same layout as [CI](https://github.com/packagecontr --- -### `refresh_logs.py` (plus lower-level helpers) +### Logs handling + +#### `collect_logs.py` + +`scripts/collect_logs.py` appends the current run's `notes.txt` to `logs.json` (rolling history), +keyed by run id and trimmed to a retention window (`--history-days`, default 32). + +If you pass a `--workspace`, the log entry also includes a `found_updates` list for packages detected in that run. + +```bash +uv run -m scripts.collect_logs --output ./logs.json --workspace ./workspace.json ./notes.txt +``` + +This is a very mechanical step performed in crawl.yml; after that job is done, we enrich the logs +in publish.yml: + +#### `refresh_logs.py` (plus lower-level helpers) Use this when you want to reproduce/update `logs.json` locally with GitHub Actions metadata. `gh` is required for the ad-hoc API queries these scripts make.
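For completeness, the same local flow can also be driven from Python rather than the CLI. This is a minimal sketch mirroring what `refresh_logs` does with its defaults; it assumes the repository root as the working directory, an authenticated `gh`, and an already existing `./logs.json`.

```python
# Minimal sketch (not part of the scripts): drive the two helpers from Python,
# mirroring scripts/refresh_logs.py with its default paths and 24-hour window.
# Assumes `gh` is installed and authenticated, and that ./logs.json already
# exists (refresh_logs would otherwise download it from the release first).
from scripts import enrich_logs, fetch_logs_metadata

repo = fetch_logs_metadata.resolve_repo(None)  # GITHUB_REPOSITORY or git origin
workflow_id = fetch_logs_metadata.resolve_workflow_id(None, repo, "crawl.yml")

fetch_logs_metadata.fetch_logs_metadata(fetch_logs_metadata.Args(
    repo=repo,
    workflow_id=workflow_id,
    runs_output="./workflow_runs.json",
    artifacts_output="./workflow_artifacts.json",
    since=fetch_logs_metadata.iso_utc_hours_ago(24),
    artifacts_max_pages=10,
    pretty=True,
))

enrich_logs.update_logs(enrich_logs.Args(
    input="./logs.json",
    output="./logs.json",
    runs="./workflow_runs.json",
    artifacts="./workflow_artifacts.json",
    pretty=True,
))
```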