From 564b0160c77015c46cb3c3ebe1b3e4038a1b6bfc Mon Sep 17 00:00:00 2001 From: herr kaste Date: Mon, 23 Mar 2026 14:41:21 +0100 Subject: [PATCH 1/5] Add artifacts metadata for the status page Enhance enrich_logs.py to attach artifact metadata to log entries, including id, name, size, and a stable GitHub web URL derived from the resolved repository and run id. Introduce fetch_logs_metadata.py, which orchestrates the GitHub API calls using `gh` for convenience. It also absorbs the ad-hoc `gh api` usage previously inlined in publish.yml. Introduce refresh_logs.py as a one-shot local helper. --- .github/workflows/publish.yml | 16 +- README.md | 27 +++ scripts/enrich_logs.py | 104 +++++++- scripts/fetch_logs_metadata.py | 378 ++++++++++++++++++++++++++++++ scripts/refresh_logs.py | 205 ++++++++++++++++ tests/test_enrich_logs.py | 75 ++++++ tests/test_fetch_logs_metadata.py | 225 ++++++++++++++++++ 7 files changed, 1006 insertions(+), 24 deletions(-) create mode 100644 scripts/fetch_logs_metadata.py create mode 100644 scripts/refresh_logs.py create mode 100644 tests/test_fetch_logs_metadata.py diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml index 385728918..b09558a9d 100644 --- a/.github/workflows/publish.yml +++ b/.github/workflows/publish.yml @@ -276,18 +276,11 @@ jobs: restore-keys: | wrk-cache- - - name: Fetch recent workflow runs + - name: Fetch workflow metadata for logs enrichment run: | - SINCE=$(date -u -d '24 hours ago' +%Y-%m-%dT%H:%M:%SZ) - gh api \ - repos/${{ github.repository }}/actions/workflows/${{ env.WORKFLOW_ID }}/runs \ - --method GET \ - -f per_page=100 \ - -f status=completed \ - -f created=">=$SINCE" \ - --paginate \ - --jq '[.workflow_runs[] | {id, conclusion, run_started_at}]' \ - > "$RUNNER_TEMP/workflow_runs.json" + uv run -m scripts.fetch_logs_metadata \ + --runs-output "$RUNNER_TEMP/workflow_runs.json" \ + --artifacts-output "$RUNNER_TEMP/workflow_artifacts.json" - name: Enrich logs.json run: | @@ -299,6 +292,7 @@ jobs: uv run -m scripts.enrich_logs \ --runs "$RUNNER_TEMP/workflow_runs.json" \ + --artifacts "$RUNNER_TEMP/workflow_artifacts.json" \ -i ./wrk/logs.json \ -o ./wrk/logs.json diff --git a/README.md b/README.md index ea9086299..bb6643228 100644 --- a/README.md +++ b/README.md @@ -174,6 +174,33 @@ The command above reuses the same layout as [CI](https://github.com/packagecontr --- +### `refresh_logs.py` (plus lower-level helpers) + +Use this when you want to reproduce/update `logs.json` locally with GitHub Actions metadata. +`gh` is required for the ad-hoc API queries these scripts make. + +```bash +# one-shot: download logs (if missing), fetch metadata, enrich logs +uv run -m scripts.refresh_logs --pretty +``` + +Defaults are tuned for local use: +- repo: inferred from `GITHUB_REPOSITORY` or local `git origin` +- workflow id: inferred from `crawl.yml` if not set +- since window: `--since-hours 24` +- metadata files: `./workflow_runs.json`, `./workflow_artifacts.json` +- artifact scan cap: `--artifacts-max-pages 10` +- logs path: `./logs.json` + +If you want explicit control, run the two low-level commands; that's what we do in publish.yml: + +```bash +uv run -m scripts.fetch_logs_metadata --pretty +uv run -m scripts.enrich_logs -i ./logs.json -o ./logs.json --pretty +``` + +--- + ### `snapshot_test.py` Creates a compact, single-file snapshot for regression testing (`registry + channel`) from a reduced package set.
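The result of the enrichment is easiest to see by example. Below is a minimal sketch of a `logs.json` entry after enrich_logs has attached run and artifact metadata; the field names and the URL pattern follow the scripts, while the repository, ids, sizes, and dates are made-up illustrative values.

```python
# Illustrative sketch only: field names and the artifact URL pattern mirror
# scripts/enrich_logs.py and scripts/fetch_logs_metadata.py; every value here
# (repository, run id, artifact id, size, dates) is an assumption for the example.
repo = "packagecontrol/thecrawl"
run_id = "12345678901"

enriched_entry = {
    "date": "2026-03-23T08:22:00Z",   # run_started_at of the workflow run
    "run_id": run_id,
    "conclusion": "success",
    "artifacts": [                    # sorted by name (case-insensitive), then id
        {
            "id": 101,
            "name": "crawl-backup",
            "size": 1234,             # size_in_bytes from the GitHub API
            "url": f"https://github.com/{repo}/actions/runs/{run_id}/artifacts/101",
        },
    ],
}
```

Entries that already exist keep their other fields (for example `notes`); enrichment only fills a missing `conclusion`, attaches the `artifacts` list, and appends entries for runs not yet present.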
diff --git a/scripts/enrich_logs.py b/scripts/enrich_logs.py index 9405b42a0..53d25e7b6 100644 --- a/scripts/enrich_logs.py +++ b/scripts/enrich_logs.py @@ -2,9 +2,10 @@ import argparse import json +from collections import defaultdict from dataclasses import dataclass from pathlib import Path -from typing import Any +from typing import Any, TypedDict from ._utils import write_json @@ -14,9 +15,28 @@ class Args: input: str output: str runs: str + artifacts: str | None pretty: bool +type RunId = str + + +class RuntimeArtifact(TypedDict): + run_id: RunId + id: int + name: str + size: int + url: str + + +class ArtifactMetadata(TypedDict): + id: int + name: str + size: int + url: str + + def main(): args = parse_args() update_logs(args) @@ -24,7 +44,10 @@ def main(): def parse_args() -> Args: parser = argparse.ArgumentParser( - description="Enrich logs.json with workflow run metadata and fill missing runs." + description=( + "Enrich logs.json with workflow run metadata, artifacts metadata, " + "and fill missing runs." + ) ) parser.add_argument( "-i", @@ -41,8 +64,19 @@ def parse_args() -> Args: parser.add_argument( "-r", "--runs", - required=True, - help="Path to workflow_runs.json fetched from the GitHub API.", + default="./workflow_runs.json", + help=( + "Path to workflow_runs.json fetched from the GitHub API " + "(default: ./workflow_runs.json)." + ), + ) + parser.add_argument( + "--artifacts", + default="./workflow_artifacts.json", + help=( + "Optional path to workflow_artifacts.json fetched from the GitHub API " + "(default: ./workflow_artifacts.json)." + ), ) parser.add_argument( "--pretty", @@ -54,6 +88,7 @@ def parse_args() -> Args: input=ns.input, output=ns.output, runs=ns.runs, + artifacts=ns.artifacts, pretty=ns.pretty, ) @@ -61,50 +96,93 @@ def parse_args() -> Args: def update_logs(args: Args): entries = load_json(Path(args.input)) or [] runs = load_json(Path(args.runs)) or [] + artifacts: list[RuntimeArtifact] = [] + if args.artifacts: + artifacts = load_json(Path(args.artifacts)) or [] + + if not isinstance(entries, list): + raise SystemExit(f"enrich_logs: input must be a JSON array: {args.input}") + if not isinstance(runs, list): + raise SystemExit(f"enrich_logs: runs must be a JSON array: {args.runs}") + if not isinstance(artifacts, list): + raise SystemExit(f"enrich_logs: artifacts must be a JSON array: {args.artifacts}") enriched = 0 created = 0 + artifacts_attached = 0 runs_index = { - str(run.get("id")): { + run_id: { "conclusion": run.get("conclusion"), "run_started_at": run.get("run_started_at"), } for run in runs - if "id" in run + if (run_id := str(run.get("id", ""))) } + artifacts_index = build_artifacts_index(artifacts) seen = set() for entry in entries: - run_id = str(entry.get("run_id", "")) - if not run_id: - continue + run_id = entry["run_id"] seen.add(run_id) + info = runs_index.get(run_id) - if not info: - continue - if not entry.get("conclusion") and info.get("conclusion"): + if info and not entry.get("conclusion") and info.get("conclusion"): entry["conclusion"] = info["conclusion"] enriched += 1 + run_artifacts = artifacts_index.get(run_id) + if run_artifacts and entry.get("artifacts") != run_artifacts: + entry["artifacts"] = run_artifacts + artifacts_attached += 1 + for run_id, info in runs_index.items(): if run_id in seen: continue + date = info.get("run_started_at") if not date: continue + new_entry = { "date": date, "run_id": run_id, } if conclusion := info.get("conclusion"): new_entry["conclusion"] = conclusion + if run_artifacts := 
artifacts_index.get(run_id): + new_entry["artifacts"] = run_artifacts + artifacts_attached += 1 + entries.append(new_entry) created += 1 entries.sort(key=lambda entry: entry.get("date", ""), reverse=True) write_json(args.output, entries, pretty=args.pretty, ensure_ascii=True) - print(f"Enriched entries: {enriched}, added missing runs: {created}") + print( + "Enriched entries: " + f"{enriched}, " + f"added missing runs: {created}, " + f"attached artifacts on entries: {artifacts_attached}" + ) + + +def build_artifacts_index(artifacts: list[RuntimeArtifact]) -> dict[RunId, list[ArtifactMetadata]]: + artifacts_by_run: defaultdict[RunId, list[ArtifactMetadata]] = defaultdict(list) + + for artifact in artifacts: + run_id = artifact["run_id"] + artifacts_by_run[run_id].append({ + "id": artifact["id"], + "name": artifact["name"], + "size": artifact["size"], + "url": artifact["url"], + }) + + for run_artifacts in artifacts_by_run.values(): + run_artifacts.sort(key=lambda item: (item["name"].casefold(), str(item["id"]))) + + return dict(artifacts_by_run) def load_json(path: Path) -> Any: diff --git a/scripts/fetch_logs_metadata.py b/scripts/fetch_logs_metadata.py new file mode 100644 index 000000000..e28d7697f --- /dev/null +++ b/scripts/fetch_logs_metadata.py @@ -0,0 +1,378 @@ +from __future__ import annotations + +import argparse +import json +import os +import re +import subprocess +import sys +from dataclasses import dataclass +from datetime import datetime, timedelta, timezone +from typing import Any, Iterator + +from ._utils import write_json + + +DEFAULT_RUNS_OUTPUT = "./workflow_runs.json" +DEFAULT_ARTIFACTS_OUTPUT = "./workflow_artifacts.json" +DEFAULT_WORKFLOW_FILE = "crawl.yml" + + +@dataclass +class Args: + repo: str + workflow_id: str + runs_output: str + artifacts_output: str + since: str + artifacts_max_pages: int + pretty: bool + + +def main(): + args = parse_args() + fetch_logs_metadata(args) + + +def parse_args() -> Args: + parser = argparse.ArgumentParser( + description=( + "Fetch workflow runs and artifacts metadata via gh api for logs enrichment." + ) + ) + parser.add_argument( + "--repo", + default=None, + help=( + "GitHub repository in owner/name format. Defaults to GITHUB_REPOSITORY " + "or the local git origin remote." + ), + ) + parser.add_argument( + "--workflow-id", + default=None, + help=( + "Workflow ID to query runs for. Defaults to WORKFLOW_ID. " + "If missing, it is resolved from --workflow-file." + ), + ) + parser.add_argument( + "--workflow-file", + default=DEFAULT_WORKFLOW_FILE, + help=( + "Workflow filename used to resolve --workflow-id when it is not set " + f"(default: {DEFAULT_WORKFLOW_FILE})." + ), + ) + parser.add_argument( + "--since", + default=None, + help=( + "ISO8601 timestamp for the lower bound of runs to fetch, for example " + "2026-03-23T00:00:00Z." + ), + ) + parser.add_argument( + "--since-hours", + type=int, + default=24, + help="Fetch runs started within the last N hours when --since is not set (default: 24).", + ) + parser.add_argument( + "--runs-output", + default=DEFAULT_RUNS_OUTPUT, + help=f"Output path for workflow runs JSON (default: {DEFAULT_RUNS_OUTPUT}).", + ) + parser.add_argument( + "--artifacts-output", + default=DEFAULT_ARTIFACTS_OUTPUT, + help=( + "Output path for workflow artifacts JSON " + f"(default: {DEFAULT_ARTIFACTS_OUTPUT})." + ), + ) + parser.add_argument( + "--artifacts-max-pages", + type=int, + default=10, + help=( + "Maximum number of repository artifacts pages to scan " + "(default: 10)." 
+ ), + ) + parser.add_argument( + "--pretty", + action="store_true", + help="Pretty-print JSON output (indent=2).", + ) + ns = parser.parse_args() + + if ns.artifacts_max_pages < 1: + raise SystemExit("fetch_logs_metadata: --artifacts-max-pages must be >= 1") + + repo = resolve_repo(ns.repo) + workflow_id = resolve_workflow_id(ns.workflow_id, repo, ns.workflow_file) + since = ns.since or iso_utc_hours_ago(ns.since_hours) + + return Args( + repo=repo, + workflow_id=workflow_id, + runs_output=ns.runs_output, + artifacts_output=ns.artifacts_output, + since=since, + artifacts_max_pages=ns.artifacts_max_pages, + pretty=ns.pretty, + ) + + +def fetch_logs_metadata(args: Args): + runs = fetch_runs(args.repo, args.workflow_id, args.since) + run_ids = { + str(run.get("id")) + for run in runs + if isinstance(run, dict) and run.get("id") is not None + } + + artifacts = fetch_artifacts( + args.repo, + run_ids, + max_pages=args.artifacts_max_pages, + ) + write_json(args.runs_output, runs, pretty=args.pretty, ensure_ascii=True) + write_json(args.artifacts_output, artifacts, pretty=args.pretty, ensure_ascii=True) + + print( + "Fetched metadata: " + f"repo={args.repo}, " + f"workflow_id={args.workflow_id}, " + f"runs={len(runs)}, " + f"artifacts_kept={len(artifacts)}, " + f"since={args.since}" + ) + + +def fetch_runs(repo: str, workflow_id: str, since: str) -> list[dict[str, Any]]: + runs: list[dict[str, Any]] = [] + + for run in run_gh_paginated( + "workflow_runs", + [ + f"repos/{repo}/actions/workflows/{workflow_id}/runs", + "--method", + "GET", + "-f", + "status=completed", + "-f", + f"created=>={since}", + ], + max_pages=None, + ): + if not isinstance(run, dict): + continue + run_id = run.get("id") + if run_id is None: + continue + + runs.append({ + "id": run_id, + "conclusion": run.get("conclusion"), + "run_started_at": run.get("run_started_at"), + }) + + return runs + + +def fetch_artifacts( + repo: str, + run_ids: set[str], + *, + max_pages: int, +) -> list[dict[str, Any]]: + if not run_ids: + return [] + + remaining_run_ids = set(run_ids) + artifacts: list[dict[str, Any]] = [] + + for artifact in run_gh_paginated( + "artifacts", + [ + f"repos/{repo}/actions/artifacts", + "--method", + "GET", + ], + max_pages=max_pages, + ): + if not isinstance(artifact, dict): + continue + + workflow_run = artifact.get("workflow_run") + workflow_run_id = None + if isinstance(workflow_run, dict): + workflow_run_id = workflow_run.get("id") + + if workflow_run_id is None: + continue + + run_id = str(workflow_run_id) + if run_id in run_ids: + # Mark run_id as seen immediately. Otherwise malformed artifacts could prevent + # remaining_run_ids from emptying and keep pagination running unnecessarily. + remaining_run_ids.discard(run_id) + + artifact_id = artifact.get("id") + artifact_name = artifact.get("name") + artifact_size = artifact.get("size_in_bytes") + if artifact_id is None or artifact_name is None or artifact_size is None: + continue + + artifacts.append({ + "run_id": run_id, + "id": artifact_id, + "name": str(artifact_name), + "size": int(artifact_size), + "url": f"https://github.com/{repo}/actions/runs/{run_id}/artifacts/{artifact_id}", + }) + # We only know whether we're done when we see the *next* item. Keep scanning + # because multiple artifacts for an interesting run_id can appear consecutively. + continue + + if not remaining_run_ids: + # Assumes API items are ordered: once all interesting run_ids were seen and we + # hit a non-interesting run_id, there are no more relevant artifacts to collect. 
+ break + + artifacts.sort(key=lambda item: (item["run_id"], item["name"].casefold(), str(item["id"]))) + return artifacts + + +def resolve_repo(repo: str | None) -> str: + if repo: + return repo + + if env_repo := os.environ.get("GITHUB_REPOSITORY"): + return env_repo + + if origin_url := read_git_origin_url(): + if parsed_repo := parse_repo_from_remote(origin_url): + return parsed_repo + + raise SystemExit( + "fetch_logs_metadata: unable to resolve repo. " + "Set --repo, GITHUB_REPOSITORY, or configure git origin." + ) + + +def resolve_workflow_id(workflow_id: str | None, repo: str, workflow_file: str) -> str: + if workflow_id: + return str(workflow_id) + + if env_workflow_id := os.environ.get("WORKFLOW_ID"): + return env_workflow_id + + resolved = run_gh_json([ + f"repos/{repo}/actions/workflows/{workflow_file}", + "--method", + "GET", + "--jq", + ".id", + ]) + if resolved is None: + raise SystemExit( + "fetch_logs_metadata: failed to resolve workflow id " + f"for {repo}/{workflow_file}" + ) + + return str(resolved) + + +def iso_utc_hours_ago(hours: int) -> str: + timestamp = datetime.now(timezone.utc) - timedelta(hours=hours) + return timestamp.strftime("%Y-%m-%dT%H:%M:%SZ") + + +def read_git_origin_url() -> str | None: + process = subprocess.run( + ["git", "config", "--get", "remote.origin.url"], + capture_output=True, + text=True, + check=False, + ) + if process.returncode != 0: + return None + + origin = process.stdout.strip() + return origin or None + + +def parse_repo_from_remote(remote_url: str) -> str | None: + # https://github.com/owner/repo(.git) + if match := re.search(r"github\.com[:/]([^/]+)/([^/]+?)(?:\.git)?$", remote_url): + owner, repo = match.group(1), match.group(2) + return f"{owner}/{repo}" + return None + + +def run_gh_paginated( + key: str, + cmd: list[str], + *, + max_pages: int | None, + per_page: int = 100, +) -> Iterator[Any]: + page = 1 + while max_pages is None or page <= max_pages: + payload = run_gh_json([ + *cmd, + "-f", + f"per_page={per_page}", + "-f", + f"page={page}", + ]) + if not isinstance(payload, dict): + return + + page_items = payload.get(key) + if not isinstance(page_items, list) or not page_items: + return + + yield from page_items + + if len(page_items) < per_page: + return + + page += 1 + + +def run_gh_json(args: list[str]) -> Any: + cmd = ["gh", "api", *args] + command_str = " ".join(cmd) + print(f"fetch_logs_metadata: {command_str}", file=sys.stderr) + + process = subprocess.run(cmd, capture_output=True, text=True, check=False) + if process.returncode != 0: + stderr = process.stderr.strip() + raise SystemExit( + "fetch_logs_metadata: gh command failed\n" + f"command: {command_str}\n" + f"stderr: {stderr}" + ) + + output = process.stdout.strip() + if not output: + return [] + + try: + return json.loads(output) + except json.JSONDecodeError as exc: + excerpt = output[:260] + raise SystemExit( + "fetch_logs_metadata: invalid JSON from gh command\n" + f"command: {command_str}\n" + f"error: {exc}\n" + f"context: {excerpt}" + ) from exc + + +if __name__ == "__main__": + main() diff --git a/scripts/refresh_logs.py b/scripts/refresh_logs.py new file mode 100644 index 000000000..fe6963634 --- /dev/null +++ b/scripts/refresh_logs.py @@ -0,0 +1,205 @@ +from __future__ import annotations + +import argparse +import subprocess +from dataclasses import dataclass +from pathlib import Path + +from . 
import enrich_logs, fetch_logs_metadata + + +DEFAULT_LOGS_PATH = "./logs.json" +DEFAULT_RELEASE_TAG = "crawler-status" + + +@dataclass +class Args: + logs: str + repo: str | None + workflow_id: str | None + workflow_file: str + runs_output: str + artifacts_output: str + artifacts_max_pages: int + since: str | None + since_hours: int + release_tag: str + skip_download: bool + pretty: bool + + +def main(): + args = parse_args() + refresh_logs(args) + + +def parse_args() -> Args: + parser = argparse.ArgumentParser( + description=( + "Refresh logs.json locally by downloading current logs (optional), " + "fetching run/artifact metadata in batch, and enriching the file." + ) + ) + parser.add_argument( + "--logs", + default=DEFAULT_LOGS_PATH, + help=f"Path to logs.json to refresh (default: {DEFAULT_LOGS_PATH}).", + ) + parser.add_argument( + "--repo", + default=None, + help=( + "GitHub repository in owner/name format. Defaults to GITHUB_REPOSITORY " + "or local git origin." + ), + ) + parser.add_argument( + "--workflow-id", + default=None, + help="Workflow ID. If omitted, resolved from --workflow-file.", + ) + parser.add_argument( + "--workflow-file", + default=fetch_logs_metadata.DEFAULT_WORKFLOW_FILE, + help=( + "Workflow filename used when resolving workflow id " + f"(default: {fetch_logs_metadata.DEFAULT_WORKFLOW_FILE})." + ), + ) + parser.add_argument( + "--runs-output", + default=fetch_logs_metadata.DEFAULT_RUNS_OUTPUT, + help=( + "Output path for workflow runs JSON " + f"(default: {fetch_logs_metadata.DEFAULT_RUNS_OUTPUT})." + ), + ) + parser.add_argument( + "--artifacts-output", + default=fetch_logs_metadata.DEFAULT_ARTIFACTS_OUTPUT, + help=( + "Output path for workflow artifacts JSON " + f"(default: {fetch_logs_metadata.DEFAULT_ARTIFACTS_OUTPUT})." 
+ ), + ) + parser.add_argument( + "--artifacts-max-pages", + type=int, + default=10, + help="Maximum artifact pages to scan (default: 10).", + ) + parser.add_argument( + "--since", + default=None, + help="ISO8601 lower bound; overrides --since-hours.", + ) + parser.add_argument( + "--since-hours", + type=int, + default=24, + help="Lower bound in hours when --since is not provided (default: 24).", + ) + parser.add_argument( + "--release-tag", + default=DEFAULT_RELEASE_TAG, + help=f"Release tag used for logs download (default: {DEFAULT_RELEASE_TAG}).", + ) + parser.add_argument( + "--skip-download", + action="store_true", + help="Do not attempt to download logs.json from release if missing.", + ) + parser.add_argument( + "--pretty", + action="store_true", + help="Pretty-print generated JSON files.", + ) + ns = parser.parse_args() + if ns.artifacts_max_pages < 1: + raise SystemExit("refresh_logs: --artifacts-max-pages must be >= 1") + + return Args( + logs=ns.logs, + repo=ns.repo, + workflow_id=ns.workflow_id, + workflow_file=ns.workflow_file, + runs_output=ns.runs_output, + artifacts_output=ns.artifacts_output, + artifacts_max_pages=ns.artifacts_max_pages, + since=ns.since, + since_hours=ns.since_hours, + release_tag=ns.release_tag, + skip_download=ns.skip_download, + pretty=ns.pretty, + ) + + +def refresh_logs(args: Args): + repo = fetch_logs_metadata.resolve_repo(args.repo) + workflow_id = fetch_logs_metadata.resolve_workflow_id( + args.workflow_id, + repo, + args.workflow_file, + ) + + logs_path = Path(args.logs) + ensure_logs_file(logs_path, repo, args.release_tag, args.skip_download) + + fetch_args = fetch_logs_metadata.Args( + repo=repo, + workflow_id=workflow_id, + runs_output=args.runs_output, + artifacts_output=args.artifacts_output, + artifacts_max_pages=args.artifacts_max_pages, + since=args.since or fetch_logs_metadata.iso_utc_hours_ago(args.since_hours), + pretty=args.pretty, + ) + fetch_logs_metadata.fetch_logs_metadata(fetch_args) + + enrich_args = enrich_logs.Args( + input=str(logs_path), + output=str(logs_path), + runs=args.runs_output, + artifacts=args.artifacts_output, + pretty=args.pretty, + ) + enrich_logs.update_logs(enrich_args) + + print(f"Refreshed logs file: {logs_path}") + + +def ensure_logs_file(logs_path: Path, repo: str, release_tag: str, skip_download: bool): + logs_path.parent.mkdir(parents=True, exist_ok=True) + if logs_path.exists(): + return + + if skip_download: + raise SystemExit( + f"refresh_logs: logs file not found and --skip-download was set: {logs_path}" + ) + + cmd = [ + "gh", + "release", + "download", + release_tag, + "--repo", + repo, + "--pattern", + "logs.json", + "--output", + str(logs_path), + "--clobber", + ] + process = subprocess.run(cmd, capture_output=True, text=True, check=False) + if process.returncode != 0: + raise SystemExit( + "refresh_logs: failed to download logs.json from release\n" + f"repo={repo}\n" + f"tag={release_tag}\n" + f"stderr={process.stderr.strip()}" + ) + + +if __name__ == "__main__": + main() diff --git a/tests/test_enrich_logs.py b/tests/test_enrich_logs.py index 0b34fb932..379e5e236 100644 --- a/tests/test_enrich_logs.py +++ b/tests/test_enrich_logs.py @@ -28,6 +28,7 @@ def test_enrich_updates_existing_and_adds_missing(tmp_path): input=str(logs_path), output=str(logs_path), runs=str(runs_path), + artifacts=None, pretty=True, ) enrich_logs.update_logs(args) @@ -41,3 +42,77 @@ def test_enrich_updates_existing_and_adds_missing(tmp_path): second = next(entry for entry in data if entry["run_id"] == "2") assert 
second["conclusion"] == "failure" assert second["date"] == "2024-10-06T09:00:00Z" + + +def test_enrich_attaches_artifacts_by_run_id(tmp_path): + logs_path = tmp_path / "logs.json" + runs_path = tmp_path / "workflow_runs.json" + artifacts_path = tmp_path / "workflow_artifacts.json" + + logs_path.write_text(json.dumps([ + {"run_id": "1", "notes": "has notes", "date": "2024-10-05T10:00:00Z"}, + ]), encoding="utf-8") + + runs_path.write_text(json.dumps([ + { + "id": 1, + "conclusion": "success", + "run_started_at": "2024-10-05T10:00:00Z", + }, + { + "id": 2, + "conclusion": "failure", + "run_started_at": "2024-10-06T09:00:00Z", + }, + ]), encoding="utf-8") + + artifacts_path.write_text(json.dumps([ + { + "run_id": "1", + "id": 101, + "name": "crawl-backup", + "size": 1234, + "url": "https://github.com/owner/repo/actions/runs/1/artifacts/101", + }, + { + "run_id": "2", + "id": 202, + "name": "stats-backup", + "size": 4321, + "url": "https://github.com/owner/repo/actions/runs/2/artifacts/202", + }, + { + "run_id": "999", + "id": 303, + "name": "ignored", + "size": 999, + "url": "https://github.com/owner/repo/actions/runs/999/artifacts/303", + }, + ]), encoding="utf-8") + + args = enrich_logs.Args( + input=str(logs_path), + output=str(logs_path), + runs=str(runs_path), + artifacts=str(artifacts_path), + pretty=True, + ) + enrich_logs.update_logs(args) + + data = json.loads(logs_path.read_text(encoding="utf-8")) + + first = next(entry for entry in data if entry["run_id"] == "1") + assert first["artifacts"] == [{ + "id": 101, + "name": "crawl-backup", + "size": 1234, + "url": "https://github.com/owner/repo/actions/runs/1/artifacts/101", + }] + + second = next(entry for entry in data if entry["run_id"] == "2") + assert second["artifacts"] == [{ + "id": 202, + "name": "stats-backup", + "size": 4321, + "url": "https://github.com/owner/repo/actions/runs/2/artifacts/202", + }] diff --git a/tests/test_fetch_logs_metadata.py b/tests/test_fetch_logs_metadata.py new file mode 100644 index 000000000..64996e411 --- /dev/null +++ b/tests/test_fetch_logs_metadata.py @@ -0,0 +1,225 @@ +import scripts.fetch_logs_metadata as fetch_logs_metadata + + +def test_fetch_logs_metadata_writes_fetch_outputs(monkeypatch): + def fake_fetch_runs(_repo, _workflow_id, _since): + return [{"id": 2}, {"id": 1}] + + artifact_data = [ + { + "run_id": "1", + "id": 100, + "name": "crawl-backup", + "size": 200, + "url": "https://github.com/owner/repo/actions/runs/1/artifacts/100", + }, + { + "run_id": "2", + "id": 101, + "name": "stats-backup", + "size": 300, + "url": "https://github.com/owner/repo/actions/runs/2/artifacts/101", + }, + ] + + def fake_fetch_artifacts(_repo, run_ids, *, max_pages): + assert run_ids == {"1", "2"} + assert max_pages == 10 + return artifact_data + + writes = [] + + def fake_write_json(path, data, *, pretty, ensure_ascii): + writes.append((path, data, pretty, ensure_ascii)) + + monkeypatch.setattr(fetch_logs_metadata, "fetch_runs", fake_fetch_runs) + monkeypatch.setattr(fetch_logs_metadata, "fetch_artifacts", fake_fetch_artifacts) + monkeypatch.setattr(fetch_logs_metadata, "write_json", fake_write_json) + + args = fetch_logs_metadata.Args( + repo="owner/repo", + workflow_id="123", + runs_output="runs.json", + artifacts_output="artifacts.json", + since="2026-01-01T00:00:00Z", + artifacts_max_pages=10, + pretty=True, + ) + + fetch_logs_metadata.fetch_logs_metadata(args) + + assert writes[0] == ("runs.json", [{"id": 2}, {"id": 1}], True, True) + assert writes[1] == ("artifacts.json", artifact_data, True, True) 
+ + +def test_parse_repo_from_remote_supports_https_and_ssh(): + https_url = "https://github.com/packagecontrol/thecrawl.git" + ssh_url = "git@github.com:packagecontrol/thecrawl.git" + + assert fetch_logs_metadata.parse_repo_from_remote(https_url) == "packagecontrol/thecrawl" + assert fetch_logs_metadata.parse_repo_from_remote(ssh_url) == "packagecontrol/thecrawl" + + +def test_resolve_workflow_id_falls_back_to_workflow_filename(monkeypatch): + monkeypatch.delenv("WORKFLOW_ID", raising=False) + + called_args = [] + + def fake_run_gh_json(args): + called_args.append(args) + return 123456 + + monkeypatch.setattr(fetch_logs_metadata, "run_gh_json", fake_run_gh_json) + + resolved = fetch_logs_metadata.resolve_workflow_id(None, "owner/repo", "crawl.yml") + + assert resolved == "123456" + assert called_args == [[ + "repos/owner/repo/actions/workflows/crawl.yml", + "--method", + "GET", + "--jq", + ".id", + ]] + + +def test_run_gh_json_raises_on_invalid_json(monkeypatch): + class Process: + returncode = 0 + stdout = "{invalid-json}" + stderr = "" + + def fake_run(*_args, **_kwargs): + return Process() + + monkeypatch.setattr(fetch_logs_metadata.subprocess, "run", fake_run) + + try: + fetch_logs_metadata.run_gh_json(["repos/owner/repo/actions/runs"]) + assert False, "Expected SystemExit for invalid JSON" + except SystemExit as exc: + message = str(exc) + assert "invalid JSON" in message + assert "gh api repos/owner/repo/actions/runs" in message + + +def test_fetch_runs_pages_until_short_page(monkeypatch): + calls = [] + + def fake_run_gh_json(args): + calls.append(args) + page_arg = next(item for item in args if item.startswith("page=")) + page = int(page_arg.split("=", 1)[1]) + + if page == 1: + return { + "workflow_runs": [ + { + "id": run_id, + "conclusion": "success", + "run_started_at": f"2026-01-01T00:{run_id:02d}:00Z", + } + for run_id in range(1, 101) + ] + } + + return { + "workflow_runs": [ + { + "id": 101, + "conclusion": "failure", + "run_started_at": "2026-01-01T01:41:00Z", + }, + ] + } + + monkeypatch.setattr(fetch_logs_metadata, "run_gh_json", fake_run_gh_json) + + runs = fetch_logs_metadata.fetch_runs( + "owner/repo", + "123", + "2026-01-01T00:00:00Z", + ) + + assert len(runs) == 101 + assert [ + next(item for item in call if item.startswith("page=")) + for call in calls + ] == ["page=1", "page=2"] + assert runs[0] == { + "id": 1, + "conclusion": "success", + "run_started_at": "2026-01-01T00:01:00Z", + } + assert runs[-1] == { + "id": 101, + "conclusion": "failure", + "run_started_at": "2026-01-01T01:41:00Z", + } + + +def test_fetch_artifacts_keeps_collecting_matching_items_after_remaining_empty(monkeypatch): + def fake_run_gh_paginated(_key, _cmd, **_kwargs): + yield {"id": 1, "name": "a", "size_in_bytes": 11, "workflow_run": {"id": 42}} + yield {"id": 2, "name": "b", "size_in_bytes": 22, "workflow_run": {"id": 42}} + yield {"id": 3, "name": "c", "size_in_bytes": 33, "workflow_run": {"id": 999}} + yield {"id": 4, "name": "d", "size_in_bytes": 44, "workflow_run": {"id": 42}} + + monkeypatch.setattr(fetch_logs_metadata, "run_gh_paginated", fake_run_gh_paginated) + + artifacts = fetch_logs_metadata.fetch_artifacts( + "owner/repo", + {"42"}, + max_pages=10, + ) + + assert artifacts == [ + { + "run_id": "42", + "id": 1, + "name": "a", + "size": 11, + "url": "https://github.com/owner/repo/actions/runs/42/artifacts/1", + }, + { + "run_id": "42", + "id": 2, + "name": "b", + "size": 22, + "url": "https://github.com/owner/repo/actions/runs/42/artifacts/2", + }, + ] + + +def 
test_fetch_artifacts_stops_after_matching_run_ids(monkeypatch): + calls = [] + + def fake_run_gh_json(args): + calls.append(args) + return { + "artifacts": [ + { + "id": 1, + "name": "crawl-backup", + "size_in_bytes": 11, + "workflow_run": {"id": 42}, + }, + ] + } + + monkeypatch.setattr(fetch_logs_metadata, "run_gh_json", fake_run_gh_json) + + artifacts = fetch_logs_metadata.fetch_artifacts( + "owner/repo", + {"42"}, + max_pages=10, + ) + + assert len(calls) == 1 + assert artifacts == [{ + "run_id": "42", + "id": 1, + "name": "crawl-backup", + "size": 11, + "url": "https://github.com/owner/repo/actions/runs/42/artifacts/1", + }] From 4a12765f1826423cc04c8e6a4adbc8ac86f7b063 Mon Sep 17 00:00:00 2001 From: herr kaste Date: Mon, 23 Mar 2026 17:44:27 +0100 Subject: [PATCH 2/5] Simplify enrich_logs CLI input flow Make enrich_logs use a positional input argument instead of --input. The positional defaults to logs.json, and --output now defaults to the same path as input when omitted. --- .github/workflows/publish.yml | 3 +-- README.md | 2 +- scripts/enrich_logs.py | 10 +++++----- tests/test_enrich_logs.py | 19 +++++++++++++++++++ 4 files changed, 26 insertions(+), 8 deletions(-) diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml index b09558a9d..b1eb9ea78 100644 --- a/.github/workflows/publish.yml +++ b/.github/workflows/publish.yml @@ -293,8 +293,7 @@ jobs: uv run -m scripts.enrich_logs \ --runs "$RUNNER_TEMP/workflow_runs.json" \ --artifacts "$RUNNER_TEMP/workflow_artifacts.json" \ - -i ./wrk/logs.json \ - -o ./wrk/logs.json + ./wrk/logs.json retry_5() { local n=1 diff --git a/README.md b/README.md index bb6643228..2e44f3173 100644 --- a/README.md +++ b/README.md @@ -196,7 +196,7 @@ If you want explicit control, run the two low-level commands, that's what we do ```bash uv run -m scripts.fetch_logs_metadata --pretty -uv run -m scripts.enrich_logs -i ./logs.json -o ./logs.json --pretty +uv run -m scripts.enrich_logs --pretty ``` --- diff --git a/scripts/enrich_logs.py b/scripts/enrich_logs.py index 53d25e7b6..a80aad7b2 100644 --- a/scripts/enrich_logs.py +++ b/scripts/enrich_logs.py @@ -50,16 +50,16 @@ def parse_args() -> Args: ) ) parser.add_argument( - "-i", - "--input", + "input", + nargs="?", default="logs.json", help="Input logs file (default: logs.json).", ) parser.add_argument( "-o", "--output", - default="logs.json", - help="Output logs file (default: logs.json).", + default=None, + help="Output logs file (defaults to INPUT).", ) parser.add_argument( "-r", @@ -86,7 +86,7 @@ def parse_args() -> Args: ns = parser.parse_args() return Args( input=ns.input, - output=ns.output, + output=ns.output or ns.input, runs=ns.runs, artifacts=ns.artifacts, pretty=ns.pretty, diff --git a/tests/test_enrich_logs.py b/tests/test_enrich_logs.py index 379e5e236..6e53446ba 100644 --- a/tests/test_enrich_logs.py +++ b/tests/test_enrich_logs.py @@ -1,4 +1,5 @@ import json +import sys import scripts.enrich_logs as enrich_logs @@ -116,3 +117,21 @@ def test_enrich_attaches_artifacts_by_run_id(tmp_path): "size": 4321, "url": "https://github.com/owner/repo/actions/runs/2/artifacts/202", }] + + +def test_parse_args_defaults_output_to_input(monkeypatch): + monkeypatch.setattr( + sys, + "argv", + [ + "enrich_logs.py", + "./wrk/logs.json", + "--runs", + "./workflow_runs.json", + ], + ) + + args = enrich_logs.parse_args() + + assert args.input == "./wrk/logs.json" + assert args.output == "./wrk/logs.json" From fbb389ac03bdbff43034c853ebd00b25941d74b0 Mon Sep 17 00:00:00 2001 From: herr kaste Date: 
Tue, 24 Mar 2026 09:36:19 +0100 Subject: [PATCH 3/5] Run nightly Pages publish on tag pushes Add a tag push trigger to publish.yml and update the gh-pages-nightly job condition to allow both scheduled runs and tag pushes. Restrict channel build jobs to workflow_run and workflow_dispatch so tag pushes do not trigger channel rebuilds. --- .github/workflows/publish.yml | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml index b1eb9ea78..ba42fe342 100644 --- a/.github/workflows/publish.yml +++ b/.github/workflows/publish.yml @@ -10,6 +10,9 @@ on: types: - completed workflow_dispatch: + push: + tags: + - '*' schedule: - cron: '22 8 * * *' @@ -21,7 +24,7 @@ permissions: jobs: build-st4-channel: runs-on: ubuntu-latest - if: github.event_name != 'schedule' + if: github.event_name == 'workflow_run' || github.event_name == 'workflow_dispatch' outputs: update_needed: ${{ steps.check.outputs.update_needed }} env: @@ -133,7 +136,7 @@ jobs: build-st3-channel: runs-on: ubuntu-latest - if: github.event_name != 'schedule' + if: github.event_name == 'workflow_run' || github.event_name == 'workflow_dispatch' env: RELEASE_TAG: the-st3-channel GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} @@ -348,7 +351,7 @@ jobs: gh-pages-nightly: runs-on: ubuntu-latest needs: enrich_logs - if: github.event_name == 'schedule' + if: github.event_name == 'schedule' || (github.event_name == 'push' && startsWith(github.ref, 'refs/tags/')) environment: name: github-pages url: ${{ steps.deployment.outputs.page_url }} From 6ea07a75e9edca30677b5dcb0996f616235ecfb2 Mon Sep 17 00:00:00 2001 From: herr kaste Date: Tue, 24 Mar 2026 09:45:54 +0100 Subject: [PATCH 4/5] Group failing report by reason class Sort the "Currently failing" section in three groups: 1. reasons starting with a three-digit status code 2. all remaining reasons 3. reasons starting with "fatal: 404" Within each group, entries are still sorted by package name. Status code entries are additionally ordered numerically by code. 
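Concretely, the grouping boils down to a `(group, status_code, lowercased_name)` sort key. An illustrative mapping (the reasons below are examples, not taken from real packages):

```python
# Illustrative reasons only: sorting these (group, status_code, name) tuples
# yields the order described above.
examples = [
    ("403 Forbidden",        (0, 403, "alpha status")),  # leading 3-digit status code
    ("502 Bad Gateway",      (0, 502, "zeta status")),   # ordered numerically by code
    ("denied: blocked",      (1, 0,   "beta other")),    # all remaining reasons
    ("fatal: 404 Not Found", (2, 0,   "alpha 404")),     # "fatal: 404" reasons go last
]
```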
--- scripts/generate_channel.py | 16 ++++- .../generate_channel/test_generate_channel.py | 62 +++++++++++++++++++ 2 files changed, 77 insertions(+), 1 deletion(-) diff --git a/scripts/generate_channel.py b/scripts/generate_channel.py index 99a47352b..894c9e693 100644 --- a/scripts/generate_channel.py +++ b/scripts/generate_channel.py @@ -6,6 +6,7 @@ import json import sys import os +import re from typing import Generator, Literal, NotRequired, TypedDict from ._utils import flatten, pick, pl, write_json, parse_version @@ -218,11 +219,24 @@ def main(registry_path, workspace_path, channel_path, berlin: bool, pretty: bool failing_info = "\n".join( f"- **{pkg['name']}** [{failing_since(pkg, berlin)}]\n" f" {pkg['fail_reason'].strip().replace('\n', '\n ')}" - for pkg in sorted(failing, key=lambda p: p['name'].lower()) + for pkg in sorted(failing, key=failing_sort_key) ) print(f"\n#### Currently failing\n{failing_info}") +def failing_sort_key(pkg) -> tuple[int, int, str]: + reason = pkg["fail_reason"].strip() + name = pkg["name"].lower() + + if status_match := re.match(r"^(\d{3})\s", reason): + return (0, int(status_match.group(1)), name) + + if reason.startswith("fatal: 404"): + return (2, 0, name) + + return (1, 0, name) + + def normalize_package(pkg) -> Package | None: name = pkg.get("name") if not name: diff --git a/tests/generate_channel/test_generate_channel.py b/tests/generate_channel/test_generate_channel.py index b54fe56e7..b94fcb620 100644 --- a/tests/generate_channel/test_generate_channel.py +++ b/tests/generate_channel/test_generate_channel.py @@ -119,6 +119,68 @@ def test_generate_channel_filters_removed_and_dropped_libraries(tmp_path): ] +def test_generate_channel_sorts_failing_report_in_groups(tmp_path, capsys): + registry = {"repositories": []} + + def failing_pkg(name: str, fail_reason: str): + return { + "name": name, + "author": ["Ada"], + "last_modified": "2026-03-24T10:00:00Z", + "source": "https://repo.one", + "failing_since": "2026-03-24T10:00:00Z", + "fail_reason": fail_reason, + "releases": [ + { + "sublime_text": "4100", + "platforms": ["*"], + "version": "1.0.0", + "url": "https://repo.one/pkg.zip", + "date": "2026-03-24T10:00:00Z", + } + ], + } + + workspace = { + "packages": { + "status_502_z": failing_pkg("Zeta Status", "502 Bad Gateway"), + "status_403_a": failing_pkg("Alpha Status", "403 Forbidden"), + "status_502_b": failing_pkg("Beta Status", "502 Bad Gateway"), + "other_b": failing_pkg("Beta Other", "denied: blocked by rule"), + "other_a": failing_pkg("Alpha Other", "fatal: 500 Server Error"), + "fatal_404_b": failing_pkg("Beta 404", "fatal: 404 Not Found"), + "fatal_404_a": failing_pkg("Alpha 404", "fatal: 404 Not Found"), + }, + "libraries": {}, + } + + registry_path = tmp_path / "registry.json" + workspace_path = tmp_path / "workspace.json" + output_path = tmp_path / "channel.json" + + registry_path.write_text(json.dumps(registry), encoding="utf-8") + workspace_path.write_text(json.dumps(workspace), encoding="utf-8") + + main(str(registry_path), str(workspace_path), str(output_path), False, False) + + stdout = capsys.readouterr().out + names = [ + line.split("**")[1] + for line in stdout.splitlines() + if line.startswith("- **") + ] + + assert names == [ + "Alpha Status", + "Beta Status", + "Zeta Status", + "Alpha Other", + "Beta Other", + "Alpha 404", + "Beta 404", + ] + + def test_normalize_package_formats_fields_and_defaults(): pkg = { "name": "Example", From deb54e3b9254a0b1248ed1b055d69768e4cacd71 Mon Sep 17 00:00:00 2001 From: herr kaste Date: Tue, 24 
Mar 2026 12:32:03 +0100 Subject: [PATCH 5/5] Add `collect_logs` section to the README --- README.md | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 2e44f3173..4eabe6fcf 100644 --- a/README.md +++ b/README.md @@ -174,7 +174,23 @@ The command above reuses the same layout as [CI](https://github.com/packagecontr --- -### `refresh_logs.py` (plus lower-level helpers) +### Logs handling + +#### `collect_logs.py` + +`scripts/collect_logs.py` appends the current run's `notes.txt` to `logs.json` (rolling history), +keyed by run id and trimmed to a retention window (`--history-days`, default 32). + +If you pass a `--workspace`, the log entry also includes a `found_updates` list for packages detected in that run. + +```bash +uv run -m scripts.collect_logs --output ./logs.json --workspace ./workspace.json ./notes.txt +``` + +This is a very mechanical step performed in crawl.yml; after that job is done, we enrich the logs +in publish.yml: + +#### `refresh_logs.py` (plus lower-level helpers) Use this when you want to reproduce/update `logs.json` locally with GitHub Actions metadata. `gh` is required for the ad-hoc API queries these scripts make.
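For completeness, the same local flow can also be driven from Python rather than the CLI. This is a minimal sketch mirroring what `refresh_logs` does with its defaults; it assumes the repository root as the working directory, an authenticated `gh`, and an already existing `./logs.json`.

```python
# Minimal sketch (not part of the scripts): drive the two helpers from Python,
# mirroring scripts/refresh_logs.py with its default paths and 24-hour window.
# Assumes `gh` is installed and authenticated, and that ./logs.json already
# exists (refresh_logs would otherwise download it from the release first).
from scripts import enrich_logs, fetch_logs_metadata

repo = fetch_logs_metadata.resolve_repo(None)  # GITHUB_REPOSITORY or git origin
workflow_id = fetch_logs_metadata.resolve_workflow_id(None, repo, "crawl.yml")

fetch_logs_metadata.fetch_logs_metadata(fetch_logs_metadata.Args(
    repo=repo,
    workflow_id=workflow_id,
    runs_output="./workflow_runs.json",
    artifacts_output="./workflow_artifacts.json",
    since=fetch_logs_metadata.iso_utc_hours_ago(24),
    artifacts_max_pages=10,
    pretty=True,
))

enrich_logs.update_logs(enrich_logs.Args(
    input="./logs.json",
    output="./logs.json",
    runs="./workflow_runs.json",
    artifacts="./workflow_artifacts.json",
    pretty=True,
))
```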