diff --git a/.github/workflows/crawl.yml b/.github/workflows/crawl.yml index 111c2be86..df9146e29 100644 --- a/.github/workflows/crawl.yml +++ b/.github/workflows/crawl.yml @@ -29,6 +29,11 @@ jobs: - name: Checkout repository uses: actions/checkout@v6 + - name: Checkout registry worktree + run: | + git fetch origin the-registry + git worktree add --force -B the-registry ./.the-registry origin/the-registry + - name: Set up Python uses: actions/setup-python@v6 with: @@ -79,11 +84,15 @@ jobs: run: | set -o pipefail PYTHONUNBUFFERED=1 uv run -m scripts.generate_registry \ + --seed ./.the-registry/registry.json \ --channel https://raw.githubusercontent.com/packagecontrol/channel/refs/heads/main/repository.json \ --channel https://raw.githubusercontent.com/wbond/package_control_channel/refs/heads/master/channel.json \ -o ./wrk/registry.json \ 2> >(tee registry.log >&2) + - name: Sync registry branch + run: bash ./.github/workflows/sync_registry_branch.sh ./.the-registry/ ./wrk/registry.json + - name: Run crawler run: | set -o pipefail diff --git a/.github/workflows/sync_registry_branch.sh b/.github/workflows/sync_registry_branch.sh new file mode 100644 index 000000000..1770c99df --- /dev/null +++ b/.github/workflows/sync_registry_branch.sh @@ -0,0 +1,50 @@ +#!/usr/bin/env bash +set -euo pipefail + +UV_BINARY="${UV_BINARY:-uv}" +REGISTRY_DIR="${1:-./.the-registry}" +OLD="${REGISTRY_DIR%/}/registry.json" +NEW="${2:-./wrk/registry.json}" + +if cmp -s "$OLD" "$NEW"; then + echo "No registry changes." + exit 0 +fi + +MSG="" +DESC_ERR="$(mktemp)" +if ! MSG="$("$UV_BINARY" run -m scripts.describe_registry_changes -a "$OLD" -b "$NEW" 2>"$DESC_ERR")"; then + DESC_ERR_CONTENT="$(cat "$DESC_ERR")" + echo "::warning::describe_registry_changes failed; using fallback subject" + if [ -n "$DESC_ERR_CONTENT" ]; then + echo "::notice::describe_registry_changes stderr: $(head -n 1 "$DESC_ERR")" + MSG="$(cat < --channel ``` +`generate_registry` supports implicit lifecycle enrichment. I.e. 
newly discovered +packages get a `first_seen` timestamp, packages that are removed from the input +channels/repositories are re-added in a tombstoned way. Also input channels/repositories +that throw on access will lead to marked entries *if* a previous registry/seed is +present; this way we prevent packages from jumping between different sources, esp. +malicious ones. + +This behavior is on by default, however you might not notice it as we don't blindly +add "first_seen: now" stamps when there is in fact no prior registry found. + +Concretely, + +- default seed path is `--output`; this way running generate_registry multiple times + from the same directory, will use the file we wrote on the last run as seed-input. + +- `--seed [PATH]` enforces seed input (supports `registry.json`, `workspace.json`, + and `seed.json`-style package maps). Will raise if the seed is not present/readable. + +- `--no-seed` disables lifecycle enrichment (`first_seen`/`removed` tombstones); + however, source-failure marking still applies when prior seed data is available. + +```bash +uv run -m scripts.generate_registry --output registry.json +uv run -m scripts.generate_registry --output registry.json --seed ./the-registry/registry.json +uv run -m scripts.generate_registry --output registry.json --no-seed +``` + +To create a compact archival seed, use `generate_seed` with either a workspace +or a registry as input (but not both): + +```bash +uv run -m scripts.generate_seed --workspace ./workspace.json --output ./seed.json +uv run -m scripts.generate_seed --registry ./registry.json --output ./seed.json +``` + --- ### 2. 
`crawl.py` diff --git a/scripts/_explain_package.py b/scripts/_explain_package.py index 69fdc26f5..7567a31ef 100644 --- a/scripts/_explain_package.py +++ b/scripts/_explain_package.py @@ -1,8 +1,11 @@ from __future__ import annotations +from copy import deepcopy from difflib import SequenceMatcher import json -from typing import Any +from typing import Any, Literal + +from ._utils import parse_sublime_text_max from rich import box from rich.console import Console @@ -32,6 +35,64 @@ def print_package_explain( ) +def print_package_explain_effective(name: str, normalized: dict[str, Any]) -> None: + releases = normalized.get("releases", []) + sorted_releases = sorted_release_definitions(releases) + tags_mode = classify_tags_mode(sorted_releases) + + normalized_effective = deepcopy(normalized) + normalized_effective["releases"] = keep_newest_release_definitions(sorted_releases) + + if tags_mode: + effectively = "(effectively) " if tags_mode == "effective" else "" + print(f"{name} uses {effectively}the tags-mode.") + print(json.dumps(normalized_effective, ensure_ascii=False, sort_keys=True)) + + +def classify_tags_mode( + sorted_releases: list[dict[str, Any]], +) -> bool | Literal["effective"]: + if not sorted_releases: + return False + + if all(release_uses_tags_mode(release) for release in sorted_releases): + return True + + if release_uses_tags_mode(sorted_releases[-1]): + return "effective" + + return False + + +def sorted_release_definitions(releases: list[dict[str, Any]]) -> list[dict[str, Any]]: + return sorted(releases, key=release_definition_sort_key) + + +def keep_newest_release_definitions( + releases: list[dict[str, Any]], +) -> list[dict[str, Any]]: + if not releases: + return [] + + newest_build = parse_sublime_text_max(releases[-1].get("sublime_text")) + return [ + release + for release in releases + if parse_sublime_text_max(release.get("sublime_text")) == newest_build + ] + + +def release_definition_sort_key(release: dict[str, Any]) -> tuple[float, str]: 
+ return ( + parse_sublime_text_max(release.get("sublime_text")), + _tags_sort_value(release.get("tags")), + ) + + +def release_uses_tags_mode(release: dict[str, Any]) -> bool: + return bool(release.get("tags", False)) + + def print_library_explain( name: str, rows: list[tuple[dict[str, Any], list[dict[str, Any]]]], @@ -85,6 +146,15 @@ def print_library_explain( console.print(table) +def _tags_sort_value(value: Any) -> str: + # Place plain `True` after common prefixes like `st2-`. + if value is True: + return "~~true" + if isinstance(value, str): + return value + return "" + + def _render_json_diff_table( title: str, left_obj: dict[str, Any] | list[Any], diff --git a/scripts/_utils.py b/scripts/_utils.py index 979497940..2ed1dd232 100644 --- a/scripts/_utils.py +++ b/scripts/_utils.py @@ -278,6 +278,41 @@ def is_semver(s: str) -> bool: return parse_version(s) is not None +def parse_sublime_text_max(selector: object) -> float: + if not isinstance(selector, str): + return float("inf") + + s = re.sub(r"\s+", "", selector) + if s in ("", "*"): + return float("inf") + + range_index = s.find("-") + if range_index != -1: + right = s[range_index + 1:] + n = parse_int_prefix(right) + return float(n) if n is not None else float("inf") + + if s.startswith("<="): + n = parse_int_prefix(s[2:]) + return float(n) if n is not None else float("inf") + + if s.startswith("<"): + n = parse_int_prefix(s[1:]) + return float(max(0, n - 1)) if n is not None else float("inf") + + if s.startswith(">=") or s.startswith(">"): + return float("inf") + + n = parse_int_prefix(s) + return float(n) if n is not None else float("inf") + + +def parse_int_prefix(text: str) -> int | None: + if match := re.match(r"^\d+", text): + return int(match.group(0)) + return None + + SECONDS_PER_DAY = 24 * 60 * 60 diff --git a/scripts/crawl.py b/scripts/crawl.py index d7b91d516..cb98100dc 100644 --- a/scripts/crawl.py +++ b/scripts/crawl.py @@ -10,13 +10,13 @@ import os import re import sys -from typing import 
Any, Literal, Mapping, NotRequired, Required, TypedDict +from typing import Literal, Mapping, NotRequired, Required, TypedDict import packaging from packaging.specifiers import SpecifierSet from .bitbucket import fetch_bitbucket_info, RepoInfo as BitbucketRepoInfo -from .generate_registry import Registry, PackageEntry as RegistryEntry +from .generate_registry import Registry, RegistryEntry from .github import ( fetch_github_info, rate_limit_info, RepoInfo as GithubRepoInfo, ReleaseAssetInfo @@ -29,10 +29,17 @@ normalize_version_spec, ) from ._utils import ( - format_name_list, parse_version, resolve_url, update_url, write_json, pl, pick, + format_name_list, + parse_sublime_text_max, + parse_version, + resolve_url, + update_url, + write_json, + pl, + pick, VersionInfo, ) -from ._explain_package import print_package_explain +from ._explain_package import print_package_explain, print_package_explain_effective import traceback @@ -41,8 +48,11 @@ EXPLAIN_EFFECTIVE_ENV = "EFFECTIVE" UTC_FORMAT = "%Y-%m-%dT%H:%M:%SZ" STYLIZED_DATETIME_FORMAT = "%Y-%m-%d %H:%M:%S" +MAIN_REPOSITORY_SOURCE = ( + "https://raw.githubusercontent.com/wbond/package_control_channel/refs/heads/master/repository.json" +) TRUSTED_SOURCES = { - "https://raw.githubusercontent.com/wbond/package_control_channel/refs/heads/master/repository.json", + MAIN_REPOSITORY_SOURCE, "https://raw.githubusercontent.com/sublimelsp/repository/main/repository.json", "https://raw.githubusercontent.com/SublimeLinter/package_control_channel/master/packages.json", } @@ -130,8 +140,15 @@ def explain_main(registry: str, name: str) -> int: err(f"Package '{name}' not found in registry.") return 1 + effective_mode = env_flag(EXPLAIN_EFFECTIVE_ENV) + if "removed" in package: + err(f"Package '{name}' is tombstoned in the registry.") + if not effective_mode: + print(json.dumps(package, indent=2, ensure_ascii=False, sort_keys=True)) + return 0 + normalized = normalize_registry_entry(deepcopy(package)) - if 
env_flag(EXPLAIN_EFFECTIVE_ENV): + if effective_mode: print_package_explain_effective(name, normalized) # type: ignore[arg-type] return 0 @@ -139,73 +156,6 @@ def explain_main(registry: str, name: str) -> int: return 0 -def print_package_explain_effective(name: str, normalized: dict[str, Any]) -> None: - releases = normalized.get("releases", []) - sorted_releases = sorted_release_definitions(releases) - tags_mode = classify_tags_mode(sorted_releases) - - normalized_effective = deepcopy(normalized) - normalized_effective["releases"] = keep_newest_release_definitions(sorted_releases) - - if tags_mode: - effectively = "(effectively) " if tags_mode == "effective" else "" - print(f"{name} uses {effectively}the tags-mode.") - print(json.dumps(normalized_effective, ensure_ascii=False, sort_keys=True)) - - -def classify_tags_mode( - sorted_releases: list[dict[str, Any]], -) -> bool | Literal["effective"]: - if not sorted_releases: - return False - - if all(release_uses_tags_mode(release) for release in sorted_releases): - return True - - if release_uses_tags_mode(sorted_releases[-1]): - return "effective" - - return False - - -def sorted_release_definitions(releases: list[dict[str, Any]]) -> list[dict[str, Any]]: - return sorted(releases, key=release_definition_sort_key) - - -def keep_newest_release_definitions( - releases: list[dict[str, Any]], -) -> list[dict[str, Any]]: - if not releases: - return [] - - newest_build = parse_sublime_text_max(releases[-1].get("sublime_text")) - return [ - release - for release in releases - if parse_sublime_text_max(release.get("sublime_text")) == newest_build - ] - - -def release_definition_sort_key(release: dict[str, Any]) -> tuple[float, str]: - return ( - parse_sublime_text_max(release.get("sublime_text")), - tags_sort_value(release.get("tags")), - ) - - -def tags_sort_value(value: Any) -> str: - # Place plain `True` after common prefixes like `st2-`. 
- if value is True: - return "~~true" - if isinstance(value, str): - return value - return "" - - -def release_uses_tags_mode(release: dict[str, Any]) -> bool: - return bool(release.get("tags", False)) - - async def main( registry: str, workspace: str, @@ -248,6 +198,9 @@ async def main_( if not package: err(f"Package '{name}' not found in registry.") return + if "removed" in package: + err(f"Package '{name}' is tombstoned in the registry.") + return tocrawl = [package] else: maintenance(registry, workspace) @@ -302,6 +255,7 @@ def next_packages_to_crawl( entry for entry in packages if not entry.get("fetching_source_failed") + if "removed" not in entry if presto or ( workspace["packages"] # type: ignore[call-overload] .get(entry["name"], {}) @@ -319,6 +273,7 @@ def next_packages_to_crawl( entry for entry in packages if not entry.get("fetching_source_failed") + if "removed" not in entry ), key=lambda pkg: ( workspace["packages"] # type: ignore[call-overload] @@ -362,12 +317,18 @@ def next_packages_to_crawl( def maintenance(registry: Registry, workspace: Workspace) -> None: + packages = workspace["packages"] + + for entry in registry["packages"]: + if "removed" in entry: + packages[entry["name"]] = {**entry} # type: ignore[typeddict-item] + + # Legacy; # lookup all packages in workspace and mark them as `removed` # if they have been removed from the registry now = now_ts() now_string = now.strftime(UTC_FORMAT) current_package_names = {entry["name"] for entry in registry["packages"]} - packages = workspace["packages"] for name in packages.keys() - current_package_names: packages[name].setdefault("removed", now_string) @@ -385,6 +346,12 @@ async def crawl( out = await crawl_package(session, package, existing) except Exception as e: out = {**existing} + + # Keep existing source authoritative when present (important for denied + # source moves). Only backfill source from the registry package when + # the existing workspace entry has never had one. 
+ out.setdefault("source", package["source"]) + out["failing_since"] = existing.get("failing_since", now_string) # We mark errors as fatal if we MUST de-list the package immediately. @@ -833,18 +800,34 @@ def ensure_secure_source( entry: RegistryEntry, existing: WorkspaceEntry ) -> None: + existing_source = source_for_security_check(existing) + entry_source = entry.get("source") if ( - existing.get("source") - and entry.get("source") - and existing.get("source") != entry.get("source") - and entry.get("source") not in TRUSTED_SOURCES + existing_source + and entry_source + and existing_source != entry_source + and entry_source not in TRUSTED_SOURCES ): + source_display = existing.get("source") or "" raise DeniedUpdating( f"Repository source changed for *{entry.get('name')}* from " - f"{existing.get('source')} to untrusted {entry.get('source')}" + f"{source_display} to untrusted {entry_source}" ) +def source_for_security_check(existing: WorkspaceEntry) -> str | None: + source = existing.get("source") + if source: + return source + + if existing.get("removed"): + # Old imported tombstones can miss source. + # For security checks we treat them as coming from the main trusted source. 
from __future__ import annotations

import argparse
import json
import sys
from dataclasses import dataclass
from pathlib import Path
from typing import Any, Iterable

# Subject used whenever the change cannot be summarized more precisely.
FALLBACK_SUBJECT = "Update registry.json"


def main(argv: list[str] | None = None) -> int:
    """Print a commit message describing the difference of two registries.

    Args:
        argv: Command line arguments; ``None`` lets argparse read ``sys.argv``.

    Returns:
        Always ``0``.  Failures while *reading* the files propagate as
        exceptions; failures while *describing* the changes fall back to
        ``FALLBACK_SUBJECT``.
    """
    args = parse_args(argv)
    old_registry = read_registry_json(args.old)
    new_registry = read_registry_json(args.new)

    try:
        message = describe_registry_changes(old_registry, new_registry)
    except Exception as exc:
        # Best effort by design: a generic subject beats no message at all.
        # Leave a trace on stderr so the cause is not lost silently.
        print(
            f"describe_registry_changes: using fallback subject: {exc!r}",
            file=sys.stderr,
        )
        message = FALLBACK_SUBJECT

    print(message)
    return 0


@dataclass
class Args:
    """Parsed command line arguments."""

    old: str  # path to the old registry JSON
    new: str  # path to the new registry JSON


def parse_args(argv: list[str] | None = None) -> Args:
    """Parse ``-a/--old`` and ``-b/--new`` (both required) into ``Args``."""
    parser = argparse.ArgumentParser(
        description="Describe registry changes.",
    )
    parser.add_argument("-a", "--old", required=True, help="Path to old registry JSON")
    parser.add_argument("-b", "--new", required=True, help="Path to new registry JSON")
    ns = parser.parse_args(argv)
    return Args(old=ns.old, new=ns.new)


def describe_registry_changes(
    old_registry: dict[str, Any],
    new_registry: dict[str, Any],
) -> str:
    """Summarize the package/library changes between two registries.

    Returns:
        - ``"Same."`` when nothing differs at all,
        - ``FALLBACK_SUBJECT`` when only ``repositories`` or other top-level
          keys differ, when an entry vanished without a tombstone, or when
          other top-level keys changed alongside entry changes,
        - a single-line subject for exactly one added, tombstoned,
          resurrected or metadata-changed entry,
        - a multi-line bulk message otherwise.

    Raises:
        KeyError: If an entry in ``packages``/``libraries`` has no ``name``.
    """
    package_changes = collect_changes(
        old_items=entity_map(old_registry, "packages"),
        new_items=entity_map(new_registry, "packages"),
        kind="package",
    )
    library_changes = collect_changes(
        old_items=entity_map(old_registry, "libraries"),
        new_items=entity_map(new_registry, "libraries"),
        kind="library",
    )
    changes = merge_changes(package_changes, library_changes)

    repositories_changed = old_registry.get("repositories") != new_registry.get("repositories")
    other_changed = strip_for_other_compare(old_registry) != strip_for_other_compare(new_registry)

    if not changes.any_changes:
        if repositories_changed or other_changed:
            return FALLBACK_SUBJECT
        return "Same."

    if changes.disappeared or other_changed:
        return FALLBACK_SUBJECT

    if changes.single_added:
        return f"Added `{changes.added[0]}`"

    if changes.single_tombstoned:
        return f"Tombstoned `{changes.tombstoned[0]}`"

    if changes.single_resurrected:
        return f"Resurrected `{changes.resurrected[0]}`"

    if changes.single_metadata_changed:
        return f"Changed metadata of `{changes.metadata_changed[0]}`"

    if changes.only_metadata_changed:
        return build_metadata_bulk_message(changes.metadata_changed)

    # `any_changes` is true and `disappeared` is empty, so at least one of the
    # four remaining categories is populated: always a bulk edit from here on.
    return build_bulk_edit_message(
        added=changes.added,
        tombstoned=changes.tombstoned,
        resurrected=changes.resurrected,
        metadata_changed=changes.metadata_changed,
    )


@dataclass
class ChangeSet:
    """Display names of changed entries, grouped by kind of change."""

    added: list[str]
    tombstoned: list[str]
    resurrected: list[str]
    metadata_changed: list[str]
    # Entries present in the old registry but missing from the new one.
    disappeared: list[str]

    @property
    def any_changes(self) -> bool:
        """True when any category, including ``disappeared``, is non-empty."""
        return bool(
            self.added
            or self.tombstoned
            or self.resurrected
            or self.metadata_changed
            or self.disappeared
        )

    # The `single_*` / `*_only` checks below do not look at `disappeared`;
    # `describe_registry_changes` handles that category before using them.

    @property
    def single_added(self) -> bool:
        return self.added_only and len(self.added) == 1

    @property
    def single_tombstoned(self) -> bool:
        return self.tombstoned_only and len(self.tombstoned) == 1

    @property
    def single_resurrected(self) -> bool:
        return self.resurrected_only and len(self.resurrected) == 1

    @property
    def single_metadata_changed(self) -> bool:
        return self.only_metadata_changed and len(self.metadata_changed) == 1

    @property
    def added_only(self) -> bool:
        return bool(self.added) and not (
            self.tombstoned or self.resurrected or self.metadata_changed
        )

    @property
    def tombstoned_only(self) -> bool:
        return bool(self.tombstoned) and not (
            self.added or self.resurrected or self.metadata_changed
        )

    @property
    def resurrected_only(self) -> bool:
        return bool(self.resurrected) and not (
            self.added or self.tombstoned or self.metadata_changed
        )

    @property
    def only_metadata_changed(self) -> bool:
        return bool(self.metadata_changed) and not (
            self.added or self.tombstoned or self.resurrected
        )


def merge_changes(*changesets: ChangeSet) -> ChangeSet:
    """Concatenate several change sets category by category, re-sorted."""
    return ChangeSet(
        added=sort_names(name for c in changesets for name in c.added),
        tombstoned=sort_names(name for c in changesets for name in c.tombstoned),
        resurrected=sort_names(name for c in changesets for name in c.resurrected),
        metadata_changed=sort_names(name for c in changesets for name in c.metadata_changed),
        disappeared=sort_names(name for c in changesets for name in c.disappeared),
    )


def read_registry_json(path: str) -> dict[str, Any]:
    """Read and parse the UTF-8 encoded JSON file at *path*."""
    return json.loads(Path(path).read_text(encoding="utf-8"))


def collect_changes(
    old_items: dict[str, dict[str, Any]],
    new_items: dict[str, dict[str, Any]],
    *,
    kind: str,
) -> ChangeSet:
    """Classify the differences between two name -> entry maps.

    A name only present in *new_items* counts as tombstoned when its entry
    carries ``"removed"``, otherwise as added.  For names present on both
    sides a flip of the tombstone state wins over a plain metadata change.
    """
    added: list[str] = []
    tombstoned: list[str] = []
    resurrected: list[str] = []
    metadata_changed: list[str] = []

    disappeared = [display_name(kind, name) for name in old_items.keys() - new_items.keys()]

    for name in sort_names(new_items.keys() - old_items.keys()):
        display = display_name(kind, name)
        if is_tombstoned(new_items[name]):
            tombstoned.append(display)
        else:
            added.append(display)

    for name in sort_names(old_items.keys() & new_items.keys()):
        old_entry = old_items[name]
        new_entry = new_items[name]
        display = display_name(kind, name)
        old_tombstoned = is_tombstoned(old_entry)
        new_tombstoned = is_tombstoned(new_entry)

        if old_tombstoned and not new_tombstoned:
            resurrected.append(display)
            continue

        if not old_tombstoned and new_tombstoned:
            tombstoned.append(display)
            continue

        if old_entry != new_entry:
            metadata_changed.append(display)

    return ChangeSet(
        added=sort_names(added),
        tombstoned=sort_names(tombstoned),
        resurrected=sort_names(resurrected),
        metadata_changed=sort_names(metadata_changed),
        disappeared=sort_names(disappeared),
    )


def entity_map(registry: dict[str, Any], key: str) -> dict[str, dict[str, Any]]:
    """Index the entries listed under *key* by their ``"name"``.

    A missing *key* yields an empty map; on duplicate names the last entry
    wins.

    Raises:
        KeyError: If an entry has no ``"name"``.
    """
    entries = registry.get(key, [])
    return {
        entry["name"]: entry
        for entry in entries
    }


def strip_for_other_compare(registry: dict[str, Any]) -> dict[str, Any]:
    """Return the registry without ``packages``, ``libraries``, ``repositories``."""
    return {
        key: value
        for key, value in registry.items()
        if key not in {"packages", "libraries", "repositories"}
    }


def build_metadata_bulk_message(metadata_changed: list[str]) -> str:
    """Build the multi-line message for a metadata-only bulk edit."""
    lines = [
        "Metadata bulk edit",
        "",
        "Record metadata edits of the following packages:",
        *[f"- {name}" for name in metadata_changed],
    ]
    return "\n".join(lines)


def build_bulk_edit_message(
    *,
    added: list[str],
    tombstoned: list[str],
    resurrected: list[str],
    metadata_changed: list[str],
) -> str:
    """Build the multi-line message for a mixed bulk edit.

    Each non-empty category contributes a blank line, a heading and one
    ``- name`` bullet per entry.
    """
    sections: list[str] = ["Bulk edit"]

    if added:
        sections.extend([
            "",
            "Record addition of following packages:",
            *[f"- {name}" for name in added],
        ])

    if tombstoned:
        sections.extend([
            "",
            "Record tombstoning the following packages:",
            *[f"- {name}" for name in tombstoned],
        ])

    if resurrected:
        sections.extend([
            "",
            "Record resurrection of the following packages:",
            *[f"- {name}" for name in resurrected],
        ])

    if metadata_changed:
        sections.extend([
            "",
            "Record metadata edit of the following packages:",
            *[f"- {name}" for name in metadata_changed],
        ])

    return "\n".join(sections)


def sort_names(names: Iterable[str]) -> list[str]:
    """Return *names* as a list sorted case-insensitively."""
    return sorted(names, key=str.casefold)


def display_name(kind: str, name: str) -> str:
    """Return *name*, suffixed with `` (library)`` for libraries."""
    if kind == "library":
        return f"{name} (library)"
    return name


def is_tombstoned(entry: dict[str, Any]) -> bool:
    """Return ``True`` when the entry carries a ``"removed"`` key."""
    return "removed" in entry


if __name__ == "__main__":
    raise SystemExit(main())
b/scripts/generate_registry.py @@ -3,15 +3,16 @@ import argparse import asyncio import aiohttp +from dataclasses import dataclass from datetime import datetime, timezone import json import os import sys import time from urllib.parse import urlparse -from typing import Callable, Iterable, Mapping, NotRequired, TypedDict +from typing import Any, Callable, Iterable, Mapping, NotRequired, TypedDict, TypeGuard -from ._utils import flatten, resolve_urls, update_url, write_json, pl +from ._utils import flatten, pick, resolve_urls, update_url, write_json, pl DEFAULT_OUTPUT_FILE = "./registry.json" @@ -26,25 +27,48 @@ type IsoTimestamp = str -class PackageEntry(TypedDict, total=False): +class RawRepositoryEntry(TypedDict, total=False): + name: str + details: NotRequired[str] + labels: NotRequired[list[str]] + + +class RegistryEntry(TypedDict, total=False): source: Url schema_version: str name: str details: NotRequired[str] + labels: NotRequired[list[str]] + first_seen: NotRequired[IsoTimestamp] + removed: NotRequired[IsoTimestamp] fetching_source_failed: NotRequired[IsoTimestamp] +class SeedEntry(TypedDict): + name: str + first_seen: IsoTimestamp + source: NotRequired[Url | None] + removed: NotRequired[IsoTimestamp] + labels: NotRequired[list[str]] + + class Registry(TypedDict): repositories: list[str] - packages: list[PackageEntry] - libraries: list[PackageEntry] + packages: list[RegistryEntry] + libraries: list[RegistryEntry] class RepositorySchema(TypedDict): self: Url schema_version: str - packages: list[PackageEntry] - libraries: list[PackageEntry] + packages: list[RawRepositoryEntry] + libraries: list[RawRepositoryEntry] + + +@dataclass +class SeedDb: + db: dict[str, Any] + has_registry_shape: bool def parse_args() -> argparse.Namespace: @@ -67,30 +91,72 @@ def parse_args() -> argparse.Namespace: "If not given, uses the official channel from wbond/package_control_channel." 
), ) + parser.add_argument( + "--seed", + nargs="?", + const="", + default=None, + help=( + "Seed input for lifecycle enrichment. Omit to use implicit seed mode: " + "read --output if available, otherwise continue without lifecycle fields. " + "Provide without a value to require --output as seed (fail if unreadable), " + "or provide a path to require that file. Supports registry.json, " + "workspace.json, or seed.json-style package maps." + ), + ) + parser.add_argument( + "--no-seed", + action="store_true", + help="Disable lifecycle enrichment and emit raw registry output.", + ) return parser.parse_args() -async def main(output_file: str, channels: list[str]) -> None: - # Try to read previous db if it exists - try: - with open(output_file, 'r') as f: - prev_db = json.load(f) - except (OSError, json.JSONDecodeError): - prev_db = {} +async def main( + output_file: str, + channels: list[str], + *, + seed_path: str | None = None, + no_seed: bool = False, +) -> None: + effective_seed_path, strict_seed = resolve_seed_path( + output_file=output_file, + seed_path=seed_path, + ) + seed = read_seed_db(effective_seed_path, strict=strict_seed) + failure_recovery = resolve_failure_recovery_db( + output_file, + effective_seed_path, + seed, + ) try: async with asyncio.timeout(GLOBAL_TIMEOUT): - db = await fetch_packages(channels, prev_db) + db = await fetch_packages( + channels, + failure_recovery, + seed=seed, + no_seed=no_seed, + ) + if seed and not no_seed: + apply_seed_lifecycle(db, seed, now_utc_string()) + write_json(output_file, db, pretty=True, ensure_ascii=True) print(f"Saved registry as {output_file}") except asyncio.TimeoutError: print(f"Timeout: script took more than {GLOBAL_TIMEOUT} seconds") -async def fetch_packages(channels: list[str], db: Registry = None) -> Registry: +async def fetch_packages( + channels: list[str], + recovery_db: Registry | None = None, + *, + seed: SeedDb | None = None, + no_seed: bool = False, +) -> Registry: print("Fetching registered 
packages...") now = time.monotonic() - now_string = datetime.now(timezone.utc).strftime(UTC_FORMAT) + now_string = now_utc_string() async with aiohttp.ClientSession() as session: # Fetch repositories from all channels in parallel @@ -100,24 +166,35 @@ async def fetch_packages(channels: list[str], db: Registry = None) -> Registry: repos: list[str] = list(flatten(repos_lists)) unseen = Unseen(repos) sem = asyncio.Semaphore(MAX_CONCURRENCY) + repo_results = await asyncio.gather(*[ + asyncio.create_task(fetch_repository(url, unseen, sem, session)) + for url in repos + ], return_exceptions=True) + result: dict[Url, RepositorySchema] = {} - result = { - repo["self"]: repo - for repo in await asyncio.gather(*[ - asyncio.create_task(fetch_repository(url, unseen, sem, session)) - for url in repos - ]) - if repo - if not repo.get("schema_version", "1.").startswith("1.") - } + for url, repo_result in zip(repos, repo_results): + if isinstance(repo_result, Exception): + err(f"Error fetching {url}: {repo_result}") + warn_unrecoverable_seed_entries( + url, + recovery_db=recovery_db, + seed=seed, + no_seed=no_seed, + ) + continue + if isinstance(repo_result, BaseException): + raise repo_result + + if not repo_result.get("schema_version", "1.").startswith("1."): + result[repo_result["self"]] = repo_result # Flatten packages and libraries, adding source, schema_version, and # ensuring a unique name. 
- def add_unique_(container: list[PackageEntry], kind: str) -> Callable[[PackageEntry], None]: + def add_unique_(container: list[RegistryEntry], kind: str) -> Callable[[RegistryEntry], None]: seen = set() - def add(entry: PackageEntry) -> None: + def add(entry: RegistryEntry) -> None: name = extract_package_name(entry) if name and name not in seen: seen.add(name) @@ -132,32 +209,32 @@ def add(entry: PackageEntry) -> None: return add - packages: list[PackageEntry] = [] - libraries: list[PackageEntry] = [] + packages: list[RegistryEntry] = [] + libraries: list[RegistryEntry] = [] add_package = add_unique_(packages, "Package") add_library = add_unique_(libraries, "Library") for url in repos: if repo := result.get(url): - repo_info: PackageEntry + repo_info: RegistryEntry repo_info = { "source": repo["self"], "schema_version": repo["schema_version"], } for pkg in repo["packages"]: - add_package(pkg | repo_info) + add_package(pkg | repo_info) # type: ignore[arg-type] for library in repo["libraries"]: - add_library(library | repo_info) + add_library(library | repo_info) # type: ignore[arg-type] - elif db: - # recreate the repo from db - fail_info: PackageEntry + elif recovery_db: + # recreate the repo from recovery_db (always registry-shaped) + fail_info: RegistryEntry fail_info = {"fetching_source_failed": now_string} - for pkg in db.get("packages", []): + for pkg in recovery_db.get("packages", []): if pkg.get("source") == url: add_package(fail_info | pkg) - for library in db.get("libraries", []): + for library in recovery_db.get("libraries", []): if library.get("source") == url: add_library(fail_info | library) @@ -215,13 +292,9 @@ async def fetch_repository( location: Url, unseen: Unseen[Url], sem: asyncio.Semaphore, - session: aiohttp.ClientSession -) -> RepositorySchema | None: - try: - result = await __fetch_repo(location, sem, session) - except Exception as e: - err(f"Error fetching {location}: {e}") - return None + session: aiohttp.ClientSession, +) -> 
RepositorySchema: + result = await __fetch_repo(location, sem, session) repository: RepositorySchema = { "self": location, @@ -269,6 +342,184 @@ async def http_get(location: str, session: aiohttp.ClientSession) -> str: return await resp.text() +def resolve_seed_path(output_file: str, seed_path: str | None) -> tuple[str, bool]: + if seed_path is None: + return output_file, False + + if seed_path == "": + return output_file, True + + return os.path.abspath(seed_path), True + + +def read_seed_db(path: str, *, strict: bool) -> SeedDb | None: + try: + text = open(path, "r", encoding="utf-8").read() + except OSError as exc: + if strict: + raise FileNotFoundError(f"Could not read seed path: {path}") from exc + return None + + try: + data = json.loads(text) + except json.JSONDecodeError as exc: + if strict: + raise ValueError(f"Seed is not valid JSON: {path}") from exc + return None + + if not isinstance(data, dict): + if strict: + raise ValueError(f"Seed JSON must be an object: {path}") + return None + + return SeedDb( + db=data, + has_registry_shape=is_registry_recovery_db(data), + ) + + +def resolve_failure_recovery_db( + output_file: str, + effective_seed_path: str, + seed: SeedDb | None, +) -> Registry | None: + if seed and seed.has_registry_shape: + return seed.db # type: ignore[return-value] + + output_is_seed = os.path.abspath(output_file) == os.path.abspath(effective_seed_path) + if not output_is_seed: + output_db = read_seed_db(output_file, strict=False) + if output_db and output_db.has_registry_shape: + return output_db.db # type: ignore[return-value] + + return None + + +def is_registry_recovery_db(db: Mapping[str, Any]) -> TypeGuard[Registry]: + return ( + isinstance(db.get("packages"), list) + and isinstance(db.get("libraries"), list) + ) + + +def apply_seed_lifecycle( + registry: Registry, + seed: SeedDb, + now_string: IsoTimestamp, +) -> None: + seed_packages = { + entry["name"]: entry + for entry in iter_package_entries(seed.db) + } + current = { + 
pkg["name"]: pkg + for pkg in registry["packages"] + } + + for name, package in current.items(): + entry = seed_packages.get(name) + if entry and (first_seen := entry.get("first_seen")): + package["first_seen"] = first_seen + elif "first_seen" not in package: + package["first_seen"] = now_string + + for name, entry in seed_packages.items(): + if name not in current: + current[name] = build_tombstone(entry, now_string) + + registry["packages"] = \ + sorted(current.values(), key=lambda entry: entry["name"].casefold()) + + +def iter_package_entries(db: Mapping[str, Any]) -> Iterable[SeedEntry]: + entries = db.get("packages") + # Shape: registry.json + if isinstance(entries, list): + for entry in entries: + yield entry + + # Shape: workspace.json + elif isinstance(entries, dict): + for name, entry in entries.items(): + yield entry + + # Shape: seed.json + elif "packages" not in db: + for name, entry in db.items(): + yield entry + + +def warn_unrecoverable_seed_entries( + source_url: str, + *, + recovery_db: Registry | None, + seed: SeedDb | None, + no_seed: bool, +) -> None: + if seed is None: + return + + if has_recovery_entries_for_source(recovery_db, source_url): + return + + mode_outcome = "dropped" if no_seed else "tombstoned" + + if is_compact_seed(seed.db): + err( + "ATTENTION: repository recovery cannot be guaranteed with a compact seed. " + "Check the output. Consider using a full registry.json seed for complete " + f"recovery; missing packages are {mode_outcome}." + ) + return + + if lost_names := seed_package_names_for_source(seed.db, source_url): + err( + "ATTENTION: seed file knows " + f"{pl(len(lost_names), 'packages')} in the failed repository " + "but has no data to recover full entries; " + f"these are {mode_outcome}." 
+ ) + + +def has_recovery_entries_for_source( + recovery_db: Registry | None, + source_url: str, +) -> bool: + if not recovery_db: + return False + + for kind in ("packages", "libraries"): + for entry in recovery_db[kind]: + if entry.get("source") == source_url: + return True + + return False + + +def seed_package_names_for_source(seed_db: Mapping[str, Any], source_url: str) -> list[str]: + names = { + entry["name"] + for entry in iter_package_entries(seed_db) + if entry.get("source") == source_url + } + return sorted(names, key=str.casefold) + + +def is_compact_seed(seed_db: Mapping[str, Any]) -> bool: + return "packages" not in seed_db + + +def build_tombstone(seed: Mapping[str, Any], now_string: IsoTimestamp) -> RegistryEntry: + return ( + {"first_seen": now_string, "removed": now_string} # type: ignore[operator] + | pick(("name", "source", "first_seen", "removed", "labels"), seed) + ) + + +def now_utc_string() -> IsoTimestamp: + return datetime.now(timezone.utc).strftime(UTC_FORMAT) + + def err(*args, **kwargs) -> None: print(*args, **kwargs, file=sys.stderr) @@ -308,4 +559,16 @@ def extend(self, items: Iterable[T]) -> Iterable[T]: args = parse_args() output_file = os.path.abspath(args.output) channels = args.channel if args.channel else [DEFAULT_CHANNEL] - asyncio.run(main(output_file, channels)) + seed_path = ( + None + if args.seed is None + else (output_file if args.seed == "" else os.path.abspath(args.seed)) + ) + asyncio.run( + main( + output_file, + channels, + seed_path=seed_path, + no_seed=args.no_seed, + ) + ) diff --git a/scripts/generate_seed.py b/scripts/generate_seed.py new file mode 100644 index 000000000..2434f373d --- /dev/null +++ b/scripts/generate_seed.py @@ -0,0 +1,175 @@ +from __future__ import annotations + +import argparse +import json +from pathlib import Path +import sys +from typing import Any, Literal, Mapping, NotRequired, TypedDict + +from ._utils import pick + +DEFAULT_WORKSPACE = "workspace.json" +DEFAULT_REGISTRY = 
"registry.json" +DEFAULT_OUTPUT = "seed.json" +INCOMPLETE_SHAPE_WARN_THRESHOLD = 0.10 + + +type IsoTimestamp = str +type Url = str + + +class ActivePackage(TypedDict): + name: str + first_seen: IsoTimestamp + + +class TombstonedPackage(TypedDict): + name: str + first_seen: IsoTimestamp + source: NotRequired[Url | None] + removed: IsoTimestamp + labels: NotRequired[list[str]] + + +SeedEntry = ActivePackage | TombstonedPackage + + +def main() -> None: + args = parse_args() + input_kind, input_path = resolve_input(args) + input_data = load_json(input_path) + + seed = ( + extract_seed_from_workspace(input_data) + if input_kind == "workspace" + else extract_seed_from_registry(input_data) + ) + write_seed(args.output, seed) + + removed_count = sum(1 for entry in seed.values() if "removed" in entry) + alive_count = len(seed) - removed_count + print( + f"Wrote {len(seed)} entries to {args.output} " + f"({alive_count} alive, {removed_count} removed)." + ) + + incomplete_count, total_count = count_incomplete_shapes(seed) + if should_warn_about_incomplete_shapes(incomplete_count, total_count): + if total_count > 0 and incomplete_count == total_count: + msg = "Check the output. All packages have an incomplete shape." + else: + incomplete_percent = (incomplete_count * 100) / total_count + msg = ( + "Check the output. " + f"{incomplete_percent:.1f}% of the packages have an incomplete shape." + ) + + print(msg, file=sys.stderr) + + +class Args(argparse.Namespace): + workspace: str | None + registry: str | None + output: str + + +def parse_args() -> Args: + parser = argparse.ArgumentParser( + description=( + "Generate a compact seed JSON from either workspace.json or registry.json" + ), + ) + input_group = parser.add_mutually_exclusive_group(required=True) + input_group.add_argument( + "--workspace", + nargs="?", + const=DEFAULT_WORKSPACE, + help=( + "Path to workspace.json. If provided without a value, defaults to " + f"{DEFAULT_WORKSPACE}." 
+ ), + ) + input_group.add_argument( + "--registry", + nargs="?", + const=DEFAULT_REGISTRY, + help=( + "Path to registry.json. If provided without a value, defaults to " + f"{DEFAULT_REGISTRY}." + ), + ) + parser.add_argument( + "-o", + "--output", + default=DEFAULT_OUTPUT, + help=f"Output path for seed JSON (default: {DEFAULT_OUTPUT})", + ) + return parser.parse_args(namespace=Args()) + + +def resolve_input(args: Args) -> tuple[Literal["workspace", "registry"], str]: + if args.workspace is not None: + return "workspace", args.workspace + if args.registry is not None: + return "registry", args.registry + raise AssertionError("Either --workspace or --registry must be provided") + + +def load_json(path: str) -> dict[str, Any]: + return json.loads(Path(path).read_text(encoding="utf-8")) + + +def extract_seed_from_workspace(workspace: dict[str, Any]) -> dict[str, Mapping[str, Any]]: + seed: dict[str, Mapping[str, Any]] = {} + for package in workspace["packages"].values(): + entry = build_seed_entry(package) + seed[entry["name"]] = entry + return sort_seed(seed) + + +def extract_seed_from_registry(registry: dict[str, Any]) -> dict[str, Mapping[str, Any]]: + seed: dict[str, Mapping[str, Any]] = {} + for package in registry["packages"]: + entry = build_seed_entry(package) + seed[entry["name"]] = entry + return sort_seed(seed) + + +def build_seed_entry(package: dict[str, Any]) -> Mapping[str, Any]: + if "removed" in package: + return pick(("name", "first_seen", "removed", "labels", "source"), package) + else: + return pick(("name", "first_seen"), package) + + +def sort_seed(seed: dict[str, Mapping[str, Any]]) -> dict[str, Mapping[str, Any]]: + return dict(sorted(seed.items(), key=lambda item: item[0].casefold())) + + +def count_incomplete_shapes(seed: Mapping[str, Mapping[str, Any]]) -> tuple[int, int]: + incomplete_count = 0 + total_count = len(seed) + for entry in seed.values(): + expected_key_count = 5 if "removed" in entry else 2 + if len(entry) != expected_key_count: 
+ incomplete_count += 1 + return incomplete_count, total_count + + +def should_warn_about_incomplete_shapes(incomplete_count: int, total_count: int) -> bool: + if total_count == 0: + return False + return (incomplete_count / total_count) > INCOMPLETE_SHAPE_WARN_THRESHOLD + + +def write_seed(path: str, seed: dict[str, Mapping[str, Any]]) -> None: + output_path = Path(path) + output_path.parent.mkdir(parents=True, exist_ok=True) + output_path.write_text( + json.dumps(seed, indent=2, ensure_ascii=False) + "\n", + encoding="utf-8", + ) + + +if __name__ == "__main__": + main() diff --git a/tests/crawl/test_deny_rules.py b/tests/crawl/test_deny_rules.py index 2cdaaed82..673ce2a6a 100644 --- a/tests/crawl/test_deny_rules.py +++ b/tests/crawl/test_deny_rules.py @@ -312,6 +312,59 @@ async def test_move_between_untrusted_sources_is_denied(set_now, set_github_info assert result.get("source") == existing["source"] +@pytest.mark.asyncio +async def test_removed_without_source_defaults_to_trusted_for_security(set_now, set_github_info): + entry = { + "name": "SourceMoved", + "details": "https://github.com/example/source-moved", + "releases": [ + { + "sublime_text": "*", + "branch": True + } + ], + "source": "https://example.com/untrusted/new.json", + "schema_version": "3.0.0" + } + + existing = { + "name": "SourceMoved", + "details": "https://github.com/example/source-moved", + "removed": "2024-01-01T00:00:00Z", + "id": "SAME_ID" + } + + github_info = { + "metadata": { + "id": "SAME_ID", + "name": "SourceMoved", + "description": "Fixture package with missing source on tombstone", + "homepage": "https://github.com/example/source-moved", + "author": "example", + "readme": "https://raw.githubusercontent.com/example/source-moved/main/README.md", + "default_branch": "main", + "stars": 0, + "created_at": "2024-01-01T00:00:00Z" + }, + "tags": [], + "branches": [ + { + "name": "main", + "date": "2024-05-10T12:00:00Z", + "url": "https://codeload.github.com/example/source-moved/zip/main" + 
} + ] + } + + set_now("2024-05-11T00:00:00Z") + set_github_info(github_info) + + result = await crawl(object(), entry, existing) + fail_reason = result.get("fail_reason", "") + assert fail_reason.startswith("denied:") + assert "from to untrusted" in fail_reason + + @pytest.mark.asyncio @pytest.mark.parametrize( "trusted_source", diff --git a/tests/crawl/test_explain_effective.py b/tests/crawl/test_explain_effective.py index e0c939ca5..5d1b42c56 100644 --- a/tests/crawl/test_explain_effective.py +++ b/tests/crawl/test_explain_effective.py @@ -1,11 +1,11 @@ import json -from scripts.crawl import ( +from scripts._explain_package import ( classify_tags_mode, - explain_main, keep_newest_release_definitions, sorted_release_definitions, ) +from scripts.crawl import explain_main def test_classify_tags_mode_when_all_releases_use_tags() -> None: @@ -114,3 +114,53 @@ def test_explain_main_effective_mode_omits_status_line_when_not_tags_mode( assert out.startswith("{") explained = json.loads(out) assert explained["name"] == "NoTagsPkg" + + +def test_explain_main_tombstoned_pretty_prints_raw_entry(capsys, tmp_path) -> None: + registry = { + "packages": [ + { + "name": "GonePkg", + "first_seen": "2020-01-01T00:00:00Z", + "removed": "2021-01-01T00:00:00Z", + "labels": ["theme"], + } + ] + } + registry_path = tmp_path / "registry.json" + registry_path.write_text(json.dumps(registry), encoding="utf-8") + + assert explain_main(str(registry_path), "GonePkg") == 0 + + captured = capsys.readouterr() + assert "Package 'GonePkg' is tombstoned in the registry." 
in captured.err + assert captured.out.startswith("{\n") + explained = json.loads(captured.out) + assert explained == registry["packages"][0] + + +def test_explain_main_tombstoned_effective_mode_emits_only_status( + monkeypatch, + capsys, + tmp_path, +) -> None: + registry = { + "packages": [ + { + "name": "GonePkg", + "first_seen": "2020-01-01T00:00:00Z", + "removed": "2021-01-01T00:00:00Z", + "labels": ["theme"], + } + ] + } + registry_path = tmp_path / "registry.json" + registry_path.write_text(json.dumps(registry), encoding="utf-8") + + monkeypatch.setenv("EFFECTIVE", "1") + + assert explain_main(str(registry_path), "GonePkg") == 0 + + captured = capsys.readouterr() + assert "Package 'GonePkg' is tombstoned in the registry." in captured.err + assert captured.out == "" diff --git a/tests/crawl/test_main_name_tombstoned.py b/tests/crawl/test_main_name_tombstoned.py new file mode 100644 index 000000000..f17f5c507 --- /dev/null +++ b/tests/crawl/test_main_name_tombstoned.py @@ -0,0 +1,22 @@ +import pytest + +from scripts import crawl as crawl_script + + +@pytest.mark.asyncio +async def test_main_name_rejects_tombstoned_package(capsys): + registry = { + "packages": [ + { + "name": "Gone", + "removed": "2025-01-01T00:00:00Z", + } + ] + } + workspace = {"packages": {}, "libraries": {}} + + await crawl_script.main_(registry, workspace, "Gone", limit=1) + + err = capsys.readouterr().err + assert "Package 'Gone' is tombstoned in the registry." 
in err + assert workspace["packages"] == {} diff --git a/tests/crawl/test_maintenance.py b/tests/crawl/test_maintenance.py new file mode 100644 index 000000000..bf5b4963d --- /dev/null +++ b/tests/crawl/test_maintenance.py @@ -0,0 +1,77 @@ +from scripts import crawl as crawl_script + + +def test_maintenance_imports_registry_tombstones_into_workspace(): + tombstone = { + "name": "Gone", + "first_seen": "2012-01-01T00:00:00Z", + "removed": "2024-01-01T00:00:00Z", + "labels": ["theme"], + "source": "https://raw.githubusercontent.com/wbond/package_control_channel/refs/heads/master/repository.json", + "schema_version": "3.0.0", + } + registry = { + "packages": [ + { + "name": "Alive", + "details": "https://github.com/example/alive", + "releases": [{"sublime_text": "*", "branch": True}], + "source": "https://raw.githubusercontent.com/wbond/package_control_channel/refs/heads/master/repository.json", + "schema_version": "3.0.0", + }, + tombstone, + ] + } + workspace = {"packages": {}, "libraries": {}} + + crawl_script.maintenance(registry, workspace) + + assert workspace["packages"]["Gone"] == tombstone + assert "Alive" not in workspace["packages"] + + +def test_maintenance_overwrites_existing_entry_with_registry_tombstone(): + tombstone = { + "name": "Gone", + "first_seen": "2012-01-01T00:00:00Z", + "removed": "2024-01-01T00:00:00Z", + "labels": ["theme"], + } + registry = {"packages": [tombstone]} + workspace = { + "packages": { + "Gone": { + "name": "Gone", + "details": "https://github.com/example/gone", + "releases": [{"version": "1.0.0"}], + "source": "https://example.com/old.json", + "first_seen": "2012-01-01T00:00:00Z", + "fail_reason": "fatal: 404 Not Found", + } + }, + "libraries": {}, + } + + crawl_script.maintenance(registry, workspace) + + assert workspace["packages"]["Gone"] == tombstone + assert "details" not in workspace["packages"]["Gone"] + + +def test_maintenance_still_marks_workspace_orphans_removed(set_now): + set_now("2026-03-27T11:00:00Z") + + 
registry = {"packages": []} + workspace = { + "packages": { + "Orphan": { + "name": "Orphan", + "first_seen": "2020-01-01T00:00:00Z", + } + }, + "libraries": {}, + } + + crawl_script.maintenance(registry, workspace) + + assert workspace["packages"]["Orphan"]["removed"] == "2026-03-27T11:00:00Z" diff --git a/tests/crawl/test_next_packages_to_crawl.py b/tests/crawl/test_next_packages_to_crawl.py new file mode 100644 index 000000000..8f5f92215 --- /dev/null +++ b/tests/crawl/test_next_packages_to_crawl.py @@ -0,0 +1,76 @@ +from scripts.crawl import next_packages_to_crawl + + +def test_next_packages_to_crawl_skips_removed_entries(set_now): + set_now("2026-03-26T00:00:00Z") + + registry = { + "packages": [ + make_registry_entry("Alive"), + make_registry_entry("Gone", removed="2025-01-01T00:00:00Z"), + ] + } + workspace = {"packages": {}, "libraries": {}} + + result = next_packages_to_crawl(registry, workspace, limit=200, presto=False) + + assert [entry["name"] for entry in result] == ["Alive"] + + +def test_next_packages_to_crawl_skips_removed_entries_in_presto_mode(set_now): + set_now("2026-03-26T00:00:00Z") + + registry = { + "packages": [ + make_registry_entry("Alive"), + make_registry_entry("Gone", removed="2025-01-01T00:00:00Z"), + ] + } + workspace = { + "packages": { + "Alive": {"name": "Alive", "last_seen": "2026-03-25T23:00:00Z"}, + "Gone": {"name": "Gone", "last_seen": "2026-03-25T22:00:00Z"}, + }, + "libraries": {}, + } + + result = next_packages_to_crawl(registry, workspace, limit=200, presto=True) + + assert [entry["name"] for entry in result] == ["Alive"] + + +def test_next_package_hint_ignores_removed_entries(set_now, capsys): + set_now("2026-03-26T00:00:00Z") + + registry = { + "packages": [ + make_registry_entry("Gone", removed="2025-01-01T00:00:00Z"), + make_registry_entry("Alive"), + ] + } + workspace = { + "packages": { + "Gone": {"name": "Gone", "next_crawl": "2026-03-26T00:01:00Z"}, + "Alive": {"name": "Alive", "next_crawl": 
"2026-03-26T00:10:00Z"}, + }, + "libraries": {}, + } + + result = next_packages_to_crawl(registry, workspace, limit=200, presto=False) + + assert result == [] + out = capsys.readouterr().out + assert "Next package runs in 10 minutes." in out + + +def make_registry_entry(name: str, removed: str | None = None): + entry = { + "name": name, + "details": f"https://github.com/example/{name.lower()}", + "releases": [{"sublime_text": "*", "branch": True}], + "source": "https://raw.githubusercontent.com/wbond/package_control_channel/refs/heads/master/repository.json", + "schema_version": "3.0.0", + } + if removed: + entry["removed"] = removed + return entry diff --git a/tests/crawl/test_resurrect_rules.py b/tests/crawl/test_resurrect_rules.py index e41898a7c..8f0a5685d 100644 --- a/tests/crawl/test_resurrect_rules.py +++ b/tests/crawl/test_resurrect_rules.py @@ -1,7 +1,7 @@ import pytest import scripts.crawl as crawl_mod -from scripts.crawl import SkipCrawling, crawl, main_ +from scripts.crawl import SkipCrawling, crawl, main_, maintenance @pytest.mark.asyncio @@ -104,6 +104,31 @@ async def test_crawl_keeps_fail_reason_on_404_skip(set_now): assert result.get("failing_since") == existing["failing_since"] +@pytest.mark.asyncio +async def test_crawl_error_adopts_registry_source_when_missing(set_now, monkeypatch): + entry = { + "name": "MissingSource", + "details": "https://github.com/example/missing-source", + "releases": [{"sublime_text": "*", "branch": True}], + "source": "https://raw.githubusercontent.com/wbond/package_control_channel/refs/heads/master/repository.json", + "schema_version": "3.0.0", + } + existing = { + "name": "MissingSource", + "details": "https://github.com/example/missing-source", + } + + async def stub(*args, **kwargs): + raise RuntimeError("boom") + + monkeypatch.setattr("scripts.crawl.crawl_package", stub) + set_now("2024-06-01T00:00:00Z") + + result = await crawl(object(), entry, existing) + + assert result.get("source") == entry["source"] + + 
@pytest.mark.asyncio async def test_removed_package_is_resurrected_on_trusted_source(set_now, set_github_info): registry = { @@ -158,6 +183,70 @@ async def test_removed_package_is_resurrected_on_trusted_source(set_now, set_git assert "removed" not in workspace["packages"]["Reappeared"] +@pytest.mark.asyncio +async def test_maintenance_imported_tombstone_resurrects_without_special_case( + set_now, + set_github_info, +): + source = "https://raw.githubusercontent.com/wbond/package_control_channel/refs/heads/master/repository.json" + + workspace = {"packages": {}, "libraries": {}} + registry_tombstoned = { + "packages": [ + { + "name": "Reappeared", + "first_seen": "2019-01-01T00:00:00Z", + "removed": "2024-05-01T00:00:00Z", + "source": source, + "labels": ["theme"], + } + ] + } + maintenance(registry_tombstoned, workspace) + + registry_active = { + "packages": [ + { + "name": "Reappeared", + "details": "https://github.com/example/reappeared", + "releases": [{"sublime_text": "*", "branch": True}], + "source": source, + "schema_version": "3.0.0", + } + ], + } + + set_now("2024-06-01T00:00:00Z") + set_github_info({ + "metadata": { + "id": "SAME_ID", + "name": "Reappeared", + "description": "Fixture reappeared package", + "homepage": "https://github.com/example/reappeared", + "author": "example", + "readme": "https://raw.githubusercontent.com/example/reappeared/main/README.md", + "default_branch": "main", + "stars": 0, + "created_at": "2024-01-01T00:00:00Z", + }, + "tags": [], + "branches": [ + { + "name": "main", + "date": "2024-05-31T00:00:00Z", + "url": "https://codeload.github.com/example/reappeared/zip/main", + } + ], + }) + + await main_(registry_active, workspace, None, 100) + + package = workspace["packages"]["Reappeared"] + assert "removed" not in package + assert package["source"] == source + assert package["first_seen"] == "2019-01-01T00:00:00Z" + + @pytest.mark.asyncio async def test_retry_recent_404(set_now, monkeypatch): entry = { diff --git 
a/tests/registry/test_generate_registry.py b/tests/registry/test_generate_registry.py index 895204f8d..00e9acbfb 100644 --- a/tests/registry/test_generate_registry.py +++ b/tests/registry/test_generate_registry.py @@ -31,7 +31,12 @@ def make_channel(path: Path, repositories: list[Path]): path.write_text(json.dumps(channel_data)) -def make_repository(path: Path, package_names: list[str]): +def make_repository( + path: Path, + package_names: list[str], + *, + library_names: list[str] | None = None, +): repo_data = { "schema_version": "3.0.0", "packages": [ @@ -41,7 +46,12 @@ def make_repository(path: Path, package_names: list[str]): } for name in package_names ], - "libraries": [] + "libraries": [ + { + "name": name, + } + for name in (library_names or []) + ], } path.write_text(json.dumps(repo_data)) @@ -234,3 +244,428 @@ async def test_main_with_successful_repo_and_last_run_clears_fetching_source_fai result = json.load(f) # Package should no longer have fetching_source_failed assert "fetching_source_failed" not in result["packages"][0] + + +@pytest.mark.asyncio +async def test_implicit_seed_preserves_lifecycle_data(tmp_path): + repo_path = tmp_path / "repo.json" + make_repository(repo_path, ["Keep", "New"]) + channel_path = tmp_path / "channel.json" + make_channel(channel_path, [repo_path]) + output_file = tmp_path / "registry.json" + output_file.write_text(json.dumps({ + "repositories": [repo_path.as_uri()], + "packages": [{ + "name": "Keep", + "source": repo_path.as_uri(), + "schema_version": "3.0.0", + "details": "https://github.com/example/Keep", + "first_seen": "2020-01-01T00:00:00Z", + }], + "libraries": [], + })) + + await main(str(output_file), [channel_path.as_uri()]) + + result = json.loads(output_file.read_text()) + by_name = {pkg["name"]: pkg for pkg in result["packages"]} + assert by_name["Keep"]["first_seen"] == "2020-01-01T00:00:00Z" + datetime.strptime(by_name["New"]["first_seen"], "%Y-%m-%dT%H:%M:%SZ") + + +@pytest.mark.asyncio +async def 
test_explicit_seed_overrides_output_path(tmp_path): + repo_path = tmp_path / "repo.json" + make_repository(repo_path, ["Keep"]) + channel_path = tmp_path / "channel.json" + make_channel(channel_path, [repo_path]) + + output_file = tmp_path / "output.json" + output_file.write_text(json.dumps({ + "repositories": [repo_path.as_uri()], + "packages": [{ + "name": "Keep", + "source": repo_path.as_uri(), + "schema_version": "3.0.0", + "first_seen": "2011-01-01T00:00:00Z", + }], + "libraries": [], + })) + + seed_file = tmp_path / "seed.json" + seed_file.write_text(json.dumps({ + "repositories": [repo_path.as_uri()], + "packages": [{ + "name": "Keep", + "source": repo_path.as_uri(), + "schema_version": "3.0.0", + "first_seen": "2010-01-01T00:00:00Z", + }], + "libraries": [], + })) + + await main(str(output_file), [channel_path.as_uri()], seed_path=str(seed_file)) + + result = json.loads(output_file.read_text()) + assert result["packages"][0]["first_seen"] == "2010-01-01T00:00:00Z" + + +@pytest.mark.asyncio +async def test_explicit_missing_seed_fails_hard(tmp_path): + repo_path = tmp_path / "repo.json" + make_repository(repo_path, ["Keep"]) + channel_path = tmp_path / "channel.json" + make_channel(channel_path, [repo_path]) + output_file = tmp_path / "output.json" + + with pytest.raises(FileNotFoundError): + await main( + str(output_file), + [channel_path.as_uri()], + seed_path=str(tmp_path / "missing.json"), + ) + + +@pytest.mark.asyncio +async def test_no_seed_outputs_raw_registry_without_lifecycle_fields(tmp_path): + repo_path = tmp_path / "repo.json" + make_repository(repo_path, ["Keep"]) + channel_path = tmp_path / "channel.json" + make_channel(channel_path, [repo_path]) + + output_file = tmp_path / "registry.json" + output_file.write_text(json.dumps({ + "repositories": [repo_path.as_uri()], + "packages": [ + { + "name": "Keep", + "source": repo_path.as_uri(), + "schema_version": "3.0.0", + "first_seen": "2010-01-01T00:00:00Z", + }, + { + "name": "Gone", + "source": 
repo_path.as_uri(), + "first_seen": "2011-01-01T00:00:00Z", + }, + ], + "libraries": [], + })) + + await main(str(output_file), [channel_path.as_uri()], no_seed=True) + + result = json.loads(output_file.read_text()) + by_name = {pkg["name"]: pkg for pkg in result["packages"]} + assert "first_seen" not in by_name["Keep"] + assert "Gone" not in by_name + + +@pytest.mark.asyncio +async def test_package_disappearance_creates_minimal_tombstone(tmp_path): + repo_path = tmp_path / "repo.json" + make_repository(repo_path, []) + channel_path = tmp_path / "channel.json" + make_channel(channel_path, [repo_path]) + + output_file = tmp_path / "registry.json" + output_file.write_text(json.dumps({ + "repositories": [repo_path.as_uri()], + "packages": [ + { + "name": "Gone", + "source": repo_path.as_uri(), + "schema_version": "3.0.0", + "first_seen": "2010-01-01T00:00:00Z", + "labels": ["theme"], + }, + ], + "libraries": [], + })) + + await main(str(output_file), [channel_path.as_uri()]) + + result = json.loads(output_file.read_text()) + assert len(result["packages"]) == 1 + tombstone = result["packages"][0] + assert set(tombstone.keys()) == {"name", "source", "first_seen", "removed", "labels"} + assert tombstone["name"] == "Gone" + assert tombstone["source"] == repo_path.as_uri() + assert tombstone["first_seen"] == "2010-01-01T00:00:00Z" + assert tombstone["labels"] == ["theme"] + datetime.strptime(tombstone["removed"], "%Y-%m-%dT%H:%M:%SZ") + + +@pytest.mark.asyncio +async def test_existing_tombstone_keeps_removed_timestamp(tmp_path): + repo_path = tmp_path / "repo.json" + make_repository(repo_path, []) + channel_path = tmp_path / "channel.json" + make_channel(channel_path, [repo_path]) + + output_file = tmp_path / "registry.json" + output_file.write_text(json.dumps({ + "repositories": [repo_path.as_uri()], + "packages": [ + { + "name": "Gone", + "source": repo_path.as_uri(), + "first_seen": "2010-01-01T00:00:00Z", + "removed": "2020-02-02T00:00:00Z", + }, + ], + "libraries": 
[], + })) + + await main(str(output_file), [channel_path.as_uri()]) + + result = json.loads(output_file.read_text()) + assert result["packages"][0]["removed"] == "2020-02-02T00:00:00Z" + + +@pytest.mark.asyncio +async def test_tombstoned_package_resurrection_preserves_first_seen(tmp_path): + repo_path = tmp_path / "repo.json" + make_repository(repo_path, ["Phoenix"]) + channel_path = tmp_path / "channel.json" + make_channel(channel_path, [repo_path]) + + output_file = tmp_path / "registry.json" + output_file.write_text(json.dumps({ + "repositories": [repo_path.as_uri()], + "packages": [ + { + "name": "Phoenix", + "source": repo_path.as_uri(), + "first_seen": "2010-01-01T00:00:00Z", + "removed": "2020-02-02T00:00:00Z", + }, + ], + "libraries": [], + })) + + await main(str(output_file), [channel_path.as_uri()]) + + result = json.loads(output_file.read_text()) + phoenix = result["packages"][0] + assert phoenix["name"] == "Phoenix" + assert phoenix["first_seen"] == "2010-01-01T00:00:00Z" + assert "removed" not in phoenix + + +@pytest.mark.asyncio +async def test_disappeared_libraries_are_not_tombstoned(tmp_path): + repo_path = tmp_path / "repo.json" + make_repository(repo_path, []) + channel_path = tmp_path / "channel.json" + make_channel(channel_path, [repo_path]) + + output_file = tmp_path / "registry.json" + output_file.write_text(json.dumps({ + "repositories": [repo_path.as_uri()], + "packages": [], + "libraries": [ + { + "name": "GoneLib", + "source": repo_path.as_uri(), + "removed": "2020-02-02T00:00:00Z", + } + ], + })) + + await main(str(output_file), [channel_path.as_uri()]) + + result = json.loads(output_file.read_text()) + assert result["libraries"] == [] + + +@pytest.mark.asyncio +async def test_seeded_output_packages_are_name_sorted(tmp_path): + repo_path = tmp_path / "repo.json" + make_repository(repo_path, ["Zulu", "Bravo"]) + channel_path = tmp_path / "channel.json" + make_channel(channel_path, [repo_path]) + + output_file = tmp_path / "registry.json" + 
output_file.write_text(json.dumps({ + "repositories": [repo_path.as_uri()], + "packages": [ + { + "name": "Alpha", + "source": repo_path.as_uri(), + "first_seen": "2010-01-01T00:00:00Z", + }, + { + "name": "Zulu", + "source": repo_path.as_uri(), + "first_seen": "2011-01-01T00:00:00Z", + }, + ], + "libraries": [], + })) + + await main(str(output_file), [channel_path.as_uri()]) + + result = json.loads(output_file.read_text()) + assert [pkg["name"] for pkg in result["packages"]] == ["Alpha", "Bravo", "Zulu"] + + +@pytest.mark.asyncio +@pytest.mark.parametrize("no_seed", [False, True]) +async def test_fetching_source_failed_behavior_unchanged_with_no_seed_toggle(tmp_path, no_seed): + repo_path = tmp_path / "missing.json" + channel_path = tmp_path / "channel.json" + make_channel(channel_path, [repo_path]) + + output_file = tmp_path / "registry.json" + output_file.write_text(json.dumps({ + "repositories": [repo_path.as_uri()], + "packages": [{ + "name": "Lost", + "source": repo_path.as_uri(), + "schema_version": "3.0.0", + "details": "https://github.com/example/Lost", + }], + "libraries": [], + })) + + await main(str(output_file), [channel_path.as_uri()], no_seed=no_seed) + + result = json.loads(output_file.read_text()) + assert "fetching_source_failed" in result["packages"][0] + + +@pytest.mark.asyncio +async def test_non_registry_seed_does_not_recreate_failed_repo_entries(tmp_path, capsys): + repo_path = tmp_path / "missing.json" + channel_path = tmp_path / "channel.json" + make_channel(channel_path, [repo_path]) + + output_file = tmp_path / "registry.json" + seed_file = tmp_path / "workspace.json" + seed_file.write_text(json.dumps({ + "packages": { + "SFTP": { + "name": "SFTP", + "source": repo_path.as_uri(), + "first_seen": "2011-12-15T14:11:26Z", + "description": "Commercial SFTP/FTP plugin", + "labels": ["ftp", "sync"], + } + } + })) + + await main( + str(output_file), + [channel_path.as_uri()], + seed_path=str(seed_file), + ) + + result = 
json.loads(output_file.read_text()) + pkg = result["packages"][0] + assert pkg["name"] == "SFTP" + assert "description" not in pkg + assert "fetching_source_failed" not in pkg + + captured = capsys.readouterr() + assert "seed file knows 1 package" in captured.err + + +@pytest.mark.asyncio +async def test_workspace_seed_uses_prior_output_registry_for_failed_repo_recovery(tmp_path, capsys): + repo_path = tmp_path / "missing.json" + channel_path = tmp_path / "channel.json" + make_channel(channel_path, [repo_path]) + + output_file = tmp_path / "registry.json" + output_file.write_text(json.dumps({ + "repositories": [repo_path.as_uri()], + "packages": [ + { + "name": "SFTP", + "source": repo_path.as_uri(), + "schema_version": "3.0.0", + "details": "https://example.com/sftp", + "first_seen": "2015-01-01T00:00:00Z", + } + ], + "libraries": [], + })) + + seed_file = tmp_path / "workspace.json" + seed_file.write_text(json.dumps({ + "packages": { + "SFTP": { + "name": "SFTP", + "source": repo_path.as_uri(), + "first_seen": "2011-12-15T14:11:26Z", + "description": "Commercial SFTP/FTP plugin", + } + } + })) + + await main( + str(output_file), + [channel_path.as_uri()], + seed_path=str(seed_file), + ) + + result = json.loads(output_file.read_text()) + pkg = result["packages"][0] + assert pkg["name"] == "SFTP" + assert pkg["first_seen"] == "2011-12-15T14:11:26Z" + assert "fetching_source_failed" in pkg + assert "description" not in pkg + + captured = capsys.readouterr() + assert "recover full entries" not in captured.err + + +@pytest.mark.asyncio +async def test_compact_seed_without_sources_warns_on_failed_repo(tmp_path, capsys): + repo_path = tmp_path / "missing.json" + channel_path = tmp_path / "channel.json" + make_channel(channel_path, [repo_path]) + + output_file = tmp_path / "registry.json" + seed_file = tmp_path / "seed.json" + seed_file.write_text(json.dumps({ + "SFTP": { + "name": "SFTP", + "first_seen": "2011-12-15T14:11:26Z", + } + })) + + await main( + 
str(output_file), + [channel_path.as_uri()], + seed_path=str(seed_file), + ) + + captured = capsys.readouterr() + assert "repository recovery cannot be guaranteed with a compact seed" in captured.err + assert "full registry.json seed for complete recovery" in captured.err + assert "recover full entries" not in captured.err + + +@pytest.mark.asyncio +async def test_registry_seed_without_source_entries_does_not_emit_compact_warning(tmp_path, capsys): + repo_path = tmp_path / "missing.json" + channel_path = tmp_path / "channel.json" + make_channel(channel_path, [repo_path]) + + output_file = tmp_path / "registry.json" + seed_file = tmp_path / "registry_seed.json" + seed_file.write_text(json.dumps({ + "repositories": [repo_path.as_uri()], + "packages": [], + "libraries": [], + })) + + await main( + str(output_file), + [channel_path.as_uri()], + seed_path=str(seed_file), + ) + + captured = capsys.readouterr() + assert "repository recovery cannot be guaranteed with a compact seed" not in captured.err diff --git a/tests/test_describe_registry_changes.py b/tests/test_describe_registry_changes.py new file mode 100644 index 000000000..d4f13b9ba --- /dev/null +++ b/tests/test_describe_registry_changes.py @@ -0,0 +1,239 @@ +import scripts.describe_registry_changes as describe_script + + +def test_describe_registry_changes_same() -> None: + old = {"packages": [pkg("A")]} + new = {"packages": [pkg("A")]} + + assert describe_script.describe_registry_changes(old, new) == "Same." 
+ + +def test_describe_registry_changes_single_added() -> None: + old = {"packages": []} + new = {"packages": [pkg("Gloom")]} + + assert describe_script.describe_registry_changes(old, new) == "Added `Gloom`" + + +def test_describe_registry_changes_single_tombstoned() -> None: + old = {"packages": [pkg("Gloom")]} + new = {"packages": [pkg("Gloom", removed="2026-01-01T00:00:00Z")]} + + assert describe_script.describe_registry_changes(old, new) == "Tombstoned `Gloom`" + + +def test_describe_registry_changes_single_resurrected() -> None: + old = {"packages": [pkg("Gloom", removed="2026-01-01T00:00:00Z")]} + new = {"packages": [pkg("Gloom")]} + + assert describe_script.describe_registry_changes(old, new) == "Resurrected `Gloom`" + + +def test_describe_registry_changes_single_metadata_change() -> None: + old = {"packages": [pkg("Gloom", labels=["theme"])]} + new = {"packages": [pkg("Gloom", labels=["theme", "dark"])]} + + assert describe_script.describe_registry_changes(old, new) == "Changed metadata of `Gloom`" + + +def test_describe_registry_changes_single_added_library() -> None: + old = {"packages": [], "libraries": []} + new = {"packages": [], "libraries": [lib("JsonSchema")]} + + assert describe_script.describe_registry_changes(old, new) == "Added `JsonSchema (library)`" + + +def test_describe_registry_changes_mixed_package_and_library_bulk_edit() -> None: + old = { + "packages": [pkg("DropMe")], + "libraries": [lib("JsonSchema", removed="2026-01-01T00:00:00Z")], + } + new = { + "packages": [pkg("DropMe", removed="2026-01-01T00:00:00Z")], + "libraries": [lib("JsonSchema")], + } + + assert describe_script.describe_registry_changes(old, new) == ( + "Bulk edit\n\n" + "Record tombstoning the following packages:\n" + "- DropMe\n\n" + "Record resurrection of the following packages:\n" + "- JsonSchema (library)" + ) + + +def test_describe_registry_changes_metadata_bulk_edit() -> None: + old = { + "packages": [ + pkg("A", labels=["x"]), + pkg("B", labels=["x"]), + ] + } + 
new = { + "packages": [ + pkg("A", labels=["x", "y"]), + pkg("B", labels=["x", "y"]), + ] + } + + assert describe_script.describe_registry_changes(old, new) == ( + "Metadata bulk edit\n\n" + "Record metadata edits of the following packages:\n" + "- A\n" + "- B" + ) + + +def test_describe_registry_changes_bulk_edit() -> None: + old = { + "packages": [ + pkg("DropMe"), + pkg("Meta", labels=["a"]), + ] + } + new = { + "packages": [ + pkg("DropMe", removed="2026-01-01T00:00:00Z"), + pkg("Meta", labels=["a", "b"]), + pkg("NewPkg"), + ] + } + + assert describe_script.describe_registry_changes(old, new) == ( + "Bulk edit\n\n" + "Record addition of following packages:\n" + "- NewPkg\n\n" + "Record tombstoning the following packages:\n" + "- DropMe\n\n" + "Record metadata edit of the following packages:\n" + "- Meta" + ) + + +def test_describe_registry_changes_falls_back_when_package_disappears() -> None: + old = {"packages": [pkg("A")]} + new = {"packages": []} + + assert describe_script.describe_registry_changes(old, new) == "Update registry.json" + + +def test_describe_registry_changes_falls_back_when_only_repositories_change() -> None: + old = { + "repositories": ["https://raw.githubusercontent.com/a/repository.json"], + "packages": [pkg("A")], + "libraries": [lib("L")], + } + new = { + "repositories": ["https://raw.githubusercontent.com/b/repository.json"], + "packages": [pkg("A")], + "libraries": [lib("L")], + } + + assert describe_script.describe_registry_changes(old, new) == "Update registry.json" + + +def test_describe_registry_changes_bulk_resurrection() -> None: + old = { + "packages": [ + pkg("A", removed="2026-01-01T00:00:00Z"), + pkg("B", removed="2026-01-01T00:00:00Z"), + ] + } + new = {"packages": [pkg("A"), pkg("B")]} + + assert describe_script.describe_registry_changes(old, new) == ( + "Bulk edit\n\n" + "Record resurrection of the following packages:\n" + "- A\n" + "- B" + ) + + +def 
test_describe_registry_changes_bulk_edit_with_resurrection_and_other_changes() -> None: + old = { + "packages": [ + pkg("OldGone", removed="2026-01-01T00:00:00Z"), + pkg("KeepMeta", labels=["a"]), + ] + } + new = { + "packages": [ + pkg("OldGone"), + pkg("KeepMeta", labels=["a", "b"]), + pkg("NewPkg"), + ] + } + + assert describe_script.describe_registry_changes(old, new) == ( + "Bulk edit\n\n" + "Record addition of following packages:\n" + "- NewPkg\n\n" + "Record resurrection of the following packages:\n" + "- OldGone\n\n" + "Record metadata edit of the following packages:\n" + "- KeepMeta" + ) + + +def test_main_reads_both_registries_via_loader_and_prints_message(capsys, monkeypatch) -> None: + old = {"packages": []} + new = {"packages": [pkg("Gloom")]} + + def fake_read(path: str): + if path == "old_registry.json": + return old + if path == "new_registry.json": + return new + raise AssertionError(path) + + monkeypatch.setattr(describe_script, "read_registry_json", fake_read) + + rc = describe_script.main(["-a", "old_registry.json", "-b", "new_registry.json"]) + + assert rc == 0 + assert capsys.readouterr().out.strip() == "Added `Gloom`" + + +def test_main_falls_back_when_classifier_crashes(capsys, monkeypatch) -> None: + monkeypatch.setattr(describe_script, "read_registry_json", lambda path: {"packages": []}) + + def boom(old_registry, new_registry): + raise RuntimeError("boom") + + monkeypatch.setattr(describe_script, "describe_registry_changes", boom) + + rc = describe_script.main(["-a", "old_registry.json", "-b", "new_registry.json"]) + + assert rc == 0 + assert capsys.readouterr().out.strip() == "Update registry.json" + + +def pkg( + name: str, + *, + removed: str | None = None, + labels: list[str] | None = None, +) -> dict[str, object]: + entry: dict[str, object] = { + "name": name, + "source": "https://raw.githubusercontent.com/wbond/package_control_channel/refs/heads/master/repository.json", + "schema_version": "3.0.0", + "releases": [{"url": 
f"https://example.com/{name}.zip", "date": "2026-01-01T00:00:00Z"}], + } + if removed is not None: + entry["removed"] = removed + if labels is not None: + entry["labels"] = labels + return entry + + +def lib(name: str, *, removed: str | None = None) -> dict[str, object]: + entry: dict[str, object] = { + "name": name, + "source": "https://raw.githubusercontent.com/wbond/package_control_channel/refs/heads/master/repository.json", + "schema_version": "4.0.0", + "releases": [{"version": "1.0.0"}], + } + if removed is not None: + entry["removed"] = removed + return entry diff --git a/tests/test_sync_registry_branch_script.py b/tests/test_sync_registry_branch_script.py new file mode 100644 index 000000000..54cd5741e --- /dev/null +++ b/tests/test_sync_registry_branch_script.py @@ -0,0 +1,215 @@ +from __future__ import annotations + +import json +import os +import shutil +import subprocess +import tempfile +from pathlib import Path + +import pytest + + +def test_sync_registry_branch_happy_path() -> None: + require_shell_tools() + + with tempfile.TemporaryDirectory(dir=project_root()) as temp_dir: + sandbox = Path(temp_dir) + origin_dir, registry_dir = init_registry_branch_repo(sandbox) + assert_origin_is_local(registry_dir, origin_dir) + + new_registry = sandbox / "wrk-registry.json" + write_registry(new_registry, package_names=["DemoPkg"]) + + result = run_sync_script(registry_dir, new_registry) + + assert result.returncode == 0, result.stderr + assert "fallback subject used" not in result.stdout + + commit_subject = git_output( + ["git", "-C", str(registry_dir), "log", "-1", "--pretty=%s"], + ).strip() + assert commit_subject == "Added `DemoPkg`" + + local_registry = json.loads((registry_dir / "registry.json").read_text(encoding="utf-8")) + generated_registry = json.loads(new_registry.read_text(encoding="utf-8")) + assert local_registry == generated_registry + + local_head = git_output(["git", "-C", str(registry_dir), "rev-parse", "HEAD"]).strip() + remote_head = 
git_output( + ["git", "--git-dir", str(origin_dir), "rev-parse", "refs/heads/the-registry"], + ).strip() + assert local_head == remote_head + + +def test_sync_registry_branch_noop_when_files_match() -> None: + require_shell_tools() + + with tempfile.TemporaryDirectory(dir=project_root()) as temp_dir: + sandbox = Path(temp_dir) + origin_dir, registry_dir = init_registry_branch_repo(sandbox) + assert_origin_is_local(registry_dir, origin_dir) + + new_registry = sandbox / "wrk-registry.json" + write_registry(new_registry, package_names=[]) + + before_head = git_output(["git", "-C", str(registry_dir), "rev-parse", "HEAD"]).strip() + result = run_sync_script(registry_dir, new_registry) + after_head = git_output(["git", "-C", str(registry_dir), "rev-parse", "HEAD"]).strip() + + assert result.returncode == 0, result.stderr + assert "No registry changes." in result.stdout + assert before_head == after_head + + +def test_sync_registry_branch_commits_when_describe_crashes() -> None: + require_shell_tools() + + with tempfile.TemporaryDirectory(dir=project_root()) as temp_dir: + sandbox = Path(temp_dir) + origin_dir, registry_dir = init_registry_branch_repo(sandbox) + assert_origin_is_local(registry_dir, origin_dir) + + new_registry = sandbox / "wrk-registry.json" + write_registry(new_registry, package_names=["CrashPathPkg"]) + + fake_uv = sandbox / "fake-uv" + fake_uv.write_text( + "#!/usr/bin/env bash\n" + "echo 'Traceback (most recent call last):' >&2\n" + "echo ' File \"scripts/describe_registry_changes.py\", line 1, in ' >&2\n" + "echo 'RuntimeError: boom' >&2\n" + "exit 1\n", + encoding="utf-8", + ) + fake_uv.chmod(0o755) + + env = os.environ.copy() + env["UV_BINARY"] = to_script_arg(fake_uv) + + result = run_sync_script(registry_dir, new_registry, env=env) + + assert result.returncode == 0, result.stderr + assert "describe_registry_changes failed; using fallback subject" in result.stdout + assert ( + "describe_registry_changes stderr: Traceback (most recent call last):" 
+ in result.stdout + ) + + commit_subject = git_output( + ["git", "-C", str(registry_dir), "log", "-1", "--pretty=%s"], + ).strip() + assert commit_subject == "Update registry.json" + + commit_body = git_output( + ["git", "-C", str(registry_dir), "log", "-1", "--pretty=%B"], + ) + assert "describe_registry_changes.py raised" in commit_body + assert "Traceback (most recent call last):" in commit_body + assert "RuntimeError: boom" in commit_body + + local_head = git_output(["git", "-C", str(registry_dir), "rev-parse", "HEAD"]).strip() + remote_head = git_output( + ["git", "--git-dir", str(origin_dir), "rev-parse", "refs/heads/the-registry"], + ).strip() + assert local_head == remote_head + + +def require_shell_tools() -> None: + if shutil.which("git") is None: + pytest.skip("git is required for sync script tests") + if bash_executable() is None: + pytest.skip("bash is required for sync script tests") + + +def run_sync_script( + registry_dir: Path, + new_registry_path: Path, + *, + env: dict[str, str] | None = None, +) -> subprocess.CompletedProcess[str]: + bash_path = bash_executable() + if bash_path is None: + raise RuntimeError("bash is required for sync script tests") + + effective_env = os.environ.copy() + if env: + effective_env.update(env) + effective_env["GIT_ALLOW_PROTOCOL"] = "file" + effective_env["GIT_TERMINAL_PROMPT"] = "0" + + return subprocess.run( + [ + bash_path, + "./.github/workflows/sync_registry_branch.sh", + to_script_arg(registry_dir), + to_script_arg(new_registry_path), + ], + cwd=project_root(), + env=effective_env, + capture_output=True, + text=True, + check=False, + ) + + +def init_registry_branch_repo(sandbox: Path) -> tuple[Path, Path]: + origin_dir = sandbox / "origin.git" + registry_dir = sandbox / ".the-registry" + + run_checked(["git", "init", "--bare", str(origin_dir)]) + run_checked(["git", "clone", str(origin_dir), str(registry_dir)]) + run_checked(["git", "-C", str(registry_dir), "checkout", "-b", "the-registry"]) + 
run_checked(["git", "-C", str(registry_dir), "config", "user.name", "Test User"]) + run_checked(["git", "-C", str(registry_dir), "config", "user.email", "test@example.com"]) + + write_registry(registry_dir / "registry.json", package_names=[]) + run_checked(["git", "-C", str(registry_dir), "add", "registry.json"]) + run_checked(["git", "-C", str(registry_dir), "commit", "-m", "Initial"]) + run_checked(["git", "-C", str(registry_dir), "push", "-u", "origin", "the-registry"]) + + return origin_dir, registry_dir + + +def assert_origin_is_local(registry_dir: Path, origin_dir: Path) -> None: + origin_url = git_output(["git", "-C", str(registry_dir), "remote", "get-url", "origin"]).strip() + assert Path(origin_url).resolve() == origin_dir.resolve() + + +def to_script_arg(path: Path) -> str: + rel = path.resolve().relative_to(project_root()) + return f"./{rel.as_posix()}" + + +def write_registry(path: Path, *, package_names: list[str]) -> None: + payload = { + "packages": [{"name": name} for name in package_names], + "libraries": [], + } + path.write_text(json.dumps(payload), encoding="utf-8") + + +def git_output(command: list[str]) -> str: + return subprocess.check_output(command, text=True) + + +def run_checked(command: list[str]) -> None: + subprocess.run(command, check=True, capture_output=True, text=True) + + +def bash_executable() -> str | None: + try: + where_output = subprocess.check_output(["where", "bash"], text=True) + except Exception: + return shutil.which("bash") + + for line in where_output.splitlines(): + candidate = line.strip() + if "git" in candidate.lower() and candidate.lower().endswith("bash.exe"): + return candidate + + return shutil.which("bash") + + +def project_root() -> Path: + return Path(__file__).resolve().parents[1]