From 6070872e424f0fb5246b5ea3ce2402d82e273fae Mon Sep 17 00:00:00 2001 From: herr kaste Date: Fri, 27 Mar 2026 10:45:51 +0100 Subject: [PATCH 01/30] Harden secure-source checks for tombstones Treat removed workspace entries without a stored source as coming from MAIN_REPOSITORY_SOURCE when enforcing ensure_secure_source(). This closes a takeover gap for imported tombstones that lacked source data. Keep the denial message honest by showing the persisted workspace value in diagnostics. When source is missing, report "" instead of a synthesized trusted source. Add a deny-rules test that covers removed entries without source and asserts both denial behavior and the new message wording. --- scripts/crawl.py | 31 ++++++++++++++++---- tests/crawl/test_deny_rules.py | 53 ++++++++++++++++++++++++++++++++++ 2 files changed, 78 insertions(+), 6 deletions(-) diff --git a/scripts/crawl.py b/scripts/crawl.py index d7b91d516..ca53cd3c9 100644 --- a/scripts/crawl.py +++ b/scripts/crawl.py @@ -41,8 +41,11 @@ EXPLAIN_EFFECTIVE_ENV = "EFFECTIVE" UTC_FORMAT = "%Y-%m-%dT%H:%M:%SZ" STYLIZED_DATETIME_FORMAT = "%Y-%m-%d %H:%M:%S" +MAIN_REPOSITORY_SOURCE = ( + "https://raw.githubusercontent.com/wbond/package_control_channel/refs/heads/master/repository.json" +) TRUSTED_SOURCES = { - "https://raw.githubusercontent.com/wbond/package_control_channel/refs/heads/master/repository.json", + MAIN_REPOSITORY_SOURCE, "https://raw.githubusercontent.com/sublimelsp/repository/main/repository.json", "https://raw.githubusercontent.com/SublimeLinter/package_control_channel/master/packages.json", } @@ -833,18 +836,34 @@ def ensure_secure_source( entry: RegistryEntry, existing: WorkspaceEntry ) -> None: + existing_source = source_for_security_check(existing) + entry_source = entry.get("source") if ( - existing.get("source") - and entry.get("source") - and existing.get("source") != entry.get("source") - and entry.get("source") not in TRUSTED_SOURCES + existing_source + and entry_source + and existing_source != entry_source + and entry_source not in TRUSTED_SOURCES ): + source_display = existing.get("source") or "" raise DeniedUpdating( f"Repository source changed for *{entry.get('name')}* from " - f"{existing.get('source')} to untrusted {entry.get('source')}" + f"{source_display} to untrusted {entry_source}" ) +def source_for_security_check(existing: WorkspaceEntry) -> str | None: + source = existing.get("source") + if source: + return source + + if existing.get("removed"): + # Old imported tombstones can miss source. + # For security checks we treat them as coming from the main trusted source. 
+ return MAIN_REPOSITORY_SOURCE + + return None + + def keys_missing_from_release(release: Mapping) -> set[str]: return {"sublime_text", "platforms", "version", "url", "date"} - release.keys() diff --git a/tests/crawl/test_deny_rules.py b/tests/crawl/test_deny_rules.py index 2cdaaed82..673ce2a6a 100644 --- a/tests/crawl/test_deny_rules.py +++ b/tests/crawl/test_deny_rules.py @@ -312,6 +312,59 @@ async def test_move_between_untrusted_sources_is_denied(set_now, set_github_info assert result.get("source") == existing["source"] +@pytest.mark.asyncio +async def test_removed_without_source_defaults_to_trusted_for_security(set_now, set_github_info): + entry = { + "name": "SourceMoved", + "details": "https://github.com/example/source-moved", + "releases": [ + { + "sublime_text": "*", + "branch": True + } + ], + "source": "https://example.com/untrusted/new.json", + "schema_version": "3.0.0" + } + + existing = { + "name": "SourceMoved", + "details": "https://github.com/example/source-moved", + "removed": "2024-01-01T00:00:00Z", + "id": "SAME_ID" + } + + github_info = { + "metadata": { + "id": "SAME_ID", + "name": "SourceMoved", + "description": "Fixture package with missing source on tombstone", + "homepage": "https://github.com/example/source-moved", + "author": "example", + "readme": "https://raw.githubusercontent.com/example/source-moved/main/README.md", + "default_branch": "main", + "stars": 0, + "created_at": "2024-01-01T00:00:00Z" + }, + "tags": [], + "branches": [ + { + "name": "main", + "date": "2024-05-10T12:00:00Z", + "url": "https://codeload.github.com/example/source-moved/zip/main" + } + ] + } + + set_now("2024-05-11T00:00:00Z") + set_github_info(github_info) + + result = await crawl(object(), entry, existing) + fail_reason = result.get("fail_reason", "") + assert fail_reason.startswith("denied:") + assert "from to untrusted" in fail_reason + + @pytest.mark.asyncio @pytest.mark.parametrize( "trusted_source", From a573f8ac261aec959345a2a3d399b34355bcc40d Mon Sep 17 00:00:00 2001 From: herr kaste Date: Fri, 27 Mar 2026 11:10:55 +0100 Subject: [PATCH 02/30] Preserve source on crawl failures When crawl_package() raises, keep existing workspace state but ensure source is present by defaulting from the registry package contract. This keeps security behavior stable for denied source moves while also repairing entries that never had a successful crawl and thus missed source entirely. Add a regression test that verifies failed crawls adopt source from the registry entry when the existing workspace entry has no source. --- scripts/crawl.py | 6 ++++++ tests/crawl/test_resurrect_rules.py | 25 +++++++++++++++++++++++++ 2 files changed, 31 insertions(+) diff --git a/scripts/crawl.py b/scripts/crawl.py index ca53cd3c9..39a7ffcd9 100644 --- a/scripts/crawl.py +++ b/scripts/crawl.py @@ -388,6 +388,12 @@ async def crawl( out = await crawl_package(session, package, existing) except Exception as e: out = {**existing} + + # Keep existing source authoritative when present (important for denied + # source moves). Only backfill source from the registry package when + # the existing workspace entry has never had one. + out.setdefault("source", package["source"]) + out["failing_since"] = existing.get("failing_since", now_string) # We mark errors as fatal if we MUST de-list the package immediately. 
diff --git a/tests/crawl/test_resurrect_rules.py b/tests/crawl/test_resurrect_rules.py index e41898a7c..136569592 100644 --- a/tests/crawl/test_resurrect_rules.py +++ b/tests/crawl/test_resurrect_rules.py @@ -104,6 +104,31 @@ async def test_crawl_keeps_fail_reason_on_404_skip(set_now): assert result.get("failing_since") == existing["failing_since"] +@pytest.mark.asyncio +async def test_crawl_error_adopts_registry_source_when_missing(set_now, monkeypatch): + entry = { + "name": "MissingSource", + "details": "https://github.com/example/missing-source", + "releases": [{"sublime_text": "*", "branch": True}], + "source": "https://raw.githubusercontent.com/wbond/package_control_channel/refs/heads/master/repository.json", + "schema_version": "3.0.0", + } + existing = { + "name": "MissingSource", + "details": "https://github.com/example/missing-source", + } + + async def stub(*args, **kwargs): + raise RuntimeError("boom") + + monkeypatch.setattr("scripts.crawl.crawl_package", stub) + set_now("2024-06-01T00:00:00Z") + + result = await crawl(object(), entry, existing) + + assert result.get("source") == entry["source"] + + @pytest.mark.asyncio async def test_removed_package_is_resurrected_on_trusted_source(set_now, set_github_info): registry = { From 3c969add75cb9fe006ba54f5076397b9be39aa30 Mon Sep 17 00:00:00 2001 From: herr kaste Date: Fri, 27 Mar 2026 11:39:58 +0100 Subject: [PATCH 03/30] Skip tombstoned packages in crawl scheduling Exclude registry entries with a removed field from the scheduler in both normal and presto modes. Also block explicit --name crawls for tombstoned packages with a clear message so manual runs follow the same tombstone rule. Add focused scheduler tests that verify removed entries are skipped and that the next-run hint ignores tombstoned packages. 
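
The scheduling rule itself is only a membership test on the raw registry
entry. A minimal sketch of the predicate, assuming plain dict entries
(is_tombstoned is an illustrative name; the real code inlines the check
in the comprehensions):

    def is_tombstoned(entry: dict) -> bool:
        # An entry counts as a tombstone as soon as it carries a
        # "removed" timestamp; the value itself is not inspected.
        return "removed" in entry

    packages = [
        {"name": "Alive"},
        {"name": "Gone", "removed": "2025-01-01T00:00:00Z"},
    ]
    schedulable = [pkg for pkg in packages if not is_tombstoned(pkg)]
    assert [pkg["name"] for pkg in schedulable] == ["Alive"]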
--- scripts/crawl.py | 2 + tests/crawl/test_next_packages_to_crawl.py | 76 ++++++++++++++++++++++ 2 files changed, 78 insertions(+) create mode 100644 tests/crawl/test_next_packages_to_crawl.py diff --git a/scripts/crawl.py b/scripts/crawl.py index 39a7ffcd9..b7dd5cb94 100644 --- a/scripts/crawl.py +++ b/scripts/crawl.py @@ -305,6 +305,7 @@ def next_packages_to_crawl( entry for entry in packages if not entry.get("fetching_source_failed") + if "removed" not in entry if presto or ( workspace["packages"] # type: ignore[call-overload] .get(entry["name"], {}) @@ -322,6 +323,7 @@ def next_packages_to_crawl( entry for entry in packages if not entry.get("fetching_source_failed") + if "removed" not in entry ), key=lambda pkg: ( workspace["packages"] # type: ignore[call-overload] diff --git a/tests/crawl/test_next_packages_to_crawl.py b/tests/crawl/test_next_packages_to_crawl.py new file mode 100644 index 000000000..8f5f92215 --- /dev/null +++ b/tests/crawl/test_next_packages_to_crawl.py @@ -0,0 +1,76 @@ +from scripts.crawl import next_packages_to_crawl + + +def test_next_packages_to_crawl_skips_removed_entries(set_now): + set_now("2026-03-26T00:00:00Z") + + registry = { + "packages": [ + make_registry_entry("Alive"), + make_registry_entry("Gone", removed="2025-01-01T00:00:00Z"), + ] + } + workspace = {"packages": {}, "libraries": {}} + + result = next_packages_to_crawl(registry, workspace, limit=200, presto=False) + + assert [entry["name"] for entry in result] == ["Alive"] + + +def test_next_packages_to_crawl_skips_removed_entries_in_presto_mode(set_now): + set_now("2026-03-26T00:00:00Z") + + registry = { + "packages": [ + make_registry_entry("Alive"), + make_registry_entry("Gone", removed="2025-01-01T00:00:00Z"), + ] + } + workspace = { + "packages": { + "Alive": {"name": "Alive", "last_seen": "2026-03-25T23:00:00Z"}, + "Gone": {"name": "Gone", "last_seen": "2026-03-25T22:00:00Z"}, + }, + "libraries": {}, + } + + result = next_packages_to_crawl(registry, workspace, limit=200, presto=True) + + assert [entry["name"] for entry in result] == ["Alive"] + + +def test_next_package_hint_ignores_removed_entries(set_now, capsys): + set_now("2026-03-26T00:00:00Z") + + registry = { + "packages": [ + make_registry_entry("Gone", removed="2025-01-01T00:00:00Z"), + make_registry_entry("Alive"), + ] + } + workspace = { + "packages": { + "Gone": {"name": "Gone", "next_crawl": "2026-03-26T00:01:00Z"}, + "Alive": {"name": "Alive", "next_crawl": "2026-03-26T00:10:00Z"}, + }, + "libraries": {}, + } + + result = next_packages_to_crawl(registry, workspace, limit=200, presto=False) + + assert result == [] + out = capsys.readouterr().out + assert "Next package runs in 10 minutes." in out + + +def make_registry_entry(name: str, removed: str | None = None): + entry = { + "name": name, + "details": f"https://github.com/example/{name.lower()}", + "releases": [{"sublime_text": "*", "branch": True}], + "source": "https://raw.githubusercontent.com/wbond/package_control_channel/refs/heads/master/repository.json", + "schema_version": "3.0.0", + } + if removed: + entry["removed"] = removed + return entry From 65d7ea94081ed26d7d56d7e580b1750d8d1b7157 Mon Sep 17 00:00:00 2001 From: herr kaste Date: Fri, 27 Mar 2026 11:43:00 +0100 Subject: [PATCH 04/30] Reject tombstoned package in --name mode Keep --name handling simple and explicit: if the selected registry package is tombstoned, print a clear message and return without crawl. 
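
In effect, --name now applies the same tombstone predicate the scheduler
uses before anything is crawled. A rough sketch of the check order
(select_package is a hypothetical helper; the real checks are inlined in
main_()):

    import sys

    def select_package(registry: dict, name: str) -> dict | None:
        package = next(
            (pkg for pkg in registry["packages"] if pkg["name"] == name),
            None,
        )
        if not package:
            print(f"Package '{name}' not found in registry.", file=sys.stderr)
            return None
        if "removed" in package:
            print(f"Package '{name}' is tombstoned in the registry.", file=sys.stderr)
            return None
        return package
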
Add a focused regression test for main_() that verifies tombstoned packages are rejected in name mode and workspace remains unchanged. --- scripts/crawl.py | 3 +++ tests/crawl/test_main_name_tombstoned.py | 22 ++++++++++++++++++++++ 2 files changed, 25 insertions(+) create mode 100644 tests/crawl/test_main_name_tombstoned.py diff --git a/scripts/crawl.py b/scripts/crawl.py index b7dd5cb94..91f6b1896 100644 --- a/scripts/crawl.py +++ b/scripts/crawl.py @@ -251,6 +251,9 @@ async def main_( if not package: err(f"Package '{name}' not found in registry.") return + if "removed" in package: + err(f"Package '{name}' is tombstoned in the registry.") + return tocrawl = [package] else: maintenance(registry, workspace) diff --git a/tests/crawl/test_main_name_tombstoned.py b/tests/crawl/test_main_name_tombstoned.py new file mode 100644 index 000000000..f17f5c507 --- /dev/null +++ b/tests/crawl/test_main_name_tombstoned.py @@ -0,0 +1,22 @@ +import pytest + +from scripts import crawl as crawl_script + + +@pytest.mark.asyncio +async def test_main_name_rejects_tombstoned_package(capsys): + registry = { + "packages": [ + { + "name": "Gone", + "removed": "2025-01-01T00:00:00Z", + } + ] + } + workspace = {"packages": {}, "libraries": {}} + + await crawl_script.main_(registry, workspace, "Gone", limit=1) + + err = capsys.readouterr().err + assert "Package 'Gone' is tombstoned in the registry." in err + assert workspace["packages"] == {} From 51e0203b5e5efeb864aa174c7c7fc411cdf3763f Mon Sep 17 00:00:00 2001 From: herr kaste Date: Fri, 27 Mar 2026 11:48:47 +0100 Subject: [PATCH 05/30] Handle tombstones in explain mode Teach explain_main() to treat tombstoned registry entries explicitly. For tombstones, print a clear status line to stderr. In normal mode, print the raw entry as pretty JSON. In EFFECTIVE mode, print only the status line and no JSON payload since there is no effective release view. Keep this path simple by inlining the tombstone JSON print instead of adding a helper wrapper. 
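
Condensed, the tombstone branch behaves like this standalone sketch
(explain_tombstone is illustrative only; as noted above, the real code
stays inlined in explain_main()):

    import json
    import sys

    def explain_tombstone(name: str, package: dict, effective_mode: bool) -> int:
        # The status line always goes to stderr so stdout stays parseable.
        print(f"Package '{name}' is tombstoned in the registry.", file=sys.stderr)
        if not effective_mode:
            # Normal mode dumps the raw registry entry as pretty JSON.
            # EFFECTIVE mode prints nothing on stdout because a tombstone
            # has no effective release view.
            print(json.dumps(package, indent=2, ensure_ascii=False, sort_keys=True))
        return 0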
--- scripts/crawl.py | 9 ++++- tests/crawl/test_explain_effective.py | 50 +++++++++++++++++++++++++++ 2 files changed, 58 insertions(+), 1 deletion(-) diff --git a/scripts/crawl.py b/scripts/crawl.py index 91f6b1896..f4afdd4ec 100644 --- a/scripts/crawl.py +++ b/scripts/crawl.py @@ -133,8 +133,15 @@ def explain_main(registry: str, name: str) -> int: err(f"Package '{name}' not found in registry.") return 1 + effective_mode = env_flag(EXPLAIN_EFFECTIVE_ENV) + if "removed" in package: + err(f"Package '{name}' is tombstoned in the registry.") + if not effective_mode: + print(json.dumps(package, indent=2, ensure_ascii=False, sort_keys=True)) + return 0 + normalized = normalize_registry_entry(deepcopy(package)) - if env_flag(EXPLAIN_EFFECTIVE_ENV): + if effective_mode: print_package_explain_effective(name, normalized) # type: ignore[arg-type] return 0 diff --git a/tests/crawl/test_explain_effective.py b/tests/crawl/test_explain_effective.py index e0c939ca5..8a506f2c2 100644 --- a/tests/crawl/test_explain_effective.py +++ b/tests/crawl/test_explain_effective.py @@ -114,3 +114,53 @@ def test_explain_main_effective_mode_omits_status_line_when_not_tags_mode( assert out.startswith("{") explained = json.loads(out) assert explained["name"] == "NoTagsPkg" + + +def test_explain_main_tombstoned_pretty_prints_raw_entry(capsys, tmp_path) -> None: + registry = { + "packages": [ + { + "name": "GonePkg", + "first_seen": "2020-01-01T00:00:00Z", + "removed": "2021-01-01T00:00:00Z", + "labels": ["theme"], + } + ] + } + registry_path = tmp_path / "registry.json" + registry_path.write_text(json.dumps(registry), encoding="utf-8") + + assert explain_main(str(registry_path), "GonePkg") == 0 + + captured = capsys.readouterr() + assert "Package 'GonePkg' is tombstoned in the registry." in captured.err + assert captured.out.startswith("{\n") + explained = json.loads(captured.out) + assert explained == registry["packages"][0] + + +def test_explain_main_tombstoned_effective_mode_emits_only_status( + monkeypatch, + capsys, + tmp_path, +) -> None: + registry = { + "packages": [ + { + "name": "GonePkg", + "first_seen": "2020-01-01T00:00:00Z", + "removed": "2021-01-01T00:00:00Z", + "labels": ["theme"], + } + ] + } + registry_path = tmp_path / "registry.json" + registry_path.write_text(json.dumps(registry), encoding="utf-8") + + monkeypatch.setenv("EFFECTIVE", "1") + + assert explain_main(str(registry_path), "GonePkg") == 0 + + captured = capsys.readouterr() + assert "Package 'GonePkg' is tombstoned in the registry." in captured.err + assert captured.out == "" From 6ddc81ebd3fd82c014690ae0700c11f7007d5f0f Mon Sep 17 00:00:00 2001 From: herr kaste Date: Fri, 27 Mar 2026 12:01:00 +0100 Subject: [PATCH 06/30] Move explain helpers into _explain_package Move the effective explain logic and its helper functions out of crawl.py and into _explain_package.py so the explain-specific code lives together in one place. As part of that extraction, move the shared sublime_text selector parsing helpers into _utils.py so both crawl runtime logic and explain logic use the same implementation. Update the explain tests to import the helper-facing functions from _explain_package. 
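
The selector helper keeps its behavior across the move; for example
(import path per this change, expected values per the implementation):

    from scripts._utils import parse_sublime_text_max

    assert parse_sublime_text_max("*") == float("inf")
    assert parse_sublime_text_max(None) == float("inf")
    assert parse_sublime_text_max("3000 - 4095") == 4095.0
    assert parse_sublime_text_max("<4000") == 3999.0
    assert parse_sublime_text_max(">=4107") == float("inf")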
--- scripts/_explain_package.py | 72 +++++++++++++++- scripts/_utils.py | 35 ++++++++ scripts/crawl.py | 115 +++----------------------- tests/crawl/test_explain_effective.py | 4 +- 4 files changed, 118 insertions(+), 108 deletions(-) diff --git a/scripts/_explain_package.py b/scripts/_explain_package.py index 69fdc26f5..7567a31ef 100644 --- a/scripts/_explain_package.py +++ b/scripts/_explain_package.py @@ -1,8 +1,11 @@ from __future__ import annotations +from copy import deepcopy from difflib import SequenceMatcher import json -from typing import Any +from typing import Any, Literal + +from ._utils import parse_sublime_text_max from rich import box from rich.console import Console @@ -32,6 +35,64 @@ def print_package_explain( ) +def print_package_explain_effective(name: str, normalized: dict[str, Any]) -> None: + releases = normalized.get("releases", []) + sorted_releases = sorted_release_definitions(releases) + tags_mode = classify_tags_mode(sorted_releases) + + normalized_effective = deepcopy(normalized) + normalized_effective["releases"] = keep_newest_release_definitions(sorted_releases) + + if tags_mode: + effectively = "(effectively) " if tags_mode == "effective" else "" + print(f"{name} uses {effectively}the tags-mode.") + print(json.dumps(normalized_effective, ensure_ascii=False, sort_keys=True)) + + +def classify_tags_mode( + sorted_releases: list[dict[str, Any]], +) -> bool | Literal["effective"]: + if not sorted_releases: + return False + + if all(release_uses_tags_mode(release) for release in sorted_releases): + return True + + if release_uses_tags_mode(sorted_releases[-1]): + return "effective" + + return False + + +def sorted_release_definitions(releases: list[dict[str, Any]]) -> list[dict[str, Any]]: + return sorted(releases, key=release_definition_sort_key) + + +def keep_newest_release_definitions( + releases: list[dict[str, Any]], +) -> list[dict[str, Any]]: + if not releases: + return [] + + newest_build = parse_sublime_text_max(releases[-1].get("sublime_text")) + return [ + release + for release in releases + if parse_sublime_text_max(release.get("sublime_text")) == newest_build + ] + + +def release_definition_sort_key(release: dict[str, Any]) -> tuple[float, str]: + return ( + parse_sublime_text_max(release.get("sublime_text")), + _tags_sort_value(release.get("tags")), + ) + + +def release_uses_tags_mode(release: dict[str, Any]) -> bool: + return bool(release.get("tags", False)) + + def print_library_explain( name: str, rows: list[tuple[dict[str, Any], list[dict[str, Any]]]], @@ -85,6 +146,15 @@ def print_library_explain( console.print(table) +def _tags_sort_value(value: Any) -> str: + # Place plain `True` after common prefixes like `st2-`. 
+ if value is True: + return "~~true" + if isinstance(value, str): + return value + return "" + + def _render_json_diff_table( title: str, left_obj: dict[str, Any] | list[Any], diff --git a/scripts/_utils.py b/scripts/_utils.py index 979497940..2ed1dd232 100644 --- a/scripts/_utils.py +++ b/scripts/_utils.py @@ -278,6 +278,41 @@ def is_semver(s: str) -> bool: return parse_version(s) is not None +def parse_sublime_text_max(selector: object) -> float: + if not isinstance(selector, str): + return float("inf") + + s = re.sub(r"\s+", "", selector) + if s in ("", "*"): + return float("inf") + + range_index = s.find("-") + if range_index != -1: + right = s[range_index + 1:] + n = parse_int_prefix(right) + return float(n) if n is not None else float("inf") + + if s.startswith("<="): + n = parse_int_prefix(s[2:]) + return float(n) if n is not None else float("inf") + + if s.startswith("<"): + n = parse_int_prefix(s[1:]) + return float(max(0, n - 1)) if n is not None else float("inf") + + if s.startswith(">=") or s.startswith(">"): + return float("inf") + + n = parse_int_prefix(s) + return float(n) if n is not None else float("inf") + + +def parse_int_prefix(text: str) -> int | None: + if match := re.match(r"^\d+", text): + return int(match.group(0)) + return None + + SECONDS_PER_DAY = 24 * 60 * 60 diff --git a/scripts/crawl.py b/scripts/crawl.py index f4afdd4ec..f461f92f8 100644 --- a/scripts/crawl.py +++ b/scripts/crawl.py @@ -10,7 +10,7 @@ import os import re import sys -from typing import Any, Literal, Mapping, NotRequired, Required, TypedDict +from typing import Literal, Mapping, NotRequired, Required, TypedDict import packaging from packaging.specifiers import SpecifierSet @@ -29,10 +29,17 @@ normalize_version_spec, ) from ._utils import ( - format_name_list, parse_version, resolve_url, update_url, write_json, pl, pick, + format_name_list, + parse_sublime_text_max, + parse_version, + resolve_url, + update_url, + write_json, + pl, + pick, VersionInfo, ) -from ._explain_package import print_package_explain +from ._explain_package import print_package_explain, print_package_explain_effective import traceback @@ -149,73 +156,6 @@ def explain_main(registry: str, name: str) -> int: return 0 -def print_package_explain_effective(name: str, normalized: dict[str, Any]) -> None: - releases = normalized.get("releases", []) - sorted_releases = sorted_release_definitions(releases) - tags_mode = classify_tags_mode(sorted_releases) - - normalized_effective = deepcopy(normalized) - normalized_effective["releases"] = keep_newest_release_definitions(sorted_releases) - - if tags_mode: - effectively = "(effectively) " if tags_mode == "effective" else "" - print(f"{name} uses {effectively}the tags-mode.") - print(json.dumps(normalized_effective, ensure_ascii=False, sort_keys=True)) - - -def classify_tags_mode( - sorted_releases: list[dict[str, Any]], -) -> bool | Literal["effective"]: - if not sorted_releases: - return False - - if all(release_uses_tags_mode(release) for release in sorted_releases): - return True - - if release_uses_tags_mode(sorted_releases[-1]): - return "effective" - - return False - - -def sorted_release_definitions(releases: list[dict[str, Any]]) -> list[dict[str, Any]]: - return sorted(releases, key=release_definition_sort_key) - - -def keep_newest_release_definitions( - releases: list[dict[str, Any]], -) -> list[dict[str, Any]]: - if not releases: - return [] - - newest_build = parse_sublime_text_max(releases[-1].get("sublime_text")) - return [ - release - for release in releases - if 
parse_sublime_text_max(release.get("sublime_text")) == newest_build - ] - - -def release_definition_sort_key(release: dict[str, Any]) -> tuple[float, str]: - return ( - parse_sublime_text_max(release.get("sublime_text")), - tags_sort_value(release.get("tags")), - ) - - -def tags_sort_value(value: Any) -> str: - # Place plain `True` after common prefixes like `st2-`. - if value is True: - return "~~true" - if isinstance(value, str): - return value - return "" - - -def release_uses_tags_mode(release: dict[str, Any]) -> bool: - return bool(release.get("tags", False)) - - async def main( registry: str, workspace: str, @@ -983,41 +923,6 @@ def maybe_make_auto_open_ended_tags_release( } -def parse_sublime_text_max(selector) -> float: - if not isinstance(selector, str): - return float("inf") - - s = re.sub(r"\s+", "", selector) - if s in ("", "*"): - return float("inf") - - range_index = s.find("-") - if range_index != -1: - right = s[range_index + 1:] - n = parse_int_prefix(right) - return float(n) if n is not None else float("inf") - - if s.startswith("<="): - n = parse_int_prefix(s[2:]) - return float(n) if n is not None else float("inf") - - if s.startswith("<"): - n = parse_int_prefix(s[1:]) - return float(max(0, n - 1)) if n is not None else float("inf") - - if s.startswith(">=") or s.startswith(">"): - return float("inf") - - n = parse_int_prefix(s) - return float(n) if n is not None else float("inf") - - -def parse_int_prefix(text: str) -> int | None: - if match := re.match(r"^\d+", text): - return int(match.group(0)) - return None - - def compile_release_asset_pattern( pattern: str, version: str, diff --git a/tests/crawl/test_explain_effective.py b/tests/crawl/test_explain_effective.py index 8a506f2c2..5d1b42c56 100644 --- a/tests/crawl/test_explain_effective.py +++ b/tests/crawl/test_explain_effective.py @@ -1,11 +1,11 @@ import json -from scripts.crawl import ( +from scripts._explain_package import ( classify_tags_mode, - explain_main, keep_newest_release_definitions, sorted_release_definitions, ) +from scripts.crawl import explain_main def test_classify_tags_mode_when_all_releases_use_tags() -> None: From 1b9ecc0653f3b1ef2a26f6cf45a32178efd7235d Mon Sep 17 00:00:00 2001 From: herr kaste Date: Fri, 27 Mar 2026 12:31:25 +0100 Subject: [PATCH 07/30] Import tombstones during maintenance Teach maintenance() to copy tombstoned registry entries into workspace.packages before the legacy orphan-marking step. This keeps removed packages present in workspace and intentionally overwrites stale crawl-only fields with the canonical tombstone data. Add focused maintenance tests for tombstone import, overwrite behavior, and continued orphan removed marking. 
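
The import step is an unconditional overwrite keyed by package name.
A minimal sketch of the semantics (import_tombstones is a hypothetical
wrapper around the new loop at the top of maintenance()):

    def import_tombstones(registry: dict, workspace: dict) -> None:
        # Tombstones win over whatever the crawler left behind:
        # the workspace entry is replaced wholesale, not merged.
        for entry in registry["packages"]:
            if "removed" in entry:
                workspace["packages"][entry["name"]] = {**entry}

    workspace = {"packages": {"Gone": {"name": "Gone", "fail_reason": "fatal: 404 Not Found"}}}
    registry = {"packages": [{"name": "Gone", "removed": "2024-01-01T00:00:00Z"}]}
    import_tombstones(registry, workspace)
    assert workspace["packages"]["Gone"] == {"name": "Gone", "removed": "2024-01-01T00:00:00Z"}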
--- scripts/crawl.py | 8 +++- tests/crawl/test_maintenance.py | 77 +++++++++++++++++++++++++++++++++ 2 files changed, 84 insertions(+), 1 deletion(-) create mode 100644 tests/crawl/test_maintenance.py diff --git a/scripts/crawl.py b/scripts/crawl.py index f461f92f8..be5375276 100644 --- a/scripts/crawl.py +++ b/scripts/crawl.py @@ -317,12 +317,18 @@ def next_packages_to_crawl( def maintenance(registry: Registry, workspace: Workspace) -> None: + packages = workspace["packages"] + + for entry in registry["packages"]: + if "removed" in entry: + packages[entry["name"]] = {**entry} # type: ignore[typeddict-item] + + # Legacy; # lookup all packages in workspace and mark them as `removed` # if they have been removed from the registry now = now_ts() now_string = now.strftime(UTC_FORMAT) current_package_names = {entry["name"] for entry in registry["packages"]} - packages = workspace["packages"] for name in packages.keys() - current_package_names: packages[name].setdefault("removed", now_string) diff --git a/tests/crawl/test_maintenance.py b/tests/crawl/test_maintenance.py new file mode 100644 index 000000000..bf5b4963d --- /dev/null +++ b/tests/crawl/test_maintenance.py @@ -0,0 +1,77 @@ +from scripts import crawl as crawl_script + + +def test_maintenance_imports_registry_tombstones_into_workspace(): + tombstone = { + "name": "Gone", + "first_seen": "2012-01-01T00:00:00Z", + "removed": "2024-01-01T00:00:00Z", + "labels": ["theme"], + "source": "https://raw.githubusercontent.com/wbond/package_control_channel/refs/heads/master/repository.json", + "schema_version": "3.0.0", + } + registry = { + "packages": [ + { + "name": "Alive", + "details": "https://github.com/example/alive", + "releases": [{"sublime_text": "*", "branch": True}], + "source": "https://raw.githubusercontent.com/wbond/package_control_channel/refs/heads/master/repository.json", + "schema_version": "3.0.0", + }, + tombstone, + ] + } + workspace = {"packages": {}, "libraries": {}} + + crawl_script.maintenance(registry, workspace) + + assert workspace["packages"]["Gone"] == tombstone + assert "Alive" not in workspace["packages"] + + +def test_maintenance_overwrites_existing_entry_with_registry_tombstone(): + tombstone = { + "name": "Gone", + "first_seen": "2012-01-01T00:00:00Z", + "removed": "2024-01-01T00:00:00Z", + "labels": ["theme"], + } + registry = {"packages": [tombstone]} + workspace = { + "packages": { + "Gone": { + "name": "Gone", + "details": "https://github.com/example/gone", + "releases": [{"version": "1.0.0"}], + "source": "https://example.com/old.json", + "first_seen": "2012-01-01T00:00:00Z", + "fail_reason": "fatal: 404 Not Found", + } + }, + "libraries": {}, + } + + crawl_script.maintenance(registry, workspace) + + assert workspace["packages"]["Gone"] == tombstone + assert "details" not in workspace["packages"]["Gone"] + + +def test_maintenance_still_marks_workspace_orphans_removed(set_now): + set_now("2026-03-27T11:00:00Z") + + registry = {"packages": []} + workspace = { + "packages": { + "Orphan": { + "name": "Orphan", + "first_seen": "2020-01-01T00:00:00Z", + } + }, + "libraries": {}, + } + + crawl_script.maintenance(registry, workspace) + + assert workspace["packages"]["Orphan"]["removed"] == "2026-03-27T11:00:00Z" From 7d568009da5f35a6e67816ccaa318c5216e92954 Mon Sep 17 00:00:00 2001 From: herr kaste Date: Fri, 27 Mar 2026 12:36:31 +0100 Subject: [PATCH 08/30] Test tombstone resurrection after maintenance import Add a regression test that imports a tombstoned package via maintenance(), then runs main_() with an active 
registry entry for the same name. Verify resurrection works without special-case code: the package is crawled, removed is cleared, source remains stable, and first_seen is preserved. --- tests/crawl/test_resurrect_rules.py | 66 ++++++++++++++++++++++++++++- 1 file changed, 65 insertions(+), 1 deletion(-) diff --git a/tests/crawl/test_resurrect_rules.py b/tests/crawl/test_resurrect_rules.py index 136569592..8f0a5685d 100644 --- a/tests/crawl/test_resurrect_rules.py +++ b/tests/crawl/test_resurrect_rules.py @@ -1,7 +1,7 @@ import pytest import scripts.crawl as crawl_mod -from scripts.crawl import SkipCrawling, crawl, main_ +from scripts.crawl import SkipCrawling, crawl, main_, maintenance @pytest.mark.asyncio @@ -183,6 +183,70 @@ async def test_removed_package_is_resurrected_on_trusted_source(set_now, set_git assert "removed" not in workspace["packages"]["Reappeared"] +@pytest.mark.asyncio +async def test_maintenance_imported_tombstone_resurrects_without_special_case( + set_now, + set_github_info, +): + source = "https://raw.githubusercontent.com/wbond/package_control_channel/refs/heads/master/repository.json" + + workspace = {"packages": {}, "libraries": {}} + registry_tombstoned = { + "packages": [ + { + "name": "Reappeared", + "first_seen": "2019-01-01T00:00:00Z", + "removed": "2024-05-01T00:00:00Z", + "source": source, + "labels": ["theme"], + } + ] + } + maintenance(registry_tombstoned, workspace) + + registry_active = { + "packages": [ + { + "name": "Reappeared", + "details": "https://github.com/example/reappeared", + "releases": [{"sublime_text": "*", "branch": True}], + "source": source, + "schema_version": "3.0.0", + } + ], + } + + set_now("2024-06-01T00:00:00Z") + set_github_info({ + "metadata": { + "id": "SAME_ID", + "name": "Reappeared", + "description": "Fixture reappeared package", + "homepage": "https://github.com/example/reappeared", + "author": "example", + "readme": "https://raw.githubusercontent.com/example/reappeared/main/README.md", + "default_branch": "main", + "stars": 0, + "created_at": "2024-01-01T00:00:00Z", + }, + "tags": [], + "branches": [ + { + "name": "main", + "date": "2024-05-31T00:00:00Z", + "url": "https://codeload.github.com/example/reappeared/zip/main", + } + ], + }) + + await main_(registry_active, workspace, None, 100) + + package = workspace["packages"]["Reappeared"] + assert "removed" not in package + assert package["source"] == source + assert package["first_seen"] == "2019-01-01T00:00:00Z" + + @pytest.mark.asyncio async def test_retry_recent_404(set_now, monkeypatch): entry = { From 3d2c29c2bf3cf265981a274a495455e82c6be715 Mon Sep 17 00:00:00 2001 From: herr kaste Date: Fri, 27 Mar 2026 13:51:12 +0100 Subject: [PATCH 09/30] Classify package and library registry changes Add describe_registry_changes.py to generate commit-message text from old/new registry snapshots. Implement change classification for both packages and libraries, including single-change messages, metadata bulk edits, and mixed bulk edits with additions, tombstones, and resurrections. Keep repositories out of primary classification, but fall back to "Update registry.json" when repositories change without any entity change. This keeps "Same." strict so it only appears when no commit is needed. Add focused tests for all supported classifications and fallback cases, using loader mocking for CLI tests. 
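
Intended usage from Python looks roughly like this (a sketch; the
entries are pared down and omit the source/schema_version/releases
fields a real registry snapshot carries):

    from scripts.describe_registry_changes import describe_registry_changes

    old = {"packages": [{"name": "Gloom"}]}
    new = {"packages": [{"name": "Gloom"}, {"name": "Dusk"}]}
    assert describe_registry_changes(old, new) == "Added `Dusk`"

    # Tombstoning is detected by the appearance of a "removed" timestamp.
    gone = {"packages": [{"name": "Gloom", "removed": "2026-01-01T00:00:00Z"}]}
    assert describe_registry_changes(old, gone) == "Tombstoned `Gloom`"

The CLI entry point wires the same classifier to the -a/--old and
-b/--new file arguments and prints the resulting message.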
--- scripts/describe_registry_changes.py | 296 ++++++++++++++++++++++++ tests/test_describe_registry_changes.py | 239 +++++++++++++++++++ 2 files changed, 535 insertions(+) create mode 100644 scripts/describe_registry_changes.py create mode 100644 tests/test_describe_registry_changes.py diff --git a/scripts/describe_registry_changes.py b/scripts/describe_registry_changes.py new file mode 100644 index 000000000..d2a8be45e --- /dev/null +++ b/scripts/describe_registry_changes.py @@ -0,0 +1,296 @@ +from __future__ import annotations + +import argparse +import json +from dataclasses import dataclass +from pathlib import Path +from typing import Any + +FALLBACK_SUBJECT = "Update registry.json" + + +def main(argv: list[str] | None = None) -> int: + args = parse_args(argv) + old_registry = read_registry_json(args.old) + new_registry = read_registry_json(args.new) + + try: + message = describe_registry_changes(old_registry, new_registry) + except Exception: + message = FALLBACK_SUBJECT + + print(message) + return 0 + + +@dataclass +class Args: + old: str + new: str + + +def parse_args(argv: list[str] | None = None) -> Args: + parser = argparse.ArgumentParser( + description="Describe registry changes.", + ) + parser.add_argument("-a", "--old", required=True, help="Path to old registry JSON") + parser.add_argument("-b", "--new", required=True, help="Path to new registry JSON") + ns = parser.parse_args(argv) + return Args(old=ns.old, new=ns.new) + + +def describe_registry_changes( + old_registry: dict[str, Any], + new_registry: dict[str, Any], +) -> str: + package_changes = collect_changes( + old_items=entity_map(old_registry, "packages"), + new_items=entity_map(new_registry, "packages"), + kind="package", + ) + library_changes = collect_changes( + old_items=entity_map(old_registry, "libraries"), + new_items=entity_map(new_registry, "libraries"), + kind="library", + ) + changes = merge_changes(package_changes, library_changes) + + repositories_changed = old_registry.get("repositories") != new_registry.get("repositories") + other_changed = strip_for_other_compare(old_registry) != strip_for_other_compare(new_registry) + + if not changes.any_changes: + if repositories_changed or other_changed: + return FALLBACK_SUBJECT + return "Same." 
+ + if changes.disappeared or other_changed: + return FALLBACK_SUBJECT + + if changes.single_added: + return f"Added `{changes.added[0]}`" + + if changes.single_tombstoned: + return f"Tombstoned `{changes.tombstoned[0]}`" + + if changes.single_resurrected: + return f"Resurrected `{changes.resurrected[0]}`" + + if changes.single_metadata_changed: + return f"Changed metadata of `{changes.metadata_changed[0]}`" + + if changes.only_metadata_changed: + return build_metadata_bulk_message(changes.metadata_changed) + + if changes.added or changes.tombstoned or changes.resurrected or changes.metadata_changed: + return build_bulk_edit_message( + added=changes.added, + tombstoned=changes.tombstoned, + resurrected=changes.resurrected, + metadata_changed=changes.metadata_changed, + ) + + return FALLBACK_SUBJECT + + +@dataclass +class ChangeSet: + added: list[str] + tombstoned: list[str] + resurrected: list[str] + metadata_changed: list[str] + disappeared: list[str] + + @property + def any_changes(self) -> bool: + return bool( + self.added + or self.tombstoned + or self.resurrected + or self.metadata_changed + or self.disappeared + ) + + @property + def single_added(self) -> bool: + return self.added_only and len(self.added) == 1 + + @property + def single_tombstoned(self) -> bool: + return self.tombstoned_only and len(self.tombstoned) == 1 + + @property + def single_resurrected(self) -> bool: + return self.resurrected_only and len(self.resurrected) == 1 + + @property + def single_metadata_changed(self) -> bool: + return self.only_metadata_changed and len(self.metadata_changed) == 1 + + @property + def added_only(self) -> bool: + return bool(self.added) and not ( + self.tombstoned or self.resurrected or self.metadata_changed + ) + + @property + def tombstoned_only(self) -> bool: + return bool(self.tombstoned) and not ( + self.added or self.resurrected or self.metadata_changed + ) + + @property + def resurrected_only(self) -> bool: + return bool(self.resurrected) and not ( + self.added or self.tombstoned or self.metadata_changed + ) + + @property + def only_metadata_changed(self) -> bool: + return bool(self.metadata_changed) and not ( + self.added or self.tombstoned or self.resurrected + ) + + +def merge_changes(*changesets: ChangeSet) -> ChangeSet: + return ChangeSet( + added=sort_names(name for c in changesets for name in c.added), + tombstoned=sort_names(name for c in changesets for name in c.tombstoned), + resurrected=sort_names(name for c in changesets for name in c.resurrected), + metadata_changed=sort_names(name for c in changesets for name in c.metadata_changed), + disappeared=sort_names(name for c in changesets for name in c.disappeared), + ) + + +def read_registry_json(path: str) -> dict[str, Any]: + return json.loads(Path(path).read_text(encoding="utf-8")) + + +def collect_changes( + old_items: dict[str, dict[str, Any]], + new_items: dict[str, dict[str, Any]], + *, + kind: str, +) -> ChangeSet: + added: list[str] = [] + tombstoned: list[str] = [] + resurrected: list[str] = [] + metadata_changed: list[str] = [] + + disappeared = [display_name(kind, name) for name in old_items.keys() - new_items.keys()] + + for name in sort_names(new_items.keys() - old_items.keys()): + display = display_name(kind, name) + if is_tombstoned(new_items[name]): + tombstoned.append(display) + else: + added.append(display) + + for name in sort_names(old_items.keys() & new_items.keys()): + old_entry = old_items[name] + new_entry = new_items[name] + display = display_name(kind, name) + old_tombstoned = 
is_tombstoned(old_entry) + new_tombstoned = is_tombstoned(new_entry) + + if old_tombstoned and not new_tombstoned: + resurrected.append(display) + continue + + if not old_tombstoned and new_tombstoned: + tombstoned.append(display) + continue + + if old_entry != new_entry: + metadata_changed.append(display) + + return ChangeSet( + added=sort_names(added), + tombstoned=sort_names(tombstoned), + resurrected=sort_names(resurrected), + metadata_changed=sort_names(metadata_changed), + disappeared=sort_names(disappeared), + ) + + +def entity_map(registry: dict[str, Any], key: str) -> dict[str, dict[str, Any]]: + entries = registry.get(key, []) + return { + entry["name"]: entry + for entry in entries + } + + +def strip_for_other_compare(registry: dict[str, Any]) -> dict[str, Any]: + return { + key: value + for key, value in registry.items() + if key not in {"packages", "libraries", "repositories"} + } + + +def build_metadata_bulk_message(metadata_changed: list[str]) -> str: + lines = [ + "Metadata bulk edit", + "", + "Record metadata edits of the following packages:", + *[f"- {name}" for name in metadata_changed], + ] + return "\n".join(lines) + + +def build_bulk_edit_message( + *, + added: list[str], + tombstoned: list[str], + resurrected: list[str], + metadata_changed: list[str], +) -> str: + sections: list[str] = ["Bulk edit"] + + if added: + sections.extend([ + "", + "Record addition of following packages:", + *[f"- {name}" for name in added], + ]) + + if tombstoned: + sections.extend([ + "", + "Record tombstoning the following packages:", + *[f"- {name}" for name in tombstoned], + ]) + + if resurrected: + sections.extend([ + "", + "Record resurrection of the following packages:", + *[f"- {name}" for name in resurrected], + ]) + + if metadata_changed: + sections.extend([ + "", + "Record metadata edit of the following packages:", + *[f"- {name}" for name in metadata_changed], + ]) + + return "\n".join(sections) + + +def sort_names(names: Any) -> list[str]: + return sorted(names, key=str.casefold) + + +def display_name(kind: str, name: str) -> str: + if kind == "library": + return f"{name} (library)" + return name + + +def is_tombstoned(entry: dict[str, Any]) -> bool: + return "removed" in entry + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/tests/test_describe_registry_changes.py b/tests/test_describe_registry_changes.py new file mode 100644 index 000000000..d4f13b9ba --- /dev/null +++ b/tests/test_describe_registry_changes.py @@ -0,0 +1,239 @@ +import scripts.describe_registry_changes as describe_script + + +def test_describe_registry_changes_same() -> None: + old = {"packages": [pkg("A")]} + new = {"packages": [pkg("A")]} + + assert describe_script.describe_registry_changes(old, new) == "Same." 
+ + +def test_describe_registry_changes_single_added() -> None: + old = {"packages": []} + new = {"packages": [pkg("Gloom")]} + + assert describe_script.describe_registry_changes(old, new) == "Added `Gloom`" + + +def test_describe_registry_changes_single_tombstoned() -> None: + old = {"packages": [pkg("Gloom")]} + new = {"packages": [pkg("Gloom", removed="2026-01-01T00:00:00Z")]} + + assert describe_script.describe_registry_changes(old, new) == "Tombstoned `Gloom`" + + +def test_describe_registry_changes_single_resurrected() -> None: + old = {"packages": [pkg("Gloom", removed="2026-01-01T00:00:00Z")]} + new = {"packages": [pkg("Gloom")]} + + assert describe_script.describe_registry_changes(old, new) == "Resurrected `Gloom`" + + +def test_describe_registry_changes_single_metadata_change() -> None: + old = {"packages": [pkg("Gloom", labels=["theme"])]} + new = {"packages": [pkg("Gloom", labels=["theme", "dark"])]} + + assert describe_script.describe_registry_changes(old, new) == "Changed metadata of `Gloom`" + + +def test_describe_registry_changes_single_added_library() -> None: + old = {"packages": [], "libraries": []} + new = {"packages": [], "libraries": [lib("JsonSchema")]} + + assert describe_script.describe_registry_changes(old, new) == "Added `JsonSchema (library)`" + + +def test_describe_registry_changes_mixed_package_and_library_bulk_edit() -> None: + old = { + "packages": [pkg("DropMe")], + "libraries": [lib("JsonSchema", removed="2026-01-01T00:00:00Z")], + } + new = { + "packages": [pkg("DropMe", removed="2026-01-01T00:00:00Z")], + "libraries": [lib("JsonSchema")], + } + + assert describe_script.describe_registry_changes(old, new) == ( + "Bulk edit\n\n" + "Record tombstoning the following packages:\n" + "- DropMe\n\n" + "Record resurrection of the following packages:\n" + "- JsonSchema (library)" + ) + + +def test_describe_registry_changes_metadata_bulk_edit() -> None: + old = { + "packages": [ + pkg("A", labels=["x"]), + pkg("B", labels=["x"]), + ] + } + new = { + "packages": [ + pkg("A", labels=["x", "y"]), + pkg("B", labels=["x", "y"]), + ] + } + + assert describe_script.describe_registry_changes(old, new) == ( + "Metadata bulk edit\n\n" + "Record metadata edits of the following packages:\n" + "- A\n" + "- B" + ) + + +def test_describe_registry_changes_bulk_edit() -> None: + old = { + "packages": [ + pkg("DropMe"), + pkg("Meta", labels=["a"]), + ] + } + new = { + "packages": [ + pkg("DropMe", removed="2026-01-01T00:00:00Z"), + pkg("Meta", labels=["a", "b"]), + pkg("NewPkg"), + ] + } + + assert describe_script.describe_registry_changes(old, new) == ( + "Bulk edit\n\n" + "Record addition of following packages:\n" + "- NewPkg\n\n" + "Record tombstoning the following packages:\n" + "- DropMe\n\n" + "Record metadata edit of the following packages:\n" + "- Meta" + ) + + +def test_describe_registry_changes_falls_back_when_package_disappears() -> None: + old = {"packages": [pkg("A")]} + new = {"packages": []} + + assert describe_script.describe_registry_changes(old, new) == "Update registry.json" + + +def test_describe_registry_changes_falls_back_when_only_repositories_change() -> None: + old = { + "repositories": ["https://raw.githubusercontent.com/a/repository.json"], + "packages": [pkg("A")], + "libraries": [lib("L")], + } + new = { + "repositories": ["https://raw.githubusercontent.com/b/repository.json"], + "packages": [pkg("A")], + "libraries": [lib("L")], + } + + assert describe_script.describe_registry_changes(old, new) == "Update registry.json" + + +def 
test_describe_registry_changes_bulk_resurrection() -> None: + old = { + "packages": [ + pkg("A", removed="2026-01-01T00:00:00Z"), + pkg("B", removed="2026-01-01T00:00:00Z"), + ] + } + new = {"packages": [pkg("A"), pkg("B")]} + + assert describe_script.describe_registry_changes(old, new) == ( + "Bulk edit\n\n" + "Record resurrection of the following packages:\n" + "- A\n" + "- B" + ) + + +def test_describe_registry_changes_bulk_edit_with_resurrection_and_other_changes() -> None: + old = { + "packages": [ + pkg("OldGone", removed="2026-01-01T00:00:00Z"), + pkg("KeepMeta", labels=["a"]), + ] + } + new = { + "packages": [ + pkg("OldGone"), + pkg("KeepMeta", labels=["a", "b"]), + pkg("NewPkg"), + ] + } + + assert describe_script.describe_registry_changes(old, new) == ( + "Bulk edit\n\n" + "Record addition of following packages:\n" + "- NewPkg\n\n" + "Record resurrection of the following packages:\n" + "- OldGone\n\n" + "Record metadata edit of the following packages:\n" + "- KeepMeta" + ) + + +def test_main_reads_both_registries_via_loader_and_prints_message(capsys, monkeypatch) -> None: + old = {"packages": []} + new = {"packages": [pkg("Gloom")]} + + def fake_read(path: str): + if path == "old_registry.json": + return old + if path == "new_registry.json": + return new + raise AssertionError(path) + + monkeypatch.setattr(describe_script, "read_registry_json", fake_read) + + rc = describe_script.main(["-a", "old_registry.json", "-b", "new_registry.json"]) + + assert rc == 0 + assert capsys.readouterr().out.strip() == "Added `Gloom`" + + +def test_main_falls_back_when_classifier_crashes(capsys, monkeypatch) -> None: + monkeypatch.setattr(describe_script, "read_registry_json", lambda path: {"packages": []}) + + def boom(old_registry, new_registry): + raise RuntimeError("boom") + + monkeypatch.setattr(describe_script, "describe_registry_changes", boom) + + rc = describe_script.main(["-a", "old_registry.json", "-b", "new_registry.json"]) + + assert rc == 0 + assert capsys.readouterr().out.strip() == "Update registry.json" + + +def pkg( + name: str, + *, + removed: str | None = None, + labels: list[str] | None = None, +) -> dict[str, object]: + entry: dict[str, object] = { + "name": name, + "source": "https://raw.githubusercontent.com/wbond/package_control_channel/refs/heads/master/repository.json", + "schema_version": "3.0.0", + "releases": [{"url": f"https://example.com/{name}.zip", "date": "2026-01-01T00:00:00Z"}], + } + if removed is not None: + entry["removed"] = removed + if labels is not None: + entry["labels"] = labels + return entry + + +def lib(name: str, *, removed: str | None = None) -> dict[str, object]: + entry: dict[str, object] = { + "name": name, + "source": "https://raw.githubusercontent.com/wbond/package_control_channel/refs/heads/master/repository.json", + "schema_version": "4.0.0", + "releases": [{"version": "1.0.0"}], + } + if removed is not None: + entry["removed"] = removed + return entry From 4efb2bf83d87b12e76a036a3d99d22c09a40a868 Mon Sep 17 00:00:00 2001 From: herr kaste Date: Sat, 28 Mar 2026 12:35:37 +0100 Subject: [PATCH 10/30] Add seeded lifecycle mode to generate_registry Implement implicit seed loading in generate_registry based on --output, with explicit overrides via --seed and opt-out via --no-seed. In seeded mode, preserve package first_seen, synthesize tombstones for missing packages, preserve tombstone removed timestamps, and keep resurrection first_seen. Libraries remain non-tombstoned. 
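
Reduced to the package list, the lifecycle merge looks like this (a
sketch using the new apply_seed_lifecycle helper; URLs and timestamps
are made up):

    from scripts.generate_registry import apply_seed_lifecycle

    fetched = [
        {"name": "Keep", "source": "https://example.com/repo.json", "schema_version": "3.0.0"},
    ]
    seed = {"packages": [
        {"name": "Keep", "first_seen": "2020-01-01T00:00:00Z"},
        {"name": "Gone", "source": "https://example.com/repo.json",
         "first_seen": "2019-01-01T00:00:00Z"},
    ]}

    out = apply_seed_lifecycle(fetched, seed, "2026-03-28T00:00:00Z")
    by_name = {pkg["name"]: pkg for pkg in out}
    assert by_name["Keep"]["first_seen"] == "2020-01-01T00:00:00Z"  # preserved from the seed
    assert by_name["Gone"]["removed"] == "2026-03-28T00:00:00Z"     # synthesized tombstone
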
Keep fetching_source_failed behavior intact and add focused registry tests that cover seed/no-seed behavior, tombstones, resurrection, library handling, and deterministic package ordering. --- scripts/generate_registry.py | 198 +++++++++++++-- tests/registry/test_generate_registry.py | 302 ++++++++++++++++++++++- 2 files changed, 484 insertions(+), 16 deletions(-) diff --git a/scripts/generate_registry.py b/scripts/generate_registry.py index 595fbc601..e31980917 100644 --- a/scripts/generate_registry.py +++ b/scripts/generate_registry.py @@ -3,13 +3,14 @@ import argparse import asyncio import aiohttp +from dataclasses import dataclass from datetime import datetime, timezone import json import os import sys import time from urllib.parse import urlparse -from typing import Callable, Iterable, Mapping, NotRequired, TypedDict +from typing import Any, Callable, Iterable, Mapping, NotRequired, TypedDict from ._utils import flatten, resolve_urls, update_url, write_json, pl @@ -31,6 +32,9 @@ class PackageEntry(TypedDict, total=False): schema_version: str name: str details: NotRequired[str] + labels: NotRequired[list[str]] + first_seen: NotRequired[IsoTimestamp] + removed: NotRequired[IsoTimestamp] fetching_source_failed: NotRequired[IsoTimestamp] @@ -47,6 +51,12 @@ class RepositorySchema(TypedDict): libraries: list[PackageEntry] +@dataclass +class SeedLoad: + db: dict[str, Any] + available: bool + + def parse_args() -> argparse.Namespace: parser = argparse.ArgumentParser( description="Generate a registry of Sublime Text packages." @@ -67,30 +77,56 @@ def parse_args() -> argparse.Namespace: "If not given, uses the official channel from wbond/package_control_channel." ), ) + parser.add_argument( + "--seed", + nargs="?", + const="", + default=None, + help=( + "Optional path to seed JSON. If provided without a value, defaults to --output. " + "Explicit seed paths must exist and be readable." 
+ ), + ) + parser.add_argument( + "--no-seed", + action="store_true", + help="Disable lifecycle enrichment and emit raw registry output.", + ) return parser.parse_args() -async def main(output_file: str, channels: list[str]) -> None: - # Try to read previous db if it exists - try: - with open(output_file, 'r') as f: - prev_db = json.load(f) - except (OSError, json.JSONDecodeError): - prev_db = {} +async def main( + output_file: str, + channels: list[str], + *, + seed_path: str | None = None, + no_seed: bool = False, +) -> None: + effective_seed_path, explicit_seed = resolve_seed_path( + output_file=output_file, + seed_path=seed_path, + ) + seed = read_seed_db(effective_seed_path, explicit=explicit_seed) try: async with asyncio.timeout(GLOBAL_TIMEOUT): - db = await fetch_packages(channels, prev_db) + db = await fetch_packages(channels, seed.db if seed.available else {}) + if seed.available and not no_seed: + db["packages"] = apply_seed_lifecycle( + db["packages"], + seed.db, + now_utc_string(), + ) write_json(output_file, db, pretty=True, ensure_ascii=True) print(f"Saved registry as {output_file}") except asyncio.TimeoutError: print(f"Timeout: script took more than {GLOBAL_TIMEOUT} seconds") -async def fetch_packages(channels: list[str], db: Registry = None) -> Registry: +async def fetch_packages(channels: list[str], db: Mapping[str, Any] | None = None) -> Registry: print("Fetching registered packages...") now = time.monotonic() - now_string = datetime.now(timezone.utc).strftime(UTC_FORMAT) + now_string = now_utc_string() async with aiohttp.ClientSession() as session: # Fetch repositories from all channels in parallel @@ -153,11 +189,11 @@ def add(entry: PackageEntry) -> None: # recreate the repo from db fail_info: PackageEntry fail_info = {"fetching_source_failed": now_string} - for pkg in db.get("packages", []): + for pkg in iter_seed_entries(db, "packages"): if pkg.get("source") == url: add_package(fail_info | pkg) - for library in db.get("libraries", []): + for library in iter_seed_entries(db, "libraries"): if library.get("source") == url: add_library(fail_info | library) @@ -269,6 +305,128 @@ async def http_get(location: str, session: aiohttp.ClientSession) -> str: return await resp.text() +def resolve_seed_path(output_file: str, seed_path: str | None) -> tuple[str, bool]: + if seed_path is None: + return output_file, False + + if seed_path == "": + return output_file, True + + return os.path.abspath(seed_path), True + + +def read_seed_db(path: str, *, explicit: bool) -> SeedLoad: + try: + text = open(path, "r", encoding="utf-8").read() + except OSError as exc: + if explicit: + raise FileNotFoundError(f"Could not read explicit seed path: {path}") from exc + return SeedLoad(db={}, available=False) + + try: + data = json.loads(text) + except json.JSONDecodeError as exc: + if explicit: + raise ValueError(f"Explicit seed is not valid JSON: {path}") from exc + return SeedLoad(db={}, available=False) + + if not isinstance(data, dict): + if explicit: + raise ValueError(f"Explicit seed JSON must be an object: {path}") + return SeedLoad(db={}, available=False) + + return SeedLoad(db=data, available=True) + + +def apply_seed_lifecycle( + packages: list[PackageEntry], + seed_db: Mapping[str, Any], + now_string: IsoTimestamp, +) -> list[PackageEntry]: + seed_packages = extract_seed_packages(seed_db) + current = { + pkg["name"]: dict(pkg) + for pkg in packages + if isinstance(pkg.get("name"), str) + } + + for name, package in current.items(): + seed = seed_packages.get(name) + if seed and 
(first_seen := seed.get("first_seen")): + package["first_seen"] = first_seen + elif "removed" not in package: + package["first_seen"] = now_string + + if "removed" not in package: + package.pop("removed", None) + + for name, seed in seed_packages.items(): + if name not in current: + current[name] = build_tombstone(seed, now_string) + + return sorted(current.values(), key=package_name_sort_key) + + +def extract_seed_packages(seed_db: Mapping[str, Any]) -> dict[str, PackageEntry]: + out: dict[str, PackageEntry] = {} + for entry in iter_seed_entries(seed_db, "packages"): + if not isinstance(name := entry.get("name"), str): + continue + + seed: PackageEntry = {"name": name} + if isinstance(source := entry.get("source"), str): + seed["source"] = source + if isinstance(first_seen := entry.get("first_seen"), str): + seed["first_seen"] = first_seen + if isinstance(removed := entry.get("removed"), str): + seed["removed"] = removed + if isinstance(labels := entry.get("labels"), list): + seed["labels"] = [str(label) for label in labels] + + out[name] = seed + return out + + +def iter_seed_entries(seed_db: Mapping[str, Any], kind: str) -> Iterable[PackageEntry]: + entries = seed_db.get(kind) + if isinstance(entries, list): + for entry in entries: + if isinstance(entry, dict): + yield entry + return + + if isinstance(entries, dict): + for name, entry in entries.items(): + if isinstance(entry, dict): + yield {"name": str(name)} | entry + return + + if kind == "packages" and "packages" not in seed_db: + for name, entry in seed_db.items(): + if isinstance(entry, dict): + yield {"name": str(name)} | entry + + +def build_tombstone(seed: PackageEntry, now_string: IsoTimestamp) -> PackageEntry: + tombstone: PackageEntry = { + "name": seed["name"], + "source": str(seed.get("source", "")), + "first_seen": str(seed.get("first_seen", now_string)), + "removed": str(seed.get("removed", now_string)), + } + if labels := seed.get("labels"): + tombstone["labels"] = labels + return tombstone + + +def package_name_sort_key(entry: Mapping[str, Any]) -> str: + return str(entry.get("name", "")).casefold() + + +def now_utc_string() -> IsoTimestamp: + return datetime.now(timezone.utc).strftime(UTC_FORMAT) + + def err(*args, **kwargs) -> None: print(*args, **kwargs, file=sys.stderr) @@ -308,4 +466,16 @@ def extend(self, items: Iterable[T]) -> Iterable[T]: args = parse_args() output_file = os.path.abspath(args.output) channels = args.channel if args.channel else [DEFAULT_CHANNEL] - asyncio.run(main(output_file, channels)) + seed_path = ( + None + if args.seed is None + else (output_file if args.seed == "" else os.path.abspath(args.seed)) + ) + asyncio.run( + main( + output_file, + channels, + seed_path=seed_path, + no_seed=args.no_seed, + ) + ) diff --git a/tests/registry/test_generate_registry.py b/tests/registry/test_generate_registry.py index 895204f8d..92ea7b631 100644 --- a/tests/registry/test_generate_registry.py +++ b/tests/registry/test_generate_registry.py @@ -31,7 +31,12 @@ def make_channel(path: Path, repositories: list[Path]): path.write_text(json.dumps(channel_data)) -def make_repository(path: Path, package_names: list[str]): +def make_repository( + path: Path, + package_names: list[str], + *, + library_names: list[str] | None = None, +): repo_data = { "schema_version": "3.0.0", "packages": [ @@ -41,7 +46,12 @@ def make_repository(path: Path, package_names: list[str]): } for name in package_names ], - "libraries": [] + "libraries": [ + { + "name": name, + } + for name in (library_names or []) + ], } 
path.write_text(json.dumps(repo_data)) @@ -234,3 +244,291 @@ async def test_main_with_successful_repo_and_last_run_clears_fetching_source_fai result = json.load(f) # Package should no longer have fetching_source_failed assert "fetching_source_failed" not in result["packages"][0] + + +@pytest.mark.asyncio +async def test_implicit_seed_preserves_lifecycle_data(tmp_path): + repo_path = tmp_path / "repo.json" + make_repository(repo_path, ["Keep", "New"]) + channel_path = tmp_path / "channel.json" + make_channel(channel_path, [repo_path]) + output_file = tmp_path / "registry.json" + output_file.write_text(json.dumps({ + "repositories": [repo_path.as_uri()], + "packages": [{ + "name": "Keep", + "source": repo_path.as_uri(), + "schema_version": "3.0.0", + "details": "https://github.com/example/Keep", + "first_seen": "2020-01-01T00:00:00Z", + }], + "libraries": [], + })) + + await main(str(output_file), [channel_path.as_uri()]) + + result = json.loads(output_file.read_text()) + by_name = {pkg["name"]: pkg for pkg in result["packages"]} + assert by_name["Keep"]["first_seen"] == "2020-01-01T00:00:00Z" + datetime.strptime(by_name["New"]["first_seen"], "%Y-%m-%dT%H:%M:%SZ") + + +@pytest.mark.asyncio +async def test_explicit_seed_overrides_output_path(tmp_path): + repo_path = tmp_path / "repo.json" + make_repository(repo_path, ["Keep"]) + channel_path = tmp_path / "channel.json" + make_channel(channel_path, [repo_path]) + + output_file = tmp_path / "output.json" + output_file.write_text(json.dumps({ + "repositories": [repo_path.as_uri()], + "packages": [{ + "name": "Keep", + "source": repo_path.as_uri(), + "schema_version": "3.0.0", + "first_seen": "2011-01-01T00:00:00Z", + }], + "libraries": [], + })) + + seed_file = tmp_path / "seed.json" + seed_file.write_text(json.dumps({ + "repositories": [repo_path.as_uri()], + "packages": [{ + "name": "Keep", + "source": repo_path.as_uri(), + "schema_version": "3.0.0", + "first_seen": "2010-01-01T00:00:00Z", + }], + "libraries": [], + })) + + await main(str(output_file), [channel_path.as_uri()], seed_path=str(seed_file)) + + result = json.loads(output_file.read_text()) + assert result["packages"][0]["first_seen"] == "2010-01-01T00:00:00Z" + + +@pytest.mark.asyncio +async def test_explicit_missing_seed_fails_hard(tmp_path): + repo_path = tmp_path / "repo.json" + make_repository(repo_path, ["Keep"]) + channel_path = tmp_path / "channel.json" + make_channel(channel_path, [repo_path]) + output_file = tmp_path / "output.json" + + with pytest.raises(FileNotFoundError): + await main( + str(output_file), + [channel_path.as_uri()], + seed_path=str(tmp_path / "missing.json"), + ) + + +@pytest.mark.asyncio +async def test_no_seed_outputs_raw_registry_without_lifecycle_fields(tmp_path): + repo_path = tmp_path / "repo.json" + make_repository(repo_path, ["Keep"]) + channel_path = tmp_path / "channel.json" + make_channel(channel_path, [repo_path]) + + output_file = tmp_path / "registry.json" + output_file.write_text(json.dumps({ + "repositories": [repo_path.as_uri()], + "packages": [ + { + "name": "Keep", + "source": repo_path.as_uri(), + "schema_version": "3.0.0", + "first_seen": "2010-01-01T00:00:00Z", + }, + { + "name": "Gone", + "source": repo_path.as_uri(), + "first_seen": "2011-01-01T00:00:00Z", + }, + ], + "libraries": [], + })) + + await main(str(output_file), [channel_path.as_uri()], no_seed=True) + + result = json.loads(output_file.read_text()) + by_name = {pkg["name"]: pkg for pkg in result["packages"]} + assert "first_seen" not in by_name["Keep"] + assert "Gone" 
not in by_name + + +@pytest.mark.asyncio +async def test_package_disappearance_creates_minimal_tombstone(tmp_path): + repo_path = tmp_path / "repo.json" + make_repository(repo_path, []) + channel_path = tmp_path / "channel.json" + make_channel(channel_path, [repo_path]) + + output_file = tmp_path / "registry.json" + output_file.write_text(json.dumps({ + "repositories": [repo_path.as_uri()], + "packages": [ + { + "name": "Gone", + "source": repo_path.as_uri(), + "schema_version": "3.0.0", + "first_seen": "2010-01-01T00:00:00Z", + "labels": ["theme"], + }, + ], + "libraries": [], + })) + + await main(str(output_file), [channel_path.as_uri()]) + + result = json.loads(output_file.read_text()) + assert len(result["packages"]) == 1 + tombstone = result["packages"][0] + assert set(tombstone.keys()) == {"name", "source", "first_seen", "removed", "labels"} + assert tombstone["name"] == "Gone" + assert tombstone["source"] == repo_path.as_uri() + assert tombstone["first_seen"] == "2010-01-01T00:00:00Z" + assert tombstone["labels"] == ["theme"] + datetime.strptime(tombstone["removed"], "%Y-%m-%dT%H:%M:%SZ") + + +@pytest.mark.asyncio +async def test_existing_tombstone_keeps_removed_timestamp(tmp_path): + repo_path = tmp_path / "repo.json" + make_repository(repo_path, []) + channel_path = tmp_path / "channel.json" + make_channel(channel_path, [repo_path]) + + output_file = tmp_path / "registry.json" + output_file.write_text(json.dumps({ + "repositories": [repo_path.as_uri()], + "packages": [ + { + "name": "Gone", + "source": repo_path.as_uri(), + "first_seen": "2010-01-01T00:00:00Z", + "removed": "2020-02-02T00:00:00Z", + }, + ], + "libraries": [], + })) + + await main(str(output_file), [channel_path.as_uri()]) + + result = json.loads(output_file.read_text()) + assert result["packages"][0]["removed"] == "2020-02-02T00:00:00Z" + + +@pytest.mark.asyncio +async def test_tombstoned_package_resurrection_preserves_first_seen(tmp_path): + repo_path = tmp_path / "repo.json" + make_repository(repo_path, ["Phoenix"]) + channel_path = tmp_path / "channel.json" + make_channel(channel_path, [repo_path]) + + output_file = tmp_path / "registry.json" + output_file.write_text(json.dumps({ + "repositories": [repo_path.as_uri()], + "packages": [ + { + "name": "Phoenix", + "source": repo_path.as_uri(), + "first_seen": "2010-01-01T00:00:00Z", + "removed": "2020-02-02T00:00:00Z", + }, + ], + "libraries": [], + })) + + await main(str(output_file), [channel_path.as_uri()]) + + result = json.loads(output_file.read_text()) + phoenix = result["packages"][0] + assert phoenix["name"] == "Phoenix" + assert phoenix["first_seen"] == "2010-01-01T00:00:00Z" + assert "removed" not in phoenix + + +@pytest.mark.asyncio +async def test_disappeared_libraries_are_not_tombstoned(tmp_path): + repo_path = tmp_path / "repo.json" + make_repository(repo_path, []) + channel_path = tmp_path / "channel.json" + make_channel(channel_path, [repo_path]) + + output_file = tmp_path / "registry.json" + output_file.write_text(json.dumps({ + "repositories": [repo_path.as_uri()], + "packages": [], + "libraries": [ + { + "name": "GoneLib", + "source": repo_path.as_uri(), + "removed": "2020-02-02T00:00:00Z", + } + ], + })) + + await main(str(output_file), [channel_path.as_uri()]) + + result = json.loads(output_file.read_text()) + assert result["libraries"] == [] + + +@pytest.mark.asyncio +async def test_seeded_output_packages_are_name_sorted(tmp_path): + repo_path = tmp_path / "repo.json" + make_repository(repo_path, ["Zulu", "Bravo"]) + channel_path = tmp_path / 
"channel.json" + make_channel(channel_path, [repo_path]) + + output_file = tmp_path / "registry.json" + output_file.write_text(json.dumps({ + "repositories": [repo_path.as_uri()], + "packages": [ + { + "name": "Alpha", + "source": repo_path.as_uri(), + "first_seen": "2010-01-01T00:00:00Z", + }, + { + "name": "Zulu", + "source": repo_path.as_uri(), + "first_seen": "2011-01-01T00:00:00Z", + }, + ], + "libraries": [], + })) + + await main(str(output_file), [channel_path.as_uri()]) + + result = json.loads(output_file.read_text()) + assert [pkg["name"] for pkg in result["packages"]] == ["Alpha", "Bravo", "Zulu"] + + +@pytest.mark.asyncio +@pytest.mark.parametrize("no_seed", [False, True]) +async def test_fetching_source_failed_behavior_unchanged_with_no_seed_toggle(tmp_path, no_seed): + repo_path = tmp_path / "missing.json" + channel_path = tmp_path / "channel.json" + make_channel(channel_path, [repo_path]) + + output_file = tmp_path / "registry.json" + output_file.write_text(json.dumps({ + "repositories": [repo_path.as_uri()], + "packages": [{ + "name": "Lost", + "source": repo_path.as_uri(), + "schema_version": "3.0.0", + "details": "https://github.com/example/Lost", + }], + "libraries": [], + })) + + await main(str(output_file), [channel_path.as_uri()], no_seed=no_seed) + + result = json.loads(output_file.read_text()) + assert "fetching_source_failed" in result["packages"][0] From dea0047044f10b853f3cba429f314970d5d3ff3c Mon Sep 17 00:00:00 2001 From: herr kaste Date: Sat, 28 Mar 2026 13:05:33 +0100 Subject: [PATCH 11/30] Tighten seeded lifecycle cleanup in registry generation Simplify seeded lifecycle handling after initial implementation. Use pick() for seed extraction, inline package sorting by name, and remove a no-op removed-field cleanup path. Also ensure first_seen is populated when missing, including for tombstoned entries, while still preserving seeded first_seen when available. 
--- scripts/generate_registry.py | 46 +++++++++--------------------------- 1 file changed, 11 insertions(+), 35 deletions(-) diff --git a/scripts/generate_registry.py b/scripts/generate_registry.py index e31980917..6746b03d5 100644 --- a/scripts/generate_registry.py +++ b/scripts/generate_registry.py @@ -12,7 +12,7 @@ from urllib.parse import urlparse from typing import Any, Callable, Iterable, Mapping, NotRequired, TypedDict -from ._utils import flatten, resolve_urls, update_url, write_json, pl +from ._utils import flatten, pick, resolve_urls, update_url, write_json, pl DEFAULT_OUTPUT_FILE = "./registry.json" @@ -354,36 +354,21 @@ def apply_seed_lifecycle( seed = seed_packages.get(name) if seed and (first_seen := seed.get("first_seen")): package["first_seen"] = first_seen - elif "removed" not in package: + elif "first_seen" not in package: package["first_seen"] = now_string - if "removed" not in package: - package.pop("removed", None) - for name, seed in seed_packages.items(): if name not in current: current[name] = build_tombstone(seed, now_string) - return sorted(current.values(), key=package_name_sort_key) + return sorted(current.values(), key=lambda entry: entry["name"].casefold()) def extract_seed_packages(seed_db: Mapping[str, Any]) -> dict[str, PackageEntry]: out: dict[str, PackageEntry] = {} for entry in iter_seed_entries(seed_db, "packages"): - if not isinstance(name := entry.get("name"), str): - continue - - seed: PackageEntry = {"name": name} - if isinstance(source := entry.get("source"), str): - seed["source"] = source - if isinstance(first_seen := entry.get("first_seen"), str): - seed["first_seen"] = first_seen - if isinstance(removed := entry.get("removed"), str): - seed["removed"] = removed - if isinstance(labels := entry.get("labels"), list): - seed["labels"] = [str(label) for label in labels] - - out[name] = seed + seed = pick(("name", "source", "first_seen", "removed", "labels"), entry) + out[seed["name"]] = seed # type: ignore[assignment, index] return out @@ -398,29 +383,20 @@ def iter_seed_entries(seed_db: Mapping[str, Any], kind: str) -> Iterable[Package if isinstance(entries, dict): for name, entry in entries.items(): if isinstance(entry, dict): - yield {"name": str(name)} | entry + yield {"name": name} | entry return if kind == "packages" and "packages" not in seed_db: for name, entry in seed_db.items(): if isinstance(entry, dict): - yield {"name": str(name)} | entry + yield {"name": name} | entry def build_tombstone(seed: PackageEntry, now_string: IsoTimestamp) -> PackageEntry: - tombstone: PackageEntry = { - "name": seed["name"], - "source": str(seed.get("source", "")), - "first_seen": str(seed.get("first_seen", now_string)), - "removed": str(seed.get("removed", now_string)), - } - if labels := seed.get("labels"): - tombstone["labels"] = labels - return tombstone - - -def package_name_sort_key(entry: Mapping[str, Any]) -> str: - return str(entry.get("name", "")).casefold() + return ( + {"first_seen": now_string, "removed": now_string} # type: ignore[operator] + | pick(("name", "source", "first_seen", "removed", "labels"), seed) + ) def now_utc_string() -> IsoTimestamp: From 57922775660b7e0a9a6a64787f6135c0986d24ff Mon Sep 17 00:00:00 2001 From: herr kaste Date: Sat, 28 Mar 2026 15:39:33 +0100 Subject: [PATCH 12/30] Some type work --- scripts/crawl.py | 2 +- scripts/generate_registry.py | 73 +++++++++++++++++++++--------------- 2 files changed, 44 insertions(+), 31 deletions(-) diff --git a/scripts/crawl.py b/scripts/crawl.py index be5375276..cb98100dc 100644 --- 
a/scripts/crawl.py +++ b/scripts/crawl.py @@ -16,7 +16,7 @@ from packaging.specifiers import SpecifierSet from .bitbucket import fetch_bitbucket_info, RepoInfo as BitbucketRepoInfo -from .generate_registry import Registry, PackageEntry as RegistryEntry +from .generate_registry import Registry, RegistryEntry from .github import ( fetch_github_info, rate_limit_info, RepoInfo as GithubRepoInfo, ReleaseAssetInfo diff --git a/scripts/generate_registry.py b/scripts/generate_registry.py index 6746b03d5..5af0a4b47 100644 --- a/scripts/generate_registry.py +++ b/scripts/generate_registry.py @@ -27,7 +27,13 @@ type IsoTimestamp = str -class PackageEntry(TypedDict, total=False): +class RawRepositoryEntry(TypedDict, total=False): + name: str + details: NotRequired[str] + labels: NotRequired[list[str]] + + +class RegistryEntry(TypedDict, total=False): source: Url schema_version: str name: str @@ -38,17 +44,25 @@ class PackageEntry(TypedDict, total=False): fetching_source_failed: NotRequired[IsoTimestamp] +class SeedEntry(TypedDict): + name: str + first_seen: IsoTimestamp + source: NotRequired[Url | None] + removed: NotRequired[IsoTimestamp] + labels: NotRequired[list[str]] + + class Registry(TypedDict): repositories: list[str] - packages: list[PackageEntry] - libraries: list[PackageEntry] + packages: list[RegistryEntry] + libraries: list[RegistryEntry] class RepositorySchema(TypedDict): self: Url schema_version: str - packages: list[PackageEntry] - libraries: list[PackageEntry] + packages: list[RawRepositoryEntry] + libraries: list[RawRepositoryEntry] @dataclass @@ -150,10 +164,10 @@ async def fetch_packages(channels: list[str], db: Mapping[str, Any] | None = Non # Flatten packages and libraries, adding source, schema_version, and # ensuring a unique name. - def add_unique_(container: list[PackageEntry], kind: str) -> Callable[[PackageEntry], None]: + def add_unique_(container: list[RegistryEntry], kind: str) -> Callable[[RegistryEntry], None]: seen = set() - def add(entry: PackageEntry) -> None: + def add(entry: RegistryEntry) -> None: name = extract_package_name(entry) if name and name not in seen: seen.add(name) @@ -168,26 +182,26 @@ def add(entry: PackageEntry) -> None: return add - packages: list[PackageEntry] = [] - libraries: list[PackageEntry] = [] + packages: list[RegistryEntry] = [] + libraries: list[RegistryEntry] = [] add_package = add_unique_(packages, "Package") add_library = add_unique_(libraries, "Library") for url in repos: if repo := result.get(url): - repo_info: PackageEntry + repo_info: RegistryEntry repo_info = { "source": repo["self"], "schema_version": repo["schema_version"], } for pkg in repo["packages"]: - add_package(pkg | repo_info) + add_package(pkg | repo_info) # type: ignore[arg-type] for library in repo["libraries"]: - add_library(library | repo_info) + add_library(library | repo_info) # type: ignore[arg-type] elif db: # recreate the repo from db - fail_info: PackageEntry + fail_info: RegistryEntry fail_info = {"fetching_source_failed": now_string} for pkg in iter_seed_entries(db, "packages"): if pkg.get("source") == url: @@ -339,13 +353,14 @@ def read_seed_db(path: str, *, explicit: bool) -> SeedLoad: def apply_seed_lifecycle( - packages: list[PackageEntry], + packages: list[RegistryEntry], seed_db: Mapping[str, Any], now_string: IsoTimestamp, -) -> list[PackageEntry]: +) -> list[RegistryEntry]: seed_packages = extract_seed_packages(seed_db) + current: dict[str, RegistryEntry] current = { - pkg["name"]: dict(pkg) + pkg["name"]: pkg for pkg in packages if 
isinstance(pkg.get("name"), str) } @@ -364,35 +379,33 @@ def apply_seed_lifecycle( return sorted(current.values(), key=lambda entry: entry["name"].casefold()) -def extract_seed_packages(seed_db: Mapping[str, Any]) -> dict[str, PackageEntry]: - out: dict[str, PackageEntry] = {} +def extract_seed_packages(seed_db: Mapping[str, Any]) -> dict[str, SeedEntry]: + out: dict[str, SeedEntry] = {} for entry in iter_seed_entries(seed_db, "packages"): seed = pick(("name", "source", "first_seen", "removed", "labels"), entry) out[seed["name"]] = seed # type: ignore[assignment, index] return out -def iter_seed_entries(seed_db: Mapping[str, Any], kind: str) -> Iterable[PackageEntry]: +def iter_seed_entries(seed_db: Mapping[str, Any], kind: str) -> Iterable[RegistryEntry]: entries = seed_db.get(kind) + # Shape: registry.json if isinstance(entries, list): for entry in entries: - if isinstance(entry, dict): - yield entry - return + yield entry - if isinstance(entries, dict): + # Shape: workspace.json + elif isinstance(entries, dict): for name, entry in entries.items(): - if isinstance(entry, dict): - yield {"name": name} | entry - return + yield entry - if kind == "packages" and "packages" not in seed_db: + # Shape: seed.json + elif kind == "packages" and "packages" not in seed_db: for name, entry in seed_db.items(): - if isinstance(entry, dict): - yield {"name": name} | entry + yield entry -def build_tombstone(seed: PackageEntry, now_string: IsoTimestamp) -> PackageEntry: +def build_tombstone(seed: SeedEntry, now_string: IsoTimestamp) -> RegistryEntry: return ( {"first_seen": now_string, "removed": now_string} # type: ignore[operator] | pick(("name", "source", "first_seen", "removed", "labels"), seed) From 3a639fba887e092b8344017fea5263b899aab937 Mon Sep 17 00:00:00 2001 From: herr kaste Date: Sat, 28 Mar 2026 16:16:59 +0100 Subject: [PATCH 13/30] Document seed mode and add seed extractor Clarify generate_registry seed semantics in both CLI help and README. The docs now explain implicit vs explicit --seed behavior and the interaction with --no-seed plus fetching_source_failed. Add scripts.seed_from_workspace as a first-class script and document its usage inline with generate_registry. The script emits sparse output for optional fields and avoids writing null source values. --- README.md | 34 +++++++++++++ scripts/generate_registry.py | 7 ++- scripts/seed_from_workspace.py | 90 ++++++++++++++++++++++++++++++++++ 3 files changed, 129 insertions(+), 2 deletions(-) create mode 100644 scripts/seed_from_workspace.py diff --git a/README.md b/README.md index 4eabe6fcf..3f5a301f6 100644 --- a/README.md +++ b/README.md @@ -67,6 +67,40 @@ uv run -m scripts.generate_registry uv run -m scripts.generate_registry --output myreg.json --channel --channel ``` +`generate_registry` supports implicit lifecycle enrichment. I.e. newly discovered +packages get a `first_seen` timestamp, packages that are removed from the input +channels/repositories are re-added in a tombstoned way. Also input channels/repositories +that throw on access will lead to marked entries *if* a previous registry/seed is +present; this way we prevent packages from jumping between different sources, esp. +malicious ones. + +This behavior is on by default, however you might not notice it as we don't blindly +add "first_seen: now" stamps when there is in fact no prior registry found. 
+ +Concretely, + +- default seed path is `--output`; this way running generate_registry multiple times + from the same directory, will use the file we wrote on the last run as seed-input. + +- `--seed [PATH]` enforces seed input (supports `registry.json`, `workspace.json`, + and `seed.json`-style package maps). Will raise if the seed is not present/readable. + +- `--no-seed` disables lifecycle enrichment (`first_seen`/`removed` tombstones); + however, source-failure marking still applies when prior seed data is available. + +```bash +uv run -m scripts.generate_registry --output registry.json +uv run -m scripts.generate_registry --output registry.json --seed ./the-registry/registry.json +uv run -m scripts.generate_registry --output registry.json --no-seed +``` + +To create a compact seed from an existing workspace, use `seed_from_workspace`: + +```bash +uv run -m scripts.seed_from_workspace +uv run -m scripts.seed_from_workspace --workspace ./workspace.json --output ./seed.json +``` + --- ### 2. `crawl.py` diff --git a/scripts/generate_registry.py b/scripts/generate_registry.py index 5af0a4b47..073824007 100644 --- a/scripts/generate_registry.py +++ b/scripts/generate_registry.py @@ -97,8 +97,11 @@ def parse_args() -> argparse.Namespace: const="", default=None, help=( - "Optional path to seed JSON. If provided without a value, defaults to --output. " - "Explicit seed paths must exist and be readable." + "Seed input for lifecycle enrichment. Omit to use implicit seed mode: " + "read --output if available, otherwise continue without lifecycle fields. " + "Provide without a value to require --output as seed (fail if unreadable), " + "or provide a path to require that file. Supports registry.json, " + "workspace.json, or seed.json-style package maps." ), ) parser.add_argument( diff --git a/scripts/seed_from_workspace.py b/scripts/seed_from_workspace.py new file mode 100644 index 000000000..727c6a59f --- /dev/null +++ b/scripts/seed_from_workspace.py @@ -0,0 +1,90 @@ +from __future__ import annotations + +import argparse +import json +from pathlib import Path +from typing import Any + +DEFAULT_WORKSPACE = "workspace.json" +DEFAULT_OUTPUT = "seed.json" + + +def main() -> None: + args = parse_args() + workspace = load_workspace(args.workspace) + seed = extract_seed(workspace) + write_seed(args.output, seed) + + removed_count = sum(1 for entry in seed.values() if "removed" in entry) + alive_count = len(seed) - removed_count + print( + f"Wrote {len(seed)} entries to {args.output} " + f"({alive_count} alive, {removed_count} removed)." 
+ ) + + +class Args(argparse.Namespace): + workspace: str + output: str + + +def parse_args() -> Args: + parser = argparse.ArgumentParser( + description="Extract minimal package seed data from workspace.json", + ) + parser.add_argument( + "--workspace", + default=DEFAULT_WORKSPACE, + help=f"Path to workspace.json (default: {DEFAULT_WORKSPACE})", + ) + parser.add_argument( + "-o", + "--output", + default=DEFAULT_OUTPUT, + help=f"Output path for seed JSON (default: {DEFAULT_OUTPUT})", + ) + return parser.parse_args(namespace=Args()) + + +def load_workspace(path: str) -> dict[str, Any]: + workspace_path = Path(path) + return json.loads(workspace_path.read_text(encoding="utf-8")) + + +def extract_seed(workspace: dict[str, Any]) -> dict[str, dict[str, Any]]: + seed: dict[str, dict[str, Any]] = {} + + for package in workspace["packages"].values(): + name = package["name"] + first_seen = package["first_seen"] + + if removed := package.get("removed"): + entry = { + "name": name, + "first_seen": first_seen, + "removed": removed, + "labels": package.get("labels", []), + } + if source := package.get("source"): + entry["source"] = source + seed[name] = entry + else: + seed[name] = { + "name": name, + "first_seen": first_seen, + } + + return dict(sorted(seed.items(), key=lambda item: item[0].casefold())) + + +def write_seed(path: str, seed: dict[str, dict[str, Any]]) -> None: + output_path = Path(path) + output_path.parent.mkdir(parents=True, exist_ok=True) + output_path.write_text( + json.dumps(seed, indent=2, ensure_ascii=False) + "\n", + encoding="utf-8", + ) + + +if __name__ == "__main__": + main() From e0881d5249d9fb796d09842b63d4dad6cbf16659 Mon Sep 17 00:00:00 2001 From: herr kaste Date: Sat, 28 Mar 2026 17:03:33 +0100 Subject: [PATCH 14/30] Replace seed_from_workspace with generate_seed Introduce scripts.generate_seed as the new seed extraction command. It accepts exactly one input source via a required mutually exclusive flag: --workspace [PATH] or --registry [PATH]. Update README examples to use generate_seed and document both supported input modes. Remove the old seed_from_workspace script. 
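The CLI contract can be illustrated with a trimmed argparse sketch (help strings and defaults shortened; the full script follows in the diff): giving `--workspace` or `--registry` without a value falls back to that flag's default file name.

```python
import argparse

parser = argparse.ArgumentParser(description="Generate a compact seed JSON")
group = parser.add_mutually_exclusive_group(required=True)
group.add_argument("--workspace", nargs="?", const="workspace.json",
                   help="Path to workspace.json (value optional)")
group.add_argument("--registry", nargs="?", const="registry.json",
                   help="Path to registry.json (value optional)")
parser.add_argument("-o", "--output", default="seed.json")

args = parser.parse_args(["--workspace"])          # bare flag -> const kicks in
assert (args.workspace, args.registry) == ("workspace.json", None)

args = parser.parse_args(["--registry", "old.json"])
assert (args.workspace, args.registry) == (None, "old.json")
```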
--- README.md | 7 +- scripts/generate_seed.py | 144 +++++++++++++++++++++++++++++++++ scripts/seed_from_workspace.py | 90 --------------------- 3 files changed, 148 insertions(+), 93 deletions(-) create mode 100644 scripts/generate_seed.py delete mode 100644 scripts/seed_from_workspace.py diff --git a/README.md b/README.md index 3f5a301f6..f082fabc0 100644 --- a/README.md +++ b/README.md @@ -94,11 +94,12 @@ uv run -m scripts.generate_registry --output registry.json --seed ./the-registry uv run -m scripts.generate_registry --output registry.json --no-seed ``` -To create a compact seed from an existing workspace, use `seed_from_workspace`: +To create a compact archival seed, use `generate_seed` with either a workspace +or a registry as input (but not both): ```bash -uv run -m scripts.seed_from_workspace -uv run -m scripts.seed_from_workspace --workspace ./workspace.json --output ./seed.json +uv run -m scripts.generate_seed --workspace ./workspace.json --output ./seed.json +uv run -m scripts.generate_seed --registry ./registry.json --output ./seed.json ``` --- diff --git a/scripts/generate_seed.py b/scripts/generate_seed.py new file mode 100644 index 000000000..34ea1750a --- /dev/null +++ b/scripts/generate_seed.py @@ -0,0 +1,144 @@ +from __future__ import annotations + +import argparse +import json +from pathlib import Path +from typing import Any, Literal, Mapping, NotRequired, TypedDict + +from ._utils import pick + +DEFAULT_WORKSPACE = "workspace.json" +DEFAULT_REGISTRY = "registry.json" +DEFAULT_OUTPUT = "seed.json" + + +type IsoTimestamp = str +type Url = str + + +class ActivePackage(TypedDict): + name: str + first_seen: IsoTimestamp + + +class TombstonedPackage(TypedDict): + name: str + first_seen: IsoTimestamp + source: NotRequired[Url | None] + removed: IsoTimestamp + labels: NotRequired[list[str]] + + +SeedEntry = ActivePackage | TombstonedPackage + + +def main() -> None: + args = parse_args() + input_kind, input_path = resolve_input(args) + input_data = load_json(input_path) + + seed = ( + extract_seed_from_workspace(input_data) + if input_kind == "workspace" + else extract_seed_from_registry(input_data) + ) + write_seed(args.output, seed) + + removed_count = sum(1 for entry in seed.values() if "removed" in entry) + alive_count = len(seed) - removed_count + print( + f"Wrote {len(seed)} entries to {args.output} " + f"({alive_count} alive, {removed_count} removed)." + ) + + +class Args(argparse.Namespace): + workspace: str | None + registry: str | None + output: str + + +def parse_args() -> Args: + parser = argparse.ArgumentParser( + description=( + "Generate a compact seed JSON from either workspace.json or registry.json" + ), + ) + input_group = parser.add_mutually_exclusive_group(required=True) + input_group.add_argument( + "--workspace", + nargs="?", + const=DEFAULT_WORKSPACE, + help=( + "Path to workspace.json. If provided without a value, defaults to " + f"{DEFAULT_WORKSPACE}." + ), + ) + input_group.add_argument( + "--registry", + nargs="?", + const=DEFAULT_REGISTRY, + help=( + "Path to registry.json. If provided without a value, defaults to " + f"{DEFAULT_REGISTRY}." 
+ ), + ) + parser.add_argument( + "-o", + "--output", + default=DEFAULT_OUTPUT, + help=f"Output path for seed JSON (default: {DEFAULT_OUTPUT})", + ) + return parser.parse_args(namespace=Args()) + + +def resolve_input(args: Args) -> tuple[Literal["workspace", "registry"], str]: + if args.workspace is not None: + return "workspace", args.workspace + if args.registry is not None: + return "registry", args.registry + raise AssertionError("Either --workspace or --registry must be provided") + + +def load_json(path: str) -> dict[str, Any]: + return json.loads(Path(path).read_text(encoding="utf-8")) + + +def extract_seed_from_workspace(workspace: dict[str, Any]) -> dict[str, Mapping[str, Any]]: + seed: dict[str, Mapping[str, Any]] = {} + for package in workspace["packages"].values(): + entry = build_seed_entry(package) + seed[entry["name"]] = entry + return sort_seed(seed) + + +def extract_seed_from_registry(registry: dict[str, Any]) -> dict[str, Mapping[str, Any]]: + seed: dict[str, Mapping[str, Any]] = {} + for package in registry["packages"]: + entry = build_seed_entry(package) + seed[entry["name"]] = entry + return sort_seed(seed) + + +def build_seed_entry(package: dict[str, Any]) -> Mapping[str, Any]: + if "removed" in package: + return pick(("name", "first_seen", "removed", "labels", "source"), package) + else: + return pick(("name", "first_seen"), package) + + +def sort_seed(seed: dict[str, Mapping[str, Any]]) -> dict[str, Mapping[str, Any]]: + return dict(sorted(seed.items(), key=lambda item: item[0].casefold())) + + +def write_seed(path: str, seed: dict[str, Mapping[str, Any]]) -> None: + output_path = Path(path) + output_path.parent.mkdir(parents=True, exist_ok=True) + output_path.write_text( + json.dumps(seed, indent=2, ensure_ascii=False) + "\n", + encoding="utf-8", + ) + + +if __name__ == "__main__": + main() diff --git a/scripts/seed_from_workspace.py b/scripts/seed_from_workspace.py deleted file mode 100644 index 727c6a59f..000000000 --- a/scripts/seed_from_workspace.py +++ /dev/null @@ -1,90 +0,0 @@ -from __future__ import annotations - -import argparse -import json -from pathlib import Path -from typing import Any - -DEFAULT_WORKSPACE = "workspace.json" -DEFAULT_OUTPUT = "seed.json" - - -def main() -> None: - args = parse_args() - workspace = load_workspace(args.workspace) - seed = extract_seed(workspace) - write_seed(args.output, seed) - - removed_count = sum(1 for entry in seed.values() if "removed" in entry) - alive_count = len(seed) - removed_count - print( - f"Wrote {len(seed)} entries to {args.output} " - f"({alive_count} alive, {removed_count} removed)." 
- ) - - -class Args(argparse.Namespace): - workspace: str - output: str - - -def parse_args() -> Args: - parser = argparse.ArgumentParser( - description="Extract minimal package seed data from workspace.json", - ) - parser.add_argument( - "--workspace", - default=DEFAULT_WORKSPACE, - help=f"Path to workspace.json (default: {DEFAULT_WORKSPACE})", - ) - parser.add_argument( - "-o", - "--output", - default=DEFAULT_OUTPUT, - help=f"Output path for seed JSON (default: {DEFAULT_OUTPUT})", - ) - return parser.parse_args(namespace=Args()) - - -def load_workspace(path: str) -> dict[str, Any]: - workspace_path = Path(path) - return json.loads(workspace_path.read_text(encoding="utf-8")) - - -def extract_seed(workspace: dict[str, Any]) -> dict[str, dict[str, Any]]: - seed: dict[str, dict[str, Any]] = {} - - for package in workspace["packages"].values(): - name = package["name"] - first_seen = package["first_seen"] - - if removed := package.get("removed"): - entry = { - "name": name, - "first_seen": first_seen, - "removed": removed, - "labels": package.get("labels", []), - } - if source := package.get("source"): - entry["source"] = source - seed[name] = entry - else: - seed[name] = { - "name": name, - "first_seen": first_seen, - } - - return dict(sorted(seed.items(), key=lambda item: item[0].casefold())) - - -def write_seed(path: str, seed: dict[str, dict[str, Any]]) -> None: - output_path = Path(path) - output_path.parent.mkdir(parents=True, exist_ok=True) - output_path.write_text( - json.dumps(seed, indent=2, ensure_ascii=False) + "\n", - encoding="utf-8", - ) - - -if __name__ == "__main__": - main() From 53331e2d33bd4324884cf6b8da1440091aff2a3d Mon Sep 17 00:00:00 2001 From: herr kaste Date: Sat, 28 Mar 2026 18:32:01 +0100 Subject: [PATCH 15/30] Warn when seed shape completeness is poor Add an incomplete-shape warning in generate_seed based on expected entry sizes (2 keys for active entries, 5 for tombstoned entries). The warning triggers when more than 10% of entries are incomplete. Special-case the all-incomplete scenario with a clearer message: "All packages have an incomplete shape". --- scripts/generate_seed.py | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/scripts/generate_seed.py b/scripts/generate_seed.py index 34ea1750a..2434f373d 100644 --- a/scripts/generate_seed.py +++ b/scripts/generate_seed.py @@ -3,6 +3,7 @@ import argparse import json from pathlib import Path +import sys from typing import Any, Literal, Mapping, NotRequired, TypedDict from ._utils import pick @@ -10,6 +11,7 @@ DEFAULT_WORKSPACE = "workspace.json" DEFAULT_REGISTRY = "registry.json" DEFAULT_OUTPUT = "seed.json" +INCOMPLETE_SHAPE_WARN_THRESHOLD = 0.10 type IsoTimestamp = str @@ -51,6 +53,19 @@ def main() -> None: f"({alive_count} alive, {removed_count} removed)." ) + incomplete_count, total_count = count_incomplete_shapes(seed) + if should_warn_about_incomplete_shapes(incomplete_count, total_count): + if total_count > 0 and incomplete_count == total_count: + msg = "Check the output. All packages have an incomplete shape." + else: + incomplete_percent = (incomplete_count * 100) / total_count + msg = ( + "Check the output. " + f"{incomplete_percent:.1f}% of the packages have an incomplete shape." 
+ ) + + print(msg, file=sys.stderr) + class Args(argparse.Namespace): workspace: str | None @@ -131,6 +146,22 @@ def sort_seed(seed: dict[str, Mapping[str, Any]]) -> dict[str, Mapping[str, Any] return dict(sorted(seed.items(), key=lambda item: item[0].casefold())) +def count_incomplete_shapes(seed: Mapping[str, Mapping[str, Any]]) -> tuple[int, int]: + incomplete_count = 0 + total_count = len(seed) + for entry in seed.values(): + expected_key_count = 5 if "removed" in entry else 2 + if len(entry) != expected_key_count: + incomplete_count += 1 + return incomplete_count, total_count + + +def should_warn_about_incomplete_shapes(incomplete_count: int, total_count: int) -> bool: + if total_count == 0: + return False + return (incomplete_count / total_count) > INCOMPLETE_SHAPE_WARN_THRESHOLD + + def write_seed(path: str, seed: dict[str, Mapping[str, Any]]) -> None: output_path = Path(path) output_path.parent.mkdir(parents=True, exist_ok=True) From 5214320f423978f2b19f2848e3315c05f040e0c2 Mon Sep 17 00:00:00 2001 From: herr kaste Date: Sun, 29 Mar 2026 11:41:46 +0200 Subject: [PATCH 16/30] Harden seed fallback for failed repositories Separate lifecycle seeding from source-failure recovery in generate_registry. Recovery of failed repositories now requires registry-shaped data and no longer reconstructs entries from workspace/seed maps. If an explicit seed is not registry-shaped, the command falls back to prior --output when available for recovery data. On fetch failures, emit a focused warning when the seed knows package names but no full recovery entries exist, with message text that reflects --no-seed behavior. Add regression tests for non-registry seed input and fallback-to-output recovery behavior. --- scripts/generate_registry.py | 136 ++++++++++++++++++++--- tests/registry/test_generate_registry.py | 86 ++++++++++++++ 2 files changed, 204 insertions(+), 18 deletions(-) diff --git a/scripts/generate_registry.py b/scripts/generate_registry.py index 073824007..b884c1658 100644 --- a/scripts/generate_registry.py +++ b/scripts/generate_registry.py @@ -124,10 +124,20 @@ async def main( seed_path=seed_path, ) seed = read_seed_db(effective_seed_path, explicit=explicit_seed) + failure_recovery = resolve_failure_recovery_db( + output_file, + effective_seed_path, + seed, + ) try: async with asyncio.timeout(GLOBAL_TIMEOUT): - db = await fetch_packages(channels, seed.db if seed.available else {}) + db = await fetch_packages( + channels, + failure_recovery.db if failure_recovery.available else {}, + seed_hint_db=seed.db if seed.available else None, + no_seed=no_seed, + ) if seed.available and not no_seed: db["packages"] = apply_seed_lifecycle( db["packages"], @@ -140,7 +150,13 @@ async def main( print(f"Timeout: script took more than {GLOBAL_TIMEOUT} seconds") -async def fetch_packages(channels: list[str], db: Mapping[str, Any] | None = None) -> Registry: +async def fetch_packages( + channels: list[str], + db: Mapping[str, Any] | None = None, + *, + seed_hint_db: Mapping[str, Any] | None = None, + no_seed: bool = False, +) -> Registry: print("Fetching registered packages...") now = time.monotonic() now_string = now_utc_string() @@ -153,16 +169,27 @@ async def fetch_packages(channels: list[str], db: Mapping[str, Any] | None = Non repos: list[str] = list(flatten(repos_lists)) unseen = Unseen(repos) sem = asyncio.Semaphore(MAX_CONCURRENCY) + repo_results = await asyncio.gather(*[ + asyncio.create_task(fetch_repository(url, unseen, sem, session)) + for url in repos + ], return_exceptions=True) + result: 
dict[Url, RepositorySchema] = {} - result = { - repo["self"]: repo - for repo in await asyncio.gather(*[ - asyncio.create_task(fetch_repository(url, unseen, sem, session)) - for url in repos - ]) - if repo - if not repo.get("schema_version", "1.").startswith("1.") - } + for url, repo_result in zip(repos, repo_results): + if isinstance(repo_result, Exception): + err(f"Error fetching {url}: {repo_result}") + warn_unrecoverable_seed_entries( + url, + recovery_db=db, + seed_hint_db=seed_hint_db, + no_seed=no_seed, + ) + continue + if isinstance(repo_result, BaseException): + raise repo_result + + if not repo_result.get("schema_version", "1.").startswith("1."): + result[repo_result["self"]] = repo_result # Flatten packages and libraries, adding source, schema_version, and # ensuring a unique name. @@ -268,13 +295,9 @@ async def fetch_repository( location: Url, unseen: Unseen[Url], sem: asyncio.Semaphore, - session: aiohttp.ClientSession -) -> RepositorySchema | None: - try: - result = await __fetch_repo(location, sem, session) - except Exception as e: - err(f"Error fetching {location}: {e}") - return None + session: aiohttp.ClientSession, +) -> RepositorySchema: + result = await __fetch_repo(location, sem, session) repository: RepositorySchema = { "self": location, @@ -355,6 +378,30 @@ def read_seed_db(path: str, *, explicit: bool) -> SeedLoad: return SeedLoad(db=data, available=True) +def resolve_failure_recovery_db( + output_file: str, + effective_seed_path: str, + seed: SeedLoad, +) -> SeedLoad: + if seed.available and is_registry_recovery_db(seed.db): + return seed + + output_is_seed = os.path.abspath(output_file) == os.path.abspath(effective_seed_path) + if not output_is_seed: + output_db = read_seed_db(output_file, explicit=False) + if output_db.available and is_registry_recovery_db(output_db.db): + return output_db + + return SeedLoad(db={}, available=False) + + +def is_registry_recovery_db(db: Mapping[str, Any]) -> bool: + return ( + isinstance(db.get("packages"), list) + and isinstance(db.get("libraries"), list) + ) + + def apply_seed_lifecycle( packages: list[RegistryEntry], seed_db: Mapping[str, Any], @@ -408,6 +455,59 @@ def iter_seed_entries(seed_db: Mapping[str, Any], kind: str) -> Iterable[Registr yield entry +def warn_unrecoverable_seed_entries( + source_url: str, + *, + recovery_db: Mapping[str, Any] | None, + seed_hint_db: Mapping[str, Any] | None, + no_seed: bool, +) -> None: + if not seed_hint_db: + return + if has_recovery_entries_for_source(recovery_db, source_url): + return + + lost_names = seed_package_names_for_source(seed_hint_db, source_url) + if not lost_names: + return + + mode_outcome = ( + "these are dropped." + if no_seed + else "these are tombstoned." 
+ ) + err( + "ATTENTION: seed file knows " + f"{pl(len(lost_names), 'packages')} in the failed repository " + "but has no data to recover full entries; " + f"{mode_outcome}" + ) + + +def has_recovery_entries_for_source( + recovery_db: Mapping[str, Any] | None, + source_url: str, +) -> bool: + if not recovery_db: + return False + + for kind in ("packages", "libraries"): + for entry in iter_seed_entries(recovery_db, kind): + if entry.get("source") == source_url: + return True + return False + + +def seed_package_names_for_source(seed_db: Mapping[str, Any], source_url: str) -> list[str]: + names = { + entry["name"] + for entry in iter_seed_entries(seed_db, "packages") + if entry.get("source") == source_url + if isinstance(entry.get("name"), str) + } + return sorted(names, key=str.casefold) + + def build_tombstone(seed: SeedEntry, now_string: IsoTimestamp) -> RegistryEntry: return ( {"first_seen": now_string, "removed": now_string} # type: ignore[operator] diff --git a/tests/registry/test_generate_registry.py b/tests/registry/test_generate_registry.py index 92ea7b631..645c0a756 100644 --- a/tests/registry/test_generate_registry.py +++ b/tests/registry/test_generate_registry.py @@ -532,3 +532,89 @@ async def test_fetching_source_failed_behavior_unchanged_with_no_seed_toggle(tmp result = json.loads(output_file.read_text()) assert "fetching_source_failed" in result["packages"][0] + + +@pytest.mark.asyncio +async def test_non_registry_seed_does_not_recreate_failed_repo_entries(tmp_path, capsys): + repo_path = tmp_path / "missing.json" + channel_path = tmp_path / "channel.json" + make_channel(channel_path, [repo_path]) + + output_file = tmp_path / "registry.json" + seed_file = tmp_path / "workspace.json" + seed_file.write_text(json.dumps({ + "packages": { + "SFTP": { + "name": "SFTP", + "source": repo_path.as_uri(), + "first_seen": "2011-12-15T14:11:26Z", + "description": "Commercial SFTP/FTP plugin", + "labels": ["ftp", "sync"], + } + } + })) + + await main( + str(output_file), + [channel_path.as_uri()], + seed_path=str(seed_file), + ) + + result = json.loads(output_file.read_text()) + pkg = result["packages"][0] + assert pkg["name"] == "SFTP" + assert "description" not in pkg + assert "fetching_source_failed" not in pkg + + captured = capsys.readouterr() + assert "seed file knows 1 package" in captured.err + + +@pytest.mark.asyncio +async def test_workspace_seed_uses_prior_output_registry_for_failed_repo_recovery(tmp_path, capsys): + repo_path = tmp_path / "missing.json" + channel_path = tmp_path / "channel.json" + make_channel(channel_path, [repo_path]) + + output_file = tmp_path / "registry.json" + output_file.write_text(json.dumps({ + "repositories": [repo_path.as_uri()], + "packages": [ + { + "name": "SFTP", + "source": repo_path.as_uri(), + "schema_version": "3.0.0", + "details": "https://example.com/sftp", + "first_seen": "2015-01-01T00:00:00Z", + } + ], + "libraries": [], + })) + + seed_file = tmp_path / "workspace.json" + seed_file.write_text(json.dumps({ + "packages": { + "SFTP": { + "name": "SFTP", + "source": repo_path.as_uri(), + "first_seen": "2011-12-15T14:11:26Z", + "description": "Commercial SFTP/FTP plugin", + } + } + })) + + await main( + str(output_file), + [channel_path.as_uri()], + seed_path=str(seed_file), + ) + + result = json.loads(output_file.read_text()) + pkg = result["packages"][0] + assert pkg["name"] == "SFTP" + assert pkg["first_seen"] == "2011-12-15T14:11:26Z" + assert "fetching_source_failed" in pkg + assert "description" not in pkg + + captured = 
capsys.readouterr() + assert "recover full entries" not in captured.err From e6bdfcfede9d17fc6db3c38a9c4bee1fc842d191 Mon Sep 17 00:00:00 2001 From: herr kaste Date: Sun, 29 Mar 2026 14:08:08 +0200 Subject: [PATCH 17/30] Internally rename variable db -> recovery_db --- scripts/generate_registry.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/scripts/generate_registry.py b/scripts/generate_registry.py index b884c1658..eacf38bce 100644 --- a/scripts/generate_registry.py +++ b/scripts/generate_registry.py @@ -152,7 +152,7 @@ async def main( async def fetch_packages( channels: list[str], - db: Mapping[str, Any] | None = None, + recovery_db: Mapping[str, Any] | None = None, *, seed_hint_db: Mapping[str, Any] | None = None, no_seed: bool = False, @@ -180,7 +180,7 @@ async def fetch_packages( err(f"Error fetching {url}: {repo_result}") warn_unrecoverable_seed_entries( url, - recovery_db=db, + recovery_db=recovery_db, seed_hint_db=seed_hint_db, no_seed=no_seed, ) @@ -229,15 +229,15 @@ def add(entry: RegistryEntry) -> None: for library in repo["libraries"]: add_library(library | repo_info) # type: ignore[arg-type] - elif db: - # recreate the repo from db + elif recovery_db: + # recreate the repo from recovery_db fail_info: RegistryEntry fail_info = {"fetching_source_failed": now_string} - for pkg in iter_seed_entries(db, "packages"): + for pkg in iter_seed_entries(recovery_db, "packages"): if pkg.get("source") == url: add_package(fail_info | pkg) - for library in iter_seed_entries(db, "libraries"): + for library in iter_seed_entries(recovery_db, "libraries"): if library.get("source") == url: add_library(fail_info | library) From 2d3c91c37e196fbee54db08efeb3b4caf0834f70 Mon Sep 17 00:00:00 2001 From: herr kaste Date: Sun, 29 Mar 2026 14:56:37 +0200 Subject: [PATCH 18/30] Simplify recovery-shape checks in seed loading Use has_registry_shape directly in resolve_failure_recovery_db without additionally checking available. The shape flag already encodes successful load plus registry-compatible structure. This keeps behavior unchanged while reducing redundant conditions. 
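For illustration, the shape predicate used by this series is a plain structural check, so it can only be True for a successfully loaded, registry-shaped object (assertions below use made-up inputs):

```python
from typing import Any, Mapping

def is_registry_recovery_db(db: Mapping[str, Any]) -> bool:
    # Registry-shaped means both top-level lists are present.
    return isinstance(db.get("packages"), list) and isinstance(db.get("libraries"), list)

assert is_registry_recovery_db({"packages": [], "libraries": []})
assert not is_registry_recovery_db({"packages": {"SFTP": {}}})  # workspace-shaped map
assert not is_registry_recovery_db({})                          # empty/implicit load
```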
--- scripts/generate_registry.py | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/scripts/generate_registry.py b/scripts/generate_registry.py index eacf38bce..6f912a3f8 100644 --- a/scripts/generate_registry.py +++ b/scripts/generate_registry.py @@ -69,6 +69,7 @@ class RepositorySchema(TypedDict): class SeedLoad: db: dict[str, Any] available: bool + has_registry_shape: bool def parse_args() -> argparse.Namespace: @@ -361,21 +362,25 @@ def read_seed_db(path: str, *, explicit: bool) -> SeedLoad: except OSError as exc: if explicit: raise FileNotFoundError(f"Could not read explicit seed path: {path}") from exc - return SeedLoad(db={}, available=False) + return SeedLoad(db={}, available=False, has_registry_shape=False) try: data = json.loads(text) except json.JSONDecodeError as exc: if explicit: raise ValueError(f"Explicit seed is not valid JSON: {path}") from exc - return SeedLoad(db={}, available=False) + return SeedLoad(db={}, available=False, has_registry_shape=False) if not isinstance(data, dict): if explicit: raise ValueError(f"Explicit seed JSON must be an object: {path}") - return SeedLoad(db={}, available=False) + return SeedLoad(db={}, available=False, has_registry_shape=False) - return SeedLoad(db=data, available=True) + return SeedLoad( + db=data, + available=True, + has_registry_shape=is_registry_recovery_db(data), + ) def resolve_failure_recovery_db( @@ -383,16 +388,16 @@ def resolve_failure_recovery_db( effective_seed_path: str, seed: SeedLoad, ) -> SeedLoad: - if seed.available and is_registry_recovery_db(seed.db): + if seed.has_registry_shape: return seed output_is_seed = os.path.abspath(output_file) == os.path.abspath(effective_seed_path) if not output_is_seed: output_db = read_seed_db(output_file, explicit=False) - if output_db.available and is_registry_recovery_db(output_db.db): + if output_db.has_registry_shape: return output_db - return SeedLoad(db={}, available=False) + return SeedLoad(db={}, available=False, has_registry_shape=False) def is_registry_recovery_db(db: Mapping[str, Any]) -> bool: From 013e84bd700de7a09a964b7126d80ed6055b0eef Mon Sep 17 00:00:00 2001 From: herr kaste Date: Sun, 29 Mar 2026 15:10:58 +0200 Subject: [PATCH 19/30] Drop SeedLoad null-object and clarify seed strictness Replace the SeedLoad available/null-object pattern with SeedDb | None. This removes ambiguous truthiness handling and makes seed presence explicit at call sites. Also rename read_seed_db(explicit=...) to strict=... to better reflect its behavior: strict mode raises on read/parse errors, while implicit mode returns None. 
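A condensed sketch of the strict/implicit contract (the real function additionally wraps the result in SeedDb and raises FileNotFoundError/ValueError with tailored messages):

```python
import json

def read_seed_db(path: str, *, strict: bool) -> dict | None:
    try:
        with open(path, encoding="utf-8") as fh:
            data = json.load(fh)
    except (OSError, json.JSONDecodeError):
        if strict:
            raise              # explicit seeds must exist and parse
        return None            # implicit seeds silently fall back
    if not isinstance(data, dict):
        if strict:
            raise ValueError(f"Seed JSON must be an object: {path}")
        return None
    return data
```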
--- scripts/generate_registry.py | 50 +++++++++++++++++------------------- 1 file changed, 24 insertions(+), 26 deletions(-) diff --git a/scripts/generate_registry.py b/scripts/generate_registry.py index 6f912a3f8..e1ddbfdb4 100644 --- a/scripts/generate_registry.py +++ b/scripts/generate_registry.py @@ -66,9 +66,8 @@ class RepositorySchema(TypedDict): @dataclass -class SeedLoad: +class SeedDb: db: dict[str, Any] - available: bool has_registry_shape: bool @@ -120,11 +119,11 @@ async def main( seed_path: str | None = None, no_seed: bool = False, ) -> None: - effective_seed_path, explicit_seed = resolve_seed_path( + effective_seed_path, strict_seed = resolve_seed_path( output_file=output_file, seed_path=seed_path, ) - seed = read_seed_db(effective_seed_path, explicit=explicit_seed) + seed = read_seed_db(effective_seed_path, strict=strict_seed) failure_recovery = resolve_failure_recovery_db( output_file, effective_seed_path, @@ -135,11 +134,11 @@ async def main( async with asyncio.timeout(GLOBAL_TIMEOUT): db = await fetch_packages( channels, - failure_recovery.db if failure_recovery.available else {}, - seed_hint_db=seed.db if seed.available else None, + failure_recovery.db if failure_recovery else {}, + seed_hint_db=seed.db if seed else None, no_seed=no_seed, ) - if seed.available and not no_seed: + if seed and not no_seed: db["packages"] = apply_seed_lifecycle( db["packages"], seed.db, @@ -356,29 +355,28 @@ def resolve_seed_path(output_file: str, seed_path: str | None) -> tuple[str, boo return os.path.abspath(seed_path), True -def read_seed_db(path: str, *, explicit: bool) -> SeedLoad: +def read_seed_db(path: str, *, strict: bool) -> SeedDb | None: try: text = open(path, "r", encoding="utf-8").read() except OSError as exc: - if explicit: - raise FileNotFoundError(f"Could not read explicit seed path: {path}") from exc - return SeedLoad(db={}, available=False, has_registry_shape=False) + if strict: + raise FileNotFoundError(f"Could not read seed path: {path}") from exc + return None try: data = json.loads(text) except json.JSONDecodeError as exc: - if explicit: - raise ValueError(f"Explicit seed is not valid JSON: {path}") from exc - return SeedLoad(db={}, available=False, has_registry_shape=False) + if strict: + raise ValueError(f"Seed is not valid JSON: {path}") from exc + return None if not isinstance(data, dict): - if explicit: - raise ValueError(f"Explicit seed JSON must be an object: {path}") - return SeedLoad(db={}, available=False, has_registry_shape=False) + if strict: + raise ValueError(f"Seed JSON must be an object: {path}") + return None - return SeedLoad( + return SeedDb( db=data, - available=True, has_registry_shape=is_registry_recovery_db(data), ) @@ -386,18 +384,18 @@ def read_seed_db(path: str, *, explicit: bool) -> SeedLoad: def resolve_failure_recovery_db( output_file: str, effective_seed_path: str, - seed: SeedLoad, -) -> SeedLoad: - if seed.has_registry_shape: + seed: SeedDb | None, +) -> SeedDb | None: + if seed and seed.has_registry_shape: return seed output_is_seed = os.path.abspath(output_file) == os.path.abspath(effective_seed_path) if not output_is_seed: - output_db = read_seed_db(output_file, explicit=False) - if output_db.has_registry_shape: + output_db = read_seed_db(output_file, strict=False) + if output_db and output_db.has_registry_shape: return output_db - return SeedLoad(db={}, available=False, has_registry_shape=False) + return None def is_registry_recovery_db(db: Mapping[str, Any]) -> bool: @@ -467,7 +465,7 @@ def warn_unrecoverable_seed_entries( 
seed_hint_db: Mapping[str, Any] | None, no_seed: bool, ) -> None: - if not seed_hint_db: + if seed_hint_db is None: return if has_recovery_entries_for_source(recovery_db, source_url): return From 37f31048037a787a903dcf3510584b3d4d5b3585 Mon Sep 17 00:00:00 2001 From: herr kaste Date: Sun, 29 Mar 2026 15:21:11 +0200 Subject: [PATCH 20/30] Inline seed package index for lifecycle merge Simplify lifecycle seed handling by building the name->entry map directly inside apply_seed_lifecycle and removing extract_seed_packages. Also generalize build_tombstone to accept Mapping[str, Any], keeping the key filtering in one place. --- scripts/generate_registry.py | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) diff --git a/scripts/generate_registry.py b/scripts/generate_registry.py index e1ddbfdb4..fa4788718 100644 --- a/scripts/generate_registry.py +++ b/scripts/generate_registry.py @@ -410,7 +410,10 @@ def apply_seed_lifecycle( seed_db: Mapping[str, Any], now_string: IsoTimestamp, ) -> list[RegistryEntry]: - seed_packages = extract_seed_packages(seed_db) + seed_packages = { + entry["name"]: entry + for entry in iter_seed_entries(seed_db, "packages") + } current: dict[str, RegistryEntry] current = { pkg["name"]: pkg @@ -432,14 +435,6 @@ def apply_seed_lifecycle( return sorted(current.values(), key=lambda entry: entry["name"].casefold()) -def extract_seed_packages(seed_db: Mapping[str, Any]) -> dict[str, SeedEntry]: - out: dict[str, SeedEntry] = {} - for entry in iter_seed_entries(seed_db, "packages"): - seed = pick(("name", "source", "first_seen", "removed", "labels"), entry) - out[seed["name"]] = seed # type: ignore[assignment, index] - return out - - def iter_seed_entries(seed_db: Mapping[str, Any], kind: str) -> Iterable[RegistryEntry]: entries = seed_db.get(kind) # Shape: registry.json @@ -511,7 +506,7 @@ def seed_package_names_for_source(seed_db: Mapping[str, Any], source_url: str) - return sorted(names, key=str.casefold) -def build_tombstone(seed: SeedEntry, now_string: IsoTimestamp) -> RegistryEntry: +def build_tombstone(seed: Mapping[str, Any], now_string: IsoTimestamp) -> RegistryEntry: return ( {"first_seen": now_string, "removed": now_string} # type: ignore[operator] | pick(("name", "source", "first_seen", "removed", "labels"), seed) From 38a77b33801a61bf00a4cce4b61d250adc0483e1 Mon Sep 17 00:00:00 2001 From: herr kaste Date: Sun, 29 Mar 2026 15:39:47 +0200 Subject: [PATCH 21/30] Type recovery DB as registry-shaped structure Replace generic Mapping typing for recovery_db with a dedicated RecoveryDb shape. This matches runtime guarantees and simplifies recovery iteration code. Use a TypeGuard for registry-shape detection and return RecoveryDb | None from resolve_failure_recovery_db. 
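The tombstone construction itself is unchanged and relies on dict-union precedence; a sketch with an assumed pick() (copy only the listed keys that exist) shows why seeded timestamps survive:

```python
def pick(keys, mapping):
    # Assumed semantics of scripts._utils.pick: keep only the listed keys that exist.
    return {key: mapping[key] for key in keys if key in mapping}

def build_tombstone(seed, now_string):
    # The right-hand operand wins, so seeded first_seen/removed override the defaults.
    return {"first_seen": now_string, "removed": now_string} | pick(
        ("name", "source", "first_seen", "removed", "labels"), seed
    )

old = {"name": "Gone", "first_seen": "2010-01-01T00:00:00Z", "removed": "2020-02-02T00:00:00Z"}
assert build_tombstone(old, "NOW")["removed"] == "2020-02-02T00:00:00Z"
assert build_tombstone({"name": "Fresh"}, "NOW") == {"name": "Fresh", "first_seen": "NOW", "removed": "NOW"}
```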
--- scripts/generate_registry.py | 44 ++++++++++++++++++++---------------- 1 file changed, 25 insertions(+), 19 deletions(-) diff --git a/scripts/generate_registry.py b/scripts/generate_registry.py index fa4788718..cf7ece57f 100644 --- a/scripts/generate_registry.py +++ b/scripts/generate_registry.py @@ -10,7 +10,7 @@ import sys import time from urllib.parse import urlparse -from typing import Any, Callable, Iterable, Mapping, NotRequired, TypedDict +from typing import Any, Callable, Iterable, Mapping, NotRequired, TypedDict, TypeGuard from ._utils import flatten, pick, resolve_urls, update_url, write_json, pl @@ -65,6 +65,11 @@ class RepositorySchema(TypedDict): libraries: list[RawRepositoryEntry] +class RecoveryDb(TypedDict): + packages: list[RegistryEntry] + libraries: list[RegistryEntry] + + @dataclass class SeedDb: db: dict[str, Any] @@ -134,7 +139,7 @@ async def main( async with asyncio.timeout(GLOBAL_TIMEOUT): db = await fetch_packages( channels, - failure_recovery.db if failure_recovery else {}, + failure_recovery, seed_hint_db=seed.db if seed else None, no_seed=no_seed, ) @@ -152,7 +157,7 @@ async def main( async def fetch_packages( channels: list[str], - recovery_db: Mapping[str, Any] | None = None, + recovery_db: RecoveryDb | None = None, *, seed_hint_db: Mapping[str, Any] | None = None, no_seed: bool = False, @@ -230,14 +235,14 @@ def add(entry: RegistryEntry) -> None: add_library(library | repo_info) # type: ignore[arg-type] elif recovery_db: - # recreate the repo from recovery_db + # recreate the repo from recovery_db (always registry-shaped) fail_info: RegistryEntry fail_info = {"fetching_source_failed": now_string} - for pkg in iter_seed_entries(recovery_db, "packages"): + for pkg in recovery_db.get("packages", []): if pkg.get("source") == url: add_package(fail_info | pkg) - for library in iter_seed_entries(recovery_db, "libraries"): + for library in recovery_db.get("libraries", []): if library.get("source") == url: add_library(fail_info | library) @@ -385,20 +390,20 @@ def resolve_failure_recovery_db( output_file: str, effective_seed_path: str, seed: SeedDb | None, -) -> SeedDb | None: +) -> RecoveryDb | None: if seed and seed.has_registry_shape: - return seed + return seed.db # type: ignore[return-value] output_is_seed = os.path.abspath(output_file) == os.path.abspath(effective_seed_path) if not output_is_seed: output_db = read_seed_db(output_file, strict=False) if output_db and output_db.has_registry_shape: - return output_db + return output_db.db # type: ignore[return-value] return None -def is_registry_recovery_db(db: Mapping[str, Any]) -> bool: +def is_registry_recovery_db(db: Mapping[str, Any]) -> TypeGuard[RecoveryDb]: return ( isinstance(db.get("packages"), list) and isinstance(db.get("libraries"), list) @@ -412,7 +417,7 @@ def apply_seed_lifecycle( ) -> list[RegistryEntry]: seed_packages = { entry["name"]: entry - for entry in iter_seed_entries(seed_db, "packages") + for entry in iter_db_entries(seed_db, "packages") } current: dict[str, RegistryEntry] current = { @@ -435,8 +440,8 @@ def apply_seed_lifecycle( return sorted(current.values(), key=lambda entry: entry["name"].casefold()) -def iter_seed_entries(seed_db: Mapping[str, Any], kind: str) -> Iterable[RegistryEntry]: - entries = seed_db.get(kind) +def iter_db_entries(db: Mapping[str, Any], kind: str) -> Iterable[RegistryEntry]: + entries = db.get(kind) # Shape: registry.json if isinstance(entries, list): for entry in entries: @@ -448,15 +453,15 @@ def iter_seed_entries(seed_db: Mapping[str, Any], kind: str) -> 
Iterable[Registr yield entry # Shape: seed.json - elif kind == "packages" and "packages" not in seed_db: - for name, entry in seed_db.items(): + elif kind == "packages" and "packages" not in db: + for name, entry in db.items(): yield entry def warn_unrecoverable_seed_entries( source_url: str, *, - recovery_db: Mapping[str, Any] | None, + recovery_db: RecoveryDb | None, seed_hint_db: Mapping[str, Any] | None, no_seed: bool, ) -> None: @@ -483,23 +488,24 @@ def warn_unrecoverable_seed_entries( def has_recovery_entries_for_source( - recovery_db: Mapping[str, Any] | None, + recovery_db: RecoveryDb | None, source_url: str, ) -> bool: if not recovery_db: return False for kind in ("packages", "libraries"): - for entry in iter_seed_entries(recovery_db, kind): + for entry in recovery_db[kind]: if entry.get("source") == source_url: return True + return False def seed_package_names_for_source(seed_db: Mapping[str, Any], source_url: str) -> list[str]: names = { entry["name"] - for entry in iter_seed_entries(seed_db, "packages") + for entry in iter_db_entries(seed_db, "packages") if entry.get("source") == source_url if isinstance(entry.get("name"), str) } From 4fbb643c23c79af33b7bd0b6add7b81338f29152 Mon Sep 17 00:00:00 2001 From: herr kaste Date: Sun, 29 Mar 2026 15:41:48 +0200 Subject: [PATCH 22/30] Simplify package-entry iteration helper Replace iter_db_entries(db, kind) with iter_package_entries(db), since this helper is only used for package traversal. This removes an unused selector parameter and makes call sites more explicit. --- scripts/generate_registry.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/scripts/generate_registry.py b/scripts/generate_registry.py index cf7ece57f..caf738b2f 100644 --- a/scripts/generate_registry.py +++ b/scripts/generate_registry.py @@ -417,7 +417,7 @@ def apply_seed_lifecycle( ) -> list[RegistryEntry]: seed_packages = { entry["name"]: entry - for entry in iter_db_entries(seed_db, "packages") + for entry in iter_package_entries(seed_db) } current: dict[str, RegistryEntry] current = { @@ -440,8 +440,8 @@ def apply_seed_lifecycle( return sorted(current.values(), key=lambda entry: entry["name"].casefold()) -def iter_db_entries(db: Mapping[str, Any], kind: str) -> Iterable[RegistryEntry]: - entries = db.get(kind) +def iter_package_entries(db: Mapping[str, Any]) -> Iterable[RegistryEntry]: + entries = db.get("packages") # Shape: registry.json if isinstance(entries, list): for entry in entries: @@ -453,7 +453,7 @@ def iter_db_entries(db: Mapping[str, Any], kind: str) -> Iterable[RegistryEntry] yield entry # Shape: seed.json - elif kind == "packages" and "packages" not in db: + elif "packages" not in db: for name, entry in db.items(): yield entry @@ -505,7 +505,7 @@ def has_recovery_entries_for_source( def seed_package_names_for_source(seed_db: Mapping[str, Any], source_url: str) -> list[str]: names = { entry["name"] - for entry in iter_db_entries(seed_db, "packages") + for entry in iter_package_entries(seed_db) if entry.get("source") == source_url if isinstance(entry.get("name"), str) } From 8607efd9c2414cf67699b94d9b61a58a706a8975 Mon Sep 17 00:00:00 2001 From: herr kaste Date: Sun, 29 Mar 2026 15:46:32 +0200 Subject: [PATCH 23/30] Use `Registry` type in favor of `RecoveryDb` --- scripts/generate_registry.py | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) diff --git a/scripts/generate_registry.py b/scripts/generate_registry.py index caf738b2f..d698eb53c 100644 --- a/scripts/generate_registry.py +++ 
b/scripts/generate_registry.py @@ -65,11 +65,6 @@ class RepositorySchema(TypedDict): libraries: list[RawRepositoryEntry] -class RecoveryDb(TypedDict): - packages: list[RegistryEntry] - libraries: list[RegistryEntry] - - @dataclass class SeedDb: db: dict[str, Any] @@ -157,7 +152,7 @@ async def main( async def fetch_packages( channels: list[str], - recovery_db: RecoveryDb | None = None, + recovery_db: Registry | None = None, *, seed_hint_db: Mapping[str, Any] | None = None, no_seed: bool = False, @@ -390,7 +385,7 @@ def resolve_failure_recovery_db( output_file: str, effective_seed_path: str, seed: SeedDb | None, -) -> RecoveryDb | None: +) -> Registry | None: if seed and seed.has_registry_shape: return seed.db # type: ignore[return-value] @@ -403,7 +398,7 @@ def resolve_failure_recovery_db( return None -def is_registry_recovery_db(db: Mapping[str, Any]) -> TypeGuard[RecoveryDb]: +def is_registry_recovery_db(db: Mapping[str, Any]) -> TypeGuard[Registry]: return ( isinstance(db.get("packages"), list) and isinstance(db.get("libraries"), list) @@ -461,7 +456,7 @@ def iter_package_entries(db: Mapping[str, Any]) -> Iterable[RegistryEntry]: def warn_unrecoverable_seed_entries( source_url: str, *, - recovery_db: RecoveryDb | None, + recovery_db: Registry | None, seed_hint_db: Mapping[str, Any] | None, no_seed: bool, ) -> None: @@ -488,7 +483,7 @@ def warn_unrecoverable_seed_entries( def has_recovery_entries_for_source( - recovery_db: RecoveryDb | None, + recovery_db: Registry | None, source_url: str, ) -> bool: if not recovery_db: From e5f160c47ec60122ae7264b4fc30cfc8efe11c57 Mon Sep 17 00:00:00 2001 From: herr kaste Date: Sun, 29 Mar 2026 15:50:48 +0200 Subject: [PATCH 24/30] Pass `seed` down to `apply_seed_lifecycle` --- scripts/generate_registry.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/scripts/generate_registry.py b/scripts/generate_registry.py index d698eb53c..42978d3b5 100644 --- a/scripts/generate_registry.py +++ b/scripts/generate_registry.py @@ -141,7 +141,7 @@ async def main( if seed and not no_seed: db["packages"] = apply_seed_lifecycle( db["packages"], - seed.db, + seed, now_utc_string(), ) write_json(output_file, db, pretty=True, ensure_ascii=True) @@ -407,12 +407,12 @@ def is_registry_recovery_db(db: Mapping[str, Any]) -> TypeGuard[Registry]: def apply_seed_lifecycle( packages: list[RegistryEntry], - seed_db: Mapping[str, Any], + seed: SeedDb, now_string: IsoTimestamp, ) -> list[RegistryEntry]: seed_packages = { entry["name"]: entry - for entry in iter_package_entries(seed_db) + for entry in iter_package_entries(seed.db) } current: dict[str, RegistryEntry] current = { @@ -422,15 +422,15 @@ def apply_seed_lifecycle( } for name, package in current.items(): - seed = seed_packages.get(name) - if seed and (first_seen := seed.get("first_seen")): + entry = seed_packages.get(name) + if entry and (first_seen := entry.get("first_seen")): package["first_seen"] = first_seen elif "first_seen" not in package: package["first_seen"] = now_string - for name, seed in seed_packages.items(): + for name, entry in seed_packages.items(): if name not in current: - current[name] = build_tombstone(seed, now_string) + current[name] = build_tombstone(entry, now_string) return sorted(current.values(), key=lambda entry: entry["name"].casefold()) From e46166f0cb6147132655c50348cd2e699cd65c7c Mon Sep 17 00:00:00 2001 From: herr kaste Date: Sun, 29 Mar 2026 15:55:10 +0200 Subject: [PATCH 25/30] Pass registry (`db`) down to `apply_seed_lifecycle` --- 
scripts/generate_registry.py | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/scripts/generate_registry.py b/scripts/generate_registry.py index 42978d3b5..7ae9a91ff 100644 --- a/scripts/generate_registry.py +++ b/scripts/generate_registry.py @@ -139,11 +139,8 @@ async def main( no_seed=no_seed, ) if seed and not no_seed: - db["packages"] = apply_seed_lifecycle( - db["packages"], - seed, - now_utc_string(), - ) + apply_seed_lifecycle(db, seed, now_utc_string()) + write_json(output_file, db, pretty=True, ensure_ascii=True) print(f"Saved registry as {output_file}") except asyncio.TimeoutError: @@ -406,18 +403,17 @@ def is_registry_recovery_db(db: Mapping[str, Any]) -> TypeGuard[Registry]: def apply_seed_lifecycle( - packages: list[RegistryEntry], + registry: Registry, seed: SeedDb, now_string: IsoTimestamp, -) -> list[RegistryEntry]: +) -> None: seed_packages = { entry["name"]: entry for entry in iter_package_entries(seed.db) } - current: dict[str, RegistryEntry] current = { pkg["name"]: pkg - for pkg in packages + for pkg in registry["packages"] if isinstance(pkg.get("name"), str) } @@ -432,7 +428,7 @@ def apply_seed_lifecycle( if name not in current: current[name] = build_tombstone(entry, now_string) - return sorted(current.values(), key=lambda entry: entry["name"].casefold()) + registry["packages"] = sorted(current.values(), key=lambda entry: entry["name"].casefold()) def iter_package_entries(db: Mapping[str, Any]) -> Iterable[RegistryEntry]: From aa0519837d8415b03876468536d5d04c1583ab7b Mon Sep 17 00:00:00 2001 From: herr kaste Date: Sun, 29 Mar 2026 15:55:41 +0200 Subject: [PATCH 26/30] Remove two redundant str tests --- scripts/generate_registry.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/scripts/generate_registry.py b/scripts/generate_registry.py index 7ae9a91ff..cf8c79f3f 100644 --- a/scripts/generate_registry.py +++ b/scripts/generate_registry.py @@ -414,7 +414,6 @@ def apply_seed_lifecycle( current = { pkg["name"]: pkg for pkg in registry["packages"] - if isinstance(pkg.get("name"), str) } for name, package in current.items(): @@ -498,7 +497,6 @@ def seed_package_names_for_source(seed_db: Mapping[str, Any], source_url: str) - entry["name"] for entry in iter_package_entries(seed_db) if entry.get("source") == source_url - if isinstance(entry.get("name"), str) } return sorted(names, key=str.casefold) From 17dd24563a1ebd414aecf30ff54ca143419aee93 Mon Sep 17 00:00:00 2001 From: herr kaste Date: Sun, 29 Mar 2026 16:01:20 +0200 Subject: [PATCH 27/30] Pass `seed` down to `fetch_packages` --- scripts/generate_registry.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/scripts/generate_registry.py b/scripts/generate_registry.py index cf8c79f3f..fccc88a16 100644 --- a/scripts/generate_registry.py +++ b/scripts/generate_registry.py @@ -135,7 +135,7 @@ async def main( db = await fetch_packages( channels, failure_recovery, - seed_hint_db=seed.db if seed else None, + seed=seed, no_seed=no_seed, ) if seed and not no_seed: @@ -151,7 +151,7 @@ async def fetch_packages( channels: list[str], recovery_db: Registry | None = None, *, - seed_hint_db: Mapping[str, Any] | None = None, + seed: SeedDb | None = None, no_seed: bool = False, ) -> Registry: print("Fetching registered packages...") @@ -178,7 +178,7 @@ async def fetch_packages( warn_unrecoverable_seed_entries( url, recovery_db=recovery_db, - seed_hint_db=seed_hint_db, + seed=seed, no_seed=no_seed, ) continue @@ -430,7 +430,7 @@ def apply_seed_lifecycle( 
registry["packages"] = sorted(current.values(), key=lambda entry: entry["name"].casefold()) -def iter_package_entries(db: Mapping[str, Any]) -> Iterable[RegistryEntry]: +def iter_package_entries(db: Mapping[str, Any]) -> Iterable[SeedEntry]: entries = db.get("packages") # Shape: registry.json if isinstance(entries, list): @@ -452,15 +452,15 @@ def warn_unrecoverable_seed_entries( source_url: str, *, recovery_db: Registry | None, - seed_hint_db: Mapping[str, Any] | None, + seed: SeedDb | None, no_seed: bool, ) -> None: - if seed_hint_db is None: + if seed is None: return if has_recovery_entries_for_source(recovery_db, source_url): return - lost_names = seed_package_names_for_source(seed_hint_db, source_url) + lost_names = seed_package_names_for_source(seed.db, source_url) if not lost_names: return From 63f0b3442a67168d1b1995d5f56d0b8490974886 Mon Sep 17 00:00:00 2001 From: herr kaste Date: Sun, 29 Mar 2026 17:25:43 +0200 Subject: [PATCH 28/30] Refine failed-source warnings for compact seeds Avoid false-positive recovery warnings when a full registry-shaped seed is present but a failed repository has no known entries. Emit a clear generic warning only for compact seed.json inputs where failed-source recovery cannot be guaranteed, and point users to a full registry.json seed for complete recovery. Update and extend registry tests to cover compact-seed warning behavior and registry-seed non-warning behavior. --- scripts/generate_registry.py | 33 +++++++++------ tests/registry/test_generate_registry.py | 51 ++++++++++++++++++++++++ 2 files changed, 71 insertions(+), 13 deletions(-) diff --git a/scripts/generate_registry.py b/scripts/generate_registry.py index fccc88a16..d2ca29fe3 100644 --- a/scripts/generate_registry.py +++ b/scripts/generate_registry.py @@ -457,24 +457,27 @@ def warn_unrecoverable_seed_entries( ) -> None: if seed is None: return + if has_recovery_entries_for_source(recovery_db, source_url): return - lost_names = seed_package_names_for_source(seed.db, source_url) - if not lost_names: + mode_outcome = "dropped" if no_seed else "tombstoned" + + if is_compact_seed(seed.db): + err( + "ATTENTION: repository recovery cannot be guaranteed with a compact seed. " + "Check the output. Consider using a full registry.json seed for complete " + f"recovery; missing packages are {mode_outcome}." + ) return - mode_outcome = ( - "these are dropped." - if no_seed - else "these are tombstoned." - ) - err( - "ATTENTION: seed file knows " - f"{pl(len(lost_names), 'packages')} in the failed repository " - "but has no data to recover full entries; " - f"{mode_outcome}" - ) + if lost_names := seed_package_names_for_source(seed.db, source_url): + err( + "ATTENTION: seed file knows " + f"{pl(len(lost_names), 'packages')} in the failed repository " + "but has no data to recover full entries; " + f"these are {mode_outcome}." 
+ ) def has_recovery_entries_for_source( @@ -501,6 +504,10 @@ def seed_package_names_for_source(seed_db: Mapping[str, Any], source_url: str) - return sorted(names, key=str.casefold) +def is_compact_seed(seed_db: Mapping[str, Any]) -> bool: + return "packages" not in seed_db + + def build_tombstone(seed: Mapping[str, Any], now_string: IsoTimestamp) -> RegistryEntry: return ( {"first_seen": now_string, "removed": now_string} # type: ignore[operator] diff --git a/tests/registry/test_generate_registry.py b/tests/registry/test_generate_registry.py index 645c0a756..00e9acbfb 100644 --- a/tests/registry/test_generate_registry.py +++ b/tests/registry/test_generate_registry.py @@ -618,3 +618,54 @@ async def test_workspace_seed_uses_prior_output_registry_for_failed_repo_recover captured = capsys.readouterr() assert "recover full entries" not in captured.err + + +@pytest.mark.asyncio +async def test_compact_seed_without_sources_warns_on_failed_repo(tmp_path, capsys): + repo_path = tmp_path / "missing.json" + channel_path = tmp_path / "channel.json" + make_channel(channel_path, [repo_path]) + + output_file = tmp_path / "registry.json" + seed_file = tmp_path / "seed.json" + seed_file.write_text(json.dumps({ + "SFTP": { + "name": "SFTP", + "first_seen": "2011-12-15T14:11:26Z", + } + })) + + await main( + str(output_file), + [channel_path.as_uri()], + seed_path=str(seed_file), + ) + + captured = capsys.readouterr() + assert "repository recovery cannot be guaranteed with a compact seed" in captured.err + assert "full registry.json seed for complete recovery" in captured.err + assert "recover full entries" not in captured.err + + +@pytest.mark.asyncio +async def test_registry_seed_without_source_entries_does_not_emit_compact_warning(tmp_path, capsys): + repo_path = tmp_path / "missing.json" + channel_path = tmp_path / "channel.json" + make_channel(channel_path, [repo_path]) + + output_file = tmp_path / "registry.json" + seed_file = tmp_path / "registry_seed.json" + seed_file.write_text(json.dumps({ + "repositories": [repo_path.as_uri()], + "packages": [], + "libraries": [], + })) + + await main( + str(output_file), + [channel_path.as_uri()], + seed_path=str(seed_file), + ) + + captured = capsys.readouterr() + assert "repository recovery cannot be guaranteed with a compact seed" not in captured.err From 03cf5362d6038dea4fe61dfdd4a1525749b0b163 Mon Sep 17 00:00:00 2001 From: herr kaste Date: Sun, 29 Mar 2026 19:19:06 +0200 Subject: [PATCH 29/30] WS --- scripts/generate_registry.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/scripts/generate_registry.py b/scripts/generate_registry.py index d2ca29fe3..117a4d55d 100644 --- a/scripts/generate_registry.py +++ b/scripts/generate_registry.py @@ -427,7 +427,8 @@ def apply_seed_lifecycle( if name not in current: current[name] = build_tombstone(entry, now_string) - registry["packages"] = sorted(current.values(), key=lambda entry: entry["name"].casefold()) + registry["packages"] = \ + sorted(current.values(), key=lambda entry: entry["name"].casefold()) def iter_package_entries(db: Mapping[str, Any]) -> Iterable[SeedEntry]: From 92126673b5d05079f3e645c4cd639f989d33e2f7 Mon Sep 17 00:00:00 2001 From: herr kaste Date: Tue, 31 Mar 2026 12:16:30 +0200 Subject: [PATCH 30/30] Use registry worktree sync script in crawl CI Update crawl workflow to seed generate_registry from ./.the-registry/registry.json and sync branch state via an extracted shell script. 
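In rough terms the crawl job now chains these steps (a sketch only; the real generate_registry step in crawl.yml below also passes the two --channel URLs, elided here as `...`):

    # check out the published registry branch into a worktree so its registry.json can seed the run
    git fetch origin the-registry
    git worktree add --force -B the-registry ./.the-registry origin/the-registry
    # regenerate the registry, seeded from the previously published state
    uv run -m scripts.generate_registry --seed ./.the-registry/registry.json -o ./wrk/registry.json ...
    # commit and push only if the generated file differs from the published one
    bash ./.github/workflows/sync_registry_branch.sh ./.the-registry/ ./wrk/registry.json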
Add .github/workflows/sync_registry_branch.sh to perform compare-first syncing, fallback commit messaging, and push to the-registry. Add pytest coverage for happy path, no-op behavior, and classifier crash fallback using a local bare origin to avoid network pushes. --- .github/workflows/crawl.yml | 9 + .github/workflows/sync_registry_branch.sh | 50 +++++ tests/test_sync_registry_branch_script.py | 215 ++++++++++++++++++++++ 3 files changed, 274 insertions(+) create mode 100644 .github/workflows/sync_registry_branch.sh create mode 100644 tests/test_sync_registry_branch_script.py diff --git a/.github/workflows/crawl.yml b/.github/workflows/crawl.yml index 111c2be86..df9146e29 100644 --- a/.github/workflows/crawl.yml +++ b/.github/workflows/crawl.yml @@ -29,6 +29,11 @@ jobs: - name: Checkout repository uses: actions/checkout@v6 + - name: Checkout registry worktree + run: | + git fetch origin the-registry + git worktree add --force -B the-registry ./.the-registry origin/the-registry + - name: Set up Python uses: actions/setup-python@v6 with: @@ -79,11 +84,15 @@ jobs: run: | set -o pipefail PYTHONUNBUFFERED=1 uv run -m scripts.generate_registry \ + --seed ./.the-registry/registry.json \ --channel https://raw.githubusercontent.com/packagecontrol/channel/refs/heads/main/repository.json \ --channel https://raw.githubusercontent.com/wbond/package_control_channel/refs/heads/master/channel.json \ -o ./wrk/registry.json \ 2> >(tee registry.log >&2) + - name: Sync registry branch + run: bash ./.github/workflows/sync_registry_branch.sh ./.the-registry/ ./wrk/registry.json + - name: Run crawler run: | set -o pipefail diff --git a/.github/workflows/sync_registry_branch.sh b/.github/workflows/sync_registry_branch.sh new file mode 100644 index 000000000..1770c99df --- /dev/null +++ b/.github/workflows/sync_registry_branch.sh @@ -0,0 +1,50 @@ +#!/usr/bin/env bash +set -euo pipefail + +UV_BINARY="${UV_BINARY:-uv}" +REGISTRY_DIR="${1:-./.the-registry}" +OLD="${REGISTRY_DIR%/}/registry.json" +NEW="${2:-./wrk/registry.json}" + +if cmp -s "$OLD" "$NEW"; then + echo "No registry changes." + exit 0 +fi + +MSG="" +DESC_ERR="$(mktemp)" +if ! 
MSG="$("$UV_BINARY" run -m scripts.describe_registry_changes -a "$OLD" -b "$NEW" 2>"$DESC_ERR")"; then + DESC_ERR_CONTENT="$(cat "$DESC_ERR")" + echo "::warning::describe_registry_changes failed; using fallback subject" + if [ -n "$DESC_ERR_CONTENT" ]; then + echo "::notice::describe_registry_changes stderr: $(head -n 1 "$DESC_ERR")" + MSG="$(cat < None: + require_shell_tools() + + with tempfile.TemporaryDirectory(dir=project_root()) as temp_dir: + sandbox = Path(temp_dir) + origin_dir, registry_dir = init_registry_branch_repo(sandbox) + assert_origin_is_local(registry_dir, origin_dir) + + new_registry = sandbox / "wrk-registry.json" + write_registry(new_registry, package_names=["DemoPkg"]) + + result = run_sync_script(registry_dir, new_registry) + + assert result.returncode == 0, result.stderr + assert "fallback subject used" not in result.stdout + + commit_subject = git_output( + ["git", "-C", str(registry_dir), "log", "-1", "--pretty=%s"], + ).strip() + assert commit_subject == "Added `DemoPkg`" + + local_registry = json.loads((registry_dir / "registry.json").read_text(encoding="utf-8")) + generated_registry = json.loads(new_registry.read_text(encoding="utf-8")) + assert local_registry == generated_registry + + local_head = git_output(["git", "-C", str(registry_dir), "rev-parse", "HEAD"]).strip() + remote_head = git_output( + ["git", "--git-dir", str(origin_dir), "rev-parse", "refs/heads/the-registry"], + ).strip() + assert local_head == remote_head + + +def test_sync_registry_branch_noop_when_files_match() -> None: + require_shell_tools() + + with tempfile.TemporaryDirectory(dir=project_root()) as temp_dir: + sandbox = Path(temp_dir) + origin_dir, registry_dir = init_registry_branch_repo(sandbox) + assert_origin_is_local(registry_dir, origin_dir) + + new_registry = sandbox / "wrk-registry.json" + write_registry(new_registry, package_names=[]) + + before_head = git_output(["git", "-C", str(registry_dir), "rev-parse", "HEAD"]).strip() + result = run_sync_script(registry_dir, new_registry) + after_head = git_output(["git", "-C", str(registry_dir), "rev-parse", "HEAD"]).strip() + + assert result.returncode == 0, result.stderr + assert "No registry changes." 
in result.stdout + assert before_head == after_head + + +def test_sync_registry_branch_commits_when_describe_crashes() -> None: + require_shell_tools() + + with tempfile.TemporaryDirectory(dir=project_root()) as temp_dir: + sandbox = Path(temp_dir) + origin_dir, registry_dir = init_registry_branch_repo(sandbox) + assert_origin_is_local(registry_dir, origin_dir) + + new_registry = sandbox / "wrk-registry.json" + write_registry(new_registry, package_names=["CrashPathPkg"]) + + fake_uv = sandbox / "fake-uv" + fake_uv.write_text( + "#!/usr/bin/env bash\n" + "echo 'Traceback (most recent call last):' >&2\n" + "echo ' File \"scripts/describe_registry_changes.py\", line 1, in ' >&2\n" + "echo 'RuntimeError: boom' >&2\n" + "exit 1\n", + encoding="utf-8", + ) + fake_uv.chmod(0o755) + + env = os.environ.copy() + env["UV_BINARY"] = to_script_arg(fake_uv) + + result = run_sync_script(registry_dir, new_registry, env=env) + + assert result.returncode == 0, result.stderr + assert "describe_registry_changes failed; using fallback subject" in result.stdout + assert ( + "describe_registry_changes stderr: Traceback (most recent call last):" + in result.stdout + ) + + commit_subject = git_output( + ["git", "-C", str(registry_dir), "log", "-1", "--pretty=%s"], + ).strip() + assert commit_subject == "Update registry.json" + + commit_body = git_output( + ["git", "-C", str(registry_dir), "log", "-1", "--pretty=%B"], + ) + assert "describe_registry_changes.py raised" in commit_body + assert "Traceback (most recent call last):" in commit_body + assert "RuntimeError: boom" in commit_body + + local_head = git_output(["git", "-C", str(registry_dir), "rev-parse", "HEAD"]).strip() + remote_head = git_output( + ["git", "--git-dir", str(origin_dir), "rev-parse", "refs/heads/the-registry"], + ).strip() + assert local_head == remote_head + + +def require_shell_tools() -> None: + if shutil.which("git") is None: + pytest.skip("git is required for sync script tests") + if bash_executable() is None: + pytest.skip("bash is required for sync script tests") + + +def run_sync_script( + registry_dir: Path, + new_registry_path: Path, + *, + env: dict[str, str] | None = None, +) -> subprocess.CompletedProcess[str]: + bash_path = bash_executable() + if bash_path is None: + raise RuntimeError("bash is required for sync script tests") + + effective_env = os.environ.copy() + if env: + effective_env.update(env) + effective_env["GIT_ALLOW_PROTOCOL"] = "file" + effective_env["GIT_TERMINAL_PROMPT"] = "0" + + return subprocess.run( + [ + bash_path, + "./.github/workflows/sync_registry_branch.sh", + to_script_arg(registry_dir), + to_script_arg(new_registry_path), + ], + cwd=project_root(), + env=effective_env, + capture_output=True, + text=True, + check=False, + ) + + +def init_registry_branch_repo(sandbox: Path) -> tuple[Path, Path]: + origin_dir = sandbox / "origin.git" + registry_dir = sandbox / ".the-registry" + + run_checked(["git", "init", "--bare", str(origin_dir)]) + run_checked(["git", "clone", str(origin_dir), str(registry_dir)]) + run_checked(["git", "-C", str(registry_dir), "checkout", "-b", "the-registry"]) + run_checked(["git", "-C", str(registry_dir), "config", "user.name", "Test User"]) + run_checked(["git", "-C", str(registry_dir), "config", "user.email", "test@example.com"]) + + write_registry(registry_dir / "registry.json", package_names=[]) + run_checked(["git", "-C", str(registry_dir), "add", "registry.json"]) + run_checked(["git", "-C", str(registry_dir), "commit", "-m", "Initial"]) + run_checked(["git", "-C", 
str(registry_dir), "push", "-u", "origin", "the-registry"]) + + return origin_dir, registry_dir + + +def assert_origin_is_local(registry_dir: Path, origin_dir: Path) -> None: + origin_url = git_output(["git", "-C", str(registry_dir), "remote", "get-url", "origin"]).strip() + assert Path(origin_url).resolve() == origin_dir.resolve() + + +def to_script_arg(path: Path) -> str: + rel = path.resolve().relative_to(project_root()) + return f"./{rel.as_posix()}" + + +def write_registry(path: Path, *, package_names: list[str]) -> None: + payload = { + "packages": [{"name": name} for name in package_names], + "libraries": [], + } + path.write_text(json.dumps(payload), encoding="utf-8") + + +def git_output(command: list[str]) -> str: + return subprocess.check_output(command, text=True) + + +def run_checked(command: list[str]) -> None: + subprocess.run(command, check=True, capture_output=True, text=True) + + +def bash_executable() -> str | None: + try: + where_output = subprocess.check_output(["where", "bash"], text=True) + except Exception: + return shutil.which("bash") + + for line in where_output.splitlines(): + candidate = line.strip() + if "git" in candidate.lower() and candidate.lower().endswith("bash.exe"): + return candidate + + return shutil.which("bash") + + +def project_root() -> Path: + return Path(__file__).resolve().parents[1]