Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
30 commits
Select commit Hold shift + click to select a range
6070872
Harden secure-source checks for tombstones
kaste Mar 27, 2026
a573f8a
Preserve source on crawl failures
kaste Mar 27, 2026
3c969ad
Skip tombstoned packages in crawl scheduling
kaste Mar 27, 2026
65d7ea9
Reject tombstoned package in --name mode
kaste Mar 27, 2026
51e0203
Handle tombstones in explain mode
kaste Mar 27, 2026
6ddc81e
Move explain helpers into _explain_package
kaste Mar 27, 2026
1b9ecc0
Import tombstones during maintenance
kaste Mar 27, 2026
7d56800
Test tombstone resurrection after maintenance import
kaste Mar 27, 2026
3d2c29c
Classify package and library registry changes
kaste Mar 27, 2026
4efb2bf
Add seeded lifecycle mode to generate_registry
kaste Mar 28, 2026
dea0047
Tighten seeded lifecycle cleanup in registry generation
kaste Mar 28, 2026
5792277
Some type work
kaste Mar 28, 2026
3a639fb
Document seed mode and add seed extractor
kaste Mar 28, 2026
e0881d5
Replace seed_from_workspace with generate_seed
kaste Mar 28, 2026
53331e2
Warn when seed shape completeness is poor
kaste Mar 28, 2026
5214320
Harden seed fallback for failed repositories
kaste Mar 29, 2026
e6bdfcf
Internally rename variable db -> recovery_db
kaste Mar 29, 2026
2d3c91c
Simplify recovery-shape checks in seed loading
kaste Mar 29, 2026
013e84b
Drop SeedLoad null-object and clarify seed strictness
kaste Mar 29, 2026
37f3104
Inline seed package index for lifecycle merge
kaste Mar 29, 2026
38a77b3
Type recovery DB as registry-shaped structure
kaste Mar 29, 2026
4fbb643
Simplify package-entry iteration helper
kaste Mar 29, 2026
8607efd
Use `Registry` type in favor of `RecoveryDb`
kaste Mar 29, 2026
e5f160c
Pass `seed` down to `apply_seed_lifecycle`
kaste Mar 29, 2026
e46166f
Pass registry (`db`) down to `apply_seed_lifecycle`
kaste Mar 29, 2026
aa05198
Remove two redundant str tests
kaste Mar 29, 2026
17dd245
Pass `seed` down to `fetch_packages`
kaste Mar 29, 2026
63f0b34
Refine failed-source warnings for compact seeds
kaste Mar 29, 2026
03cf536
WS
kaste Mar 29, 2026
9212667
Use registry worktree sync script in crawl CI
kaste Mar 31, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions .github/workflows/crawl.yml
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,11 @@ jobs:
- name: Checkout repository
uses: actions/checkout@v6

- name: Checkout registry worktree
run: |
git fetch origin the-registry
git worktree add --force -B the-registry ./.the-registry origin/the-registry

- name: Set up Python
uses: actions/setup-python@v6
with:
Expand Down Expand Up @@ -79,11 +84,15 @@ jobs:
run: |
set -o pipefail
PYTHONUNBUFFERED=1 uv run -m scripts.generate_registry \
--seed ./.the-registry/registry.json \
--channel https://raw.githubusercontent.com/packagecontrol/channel/refs/heads/main/repository.json \
--channel https://raw.githubusercontent.com/wbond/package_control_channel/refs/heads/master/channel.json \
-o ./wrk/registry.json \
2> >(tee registry.log >&2)

- name: Sync registry branch
run: bash ./.github/workflows/sync_registry_branch.sh ./.the-registry/ ./wrk/registry.json

- name: Run crawler
run: |
set -o pipefail
Expand Down
50 changes: 50 additions & 0 deletions .github/workflows/sync_registry_branch.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
#!/usr/bin/env bash
# Sync a freshly generated registry.json into the registry worktree and push it.
#
# Usage: sync_registry_branch.sh [REGISTRY_DIR] [NEW_REGISTRY_JSON]
#   REGISTRY_DIR       directory holding the committed registry.json
#                      (default: ./.the-registry); git commands run inside it.
#   NEW_REGISTRY_JSON  newly generated registry (default: ./wrk/registry.json).
#
# Environment:
#   UV_BINARY          executable used to run the Python helper (default: uv).
#
# Exits 0 without committing when both files are byte-identical.
set -euo pipefail

UV_BINARY="${UV_BINARY:-uv}"
REGISTRY_DIR="${1:-./.the-registry}"
# Strip one trailing slash so "dir/" and "dir" yield the same path.
OLD="${REGISTRY_DIR%/}/registry.json"
NEW="${2:-./wrk/registry.json}"

# `cmp -s` is silent and succeeds only when the files are identical.
if cmp -s "$OLD" "$NEW"; then
echo "No registry changes."
exit 0
fi

# Ask describe_registry_changes for a commit message. Its stderr goes to a
# temp file so that a failure can be reported in the fallback commit body.
MSG=""
DESC_ERR="$(mktemp)"
if ! MSG="$("$UV_BINARY" run -m scripts.describe_registry_changes -a "$OLD" -b "$NEW" 2>"$DESC_ERR")"; then
DESC_ERR_CONTENT="$(cat "$DESC_ERR")"
# `::warning::` / `::notice::` lines are GitHub Actions workflow commands.
echo "::warning::describe_registry_changes failed; using fallback subject"
if [ -n "$DESC_ERR_CONTENT" ]; then
# Only the first stderr line goes into the annotation; the full text is
# embedded in the commit message body below.
echo "::notice::describe_registry_changes stderr: $(head -n 1 "$DESC_ERR")"
MSG="$(cat <<EOF
Update registry.json

---
describe_registry_changes.py raised

$DESC_ERR_CONTENT
EOF
)"
else
MSG="Update registry.json"
fi
fi
rm -f "$DESC_ERR"

# The files differ byte-wise (checked above), yet the helper returned "Same."
# or nothing at all, so fall back to a generic subject.
# NOTE(review): "Same." is presumably the helper's "no difference" output;
# confirm against scripts/describe_registry_changes.
if [ "$MSG" = "Same." ] || [ -z "$MSG" ]; then
echo "::notice::describe_registry_changes did not recognize a real registry diff; using fallback subject (raw='${MSG:-<empty>}')"
MSG="Update registry.json"
fi

# Overwrite the tracked registry with the new one before staging it.
cp "$NEW" "$OLD"

git -C "$REGISTRY_DIR" config user.name "github-actions[bot]"
git -C "$REGISTRY_DIR" config user.email "41898282+github-actions[bot]@users.noreply.github.com"
git -C "$REGISTRY_DIR" add registry.json
# The commit message is read from stdin (`-F -`), so multi-line messages
# need no shell quoting.
git -C "$REGISTRY_DIR" commit -F - <<MSG
$MSG
MSG

git -C "$REGISTRY_DIR" push origin the-registry
35 changes: 35 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,41 @@ uv run -m scripts.generate_registry
uv run -m scripts.generate_registry --output myreg.json --channel <url1> --channel <url2>
```

`generate_registry` supports implicit lifecycle enrichment: newly discovered
packages get a `first_seen` timestamp, and packages that have been removed from the
input channels/repositories are re-added as tombstones. In addition, input
channels/repositories that fail on access lead to marked entries *if* a previous
registry/seed is present; this prevents packages from jumping between different
sources, especially malicious ones.

This behavior is on by default; however, you might not notice it, because we don't
blindly add "first_seen: now" stamps when no prior registry is found.

Concretely,

- The default seed path is `--output`; this way, running `generate_registry` multiple
times from the same directory uses the file written by the previous run as seed input.

- `--seed [PATH]` enforces seed input (supports `registry.json`, `workspace.json`,
and `seed.json`-style package maps) and raises if the seed is not present or readable.

- `--no-seed` disables lifecycle enrichment (`first_seen`/`removed` tombstones);
however, source-failure marking still applies when prior seed data is available.

```bash
uv run -m scripts.generate_registry --output registry.json
uv run -m scripts.generate_registry --output registry.json --seed ./the-registry/registry.json
uv run -m scripts.generate_registry --output registry.json --no-seed
```

To create a compact archival seed, use `generate_seed` with either a workspace
or a registry as input (but not both):

```bash
uv run -m scripts.generate_seed --workspace ./workspace.json --output ./seed.json
uv run -m scripts.generate_seed --registry ./registry.json --output ./seed.json
```

---

### 2. `crawl.py`
Expand Down
72 changes: 71 additions & 1 deletion scripts/_explain_package.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,11 @@
from __future__ import annotations

from copy import deepcopy
from difflib import SequenceMatcher
import json
from typing import Any
from typing import Any, Literal

from ._utils import parse_sublime_text_max

from rich import box
from rich.console import Console
Expand Down Expand Up @@ -32,6 +35,64 @@ def print_package_explain(
)


def print_package_explain_effective(name: str, normalized: dict[str, Any]) -> None:
    """Print the effective release definitions of a tags-mode package.

    The package's ``releases`` are sorted, classified with
    ``classify_tags_mode`` and reduced with
    ``keep_newest_release_definitions``.  Nothing is printed unless the
    classification is truthy; otherwise a one-line summary is printed,
    followed by a JSON dump of a copy of *normalized* whose ``releases``
    hold only the kept definitions.  *normalized* itself is not modified.
    """
    ordered = sorted_release_definitions(normalized.get("releases", []))
    mode = classify_tags_mode(ordered)

    # Work on a copy so the caller's mapping keeps its full release list.
    effective = deepcopy(normalized)
    effective["releases"] = keep_newest_release_definitions(ordered)

    if not mode:
        return

    qualifier = "(effectively) " if mode == "effective" else ""
    print(f"{name} uses {qualifier}the tags-mode.")
    print(json.dumps(effective, ensure_ascii=False, sort_keys=True))


def classify_tags_mode(
    sorted_releases: list[dict[str, Any]],
) -> bool | Literal["effective"]:
    """Classify how a sorted list of release definitions uses tags.

    Returns ``False`` for an empty list, ``True`` when every definition
    uses tags, ``"effective"`` when only some do but the last one does,
    and ``False`` otherwise.
    """
    if not sorted_releases:
        return False

    for entry in sorted_releases:
        if not release_uses_tags_mode(entry):
            break
    else:
        # The loop never hit `break`: every definition uses tags.
        return True

    if release_uses_tags_mode(sorted_releases[-1]):
        return "effective"
    return False


def sorted_release_definitions(releases: list[dict[str, Any]]) -> list[dict[str, Any]]:
    """Return a new list of *releases* ordered by ``release_definition_sort_key``."""
    ordered = list(releases)
    ordered.sort(key=release_definition_sort_key)
    return ordered


def keep_newest_release_definitions(
    releases: list[dict[str, Any]],
) -> list[dict[str, Any]]:
    """Keep the definitions that share the last entry's maximum build.

    The reference value is ``parse_sublime_text_max`` of the last element's
    ``sublime_text`` selector; every definition with an equal value is
    kept, in input order.  An empty input yields an empty list.

    NOTE(review): presumably *releases* is expected to be pre-sorted (e.g.
    by ``sorted_release_definitions``) so the last entry is the newest.
    """
    if not releases:
        return []

    def max_build(entry: dict[str, Any]) -> float:
        return parse_sublime_text_max(entry.get("sublime_text"))

    newest = max_build(releases[-1])
    return [entry for entry in releases if max_build(entry) == newest]


def release_definition_sort_key(release: dict[str, Any]) -> tuple[float, str]:
    """Build the sort key for one release definition.

    The key is the maximum build parsed from ``sublime_text``, followed by
    the sortable representation of ``tags`` as a tie-breaker.
    """
    max_build = parse_sublime_text_max(release.get("sublime_text"))
    tags_rank = _tags_sort_value(release.get("tags"))
    return max_build, tags_rank


def release_uses_tags_mode(release: dict[str, Any]) -> bool:
    """Return whether the release definition has a truthy ``tags`` value."""
    tags = release.get("tags", False)
    return True if tags else False


def print_library_explain(
name: str,
rows: list[tuple[dict[str, Any], list[dict[str, Any]]]],
Expand Down Expand Up @@ -85,6 +146,15 @@ def print_library_explain(
console.print(table)


def _tags_sort_value(value: Any) -> str:
# Place plain `True` after common prefixes like `st2-`.
if value is True:
return "~~true"
if isinstance(value, str):
return value
return ""


def _render_json_diff_table(
title: str,
left_obj: dict[str, Any] | list[Any],
Expand Down
35 changes: 35 additions & 0 deletions scripts/_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -278,6 +278,41 @@ def is_semver(s: str) -> bool:
return parse_version(s) is not None


def parse_sublime_text_max(selector: object) -> float:
    """Return the highest build number a ``sublime_text`` selector allows.

    Whitespace in the selector is ignored.  Handled forms:

    * non-string, ``""`` or ``"*"``: ``inf``
    * ``"A-B"``: the integer prefix of the part after the first ``-``
    * ``"<=N"``: ``N``
    * ``"<N"``: ``N - 1``, but never below ``0``
    * ``">N"`` / ``">=N"``: ``inf``
    * anything else: the integer prefix of the selector

    Whenever no integer prefix can be parsed, the result is ``inf``.
    """
    unbounded = float("inf")
    if not isinstance(selector, str):
        return unbounded

    compact = re.sub(r"\s+", "", selector)
    if compact == "" or compact == "*":
        return unbounded

    exclusive = False
    _, dash, upper = compact.partition("-")
    if dash:
        # Range form: only the part right of the first dash matters.
        digits_from = upper
    elif compact.startswith("<="):
        digits_from = compact[2:]
    elif compact.startswith("<"):
        digits_from = compact[1:]
        exclusive = True
    elif compact.startswith(">"):
        # Covers both ">" and ">=": there is no upper bound.
        return unbounded
    else:
        digits_from = compact

    bound = parse_int_prefix(digits_from)
    if bound is None:
        return unbounded
    if exclusive:
        bound = max(0, bound - 1)
    return float(bound)


def parse_int_prefix(text: str) -> int | None:
    """Return the run of leading digits in *text* as an int, or ``None`` if there is none."""
    leading = re.match(r"^\d+", text)
    if leading is None:
        return None
    return int(leading.group(0))


SECONDS_PER_DAY = 24 * 60 * 60


Expand Down
Loading
Loading