Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
42 changes: 31 additions & 11 deletions .github/workflows/crawl.yml
Original file line number Diff line number Diff line change
Expand Up @@ -26,10 +26,10 @@ jobs:
PRESTO_PRESTO_CRAWL: ${{ vars.PRESTO_PRESTO_CRAWL }}
steps:
- name: Checkout repository
uses: actions/checkout@v4
uses: actions/checkout@v6

- name: Set up Python
uses: actions/setup-python@v5
uses: actions/setup-python@v6
with:
python-version: '3.13'

Expand All @@ -39,8 +39,29 @@ jobs:
- name: Ensure wrk directory exists
run: mkdir -p ./wrk

# --------------------------------------------------------------------
# Freeze one run-level timestamp for the entire crawl job.
#
# Why:
# - We want all artifacts and logs from a single workflow run to agree
# on one exact point in time.
# - This avoids subtle drift where separate `date` calls differ by
# seconds and make later analysis harder.
#
# Consumers of this frozen timestamp:
# - scripts/crawl.py (run timestamp for crawl/update detection)
# - scripts/collect_logs.py (log entry timestamp fallback via NOW_TS)
#
# Notes:
# - We export via $GITHUB_ENV so NOW_TS is available to subsequent steps
# in this job.
# - Use epoch seconds (`date +%s`) to stay timezone-agnostic.
# --------------------------------------------------------------------
- name: Freeze run timestamp
run: echo "NOW_TS=$(date +%s)" >> "$GITHUB_ENV"

- name: Restore wrk cache
uses: actions/cache@v4
uses: actions/cache@v5
with:
path: |
./wrk
Expand Down Expand Up @@ -130,8 +151,7 @@ jobs:
gh release upload ${{ env.RELEASE_TAG }} ./wrk/registry.json --clobber
gh release upload ${{ env.RELEASE_TAG }} ./wrk/workspace.json --clobber

DATE_TS=$(date +%s)
DATE=$(TZ=Europe/Berlin date -d "@$DATE_TS" +"%B %d, %Y, %H:%M GMT%:::z" | sed -E 's/([+-])0/\1/')
DATE=$(TZ=Europe/Berlin date -d "@$NOW_TS" +"%B %d, %Y, %H:%M GMT%:::z" | sed -E 's/([+-])0/\1/')
REPO_URL="https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }}"
# Build new notes
{
Expand Down Expand Up @@ -165,13 +185,13 @@ jobs:

uv run -m scripts.collect_logs \
--run-id "${{ github.run_id }}" \
--timestamp "$DATE_TS" \
--workspace ./wrk/workspace.json \
-o ./wrk/logs.json \
notes.txt

- name: Upload wrk backup
id: crawl-backup-step
uses: actions/upload-artifact@v4
uses: actions/upload-artifact@v7
with:
name: crawl-backup
path: wrk/
Expand All @@ -185,10 +205,10 @@ jobs:
GITHUB_TOKEN: ${{ github.token }}
steps:
- name: Checkout repository
uses: actions/checkout@v4
uses: actions/checkout@v6

- name: Set up Python
uses: actions/setup-python@v5
uses: actions/setup-python@v6
with:
python-version: '3.13'

Expand All @@ -199,7 +219,7 @@ jobs:
run: mkdir -p ./wrk

- name: Restore wrk cache
uses: actions/cache@v4
uses: actions/cache@v5
with:
path: ./wrk
key: stats-cache-${{ github.run_id }}
Expand All @@ -217,7 +237,7 @@ jobs:

- name: Upload wrk backup
id: stats-backup-step
uses: actions/upload-artifact@v4
uses: actions/upload-artifact@v7
with:
name: stats-backup
path: wrk/
Expand Down
8 changes: 8 additions & 0 deletions scripts/_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,14 @@ def pl(count: int, word: str) -> str:
return f"{count} {_INFLECT.plural(singular, count)}"


def format_name_list(names: list[str]) -> str:
    """Join names into a human-readable English list.

    Examples:
        ["a"]           -> "a"
        ["a", "b"]      -> "a and b"
        ["a", "b", "c"] -> "a, b, and c"   (Oxford comma)

    Fix: the original raised IndexError on an empty list (fell through to
    ``names[-1]``); an empty list now yields "".
    """
    if not names:
        return ""
    if len(names) == 1:
        return names[0]
    if len(names) == 2:
        return f"{names[0]} and {names[1]}"
    return f"{', '.join(names[:-1])}, and {names[-1]}"


def pipe(v, *fns):
for fn in fns:
v = fn(v)
Expand Down
94 changes: 77 additions & 17 deletions scripts/collect_logs.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,14 +7,28 @@
from dataclasses import dataclass
from datetime import datetime, timedelta, timezone
from pathlib import Path
from typing import Any
from typing import Any, NotRequired, TypedDict

from ._utils import write_json

DEFAULT_OUTPUT = "logs.json"
HISTORY_DAYS = 32


class LogEntry(TypedDict):
# Canonical log entry shape written by collect_logs.
date: str
run_id: str
notes: str
found_updates: NotRequired[list[FoundUpdateEntry]]


class FoundUpdateEntry(TypedDict):
name: str
detected_at: str
published_at: str


def main():
args = parse_args()
try:
Expand All @@ -30,6 +44,7 @@ class Args:
notes: str
run_id: str | None
timestamp: float | None
workspace: str | None
history_days: int
pretty: bool

Expand All @@ -55,6 +70,14 @@ def parse_args() -> Args:
default=None,
help="Unix timestamp (seconds) when the notes were produced.",
)
parser.add_argument(
"--workspace",
default=None,
help=(
"Optional workspace JSON path. When provided, "
"collect found_updates from matching package entries."
),
)
parser.add_argument(
"--history-days",
type=int,
Expand All @@ -76,6 +99,7 @@ def parse_args() -> Args:
notes=ns.notes,
run_id=ns.run_id,
timestamp=ns.timestamp,
workspace=ns.workspace,
history_days=ns.history_days,
pretty=ns.pretty,
)
Expand All @@ -85,34 +109,43 @@ def update_logs(args: Args):
run_id = args.run_id or os.environ.get("GITHUB_RUN_ID")
if not run_id:
raise SystemExit("collect_logs: missing --run-id or GITHUB_RUN_ID")
if args.timestamp is None:
raise SystemExit("collect_logs: missing --timestamp")

notes_path = Path(args.notes)
if not notes_path.is_file():
raise SystemExit(f"collect_logs: notes file not found: {notes_path}")

notes_text = notes_path.read_text(encoding="utf-8")
forced_timestamp = datetime.fromtimestamp(args.timestamp, tz=timezone.utc)
timestamp = args.timestamp
if timestamp is None:
env_now_ts = os.environ.get("NOW_TS")
if env_now_ts is None:
raise SystemExit("collect_logs: missing --timestamp")
timestamp = float(env_now_ts.strip())

runtime_ts = datetime.fromtimestamp(timestamp, tz=timezone.utc)
run_timestamp_iso = runtime_ts.strftime("%Y-%m-%dT%H:%M:%SZ")

output_path = Path(args.output).expanduser().resolve()
output_dir = output_path.parent
if output_dir and not output_dir.exists():
output_dir.mkdir(parents=True, exist_ok=True)

entries = load_logs(output_path)
entries: list[LogEntry] = load_json(output_path)
run_id_str = str(run_id)
entries = [entry for entry in entries if entry.get("run_id") != run_id_str]

entries.append({
"date": forced_timestamp.isoformat(),
entry: LogEntry = {
"date": runtime_ts.isoformat(),
"run_id": run_id_str,
"notes": notes_text,
})
}
if args.workspace:
entry["found_updates"] = derive_found_updates(args.workspace, run_timestamp_iso)
entries.append(entry)

entries.sort(key=lambda entry: entry["date"], reverse=True)

cutoff = retention_cutoff(args.history_days, reference=now_utc())
cutoff = now_ts() - timedelta(days=args.history_days)
kept_entries = [
entry for entry in entries
if datetime.fromisoformat(entry["date"]) >= cutoff
Expand All @@ -121,18 +154,45 @@ def update_logs(args: Args):
write_json(output_path, kept_entries, pretty=args.pretty, ensure_ascii=True)


def now_utc() -> datetime:
return datetime.now(timezone.utc)
def derive_found_updates(workspace_path: str, run_timestamp_iso: str) -> list[FoundUpdateEntry]:
    """Return updates that were detected during this exact run.

    A package counts as "found" when its ``update_detected`` timestamp
    equals the frozen run timestamp. The result is ordered
    case-insensitively by package name.
    """
    updates: list[FoundUpdateEntry] = [
        {
            "name": pkg["name"],
            "detected_at": run_timestamp_iso,
            "published_at": pkg["last_modified"],
        }
        for pkg in load_workspace_packages(workspace_path).values()
        if pkg.get("update_detected") == run_timestamp_iso
    ]
    return sorted(updates, key=lambda item: item["name"].casefold())


def load_workspace_packages(path: str) -> dict[str, dict]:
    """Load the ``packages`` mapping from a workspace JSON file.

    Exits with a descriptive message when the file is missing or when the
    document (or its ``packages`` member) is not a JSON object.

    Note: the diff rendering interleaved stale removed lines of the old
    ``retention_cutoff`` helper into this span; this is the clean
    new-side definition.
    """
    workspace_path = Path(path)
    if not workspace_path.is_file():
        raise SystemExit(f"collect_logs: workspace file not found: {workspace_path}")

    workspace = load_json(workspace_path)
    if not isinstance(workspace, dict):
        raise SystemExit(f"collect_logs: workspace must be a JSON object: {workspace_path}")

    packages: dict[str, dict] = workspace.get("packages", {})
    if not isinstance(packages, dict):
        raise SystemExit(f"collect_logs: workspace packages must be an object: {workspace_path}")

    return packages


def now_ts() -> datetime:
    """Current UTC time, honoring a frozen NOW_TS override.

    When the NOW_TS environment variable is set to a non-empty value
    (epoch seconds, exported by the crawl workflow), that instant is
    returned so every step of a run agrees on one timestamp; otherwise
    the live wall clock is used.
    """
    raw = os.getenv("NOW_TS")
    if not raw:
        return datetime.now(timezone.utc)
    return datetime.fromtimestamp(float(raw.strip()), tz=timezone.utc)


def load_logs(path: Path) -> list[dict[str, Any]]:
def load_json(path: Path) -> Any:
try:
return json.loads(path.read_text(encoding="utf-8"))
except FileNotFoundError:
Expand Down
33 changes: 26 additions & 7 deletions scripts/crawl.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,8 +29,8 @@
normalize_version_spec,
)
from ._utils import (
next_run, parse_version, resolve_url, update_url, write_json, pl, pick,
VersionInfo
format_name_list, next_run, parse_version, resolve_url, update_url, write_json, pl, pick,
VersionInfo,
)
from ._explain_package import print_package_explain
import traceback
Expand Down Expand Up @@ -81,6 +81,7 @@ class WorkspaceEntry(TypedDict, total=False):
last_seen: IsoTimestamp
next_crawl: IsoTimestamp
last_modified: IsoTimestamp
update_detected: IsoTimestamp
failing_since: IsoTimestamp
fail_reason: str

Expand Down Expand Up @@ -252,6 +253,7 @@ async def main_(
maintenance(registry, workspace)
tocrawl = next_packages_to_crawl(registry, workspace, limit=limit, presto=presto)

updated_packages: list[str] = []
async with aiohttp.ClientSession() as session:
tasks = [
crawl(
Expand All @@ -265,6 +267,8 @@ async def main_(
results = await asyncio.gather(*tasks)
for new_entry in results:
workspace["packages"][new_entry["name"]] = new_entry
if "update_detected" in new_entry:
updated_packages.append(new_entry["name"])
if name_requested:
print(json.dumps(new_entry, indent=2, ensure_ascii=False))

Expand All @@ -275,6 +279,11 @@ async def main_(
f"in db."
)

updated_packages = sorted(updated_packages)
if updated_packages:
s = "" if len(updated_packages) == 1 else "s"
print(f"Found update{s} for {format_name_list(updated_packages)}.")

if len(tocrawl) > 0:
print("GitHub", rate_limit_info)

Expand All @@ -286,7 +295,7 @@ def next_packages_to_crawl(
Returns a list of packages to crawl, sorted by next_crawl timestamp.
If next_crawl is not set, it defaults to the current time.
"""
now = datetime.now(timezone.utc)
now = now_ts()
now_string = now.strftime(UTC_FORMAT)
packages = registry["packages"]
packages_to_crawl = [
Expand Down Expand Up @@ -355,7 +364,7 @@ def next_packages_to_crawl(
def maintenance(registry: Registry, workspace: Workspace) -> None:
# lookup all packages in workspace and mark them as `removed`
# if they have been removed from the registry
now = datetime.now(timezone.utc)
now = now_ts()
now_string = now.strftime(UTC_FORMAT)
current_package_names = {entry["name"] for entry in registry["packages"]}
packages = workspace["packages"]
Expand All @@ -369,7 +378,7 @@ async def crawl(
existing: WorkspaceEntry
) -> WorkspaceEntry:
out: WorkspaceEntry
now = datetime.now(timezone.utc)
now = now_ts()
now_string = now.strftime(UTC_FORMAT)

try:
Expand Down Expand Up @@ -433,6 +442,10 @@ async def crawl(
else:
out["last_modified"] = max((r["date"] for r in releases))

previous_last_modified = existing.get("last_modified")
if previous_last_modified and out["last_modified"] != previous_last_modified:
out["update_detected"] = now_string

# Determine next_crawl interval
last_modified_dt = (
datetime
Expand Down Expand Up @@ -460,7 +473,7 @@ async def crawl_package(
entry: RegistryEntry,
existing: WorkspaceEntry
) -> WorkspaceEntry:
now = datetime.now(timezone.utc)
now = now_ts()
maybe_skip_crawling(entry, existing, now)
ensure_secure_source(entry, existing)

Expand Down Expand Up @@ -613,7 +626,7 @@ async def resolve_tags(
version_set = SpecifierSet(version_spec)

resolved_releases: list[Release] = []
now = datetime.now(timezone.utc)
now = now_ts()
cutoff = now - timedelta(weeks=53)

# We take all releases from the current (rolling) year, but if there
Expand Down Expand Up @@ -1148,6 +1161,12 @@ def count_limit_occurrences(argv: list[str]) -> int:
return count


def now_ts() -> datetime:
    """Return the run's frozen timestamp, falling back to the real clock.

    NOW_TS (epoch seconds) is exported by the crawl workflow so that all
    scripts in one run observe the same instant; when it is unset or
    empty we use live UTC time.
    """
    override = os.getenv("NOW_TS")
    return (
        datetime.fromtimestamp(float(override.strip()), tz=timezone.utc)
        if override
        else datetime.now(timezone.utc)
    )


def env_flag(name: str, default: bool = False) -> bool:
value = os.getenv(name)
if value is None:
Expand Down
Loading
Loading