From 3c030e82f1cb7a63f7262a33b71e1184fecfaf8c Mon Sep 17 00:00:00 2001 From: herr kaste Date: Sat, 21 Mar 2026 13:17:24 +0100 Subject: [PATCH 01/15] Track update detection timestamps on packages Add `update_detected` to workspace package entries and set it when a package's `last_modified` value changes across successful crawls. - `update_detected` is not set on first discovery - `update_detected` is unset on the next run --- scripts/crawl.py | 5 ++ tests/crawl/test_update_detection.py | 73 ++++++++++++++++++++++++++++ 2 files changed, 78 insertions(+) create mode 100644 tests/crawl/test_update_detection.py diff --git a/scripts/crawl.py b/scripts/crawl.py index 73545174f..97f1c304f 100644 --- a/scripts/crawl.py +++ b/scripts/crawl.py @@ -81,6 +81,7 @@ class WorkspaceEntry(TypedDict, total=False): last_seen: IsoTimestamp next_crawl: IsoTimestamp last_modified: IsoTimestamp + update_detected: IsoTimestamp failing_since: IsoTimestamp fail_reason: str @@ -433,6 +434,10 @@ async def crawl( else: out["last_modified"] = max((r["date"] for r in releases)) + previous_last_modified = existing.get("last_modified") + if previous_last_modified and out["last_modified"] != previous_last_modified: + out["update_detected"] = now_string + # Determine next_crawl interval last_modified_dt = ( datetime diff --git a/tests/crawl/test_update_detection.py b/tests/crawl/test_update_detection.py new file mode 100644 index 000000000..4772cebfa --- /dev/null +++ b/tests/crawl/test_update_detection.py @@ -0,0 +1,73 @@ +import pytest + +from scripts.crawl import crawl + + +@pytest.mark.asyncio +async def test_sets_update_detected_when_last_modified_changes(set_now, monkeypatch): + package = {"name": "Example"} + existing = { + "name": "Example", + "last_modified": "2024-05-01T00:00:00Z", + "update_detected": "2024-05-05T00:00:00Z", + } + + async def stub(*args, **kwargs): + return { + "name": "Example", + "releases": [{"date": "2024-05-31T00:00:00Z"}], + } + + set_now("2024-06-01T00:00:00Z") + 
monkeypatch.setattr("scripts.crawl.crawl_package", stub) + + result = await crawl(object(), package, existing) + + assert result["last_modified"] == "2024-05-31T00:00:00Z" + assert result["update_detected"] == "2024-06-01T00:00:00Z" + + +@pytest.mark.asyncio +async def test_does_not_set_update_detected_for_first_seen_package(set_now, monkeypatch): + package = {"name": "Example"} + existing = {"name": "Example"} + + async def stub(*args, **kwargs): + return { + "name": "Example", + "releases": [{"date": "2024-05-31T00:00:00Z"}], + } + + set_now("2024-06-01T00:00:00Z") + monkeypatch.setattr("scripts.crawl.crawl_package", stub) + + result = await crawl(object(), package, existing) + + assert result["last_modified"] == "2024-05-31T00:00:00Z" + assert "update_detected" not in result + + +@pytest.mark.asyncio +async def test_drops_previous_update_detected_when_last_modified_is_unchanged( + set_now, + monkeypatch, +): + package = {"name": "Example"} + existing = { + "name": "Example", + "last_modified": "2024-05-31T00:00:00Z", + "update_detected": "2024-05-15T00:00:00Z", + } + + async def stub(*args, **kwargs): + return { + "name": "Example", + "releases": [{"date": "2024-05-31T00:00:00Z"}], + } + + set_now("2024-06-01T00:00:00Z") + monkeypatch.setattr("scripts.crawl.crawl_package", stub) + + result = await crawl(object(), package, existing) + + assert "update_detected" not in result From 661d91e541545018231145971376cfabf5ab9093 Mon Sep 17 00:00:00 2001 From: herr kaste Date: Sat, 21 Mar 2026 13:20:19 +0100 Subject: [PATCH 02/15] Print "Found update(s)..." After writing results, collect packages whose `update_detected` matches the run timestamp and print a readable summary line in the crawl output. Also move Oxford-list name formatting into `scripts._utils` and reuse it from both `crawl.py` and `crawl_libraries.py`. 
--- scripts/_utils.py | 8 ++++ scripts/crawl.py | 12 ++++- scripts/crawl_libraries.py | 10 +--- tests/crawl/test_update_detection.py | 68 +++++++++++++++++++++++++++- tests/test_utils.py | 13 ++++++ 5 files changed, 99 insertions(+), 12 deletions(-) diff --git a/scripts/_utils.py b/scripts/_utils.py index 38cf6b95c..979497940 100644 --- a/scripts/_utils.py +++ b/scripts/_utils.py @@ -57,6 +57,14 @@ def pl(count: int, word: str) -> str: return f"{count} {_INFLECT.plural(singular, count)}" +def format_name_list(names: list[str]) -> str: + if len(names) == 1: + return names[0] + if len(names) == 2: + return f"{names[0]} and {names[1]}" + return f"{', '.join(names[:-1])}, and {names[-1]}" + + def pipe(v, *fns): for fn in fns: v = fn(v) diff --git a/scripts/crawl.py b/scripts/crawl.py index 97f1c304f..745ba5335 100644 --- a/scripts/crawl.py +++ b/scripts/crawl.py @@ -29,8 +29,8 @@ normalize_version_spec, ) from ._utils import ( - next_run, parse_version, resolve_url, update_url, write_json, pl, pick, - VersionInfo + format_name_list, next_run, parse_version, resolve_url, update_url, write_json, pl, pick, + VersionInfo, ) from ._explain_package import print_package_explain import traceback @@ -253,6 +253,7 @@ async def main_( maintenance(registry, workspace) tocrawl = next_packages_to_crawl(registry, workspace, limit=limit, presto=presto) + updated_packages: list[str] = [] async with aiohttp.ClientSession() as session: tasks = [ crawl( @@ -266,6 +267,8 @@ async def main_( results = await asyncio.gather(*tasks) for new_entry in results: workspace["packages"][new_entry["name"]] = new_entry + if "update_detected" in new_entry: + updated_packages.append(new_entry["name"]) if name_requested: print(json.dumps(new_entry, indent=2, ensure_ascii=False)) @@ -276,6 +279,11 @@ async def main_( f"in db." 
) + updated_packages = sorted(updated_packages) + if updated_packages: + s = "" if len(updated_packages) == 1 else "s" + print(f"Found update{s} for {format_name_list(updated_packages)}.") + if len(tocrawl) > 0: print("GitHub", rate_limit_info) diff --git a/scripts/crawl_libraries.py b/scripts/crawl_libraries.py index ad414121c..4bd9c40b7 100644 --- a/scripts/crawl_libraries.py +++ b/scripts/crawl_libraries.py @@ -22,7 +22,7 @@ load_json, resolve_library, ) -from ._utils import err, write_json +from ._utils import err, format_name_list, write_json from ._explain_package import print_library_explain @@ -465,13 +465,5 @@ def format_updated_message(names: list[str]) -> str: return f"{format_name_list(names)} have been updated." -def format_name_list(names: list[str]) -> str: - if len(names) == 1: - return names[0] - if len(names) == 2: - return f"{names[0]} and {names[1]}" - return f"{', '.join(names[:-1])}, and {names[-1]}" - - if __name__ == "__main__": main() diff --git a/tests/crawl/test_update_detection.py b/tests/crawl/test_update_detection.py index 4772cebfa..d2fe454ba 100644 --- a/tests/crawl/test_update_detection.py +++ b/tests/crawl/test_update_detection.py @@ -1,6 +1,6 @@ import pytest -from scripts.crawl import crawl +from scripts.crawl import crawl, main_ @pytest.mark.asyncio @@ -71,3 +71,69 @@ async def stub(*args, **kwargs): result = await crawl(object(), package, existing) assert "update_detected" not in result + + +@pytest.mark.asyncio +async def test_main_prints_sorted_oxford_list_for_updates(set_now, monkeypatch, capsys): + registry = { + "packages": [ + {"name": "gamma"}, + {"name": "alpha"}, + {"name": "beta"}, + ] + } + workspace = {"packages": {}} + + async def stub_crawl(session, package, existing): + return {"name": package["name"], "update_detected": "2024-06-01T00:00:00Z"} + + set_now("2024-06-01T00:00:00Z") + monkeypatch.setattr("scripts.crawl.crawl", stub_crawl) + + await main_(registry, workspace, None, 100) + + out = 
capsys.readouterr().out + assert "Found updates for alpha, beta, and gamma." in out + + +@pytest.mark.asyncio +async def test_main_prints_singular_update_summary(set_now, monkeypatch, capsys): + registry = {"packages": [{"name": "alpha"}, {"name": "beta"}]} + workspace = {"packages": {}} + + async def stub_crawl(session, package, existing): + if package["name"] == "alpha": + return {"name": package["name"], "update_detected": "2024-06-01T00:00:00Z"} + return {"name": package["name"]} + + set_now("2024-06-01T00:00:00Z") + monkeypatch.setattr("scripts.crawl.crawl", stub_crawl) + + await main_(registry, workspace, None, 100) + + out = capsys.readouterr().out + assert "Found update for alpha." in out + + +@pytest.mark.asyncio +async def test_main_does_not_report_first_seen_as_update( + set_now, + monkeypatch, + capsys, +): + registry = {"packages": [{"name": "alpha"}]} + workspace = {"packages": {}} + + async def stub_crawl_package(session, package, existing): + return { + "name": package["name"], + "releases": [{"date": "2024-05-31T00:00:00Z"}], + } + + set_now("2024-06-01T00:00:00Z") + monkeypatch.setattr("scripts.crawl.crawl_package", stub_crawl_package) + + await main_(registry, workspace, None, 100) + + out = capsys.readouterr().out + assert "Found update" not in out diff --git a/tests/test_utils.py b/tests/test_utils.py index 1716e2f5f..61016b4ff 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -5,6 +5,7 @@ is_semver, normalize_tz_aware_datetime, unique_values_preserving_order, + format_name_list, pl, ) @@ -107,6 +108,18 @@ def test_unique_values_preserving_order_iterable(): assert unique_values_preserving_order(values) == ["a", "b", "c"] +@pytest.mark.parametrize( + ("names", "expected"), + [ + (["alpha"], "alpha"), + (["alpha", "beta"], "alpha and beta"), + (["alpha", "beta", "gamma"], "alpha, beta, and gamma"), + ], +) +def test_format_name_list(names, expected): + assert format_name_list(names) == expected + + @pytest.mark.parametrize( ("count", 
"word", "expected"), [ From cbed7a29d70922406aa3ea7b5e5602dd0ed33adc Mon Sep 17 00:00:00 2001 From: herr kaste Date: Sat, 21 Mar 2026 13:21:12 +0100 Subject: [PATCH 03/15] Freeze crawl job timestamp in workflow Add an early `NOW_TS` export in the crawl workflow job and reuse that frozen value when building notes and invoking `scripts.collect_logs`. This aligns run artifacts and log timestamps to one shared run marker, and prepares `scripts.crawl` to read the same `NOW_TS` value. --- .github/workflows/crawl.yml | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/.github/workflows/crawl.yml b/.github/workflows/crawl.yml index 69a2bf795..55a217c6d 100644 --- a/.github/workflows/crawl.yml +++ b/.github/workflows/crawl.yml @@ -39,6 +39,9 @@ jobs: - name: Ensure wrk directory exists run: mkdir -p ./wrk + - name: Freeze run timestamp + run: echo "NOW_TS=$(date +%s)" >> "$GITHUB_ENV" + - name: Restore wrk cache uses: actions/cache@v4 with: @@ -130,8 +133,7 @@ jobs: gh release upload ${{ env.RELEASE_TAG }} ./wrk/registry.json --clobber gh release upload ${{ env.RELEASE_TAG }} ./wrk/workspace.json --clobber - DATE_TS=$(date +%s) - DATE=$(TZ=Europe/Berlin date -d "@$DATE_TS" +"%B %d, %Y, %H:%M GMT%:::z" | sed -E 's/([+-])0/\1/') + DATE=$(TZ=Europe/Berlin date -d "@$NOW_TS" +"%B %d, %Y, %H:%M GMT%:::z" | sed -E 's/([+-])0/\1/') REPO_URL="https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }}" # Build new notes { @@ -165,7 +167,7 @@ jobs: uv run -m scripts.collect_logs \ --run-id "${{ github.run_id }}" \ - --timestamp "$DATE_TS" \ + --timestamp "$NOW_TS" \ -o ./wrk/logs.json \ notes.txt From f4843bd5f4cc07dc95c12e761cc92e13c2989cf8 Mon Sep 17 00:00:00 2001 From: herr kaste Date: Sat, 21 Mar 2026 13:21:58 +0100 Subject: [PATCH 04/15] Allow NOW_TS env variable to set the time for crawl Update `scripts.crawl` to honor `NOW_TS` first to freeze the time during runtime. `NOW_TS` is accepted as Unix epoch seconds. 
--- scripts/crawl.py | 16 +++++++++++----- tests/crawl/test_update_detection.py | 15 ++++++++++++++- 2 files changed, 25 insertions(+), 6 deletions(-) diff --git a/scripts/crawl.py b/scripts/crawl.py index 745ba5335..6b9ceef71 100644 --- a/scripts/crawl.py +++ b/scripts/crawl.py @@ -295,7 +295,7 @@ def next_packages_to_crawl( Returns a list of packages to crawl, sorted by next_crawl timestamp. If next_crawl is not set, it defaults to the current time. """ - now = datetime.now(timezone.utc) + now = now_ts() now_string = now.strftime(UTC_FORMAT) packages = registry["packages"] packages_to_crawl = [ @@ -364,7 +364,7 @@ def next_packages_to_crawl( def maintenance(registry: Registry, workspace: Workspace) -> None: # lookup all packages in workspace and mark them as `removed` # if they have been removed from the registry - now = datetime.now(timezone.utc) + now = now_ts() now_string = now.strftime(UTC_FORMAT) current_package_names = {entry["name"] for entry in registry["packages"]} packages = workspace["packages"] @@ -378,7 +378,7 @@ async def crawl( existing: WorkspaceEntry ) -> WorkspaceEntry: out: WorkspaceEntry - now = datetime.now(timezone.utc) + now = now_ts() now_string = now.strftime(UTC_FORMAT) try: @@ -473,7 +473,7 @@ async def crawl_package( entry: RegistryEntry, existing: WorkspaceEntry ) -> WorkspaceEntry: - now = datetime.now(timezone.utc) + now = now_ts() maybe_skip_crawling(entry, existing, now) ensure_secure_source(entry, existing) @@ -626,7 +626,7 @@ async def resolve_tags( version_set = SpecifierSet(version_spec) resolved_releases: list[Release] = [] - now = datetime.now(timezone.utc) + now = now_ts() cutoff = now - timedelta(weeks=53) # We take all releases from the current (rolling) year, but if there @@ -1161,6 +1161,12 @@ def count_limit_occurrences(argv: list[str]) -> int: return count +def now_ts() -> datetime: + if value := os.getenv("NOW_TS"): + return datetime.fromtimestamp(float(value.strip()), tz=timezone.utc) + return 
datetime.now(timezone.utc) + + def env_flag(name: str, default: bool = False) -> bool: value = os.getenv(name) if value is None: diff --git a/tests/crawl/test_update_detection.py b/tests/crawl/test_update_detection.py index d2fe454ba..9fffdcc98 100644 --- a/tests/crawl/test_update_detection.py +++ b/tests/crawl/test_update_detection.py @@ -1,6 +1,7 @@ +from datetime import datetime, timezone import pytest -from scripts.crawl import crawl, main_ +from scripts.crawl import crawl, main_, now_ts @pytest.mark.asyncio @@ -137,3 +138,15 @@ async def stub_crawl_package(session, package, existing): out = capsys.readouterr().out assert "Found update" not in out + + +def test_now_ts_prefers_now_ts_env(monkeypatch, set_now): + set_now("2024-06-01T00:00:00Z") + monkeypatch.setenv("NOW_TS", "1717286400") + + expected = ( + datetime + .strptime("2024-06-02T00:00:00Z", "%Y-%m-%dT%H:%M:%SZ") + .replace(tzinfo=timezone.utc) + ) + assert now_ts() == expected From 4fcc27ec077d49a982706320c26b8b226590a463 Mon Sep 17 00:00:00 2001 From: herr kaste Date: Mon, 23 Mar 2026 10:58:01 +0100 Subject: [PATCH 05/15] Allow collect_logs to read NOW_TS by default Make `--timestamp` optional for collect_logs. Timestamp precedence is now: 1) explicit `--timestamp` 2) `NOW_TS` environment variable If neither is available, keep the previous failure behavior and raise `collect_logs: missing --timestamp`. Add tests for NOW_TS fallback, explicit-arg precedence over NOW_TS, and missing-timestamp failure. 
--- scripts/collect_logs.py | 11 ++++-- tests/test_collect_logs.py | 76 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 84 insertions(+), 3 deletions(-) diff --git a/scripts/collect_logs.py b/scripts/collect_logs.py index f7182dc1e..bfc2a89c6 100644 --- a/scripts/collect_logs.py +++ b/scripts/collect_logs.py @@ -85,15 +85,20 @@ def update_logs(args: Args): run_id = args.run_id or os.environ.get("GITHUB_RUN_ID") if not run_id: raise SystemExit("collect_logs: missing --run-id or GITHUB_RUN_ID") - if args.timestamp is None: - raise SystemExit("collect_logs: missing --timestamp") notes_path = Path(args.notes) if not notes_path.is_file(): raise SystemExit(f"collect_logs: notes file not found: {notes_path}") notes_text = notes_path.read_text(encoding="utf-8") - forced_timestamp = datetime.fromtimestamp(args.timestamp, tz=timezone.utc) + timestamp = args.timestamp + if timestamp is None: + now_ts = os.environ.get("NOW_TS") + if now_ts is None: + raise SystemExit("collect_logs: missing --timestamp") + timestamp = float(now_ts.strip()) + + forced_timestamp = datetime.fromtimestamp(timestamp, tz=timezone.utc) output_path = Path(args.output).expanduser().resolve() output_dir = output_path.parent diff --git a/tests/test_collect_logs.py b/tests/test_collect_logs.py index 5056e1ecf..98c280051 100644 --- a/tests/test_collect_logs.py +++ b/tests/test_collect_logs.py @@ -2,6 +2,8 @@ import sys from datetime import datetime, timezone +import pytest + import scripts.collect_logs as collect_logs @@ -97,3 +99,77 @@ def test_collect_logs_prunes_entries_outside_retention(tmp_path, monkeypatch): assert all(entry["run_id"] != "1" for entry in entries) # Ordering should keep the newest entry first assert entries[0]["run_id"] == "99" + + +def test_collect_logs_uses_now_ts_when_timestamp_is_missing(tmp_path, monkeypatch): + notes_path = tmp_path / "notes.txt" + notes_path.write_text("line\n", encoding="utf-8") + + ts = datetime(2024, 10, 5, 9, 30, tzinfo=timezone.utc).timestamp() + 
monkeypatch.setenv("NOW_TS", str(ts)) + monkeypatch.setattr( + collect_logs, + "now_utc", + lambda: datetime(2024, 10, 6, 0, 0, tzinfo=timezone.utc), + ) + + logs_path = tmp_path / "logs.json" + args = collect_logs.Args( + output=str(logs_path), + notes=str(notes_path), + run_id="42", + timestamp=None, + history_days=collect_logs.HISTORY_DAYS, + pretty=False, + ) + + collect_logs.update_logs(args) + + entries = json.loads(logs_path.read_text(encoding="utf-8")) + assert entries[0]["date"] == "2024-10-05T09:30:00+00:00" + + +def test_collect_logs_timestamp_arg_wins_over_now_ts(tmp_path, monkeypatch): + notes_path = tmp_path / "notes.txt" + notes_path.write_text("line\n", encoding="utf-8") + + arg_ts = datetime(2024, 10, 5, 10, 0, tzinfo=timezone.utc).timestamp() + env_ts = datetime(2024, 10, 5, 11, 0, tzinfo=timezone.utc).timestamp() + monkeypatch.setenv("NOW_TS", str(env_ts)) + monkeypatch.setattr( + collect_logs, + "now_utc", + lambda: datetime(2024, 10, 6, 0, 0, tzinfo=timezone.utc), + ) + + logs_path = tmp_path / "logs.json" + args = collect_logs.Args( + output=str(logs_path), + notes=str(notes_path), + run_id="43", + timestamp=arg_ts, + history_days=collect_logs.HISTORY_DAYS, + pretty=False, + ) + + collect_logs.update_logs(args) + + entries = json.loads(logs_path.read_text(encoding="utf-8")) + assert entries[0]["date"] == "2024-10-05T10:00:00+00:00" + + +def test_collect_logs_raises_same_error_when_no_timestamp_available(tmp_path): + notes_path = tmp_path / "notes.txt" + notes_path.write_text("line\n", encoding="utf-8") + + args = collect_logs.Args( + output=str(tmp_path / "logs.json"), + notes=str(notes_path), + run_id="44", + timestamp=None, + history_days=collect_logs.HISTORY_DAYS, + pretty=False, + ) + + with pytest.raises(SystemExit, match="collect_logs: missing --timestamp"): + collect_logs.update_logs(args) From 21a3af8d4a86041d12b8c29668b773f3e09cdb3e Mon Sep 17 00:00:00 2001 From: herr kaste Date: Mon, 23 Mar 2026 10:59:22 +0100 Subject: [PATCH 06/15] 
Rename collect_logs runtime timestamp variable Rename `forced_timestamp` to `runtime_ts` in collect_logs for clearer naming aligned with run-level timestamp semantics. --- scripts/collect_logs.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/collect_logs.py b/scripts/collect_logs.py index bfc2a89c6..0b3eb4ecc 100644 --- a/scripts/collect_logs.py +++ b/scripts/collect_logs.py @@ -98,7 +98,7 @@ def update_logs(args: Args): raise SystemExit("collect_logs: missing --timestamp") timestamp = float(now_ts.strip()) - forced_timestamp = datetime.fromtimestamp(timestamp, tz=timezone.utc) + runtime_ts = datetime.fromtimestamp(timestamp, tz=timezone.utc) output_path = Path(args.output).expanduser().resolve() output_dir = output_path.parent @@ -110,7 +110,7 @@ def update_logs(args: Args): entries = [entry for entry in entries if entry.get("run_id") != run_id_str] entries.append({ - "date": forced_timestamp.isoformat(), + "date": runtime_ts.isoformat(), "run_id": run_id_str, "notes": notes_text, }) From a4d6d62d17b1a3f167a9ff42f5f81765dba9fa3d Mon Sep 17 00:00:00 2001 From: herr kaste Date: Mon, 23 Mar 2026 10:59:43 +0100 Subject: [PATCH 07/15] Drop explicit collect_logs timestamp in workflow Remove `--timestamp` from the collect_logs invocation in crawl.yml. collect_logs now resolves the run timestamp from `NOW_TS` when no explicit timestamp argument is provided. 
--- .github/workflows/crawl.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/crawl.yml b/.github/workflows/crawl.yml index 55a217c6d..130a51e22 100644 --- a/.github/workflows/crawl.yml +++ b/.github/workflows/crawl.yml @@ -167,7 +167,6 @@ jobs: uv run -m scripts.collect_logs \ --run-id "${{ github.run_id }}" \ - --timestamp "$NOW_TS" \ -o ./wrk/logs.json \ notes.txt From cc8ea107c30ae539f62f4b440e1c2fba8d0e3e93 Mon Sep 17 00:00:00 2001 From: herr kaste Date: Mon, 23 Mar 2026 11:00:32 +0100 Subject: [PATCH 08/15] Document frozen NOW_TS usage in crawl workflow Add an expanded comment block above the NOW_TS export step in crawl.yml. The comment explains why the timestamp is frozen once per run and calls out that both scripts/crawl.py and scripts/collect_logs.py consume the same run-level timestamp. --- .github/workflows/crawl.yml | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/.github/workflows/crawl.yml b/.github/workflows/crawl.yml index 130a51e22..1b94106f7 100644 --- a/.github/workflows/crawl.yml +++ b/.github/workflows/crawl.yml @@ -39,6 +39,24 @@ jobs: - name: Ensure wrk directory exists run: mkdir -p ./wrk + # -------------------------------------------------------------------- + # Freeze one run-level timestamp for the entire crawl job. + # + # Why: + # - We want all artifacts and logs from a single workflow run to agree + # on one exact point in time. + # - This avoids subtle drift where separate `date` calls differ by + # seconds and make later analysis harder. + # + # Consumers of this frozen timestamp: + # - scripts/crawl.py (run timestamp for crawl/update detection) + # - scripts/collect_logs.py (log entry timestamp fallback via NOW_TS) + # + # Notes: + # - We export via $GITHUB_ENV so NOW_TS is available to subsequent steps + # in this job. + # - Use epoch seconds (`date +%s`) to stay timezone-agnostic. 
+ # -------------------------------------------------------------------- - name: Freeze run timestamp run: echo "NOW_TS=$(date +%s)" >> "$GITHUB_ENV" From 013e9e5656aa9facb83c5b40cd872b2867765ea9 Mon Sep 17 00:00:00 2001 From: herr kaste Date: Mon, 23 Mar 2026 11:07:25 +0100 Subject: [PATCH 09/15] Add structured found_updates to collected logs Extend collect_logs with optional --workspace support so each new log entry can include structured package update detections for the run. When a workspace path is provided, collect_logs now derives `found_updates` by matching package `update_detected` timestamps against the frozen run timestamp, emits deterministic name ordering, and keeps `published_at` optional when `last_modified` is missing. Also wire the crawl workflow to pass --workspace so production logs include found_updates, and add focused tests for matching, ordering, empty lists, dedupe behavior, and timestamp precedence. --- .github/workflows/crawl.yml | 1 + scripts/collect_logs.py | 54 +++++++++- tests/test_collect_logs.py | 209 ++++++++++++++++++++++++++++++++++++ 3 files changed, 260 insertions(+), 4 deletions(-) diff --git a/.github/workflows/crawl.yml b/.github/workflows/crawl.yml index 1b94106f7..393d46947 100644 --- a/.github/workflows/crawl.yml +++ b/.github/workflows/crawl.yml @@ -185,6 +185,7 @@ jobs: uv run -m scripts.collect_logs \ --run-id "${{ github.run_id }}" \ + --workspace ./wrk/workspace.json \ -o ./wrk/logs.json \ notes.txt diff --git a/scripts/collect_logs.py b/scripts/collect_logs.py index 0b3eb4ecc..9b89b36c8 100644 --- a/scripts/collect_logs.py +++ b/scripts/collect_logs.py @@ -30,6 +30,7 @@ class Args: notes: str run_id: str | None timestamp: float | None + workspace: str | None history_days: int pretty: bool @@ -55,6 +56,14 @@ def parse_args() -> Args: default=None, help="Unix timestamp (seconds) when the notes were produced.", ) + parser.add_argument( + "--workspace", + default=None, + help=( + "Optional workspace JSON path. 
When provided, " + "collect found_updates from matching package entries." + ), + ) parser.add_argument( "--history-days", type=int, @@ -76,6 +85,7 @@ def parse_args() -> Args: notes=ns.notes, run_id=ns.run_id, timestamp=ns.timestamp, + workspace=ns.workspace, history_days=ns.history_days, pretty=ns.pretty, ) @@ -99,21 +109,25 @@ def update_logs(args: Args): timestamp = float(now_ts.strip()) runtime_ts = datetime.fromtimestamp(timestamp, tz=timezone.utc) + run_timestamp_iso = runtime_ts.strftime("%Y-%m-%dT%H:%M:%SZ") output_path = Path(args.output).expanduser().resolve() output_dir = output_path.parent if output_dir and not output_dir.exists(): output_dir.mkdir(parents=True, exist_ok=True) - entries = load_logs(output_path) + entries: list[dict[str, Any]] = load_json(output_path) run_id_str = str(run_id) entries = [entry for entry in entries if entry.get("run_id") != run_id_str] - entries.append({ + entry: dict[str, Any] = { "date": runtime_ts.isoformat(), "run_id": run_id_str, "notes": notes_text, - }) + } + if args.workspace: + entry["found_updates"] = derive_found_updates(args.workspace, run_timestamp_iso) + entries.append(entry) entries.sort(key=lambda entry: entry["date"], reverse=True) @@ -126,6 +140,38 @@ def update_logs(args: Args): write_json(output_path, kept_entries, pretty=args.pretty, ensure_ascii=True) +def derive_found_updates(workspace_path: str, run_timestamp_iso: str) -> list[dict[str, Any]]: + packages = load_workspace_packages(workspace_path) + found_updates = [] + for entry in packages.values(): + detected_at = entry.get("update_detected") + if detected_at == run_timestamp_iso: + found_updates.append({ + "name": entry["name"], + "detected_at": detected_at, + "published_at": entry.get("last_modified"), + }) + + found_updates.sort(key=lambda item: item["name"].casefold()) + return found_updates + + +def load_workspace_packages(path: str) -> dict[str, dict]: + workspace_path = Path(path) + if not workspace_path.is_file(): + raise 
SystemExit(f"collect_logs: workspace file not found: {workspace_path}") + + workspace = load_json(workspace_path) + if not isinstance(workspace, dict): + raise SystemExit(f"collect_logs: workspace must be a JSON object: {workspace_path}") + + packages: dict[str, dict] = workspace.get("packages", {}) + if not isinstance(packages, dict): + raise SystemExit(f"collect_logs: workspace packages must be an object: {workspace_path}") + + return packages + + def now_utc() -> datetime: return datetime.now(timezone.utc) @@ -137,7 +183,7 @@ def retention_cutoff(keep_days: int, *, reference: datetime | None = None) -> da return reference - timedelta(days=keep_days) -def load_logs(path: Path) -> list[dict[str, Any]]: +def load_json(path: Path) -> Any: try: return json.loads(path.read_text(encoding="utf-8")) except FileNotFoundError: diff --git a/tests/test_collect_logs.py b/tests/test_collect_logs.py index 98c280051..1387d4b4f 100644 --- a/tests/test_collect_logs.py +++ b/tests/test_collect_logs.py @@ -60,6 +60,211 @@ def test_collect_logs_deduplicates_run_id(tmp_path, monkeypatch): assert entries[0]["date"] == "2024-10-05T08:20:00+00:00" +def test_collect_logs_adds_found_updates_from_workspace(tmp_path, monkeypatch): + notes_path = tmp_path / "notes.txt" + notes_path.write_text("line\n", encoding="utf-8") + + ts = datetime(2024, 10, 5, 9, 30, tzinfo=timezone.utc).timestamp() + monkeypatch.setattr( + collect_logs, + "now_utc", + lambda: datetime(2024, 10, 6, 0, 0, tzinfo=timezone.utc), + ) + + workspace_path = tmp_path / "workspace.json" + workspace_path.write_text( + json.dumps( + { + "packages": { + "zeta": { + "name": "Zeta", + "update_detected": "2024-10-05T09:30:00Z", + "last_modified": "2024-10-04T00:00:00Z", + }, + "alpha": { + "name": "alpha", + "update_detected": "2024-10-05T09:30:00Z", + }, + "foo": { + "name": "Foo and Bar", + "update_detected": "2024-10-05T09:30:00Z", + "last_modified": "2024-10-03T00:00:00Z", + }, + "no-match": { + "name": "No Match", + 
"update_detected": "2024-10-05T09:31:00Z", + "last_modified": "2024-10-01T00:00:00Z", + }, + } + } + ), + encoding="utf-8", + ) + + logs_path = tmp_path / "logs.json" + args = collect_logs.Args( + output=str(logs_path), + notes=str(notes_path), + run_id="200", + timestamp=ts, + workspace=str(workspace_path), + history_days=collect_logs.HISTORY_DAYS, + pretty=False, + ) + + collect_logs.update_logs(args) + + entries = json.loads(logs_path.read_text(encoding="utf-8")) + assert entries[0]["found_updates"] == [ + { + "name": "alpha", + "detected_at": "2024-10-05T09:30:00Z", + "published_at": None, + }, + { + "name": "Foo and Bar", + "detected_at": "2024-10-05T09:30:00Z", + "published_at": "2024-10-03T00:00:00Z", + }, + { + "name": "Zeta", + "detected_at": "2024-10-05T09:30:00Z", + "published_at": "2024-10-04T00:00:00Z", + }, + ] + + +def test_collect_logs_writes_empty_found_updates_list_when_no_matches(tmp_path, monkeypatch): + notes_path = tmp_path / "notes.txt" + notes_path.write_text("line\n", encoding="utf-8") + + ts = datetime(2024, 10, 5, 9, 30, tzinfo=timezone.utc).timestamp() + monkeypatch.setattr( + collect_logs, + "now_utc", + lambda: datetime(2024, 10, 6, 0, 0, tzinfo=timezone.utc), + ) + + workspace_path = tmp_path / "workspace.json" + workspace_path.write_text( + json.dumps( + { + "packages": { + "alpha": { + "name": "alpha", + "update_detected": "2024-10-05T09:31:00Z", + "last_modified": "2024-10-01T00:00:00Z", + } + } + } + ), + encoding="utf-8", + ) + + logs_path = tmp_path / "logs.json" + args = collect_logs.Args( + output=str(logs_path), + notes=str(notes_path), + run_id="201", + timestamp=ts, + workspace=str(workspace_path), + history_days=collect_logs.HISTORY_DAYS, + pretty=False, + ) + + collect_logs.update_logs(args) + + entries = json.loads(logs_path.read_text(encoding="utf-8")) + assert entries[0]["found_updates"] == [] + + +def test_collect_logs_dedupes_run_id_with_found_updates(tmp_path, monkeypatch): + notes_path = tmp_path / "notes.txt" + 
notes_path.write_text("first\n", encoding="utf-8") + + monkeypatch.setenv("GITHUB_RUN_ID", "500") + monkeypatch.setattr( + collect_logs, + "now_utc", + lambda: datetime(2024, 10, 6, 0, 0, tzinfo=timezone.utc), + ) + + workspace_path = tmp_path / "workspace.json" + workspace_path.write_text( + json.dumps( + { + "packages": { + "alpha": { + "name": "alpha", + "update_detected": "2024-10-05T09:30:00Z", + "last_modified": "2024-10-01T00:00:00Z", + } + } + } + ), + encoding="utf-8", + ) + + logs_path = tmp_path / "logs.json" + monkeypatch.setattr( + sys, + "argv", + [ + "collect-logs", + "-o", + str(logs_path), + "--timestamp", + str(datetime(2024, 10, 5, 9, 30, tzinfo=timezone.utc).timestamp()), + "--workspace", + str(workspace_path), + str(notes_path), + ], + ) + collect_logs.main() + + notes_path.write_text("second\n", encoding="utf-8") + workspace_path.write_text( + json.dumps( + { + "packages": { + "beta": { + "name": "beta", + "update_detected": "2024-10-05T10:00:00Z", + "last_modified": "2024-10-02T00:00:00Z", + } + } + } + ), + encoding="utf-8", + ) + monkeypatch.setattr( + sys, + "argv", + [ + "collect-logs", + "-o", + str(logs_path), + "--timestamp", + str(datetime(2024, 10, 5, 10, 0, tzinfo=timezone.utc).timestamp()), + "--workspace", + str(workspace_path), + str(notes_path), + ], + ) + collect_logs.main() + + entries = json.loads(logs_path.read_text(encoding="utf-8")) + assert len(entries) == 1 + assert entries[0]["notes"] == "second\n" + assert entries[0]["found_updates"] == [ + { + "name": "beta", + "detected_at": "2024-10-05T10:00:00Z", + "published_at": "2024-10-02T00:00:00Z", + } + ] + + def test_collect_logs_prunes_entries_outside_retention(tmp_path, monkeypatch): logs_path = tmp_path / "logs.json" old_entry = { @@ -88,6 +293,7 @@ def test_collect_logs_prunes_entries_outside_retention(tmp_path, monkeypatch): notes=str(notes_path), run_id="99", timestamp=fixed_now.timestamp(), + workspace=None, history_days=collect_logs.HISTORY_DAYS, pretty=True, ) @@ 
-119,6 +325,7 @@ def test_collect_logs_uses_now_ts_when_timestamp_is_missing(tmp_path, monkeypatc notes=str(notes_path), run_id="42", timestamp=None, + workspace=None, history_days=collect_logs.HISTORY_DAYS, pretty=False, ) @@ -148,6 +355,7 @@ def test_collect_logs_timestamp_arg_wins_over_now_ts(tmp_path, monkeypatch): notes=str(notes_path), run_id="43", timestamp=arg_ts, + workspace=None, history_days=collect_logs.HISTORY_DAYS, pretty=False, ) @@ -167,6 +375,7 @@ def test_collect_logs_raises_same_error_when_no_timestamp_available(tmp_path): notes=str(notes_path), run_id="44", timestamp=None, + workspace=None, history_days=collect_logs.HISTORY_DAYS, pretty=False, ) From dc365d5b497d6487fb4c36792b658750137e0226 Mon Sep 17 00:00:00 2001 From: herr kaste Date: Mon, 23 Mar 2026 11:16:31 +0100 Subject: [PATCH 10/15] Require explicit reference in retention_cutoff Make collect_logs.retention_cutoff require a reference datetime instead of accepting an optional argument. The helper is only called with an explicit reference at its sole call site, so the optional branch was unnecessary. 
--- scripts/collect_logs.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/scripts/collect_logs.py b/scripts/collect_logs.py index 9b89b36c8..baa2d0f14 100644 --- a/scripts/collect_logs.py +++ b/scripts/collect_logs.py @@ -176,10 +176,8 @@ def now_utc() -> datetime: return datetime.now(timezone.utc) -def retention_cutoff(keep_days: int, *, reference: datetime | None = None) -> datetime: +def retention_cutoff(keep_days: int, *, reference: datetime) -> datetime: """Compute the earliest UTC timestamp we must retain.""" - if reference is None: - reference = now_utc() return reference - timedelta(days=keep_days) From ffd183fa052682a90a32cfc1db60ac7a4e854ecb Mon Sep 17 00:00:00 2001 From: herr kaste Date: Mon, 23 Mar 2026 11:17:38 +0100 Subject: [PATCH 11/15] Inline retention cutoff in collect_logs Inline the retention cutoff calculation in update_logs since the helper only wrapped a single trivial expression. This removes one indirection while keeping behavior unchanged. 
--- scripts/collect_logs.py | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/scripts/collect_logs.py b/scripts/collect_logs.py index baa2d0f14..e5bc99b6f 100644 --- a/scripts/collect_logs.py +++ b/scripts/collect_logs.py @@ -131,7 +131,7 @@ def update_logs(args: Args): entries.sort(key=lambda entry: entry["date"], reverse=True) - cutoff = retention_cutoff(args.history_days, reference=now_utc()) + cutoff = now_utc() - timedelta(days=args.history_days) kept_entries = [ entry for entry in entries if datetime.fromisoformat(entry["date"]) >= cutoff @@ -176,11 +176,6 @@ def now_utc() -> datetime: return datetime.now(timezone.utc) -def retention_cutoff(keep_days: int, *, reference: datetime) -> datetime: - """Compute the earliest UTC timestamp we must retain.""" - return reference - timedelta(days=keep_days) - - def load_json(path: Path) -> Any: try: return json.loads(path.read_text(encoding="utf-8")) From fd7ed1985594bb91e23713f190ea0403942fd152 Mon Sep 17 00:00:00 2001 From: herr kaste Date: Mon, 23 Mar 2026 11:19:40 +0100 Subject: [PATCH 12/15] Use now_ts semantics in collect_logs pruning clock Replace collect_logs' `now_utc()` helper with a `now_ts()` helper that matches crawl's NOW_TS-aware behavior. This keeps retention pruning aligned with the same frozen run timestamp mechanism when NOW_TS is exported. Also update collect_logs tests to patch `now_ts` instead of `now_utc`. 
--- scripts/collect_logs.py | 12 +++++++----- tests/test_collect_logs.py | 14 +++++++------- 2 files changed, 14 insertions(+), 12 deletions(-) diff --git a/scripts/collect_logs.py b/scripts/collect_logs.py index e5bc99b6f..22c86881d 100644 --- a/scripts/collect_logs.py +++ b/scripts/collect_logs.py @@ -103,10 +103,10 @@ def update_logs(args: Args): notes_text = notes_path.read_text(encoding="utf-8") timestamp = args.timestamp if timestamp is None: - now_ts = os.environ.get("NOW_TS") - if now_ts is None: + env_now_ts = os.environ.get("NOW_TS") + if env_now_ts is None: raise SystemExit("collect_logs: missing --timestamp") - timestamp = float(now_ts.strip()) + timestamp = float(env_now_ts.strip()) runtime_ts = datetime.fromtimestamp(timestamp, tz=timezone.utc) run_timestamp_iso = runtime_ts.strftime("%Y-%m-%dT%H:%M:%SZ") @@ -131,7 +131,7 @@ def update_logs(args: Args): entries.sort(key=lambda entry: entry["date"], reverse=True) - cutoff = now_utc() - timedelta(days=args.history_days) + cutoff = now_ts() - timedelta(days=args.history_days) kept_entries = [ entry for entry in entries if datetime.fromisoformat(entry["date"]) >= cutoff @@ -172,7 +172,9 @@ def load_workspace_packages(path: str) -> dict[str, dict]: return packages -def now_utc() -> datetime: +def now_ts() -> datetime: + if value := os.getenv("NOW_TS"): + return datetime.fromtimestamp(float(value.strip()), tz=timezone.utc) return datetime.now(timezone.utc) diff --git a/tests/test_collect_logs.py b/tests/test_collect_logs.py index 1387d4b4f..7fda9d328 100644 --- a/tests/test_collect_logs.py +++ b/tests/test_collect_logs.py @@ -15,7 +15,7 @@ def test_collect_logs_deduplicates_run_id(tmp_path, monkeypatch): ) fixed_now = datetime(2024, 10, 6, 0, 0, tzinfo=timezone.utc) - monkeypatch.setattr(collect_logs, "now_utc", lambda: fixed_now) + monkeypatch.setattr(collect_logs, "now_ts", lambda: fixed_now) monkeypatch.setenv("GITHUB_RUN_ID", "12345") logs_path = tmp_path / "logs.json" @@ -67,7 +67,7 @@ def 
test_collect_logs_adds_found_updates_from_workspace(tmp_path, monkeypatch): ts = datetime(2024, 10, 5, 9, 30, tzinfo=timezone.utc).timestamp() monkeypatch.setattr( collect_logs, - "now_utc", + "now_ts", lambda: datetime(2024, 10, 6, 0, 0, tzinfo=timezone.utc), ) @@ -141,7 +141,7 @@ def test_collect_logs_writes_empty_found_updates_list_when_no_matches(tmp_path, ts = datetime(2024, 10, 5, 9, 30, tzinfo=timezone.utc).timestamp() monkeypatch.setattr( collect_logs, - "now_utc", + "now_ts", lambda: datetime(2024, 10, 6, 0, 0, tzinfo=timezone.utc), ) @@ -185,7 +185,7 @@ def test_collect_logs_dedupes_run_id_with_found_updates(tmp_path, monkeypatch): monkeypatch.setenv("GITHUB_RUN_ID", "500") monkeypatch.setattr( collect_logs, - "now_utc", + "now_ts", lambda: datetime(2024, 10, 6, 0, 0, tzinfo=timezone.utc), ) @@ -286,7 +286,7 @@ def test_collect_logs_prunes_entries_outside_retention(tmp_path, monkeypatch): ) fixed_now = datetime(2024, 10, 5, 12, 0, tzinfo=timezone.utc) - monkeypatch.setattr(collect_logs, "now_utc", lambda: fixed_now) + monkeypatch.setattr(collect_logs, "now_ts", lambda: fixed_now) args = collect_logs.Args( output=str(logs_path), @@ -315,7 +315,7 @@ def test_collect_logs_uses_now_ts_when_timestamp_is_missing(tmp_path, monkeypatc monkeypatch.setenv("NOW_TS", str(ts)) monkeypatch.setattr( collect_logs, - "now_utc", + "now_ts", lambda: datetime(2024, 10, 6, 0, 0, tzinfo=timezone.utc), ) @@ -345,7 +345,7 @@ def test_collect_logs_timestamp_arg_wins_over_now_ts(tmp_path, monkeypatch): monkeypatch.setenv("NOW_TS", str(env_ts)) monkeypatch.setattr( collect_logs, - "now_utc", + "now_ts", lambda: datetime(2024, 10, 6, 0, 0, tzinfo=timezone.utc), ) From cc9270d1cdf2276ab6248553d3a68bdd15bc1e82 Mon Sep 17 00:00:00 2001 From: herr kaste Date: Mon, 23 Mar 2026 11:22:26 +0100 Subject: [PATCH 13/15] Document logs.json output shape with TypedDicts Add typed structures for collect_logs output entries: - LogEntry - FoundUpdateEntry This makes the logs.json schema explicit in 
code and clarifies the optional found_updates payload shape.
---
 scripts/collect_logs.py | 24 +++++++++++++++++++-----
 1 file changed, 19 insertions(+), 5 deletions(-)

diff --git a/scripts/collect_logs.py b/scripts/collect_logs.py
index 22c86881d..d902822d9 100644
--- a/scripts/collect_logs.py
+++ b/scripts/collect_logs.py
@@ -7,7 +7,7 @@
 from dataclasses import dataclass
 from datetime import datetime, timedelta, timezone
 from pathlib import Path
-from typing import Any
+from typing import Any, NotRequired, TypedDict
 
 from ._utils import write_json
 
@@ -15,6 +15,20 @@
 HISTORY_DAYS = 32
 
 
+class LogEntry(TypedDict):
+    # Canonical log entry shape written by collect_logs.
+    date: str
+    run_id: str
+    notes: str
+    found_updates: NotRequired[list["FoundUpdateEntry"]]
+
+
+class FoundUpdateEntry(TypedDict):
+    name: str
+    detected_at: str
+    published_at: str | None
+
+
 def main():
     args = parse_args()
     try:
@@ -116,11 +130,11 @@ def update_logs(args: Args):
     if output_dir and not output_dir.exists():
         output_dir.mkdir(parents=True, exist_ok=True)
 
-    entries: list[dict[str, Any]] = load_json(output_path)
+    entries: list[LogEntry] = load_json(output_path)
     run_id_str = str(run_id)
     entries = [entry for entry in entries if entry.get("run_id") != run_id_str]
 
-    entry: dict[str, Any] = {
+    entry: LogEntry = {
         "date": runtime_ts.isoformat(),
         "run_id": run_id_str,
         "notes": notes_text,
@@ -140,9 +154,9 @@ def update_logs(args: Args):
     write_json(output_path, kept_entries, pretty=args.pretty, ensure_ascii=True)
 
 
-def derive_found_updates(workspace_path: str, run_timestamp_iso: str) -> list[dict[str, Any]]:
+def derive_found_updates(workspace_path: str, run_timestamp_iso: str) -> list[FoundUpdateEntry]:
     packages = load_workspace_packages(workspace_path)
-    found_updates = []
+    found_updates: list[FoundUpdateEntry] = []
     for entry in packages.values():
         detected_at = entry.get("update_detected")
         if detected_at == run_timestamp_iso:

From 777f14e6910c08c860c253b7f33ff15616145355 Mon Sep 17 00:00:00 2001
From: herr kaste Date: Mon, 23 Mar 2026 11:26:23 +0100 Subject: [PATCH 14/15] Enforce last_modified for found_updates entries Align collect_logs with crawl invariants: if `update_detected` matches this run, `last_modified` must be present too. `found_updates.published_at` is hence a required string. --- scripts/collect_logs.py | 4 ++-- tests/test_collect_logs.py | 3 ++- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/scripts/collect_logs.py b/scripts/collect_logs.py index d902822d9..6fa9bd479 100644 --- a/scripts/collect_logs.py +++ b/scripts/collect_logs.py @@ -26,7 +26,7 @@ class LogEntry(TypedDict): class FoundUpdateEntry(TypedDict): name: str detected_at: str - published_at: str | None + published_at: str def main(): @@ -163,7 +163,7 @@ def derive_found_updates(workspace_path: str, run_timestamp_iso: str) -> list[Fo found_updates.append({ "name": entry["name"], "detected_at": detected_at, - "published_at": entry.get("last_modified"), + "published_at": entry["last_modified"], }) found_updates.sort(key=lambda item: item["name"].casefold()) diff --git a/tests/test_collect_logs.py b/tests/test_collect_logs.py index 7fda9d328..df9f5389e 100644 --- a/tests/test_collect_logs.py +++ b/tests/test_collect_logs.py @@ -84,6 +84,7 @@ def test_collect_logs_adds_found_updates_from_workspace(tmp_path, monkeypatch): "alpha": { "name": "alpha", "update_detected": "2024-10-05T09:30:00Z", + "last_modified": "2024-10-05T08:00:00Z", }, "foo": { "name": "Foo and Bar", @@ -119,7 +120,7 @@ def test_collect_logs_adds_found_updates_from_workspace(tmp_path, monkeypatch): { "name": "alpha", "detected_at": "2024-10-05T09:30:00Z", - "published_at": None, + "published_at": "2024-10-05T08:00:00Z", }, { "name": "Foo and Bar", From f732e948201613114cde03ac479e84a81a3365d7 Mon Sep 17 00:00:00 2001 From: herr kaste Date: Mon, 23 Mar 2026 11:31:45 +0100 Subject: [PATCH 15/15] Bump workflow actions - actions/cache v4 -> v5 - actions/checkout v4 -> v6 - actions/setup-python v5 -> v6 - 
actions/upload-artifact v4 -> v7 --- .github/workflows/crawl.yml | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/.github/workflows/crawl.yml b/.github/workflows/crawl.yml index 393d46947..b80a26c39 100644 --- a/.github/workflows/crawl.yml +++ b/.github/workflows/crawl.yml @@ -26,10 +26,10 @@ jobs: PRESTO_PRESTO_CRAWL: ${{ vars.PRESTO_PRESTO_CRAWL }} steps: - name: Checkout repository - uses: actions/checkout@v4 + uses: actions/checkout@v6 - name: Set up Python - uses: actions/setup-python@v5 + uses: actions/setup-python@v6 with: python-version: '3.13' @@ -61,7 +61,7 @@ jobs: run: echo "NOW_TS=$(date +%s)" >> "$GITHUB_ENV" - name: Restore wrk cache - uses: actions/cache@v4 + uses: actions/cache@v5 with: path: | ./wrk @@ -191,7 +191,7 @@ jobs: - name: Upload wrk backup id: crawl-backup-step - uses: actions/upload-artifact@v4 + uses: actions/upload-artifact@v7 with: name: crawl-backup path: wrk/ @@ -205,10 +205,10 @@ jobs: GITHUB_TOKEN: ${{ github.token }} steps: - name: Checkout repository - uses: actions/checkout@v4 + uses: actions/checkout@v6 - name: Set up Python - uses: actions/setup-python@v5 + uses: actions/setup-python@v6 with: python-version: '3.13' @@ -219,7 +219,7 @@ jobs: run: mkdir -p ./wrk - name: Restore wrk cache - uses: actions/cache@v4 + uses: actions/cache@v5 with: path: ./wrk key: stats-cache-${{ github.run_id }} @@ -237,7 +237,7 @@ jobs: - name: Upload wrk backup id: stats-backup-step - uses: actions/upload-artifact@v4 + uses: actions/upload-artifact@v7 with: name: stats-backup path: wrk/