From bb319d562f8617feb82dd5080b9766354d4b780c Mon Sep 17 00:00:00 2001 From: gerchowl Date: Thu, 30 Apr 2026 17:50:20 +0200 Subject: [PATCH] bugfix(baker): bake_scalar_source updates release-manifest.json (#251) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Pre-#251 ``bake_scalar_source`` only pushed ``.json`` and never touched ``release-manifest.json``. Pre-#239 the client reconstructed the manifest from a tree listing, so the missing write was invisible. After #239 the client reads the static manifest directly — meaning every scalar bake (today: physicallybased) silently left its source absent from the manifest, hiding it from clients. We caught this on ``gerchowl/mat-vis@v2026.04.2`` and patched prod manually (commit 63e84e2). This change is structural only — the prod manifest is already correct. Apply the same CAS retry pattern ``bake_one_per_file`` uses for the catalog + manifest commit: - Push the catalog as before (one commit, via ``push_to_hf``). - Then enter a CAS loop: fetch the current manifest + parent SHA, merge in ``{"scalar": {"complete": True}}`` under the source entry, commit with ``parent_commit=``. Retry on 412/409, exhaust at 6. Reuses ``_fetch_manifest_with_parent`` and ``_merge_manifest_for_source`` from ``hf_bake_per_file`` and ``_create_commit_with_backoff`` from ``hf_retry`` — single contract evolves across both bakers. Drops the stale "clients derive the manifest from the dataset tree" docstring line. Tests pin three properties: 1. Two commits land — catalog (one file) and manifest (one file). 2. The manifest payload carries ``{"physicallybased": {"catalog": "physicallybased.json", "tiers": {"scalar": {"complete": true}}}}``. 3. Pre-existing sources in the manifest are preserved on merge, and the manifest commit opts into ``parent_commit`` for CAS. --- src/mat_vis_baker/hf_bake.py | 109 +++++++++++++- tests/test_bake_scalar_source.py | 234 +++++++++++++++++++++++++++++++ 2 files changed, 337 insertions(+), 6 deletions(-) create mode 100644 tests/test_bake_scalar_source.py diff --git a/src/mat_vis_baker/hf_bake.py b/src/mat_vis_baker/hf_bake.py index 744b56e4..8a7d24e5 100644 --- a/src/mat_vis_baker/hf_bake.py +++ b/src/mat_vis_baker/hf_bake.py @@ -29,7 +29,14 @@ import logging from pathlib import Path +from huggingface_hub import CommitOperationAdd, HfApi + +from mat_vis_baker.hf_bake_per_file import ( + _fetch_manifest_with_parent, + _merge_manifest_for_source, +) from mat_vis_baker.hf_push import push_to_hf +from mat_vis_baker.hf_retry import _create_commit_with_backoff from mat_vis_baker.index_builder import build_index log = logging.getLogger("mat-vis-baker.hf_bake") @@ -64,11 +71,23 @@ def bake_scalar_source( hf_token: str | None = None, dry_run: bool = False, ) -> dict: - """Bake a scalar-only source (physicallybased): write catalog, no tar. + """Bake a scalar-only source (physicallybased): write catalog + manifest. The catalog is overwritten wholesale — a scalar source is baked - as one unit. No manifest write; clients derive the manifest from - the dataset tree. + as one unit. The release-manifest.json is then merged in a second + commit using the same CAS retry pattern as ``bake_one_per_file`` + (#251) so concurrent writers against the same release tag converge + on a manifest that lists every baked source. Pre-#239 the client + reconstructed the manifest from a tree listing, but the per-file + substrate broke tree pagination at scale, so the static manifest + is now the source of truth. + + The scalar manifest entry shape mirrors the textured one + (``source_tiers.SUPPORTED_TIERS`` declares ``physicallybased: + {"scalar"}``):: + + {"catalog": ".json", + "tiers": {"scalar": {"complete": True}}} """ work_dir.mkdir(parents=True, exist_ok=True) fetch = _get_fetcher(source) @@ -80,17 +99,95 @@ def bake_scalar_source( catalog_path.write_text(json.dumps(index, indent=2, ensure_ascii=False) + "\n") if dry_run: - log.info("dry-run: would push 1 file (%s.json) to %s@%s", source, repo_id, release_tag) + log.info( + "dry-run: would push catalog + manifest (%s.json + release-manifest.json) to %s@%s", + source, + repo_id, + release_tag, + ) return {"dry_run": True, "materials": len(index)} - sha = push_to_hf( + # 1) Catalog commit — single file, same wholesale-overwrite shape + # as before. push_to_hf takes care of branch creation if missing. + catalog_sha = push_to_hf( repo_id=repo_id, files=[(catalog_path, f"{source}.json")], revision=release_tag, commit_message=f"feat(data): {release_tag} — bake {source} (scalar)", token=hf_token, ) - return {"commit": sha, "materials": len(index)} + + # 2) Manifest commit — CAS retry loop merges the (source, "scalar") + # entry into whatever the current manifest already holds. Mirrors + # bake_one_per_file's pattern: fetch + parent SHA, merge, commit + # with parent_commit=, retry on 412/409. Reuses the same + # helpers so a single contract evolves across both bakers. + manifest_path = work_dir / "release-manifest.json" + api = HfApi(token=hf_token) + retry_counter: dict[str, int] = {} + cas_retries = 0 + max_retries = 6 + manifest_sha = "" + for attempt in range(max_retries): + existing_manifest, parent_sha = _fetch_manifest_with_parent(api, repo_id, release_tag) + merged = _merge_manifest_for_source(existing_manifest, source, "scalar", release_tag) + manifest_path.write_text(json.dumps(merged, indent=2, ensure_ascii=False) + "\n") + try: + manifest_commit = _create_commit_with_backoff( + api, + source=source, + repo_id=repo_id, + repo_type="dataset", + operations=[ + CommitOperationAdd( + path_in_repo="release-manifest.json", + path_or_fileobj=str(manifest_path), + ), + ], + commit_message=f"feat(data): {release_tag} — {source} catalog + manifest", + revision=release_tag, + parent_commit=parent_sha, + _retry_counter=retry_counter, + ) + manifest_sha = ( + getattr(manifest_commit, "oid", "") + or getattr(manifest_commit, "commit_oid", "") + or "" + ) + break + except Exception as e: # noqa: BLE001 + msg = str(e).lower() + is_cas_conflict = ( + "412" in msg + or "precondition" in msg + or "parent_commit" in msg + or "409" in msg + or "another commit operation" in msg + ) + if is_cas_conflict: + if attempt + 1 == max_retries: + log.error( + "scalar manifest CAS exhausted after %d retries — concurrent writers?", + max_retries, + ) + raise + log.warning( + "scalar manifest CAS retry %d/%d — concurrent writer detected", + attempt + 1, + max_retries, + ) + cas_retries += 1 + continue + raise + + return { + "commit": manifest_sha or catalog_sha, + "catalog_commit": catalog_sha, + "manifest_commit": manifest_sha, + "materials": len(index), + "cas_retries": cas_retries, + "lock_409_retries": retry_counter.get("lock_409", 0), + } def bake_one( diff --git a/tests/test_bake_scalar_source.py b/tests/test_bake_scalar_source.py new file mode 100644 index 00000000..26daaaa3 --- /dev/null +++ b/tests/test_bake_scalar_source.py @@ -0,0 +1,234 @@ +"""Scalar-source bake tests for ``hf_bake.bake_scalar_source`` (#251). + +Pre-#251 ``bake_scalar_source`` only pushed ``.json`` and never +touched ``release-manifest.json``. That worked when the client +reconstructed the manifest from a tree listing, but #239 switched the +client to read ``release-manifest.json`` directly, so a scalar bake +silently left its source invisible to clients. + +These tests pin the post-#251 contract: + +1. ``bake_scalar_source`` produces TWO commits — the catalog (one file) + and the manifest (one file). +2. The manifest commit merges a ``{"catalog": ".json", "tiers": + {"scalar": {"complete": True}}}`` entry into whatever the existing + manifest already holds. +3. The manifest commit opts into HF's ``parent_commit`` lock for CAS + (matching the per-file baker's pattern). + +Pure-Python: every HF API call is mocked. No HF traffic. +""" + +from __future__ import annotations + +import json +from pathlib import Path +from unittest.mock import MagicMock, patch + +from mat_vis_baker.hf_bake import bake_scalar_source + + +def _fake_record(mid: str): + """Minimal scalar-shape MaterialRecord — no textures.""" + from mat_vis_baker.common import ( + AttributionBlock, + MaterialRecord, + MatVisBlock, + ) + + return MaterialRecord( + id=mid, + source="physicallybased", + mat_vis=MatVisBlock( + name=mid, + category="other", + upstream_id=mid, + attribution=AttributionBlock(license_spdx="CC0-1.0"), + ), + texture_paths={}, + maps=[], + status="ok", + ) + + +def _make_commit_info(oid: str = "deadbeefcafe") -> MagicMock: + info = MagicMock() + info.oid = oid + info.commit_oid = oid + return info + + +class TestBakeScalarSource: + """#251: scalar bakes must update release-manifest.json.""" + + def test_emits_catalog_and_manifest_commits(self, tmp_path: Path) -> None: + """Two commits land: one with ``.json``, one with + ``release-manifest.json``. Pre-#251 only the catalog commit + existed and clients never saw the source post-#239.""" + records = [_fake_record(f"mat_{i}") for i in range(3)] + + with ( + patch("mat_vis_baker.hf_bake._get_fetcher") as fetcher, + patch("huggingface_hub.HfApi") as push_api_cls, + patch("mat_vis_baker.hf_bake.HfApi") as bake_api_cls, + ): + fetcher.return_value = lambda: records + + push_api = push_api_cls.return_value + push_api.list_repo_commits.return_value = [MagicMock()] + push_api.create_commit.return_value = _make_commit_info("catalogsha") + + bake_api = bake_api_cls.return_value + # No prior manifest on the branch — _fetch_manifest_with_parent + # falls back to {} and parent_sha=None. + bake_api.repo_info.side_effect = Exception("revision missing — fresh branch") + bake_api.hf_hub_download.side_effect = Exception("404 — no manifest yet") + bake_api.create_commit.return_value = _make_commit_info("manifestsha") + + result = bake_scalar_source( + source="physicallybased", + release_tag="v2026.04.2", + work_dir=tmp_path, + repo_id="gerchowl/mat-vis-tst", + hf_token="fake", + ) + + # Catalog commit went via push_to_hf → push_api.create_commit. + assert push_api.create_commit.call_count == 1, ( + f"expected 1 catalog commit via push_to_hf; got {push_api.create_commit.call_count}" + ) + catalog_call = push_api.create_commit.call_args + catalog_ops = catalog_call.kwargs["operations"] + catalog_paths = {op.path_in_repo for op in catalog_ops} + assert catalog_paths == {"physicallybased.json"}, catalog_paths + + # Manifest commit went via the bake-side HfApi → bake_api.create_commit. + assert bake_api.create_commit.call_count == 1, ( + f"expected 1 manifest commit via bake-side HfApi; " + f"got {bake_api.create_commit.call_count}" + ) + manifest_call = bake_api.create_commit.call_args + manifest_ops = manifest_call.kwargs["operations"] + manifest_paths = {op.path_in_repo for op in manifest_ops} + assert manifest_paths == {"release-manifest.json"}, manifest_paths + + # Result surfaces both SHAs and the materials count. + assert result["materials"] == len(records) + assert result["catalog_commit"] == "catalogsha" + assert result["manifest_commit"] == "manifestsha" + + def test_manifest_payload_carries_scalar_tier_entry(self, tmp_path: Path) -> None: + """The on-disk manifest written into the commit operation must + contain ``{"sources": {"physicallybased": {"catalog": + "physicallybased.json", "tiers": {"scalar": {"complete": + True}}}}}``. Asserting on the file content (not just the + commit op path) catches a regression that commits an empty or + wrong-shape manifest.""" + records = [_fake_record("mat_a"), _fake_record("mat_b")] + + with ( + patch("mat_vis_baker.hf_bake._get_fetcher") as fetcher, + patch("huggingface_hub.HfApi") as push_api_cls, + patch("mat_vis_baker.hf_bake.HfApi") as bake_api_cls, + ): + fetcher.return_value = lambda: records + + push_api = push_api_cls.return_value + push_api.list_repo_commits.return_value = [MagicMock()] + push_api.create_commit.return_value = _make_commit_info("catalogsha") + + bake_api = bake_api_cls.return_value + bake_api.repo_info.side_effect = Exception("fresh") + bake_api.hf_hub_download.side_effect = Exception("404") + bake_api.create_commit.return_value = _make_commit_info("manifestsha") + + bake_scalar_source( + source="physicallybased", + release_tag="v2026.04.2", + work_dir=tmp_path, + repo_id="gerchowl/mat-vis-tst", + hf_token="fake", + ) + + manifest_path = tmp_path / "release-manifest.json" + assert manifest_path.exists(), "manifest file must be written before commit" + manifest = json.loads(manifest_path.read_text()) + + assert manifest["release_tag"] == "v2026.04.2" + assert manifest["schema_version"] == 3 + assert "physicallybased" in manifest["sources"] + entry = manifest["sources"]["physicallybased"] + assert entry["catalog"] == "physicallybased.json" + assert entry["tiers"] == {"scalar": {"complete": True}} + + def test_manifest_merges_into_existing_sources(self, tmp_path: Path) -> None: + """When the release tag already has a manifest with other + sources baked, ``bake_scalar_source`` MUST preserve them and + only add/overwrite its own entry — anything else regresses + the multi-source race fix from #207/#208.""" + records = [_fake_record("mat_a")] + + existing_manifest = { + "schema_version": 3, + "release_tag": "v2026.04.2", + "sources": { + "polyhaven": { + "catalog": "polyhaven.json", + "tiers": {"1k": {"complete": True}, "2k": {"complete": True}}, + }, + "ambientcg": { + "catalog": "ambientcg.json", + "tiers": {"2k": {"complete": True}}, + }, + }, + } + existing_path = tmp_path / "existing-release-manifest.json" + existing_path.write_text(json.dumps(existing_manifest)) + + with ( + patch("mat_vis_baker.hf_bake._get_fetcher") as fetcher, + patch("huggingface_hub.HfApi") as push_api_cls, + patch("mat_vis_baker.hf_bake.HfApi") as bake_api_cls, + ): + fetcher.return_value = lambda: records + + push_api = push_api_cls.return_value + push_api.list_repo_commits.return_value = [MagicMock()] + push_api.create_commit.return_value = _make_commit_info("catalogsha") + + bake_api = bake_api_cls.return_value + # Existing manifest + parent SHA — _fetch_manifest_with_parent + # returns ({polyhaven, ambientcg}, "parentsha"). + bake_api.repo_info.return_value = MagicMock(sha="parentsha") + bake_api.hf_hub_download.return_value = str(existing_path) + bake_api.create_commit.return_value = _make_commit_info("manifestsha") + + bake_scalar_source( + source="physicallybased", + release_tag="v2026.04.2", + work_dir=tmp_path, + repo_id="gerchowl/mat-vis-tst", + hf_token="fake", + ) + + manifest = json.loads((tmp_path / "release-manifest.json").read_text()) + sources = manifest["sources"] + # Pre-existing sources preserved. + assert "polyhaven" in sources + assert sources["polyhaven"]["tiers"] == { + "1k": {"complete": True}, + "2k": {"complete": True}, + } + assert "ambientcg" in sources + assert sources["ambientcg"]["tiers"] == {"2k": {"complete": True}} + # Scalar entry layered in. + assert sources["physicallybased"] == { + "catalog": "physicallybased.json", + "tiers": {"scalar": {"complete": True}}, + } + + # Manifest commit must opt into parent_commit CAS. + manifest_call = bake_api.create_commit.call_args + assert manifest_call.kwargs.get("parent_commit") == "parentsha", ( + "manifest commit must pass parent_commit for CAS retry" + )