Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
109 changes: 103 additions & 6 deletions src/mat_vis_baker/hf_bake.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,14 @@
import logging
from pathlib import Path

from huggingface_hub import CommitOperationAdd, HfApi

from mat_vis_baker.hf_bake_per_file import (
_fetch_manifest_with_parent,
_merge_manifest_for_source,
)
from mat_vis_baker.hf_push import push_to_hf
from mat_vis_baker.hf_retry import _create_commit_with_backoff
from mat_vis_baker.index_builder import build_index

log = logging.getLogger("mat-vis-baker.hf_bake")
Expand Down Expand Up @@ -64,11 +71,23 @@ def bake_scalar_source(
hf_token: str | None = None,
dry_run: bool = False,
) -> dict:
"""Bake a scalar-only source (physicallybased): write catalog, no tar.
"""Bake a scalar-only source (physicallybased): write catalog + manifest.

The catalog is overwritten wholesale — a scalar source is baked
as one unit. No manifest write; clients derive the manifest from
the dataset tree.
as one unit. The release-manifest.json is then merged in a second
commit using the same CAS retry pattern as ``bake_one_per_file``
(#251) so concurrent writers against the same release tag converge
on a manifest that lists every baked source. Pre-#239 the client
reconstructed the manifest from a tree listing, but the per-file
substrate broke tree pagination at scale, so the static manifest
is now the source of truth.

The scalar manifest entry shape mirrors the textured one
(``source_tiers.SUPPORTED_TIERS`` declares ``physicallybased:
{"scalar"}``)::

{"catalog": "<source>.json",
"tiers": {"scalar": {"complete": True}}}
"""
work_dir.mkdir(parents=True, exist_ok=True)
fetch = _get_fetcher(source)
Expand All @@ -80,17 +99,95 @@ def bake_scalar_source(
catalog_path.write_text(json.dumps(index, indent=2, ensure_ascii=False) + "\n")

if dry_run:
log.info("dry-run: would push 1 file (%s.json) to %s@%s", source, repo_id, release_tag)
log.info(
"dry-run: would push catalog + manifest (%s.json + release-manifest.json) to %s@%s",
source,
repo_id,
release_tag,
)
return {"dry_run": True, "materials": len(index)}

sha = push_to_hf(
# 1) Catalog commit — single file, same wholesale-overwrite shape
# as before. push_to_hf takes care of branch creation if missing.
catalog_sha = push_to_hf(
repo_id=repo_id,
files=[(catalog_path, f"{source}.json")],
revision=release_tag,
commit_message=f"feat(data): {release_tag} — bake {source} (scalar)",
token=hf_token,
)
return {"commit": sha, "materials": len(index)}

# 2) Manifest commit — CAS retry loop merges the (source, "scalar")
# entry into whatever the current manifest already holds. Mirrors
# bake_one_per_file's pattern: fetch + parent SHA, merge, commit
# with parent_commit=<sha>, retry on 412/409. Reuses the same
# helpers so a single contract evolves across both bakers.
manifest_path = work_dir / "release-manifest.json"
api = HfApi(token=hf_token)
retry_counter: dict[str, int] = {}
cas_retries = 0
max_retries = 6
manifest_sha = ""
for attempt in range(max_retries):
existing_manifest, parent_sha = _fetch_manifest_with_parent(api, repo_id, release_tag)
merged = _merge_manifest_for_source(existing_manifest, source, "scalar", release_tag)
manifest_path.write_text(json.dumps(merged, indent=2, ensure_ascii=False) + "\n")
try:
manifest_commit = _create_commit_with_backoff(
api,
source=source,
repo_id=repo_id,
repo_type="dataset",
operations=[
CommitOperationAdd(
path_in_repo="release-manifest.json",
path_or_fileobj=str(manifest_path),
),
],
commit_message=f"feat(data): {release_tag} — {source} catalog + manifest",
revision=release_tag,
parent_commit=parent_sha,
_retry_counter=retry_counter,
)
manifest_sha = (
getattr(manifest_commit, "oid", "")
or getattr(manifest_commit, "commit_oid", "")
or ""
)
break
except Exception as e: # noqa: BLE001
msg = str(e).lower()
is_cas_conflict = (
"412" in msg
or "precondition" in msg
or "parent_commit" in msg
or "409" in msg
or "another commit operation" in msg
)
if is_cas_conflict:
if attempt + 1 == max_retries:
log.error(
"scalar manifest CAS exhausted after %d retries — concurrent writers?",
max_retries,
)
raise
log.warning(
"scalar manifest CAS retry %d/%d — concurrent writer detected",
attempt + 1,
max_retries,
)
cas_retries += 1
continue
raise

return {
"commit": manifest_sha or catalog_sha,
"catalog_commit": catalog_sha,
"manifest_commit": manifest_sha,
"materials": len(index),
"cas_retries": cas_retries,
"lock_409_retries": retry_counter.get("lock_409", 0),
}


def bake_one(
Expand Down
234 changes: 234 additions & 0 deletions tests/test_bake_scalar_source.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,234 @@
"""Scalar-source bake tests for ``hf_bake.bake_scalar_source`` (#251).

Pre-#251 ``bake_scalar_source`` only pushed ``<source>.json`` and never
touched ``release-manifest.json``. That worked when the client
reconstructed the manifest from a tree listing, but #239 switched the
client to read ``release-manifest.json`` directly, so a scalar bake
silently left its source invisible to clients.

These tests pin the post-#251 contract:

1. ``bake_scalar_source`` produces TWO commits — the catalog (one file)
and the manifest (one file).
2. The manifest commit merges a ``{"catalog": "<source>.json", "tiers":
{"scalar": {"complete": True}}}`` entry into whatever the existing
manifest already holds.
3. The manifest commit opts into HF's ``parent_commit`` lock for CAS
(matching the per-file baker's pattern).

Pure-Python: every HF API call is mocked. No HF traffic.
"""

from __future__ import annotations

import json
from pathlib import Path
from unittest.mock import MagicMock, patch

from mat_vis_baker.hf_bake import bake_scalar_source


def _fake_record(mid: str):
"""Minimal scalar-shape MaterialRecord — no textures."""
from mat_vis_baker.common import (
AttributionBlock,
MaterialRecord,
MatVisBlock,
)

return MaterialRecord(
id=mid,
source="physicallybased",
mat_vis=MatVisBlock(
name=mid,
category="other",
upstream_id=mid,
attribution=AttributionBlock(license_spdx="CC0-1.0"),
),
texture_paths={},
maps=[],
status="ok",
)


def _make_commit_info(oid: str = "deadbeefcafe") -> MagicMock:
info = MagicMock()
info.oid = oid
info.commit_oid = oid
return info


class TestBakeScalarSource:
"""#251: scalar bakes must update release-manifest.json."""

def test_emits_catalog_and_manifest_commits(self, tmp_path: Path) -> None:
"""Two commits land: one with ``<source>.json``, one with
``release-manifest.json``. Pre-#251 only the catalog commit
existed and clients never saw the source post-#239."""
records = [_fake_record(f"mat_{i}") for i in range(3)]

with (
patch("mat_vis_baker.hf_bake._get_fetcher") as fetcher,
patch("huggingface_hub.HfApi") as push_api_cls,
patch("mat_vis_baker.hf_bake.HfApi") as bake_api_cls,
):
fetcher.return_value = lambda: records

push_api = push_api_cls.return_value
push_api.list_repo_commits.return_value = [MagicMock()]
push_api.create_commit.return_value = _make_commit_info("catalogsha")

bake_api = bake_api_cls.return_value
# No prior manifest on the branch — _fetch_manifest_with_parent
# falls back to {} and parent_sha=None.
bake_api.repo_info.side_effect = Exception("revision missing — fresh branch")
bake_api.hf_hub_download.side_effect = Exception("404 — no manifest yet")
bake_api.create_commit.return_value = _make_commit_info("manifestsha")

result = bake_scalar_source(
source="physicallybased",
release_tag="v2026.04.2",
work_dir=tmp_path,
repo_id="gerchowl/mat-vis-tst",
hf_token="fake",
)

# Catalog commit went via push_to_hf → push_api.create_commit.
assert push_api.create_commit.call_count == 1, (
f"expected 1 catalog commit via push_to_hf; got {push_api.create_commit.call_count}"
)
catalog_call = push_api.create_commit.call_args
catalog_ops = catalog_call.kwargs["operations"]
catalog_paths = {op.path_in_repo for op in catalog_ops}
assert catalog_paths == {"physicallybased.json"}, catalog_paths

# Manifest commit went via the bake-side HfApi → bake_api.create_commit.
assert bake_api.create_commit.call_count == 1, (
f"expected 1 manifest commit via bake-side HfApi; "
f"got {bake_api.create_commit.call_count}"
)
manifest_call = bake_api.create_commit.call_args
manifest_ops = manifest_call.kwargs["operations"]
manifest_paths = {op.path_in_repo for op in manifest_ops}
assert manifest_paths == {"release-manifest.json"}, manifest_paths

# Result surfaces both SHAs and the materials count.
assert result["materials"] == len(records)
assert result["catalog_commit"] == "catalogsha"
assert result["manifest_commit"] == "manifestsha"

def test_manifest_payload_carries_scalar_tier_entry(self, tmp_path: Path) -> None:
"""The on-disk manifest written into the commit operation must
contain ``{"sources": {"physicallybased": {"catalog":
"physicallybased.json", "tiers": {"scalar": {"complete":
True}}}}}``. Asserting on the file content (not just the
commit op path) catches a regression that commits an empty or
wrong-shape manifest."""
records = [_fake_record("mat_a"), _fake_record("mat_b")]

with (
patch("mat_vis_baker.hf_bake._get_fetcher") as fetcher,
patch("huggingface_hub.HfApi") as push_api_cls,
patch("mat_vis_baker.hf_bake.HfApi") as bake_api_cls,
):
fetcher.return_value = lambda: records

push_api = push_api_cls.return_value
push_api.list_repo_commits.return_value = [MagicMock()]
push_api.create_commit.return_value = _make_commit_info("catalogsha")

bake_api = bake_api_cls.return_value
bake_api.repo_info.side_effect = Exception("fresh")
bake_api.hf_hub_download.side_effect = Exception("404")
bake_api.create_commit.return_value = _make_commit_info("manifestsha")

bake_scalar_source(
source="physicallybased",
release_tag="v2026.04.2",
work_dir=tmp_path,
repo_id="gerchowl/mat-vis-tst",
hf_token="fake",
)

manifest_path = tmp_path / "release-manifest.json"
assert manifest_path.exists(), "manifest file must be written before commit"
manifest = json.loads(manifest_path.read_text())

assert manifest["release_tag"] == "v2026.04.2"
assert manifest["schema_version"] == 3
assert "physicallybased" in manifest["sources"]
entry = manifest["sources"]["physicallybased"]
assert entry["catalog"] == "physicallybased.json"
assert entry["tiers"] == {"scalar": {"complete": True}}

def test_manifest_merges_into_existing_sources(self, tmp_path: Path) -> None:
"""When the release tag already has a manifest with other
sources baked, ``bake_scalar_source`` MUST preserve them and
only add/overwrite its own entry — anything else regresses
the multi-source race fix from #207/#208."""
records = [_fake_record("mat_a")]

existing_manifest = {
"schema_version": 3,
"release_tag": "v2026.04.2",
"sources": {
"polyhaven": {
"catalog": "polyhaven.json",
"tiers": {"1k": {"complete": True}, "2k": {"complete": True}},
},
"ambientcg": {
"catalog": "ambientcg.json",
"tiers": {"2k": {"complete": True}},
},
},
}
existing_path = tmp_path / "existing-release-manifest.json"
existing_path.write_text(json.dumps(existing_manifest))

with (
patch("mat_vis_baker.hf_bake._get_fetcher") as fetcher,
patch("huggingface_hub.HfApi") as push_api_cls,
patch("mat_vis_baker.hf_bake.HfApi") as bake_api_cls,
):
fetcher.return_value = lambda: records

push_api = push_api_cls.return_value
push_api.list_repo_commits.return_value = [MagicMock()]
push_api.create_commit.return_value = _make_commit_info("catalogsha")

bake_api = bake_api_cls.return_value
# Existing manifest + parent SHA — _fetch_manifest_with_parent
# returns ({polyhaven, ambientcg}, "parentsha").
bake_api.repo_info.return_value = MagicMock(sha="parentsha")
bake_api.hf_hub_download.return_value = str(existing_path)
bake_api.create_commit.return_value = _make_commit_info("manifestsha")

bake_scalar_source(
source="physicallybased",
release_tag="v2026.04.2",
work_dir=tmp_path,
repo_id="gerchowl/mat-vis-tst",
hf_token="fake",
)

manifest = json.loads((tmp_path / "release-manifest.json").read_text())
sources = manifest["sources"]
# Pre-existing sources preserved.
assert "polyhaven" in sources
assert sources["polyhaven"]["tiers"] == {
"1k": {"complete": True},
"2k": {"complete": True},
}
assert "ambientcg" in sources
assert sources["ambientcg"]["tiers"] == {"2k": {"complete": True}}
# Scalar entry layered in.
assert sources["physicallybased"] == {
"catalog": "physicallybased.json",
"tiers": {"scalar": {"complete": True}},
}

# Manifest commit must opt into parent_commit CAS.
manifest_call = bake_api.create_commit.call_args
assert manifest_call.kwargs.get("parent_commit") == "parentsha", (
"manifest commit must pass parent_commit for CAS retry"
)
Loading