From e9608641714bf48f73ab0ccf1f99e54422d02aa7 Mon Sep 17 00:00:00 2001 From: gerchowl Date: Thu, 30 Apr 2026 18:43:52 +0200 Subject: [PATCH] feat(client): ETag-aware manifest cache + immutable-tag policy (#258) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replace the never-invalidated manifest disk cache with one conditional GET per client lifecycle across all four reference clients (Python, JS, Rust, shell). Cache holds a sibling .manifest.etag; on the first .manifest access we send If-None-Match and serve the cached body on 304, refetch on 200. Defensive cold-start when the origin omits an ETag — body cached, no etag file written, next lifecycle refetches unconditionally. Also formalises the immutable-tag policy in README + CHANGELOG: once a CalVer tag is published (e.g. v2026.04.2) the data at that revision will not change. New data → new CalVer. This contract is what lets the ETag cache trust 304 responses on pinned tags. All four client packages aligned to 0.6.3. --- CHANGELOG.md | 37 ++++ README.md | 8 + clients/js/mat-vis-client.mjs | 100 +++++++++-- clients/js/package.json | 2 +- clients/js/test_client.mjs | 23 ++- clients/mat-vis.sh | 71 +++++++- clients/python/mat_vis_client_standalone.py | 2 +- clients/python/pyproject.toml | 2 +- clients/python/src/mat_vis_client/client.py | 141 +++++++++++++++- clients/python/test_client.py | 65 +++++++- clients/python/tests/test_cache.py | 17 +- clients/python/tests/test_client.py | 176 ++++++++++++++++++-- clients/rust/Cargo.toml | 2 +- clients/rust/src/main.rs | 77 ++++++++- 14 files changed, 653 insertions(+), 70 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 9e558441..613cae95 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -19,6 +19,43 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Security +## mat-vis-client 0.6.3 + +ETag-aware manifest cache across all four reference clients, plus a +formal immutable-tag policy declaration. The previous disk cache was +never invalidated — once a manifest landed under `$MAT_VIS_CACHE`, no +client could pick up a re-published manifest at the same tag short of +clearing the cache by hand. With +[#258](https://github.com/MorePET/mat-vis/issues/258) the cache holds a +sibling `.manifest.etag` file and every client lifecycle issues one +`If-None-Match` conditional GET; HF responds 304 when the manifest is +unchanged (the steady state on an immutable release tag — see Notes +below) and the cached body is served without re-downloading. + +### Changed + +- All four reference clients (Python, JS, Rust, shell) replace the + never-invalidated manifest disk cache with an ETag-aware conditional + GET ([#258](https://github.com/MorePET/mat-vis/issues/258)). One HTTP + round-trip per client lifecycle in the steady state (304 on immutable + tags); the body is refetched only when the server's ETag actually + moves. Defensive cold-start when the origin omits an ETag — body is + cached, but no `.manifest.etag` is written, so the next lifecycle + refetches unconditionally rather than risking a stale-etag deadlock. + JS keeps an in-memory cache only in the browser (no IndexedDB + dependency) and a filesystem cache under Node, matching the existing + zero-deps posture. +- All four client package versions aligned to **0.6.3**. + +### Notes + +- **Release tags are immutable.** Once a CalVer tag is published (e.g. + `v2026.04.2`), the data at that revision will not change. New + upstream snapshots, fixes, or rebakes ship as a new CalVer tag, never + as an in-place rewrite of an existing one. This contract is what lets + the new ETag cache trust 304 responses on pinned tags — the manifest + bytes simply cannot drift under a pinned tag in the first place. + ## mat-vis-client 0.6.2 Out-of-the-box-usable defaults across all four reference clients. With no diff --git a/README.md b/README.md index 367cceb6..c738c608 100644 --- a/README.md +++ b/README.md @@ -268,6 +268,14 @@ dagger call -m .dagger bake-and-release \ - **Data releases**: calver (`v2026.04.2`) — tied to upstream source updates - **Code/client releases**: semver (`v0.6.x`) — API changes +**Release tags are immutable.** Once a CalVer tag is published (e.g. +`v2026.04.2`), the data at that revision will not change — bytes pinned +to a tag stay pinned. New upstream snapshots, fixes, or rebakes ship as +a new CalVer tag, never as an in-place rewrite of an existing one. This +contract is what lets clients use cheap `If-None-Match` conditional GETs +on the manifest (#258) and trust pinned-tag deployments across long +intervals without re-validating every byte. + ## Key design decisions Architecture is captured in [`docs/decisions/`](docs/decisions/). The diff --git a/clients/js/mat-vis-client.mjs b/clients/js/mat-vis-client.mjs index 9d26e45c..40cbf837 100644 --- a/clients/js/mat-vis-client.mjs +++ b/clients/js/mat-vis-client.mjs @@ -28,7 +28,7 @@ const HF_BASE = // SSoT: clients/js/package.json. Kept in sync by // scripts/sync-js-version.py (pre-commit) — a drift test in tests/ // fails CI if these disagree. Do not hand-edit. -export const VERSION = '0.6.2'; +export const VERSION = '0.6.3'; const UA = `mat-vis-client/${VERSION} (JavaScript)`; // Default tag when the caller doesn't pin one (#242). The dataset's @@ -51,8 +51,53 @@ function startsWithMagic(buf, magic) { return true; } +// Issue #258 — manifest cache validates against the origin per client +// lifecycle via a conditional GET. Storage is filesystem in Node (so a +// fresh process can short-circuit body refetches on 304) and in-memory +// only in the browser (no persistent cache available without IndexedDB, +// which would pull a dependency the zero-deps client refuses to take). +const IS_NODE = typeof process !== 'undefined' && process.versions?.node; + +async function readCachedManifest(cacheDir) { + if (!IS_NODE || !cacheDir) return [null, null]; + const { readFile } = await import('fs/promises'); + const { join } = await import('path'); + try { + const body = await readFile(join(cacheDir, '.manifest.json'), 'utf-8'); + let etag = null; + try { + etag = (await readFile(join(cacheDir, '.manifest.etag'), 'utf-8')) || null; + } catch { + // no etag file — cold-start equivalent, fall through. + } + return [body, etag]; + } catch { + return [null, null]; + } +} + +async function writeCachedManifest(cacheDir, body, etag) { + if (!IS_NODE || !cacheDir) return; + const { mkdir, writeFile, unlink } = await import('fs/promises'); + const { join } = await import('path'); + await mkdir(cacheDir, { recursive: true }); + await writeFile(join(cacheDir, '.manifest.json'), body); + const etagPath = join(cacheDir, '.manifest.etag'); + if (etag) { + await writeFile(etagPath, etag); + } else { + // Stale etag from a prior lifecycle would falsely 304 us. + try { + await unlink(etagPath); + } catch { + // already absent — fine. + } + } +} + export class MatVisClient { #tag; + #cacheDir; #manifest = null; #catalogs = new Map(); #tierComplete = new Map(); @@ -60,9 +105,21 @@ export class MatVisClient { /** * @param {Object} opts * @param {string} [opts.tag] - Release tag (default: DEFAULT_TAG, see #242) + * @param {string} [opts.cacheDir] - Per-tag manifest cache dir (Node only). + * Falls back to ``$MAT_VIS_CACHE/`` then ``~/.cache/mat-vis/``. + * Browser: ignored (no persistent cache without IndexedDB). */ - constructor({ tag } = {}) { + constructor({ tag, cacheDir } = {}) { this.#tag = tag || DEFAULT_TAG; + if (IS_NODE) { + const root = + cacheDir || + process.env.MAT_VIS_CACHE || + (process.env.HOME ? `${process.env.HOME}/.cache/mat-vis` : null); + this.#cacheDir = root ? `${root}/${this.#tag}` : null; + } else { + this.#cacheDir = null; + } } #hfUrl(path) { @@ -70,17 +127,36 @@ export class MatVisClient { } async manifest() { - if (!this.#manifest) { - const url = this.#hfUrl('release-manifest.json'); - const resp = await fetch(url, { headers: { 'User-Agent': UA } }); + // In-memory short-circuit (#258): repeated calls in the same + // process never touch HTTP, regardless of substrate. + if (this.#manifest) return this.#manifest; + + const url = this.#hfUrl('release-manifest.json'); + const [cachedBody, cachedEtag] = await readCachedManifest(this.#cacheDir); + const headers = { 'User-Agent': UA }; + if (cachedEtag) headers['If-None-Match'] = cachedEtag; + + const resp = await fetch(url, { headers }); + + // 304 Not Modified — cached body is authoritative. On an immutable + // release tag (see README's immutable-tag note) this is the steady + // state, so we save the body bytes and avoid the JSON parse hop on + // wire-format payload too. + if (resp.status === 304 && cachedBody) { + this.#manifest = JSON.parse(cachedBody); + } else { if (!resp.ok) throw new Error(`Failed to fetch manifest: ${resp.status}`); - this.#manifest = await resp.json(); - const sv = this.#manifest.schema_version; - if (sv !== 3) { - throw new Error( - `Unsupported manifest schema_version=${sv}; this client requires v3 (per-file substrate, ADR-0012).`, - ); - } + const body = await resp.text(); + this.#manifest = JSON.parse(body); + const newEtag = resp.headers.get('etag'); + await writeCachedManifest(this.#cacheDir, body, newEtag); + } + + const sv = this.#manifest.schema_version; + if (sv !== 3) { + throw new Error( + `Unsupported manifest schema_version=${sv}; this client requires v3 (per-file substrate, ADR-0012).`, + ); } return this.#manifest; } diff --git a/clients/js/package.json b/clients/js/package.json index a8679005..58bf5147 100644 --- a/clients/js/package.json +++ b/clients/js/package.json @@ -1,6 +1,6 @@ { "name": "mat-vis-client", - "version": "0.6.2", + "version": "0.6.3", "description": "Pure JavaScript client for mat-vis PBR textures — per-file HF dataset, zero deps, browser + Node 18+", "type": "module", "main": "mat-vis-client.mjs", diff --git a/clients/js/test_client.mjs b/clients/js/test_client.mjs index b1f1d5fc..c9471ff0 100644 --- a/clients/js/test_client.mjs +++ b/clients/js/test_client.mjs @@ -20,26 +20,39 @@ import { MatVisClient, DEFAULT_TAG } from './mat-vis-client.mjs'; const PNG = new Uint8Array([0x89, 0x50, 0x4e, 0x47, 0x0d, 0x0a, 0x1a, 0x0a, 0, 0, 0, 0]); +// Issue #258 — the manifest path now reads ``text()`` (so the raw +// body bytes can be cached side-by-side with the ETag) and inspects +// ``headers.get('etag')``. Stubs grew the matching surface; legacy +// tests still call ``json()`` against catalogs / sentinels. +function _emptyHeaders() { + return { get: () => null }; +} + function stubFetch(routes) { - return async (url, _opts) => { + return async (url, opts) => { for (const [pattern, handler] of routes) { - if (typeof pattern === 'string' && url.includes(pattern)) return handler(url); - if (pattern instanceof RegExp && pattern.test(url)) return handler(url); + if (typeof pattern === 'string' && url.includes(pattern)) return handler(url, opts); + if (pattern instanceof RegExp && pattern.test(url)) return handler(url, opts); } return { ok: false, status: 404, + headers: _emptyHeaders(), async json() { return {}; }, + async text() { return ''; }, async arrayBuffer() { return new ArrayBuffer(0); }, }; }; } -function jsonResp(obj) { +function jsonResp(obj, etag = null) { + const body = JSON.stringify(obj); return { ok: true, status: 200, + headers: { get: (k) => (k.toLowerCase() === 'etag' ? etag : null) }, async json() { return obj; }, + async text() { return body; }, async arrayBuffer() { return new ArrayBuffer(0); }, }; } @@ -48,7 +61,9 @@ function bytesResp(u8) { return { ok: true, status: 200, + headers: _emptyHeaders(), async json() { throw new Error('not json'); }, + async text() { throw new Error('not text'); }, async arrayBuffer() { return u8.buffer.slice(u8.byteOffset, u8.byteOffset + u8.byteLength); }, diff --git a/clients/mat-vis.sh b/clients/mat-vis.sh index c470516e..6540a1ec 100755 --- a/clients/mat-vis.sh +++ b/clients/mat-vis.sh @@ -26,7 +26,7 @@ HF_BASE="${MAT_VIS_HF_BASE:-https://huggingface.co/datasets/$HF_DATASET/resolve} DEFAULT_TAG="v2026.04.2" TAG="${MAT_VIS_TAG:-$DEFAULT_TAG}" CACHE="${MAT_VIS_CACHE:-$HOME/.cache/mat-vis}" -UA="mat-vis-client/0.6.2 (shell)" +UA="mat-vis-client/0.6.3 (shell)" # ── helpers ────────────────────────────────────────────────────── @@ -46,11 +46,72 @@ fetch_json() { } get_manifest() { - local m sv - m=$(fetch_json "$(hf_url release-manifest.json)" "$CACHE/$TAG/.manifest.json") - sv=$(echo "$m" | jq -r '.schema_version // empty') + # Issue #258 — manifest cache validates against the origin per + # invocation via a conditional GET. Body + ETag are stored side-by- + # side under $CACHE/$TAG/.manifest.{json,etag}; on 304 the cached + # body is served, on 200 both are replaced atomically. Falls back + # to unconditional GET when no .manifest.etag is on disk (cold + # start or after a cache prune) so a stale etag can't lock us out. + local body_path etag_path url etag http_code body sv + body_path="$CACHE/$TAG/.manifest.json" + etag_path="$CACHE/$TAG/.manifest.etag" + url=$(hf_url release-manifest.json) + mkdir -p "$(dirname "$body_path")" + + etag="" + if [ -f "$etag_path" ] && [ -f "$body_path" ]; then + etag=$(cat "$etag_path") + fi + + local tmp_body tmp_headers + tmp_body=$(mktemp) + tmp_headers=$(mktemp) + # shellcheck disable=SC2064 + trap "rm -f '$tmp_body' '$tmp_headers'" RETURN + + local curl_status + if [ -n "$etag" ]; then + http_code=$(curl -sL -o "$tmp_body" -D "$tmp_headers" \ + -w '%{http_code}' \ + -H "User-Agent: $UA" \ + -H "If-None-Match: $etag" \ + "$url") + curl_status=$? + else + http_code=$(curl -sL -o "$tmp_body" -D "$tmp_headers" \ + -w '%{http_code}' \ + -H "User-Agent: $UA" \ + "$url") + curl_status=$? + fi + [ "$curl_status" -eq 0 ] || die "Failed to fetch $url" + + # 000 = non-HTTP scheme (file://, used by structural tests). Treat + # as 200 when curl exited cleanly: there's no ETag semantics on + # local files, so we always overwrite the body cache. + if [ "$http_code" = "304" ] && [ -f "$body_path" ]; then + body=$(cat "$body_path") + elif [ "$http_code" = "200" ] || [ "$http_code" = "000" ]; then + cp "$tmp_body" "$body_path" + # Extract latest ETag header (case-insensitive, last wins on + # redirect chains). Strip CR + surrounding whitespace. + local new_etag + new_etag=$(awk 'BEGIN{IGNORECASE=1} /^etag:/ {sub(/^[Ee][Tt][Aa][Gg]:[ \t]*/, ""); sub(/\r$/, ""); val=$0} END{print val}' "$tmp_headers") + if [ -n "$new_etag" ]; then + printf '%s' "$new_etag" > "$etag_path" + else + # Defensive: server gave no ETag — drop any stale file so + # next invocation refetches unconditionally. + rm -f "$etag_path" + fi + body=$(cat "$body_path") + else + die "Failed to fetch $url (HTTP $http_code)" + fi + + sv=$(echo "$body" | jq -r '.schema_version // empty') [ "$sv" = "3" ] || die "manifest schema_version=$sv (need 3 — per-file substrate, ADR-0012)" - echo "$m" + echo "$body" } # Fetch the catalog (ADR-0011 v3) for a source. Cached locally. diff --git a/clients/python/mat_vis_client_standalone.py b/clients/python/mat_vis_client_standalone.py index b186a551..fc6beb4d 100644 --- a/clients/python/mat_vis_client_standalone.py +++ b/clients/python/mat_vis_client_standalone.py @@ -55,7 +55,7 @@ # Version is kept in sync with clients/python/pyproject.toml by # scripts/sync-standalone-version.py (run via pre-commit). Do not # hand-edit — a drift test in tests/ fails CI if it disagrees. -__version__ = "0.6.2" +__version__ = "0.6.3" # Same User-Agent as the installable package (issue #70). Standalone vs # pip-installed is an internal packaging detail; servers receiving the # request can't act on it and splitting UA populations fragments diff --git a/clients/python/pyproject.toml b/clients/python/pyproject.toml index f12c84bb..c21c02e6 100644 --- a/clients/python/pyproject.toml +++ b/clients/python/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "mat-vis-client" -version = "0.6.2" +version = "0.6.3" description = "Pure Python client for mat-vis PBR textures — HTTP range reads, zero deps" readme = {file = "README.md", content-type = "text/markdown"} requires-python = ">=3.10" diff --git a/clients/python/src/mat_vis_client/client.py b/clients/python/src/mat_vis_client/client.py index af7a037c..6ce3e62b 100644 --- a/clients/python/src/mat_vis_client/client.py +++ b/clients/python/src/mat_vis_client/client.py @@ -411,6 +411,71 @@ def _get_json(url: str) -> dict | list: return json.loads(_get(url)) +def _get_with_etag(url: str, etag: str | None = None) -> tuple[bytes | None, str | None]: + """Conditional GET. Returns ``(body, response_etag)``. + + Issue #258: replaces the never-invalidated manifest disk cache with a + cheap conditional GET per client lifecycle. When ``etag`` is non-empty + we send ``If-None-Match: ``; HF responds 304 Not Modified when + the dataset hasn't moved on that revision (which on an immutable + release tag is always — see ADR-0007 / immutable-tag policy): + + - 304 → ``(None, etag)``, caller serves the cached body. + - 200 → ``(body, response_etag)``; ``response_etag`` may be None + when the origin / mirror omits ``ETag`` — body still cached, but + next lifecycle re-fetches unconditionally (defensive cold-start). + + Mirrors ``_get``'s retry/backoff envelope by reusing the same loop + structure: 429 / 503 / network failures retry with exponential + backoff and Retry-After honored; 304 short-circuits before the + rate-limit handler sees it; other HTTP errors raise typed + ``HTTPFetchError`` like ``_get``. + """ + hdrs = {"User-Agent": USER_AGENT} + if etag: + hdrs["If-None-Match"] = etag + + last_err: Exception | None = None + for attempt in range(MAX_RETRIES + 1): + try: + req = urllib.request.Request(url, headers=hdrs) + with urllib.request.urlopen(req, timeout=60) as resp: + data = resp.read() + return data, resp.headers.get("ETag") + except urllib.error.HTTPError as e: + if e.code == 304: + # Not Modified — caller's cached body is authoritative. + return None, etag + last_err = e + if not _is_rate_limited(e): + raise HTTPFetchError(url, e.code, e.reason or "") from e + if attempt >= MAX_RETRIES: + wait = _parse_retry_after(e.headers, int(BACKOFF_BASE_SECONDS * (2**attempt))) + raise RateLimitError( + url, wait, f"Rate limited on {url} after {MAX_RETRIES} retries." + ) from e + wait = _parse_retry_after(e.headers, int(BACKOFF_BASE_SECONDS * (2**attempt))) + print( + f"mat-vis-client: rate limited (HTTP {e.code}), " + f"retry {attempt + 1}/{MAX_RETRIES} in {wait}s", + file=sys.stderr, + ) + time.sleep(wait) + except urllib.error.URLError as e: + last_err = e + if attempt >= MAX_RETRIES: + raise NetworkError(url, str(e.reason)) from e + wait = min(int(BACKOFF_BASE_SECONDS * (2**attempt)), RETRY_MAX_WAIT_SECONDS) + print( + f"mat-vis-client: network error ({e.reason}), " + f"retry {attempt + 1}/{MAX_RETRIES} in {wait}s", + file=sys.stderr, + ) + time.sleep(wait) + + raise MatVisError(f"exhausted {MAX_RETRIES} retries for {url}") from last_err + + def _in_range(value: float | None, lo: float, hi: float) -> bool: """Check if a value falls within [lo, hi]. None values never match.""" if value is None: @@ -536,6 +601,44 @@ def _cache_write_text(self, path: Path, text: str) -> None: path.parent.mkdir(parents=True, exist_ok=True) path.write_text(text) + def _cache_read_manifest_with_etag(self) -> tuple[str | None, str | None]: + """Return the cached manifest body + ETag, or ``(None, None)``. + + Issue #258: pairs the on-disk manifest cache with its ETag so a + conditional GET can validate against the remote without + refetching the body. Bare ``.manifest.json`` without + ``.manifest.etag`` is treated as etag-less (forces an + unconditional GET next lifecycle, then we adopt whatever ETag + the server hands back). + """ + body = self._cache_read_text(self._cache_scope / ".manifest.json") + if body is None: + return None, None + etag = self._cache_read_text(self._cache_scope / ".manifest.etag") + return body, (etag or None) + + def _cache_write_manifest(self, body: str | bytes, etag: str | None) -> None: + """Persist manifest body + ETag side-by-side. + + ``body`` accepts bytes (raw response) or str (already-decoded); + we always store as text. The ETag file is only written when the + server provided one — absent ``.manifest.etag`` signals "next + lifecycle, refetch unconditionally" (defensive cold-start). + """ + if isinstance(body, bytes): + body = body.decode("utf-8") + self._cache_write_text(self._cache_scope / ".manifest.json", body) + etag_path = self._cache_scope / ".manifest.etag" + if etag: + self._cache_write_text(etag_path, etag) + elif etag_path.exists(): + # Stale etag from a prior lifecycle would falsely 304 us + # against a manifest we no longer have. Clear it. + try: + etag_path.unlink() + except OSError: + pass + @property def manifest(self) -> dict: """Return the v3 manifest for the pinned revision. @@ -543,17 +646,34 @@ def manifest(self) -> dict: Read directly from the authoritative ``release-manifest.json`` file at the dataset root — one HTTP GET, no tree-listing reconstruction (the HF tree API caps at 1000 entries per page, - which prod bakes blow past easily, see #238). Cached per-tag - scope on disk so repeat calls in the same process are free. + which prod bakes blow past easily, see #238). + + Cache strategy (#258): one conditional GET per client lifecycle. + On the first access we send ``If-None-Match: ``; + the server responds 304 if the manifest hasn't moved (which on + an immutable release tag is always — see README's immutable-tag + note) and we serve the cached body. On 200 we replace both body + and ETag. Repeat accesses in the same process hit the in-memory + cache and never touch HTTP. """ if self._manifest is None: - cache_path = self._cache_scope / ".manifest.json" - cached = self._cache_read_text(cache_path) - if cached is not None: - self._manifest = json.loads(cached) + cached_body, cached_etag = self._cache_read_manifest_with_etag() + body, new_etag = _get_with_etag(self._manifest_url, etag=cached_etag) + if body is None: + # 304 Not Modified — cached body is still authoritative. + # cached_body is guaranteed non-None here because + # _get_with_etag only returns body=None when an + # If-None-Match was sent, which requires cached_etag, + # which requires we had a cached body alongside it. + assert cached_body is not None + self._manifest = json.loads(cached_body) else: - self._manifest = _get_json(self._manifest_url) - self._cache_write_text(cache_path, json.dumps(self._manifest, indent=2)) + if isinstance(body, bytes): + body_text = body.decode("utf-8") + else: + body_text = body + self._manifest = json.loads(body_text) + self._cache_write_manifest(body_text, etag=new_etag) self._check_schema_version(self._manifest) self._maybe_warn_updates() return self._manifest @@ -1552,7 +1672,9 @@ def cache_prune( elif keep_set and file_tag and file_tag not in keep_set: f.unlink(missing_ok=True) - # Manifest is current-tag only — drop if not in keep_tags + # Manifest is current-tag only — drop if not in keep_tags. + # The sibling .manifest.etag (#258) goes with it; a stray + # etag without a body would falsely 304 us against nothing. mf = self._cache_dir / ".manifest.json" if mf.exists() and (keep_set or tag): try: @@ -1561,6 +1683,7 @@ def cache_prune( mtag = None if (tag and mtag == tag) or (keep_set and mtag and mtag not in keep_set): mf.unlink(missing_ok=True) + (self._cache_dir / ".manifest.etag").unlink(missing_ok=True) return before - self.cache_size() diff --git a/clients/python/test_client.py b/clients/python/test_client.py index 8b5ee1ed..62990b42 100644 --- a/clients/python/test_client.py +++ b/clients/python/test_client.py @@ -167,10 +167,16 @@ def mock_client(): """Client with mocked HTTP and temp cache.""" with tempfile.TemporaryDirectory() as tmp: client = MatVisClient(tag="v2026.04.1", cache_dir=Path(tmp)) - # Pre-populate manifest cache at the tag-scoped path. + # Pre-populate manifest cache at the tag-scoped path. Issue #258 + # added an ETag sibling — without it the conditional GET on + # first .manifest access would fall back to an unconditional + # fetch and clobber our test mocks. Pre-set the in-memory + # _manifest too so no HTTP is issued at all. cache_path = Path(tmp) / "v2026.04.1" / ".manifest.json" cache_path.parent.mkdir(parents=True, exist_ok=True) cache_path.write_text(json.dumps(MOCK_MANIFEST)) + (Path(tmp) / "v2026.04.1" / ".manifest.etag").write_text('"mock"') + client._manifest = MOCK_MANIFEST # Suppress the background update-check HTTP calls that would # otherwise consume our mocked _get_json side_effect iterations. client._update_warned = True @@ -193,6 +199,8 @@ def mock_search_client(): cache_path = Path(tmp) / "v2026.04.1" / ".manifest.json" cache_path.parent.mkdir(parents=True, exist_ok=True) cache_path.write_text(json.dumps(rich_manifest)) + (Path(tmp) / "v2026.04.1" / ".manifest.etag").write_text('"mock"') + client._manifest = rich_manifest client._update_warned = True yield client @@ -240,11 +248,15 @@ def test_sources(self, mock_client): def _fresh_client(cache_dir: Path) -> MatVisClient: """Client with MOCK_MANIFEST pre-cached but the update-check flag NOT suppressed — lets tests exercise the TTY / env-var gating path. + Issue #258: pair the cached body with an ETag; the tests below + additionally patch ``_get_with_etag`` to return 304-equivalent so + the conditional GET on first ``manifest`` access doesn't escape. """ client = MatVisClient(tag="v2026.04.1", cache_dir=cache_dir) scoped = cache_dir / "v2026.04.1" scoped.mkdir(parents=True, exist_ok=True) (scoped / ".manifest.json").write_text(json.dumps(MOCK_MANIFEST)) + (scoped / ".manifest.etag").write_text('"mock"') return client @@ -268,6 +280,12 @@ def test_no_stderr_on_non_tty(self, capsys): with ( patch.object(mc, "UPDATE_CHECK_DISABLED", False), patch.object(mc, "UPDATE_CHECK_FORCED", False), + # Issue #258: 304-equivalent stub so the conditional + # GET in `manifest` doesn't escape to real HTTP. + patch( + "mat_vis_client.client._get_with_etag", + return_value=(None, '"mock"'), + ), patch.object( client, "check_updates", @@ -302,6 +320,10 @@ def test_log_info_on_tty(self, caplog): patch("sys.stderr.isatty", return_value=True), patch.object(mc, "UPDATE_CHECK_DISABLED", False), patch.object(mc, "UPDATE_CHECK_FORCED", False), + patch( + "mat_vis_client.client._get_with_etag", + return_value=(None, '"mock"'), + ), patch.object( client, "check_updates", @@ -336,6 +358,10 @@ def test_force_check_env_var_overrides_non_tty(self, caplog): patch("sys.stderr.isatty", return_value=False), patch.object(mc, "UPDATE_CHECK_DISABLED", False), patch.object(mc, "UPDATE_CHECK_FORCED", True), + patch( + "mat_vis_client.client._get_with_etag", + return_value=(None, '"mock"'), + ), patch.object( client, "check_updates", @@ -369,6 +395,10 @@ def test_opt_out_env_var_wins_over_force(self, caplog, capsys): patch("sys.stderr.isatty", return_value=True), patch.object(mc, "UPDATE_CHECK_DISABLED", True), patch.object(mc, "UPDATE_CHECK_FORCED", True), + patch( + "mat_vis_client.client._get_with_etag", + return_value=(None, '"mock"'), + ), patch.object(client, "check_updates") as mock_chk, caplog.at_level("INFO", logger="mat-vis-client"), ): @@ -391,6 +421,11 @@ def _write_manifest(self, tmp: Path, data: dict) -> Path: scoped.mkdir(parents=True, exist_ok=True) mf = scoped / ".manifest.json" mf.write_text(json.dumps(data)) + # Issue #258: pair the body with an ETag so the manifest + # property's conditional GET can short-circuit (304) instead of + # falling back to an unconditional refetch and clobbering our + # poisoned-payload test fixture. + (scoped / ".manifest.etag").write_text('"mock"') return mf def test_rejects_manifest_without_schema_version(self): @@ -400,16 +435,24 @@ def test_rejects_manifest_without_schema_version(self): legacy = {k: v for k, v in MOCK_MANIFEST.items() if k != "schema_version"} assert "schema_version" not in legacy self._write_manifest(tmp, legacy) - with pytest.raises(RuntimeError, match="schema_version"): - _ = client.manifest + with patch( + "mat_vis_client.client._get_with_etag", + return_value=(None, '"mock"'), + ): + with pytest.raises(RuntimeError, match="schema_version"): + _ = client.manifest def test_error_message_mentions_cache_clear(self): """Recovery path is surfaced in the error message.""" with tempfile.TemporaryDirectory() as tmp: client = MatVisClient(tag="v2026.04.0", cache_dir=Path(tmp)) self._write_manifest(tmp, {"version": 1}) - with pytest.raises(RuntimeError) as excinfo: - _ = client.manifest + with patch( + "mat_vis_client.client._get_with_etag", + return_value=(None, '"mock"'), + ): + with pytest.raises(RuntimeError) as excinfo: + _ = client.manifest msg = str(excinfo.value) assert "cache clear" in msg assert ".manifest.json" in msg @@ -420,8 +463,12 @@ def test_rejects_incompatible_schema_version(self): client = MatVisClient(tag="v2026.04.0", cache_dir=Path(tmp)) future = {**MOCK_MANIFEST, "schema_version": 99} self._write_manifest(tmp, future) - with pytest.raises(RuntimeError, match="does not support"): - _ = client.manifest + with patch( + "mat_vis_client.client._get_with_etag", + return_value=(None, '"mock"'), + ): + with pytest.raises(RuntimeError, match="does not support"): + _ = client.manifest class TestClientCatalogQueries: @@ -1396,6 +1443,10 @@ def _make_client(self, tmp: Path, tier: str) -> MatVisClient: client = MatVisClient(tag="test", cache_dir=tmp) (tmp / "test").mkdir(parents=True, exist_ok=True) (tmp / "test" / ".manifest.json").write_text(json.dumps(manifest)) + (tmp / "test" / ".manifest.etag").write_text('"mock"') + # Skip the conditional GET in `manifest` (#258) by pre-setting + # the in-memory cache. + client._manifest = manifest client._update_warned = True # Pre-load the in-memory catalog so _resolve_material_id finds "M". client._indexes["polyhaven"] = [ diff --git a/clients/python/tests/test_cache.py b/clients/python/tests/test_cache.py index 2ccbd5e7..5fb1a2ee 100644 --- a/clients/python/tests/test_cache.py +++ b/clients/python/tests/test_cache.py @@ -14,6 +14,7 @@ from __future__ import annotations +import json import tempfile from pathlib import Path from unittest.mock import patch @@ -95,21 +96,23 @@ def test_cache_true_is_default(tmp_cache): def test_cache_false_does_not_write_manifest_to_disk(tmp_cache): - """With cache=False, fetching manifest must not create a .manifest.json.""" + """With cache=False, fetching manifest must not create .manifest.{json,etag}.""" c = MatVisClient(cache_dir=tmp_cache, tag="v2026.04.0", cache=False) - # Post mat-vis#238: the client fetches release-manifest.json directly, - # so _get_json returns the manifest dict verbatim. - with patch("mat_vis_client.client._get_json", return_value=MOCK_MANIFEST_FETCH): + # Post mat-vis#258: the client fetches release-manifest.json via a + # conditional GET; _get_with_etag returns (body_bytes, etag). + body = json.dumps(MOCK_MANIFEST_FETCH).encode() + with patch("mat_vis_client.client._get_with_etag", return_value=(body, '"x"')): _ = c.manifest - # No cached manifest anywhere under tmp_cache - matches = list(tmp_cache.rglob(".manifest.json")) + # No cached manifest body or etag anywhere under tmp_cache + matches = list(tmp_cache.rglob(".manifest.json")) + list(tmp_cache.rglob(".manifest.etag")) assert matches == [], f"expected no manifest cache, found: {matches}" def test_cache_false_fetches_manifest_every_time(tmp_cache): """With cache=False, manifest is fetched on every .manifest access.""" c = MatVisClient(cache_dir=tmp_cache, tag="v2026.04.0", cache=False) - with patch("mat_vis_client.client._get_json", return_value=MOCK_MANIFEST_FETCH) as mock_get: + body = json.dumps(MOCK_MANIFEST_FETCH).encode() + with patch("mat_vis_client.client._get_with_etag", return_value=(body, '"x"')) as mock_get: _ = c.manifest # Re-reset the in-memory cache to force another fetch c._manifest = None diff --git a/clients/python/tests/test_client.py b/clients/python/tests/test_client.py index 78190b0e..c4b684fc 100644 --- a/clients/python/tests/test_client.py +++ b/clients/python/tests/test_client.py @@ -185,10 +185,19 @@ def mock_client(): with tempfile.TemporaryDirectory() as tmp: client = MatVisClient(tag="v2026.04.0", cache_dir=Path(tmp)) # Pre-populate manifest cache so no HTTP needed (tag-scoped path). - cache_path = Path(tmp) / "v2026.04.0" / ".manifest.json" - cache_path.parent.mkdir(parents=True, exist_ok=True) - cache_path.write_text(json.dumps(MOCK_MANIFEST)) - yield client + # Issue #258: pair the cache body with an ETag so the conditional + # GET in `manifest` would short-circuit — but tests below patch + # `_get_with_etag` directly to assert no HTTP escapes either way. + scope = Path(tmp) / "v2026.04.0" + scope.mkdir(parents=True, exist_ok=True) + (scope / ".manifest.json").write_text(json.dumps(MOCK_MANIFEST)) + (scope / ".manifest.etag").write_text('"mock-etag"') + # Patch _get_with_etag so any unintended manifest probe surfaces + # cleanly (304-equivalent against the cached body) instead of + # hitting real HTTP. Tests that exercise the cold path patch + # this themselves with a 200-equivalent return value. + with patch("mat_vis_client.client._get_with_etag", return_value=(None, '"mock-etag"')): + yield client # ── Helper tests ──────────────────────────────────────────────── @@ -228,28 +237,39 @@ def test_sources(self, mock_client): def test_manifest_fetches_release_manifest_directly(self): """Cold cache: ``manifest`` fetches release-manifest.json verbatim - via a single GET (mat-vis#238). No tree-listing reconstruction.""" + via a single conditional GET (#258). No tree-listing reconstruction.""" with tempfile.TemporaryDirectory() as tmp: client = MatVisClient(tag="v2026.04.0", cache_dir=Path(tmp)) - with patch("mat_vis_client.client._get_json", return_value=MOCK_MANIFEST) as mock_get: + body = json.dumps(MOCK_MANIFEST).encode() + with patch( + "mat_vis_client.client._get_with_etag", + return_value=(body, '"abc123"'), + ) as mock_get: m = client.manifest # Returned verbatim assert m == MOCK_MANIFEST - # Single fetch, against the release-manifest.json URL + # Single fetch, against the release-manifest.json URL, no ETag yet assert mock_get.call_count == 1 - (url,), _ = mock_get.call_args + args, kwargs = mock_get.call_args + url = args[0] assert url.endswith("/v2026.04.0/release-manifest.json") - # Cached on disk under the tag scope - cached = Path(tmp) / "v2026.04.0" / ".manifest.json" - assert cached.exists() - assert json.loads(cached.read_text()) == MOCK_MANIFEST + assert kwargs.get("etag") is None + # Body + etag both cached under tag scope + cached_body = Path(tmp) / "v2026.04.0" / ".manifest.json" + cached_etag = Path(tmp) / "v2026.04.0" / ".manifest.etag" + assert cached_body.exists() + assert json.loads(cached_body.read_text()) == MOCK_MANIFEST + assert cached_etag.read_text() == '"abc123"' def test_manifest_rejects_unsupported_schema(self): """``_check_schema_version`` still gates the fetched manifest.""" with tempfile.TemporaryDirectory() as tmp: client = MatVisClient(tag="v2026.04.0", cache_dir=Path(tmp)) bad = {**MOCK_MANIFEST, "schema_version": 99} - with patch("mat_vis_client.client._get_json", return_value=bad): + with patch( + "mat_vis_client.client._get_with_etag", + return_value=(json.dumps(bad).encode(), '"x"'), + ): with pytest.raises(RuntimeError, match="schema_version=99"): _ = client.manifest @@ -258,11 +278,139 @@ def test_manifest_rejects_missing_schema(self): with tempfile.TemporaryDirectory() as tmp: client = MatVisClient(tag="v2026.04.0", cache_dir=Path(tmp)) bad = {k: v for k, v in MOCK_MANIFEST.items() if k != "schema_version"} - with patch("mat_vis_client.client._get_json", return_value=bad): + with patch( + "mat_vis_client.client._get_with_etag", + return_value=(json.dumps(bad).encode(), '"x"'), + ): with pytest.raises(RuntimeError, match="missing 'schema_version'"): _ = client.manifest +class TestManifestEtagCache: + """Issue #258 — manifest cache validates against the origin per + client lifecycle via a conditional GET, instead of trusting disk + forever. Covers cold start, in-process memoization, fresh-process + 304 (server unchanged), fresh-process 200 (server moved), and the + no-ETag-from-server defensive path. + """ + + def test_first_fetch_no_cache(self): + """Cold cache: a single GET populates manifest + ETag side-by-side.""" + with tempfile.TemporaryDirectory() as tmp: + client = MatVisClient(tag="v2026.04.0", cache_dir=Path(tmp)) + body = json.dumps(MOCK_MANIFEST).encode() + with patch( + "mat_vis_client.client._get_with_etag", + return_value=(body, '"v1"'), + ) as mock_get: + m = client.manifest + assert m == MOCK_MANIFEST + assert mock_get.call_count == 1 + # First call has no etag (cold cache). + _, kwargs = mock_get.call_args + assert kwargs.get("etag") is None + scope = Path(tmp) / "v2026.04.0" + assert (scope / ".manifest.json").exists() + assert (scope / ".manifest.etag").read_text() == '"v1"' + + def test_repeat_access_no_http(self): + """Second access in the same process is in-memory only, no HTTP.""" + with tempfile.TemporaryDirectory() as tmp: + client = MatVisClient(tag="v2026.04.0", cache_dir=Path(tmp)) + body = json.dumps(MOCK_MANIFEST).encode() + with patch( + "mat_vis_client.client._get_with_etag", + return_value=(body, '"v1"'), + ) as mock_get: + _ = client.manifest + _ = client.manifest + _ = client.manifest + # Memoized after first call. + assert mock_get.call_count == 1 + + def test_fresh_process_server_unchanged_304(self): + """Fresh client + same cache_dir + 304: cached body served, no body refetch.""" + with tempfile.TemporaryDirectory() as tmp: + # Seed cache from a "previous" lifecycle. + scope = Path(tmp) / "v2026.04.0" + scope.mkdir(parents=True, exist_ok=True) + (scope / ".manifest.json").write_text(json.dumps(MOCK_MANIFEST)) + (scope / ".manifest.etag").write_text('"v1"') + + client = MatVisClient(tag="v2026.04.0", cache_dir=Path(tmp)) + with patch( + "mat_vis_client.client._get_with_etag", + return_value=(None, '"v1"'), # 304: body=None, etag echoed + ) as mock_get: + m = client.manifest + + assert m == MOCK_MANIFEST # served from cached body + assert mock_get.call_count == 1 + _, kwargs = mock_get.call_args + assert kwargs.get("etag") == '"v1"' + # Cache untouched. + assert (scope / ".manifest.etag").read_text() == '"v1"' + + def test_fresh_process_server_mutated_200(self): + """Fresh client + same cache_dir + 200 with new body: cache replaced.""" + with tempfile.TemporaryDirectory() as tmp: + scope = Path(tmp) / "v2026.04.0" + scope.mkdir(parents=True, exist_ok=True) + (scope / ".manifest.json").write_text(json.dumps(MOCK_MANIFEST)) + (scope / ".manifest.etag").write_text('"v1"') + + new_manifest = {**MOCK_MANIFEST, "release_tag": "v2026.04.99"} + new_body = json.dumps(new_manifest).encode() + + client = MatVisClient(tag="v2026.04.0", cache_dir=Path(tmp)) + with patch( + "mat_vis_client.client._get_with_etag", + return_value=(new_body, '"v2"'), + ) as mock_get: + m = client.manifest + + assert m == new_manifest + assert mock_get.call_count == 1 + _, kwargs = mock_get.call_args + assert kwargs.get("etag") == '"v1"' # sent the prior etag + # Cache replaced atomically. + assert json.loads((scope / ".manifest.json").read_text()) == new_manifest + assert (scope / ".manifest.etag").read_text() == '"v2"' + + def test_no_etag_from_server_does_not_crash(self): + """Defensive: 200 without ETag stores body but no etag file. + + Next lifecycle behaves like cold start (forces unconditional + refetch) rather than crashing on missing-etag. + """ + with tempfile.TemporaryDirectory() as tmp: + client = MatVisClient(tag="v2026.04.0", cache_dir=Path(tmp)) + body = json.dumps(MOCK_MANIFEST).encode() + with patch( + "mat_vis_client.client._get_with_etag", + return_value=(body, None), + ): + m = client.manifest + assert m == MOCK_MANIFEST + scope = Path(tmp) / "v2026.04.0" + assert (scope / ".manifest.json").exists() + # No etag file written → next lifecycle fetches unconditionally. + assert not (scope / ".manifest.etag").exists() + + # Simulate next lifecycle — fresh client, same cache. + client2 = MatVisClient(tag="v2026.04.0", cache_dir=Path(tmp)) + new_body = json.dumps(MOCK_MANIFEST).encode() + with patch( + "mat_vis_client.client._get_with_etag", + return_value=(new_body, '"now-with-etag"'), + ) as mock_get2: + _ = client2.manifest + # Called with etag=None (cold-start equivalent). + _, kwargs = mock_get2.call_args + assert kwargs.get("etag") is None + assert (scope / ".manifest.etag").read_text() == '"now-with-etag"' + + class TestClientCatalogQueries: """Per-file substrate (#186): materials/channels read from v3 catalog.""" diff --git a/clients/rust/Cargo.toml b/clients/rust/Cargo.toml index ef14440a..d746a1ab 100644 --- a/clients/rust/Cargo.toml +++ b/clients/rust/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "mat-vis-client" -version = "0.6.2" +version = "0.6.3" edition = "2021" description = "mat-vis PBR texture client — per-file HF dataset (ADR-0012)" license = "MIT" diff --git a/clients/rust/src/main.rs b/clients/rust/src/main.rs index 115beda4..7dd4840c 100644 --- a/clients/rust/src/main.rs +++ b/clients/rust/src/main.rs @@ -84,16 +84,77 @@ fn validate_schema(m: &Manifest) -> Result<(), String> { Ok(()) } +fn cache_root() -> Option { + if let Ok(v) = std::env::var("MAT_VIS_CACHE") { + return Some(PathBuf::from(v)); + } + if let Ok(home) = std::env::var("HOME") { + return Some(PathBuf::from(home).join(".cache").join("mat-vis")); + } + None +} + +/// Issue #258 — manifest cache validates against the origin per +/// invocation via a conditional GET. Body + ETag are stored side-by- +/// side under ``$MAT_VIS_CACHE//.manifest.{json,etag}``; on 304 +/// the cached body is served, on 200 both are replaced atomically. +/// Cache is a no-op when no cache dir resolves (e.g. ``HOME`` unset +/// inside a hermetic build) — the GET still works, it just falls back +/// to unconditional every invocation. +fn read_cached_manifest(tag: &str) -> Option<(String, Option)> { + let dir = cache_root()?.join(tag); + let body = fs::read_to_string(dir.join(".manifest.json")).ok()?; + let etag = fs::read_to_string(dir.join(".manifest.etag")).ok(); + Some((body, etag)) +} + +fn write_cached_manifest(tag: &str, body: &str, etag: Option<&str>) { + let Some(dir) = cache_root() else { return }; + let dir = dir.join(tag); + if fs::create_dir_all(&dir).is_err() { + return; + } + let _ = fs::write(dir.join(".manifest.json"), body); + let etag_path = dir.join(".manifest.etag"); + match etag { + Some(e) => { + let _ = fs::write(&etag_path, e); + } + None => { + // Stale etag would falsely 304 us against nothing — clear it. + let _ = fs::remove_file(&etag_path); + } + } +} + fn fetch_manifest(tag: &str) -> Manifest { let url = hf_url(tag, "release-manifest.json"); - let m: Manifest = client() - .get(&url) - .send() - .expect("Failed to fetch manifest") - .error_for_status() - .expect("Failed to fetch manifest") - .json() - .expect("Failed to parse manifest"); + let cached = read_cached_manifest(tag); + let mut req = client().get(&url); + if let Some((_, Some(ref etag))) = cached { + req = req.header(reqwest::header::IF_NONE_MATCH, etag.as_str()); + } + let resp = req.send().expect("Failed to fetch manifest"); + + let m: Manifest = if resp.status() == reqwest::StatusCode::NOT_MODIFIED { + // 304 — cached body is authoritative. cached must be Some + // because we only sent If-None-Match when it was. + let (body, _) = cached.expect("304 implies a cached body"); + serde_json::from_str(&body).expect("Failed to parse cached manifest") + } else { + let resp = resp + .error_for_status() + .expect("Failed to fetch manifest"); + let new_etag = resp + .headers() + .get(reqwest::header::ETAG) + .and_then(|v| v.to_str().ok()) + .map(|s| s.to_string()); + let body = resp.text().expect("Failed to read manifest body"); + write_cached_manifest(tag, &body, new_etag.as_deref()); + serde_json::from_str(&body).expect("Failed to parse manifest") + }; + if let Err(e) = validate_schema(&m) { eprintln!("{e}"); std::process::exit(1);