From 16724741873db4025641de927dcba9cf8022fd50 Mon Sep 17 00:00:00 2001 From: Afshawn Lotfi Date: Sat, 17 Jan 2026 08:58:07 +0000 Subject: [PATCH 1/4] feat: update README with new Reconstruct API and CachedAssetLoader chore: bump version to 2.5.0-alpha in package.json test: add comprehensive tests for Packable.reconstruct functionality refactor: enhance DataHandler interface with writeBinary method and improve asset loading logic refactor: update Packable class to support reconstructing data with schemas and improve asset handling --- .devcontainer/devcontainer.json | 11 +- .vscode/settings.json | 4 +- python/README.md | 294 +++--- .../extract_reconstruct_example.ipynb | 743 ++++++++++++++ python/examples/mesh_example.ipynb | 211 +--- python/meshly/__init__.py | 6 + python/meshly/array.py | 10 +- python/meshly/data_handler.py | 83 +- python/meshly/packable.py | 909 +++++++++++++----- python/pyproject.toml | 2 +- python/tests/test_packable.py | 603 ++++++++++-- typescript/README.md | 132 +-- typescript/package.json | 2 +- typescript/src/__tests__/loadArray.test.ts | 4 - typescript/src/__tests__/reconstruct.test.ts | 322 +++++++ typescript/src/data-handler.ts | 101 +- typescript/src/index.ts | 14 +- typescript/src/packable.ts | 289 ++++-- 18 files changed, 2822 insertions(+), 918 deletions(-) create mode 100644 python/examples/extract_reconstruct_example.ipynb create mode 100644 typescript/src/__tests__/reconstruct.test.ts diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json index d0371d0..2b071f0 100644 --- a/.devcontainer/devcontainer.json +++ b/.devcontainer/devcontainer.json @@ -4,24 +4,17 @@ "name": "Python 3", // Or use a Dockerfile or Docker Compose file. More info: https://containers.dev/guide/dockerfile "image": "mcr.microsoft.com/devcontainers/python:1-3.12-bullseye", - "features": { - "ghcr.io/devcontainers/features/node:1": {}, - "ghcr.io/devcontainers/features/python:1": {} + "ghcr.io/devcontainers/features/node:1": {} }, - // Features to add to the dev container. More info: https://containers.dev/features. // "features": {}, - // Use 'forwardPorts' to make a list of ports inside the container available locally. // "forwardPorts": [], - // Use 'postCreateCommand' to run commands after the container is created. // "postCreateCommand": "pip3 install --user -r requirements.txt", - // Configure tool-specific properties. // "customizations": {}, - // Uncomment to connect as root instead. More info: https://aka.ms/dev-containers-non-root. 
// "remoteUser": "root" -} +} \ No newline at end of file diff --git a/.vscode/settings.json b/.vscode/settings.json index 05c1df5..2dfab44 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -1,10 +1,10 @@ { "python.testing.pytestArgs": [ - "-v", - "./python/tests" + "-v" ], "python.testing.pytestEnabled": true, "python.testing.unittestEnabled": false, + "python.testing.cwd": "${workspaceFolder}/python", "jupyter.notebookFileRoot": "${workspaceFolder}/python", // Vitest configuration for TypeScript tests "vitest.root": "./typescript", diff --git a/python/README.md b/python/README.md index 56df366..ad6f072 100644 --- a/python/README.md +++ b/python/README.md @@ -17,12 +17,16 @@ pip install meshly - **`CustomFieldConfig`**: Configuration for custom field encoding/decoding - **`ArrayUtils`**: Utility class for encoding/decoding individual arrays - **`DataHandler`**: Unified interface for reading and writing files or zip archives +- **`CachedAssetLoader`**: Asset loader with disk cache for content-addressable storage +- **`LazyModel`**: Lazy proxy that defers asset loading until field access ### Key Capabilities - Automatic encoding/decoding of numpy array attributes, including nested dictionaries - Custom subclasses with additional array fields are automatically serialized - Custom field encoding via `_get_custom_fields()` override +- **Extract/Reconstruct API** for content-addressable storage with deduplication +- **Lazy loading** with `LazyModel` for deferred asset resolution - Enhanced polygon support with `index_sizes` and VTK-compatible `cell_types` - Mesh markers for boundary conditions, material regions, and geometric features - Mesh operations: triangulate, optimize, simplify, combine, extract @@ -187,116 +191,110 @@ loaded = SceneMesh.load_from_zip("scene.zip") # loaded.materials["wood"] is a MaterialProperties instance ``` -### Nested Packables +### Extract and Reconstruct API -Fields that are themselves `Packable` subclasses are automatically handled: +For content-addressable storage with deduplication, use the `extract()` and `reconstruct()` static methods: ```python -class PhysicsProperties(Packable): - """Physics data as a nested Packable.""" - mass: float = 1.0 - inertia_tensor: np.ndarray # 3x3 matrix +from meshly import Packable + +class SimulationResult(Packable): + """Simulation data with arrays.""" + time: float + temperature: np.ndarray + velocity: np.ndarray + +result = SimulationResult( + time=0.5, + temperature=np.array([300.0, 301.0, 302.0], dtype=np.float32), + velocity=np.array([[1.0, 0.0], [0.0, 1.0]], dtype=np.float32) +) -class PhysicsMesh(Mesh): - """Mesh with nested Packable field.""" - physics: Optional[PhysicsProperties] = None +# Extract to serializable data + assets +extracted = Packable.extract(result) +# extracted.data = {"time": 0.5, "temperature": {"$ref": "abc123..."}, "velocity": {"$ref": "def456..."}} +# extracted.assets = {"abc123...": , "def456...": } -# Nested Packables use their own encode/decode methods -mesh = PhysicsMesh( - vertices=vertices, - indices=indices, - physics=PhysicsProperties( - mass=2.5, - inertia_tensor=np.eye(3, dtype=np.float32) - ) -) +# Data is JSON-serializable +import json +json.dumps(extracted.data) # Works! 
-mesh.save_to_zip("physics_mesh.zip") -loaded = PhysicsMesh.load_from_zip("physics_mesh.zip") -print(loaded.physics.mass) # 2.5 +# Reconstruct from data + assets (eager loading) +rebuilt = Packable.reconstruct(SimulationResult, extracted.data, extracted.assets) +assert rebuilt.time == 0.5 ``` -### Caching Nested Packables +### Lazy Loading with Callable Assets -For large projects with shared nested Packables, use caching to deduplicate data using SHA256 content-addressable storage: +When assets is a callable (or `CachedAssetLoader`), `reconstruct()` returns a `LazyModel` that defers loading: ```python -from meshly import DataHandler +from meshly import Packable, CachedAssetLoader, DataHandler +from meshly.packable import LazyModel + +# Define a fetch function (e.g., from cloud storage) +def fetch_asset(checksum: str) -> bytes: + return cloud_storage.download(checksum) -# Create cache handlers from a directory path -cache_handler = DataHandler.create("/path/to/cache") +# Reconstruct with callable - returns LazyModel +lazy = Packable.reconstruct(SimulationResult, data, fetch_asset) +assert isinstance(lazy, LazyModel) -# Save with caching - nested Packables stored separately by hash -mesh.save_to_zip("mesh.zip", cache_handler=cache_handler) +# No assets loaded yet! +print(lazy.time) # Primitive field - no fetch needed +print(lazy.temperature) # NOW temperature asset is fetched +print(lazy.velocity) # NOW velocity asset is fetched -# Load with caching - nested Packables loaded from cache -loaded = PhysicsMesh.load_from_zip("mesh.zip", cache_handler=cache_handler) +# Get full Pydantic model +model = lazy.resolve() +assert isinstance(model, SimulationResult) ``` -**Deduplication example:** +### CachedAssetLoader for Disk Persistence + +Use `CachedAssetLoader` to cache fetched assets to disk: ```python -# Two meshes sharing identical physics properties -shared_physics = PhysicsProperties(mass=1.0, inertia_tensor=np.eye(3)) +from meshly import CachedAssetLoader, DataHandler -mesh1 = PhysicsMesh(vertices=v1, indices=i1, physics=shared_physics) -mesh2 = PhysicsMesh(vertices=v2, indices=i2, physics=shared_physics) +# Create disk cache +cache = DataHandler.create("/path/to/cache") +loader = CachedAssetLoader(fetch_asset, cache) -# Save both with the same cache handler - physics stored only once! -mesh1.save_to_zip("mesh1.zip", cache_handler=cache_handler) -mesh2.save_to_zip("mesh2.zip", cache_handler=cache_handler) +# First access fetches and caches +lazy = Packable.reconstruct(SimulationResult, data, loader) +temp = lazy.temperature # Fetches from source, saves to cache + +# Subsequent access reads from cache +lazy2 = Packable.reconstruct(SimulationResult, data, loader) +temp2 = lazy2.temperature # Reads from cache, no fetch! 
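+
+# Cached assets are stored in the cache directory as assets/<checksum>.bin,
+# so the same cache location can be reused on later runs.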
``` -**Custom cache handlers:** +### Deduplication with Extract -You can implement custom `DataHandler` subclasses for different storage backends: +Since assets are keyed by SHA256 checksum, identical arrays automatically deduplicate: ```python -from meshly.data_handler import DataHandler -from typing import Optional, List -from pathlib import Path - -class RedisDataHandler(DataHandler): - """Data handler backed by Redis.""" - def __init__(self, redis_client, prefix="packable:"): - super().__init__(source="", rel_path="") - self.redis = redis_client - self.prefix = prefix - - def read_binary(self, subpath) -> bytes: - data = self.redis.get(f"{self.prefix}{subpath}") - if data is None: - raise FileNotFoundError(f"Key not found: {self.prefix}{subpath}") - return data - - def read_text(self, subpath, encoding="utf-8") -> str: - return self.read_binary(subpath).decode(encoding) - - def list_files(self, subpath="", recursive=False) -> List[Path]: - raise NotImplementedError("File listing not supported") +# Two results with same temperature data +result1 = SimulationResult(time=0.0, temperature=shared_temp, velocity=v1) +result2 = SimulationResult(time=1.0, temperature=shared_temp, velocity=v2) +extracted1 = Packable.extract(result1) +extracted2 = Packable.extract(result2) -class RedisWriteHandler(WriteHandler): - """Write handler backed by Redis.""" - def __init__(self, redis_client, prefix="packable:"): - super().__init__(destination="", rel_path="") - self.redis = redis_client - self.prefix = prefix - - def write_binary(self, subpath, content, executable=False) -> None: - data = content if isinstance(content, bytes) else content.read() - self.redis.set(f"{self.prefix}{subpath}", data) - - def write_text(self, subpath, content, executable=False) -> None: - self.redis.set(f"{self.prefix}{subpath}", content.encode('utf-8')) +# Same checksum for temperature - deduplicated! +assert extracted1.data["temperature"] == extracted2.data["temperature"] +``` +**Note**: Direct Packable fields inside another Packable are not supported. 
Use `extract()` and `reconstruct()` for composing Packables, or embed Packables inside typed dicts: -# Usage with Redis -cache_writer = RedisWriteHandler(redis_client) -cache_reader = RedisReadHandler(redis_client) +```python +from typing import Dict -mesh.save_to_zip("mesh.zip", cache_handler=cache_writer) -loaded = PhysicsMesh.load_from_zip("mesh.zip", cache_handler=cache_reader) +class Container(Packable): + name: str + # Dict of Packables is allowed - extract() handles them + items: Dict[str, SimulationResult] = Field(default_factory=dict) ``` ## Architecture @@ -319,9 +317,11 @@ PackableMetadata (base metadata) The `Packable` base class provides: - `save_to_zip()` / `load_from_zip()` - File I/O with compression - `encode()` / `decode()` - In-memory serialization to/from bytes +- `extract()` / `reconstruct()` - Content-addressable storage with `$ref` checksums - `convert_to()` - Convert arrays between numpy and JAX - `_get_custom_fields()` - Override point for custom field encoding - `load_metadata()` - Generic metadata loading with type parameter +- `checksum` - Computed SHA256 checksum property ### Zip File Structure @@ -510,14 +510,14 @@ class CustomFieldConfig(Generic[V, M]): ```python class Packable(BaseModel): # File I/O - def save_to_zip(self, destination, cache_saver=None) -> None + def save_to_zip(self, destination, cache_handler=None) -> None @classmethod - def load_from_zip(cls, source, array_type=None, cache_loader=None) -> T + def load_from_zip(cls, source, array_type=None, cache_handler=None) -> T # In-memory serialization - def encode(self, cache_saver=None) -> bytes + def encode(self, cache_handler=None) -> bytes @classmethod - def decode(cls, buf: bytes, array_type=None, cache_loader=None) -> T + def decode(cls, buf: bytes, array_type=None, cache_handler=None) -> T # Array conversion def convert_to(self, array_type: ArrayType) -> T @@ -593,103 +593,73 @@ class MeshMetadata(PackableMetadata): array_type: ArrayType = "numpy" # "numpy" or "jax" ``` -### Cache Types +### DataHandler -```python -# Type aliases for cache callbacks -CacheLoader = Callable[[str], Optional[bytes]] # hash -> bytes or None -CacheSaver = Callable[[str, bytes], None] # hash, bytes -> None - -# Factory methods to create cache functions from paths -ReadHandler.create_cache_loader(source: PathLike) -> CacheLoader -WriteHandler.create_cache_saver(destination: PathLike) -> CacheSaver -``` - -### Data Handlers - -The `data_handler` module provides abstract interfaces for reading and writing data, supporting both regular files and zip archives. +The `data_handler` module provides a unified interface for reading and writing data, supporting both regular files and zip archives. 
```python -from meshly import ReadHandler, WriteHandler +from meshly import DataHandler -# ReadHandler - Abstract base for reading files -class ReadHandler: +# DataHandler - Unified interface for file I/O +class DataHandler: def __init__(self, source: PathLike | BytesIO, rel_path: str = "") - # Abstract methods (implemented by FileReadHandler, ZipReadHandler) + # Abstract methods (implemented by FileHandler, ZipHandler) def read_text(self, subpath: PathLike, encoding: str = "utf-8") -> str def read_binary(self, subpath: PathLike) -> bytes - def list_files(self, subpath: PathLike = "", recursive: bool = False) -> List[Path] - - # Navigate to subdirectory - def to_path(self, rel_path: str) -> ReadHandler - - # Factory method - automatically creates FileReadHandler or ZipReadHandler - @staticmethod - def create_handler(source: PathLike | BytesIO, rel_path: str = "") -> ReadHandler - - # Create cache loader for nested Packables - @staticmethod - def create_cache_loader(source: PathLike | BytesIO) -> CacheLoader - -# WriteHandler - Abstract base for writing files -class WriteHandler: - def __init__(self, destination: PathLike | BytesIO, rel_path: str = "") - - # Abstract methods (implemented by FileWriteHandler, ZipWriteHandler) def write_text(self, subpath: PathLike, content: str, executable: bool = False) -> None def write_binary(self, subpath: PathLike, content: bytes | BytesIO, executable: bool = False) -> None + def list_files(self, subpath: PathLike = "", recursive: bool = False) -> List[Path] + def exists(self, subpath: PathLike) -> bool + def remove_file(self, subpath: PathLike) -> None # FileHandler only; raises NotImplementedError for ZipHandler # Navigate to subdirectory - def to_path(self, rel_path: str) -> WriteHandler + def to_path(self, rel_path: str) -> DataHandler - # Factory method - automatically creates FileWriteHandler or ZipWriteHandler + # Factory method - automatically creates FileHandler or ZipHandler @staticmethod - def create_handler(destination: PathLike | BytesIO, rel_path: str = "") -> WriteHandler + def create(source: PathLike | BytesIO, rel_path: str = "") -> DataHandler - # Create cache saver for nested Packables - @staticmethod - def create_cache_saver(destination: PathLike | BytesIO) -> CacheSaver - - # Close resources (important for ZipWriteHandler) + # Close resources (important for ZipHandler) def finalize(self) -> None + + # Context manager support (calls finalize() on exit) + def __enter__(self) -> DataHandler + def __exit__(self, exc_type, exc_val, exc_tb) -> bool ``` #### Concrete Implementations ```python -# FileReadHandler - Read from filesystem -handler = FileReadHandler("/path/to/directory") +# FileHandler - Read/write from filesystem +handler = DataHandler.create("/path/to/directory") data = handler.read_binary("subdir/file.bin") -files = handler.list_files("subdir", recursive=True) - -# ZipReadHandler - Read from zip archives -with open("archive.zip", "rb") as f: - handler = ZipReadHandler(BytesIO(f.read())) - metadata = handler.read_text("metadata.json") - array_data = handler.read_binary("arrays/vertices/array.bin") - -# FileWriteHandler - Write to filesystem -handler = FileWriteHandler("/path/to/output") handler.write_text("config.json", '{"version": 1}') -handler.write_binary("data.bin", compressed_bytes) +files = handler.list_files("subdir", recursive=True) -# ZipWriteHandler - Write to zip archives +# ZipHandler - Read/write from zip archives (using context manager) buf = BytesIO() -handler = ZipWriteHandler(buf) 
-handler.write_text("metadata.json", json_string) -handler.write_binary("data.bin", array_bytes) -handler.finalize() # Important: closes the zip file +with DataHandler.create(buf) as handler: + handler.write_text("metadata.json", json_string) + handler.write_binary("data.bin", array_bytes) +# finalize() is automatically called when exiting the context zip_bytes = buf.getvalue() + +# Reading from existing zip +with open("archive.zip", "rb") as f: + with DataHandler.create(BytesIO(f.read())) as handler: + metadata = handler.read_text("metadata.json") + array_data = handler.read_binary("arrays/vertices/array.bin") ``` #### Advanced Usage ```python # Use handlers for custom storage backends -class S3ReadHandler(ReadHandler): - """Custom handler for reading from S3.""" +class S3DataHandler(DataHandler): + """Custom handler for reading/writing from S3.""" def __init__(self, bucket: str, prefix: str = ""): + super().__init__(source="", rel_path="") self.bucket = bucket self.prefix = prefix @@ -697,12 +667,36 @@ class S3ReadHandler(ReadHandler): key = f"{self.prefix}/{subpath}" if self.prefix else str(subpath) return s3_client.get_object(Bucket=self.bucket, Key=key)['Body'].read() + def write_binary(self, subpath: PathLike, content: bytes | BytesIO, executable: bool = False) -> None: + if isinstance(content, BytesIO): + content.seek(0) + content = content.read() + key = f"{self.prefix}/{subpath}" if self.prefix else str(subpath) + s3_client.put_object(Bucket=self.bucket, Key=key, Body=content) + + def exists(self, subpath: PathLike) -> bool: + key = f"{self.prefix}/{subpath}" if self.prefix else str(subpath) + try: + s3_client.head_object(Bucket=self.bucket, Key=key) + return True + except: + return False + # ... implement other methods -# Deterministic zip output (ZipWriteHandler uses fixed timestamps) +# Deterministic zip output (ZipHandler uses fixed timestamps) # This ensures identical content produces identical zip files -handler = ZipWriteHandler(buf) +handler = DataHandler.create(buf) # All files get timestamp (2020, 1, 1, 0, 0, 0) for reproducibility + +# Automatic mode switching for ZipHandler +handler = DataHandler.create(BytesIO()) +# Handler starts in write mode for empty buffer +handler.write_binary("file1.bin", data1) +# Automatically switches to read mode when needed +content = handler.read_binary("file1.bin") +# Switches back to write mode +handler.write_binary("file2.bin", data2) ``` ## Examples diff --git a/python/examples/extract_reconstruct_example.ipynb b/python/examples/extract_reconstruct_example.ipynb new file mode 100644 index 0000000..ca3c788 --- /dev/null +++ b/python/examples/extract_reconstruct_example.ipynb @@ -0,0 +1,743 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "ba736f7c", + "metadata": {}, + "source": [ + "# Extract and Reconstruct: Scientific Simulation Example\n", + "\n", + "This notebook demonstrates `Packable.extract()` and `reconstruct()` with a realistic scientific computing scenario:\n", + "\n", + "- A CFD simulation with mesh geometry and field data\n", + "- Nested Pydantic classes containing Packables (Mesh)\n", + "- Content-addressable storage for deduplication" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "6f850881", + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "from typing import Optional, Dict, List\n", + "from pydantic import BaseModel, Field, ConfigDict\n", + "from meshly import Mesh, Packable" + ] + }, + { + "cell_type": "markdown", + "id": "d3ae1bf6", + "metadata": {}, + 
"source": [ + "## 1. Define Scientific Data Structures\n", + "\n", + "We'll model a CFD simulation with:\n", + "- `FieldData`: Scalar/vector field on the mesh (temperature, velocity, etc.)\n", + "- `SimulationSnapshot`: A single timestep with mesh + fields\n", + "- `SimulationCase`: Complete case with metadata and multiple snapshots" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "349483ca", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Data structures defined\n" + ] + } + ], + "source": [ + "class FieldData(BaseModel):\n", + " \"\"\"A field defined on mesh nodes or cells.\"\"\"\n", + " model_config = ConfigDict(arbitrary_types_allowed=True)\n", + " \n", + " name: str = Field(..., description=\"Field name (e.g., 'temperature', 'velocity')\")\n", + " field_type: str = Field(..., description=\"'scalar', 'vector', or 'tensor'\")\n", + " location: str = Field(\"node\", description=\"'node' or 'cell' centered\")\n", + " data: np.ndarray = Field(..., description=\"Field values\")\n", + " units: Optional[str] = Field(None, description=\"Physical units\")\n", + "\n", + "\n", + "class SimulationSnapshot(BaseModel):\n", + " \"\"\"A single timestep of simulation data.\n", + " \n", + " Note: This is a regular Pydantic BaseModel (not Packable) that contains\n", + " a Mesh (which IS a Packable). This tests the nested Packable extraction.\n", + " \"\"\"\n", + " model_config = ConfigDict(arbitrary_types_allowed=True)\n", + " \n", + " time: float = Field(..., description=\"Simulation time\")\n", + " iteration: int = Field(..., description=\"Iteration number\")\n", + " mesh: Mesh = Field(..., description=\"Computational mesh\")\n", + " fields: Dict[str, FieldData] = Field(default_factory=dict, description=\"Field data\")\n", + " residuals: Optional[np.ndarray] = Field(None, description=\"Solver residuals\")\n", + "\n", + "\n", + "class SimulationCase(BaseModel):\n", + " \"\"\"Complete simulation case with multiple snapshots.\"\"\"\n", + " model_config = ConfigDict(arbitrary_types_allowed=True)\n", + " \n", + " name: str = Field(..., description=\"Case name\")\n", + " description: str = Field(\"\", description=\"Case description\")\n", + " solver: str = Field(..., description=\"Solver name\")\n", + " parameters: Dict[str, float] = Field(default_factory=dict, description=\"Solver parameters\")\n", + " snapshots: List[SimulationSnapshot] = Field(default_factory=list, description=\"Time snapshots\")\n", + "\n", + "print(\"Data structures defined\")" + ] + }, + { + "cell_type": "markdown", + "id": "bcb88dff", + "metadata": {}, + "source": [ + "## 2. Create Sample Simulation Data\n", + "\n", + "Let's create a simple 2D heat transfer simulation on a quad mesh." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "be109c7d", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Created mesh: 25 vertices, 16 quads\n" + ] + } + ], + "source": [ + "# Create a simple 2D quad mesh (5x5 grid = 25 nodes, 16 quads)\n", + "nx, ny = 5, 5\n", + "x = np.linspace(0, 1, nx)\n", + "y = np.linspace(0, 1, ny)\n", + "xx, yy = np.meshgrid(x, y)\n", + "\n", + "vertices = np.column_stack([xx.ravel(), yy.ravel(), np.zeros(nx * ny)]).astype(np.float32)\n", + "\n", + "# Create quad indices\n", + "quads = []\n", + "for j in range(ny - 1):\n", + " for i in range(nx - 1):\n", + " n0 = j * nx + i\n", + " n1 = n0 + 1\n", + " n2 = n0 + nx + 1\n", + " n3 = n0 + nx\n", + " quads.append([n0, n1, n2, n3])\n", + "\n", + "indices = np.array(quads, dtype=np.uint32)\n", + "\n", + "mesh = Mesh(vertices=vertices, indices=indices)\n", + "print(f\"Created mesh: {mesh.vertex_count} vertices, {len(indices)} quads\")" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "c7588b21", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Created 3 snapshots\n", + " t=0.0: ['temperature', 'velocity']\n", + " t=0.1: ['temperature', 'velocity']\n", + " t=0.2: ['temperature', 'velocity']\n" + ] + } + ], + "source": [ + "# Create simulation snapshots at different times\n", + "def create_snapshot(time: float, iteration: int, mesh: Mesh) -> SimulationSnapshot:\n", + " \"\"\"Create a snapshot with temperature and velocity fields.\"\"\"\n", + " n_nodes = mesh.vertex_count\n", + " coords = mesh.vertices[:, :2] # x, y coordinates\n", + " \n", + " # Temperature: diffusing heat from center\n", + " center = np.array([0.5, 0.5])\n", + " r = np.linalg.norm(coords - center, axis=1)\n", + " temperature = 300 + 100 * np.exp(-r**2 / (0.1 + time)) \n", + " \n", + " # Velocity: rotating flow\n", + " vx = -(coords[:, 1] - 0.5)\n", + " vy = (coords[:, 0] - 0.5)\n", + " velocity = np.column_stack([vx, vy, np.zeros(n_nodes)]).astype(np.float32)\n", + " \n", + " # Residuals (solver convergence)\n", + " residuals = np.array([1e-3 / (iteration + 1), 1e-4 / (iteration + 1)], dtype=np.float32)\n", + " \n", + " return SimulationSnapshot(\n", + " time=time,\n", + " iteration=iteration,\n", + " mesh=mesh,\n", + " fields={\n", + " \"temperature\": FieldData(\n", + " name=\"temperature\",\n", + " field_type=\"scalar\",\n", + " location=\"node\",\n", + " data=temperature.astype(np.float32),\n", + " units=\"K\"\n", + " ),\n", + " \"velocity\": FieldData(\n", + " name=\"velocity\",\n", + " field_type=\"vector\",\n", + " location=\"node\",\n", + " data=velocity,\n", + " units=\"m/s\"\n", + " )\n", + " },\n", + " residuals=residuals\n", + " )\n", + "\n", + "# Create snapshots at t=0, 0.1, 0.2\n", + "snapshots = [\n", + " create_snapshot(0.0, 0, mesh),\n", + " create_snapshot(0.1, 100, mesh),\n", + " create_snapshot(0.2, 200, mesh),\n", + "]\n", + "\n", + "print(f\"Created {len(snapshots)} snapshots\")\n", + "for s in snapshots:\n", + " print(f\" t={s.time}: {list(s.fields.keys())}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "93568d04", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Simulation case: heat_transfer_2d\n", + " Solver: simpleFoam\n", + " Parameters: {'dt': 0.001, 'nu': 1e-05, 'alpha': 0.0001}\n", + " Snapshots: 3\n" + ] + } + ], + "source": [ + "# Create the complete simulation case\n", + "case = SimulationCase(\n", + " 
name=\"heat_transfer_2d\",\n", + " description=\"2D heat transfer with rotating flow\",\n", + " solver=\"simpleFoam\",\n", + " parameters={\n", + " \"dt\": 0.001,\n", + " \"nu\": 1e-5,\n", + " \"alpha\": 1e-4,\n", + " },\n", + " snapshots=snapshots\n", + ")\n", + "\n", + "print(f\"Simulation case: {case.name}\")\n", + "print(f\" Solver: {case.solver}\")\n", + "print(f\" Parameters: {case.parameters}\")\n", + "print(f\" Snapshots: {len(case.snapshots)}\")" + ] + }, + { + "cell_type": "markdown", + "id": "9c7048da", + "metadata": {}, + "source": [ + "## 3. Extract the Simulation Data\n", + "\n", + "`Packable.extract()` recursively processes the nested structure:\n", + "- Arrays → `{\"$ref\": checksum, \"$type\": \"array\"}`\n", + "- Nested Mesh (Packable) → `{\"$ref\": checksum, \"$type\": \"packable\", ...}`" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "95533188", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Extracted data keys: ['name', 'description', 'solver', 'parameters', 'snapshots']\n", + "\n", + "Total assets: 8\n", + "\n", + "Asset sizes:\n", + " 4e71a79c2d0fa381: 1,467 bytes\n", + " 28dc719a0c8c1387: 200 bytes\n", + " 59ffdd6bfac7876a: 250 bytes\n", + " 0c345962a52e7e2c: 133 bytes\n", + " 292cfc23f6777b02: 200 bytes\n", + " 17b38a2f2cbdd0a7: 133 bytes\n", + " 145838c08771e6ef: 201 bytes\n", + " ea37b2590dba4b31: 132 bytes\n" + ] + } + ], + "source": [ + "# Extract the entire simulation case\n", + "extracted = Packable.extract(case)\n", + "\n", + "print(f\"Extracted data keys: {list(extracted.data.keys())}\")\n", + "print(f\"\\nTotal assets: {len(extracted.assets)}\")\n", + "print(f\"\\nAsset sizes:\")\n", + "for checksum, data in extracted.assets.items():\n", + " print(f\" {checksum}: {len(data):,} bytes\")" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "ba82742d", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Extracted data structure:\n", + "{\n", + " \"name\": \"heat_transfer_2d\",\n", + " \"description\": \"2D heat transfer with rotating flow\",\n", + " \"solver\": \"simpleFoam\",\n", + " \"parameters\": {\n", + " \"dt\": 0.001,\n", + " \"nu\": 1e-05,\n", + " \"alpha\": 0.0001\n", + " },\n", + " \"snapshots\": [\n", + " {\n", + " \"time\": 0.0,\n", + " \"iteration\": 0,\n", + " \"mesh\": {\n", + " \"$ref\": \"4e71a79c2d0fa381\"\n", + " },\n", + " \"fields\": {\n", + " \"temperature\": {\n", + " \"name\": \"temperature\",\n", + " \"field_type\": \"scalar\",\n", + " \"location\": \"node\",\n", + " \"data\": {\n", + " \"$ref\": \"28dc719a0c8c1387\"\n", + " },\n", + " \"units\": \"K\"\n", + " },\n", + " \"velocity\": {\n", + " \"name\": \"velocity\",\n", + " \"field_type\": \"vector\",\n", + " \"location\": \"node\",\n", + " \"data\": {\n", + " \"$ref\": \"59ffdd6bfac7876a\"\n", + " },\n", + " \"units\": \"m/s\"\n", + " }\n", + " },\n", + " \"residuals\": {\n", + " \"$ref\": \"0c345962a52e7e2c\"\n", + " }\n", + " },\n", + " {\n", + " \"time\": 0.1,\n", + " \"iteration\": 100,\n", + " \"mesh\": {\n", + " \"$ref\": \"4e71a79c2d0fa381\"\n", + " },\n", + " \"fields\": {\n", + " \"temperature\": {\n", + " \"name\": \"temperature\",\n", + " \"field_type\": \"scalar\",\n", + " \"location\": \"node\",\n", + " \"data\": {\n", + " \"$ref\": \"292cfc23f6777b02\"\n", + " },\n", + " \"units\": \"K\"\n", + " },\n", + " \"velocity\": {\n", + " \"name\": \"velocity\",\n", + " \"field_type\": \"vector\",\n", + " \"location\": \"node\",\n", + " 
\"data\": {\n", + " \"$ref\": \"59ffdd6bfac7876a\"\n", + " },\n", + " \"units\": \"m/s\"\n", + " }\n", + " },\n", + " \"residuals\": {\n", + " \"$ref\": \"17b38a2f2cbdd0a7\"\n", + " }\n", + " },\n", + " {\n", + " \"time\": 0.2,\n", + " \"iteration\": 200,\n", + " \"mesh\": {\n", + " \"$ref\": \"4e71a79c2d0fa381\"\n", + " },\n", + " \"fields\": {\n", + " \"temperature\": {\n", + " \"name\": \"temperature\",\n", + " \"field_type\": \"scalar\",\n", + " \"location\": \"node\",\n", + " \"data\": {\n", + " \"$ref\": \"145838c08771e6ef\"\n", + " },\n", + " \"units\": \"K\"\n", + " },\n", + " \"velocity\": {\n", + " \"name\": \"velocity\",\n", + " \"field_type\": \"vector\",\n", + " \"location\": \"node\",\n", + " \n", + "...\n" + ] + } + ], + "source": [ + "# Examine the extracted data structure\n", + "import json\n", + "\n", + "# Pretty print the extracted data (it's JSON-serializable!)\n", + "print(\"Extracted data structure:\")\n", + "print(json.dumps(extracted.data, indent=2)[:2000] + \"\\n...\")" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "6977cb53", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Mesh reference: {'$ref': '4e71a79c2d0fa381'}\n" + ] + } + ], + "source": [ + "# Look at the first snapshot's mesh reference\n", + "mesh_ref = extracted.data[\"snapshots\"][0][\"mesh\"]\n", + "print(f\"Mesh reference: {mesh_ref}\")\n" + ] + }, + { + "cell_type": "markdown", + "id": "bc82716a", + "metadata": {}, + "source": [ + "## 4. Asset Deduplication\n", + "\n", + "Since all snapshots share the same mesh, it's only stored once!" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "a251ef65", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Mesh checksums: ['4e71a79c2d0fa381', '4e71a79c2d0fa381', '4e71a79c2d0fa381']\n", + "\n", + "All same? True\n", + "\n", + "The mesh is stored only ONCE in assets, saving 2,934 bytes!\n" + ] + } + ], + "source": [ + "# Check mesh references across snapshots\n", + "mesh_refs = [s[\"mesh\"][\"$ref\"] for s in extracted.data[\"snapshots\"]]\n", + "print(f\"Mesh checksums: {mesh_refs}\")\n", + "print(f\"\\nAll same? {len(set(mesh_refs)) == 1}\")\n", + "print(f\"\\nThe mesh is stored only ONCE in assets, saving {(len(mesh_refs)-1) * len(extracted.assets[mesh_refs[0]]):,} bytes!\")" + ] + }, + { + "cell_type": "markdown", + "id": "b732526c", + "metadata": {}, + "source": [ + "## 5. Reconstruct back to SimulationCase" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "5c3761f7", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Reconstructed case: heat_transfer_2d with 3 snapshots\n", + "Decoded mesh from reconstructed case: 25 vertices, 64 indices\n" + ] + } + ], + "source": [ + "reconstructed_case = Packable.reconstruct(SimulationCase, extracted.data, extracted.assets)\n", + "print(f\"\\nReconstructed case: {reconstructed_case.name} with {len(reconstructed_case.snapshots)} snapshots\")\n", + "\n", + "decoded_mesh = Mesh.decode(reconstructed_case.snapshots[0].mesh.encode())\n", + "print(f\"Decoded mesh from reconstructed case: {decoded_mesh.vertex_count} vertices, {len(decoded_mesh.indices)} indices\")" + ] + }, + { + "cell_type": "markdown", + "id": "ccaf56b9", + "metadata": {}, + "source": [ + "## 6. 
Lazy Loading with CachedAssetLoader\n", + "\n", + "When working with large datasets, you may want to:\n", + "- Load assets on-demand (lazy loading)\n", + "- Cache fetched assets to disk for subsequent runs\n", + "\n", + "`Packable.reconstruct()` supports this via `CachedAssetLoader`." + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "ac9c08e1", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "=== Lazy loading with callable ===\n", + "\n", + "LazyModel created, no assets fetched yet. Fetch count: 0\n", + "Type: \n", + "\n", + "Case name: heat_transfer_2d\n", + "Fetch count after accessing name: 0\n" + ] + } + ], + "source": [ + "from pathlib import Path\n", + "from meshly.packable import CachedAssetLoader\n", + "from meshly.data_handler import DataHandler\n", + "\n", + "# Simulate fetching assets from remote storage\n", + "fetch_count = [0]\n", + "\n", + "def fetch_from_storage(checksum: str) -> bytes:\n", + " \"\"\"Simulate fetching from cloud/remote storage.\"\"\"\n", + " fetch_count[0] += 1\n", + " print(f\" Fetching asset {checksum[:8]}... (fetch #{fetch_count[0]})\")\n", + " return extracted.assets[checksum]\n", + "\n", + "# Using a plain callable - lazy loading, assets fetched on field access\n", + "print(\"=== Lazy loading with callable ===\")\n", + "lazy_case = Packable.reconstruct(SimulationCase, extracted.data, fetch_from_storage)\n", + "\n", + "print(f\"\\nLazyModel created, no assets fetched yet. Fetch count: {fetch_count[0]}\")\n", + "print(f\"Type: {type(lazy_case)}\")\n", + "\n", + "# Access primitive fields - no fetch needed\n", + "print(f\"\\nCase name: {lazy_case.name}\")\n", + "print(f\"Fetch count after accessing name: {fetch_count[0]}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "38bd4003", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "=== Accessing first snapshot ===\n", + " Fetching asset 4e71a79c... (fetch #1)\n", + " Fetching asset 28dc719a... (fetch #2)\n", + " Fetching asset 59ffdd6b... (fetch #3)\n", + " Fetching asset 0c345962... (fetch #4)\n", + " Fetching asset 4e71a79c... (fetch #5)\n", + " Fetching asset 292cfc23... (fetch #6)\n", + " Fetching asset 59ffdd6b... (fetch #7)\n", + " Fetching asset 17b38a2f... (fetch #8)\n", + " Fetching asset 4e71a79c... (fetch #9)\n", + " Fetching asset 145838c0... (fetch #10)\n", + " Fetching asset 59ffdd6b... (fetch #11)\n", + " Fetching asset ea37b259... 
(fetch #12)\n", + "Fetch count after accessing snapshots: 12\n", + "\n", + "Snapshot time: 0.0\n", + "Mesh vertices shape: (25, 3)\n", + "\n", + "=== Resolving to full model ===\n", + "Final fetch count: 12\n", + "Resolved type: \n" + ] + } + ], + "source": [ + "# Access a snapshot - this triggers fetching of nested assets\n", + "print(\"=== Accessing first snapshot ===\")\n", + "snapshot = lazy_case.snapshots[0]\n", + "print(f\"Fetch count after accessing snapshots: {fetch_count[0]}\")\n", + "\n", + "# The mesh is fetched when we access it\n", + "print(f\"\\nSnapshot time: {snapshot.time}\")\n", + "print(f\"Mesh vertices shape: {snapshot.mesh.vertices.shape}\")\n", + "\n", + "# To fully resolve and get the actual Pydantic model:\n", + "print(\"\\n=== Resolving to full model ===\")\n", + "resolved_case = lazy_case.resolve()\n", + "print(f\"Final fetch count: {fetch_count[0]}\")\n", + "print(f\"Resolved type: {type(resolved_case)}\")" + ] + }, + { + "cell_type": "markdown", + "id": "46d7b7c0", + "metadata": {}, + "source": [ + "### CachedAssetLoader: Persistent Disk Cache\n", + "\n", + "For repeated access, use `CachedAssetLoader` to cache fetched assets to disk:" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "88d9c7be", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "=== First run: fetching and caching ===\n", + " Fetching asset 4e71a79c... (fetch #1)\n", + " Fetching asset 28dc719a... (fetch #2)\n", + " Fetching asset 59ffdd6b... (fetch #3)\n", + " Fetching asset 0c345962... (fetch #4)\n", + " Fetching asset 292cfc23... (fetch #5)\n", + " Fetching asset 17b38a2f... (fetch #6)\n", + " Fetching asset 145838c0... (fetch #7)\n", + " Fetching asset ea37b259... (fetch #8)\n", + "Assets fetched: 8\n", + "\n", + "=== Second run: reading from cache ===\n", + "Assets fetched from remote: 0 (all served from cache!)\n", + "Resolved case: heat_transfer_2d with 3 snapshots\n" + ] + } + ], + "source": [ + "import tempfile\n", + "\n", + "# Reset fetch counter\n", + "fetch_count[0] = 0\n", + "\n", + "with tempfile.TemporaryDirectory() as tmpdir:\n", + " cache_path = Path(tmpdir) / \"asset_cache\"\n", + " \n", + " # Create cache handler and loader\n", + " cache_handler = DataHandler.create(cache_path)\n", + " loader = CachedAssetLoader(fetch=fetch_from_storage, cache=cache_handler)\n", + " \n", + " print(\"=== First run: fetching and caching ===\")\n", + " lazy1 = Packable.reconstruct(SimulationCase, extracted.data, loader)\n", + " _ = lazy1.resolve() # Fetch all assets\n", + " print(f\"Assets fetched: {fetch_count[0]}\")\n", + " \n", + " # Finalize to persist cache\n", + " cache_handler.finalize()\n", + " \n", + " # Second run with same cache location\n", + " print(\"\\n=== Second run: reading from cache ===\")\n", + " fetch_count[0] = 0\n", + " cache_handler2 = DataHandler.create(cache_path)\n", + " loader2 = CachedAssetLoader(fetch=fetch_from_storage, cache=cache_handler2)\n", + " \n", + " lazy2 = Packable.reconstruct(SimulationCase, extracted.data, loader2)\n", + " resolved2 = lazy2.resolve()\n", + " print(f\"Assets fetched from remote: {fetch_count[0]} (all served from cache!)\")\n", + " print(f\"Resolved case: {resolved2.name} with {len(resolved2.snapshots)} snapshots\")" + ] + }, + { + "cell_type": "markdown", + "id": "1a54dcde", + "metadata": {}, + "source": [ + "## Summary\n", + "\n", + "`Packable.extract()` is a **static method** that handles:\n", + "\n", + "| Input | Handling |\n", + "|-------|----------|\n", + "| 
Top-level Packable | Expands fields, arrays → refs |\n", + "| Nested Packable (in dict/list/BaseModel) | Becomes `{\"$ref\": ..., \"$type\": \"packable\"}` |\n", + "| NumPy arrays | Becomes `{\"$ref\": ..., \"$type\": \"array\"}` |\n", + "| BaseModel | Preserves structure with `__model_class__` |\n", + "| Primitives | Passed through unchanged |\n", + "\n", + "`Packable.reconstruct()` supports three modes:\n", + "\n", + "| AssetProvider | Result | Use Case |\n", + "|--------------|--------|----------|\n", + "| `Dict[str, bytes]` | `TModel` | Eager loading, all assets in memory |\n", + "| `AssetFetcher` | `LazyModel[TModel]` | Lazy per-field loading |\n", + "| `CachedAssetLoader` | `LazyModel[TModel]` | Lazy loading with disk cache |\n", + "\n", + "Key benefits for scientific computing:\n", + "- **Deduplication**: Shared meshes/arrays stored once\n", + "- **Lazy loading**: Load only the fields you need with `LazyModel`\n", + "- **Persistent caching**: `CachedAssetLoader` caches fetched assets to disk\n", + "- **JSON metadata**: Easy to query/index simulation cases\n", + "- **Version control friendly**: Small metadata files, large binary assets" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.11" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/python/examples/mesh_example.ipynb b/python/examples/mesh_example.ipynb index 31ea71e..cd1a3b2 100644 --- a/python/examples/mesh_example.ipynb +++ b/python/examples/mesh_example.ipynb @@ -97,13 +97,6 @@ " materials: dict[str, MaterialProperties] = Field(\n", " default_factory=dict,\n", " description=\"Dictionary of material name to MaterialProperties (BaseModel with arrays)\"\n", - " )\n", - " \n", - " # Nested Packable field - uses its own encode/decode methods\n", - " # This demonstrates automatic handling of Packable fields within other Packables\n", - " physics: Optional[PhysicsProperties] = Field(\n", - " None,\n", - " description=\"Physics properties as a nested Packable\"\n", " )" ] }, @@ -128,8 +121,7 @@ "Mesh created with 8 vertices and 36 indices\n", "Material name: cube_material\n", "Tags: ['cube', 'example']\n", - "Materials (BaseModel dict): ['cube_material', 'secondary_material']\n", - "Physics (nested Packable): mass=2.5, friction=0.7\n" + "Materials (BaseModel dict): ['cube_material', 'secondary_material']\n" ] } ], @@ -237,8 +229,7 @@ "print(f\"Mesh created with {mesh.vertex_count} vertices and {mesh.index_count} indices\")\n", "print(f\"Material name: {mesh.material_name}\")\n", "print(f\"Tags: {mesh.tags}\")\n", - "print(f\"Materials (BaseModel dict): {list(mesh.materials.keys())}\")\n", - "print(f\"Physics (nested Packable): mass={mesh.physics.mass}, friction={mesh.physics.friction}\")" + "print(f\"Materials (BaseModel dict): {list(mesh.materials.keys())}\")" ] }, { @@ -297,7 +288,8 @@ "name": "stdout", "output_type": "stream", "text": [ - "Saved mesh to textured_cube.zip, file size: 7695 bytes\n" + "Saved mesh to textured_cube.zip has 8 vertices and 36 indices\n", + "Decoded mesh has 8 vertices and 36 indices\n" ] } ], @@ -305,7 +297,12 @@ "# Save the mesh to a zip file\n", "zip_path = \"textured_cube.zip\"\n", "mesh.save_to_zip(zip_path)\n", - "print(f\"Saved mesh to {zip_path}, file size: 
{os.path.getsize(zip_path)} bytes\")" + "assert os.path.exists(zip_path)\n", + "print(f\"Saved mesh to {zip_path} has {mesh.vertex_count} vertices and {mesh.index_count} indices\")\n", + "\n", + "\n", + "decoded_mesh = Mesh.decode(mesh.encode())\n", + "print(f\"Decoded mesh has {decoded_mesh.vertex_count} vertices and {decoded_mesh.index_count} indices\")" ] }, { @@ -346,20 +343,7 @@ " type: MaterialProperties\n", " diffuse: [0.2 0.8 0.2]\n", " specular: [0.3 0.3 0.3]\n", - " shininess: 16.0\n", - "\n", - "--- Nested Packable edge case ---\n", - "Physics type: PhysicsProperties\n", - "Physics mass: 2.5\n", - "Physics friction: 0.7\n", - "Physics inertia_tensor:\n", - "[[0.1 0. 0. ]\n", - " [0. 0.1 0. ]\n", - " [0. 0. 0.1]]\n", - "Physics collision_points:\n", - "[[-0.5 -0.5 -0.5]\n", - " [ 0.5 0.5 0.5]\n", - " [ 0. 0. 0. ]]\n" + " shininess: 16.0\n" ] } ], @@ -384,15 +368,7 @@ " print(f\" type: {type(mat).__name__}\")\n", " print(f\" diffuse: {mat.diffuse}\")\n", " print(f\" specular: {mat.specular}\")\n", - " print(f\" shininess: {mat.shininess}\")\n", - "\n", - "# Verify the nested Packable was loaded correctly\n", - "print(f\"\\n--- Nested Packable edge case ---\")\n", - "print(f\"Physics type: {type(loaded_mesh.physics).__name__}\")\n", - "print(f\"Physics mass: {loaded_mesh.physics.mass}\")\n", - "print(f\"Physics friction: {loaded_mesh.physics.friction}\")\n", - "print(f\"Physics inertia_tensor:\\n{loaded_mesh.physics.inertia_tensor}\")\n", - "print(f\"Physics collision_points:\\n{loaded_mesh.physics.collision_points}\")" + " print(f\" shininess: {mat.shininess}\")\n" ] }, { @@ -469,7 +445,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "Saved skinned mesh to skinned_cube.zip, file size: 2562 bytes\n", + "Saved skinned mesh to skinned_cube.zip, file size: 2475 bytes\n", "\n", "Loaded skinned mesh: 8 vertices, 36 indices\n", "Skeleton name: human_skeleton\n", @@ -533,85 +509,13 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## 9. Using Cache for Nested Packables\n", + "## 9. Using Callbacks for Nested Packables\n", "\n", - "When working with meshes that contain nested Packables (like our `TexturedMesh` with `PhysicsProperties`), you can use caching to deduplicate shared data and reduce file sizes. The cache uses SHA256 hashes for content-addressable storage." - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Cache directory: /tmp/tmpkflwgz8v\n", - "\n", - "Cached files (1 items):\n", - " 210dc1059e9d5af349f0dad45dbbdc8797eb82b49e7a3443528337e33ce60854.zip: 1157 bytes\n", - "\n", - "Original zip size: 0 bytes\n", - "Cached zip size: 6505 bytes\n", - "\n", - "--- Loaded from cache ---\n", - "Physics type: PhysicsProperties\n", - "Physics mass: 2.5\n", - "Physics friction: 0.7\n", - "Physics inertia_tensor:\n", - "[[0.1 0. 0. ]\n", - " [0. 0.1 0. ]\n", - " [0. 0. 
0.1]]\n", - "\n", - "Removed textured_cube_cached.zip\n" - ] - } - ], - "source": [ - "import tempfile\n", - "from meshly import ReadHandler, WriteHandler\n", + "When working with meshes that contain nested Packables (like our `TexturedMesh` with `PhysicsProperties`), you can use callbacks to implement custom caching, storage, or deduplication strategies.\n", "\n", - "# Create a temporary cache directory\n", - "with tempfile.TemporaryDirectory() as cache_dir:\n", - " print(f\"Cache directory: {cache_dir}\")\n", - " \n", - " # Create cache saver and loader using the handler factory methods\n", - " cache_saver = WriteHandler.create_cache_saver(cache_dir)\n", - " cache_loader = ReadHandler.create_cache_loader(cache_dir)\n", - " \n", - " # Save the mesh with caching - nested PhysicsProperties will be cached separately\n", - " cached_zip_path = \"textured_cube_cached.zip\"\n", - " mesh.save_to_zip(cached_zip_path, cache_saver=cache_saver)\n", - " \n", - " # Check what was cached\n", - " import os\n", - " cache_files = os.listdir(cache_dir)\n", - " print(f\"\\nCached files ({len(cache_files)} items):\")\n", - " for f in cache_files:\n", - " file_path = os.path.join(cache_dir, f)\n", - " print(f\" {f}: {os.path.getsize(file_path)} bytes\")\n", - " \n", - " # Compare file sizes\n", - " original_size = os.path.getsize(zip_path) if os.path.exists(zip_path) else 0\n", - " cached_size = os.path.getsize(cached_zip_path)\n", - " print(f\"\\nOriginal zip size: {original_size} bytes\")\n", - " print(f\"Cached zip size: {cached_size} bytes\")\n", - " \n", - " # Load the mesh back using the cache\n", - " loaded_cached_mesh = TexturedMesh.load_from_zip(cached_zip_path, cache_loader=cache_loader)\n", - " \n", - " # Verify the nested Packable was loaded correctly from cache\n", - " print(f\"\\n--- Loaded from cache ---\")\n", - " print(f\"Physics type: {type(loaded_cached_mesh.physics).__name__}\")\n", - " print(f\"Physics mass: {loaded_cached_mesh.physics.mass}\")\n", - " print(f\"Physics friction: {loaded_cached_mesh.physics.friction}\")\n", - " print(f\"Physics inertia_tensor:\\n{loaded_cached_mesh.physics.inertia_tensor}\")\n", - " \n", - " # Clean up\n", - " if os.path.exists(cached_zip_path):\n", - " os.remove(cached_zip_path)\n", - " print(f\"\\nRemoved {cached_zip_path}\")" + "The callback types are:\n", + "- **`on_packable` (save)**: `Callable[[Packable, str], None]` - called with `(packable, checksum)` when saving\n", + "- **`on_packable` (load)**: `Callable[[Type[Packable], str], Optional[Packable]]` - called with `(packable_type, checksum)` when loading; return `None` to fall back to embedded data" ] }, { @@ -623,78 +527,6 @@ "When multiple meshes share the same nested Packable data, the cache automatically deduplicates them using SHA256 hashes." 
] }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "Cache directory for deduplication: /workspaces/meshly/cache\n", - "Cache entries: 1 (both meshes share the same physics cache)\n", - "\n", - "Mesh1 material: mesh1, physics mass: 1.0\n", - "Mesh2 material: mesh2, physics mass: 1.0\n" - ] - } - ], - "source": [ - "# Demonstrate cache deduplication - two meshes with identical physics properties\n", - "with tempfile.TemporaryDirectory() as cache_dir:\n", - " print(f\"\\nCache directory for deduplication: {cache_dir}\")\n", - " cache_saver = WriteHandler.create_cache_saver(cache_dir)\n", - " cache_loader = ReadHandler.create_cache_loader(cache_dir)\n", - " \n", - " # Create two meshes with identical physics (will share cache entry)\n", - " shared_physics = PhysicsProperties(\n", - " mass=1.0,\n", - " friction=0.5,\n", - " inertia_tensor=np.eye(3, dtype=np.float32),\n", - " collision_points=np.array([[0, 0, 0]], dtype=np.float32)\n", - " )\n", - " \n", - " mesh1 = TexturedMesh(\n", - " vertices=vertices,\n", - " indices=indices,\n", - " texture_coords=texture_coords,\n", - " normals=normals,\n", - " material_name=\"mesh1\",\n", - " physics=shared_physics\n", - " )\n", - " \n", - " mesh2 = TexturedMesh(\n", - " vertices=vertices * 2, # Different vertices\n", - " indices=indices,\n", - " texture_coords=texture_coords,\n", - " normals=normals,\n", - " material_name=\"mesh2\",\n", - " physics=shared_physics # Same physics - will be deduplicated!\n", - " )\n", - " \n", - " # Save both meshes with the same cache\n", - " mesh1.save_to_zip(\"mesh1.zip\", cache_saver=cache_saver)\n", - " mesh2.save_to_zip(\"mesh2.zip\", cache_saver=cache_saver)\n", - " \n", - " # Check the cache - should only have 1 entry (shared physics)\n", - " cache_files = os.listdir(cache_dir)\n", - " print(f\"Cache entries: {len(cache_files)} (both meshes share the same physics cache)\")\n", - " \n", - " # Load both meshes\n", - " loaded1 = TexturedMesh.load_from_zip(\"mesh1.zip\", cache_loader=cache_loader)\n", - " loaded2 = TexturedMesh.load_from_zip(\"mesh2.zip\", cache_loader=cache_loader)\n", - " \n", - " print(f\"\\nMesh1 material: {loaded1.material_name}, physics mass: {loaded1.physics.mass}\")\n", - " print(f\"Mesh2 material: {loaded2.material_name}, physics mass: {loaded2.physics.mass}\")\n", - " \n", - " # Clean up\n", - " for f in [\"mesh1.zip\", \"mesh2.zip\"]:\n", - " if os.path.exists(f):\n", - " os.remove(f)" - ] - }, { "cell_type": "markdown", "metadata": {}, @@ -704,7 +536,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 10, "metadata": {}, "outputs": [ { @@ -734,10 +566,7 @@ "This notebook demonstrated:\n", "- Creating custom Mesh subclasses with additional numpy arrays\n", "- Working with nested dictionaries containing arrays\n", - "- Using BaseModel instances with arrays inside dictionaries\n", - "- **Nested Packables** - fields that are themselves Packable classes\n", - "- **Cache support** - using `WriteHandler.create_cache_saver()` and `ReadHandler.create_cache_loader()` for content-addressable storage\n", - "- **Deduplication** - identical nested Packables share the same cache entry" + "- Using BaseModel instances with arrays inside dictionaries" ] } ], diff --git a/python/meshly/__init__.py b/python/meshly/__init__.py index ab5ed42..a3c8d8d 100644 --- a/python/meshly/__init__.py +++ b/python/meshly/__init__.py @@ -16,6 +16,7 @@ from .packable import ( Packable, 
PackableMetadata, + SerializedPackableData, ) from .mesh import ( @@ -42,6 +43,8 @@ ) from .data_handler import ( + AssetProvider, + CachedAssetLoader, DataHandler, ) @@ -50,8 +53,11 @@ # Packable base class "Packable", "PackableMetadata", + "SerializedPackableData", "ArrayType", # Data handlers + "AssetProvider", + "CachedAssetLoader", "DataHandler", # Mesh classes "Mesh", diff --git a/python/meshly/array.py b/python/meshly/array.py index 0aefe2c..4bccdab 100644 --- a/python/meshly/array.py +++ b/python/meshly/array.py @@ -12,7 +12,7 @@ from pydantic import BaseModel, Field from meshoptimizer._loader import lib -from .data_handler import DataHandler, ZipBuffer +from .data_handler import DataHandler from .common import PathLike # Optional JAX support @@ -374,8 +374,8 @@ def save_to_zip( """ encoded = ArrayUtils.encode_array(array) - zip_buffer = ZipBuffer() - handler = WriteHandler.create_handler(zip_buffer) + zip_buffer = BytesIO() + handler = DataHandler.create(zip_buffer) ArrayUtils.save_array(handler, "array", encoded) handler.finalize() @@ -403,9 +403,9 @@ def load_from_zip( """ if isinstance(source, BytesIO): source.seek(0) - handler = DataHandler.create(ZipBuffer(source.read())) + handler = DataHandler.create(BytesIO(source.read())) else: with open(source, "rb") as f: - handler = DataHandler.create(ZipBuffer(f.read())) + handler = DataHandler.create(BytesIO(f.read())) return ArrayUtils.load_array(handler, "array", array_type) diff --git a/python/meshly/data_handler.py b/python/meshly/data_handler.py index cb3c515..69c1cfa 100644 --- a/python/meshly/data_handler.py +++ b/python/meshly/data_handler.py @@ -1,18 +1,46 @@ import stat -from typing import Callable, List, Optional, Union +from dataclasses import dataclass +from typing import Callable, Dict, List, Optional, Union import zipfile from io import BytesIO from pathlib import Path from abc import abstractmethod from .common import PathLike +HandlerSource = Union[PathLike, BytesIO] -ZipBuffer = BytesIO +# Type for asset provider: either a dict or a callable that fetches by checksum +AssetFetcher = Callable[[str], bytes] +AssetProvider = Union[Dict[str, bytes], AssetFetcher, "CachedAssetLoader"] -HandlerSource = Union[PathLike, ZipBuffer] +@dataclass +class CachedAssetLoader: + """Asset loader with optional disk cache for persistence. + + Wraps a callable asset fetcher with a DataHandler for caching. + Fetched assets are stored as 'assets/{checksum}.bin' and read + from cache on subsequent access. 
+ + Example: + def fetch_from_cloud(checksum: str) -> bytes: + return cloud_storage.download(checksum) + + # Create loader with disk cache + cache = DataHandler.create(Path("./cache")) + loader = CachedAssetLoader(fetch_from_cloud, cache) + + lazy = Packable.reconstruct(SimulationCase, data, loader) + """ + fetch: AssetFetcher + """Callable that fetches asset bytes by checksum""" + cache: "DataHandler" + """DataHandler for caching fetched assets""" + + +class DataHandler: + """Protocol for reading and writing files to various sources.""" -class BaseDataHandler: rel_path: str def resolved_path(self, subpath: PathLike) -> Path: @@ -21,10 +49,6 @@ def resolved_path(self, subpath: PathLike) -> Path: return Path(str(subpath)) return Path(f"{self.rel_path}/{subpath}") - -class DataHandler(BaseDataHandler): - """Protocol for reading and writing files to various sources.""" - def __init__(self, source: HandlerSource, rel_path=""): self.source = source self.rel_path = rel_path @@ -32,34 +56,39 @@ def __init__(self, source: HandlerSource, rel_path=""): @abstractmethod def read_text(self, subpath: PathLike, encoding: str = "utf-8") -> str: """Read text content from a file.""" - ... + raise NotImplementedError @abstractmethod def read_binary(self, subpath: PathLike) -> bytes: """Read binary content from a file.""" - ... + raise NotImplementedError @abstractmethod def write_text(self, subpath: PathLike, content: str, executable: bool = False) -> None: """Write text content to a file.""" - ... + raise NotImplementedError @abstractmethod def write_binary(self, subpath: PathLike, content: Union[bytes, BytesIO], executable: bool = False) -> None: """Write binary content to a file.""" - ... + raise NotImplementedError @abstractmethod def list_files(self, subpath: PathLike = "", recursive: bool = False) -> List[Path]: """List files in the given subpath.""" - ... + raise NotImplementedError @abstractmethod def exists(self, subpath: PathLike) -> bool: """Check if a file exists.""" - ... 
+ raise NotImplementedError - def to_path(self, rel_path: str): + @abstractmethod + def remove_file(self, subpath: PathLike) -> None: + """Remove a file.""" + raise NotImplementedError + + def to_path(self, rel_path: str) -> "DataHandler": """Get a handler with a nested relative path.""" return DataHandler.create(self.source, f"{self.rel_path}/{rel_path}" if self.rel_path != "" else rel_path, self) @@ -75,7 +104,7 @@ def create(source: HandlerSource, rel_path="", existing_handler: Optional["DataH Returns: Handler implementation """ - if isinstance(source, ZipBuffer): + if isinstance(source, BytesIO): return ZipHandler( source, rel_path, @@ -89,6 +118,15 @@ def finalize(self): """Close any resources if needed.""" pass + def __enter__(self): + """Enter context manager.""" + return self + + def __exit__(self, exc_type, exc_val, exc_tb): + """Exit context manager, calling finalize().""" + self.finalize() + return False + class FileHandler(DataHandler): """Handler for reading and writing files on the regular file system.""" @@ -141,6 +179,11 @@ def exists(self, subpath: PathLike) -> bool: full_path = self.source / self.resolved_path(subpath) return full_path.exists() + def remove_file(self, subpath: PathLike) -> None: + full_path = self.source / self.resolved_path(subpath) + if full_path.exists(): + full_path.unlink() + class ZipHandler(DataHandler): """Handler for reading and writing files in zip archives.""" @@ -218,10 +261,12 @@ def exists(self, subpath: PathLike) -> bool: except KeyError: return False + def remove_file(self, subpath: PathLike) -> None: + # Note: zipfile doesn't support removing files directly. + # This would require recreating the zip without the file. + raise NotImplementedError("ZipHandler does not support removing files") + def finalize(self): """Close the zip file.""" if hasattr(self, 'zip_file') and self.zip_file: self.zip_file.close() - - -ZipBuffer = BytesIO diff --git a/python/meshly/packable.py b/python/meshly/packable.py index 0d3fe0f..186c81d 100644 --- a/python/meshly/packable.py +++ b/python/meshly/packable.py @@ -6,12 +6,17 @@ Custom data classes can inherit from Packable to store simulation results, time-series data, or any structured data with numpy arrays. + +Packables cannot contain nested Packables. For composite structures, +use the extract() and reconstruct() methods to handle asset management. 
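Since direct Packable fields now raise `TypeError`, a composite case is typically expressed as a typed dict of Packables and round-tripped through `extract()`/`reconstruct()`. A minimal sketch of that pattern; the `FieldSnapshot` and `Study` classes are illustrative, not part of the library:

```python
from typing import Dict

import numpy as np
from pydantic import Field

from meshly import Packable


class FieldSnapshot(Packable):
    """Illustrative leaf Packable holding one array."""
    name: str
    values: np.ndarray


class Study(Packable):
    """Illustrative composite: Packables nested inside a typed dict are allowed."""
    label: str
    snapshots: Dict[str, FieldSnapshot] = Field(default_factory=dict)


study = Study(
    label="demo",
    snapshots={"t0": FieldSnapshot(name="pressure",
                                   values=np.array([1.0, 2.0], dtype=np.float32))},
)

# extract() replaces each nested Packable/array with a {"$ref": checksum} entry
extracted = Packable.extract(study)
rebuilt = Packable.reconstruct(Study, extracted.data, extracted.assets)
assert rebuilt.snapshots["t0"].name == "pressure"
```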
""" import hashlib import json -from dataclasses import dataclass +from dataclasses import dataclass, field +from functools import cached_property from io import BytesIO +from pathlib import Path from typing import ( Callable, Dict, @@ -23,23 +28,18 @@ TypeVar, Union, ) -from pydantic import BaseModel, Field +from pydantic import BaseModel, Field, computed_field from .array import ArrayUtils, ArrayType, Array from .common import PathLike -from .data_handler import DataHandler, ZipBuffer +from .data_handler import AssetProvider, CachedAssetLoader, DataHandler + +TModel = TypeVar("TModel", bound=BaseModel) class PackableMetadata(BaseModel): """Metadata for a Packable saved to zip.""" - class_name: str = Field(..., description="Name of the data class") - module_name: str = Field(..., - description="Module containing the data class") field_data: Dict[str, Any] = Field( default_factory=dict, description="Non-array field values") - packable_refs: Dict[str, str] = Field( - default_factory=dict, - description="SHA256 hashes for cached packable fields (field_name -> hash)" - ) TPackableMetadata = TypeVar("TPackableMetadata", bound=PackableMetadata) @@ -47,6 +47,158 @@ class PackableMetadata(BaseModel): FieldValue = TypeVar("FieldValue") # Value type for custom fields +@dataclass +class SerializedPackableData: + """Result of extracting a Packable for serialization. + + Contains the serializable data dict with checksum references, + plus the encoded assets (arrays as bytes). + """ + data: Dict[str, Any] + """Serializable dict with primitive fields and checksum refs for arrays""" + assets: Dict[str, bytes] + """Map of checksum -> encoded bytes for all arrays""" + + +class LazyModel(Generic[TModel]): + """ + Lazy proxy for a Pydantic BaseModel that defers asset loading until field access. + + Fields containing $ref references are not resolved until accessed, + allowing for truly lazy loading from external storage. 
+ + Example: + def fetch_asset(checksum: str) -> bytes: + return cloud_storage.download(checksum) + + lazy = Packable.reconstruct(SimulationCase, data, fetch_asset) + # No assets loaded yet + + temp = lazy.temperature # NOW the temperature asset is fetched + vel = lazy.velocity # NOW the velocity asset is fetched + + # With a cache handler for persistence: + cache = DataHandler.create(Path("./cache")) + loader = CachedAssetLoader(fetch_asset, cache) + lazy = Packable.reconstruct(SimulationCase, data, loader) + """ + + __slots__ = ('_model_class', '_data', '_assets', '_array_type', '_cache', '_resolved') + + def __init__( + self, + model_class: Type[TModel], + data: Dict[str, Any], + assets: AssetProvider, + array_type: Optional[ArrayType] = None, + ): + object.__setattr__(self, '_model_class', model_class) + object.__setattr__(self, '_data', data) + object.__setattr__(self, '_assets', assets) + object.__setattr__(self, '_array_type', array_type) + object.__setattr__(self, '_cache', {}) + object.__setattr__(self, '_resolved', None) + + def _get_cached_asset(self, checksum: str) -> bytes: + """Get asset bytes, using cache if CachedAssetLoader is provided.""" + assets = object.__getattribute__(self, '_assets') + + # Handle CachedAssetLoader + if isinstance(assets, CachedAssetLoader): + cache_path = f"assets/{checksum}.bin" + + # Try to read from cache first + try: + return assets.cache.read_binary(cache_path) + except (KeyError, FileNotFoundError): + pass + + # Fetch from provider + asset_bytes = assets.fetch(checksum) + + # Store in cache + assets.cache.write_binary(cache_path, asset_bytes) + return asset_bytes + + # Handle plain callable + if callable(assets): + return assets(checksum) + + # Handle dict + if checksum not in assets: + raise KeyError(f"Missing asset with checksum '{checksum}'") + return assets[checksum] + + def __getattr__(self, name: str) -> Any: + # Check cache first + cache = object.__getattribute__(self, '_cache') + if name in cache: + return cache[name] + + model_class = object.__getattribute__(self, '_model_class') + data = object.__getattribute__(self, '_data') + array_type = object.__getattribute__(self, '_array_type') + + # Check if it's a model field + if name not in model_class.model_fields: + raise AttributeError(f"'{model_class.__name__}' has no attribute '{name}'") + + if name not in data: + return None + + field_value = data[name] + field_type = model_class.model_fields[name].annotation + + # Resolve this specific field using our caching asset getter + resolved = Packable._resolve_value_with_type( + field_value, field_type, self._get_cached_asset, array_type + ) + + # Cache the resolved value + cache[name] = resolved + return resolved + + def __setattr__(self, name: str, value: Any) -> None: + raise AttributeError("LazyModel is read-only. Use resolve() to get a mutable model.") + + def resolve(self) -> TModel: + """ + Fully resolve all fields and return the actual Pydantic model. + + This will fetch all remaining assets that haven't been accessed yet. 
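As a usage sketch of the lazy path described above: the `SimulationResult` class mirrors the docstring example, and the in-memory `storage` dict stands in for an external asset store.

```python
import numpy as np

from meshly import Packable


class SimulationResult(Packable):
    """Illustrative Packable, matching the docstring example."""
    time: float
    temperature: np.ndarray


result = SimulationResult(
    time=0.5,
    temperature=np.array([300.0, 301.0], dtype=np.float32),
)
extracted = Packable.extract(result)
storage = dict(extracted.assets)  # stand-in for an external asset store

# A callable provider yields a LazyModel; nothing is fetched yet.
lazy = Packable.reconstruct(SimulationResult, extracted.data, lambda c: storage[c])
print(lazy.time)            # primitive field, no asset fetch
temp = lazy.temperature     # fetches and caches exactly this asset
full = lazy.resolve()       # fetches whatever is still pending
assert isinstance(full, SimulationResult)
```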
+ """ + resolved = object.__getattribute__(self, '_resolved') + if resolved is not None: + return resolved + + model_class = object.__getattribute__(self, '_model_class') + data = object.__getattribute__(self, '_data') + array_type = object.__getattribute__(self, '_array_type') + cache = object.__getattribute__(self, '_cache') + + # Resolve all fields, using cache where available + resolved_data = {} + for field_name, field_info in model_class.model_fields.items(): + if field_name in cache: + resolved_data[field_name] = cache[field_name] + elif field_name in data: + resolved_data[field_name] = Packable._resolve_value_with_type( + data[field_name], field_info.annotation, self._get_cached_asset, array_type + ) + + result = model_class(**resolved_data) + object.__setattr__(self, '_resolved', result) + return result + + def __repr__(self) -> str: + model_class = object.__getattribute__(self, '_model_class') + cache = object.__getattribute__(self, '_cache') + data = object.__getattribute__(self, '_data') + loaded = list(cache.keys()) + pending = [k for k in data.keys() if k not in cache] + return f"LazyModel[{model_class.__name__}](loaded={loaded}, pending={pending})" + + @dataclass class CustomFieldConfig(Generic[FieldValue, TPackableMetadata]): """Configuration for custom field encoding/decoding.""" @@ -69,6 +221,10 @@ class Packable(BaseModel): detected, encoded, and saved to zip files. Non-array fields are preserved in metadata. + Packables cannot contain nested Packables. For composite structures, + use extract() to get a serializable dict with asset references, and + reconstruct() to rebuild from the dict and assets. + Example: class SimulationResult(Packable): time: float @@ -81,12 +237,55 @@ class SimulationResult(Packable): velocity=np.zeros((3, 3)) ) result.save_to_zip("result.zip") + + # Load using the specific class loaded = SimulationResult.load_from_zip("result.zip") + + # Or use extract/reconstruct for custom asset management + extracted = result.extract() + # extracted.data contains {"time": 0.1, "temperature": {"$ref": "abc123"}, ...} + # extracted.assets contains {"abc123": , ...} + rebuilt = SimulationResult.reconstruct(extracted.data, extracted.assets) """ class Config: arbitrary_types_allowed = True + def __init__(self, **data): + super().__init__(**data) + self._validate_no_direct_packable_fields() + + def _validate_no_direct_packable_fields(self) -> None: + """Validate that this Packable has no direct Packable fields. + + Packables nested inside dicts or other BaseModels are allowed and will + be handled by extract(). Only direct Packable fields are prohibited. + """ + for field_name in type(self).model_fields: + if field_name in self.__private_attributes__: + continue + value = getattr(self, field_name, None) + if value is None: + continue + + # Only reject direct Packable fields + if isinstance(value, Packable): + raise TypeError( + f"Direct Packable fields are not allowed. Field '{field_name}' " + f"contains a {type(value).__name__}. Packables can be nested " + "inside dicts or other BaseModels, and extract() will handle them." + ) + + @computed_field + @cached_property + def checksum(self) -> str: + """ + Compute SHA256 checksum of the encoded content. 
+ Returns: + 16-character hex string (first 64 bits of SHA256) + """ + return hashlib.sha256(self.encode()).hexdigest()[:16] + @property def array_fields(self) -> Set[str]: """Get all array field paths, including nested arrays in dicts/BaseModels.""" @@ -127,8 +326,6 @@ def _create_metadata(self, field_data: Dict[str, Any]) -> PackableMetadata: PackableMetadata (or subclass) instance """ return PackableMetadata( - class_name=self.__class__.__name__, - module_name=self.__class__.__module__, field_data=field_data, ) @@ -147,50 +344,32 @@ def load_metadata( Returns: Metadata object of the specified type - - Raises: - ValueError: If class name doesn't match """ metadata_text = handler.read_text("metadata.json") metadata_dict = json.loads(metadata_text) - metadata = metadata_cls(**metadata_dict) - - if metadata.class_name != cls.__name__ or metadata.module_name != cls.__module__: - raise ValueError( - f"Class mismatch: expected {cls.__name__} but got {metadata.class_name} from {metadata.module_name}" - ) - - return metadata + return metadata_cls(**metadata_dict) def save_to_zip( self, destination: Union[PathLike, BytesIO], - cache_handler: Optional[DataHandler] = None, ) -> None: """ Save this container to a zip file. Args: destination: Path to the output zip file or BytesIO buffer - cache_handler: Optional DataHandler for caching nested Packables. - When provided, nested Packable fields are saved via - cache_handler.write_binary() and only hash - references are stored in the parent zip. This enables - deduplication and smaller parent files. """ - encoded = self.encode(cache_handler=cache_handler) + encoded = self.encode() if isinstance(destination, BytesIO): destination.write(encoded) else: - with open(destination, "wb") as f: - f.write(encoded) + Path(destination).write_bytes(encoded) @classmethod def load_from_zip( cls: Type[TPackable], source: Union[PathLike, BytesIO], array_type: Optional[ArrayType] = None, - cache_handler: Optional[DataHandler] = None, ) -> TPackable: """ Load a Packable from a zip file. @@ -200,20 +379,22 @@ def load_from_zip( array_type: Array backend to use ("numpy" or "jax"). If None (default), uses the array_type stored in each array's metadata, preserving the original array types that were saved. - cache_handler: Optional Handler to load nested Packables from cache. - When the zip contains hash references (packable_refs), - cache_handler.read_binary() is called to retrieve - cached bytes. 
Returns: Loaded Packable instance + + Raises: + TypeError: If called on base Packable class instead of a subclass + + Example: + mesh = Mesh.load_from_zip("mesh.zip") """ if isinstance(source, BytesIO): source.seek(0) - return cls.decode(source.read(), array_type, cache_handler) + return cls.decode(source.read(), array_type) else: with open(source, "rb") as f: - return cls.decode(f.read(), array_type, cache_handler) + return cls.decode(f.read(), array_type) @classmethod def _get_custom_fields(cls) -> Dict[str, CustomFieldConfig]: @@ -232,42 +413,6 @@ def _get_custom_field_names(cls) -> Set[str]: """Get set of field names that have custom encoding/decoding.""" return set(cls._get_custom_fields().keys()) - def _get_packable_fields(self) -> Dict[str, "Packable"]: - """Get fields that are Packable instances (excluding self).""" - packable_fields = {} - for field_name in type(self).model_fields: - if field_name in self.__private_attributes__: - continue - value = getattr(self, field_name, None) - if value is not None and isinstance(value, Packable): - packable_fields[field_name] = value - return packable_fields - - def _get_packable_field_names(self) -> Set[str]: - """Get set of field names that are Packable instances.""" - return set(self._get_packable_fields().keys()) - - @classmethod - def _get_packable_field_types(cls) -> Set[str]: - """Get field names that are Packable types from type hints (for decoding).""" - import typing - hints = typing.get_type_hints(cls) - packable_fields = set() - - for field_name, field_type in hints.items(): - # Handle Optional[PackableSubclass] - origin = typing.get_origin(field_type) - if origin is Union: - args = typing.get_args(field_type) - for arg in args: - if isinstance(arg, type) and issubclass(arg, Packable): - packable_fields.add(field_name) - break - elif isinstance(field_type, type) and issubclass(field_type, Packable): - packable_fields.add(field_name) - - return packable_fields - @classmethod def _decode_custom_fields( cls, @@ -376,69 +521,24 @@ def _encode_custom_fields(self, handler: DataHandler) -> None: encoded_bytes = config.encode(value, self) handler.write_binary(f"{config.file_name}.bin", encoded_bytes) - def _encode_packable_fields( - self, - handler: DataHandler, - cache_handler: Optional[DataHandler] = None - ) -> Dict[str, str]: - """Encode fields that are Packable instances. - - Args: - handler: DataHandler for the parent zip (used when no cache) - cache_handler: Optional DataHandler to save to cache. When provided, - packables are saved via cache_handler.write_binary() and - only hash refs are returned. 
- - Returns: - Dict mapping field names to SHA256 hashes (only when cache_handler provided) - """ - packable_refs: Dict[str, str] = {} - - for field_name, packable in self._get_packable_fields().items(): - # Recursively use cache for nested packables too - encoded_bytes = packable.encode(cache_handler=cache_handler) - - if cache_handler is not None: - # Compute SHA256 hash of the encoded bytes - hash_digest = hashlib.sha256(encoded_bytes).hexdigest()[:16] - packable_refs[field_name] = hash_digest - - # Save to cache with deduplication via exists check - hash_path = f"{hash_digest}.zip" - if not cache_handler.exists(hash_path): - cache_handler.write_binary(hash_path, encoded_bytes) - else: - # Embed in parent zip as before - handler.write_binary(f"packables/{field_name}.zip", encoded_bytes) - - return packable_refs - - def encode(self, cache_handler: Optional[DataHandler] = None) -> bytes: + def encode(self) -> bytes: """ - Serialize this Packable to bytes. - - Args: - cache_handler: Optional DataHandler to save nested Packables to cache. - When provided, nested Packable fields are saved via - cache_handler.write_binary() instead of - embedding in the zip. + Serialize this Packable to bytes (zip format). Returns: Bytes containing the zip-encoded data """ custom_field_names = self._get_custom_field_names() - packable_field_names = self._get_packable_field_names() - skip_fields = custom_field_names | packable_field_names # Encode standard arrays - encoded_arrays = self._encode_standard_arrays(skip_fields) + encoded_arrays = self._encode_standard_arrays(custom_field_names) # Create metadata field_data = self._extract_non_array_fields() metadata = self._create_metadata(field_data) # Write to zip - destination = ZipBuffer() + destination = BytesIO() handler = DataHandler.create(destination) # Save standard arrays @@ -448,13 +548,6 @@ def encode(self, cache_handler: Optional[DataHandler] = None) -> bytes: # Save custom encoded fields self._encode_custom_fields(handler) - # Save packable fields (with optional caching) - packable_refs = self._encode_packable_fields(handler, cache_handler) - - # Store packable refs in metadata if using cache - if packable_refs: - metadata.packable_refs = packable_refs - # Save metadata handler.write_text( "metadata.json", @@ -464,87 +557,11 @@ def encode(self, cache_handler: Optional[DataHandler] = None) -> bytes: handler.finalize() return destination.getvalue() - @classmethod - def _decode_packable_fields( - cls, - handler: DataHandler, - metadata: PackableMetadata, - data: Dict[str, Any], - array_type: Optional[ArrayType] = None, - cache_handler: Optional[DataHandler] = None - ) -> None: - """Decode fields that are Packable instances. - - Supports both embedded packables (in packables/ folder) and cached - packables (referenced by SHA256 hash in metadata.packable_refs). 
- - Args: - handler: DataHandler for the parent zip - metadata: Loaded metadata containing packable_refs - data: Dict to populate with decoded packables - array_type: Optional array backend to use - cache_handler: Optional DataHandler to load cached packables by hash - """ - # Get field type hints to know the Packable subclass for each field - import typing - hints = typing.get_type_hints(cls) - - # Helper to decode a packable field given its bytes - def decode_field(field_name: str, encoded_bytes: bytes) -> None: - field_type = hints.get(field_name) - if field_type is None: - return - - # Handle Optional[PackableSubclass] - origin = typing.get_origin(field_type) - if origin is Union: - args = typing.get_args(field_type) - for arg in args: - if isinstance(arg, type) and issubclass(arg, Packable): - field_type = arg - break - - if not isinstance(field_type, type) or not issubclass(field_type, Packable): - return - - data[field_name] = field_type.decode(encoded_bytes, array_type, cache_handler) - - # First, try to load from cache using hash refs - if cache_handler and metadata.packable_refs: - for field_name, hash_digest in metadata.packable_refs.items(): - try: - cached_bytes = cache_handler.read_binary(f"{hash_digest}.zip") - decode_field(field_name, cached_bytes) - except (FileNotFoundError, KeyError): - pass # Not in cache, will try embedded - - # Then load any embedded packables (for backward compatibility or no-cache case) - try: - packable_files = handler.list_files("packables", recursive=True) - except (KeyError, FileNotFoundError): - return - - for file_path in packable_files: - file_str = str(file_path) - if not file_str.endswith(".zip"): - continue - - # Extract field name: "packables/inner_mesh.zip" -> "inner_mesh" - field_name = file_str[10:-4] # Remove "packables/" and ".zip" - - # Skip if already loaded from cache - if field_name in data: - continue - - encoded_bytes = handler.read_binary(file_str) - decode_field(field_name, encoded_bytes) - @classmethod def decode( cls: Type[TPackable], buf: bytes, array_type: Optional[ArrayType] = None, - cache_handler: Optional[DataHandler] = None ) -> TPackable: """ Deserialize a Packable from bytes. @@ -553,19 +570,24 @@ def decode( buf: Bytes containing the zip-encoded data array_type: Array backend to use. If None (default), uses the array_type stored in each array's metadata. - cache_handler: Optional DataHandler to load nested Packables from cache. - When metadata contains hash references, - cache_handler.read_binary() is called to retrieve - cached bytes. Returns: Loaded Packable instance + + Raises: + TypeError: If called on base Packable class instead of a subclass """ - handler = DataHandler.create(ZipBuffer(buf)) + if cls is Packable: + raise TypeError( + "Cannot decode on base Packable class. 
" + "Use the specific subclass: MyClass.decode(...)" + ) + + handler = DataHandler.create(BytesIO(buf)) metadata = cls.load_metadata(handler) # Fields to skip when loading standard arrays - skip_fields = cls._get_custom_field_names() | cls._get_packable_field_types() + skip_fields = cls._get_custom_field_names() data: Dict[str, Any] = {} @@ -575,15 +597,417 @@ def decode( # Load standard arrays cls._load_standard_arrays(handler, data, skip_fields, array_type) - # Decode packable fields - cls._decode_packable_fields(handler, metadata, data, array_type, cache_handler) - - # Merge non-array fields from metadata + # Merge non-array fields from metadata using schema-aware reconstruction if metadata.field_data: - Packable._merge_field_data(data, metadata.field_data) + cls._merge_field_data_with_schema(cls, data, metadata.field_data) return cls(**data) + @staticmethod + def extract(obj: BaseModel) -> SerializedPackableData: + """ + Extract arrays and Packables from a Pydantic BaseModel into serializable data and assets. + + Args: + obj: A Pydantic BaseModel instance (including Packable subclasses) + + Returns an ExtractedPackable with: + - data: A JSON-serializable dict with `{"$ref": checksum}` for arrays/Packables + - assets: A dict mapping checksums to encoded bytes + + Arrays and nested Packables are stored as assets. The type information comes + from the Pydantic schema when reconstructing, so no class/module info is stored. + + Example: + mesh = Mesh(vertices=..., indices=...) + extracted = Packable.extract(mesh) + # extracted.data = {"vertices": {"$ref": "abc..."}, "indices": {"$ref": "def..."}} + + rebuilt = Mesh.reconstruct(extracted.data, extracted.assets) + """ + if not isinstance(obj, BaseModel): + raise TypeError( + f"extract() requires a Pydantic BaseModel, got {type(obj).__name__}. " + "Use Pydantic models for type-safe extraction and reconstruction." 
+ ) + + assets: Dict[str, bytes] = {} + data: Dict[str, Any] = {} + + for field_name in type(obj).model_fields: + if hasattr(obj, '__private_attributes__') and field_name in obj.__private_attributes__: + continue + value = getattr(obj, field_name, None) + if value is None: + continue + data[field_name] = Packable._extract_value(value, assets) + + return SerializedPackableData(data=data, assets=assets) + + @staticmethod + def _extract_value(value: Any, assets: Dict[str, bytes]) -> Any: + """Recursively extract a value, replacing arrays and nested Packables with refs.""" + # Handle arrays + if ArrayUtils.is_array(value): + encoded = ArrayUtils.encode_array(value) + # Pack metadata + data together as bytes for the asset + metadata_json = json.dumps(encoded.metadata.model_dump()).encode('utf-8') + # Format: [4 bytes metadata length][metadata json][array data] + packed = len(metadata_json).to_bytes(4, 'little') + metadata_json + encoded.data + checksum = hashlib.sha256(packed).hexdigest()[:16] + assets[checksum] = packed + return {"$ref": checksum} + + # Handle Packables - extract as encoded zip bytes + if isinstance(value, Packable): + encoded = value.encode() + checksum = hashlib.sha256(encoded).hexdigest()[:16] + assets[checksum] = encoded + return {"$ref": checksum} + + # Handle dicts + if isinstance(value, dict): + return {k: Packable._extract_value(v, assets) for k, v in value.items()} + + # Handle lists/tuples + if isinstance(value, (list, tuple)): + result = [Packable._extract_value(v, assets) for v in value] + return result if isinstance(value, list) else tuple(result) + + # Handle non-Packable BaseModels - recursively extract their fields + if isinstance(value, BaseModel): + extracted = {} + for name in value.model_fields: + field_value = getattr(value, name, None) + if field_value is not None: + extracted[name] = Packable._extract_value(field_value, assets) + return extracted + + # Primitive value - return as-is + return value + + @staticmethod + def _get_asset(assets: AssetProvider, checksum: str) -> bytes: + """Get asset bytes from either a dict or callable provider.""" + if callable(assets): + return assets(checksum) + if checksum not in assets: + raise KeyError(f"Missing asset with checksum '{checksum}'") + return assets[checksum] + + @staticmethod + def reconstruct( + model_class: Type[TModel], + data: Dict[str, Any], + assets: AssetProvider, + array_type: Optional[ArrayType] = None, + ) -> Union[TModel, LazyModel[TModel]]: + """ + Reconstruct a Pydantic BaseModel from extracted data and assets. + + Uses the class's Pydantic schema to determine types for nested fields, + so no runtime type information needs to be stored in the data. + + If assets is a dict, all assets are loaded immediately and the actual + model is returned. If assets is a callable or CachedAssetLoader, a + LazyModel proxy is returned that defers asset loading until field access. + + Args: + model_class: The Pydantic BaseModel class to reconstruct + data: The data dict from extract(), with $ref references + assets: One of: + - Dict mapping checksums to bytes (eager loading) + - Callable that takes a checksum and returns bytes (lazy loading) + - CachedAssetLoader with fetch callable and cache handler (lazy + disk cache) + array_type: Array backend to use. If None, uses the type stored + in each array's metadata. 
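A short sketch tying the two together, with an illustrative `PointCloud` model: `extract()` rejects plain containers, and eager `reconstruct()` takes the model class plus the extracted data and assets.

```python
import numpy as np

from meshly import Packable


raw = {"positions": np.zeros((4, 3), dtype=np.float32)}

try:
    Packable.extract(raw)            # plain dicts are rejected
except TypeError as exc:
    print(exc)


class PointCloud(Packable):
    """Illustrative wrapper model for the same data."""
    positions: np.ndarray


extracted = Packable.extract(PointCloud(positions=raw["positions"]))
assert "$ref" in extracted.data["positions"]

# Eager reconstruction: pass the model class, the data dict, and the assets dict.
cloud = Packable.reconstruct(PointCloud, extracted.data, extracted.assets)
assert cloud.positions.shape == (4, 3)
```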
+ + Returns: + - If assets is a dict: Reconstructed BaseModel instance (eager) + - If assets is callable/CachedAssetLoader: LazyModel proxy that loads on demand + + Raises: + KeyError: If a referenced asset is missing (for dict assets, raised immediately; + for callable assets, raised on field access) + + Example: + extracted = Packable.extract(simulation_case) + + # Eager loading with dict - returns actual model + rebuilt = Packable.reconstruct(SimulationCase, extracted.data, extracted.assets) + + # Lazy loading with callable - returns LazyModel + def fetch_asset(checksum: str) -> bytes: + return storage.get(checksum) + lazy = Packable.reconstruct(SimulationCase, data, fetch_asset) + + # Lazy loading with disk cache + cache = DataHandler.create(Path("./cache")) + loader = CachedAssetLoader(fetch_asset, cache) + lazy = Packable.reconstruct(SimulationCase, data, loader) + + print(lazy.time) # Primitive field, no fetch needed + print(lazy.temperature) # Fetches and caches temperature asset + model = lazy.resolve() # Get full Pydantic model + """ + if callable(assets) or isinstance(assets, CachedAssetLoader): + return LazyModel(model_class, data, assets, array_type) + + resolved_data = Packable._resolve_refs_with_schema( + model_class, data, assets, array_type + ) + return model_class(**resolved_data) + + @staticmethod + def _decode_packed_array(packed: bytes, array_type: Optional[ArrayType]) -> Any: + """Decode a packed array asset (metadata + data) back to an array.""" + from .array import EncodedArray, ArrayMetadata + + # Unpack: [4 bytes metadata length][metadata json][array data] + metadata_len = int.from_bytes(packed[:4], 'little') + metadata_json = packed[4:4+metadata_len].decode('utf-8') + array_data = packed[4+metadata_len:] + + metadata_dict = json.loads(metadata_json) + metadata = ArrayMetadata(**metadata_dict) + encoded = EncodedArray(data=array_data, metadata=metadata) + + decoded = ArrayUtils.decode_array(encoded) + + # Convert to requested array type if specified + if array_type is not None: + return ArrayUtils.convert_array(decoded, array_type) + elif metadata.array_type != "numpy": + return ArrayUtils.convert_array(decoded, metadata.array_type) + return decoded + + @staticmethod + def _resolve_refs_with_schema( + model_class: Type[BaseModel], + data: Dict[str, Any], + assets: AssetProvider, + array_type: Optional[ArrayType], + ) -> Dict[str, Any]: + """ + Resolve $ref references using Pydantic schema for type information. + + Uses model_class.model_fields to determine the expected type for each field, + so no class/module information needs to be stored in the data. 
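A standalone sketch of that asset framing, useful when inspecting `extracted.assets` entries by hand; the `peek_asset` helper is hypothetical, not library API.

```python
import json


def peek_asset(packed: bytes) -> dict:
    """Split a packed array asset into its metadata and raw payload.

    Layout: [4-byte little-endian metadata length][metadata JSON][array bytes].
    """
    meta_len = int.from_bytes(packed[:4], "little")
    metadata = json.loads(packed[4:4 + meta_len].decode("utf-8"))
    payload = packed[4 + meta_len:]
    return {"metadata": metadata, "payload_size": len(payload)}
```

In normal use `reconstruct()` handles this framing via `_decode_packed_array`; the helper only makes the byte layout visible.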
+ """ + result = {} + + for field_name, field_info in model_class.model_fields.items(): + if field_name not in data: + continue + + field_value = data[field_name] + field_type = field_info.annotation + + result[field_name] = Packable._resolve_value_with_type( + field_value, field_type, assets, array_type + ) + + return result + + @staticmethod + def _resolve_value_with_type( + value: Any, + expected_type: Any, + assets: AssetProvider, + array_type: Optional[ArrayType], + ) -> Any: + """Resolve a value using the expected type from Pydantic schema.""" + from typing import get_origin, get_args, Union + + if value is None: + return None + + # Handle $ref - decode based on expected type + if isinstance(value, dict) and "$ref" in value: + checksum = value["$ref"] + asset_bytes = Packable._get_asset(assets, checksum) + + # Determine if this is a Packable or array based on expected_type + origin = get_origin(expected_type) + + # Unwrap Optional[X] -> X + if origin is Union: + args = get_args(expected_type) + non_none = [a for a in args if a is not type(None)] + if len(non_none) == 1: + expected_type = non_none[0] + origin = get_origin(expected_type) + + # Check if expected type is a Packable subclass + if isinstance(expected_type, type) and issubclass(expected_type, Packable): + return expected_type.decode(asset_bytes, array_type) + + # Otherwise assume it's an array + return Packable._decode_packed_array(asset_bytes, array_type) + + # Handle nested BaseModel (non-ref dict that should be a model) + if isinstance(value, dict): + origin = get_origin(expected_type) + + # Unwrap Optional + if origin is Union: + args = get_args(expected_type) + non_none = [a for a in args if a is not type(None)] + if len(non_none) == 1: + expected_type = non_none[0] + origin = get_origin(expected_type) + + # Dict type - resolve values with value type + if origin is dict: + key_type, value_type = get_args(expected_type) + return { + k: Packable._resolve_value_with_type(v, value_type, assets, array_type) + for k, v in value.items() + } + + # BaseModel type - recursively resolve with schema + if isinstance(expected_type, type) and issubclass(expected_type, BaseModel): + resolved = Packable._resolve_refs_with_schema( + expected_type, value, assets, array_type + ) + return expected_type(**resolved) + + # Unknown dict - return as-is + return value + + # Handle lists/tuples + if isinstance(value, (list, tuple)): + origin = get_origin(expected_type) + + # Unwrap Optional + if origin is Union: + args = get_args(expected_type) + non_none = [a for a in args if a is not type(None)] + if len(non_none) == 1: + expected_type = non_none[0] + origin = get_origin(expected_type) + + # Get element type + if origin in (list, tuple): + args = get_args(expected_type) + elem_type = args[0] if args else Any + else: + elem_type = Any + + result = [ + Packable._resolve_value_with_type(v, elem_type, assets, array_type) + for v in value + ] + return result if isinstance(value, list) else tuple(result) + + # Primitive - return as-is + return value + + @staticmethod + def _merge_field_data_with_schema( + model_class: Type[BaseModel], + data: Dict[str, Any], + field_data: Dict[str, Any], + ) -> None: + """ + Merge metadata field_data into data, using Pydantic schema for type info. + + This handles the reconstruction of nested BaseModel instances without + needing __model_class__/__model_module__ markers. 
+ """ + from typing import get_origin, get_args, Union + + for key, value in field_data.items(): + if key in ("__model_class__", "__model_module__"): + # Skip legacy markers + continue + + if key not in model_class.model_fields: + # Unknown field - store as-is + data[key] = value + continue + + field_type = model_class.model_fields[key].annotation + merged = Packable._merge_value_with_schema(value, field_type, data.get(key)) + data[key] = merged + + @staticmethod + def _merge_value_with_schema( + metadata_value: Any, + expected_type: Any, + existing_value: Any, + ) -> Any: + """Merge a metadata value with existing data using the schema type.""" + from typing import get_origin, get_args, Union + + if metadata_value is None: + return existing_value + + # Unwrap Optional + origin = get_origin(expected_type) + if origin is Union: + args = get_args(expected_type) + non_none = [a for a in args if a is not type(None)] + if len(non_none) == 1: + expected_type = non_none[0] + origin = get_origin(expected_type) + + # Handle dict type + if origin is dict: + key_type, value_type = get_args(expected_type) + if isinstance(metadata_value, dict) and isinstance(existing_value, dict): + # Merge dict entries + result = dict(existing_value) + for k, v in metadata_value.items(): + if k in ("__model_class__", "__model_module__"): + continue + result[k] = Packable._merge_value_with_schema( + v, value_type, existing_value.get(k) + ) + return result + elif isinstance(metadata_value, dict): + # No existing value - reconstruct from metadata + return { + k: Packable._merge_value_with_schema(v, value_type, None) + for k, v in metadata_value.items() + if k not in ("__model_class__", "__model_module__") + } + return metadata_value + + # Handle BaseModel type + if isinstance(expected_type, type) and issubclass(expected_type, BaseModel): + if isinstance(metadata_value, dict): + # Filter out legacy markers + filtered = {k: v for k, v in metadata_value.items() + if k not in ("__model_class__", "__model_module__")} + + if isinstance(existing_value, dict): + # Merge with existing dict data + merged = dict(existing_value) + Packable._merge_field_data_with_schema(expected_type, merged, filtered) + return expected_type(**merged) + else: + # Reconstruct from metadata + data = {} + Packable._merge_field_data_with_schema(expected_type, data, filtered) + return expected_type(**data) + return metadata_value + + # Handle list type + if origin in (list, tuple): + if isinstance(metadata_value, (list, tuple)): + args = get_args(expected_type) + elem_type = args[0] if args else Any + result = [ + Packable._merge_value_with_schema(v, elem_type, None) + for v in metadata_value + ] + return result if origin is list else tuple(result) + return metadata_value + + # Primitive - use metadata value + return metadata_value + def __reduce__(self): """ Support for pickle serialization. 
@@ -623,10 +1047,10 @@ def load_array( """ if isinstance(source, BytesIO): source.seek(0) - handler = DataHandler.create(ZipBuffer(source.read())) + handler = DataHandler.create(BytesIO(source.read())) else: with open(source, "rb") as f: - handler = DataHandler.create(ZipBuffer(f.read())) + handler = DataHandler.create(BytesIO(f.read())) return ArrayUtils.load_array(handler, name, array_type) def convert_to(self: TPackable, array_type: ArrayType) -> TPackable: @@ -655,48 +1079,3 @@ def convert_to(self: TPackable, array_type: ArrayType) -> TPackable: return data_copy - @staticmethod - def _reconstruct_model(data: Dict[str, Any]) -> Any: - """Reconstruct BaseModel from serialized dict with __model_class__/__model_module__.""" - if not isinstance(data, dict): - return data - - # Recursively process nested dicts first - processed = {k: Packable._reconstruct_model(v) if isinstance(v, dict) else v - for k, v in data.items() if k not in ("__model_class__", "__model_module__")} - - if "__model_class__" not in data: - return processed - - try: - import importlib - module = importlib.import_module(data["__model_module__"]) - model_class = getattr(module, data["__model_class__"]) - return model_class(**processed) - except (ImportError, AttributeError): - return processed - - @staticmethod - def _merge_field_data(data: Dict[str, Any], field_data: Dict[str, Any]) -> None: - """Merge metadata fields into data, reconstructing BaseModel instances.""" - for key, value in field_data.items(): - existing = data.get(key) - if not isinstance(value, dict): - data[key] = value - elif "__model_class__" in value: - # Single BaseModel: merge arrays then reconstruct - merged = {**value, ** - (existing if isinstance(existing, dict) else {})} - data[key] = Packable._reconstruct_model(merged) - elif isinstance(existing, dict): - # Check if dict of BaseModels - for subkey, subval in value.items(): - if isinstance(subval, dict) and "__model_class__" in subval: - merged = {**subval, **existing.get(subkey, {})} - existing[subkey] = Packable._reconstruct_model(merged) - elif isinstance(subval, dict) and isinstance(existing.get(subkey), dict): - Packable._merge_field_data(existing[subkey], subval) - else: - existing[subkey] = subval - else: - data[key] = Packable._reconstruct_model(value) diff --git a/python/pyproject.toml b/python/pyproject.toml index ba84cf1..b2367ec 100644 --- a/python/pyproject.toml +++ b/python/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "meshly" -version = "2.4.0-alpha" +version = "2.5.0-alpha" description = "High-level abstractions and utilities for working with meshoptimizer" readme = "README.md" license = {text = "MIT"} diff --git a/python/tests/test_packable.py b/python/tests/test_packable.py index 0fd8a84..20b0ab2 100644 --- a/python/tests/test_packable.py +++ b/python/tests/test_packable.py @@ -3,8 +3,10 @@ import pytest import tempfile import os +import json from io import BytesIO -from typing import Optional +from pathlib import Path +from typing import Optional, Dict, Any import numpy as np from pydantic import BaseModel, Field, ConfigDict @@ -43,7 +45,8 @@ class FieldData(BaseModel): class Snapshot(Packable): """Snapshot with dict of BaseModel containing arrays.""" time: float = Field(..., description="Time value") - fields: dict[str, FieldData] = Field(default_factory=dict, description="Field data") + fields: dict[str, FieldData] = Field( + default_factory=dict, description="Field data") class TestPackable: @@ -100,8 +103,10 @@ def 
test_save_load_zip_file(self): loaded = SimulationResult.load_from_zip(path) assert loaded.time == pytest.approx(original.time) - np.testing.assert_array_almost_equal(loaded.temperature, original.temperature) - np.testing.assert_array_almost_equal(loaded.velocity, original.velocity) + np.testing.assert_array_almost_equal( + loaded.temperature, original.temperature) + np.testing.assert_array_almost_equal( + loaded.velocity, original.velocity) def test_save_load_bytesio(self): """Test saving and loading from BytesIO.""" @@ -140,8 +145,10 @@ def test_nested_dict_arrays(self): loaded = NestedData.load_from_zip(buffer) assert loaded.label == data.label - np.testing.assert_array_almost_equal(loaded.fields["pressure"], data.fields["pressure"]) - np.testing.assert_array_almost_equal(loaded.fields["density"], data.fields["density"]) + np.testing.assert_array_almost_equal( + loaded.fields["pressure"], data.fields["pressure"]) + np.testing.assert_array_almost_equal( + loaded.fields["density"], data.fields["density"]) def test_deterministic_encode(self): """Test that encode produces consistent output.""" @@ -166,7 +173,9 @@ def test_class_mismatch_error(self): data.save_to_zip(buffer) buffer.seek(0) - with pytest.raises(ValueError, match="Class mismatch"): + # Loading wrong class should fail with Pydantic validation error + # (missing required fields for SimulationResult) + with pytest.raises(Exception): # ValidationError from Pydantic SimulationResult.load_from_zip(buffer) def test_dict_of_basemodel_with_arrays(self): @@ -183,7 +192,8 @@ def test_dict_of_basemodel_with_arrays(self): "velocity": FieldData( name="velocity", type="vector", - data=np.array([[1.0, 0.0, 0.0], [0.0, 1.0, 0.0]], dtype=np.float32), + data=np.array( + [[1.0, 0.0, 0.0], [0.0, 1.0, 0.0]], dtype=np.float32), units="m/s" ) } @@ -242,123 +252,512 @@ def test_dict_of_basemodel_with_optional_none_field(self): np.testing.assert_array_almost_equal( loaded.fields["pressure"].data, snapshot.fields["pressure"].data) - -class InnerPackable(Packable): - """Inner packable for testing nested support.""" - label: str = Field(..., description="Label") - data: np.ndarray = Field(..., description="Data array") - - -class OuterPackable(Packable): - """Outer packable containing a nested packable.""" - name: str = Field(..., description="Name") - inner: Optional[InnerPackable] = Field(None, description="Nested packable") - - -class TestNestedPackableCache: - """Test nested Packable with cache support.""" - - def test_nested_packable_without_cache(self): - """Test nested packable save/load without cache.""" - inner = InnerPackable( - label="inner", - data=np.array([1.0, 2.0, 3.0], dtype=np.float32) + def test_decode_without_class_raises_error(self): + """Test that Packable.decode() raises TypeError - must use specific class.""" + # Create and encode a SimpleData instance + original = SimpleData( + name="dynamic_test", + values=np.array([1.0, 2.0, 3.0], dtype=np.float32) ) - outer = OuterPackable(name="outer", inner=inner) - - buffer = BytesIO() - outer.save_to_zip(buffer) - - buffer.seek(0) - loaded = OuterPackable.load_from_zip(buffer) - - assert loaded.name == "outer" - assert loaded.inner is not None - assert loaded.inner.label == "inner" - np.testing.assert_array_almost_equal(loaded.inner.data, inner.data) + encoded = original.encode() - def test_nested_packable_with_cache(self): - """Test nested packable save/load with cache.""" - from meshly.data_handler import DataHandler + # Decode using base Packable class - should raise TypeError + with 
pytest.raises(TypeError, match="Cannot decode on base Packable class"): + Packable.decode(encoded) + + # Should work with the specific class + decoded = SimpleData.decode(encoded) + assert decoded.name == original.name + np.testing.assert_array_almost_equal(decoded.values, original.values) - inner = InnerPackable( - label="cached_inner", - data=np.array([4.0, 5.0, 6.0], dtype=np.float32) + def test_load_from_zip_without_class_raises_error(self): + """Test that Packable.load_from_zip() raises TypeError - must use specific class.""" + original = SimulationResult( + time=0.5, + temperature=np.array([300.0, 301.0, 302.0], dtype=np.float32), + velocity=np.array([[1.0, 0.0], [0.0, 1.0]], dtype=np.float32) ) - outer = OuterPackable(name="cached_outer", inner=inner) with tempfile.TemporaryDirectory() as tmpdir: - cache_path = os.path.join(tmpdir, "cache") - zip_path = os.path.join(tmpdir, "outer.zip") + path = os.path.join(tmpdir, "result.zip") + original.save_to_zip(path) - cache_handler = DataHandler.create(cache_path) - outer.save_to_zip(zip_path, cache_handler=cache_handler) + # Load using base Packable - should raise TypeError + with pytest.raises(TypeError, match="Cannot decode on base Packable class"): + Packable.load_from_zip(path) + + # Should work with the specific class + loaded = SimulationResult.load_from_zip(path) + assert loaded.time == pytest.approx(original.time) + np.testing.assert_array_almost_equal( + loaded.temperature, original.temperature) - cache_files = os.listdir(cache_path) - assert len(cache_files) == 1 - assert cache_files[0].endswith(".zip") - read_cache_handler = DataHandler.create(cache_path) - loaded = OuterPackable.load_from_zip(zip_path, cache_handler=read_cache_handler) +class TestExtractReconstruct: + """Test extract() and reconstruct() functionality.""" - assert loaded.name == "cached_outer" - assert loaded.inner is not None - assert loaded.inner.label == "cached_inner" - np.testing.assert_array_almost_equal(loaded.inner.data, inner.data) + def test_extract_simple(self): + """Test extract() returns data dict with refs and assets.""" + original = SimpleData( + name="test", + values=np.array([1.0, 2.0, 3.0], dtype=np.float32) + ) + + extracted = Packable.extract(original) + + # Data should have the primitive field + assert extracted.data["name"] == "test" + + # Array should be replaced with ref (no $type - we use schema) + assert "$ref" in extracted.data["values"] + checksum = extracted.data["values"]["$ref"] + + # Assets should contain the encoded array + assert checksum in extracted.assets + assert isinstance(extracted.assets[checksum], bytes) + + def test_reconstruct_simple(self): + """Test reconstruct() rebuilds the Packable from data and assets.""" + original = SimpleData( + name="roundtrip", + values=np.array([4.0, 5.0, 6.0], dtype=np.float32) + ) + + extracted = Packable.extract(original) + reconstructed = Packable.reconstruct(SimpleData, extracted.data, extracted.assets) + + assert reconstructed.name == original.name + np.testing.assert_array_almost_equal(reconstructed.values, original.values) + + def test_extract_reconstruct_simulation_result(self): + """Test extract/reconstruct with multiple arrays.""" + original = SimulationResult( + time=0.5, + temperature=np.array([300.0, 301.0], dtype=np.float32), + velocity=np.array([[1.0, 0.0], [0.0, 1.0]], dtype=np.float32) + ) + + extracted = Packable.extract(original) + + # Should have 2 assets (2 arrays) + assert len(extracted.assets) == 2 + + # Primitive field should be preserved + assert 
extracted.data["time"] == 0.5 + + # Arrays should be refs + assert "$ref" in extracted.data["temperature"] + assert "$ref" in extracted.data["velocity"] + + # Reconstruct + reconstructed = Packable.reconstruct(SimulationResult, extracted.data, extracted.assets) + + assert reconstructed.time == pytest.approx(original.time) + np.testing.assert_array_almost_equal(reconstructed.temperature, original.temperature) + np.testing.assert_array_almost_equal(reconstructed.velocity, original.velocity) + + def test_extract_data_is_json_serializable(self): + """Test that extracted data can be JSON serialized.""" + original = SimulationResult( + time=1.0, + temperature=np.array([100.0], dtype=np.float32), + velocity=np.array([[0.0]], dtype=np.float32) + ) + + extracted = Packable.extract(original) + + # Should be able to serialize to JSON + json_str = json.dumps(extracted.data) + assert isinstance(json_str, str) + + # And deserialize back + loaded_data = json.loads(json_str) + assert loaded_data["time"] == 1.0 + + def test_reconstruct_missing_asset_raises(self): + """Test that reconstruct raises KeyError when asset is missing.""" + data = {"name": "test", "values": {"$ref": "nonexistent_checksum"}} + + with pytest.raises(KeyError, match="Missing asset"): + Packable.reconstruct(SimpleData, data, {}) + + def test_extract_requires_basemodel(self): + """Test extract() requires a Pydantic BaseModel, not plain dict.""" + data = { + "name": "test", + "positions": np.array([[0, 0, 0], [1, 1, 1]], dtype=np.float32), + } + + with pytest.raises(TypeError, match="requires a Pydantic BaseModel"): + Packable.extract(data) + + def test_reconstruct_with_callable_returns_lazy_model(self): + """Test that reconstruct() with callable returns LazyModel for lazy loading.""" + from meshly.packable import LazyModel + + original = SimulationResult( + time=0.5, + temperature=np.array([300.0, 301.0], dtype=np.float32), + velocity=np.array([[1.0, 0.0], [0.0, 1.0]], dtype=np.float32) + ) + + extracted = Packable.extract(original) + + # Track which assets were requested + requested_checksums = [] + + def lazy_loader(checksum: str) -> bytes: + """Simulate lazy loading from external storage.""" + requested_checksums.append(checksum) + if checksum not in extracted.assets: + raise KeyError(f"Missing asset with checksum '{checksum}'") + return extracted.assets[checksum] + + # Reconstruct using callable - returns LazyModel + lazy = Packable.reconstruct( + SimulationResult, extracted.data, lazy_loader + ) + + # Should be a LazyModel, not loaded yet + assert isinstance(lazy, LazyModel) + assert len(requested_checksums) == 0 + + # Access fields to trigger loading + assert lazy.time == pytest.approx(original.time) + np.testing.assert_array_almost_equal(lazy.temperature, original.temperature) + np.testing.assert_array_almost_equal(lazy.velocity, original.velocity) + + # Now assets should be loaded + assert len(requested_checksums) == 2 + + def test_reconstruct_callable_missing_asset_raises_on_access(self): + """Test that callable asset provider raises KeyError on field access.""" + data = {"name": "test", "values": {"$ref": "nonexistent"}} + + def failing_loader(checksum: str) -> bytes: + raise KeyError(f"Missing asset with checksum '{checksum}'") + + # With callable, returns LazyModel immediately (no error) + lazy = Packable.reconstruct(SimpleData, data, failing_loader) + + # Error raised when accessing the field + with pytest.raises(KeyError, match="Missing asset"): + _ = lazy.values + + def test_lazy_reconstruct_defers_loading(self): + 
"""Test that reconstruct() with callable doesn't load assets until accessed.""" + original = SimulationResult( + time=0.5, + temperature=np.array([300.0, 301.0], dtype=np.float32), + velocity=np.array([[1.0, 0.0], [0.0, 1.0]], dtype=np.float32) + ) + + extracted = Packable.extract(original) + requested_checksums = [] + + def tracking_loader(checksum: str) -> bytes: + requested_checksums.append(checksum) + return extracted.assets[checksum] + + # Create lazy model with callable - NO assets should be loaded yet + lazy = Packable.reconstruct( + SimulationResult, extracted.data, tracking_loader + ) + assert len(requested_checksums) == 0, "No assets should be loaded on creation" + + # Access primitive field - still no asset loading + assert lazy.time == pytest.approx(0.5) + assert len(requested_checksums) == 0, "Primitive access shouldn't load assets" + + # Access temperature - should load only temperature asset + temp = lazy.temperature + assert len(requested_checksums) == 1, "Should load exactly one asset" + np.testing.assert_array_almost_equal(temp, original.temperature) + + # Access temperature again - should use cache, not reload + temp2 = lazy.temperature + assert len(requested_checksums) == 1, "Cached access shouldn't reload" + + # Access velocity - should load velocity asset + vel = lazy.velocity + assert len(requested_checksums) == 2, "Should now have loaded both assets" + np.testing.assert_array_almost_equal(vel, original.velocity) + + def test_lazy_reconstruct_resolve(self): + """Test that resolve() returns the full Pydantic model.""" + original = SimulationResult( + time=1.0, + temperature=np.array([100.0], dtype=np.float32), + velocity=np.array([[0.0]], dtype=np.float32) + ) + + extracted = Packable.extract(original) + + # Use callable to get LazyModel + lazy = Packable.reconstruct( + SimulationResult, extracted.data, lambda c: extracted.assets[c] + ) + + # Resolve to get actual model + resolved = lazy.resolve() + + # Should be actual SimulationResult instance + assert isinstance(resolved, SimulationResult) + assert resolved.time == pytest.approx(1.0) + np.testing.assert_array_almost_equal(resolved.temperature, original.temperature) + + # Resolve again should return same instance + resolved2 = lazy.resolve() + assert resolved is resolved2 + + def test_lazy_model_repr(self): + """Test LazyModel has informative repr.""" + original = SimulationResult( + time=0.5, + temperature=np.array([300.0], dtype=np.float32), + velocity=np.array([[1.0]], dtype=np.float32) + ) + + extracted = Packable.extract(original) + lazy = Packable.reconstruct( + SimulationResult, extracted.data, lambda c: extracted.assets[c] + ) + + repr_str = repr(lazy) + assert "LazyModel" in repr_str + assert "SimulationResult" in repr_str + + # After accessing one field, repr should reflect that + _ = lazy.temperature + repr_str = repr(lazy) + assert "temperature" in repr_str + + def test_lazy_model_is_readonly(self): + """Test that LazyModel doesn't allow attribute setting.""" + original = SimpleData( + name="test", + values=np.array([1.0], dtype=np.float32) + ) + + extracted = Packable.extract(original) + lazy = Packable.reconstruct( + SimpleData, extracted.data, lambda c: extracted.assets[c] + ) + + with pytest.raises(AttributeError, match="read-only"): + lazy.name = "modified" - def test_cache_deduplication(self): - """Test that identical nested packables share the same cache file.""" + def test_reconstruct_with_cache_handler(self): + """Test that CachedAssetLoader persists fetched assets to disk.""" from 
meshly.data_handler import DataHandler - - inner1 = InnerPackable( - label="same", - data=np.array([1.0, 2.0], dtype=np.float32) + from meshly.packable import CachedAssetLoader + + original = SimulationResult( + time=0.5, + temperature=np.array([300.0, 301.0], dtype=np.float32), + velocity=np.array([[1.0, 0.0], [0.0, 1.0]], dtype=np.float32) ) - inner2 = InnerPackable( - label="same", + + extracted = Packable.extract(original) + fetch_count = [0] # Use list to track calls in closure + + def counting_loader(checksum: str) -> bytes: + fetch_count[0] += 1 + return extracted.assets[checksum] + + with tempfile.TemporaryDirectory() as tmpdir: + cache_path = Path(tmpdir) / "cache" + cache_handler = DataHandler.create(cache_path) + + # First lazy model with CachedAssetLoader - should fetch from loader + loader1 = CachedAssetLoader(counting_loader, cache_handler) + lazy1 = Packable.reconstruct( + SimulationResult, extracted.data, loader1 + ) + + # Access temperature - should fetch and cache + _ = lazy1.temperature + assert fetch_count[0] == 1 + + # Access velocity - should fetch and cache + _ = lazy1.velocity + assert fetch_count[0] == 2 + + # Finalize to write cache + cache_handler.finalize() + + # Create new cache handler pointing to same location + cache_handler2 = DataHandler.create(cache_path) + + # Second lazy model with same cache - should read from cache + loader2 = CachedAssetLoader(counting_loader, cache_handler2) + lazy2 = Packable.reconstruct( + SimulationResult, extracted.data, loader2 + ) + + # Access both fields - should NOT call loader (reads from cache) + temp2 = lazy2.temperature + vel2 = lazy2.velocity + assert fetch_count[0] == 2, "Should read from cache, not call loader" + + # Verify data integrity + np.testing.assert_array_almost_equal(temp2, original.temperature) + np.testing.assert_array_almost_equal(vel2, original.velocity) + + +class TestNestedPackableRejection: + """Test that direct Packable fields are rejected, but nested in dicts is allowed.""" + + def test_direct_nested_packable_rejected(self): + """Test that a Packable field containing another Packable is rejected.""" + + class InnerPackable(Packable): + label: str + data: np.ndarray + + class OuterPackable(Packable): + name: str + inner: Optional[InnerPackable] = None + + inner = InnerPackable( + label="inner", data=np.array([1.0, 2.0], dtype=np.float32) ) - outer1 = OuterPackable(name="outer1", inner=inner1) - outer2 = OuterPackable(name="outer2", inner=inner2) + + with pytest.raises(TypeError, match="Direct Packable fields are not allowed"): + OuterPackable(name="outer", inner=inner) + + def test_dict_of_packables_allowed(self): + """Test that Dict[str, Packable] is allowed (Packable inside typed dict).""" + + class ContainerPackable(Packable): + name: str + items: Dict[str, SimpleData] = Field(default_factory=dict) + + inner = SimpleData( + name="inner", + values=np.array([1.0, 2.0, 3.0], dtype=np.float32) + ) + + # Should be allowed with typed dict + container = ContainerPackable(name="container", items={"nested": inner}) + assert container.name == "container" + assert isinstance(container.items["nested"], SimpleData) + + def test_extract_typed_dict_with_nested_packables(self): + """Test that extract() handles typed dicts with nested Packables.""" + + class ContainerPackable(Packable): + name: str + items: Dict[str, SimpleData] = Field(default_factory=dict) + + inner = SimpleData( + name="inner", + values=np.array([1.0, 2.0, 3.0], dtype=np.float32) + ) + + container = ContainerPackable(name="container", 
items={"nested": inner}) + + # Extract should create refs for the nested Packable + extracted = Packable.extract(container) + + # The nested packable should be a ref (no $type - schema provides type info) + assert "$ref" in extracted.data["items"]["nested"] + + # Should have asset for the nested packable + assert len(extracted.assets) >= 1 + + def test_reconstruct_typed_dict_with_nested_packables(self): + """Test that reconstruct() handles typed dicts with nested Packables.""" + + class ContainerPackable(Packable): + name: str + items: Dict[str, SimpleData] = Field(default_factory=dict) + + inner = SimpleData( + name="inner", + values=np.array([1.0, 2.0, 3.0], dtype=np.float32) + ) + + container = ContainerPackable(name="container", items={"nested": inner}) + + # Extract and reconstruct + extracted = Packable.extract(container) + reconstructed = Packable.reconstruct(ContainerPackable, extracted.data, extracted.assets) + + assert reconstructed.name == "container" + assert isinstance(reconstructed.items["nested"], SimpleData) + assert reconstructed.items["nested"].name == "inner" + np.testing.assert_array_almost_equal( + reconstructed.items["nested"].values, inner.values + ) + + def test_none_nested_packable_allowed(self): + """Test that Optional[Packable] = None is allowed.""" + + class InnerPackable(Packable): + label: str + data: np.ndarray + + class OuterPackable(Packable): + name: str + inner: Optional[InnerPackable] = None + + # Should work with None + outer = OuterPackable(name="outer", inner=None) + assert outer.name == "outer" + assert outer.inner is None + + +class TestDataHandler: + """Test DataHandler functionality.""" + + def test_context_manager_file_handler(self): + """Test DataHandler can be used as context manager with FileHandler.""" + from meshly.data_handler import DataHandler with tempfile.TemporaryDirectory() as tmpdir: - cache_path = os.path.join(tmpdir, "cache") - zip1_path = os.path.join(tmpdir, "outer1.zip") - zip2_path = os.path.join(tmpdir, "outer2.zip") + with DataHandler.create(tmpdir) as handler: + handler.write_text("test.txt", "hello world") + assert handler.exists("test.txt") - cache_handler = DataHandler.create(cache_path) - outer1.save_to_zip(zip1_path, cache_handler=cache_handler) - outer2.save_to_zip(zip2_path, cache_handler=cache_handler) + # File should still exist after context exit + assert os.path.exists(os.path.join(tmpdir, "test.txt")) - cache_files = os.listdir(cache_path) - assert len(cache_files) == 1 + def test_context_manager_zip_handler(self): + """Test DataHandler can be used as context manager with ZipHandler.""" + from meshly.data_handler import DataHandler - read_cache_handler = DataHandler.create(cache_path) - loaded1 = OuterPackable.load_from_zip(zip1_path, cache_handler=read_cache_handler) - loaded2 = OuterPackable.load_from_zip(zip2_path, cache_handler=read_cache_handler) + buffer = BytesIO() + with DataHandler.create(buffer) as handler: + handler.write_text("metadata.json", '{"test": true}') + handler.write_binary("data.bin", b"binary content") - assert loaded1.inner.label == "same" - assert loaded2.inner.label == "same" + # After context exit, zip should be finalized and readable + buffer.seek(0) + with DataHandler.create(BytesIO(buffer.read())) as reader: + content = reader.read_text("metadata.json") + assert content == '{"test": true}' + assert reader.read_binary("data.bin") == b"binary content" - def test_cache_missing_falls_back_to_embedded(self): - """Test loading works when cache file is missing but data is embedded.""" + def 
test_remove_file(self): + """Test remove_file functionality for FileHandler.""" from meshly.data_handler import DataHandler - inner = InnerPackable( - label="fallback", - data=np.array([7.0, 8.0], dtype=np.float32) - ) - outer = OuterPackable(name="fallback_outer", inner=inner) + with tempfile.TemporaryDirectory() as tmpdir: + handler = DataHandler.create(tmpdir) + handler.write_text("to_delete.txt", "temporary") + assert handler.exists("to_delete.txt") - buffer = BytesIO() - outer.save_to_zip(buffer) + handler.remove_file("to_delete.txt") + assert not handler.exists("to_delete.txt") - with tempfile.TemporaryDirectory() as tmpdir: - cache_path = os.path.join(tmpdir, "cache") - os.makedirs(cache_path) - read_cache_handler = DataHandler.create(cache_path) - buffer.seek(0) - loaded = OuterPackable.load_from_zip(buffer, cache_handler=read_cache_handler) - - assert loaded.name == "fallback_outer" - assert loaded.inner.label == "fallback" + def test_remove_file_zip_raises(self): + """Test remove_file raises NotImplementedError for ZipHandler.""" + from meshly.data_handler import DataHandler + + buffer = BytesIO() + with DataHandler.create(buffer) as handler: + handler.write_text("test.txt", "content") + with pytest.raises(NotImplementedError): + handler.remove_file("test.txt") diff --git a/typescript/README.md b/typescript/README.md index ab72f02..f1f3249 100644 --- a/typescript/README.md +++ b/typescript/README.md @@ -18,6 +18,8 @@ pnpm add meshly - Support for polygon meshes with automatic triangulation - Marker extraction for boundary conditions and regions - Custom field decoding via `getCustomFields()` override +- **Reconstruct API** for resolving `$ref` asset references +- **CachedAssetLoader** for disk-cached asset loading - Full TypeScript type definitions ## Quick Start @@ -92,8 +94,6 @@ protected static override getCustomFields(): Record { ```typescript // Base metadata (matches Python PackableMetadata) interface PackableMetadata { - class_name: string - module_name: string field_data?: Record } @@ -198,14 +198,32 @@ const inletIndices = await Mesh.loadArray(zipData, 'markerIndices.inlet') interface DataHandler { // Read binary content from a file readBinary(path: string): Promise + // Write binary content to a file (optional) + writeBinary?(path: string, content: Uint8Array | ArrayBuffer): Promise // Check if a file exists (optional) exists?(path: string): Promise } -// Create a DataHandler from a hash loader function -function createDataHandler( - loader: (hash: string) => Promise -): DataHandler +// Asset fetch function type +type AssetFetcher = (checksum: string) => Promise + +// Asset provider: either a dict of assets or a fetcher function +type AssetProvider = Record | AssetFetcher +``` + +### CachedAssetLoader + +```typescript +// Asset loader with optional disk cache for persistence +class CachedAssetLoader { + constructor( + fetch: AssetFetcher, // Function that fetches asset bytes by checksum + cache: DataHandler // DataHandler for caching fetched assets + ) + + // Get asset bytes, checking cache first then fetching if needed + async getAsset(checksum: string): Promise +} ``` ### CustomFieldConfig @@ -228,11 +246,18 @@ interface CustomFieldConfig { constructor(data: TData) - // Decode from zip data (with optional cache handler for nested packables) - static async decode( - zipData: ArrayBuffer | Uint8Array, - cacheHandler?: DataHandler - ): Promise> + // Decode from zip data + static async decode(zipData: ArrayBuffer | Uint8Array): Promise> + + // Reconstruct from extracted 
data and assets + static async reconstruct( + data: Record, + assets: AssetProvider | CachedAssetLoader, + schema?: ReconstructSchema + ): Promise + + // Decode packed array format (metadata + data bytes) + static _decodePackedArray(packed: Uint8Array | ArrayBuffer): TypedArray // Load single array static async loadArray(zipData: ArrayBuffer | Uint8Array, name: string): Promise @@ -242,9 +267,6 @@ class Packable { // Custom field configuration (override in subclasses) protected static getCustomFields(): Record - - // Packable field types for nested packable decoding (override in subclasses) - protected static getPackableFieldTypes(): Record } ``` @@ -267,8 +289,8 @@ class Mesh extends Packable { isUniformPolygons(): boolean getPolygonIndices(): Uint32Array[] | Uint32Array - // Decoding (with optional cache handler for nested packables) - static async decode(zipData: ArrayBuffer | Uint8Array, cacheHandler?: DataHandler): Promise + // Decoding + static async decode(zipData: ArrayBuffer | Uint8Array): Promise // Marker extraction extractByMarker(markerName: string): Mesh @@ -280,6 +302,7 @@ class Mesh extends Packable { // Custom field configuration for meshoptimizer decoding protected static override getCustomFields(): Record> } +} ``` ### MeshData Interface @@ -303,10 +326,7 @@ interface MeshData { ```typescript // Base metadata for all Packable types interface PackableMetadata { - class_name: string - module_name: string field_data?: Record - packable_refs?: Record // SHA256 hash refs for cached packables } // Mesh-specific metadata extending base @@ -323,49 +343,53 @@ interface MeshSize { } ``` -### Cache Support - -When loading meshes with nested Packables that were saved with caching (using Python's `cache_handler`), provide a `DataHandler`: +### Reconstruct Schema Types ```typescript -import { Mesh, DataHandler, createDataHandler } from 'meshly' - -// Example: Fetch from server cache using createDataHandler helper -const cacheHandler = createDataHandler(async (hash) => { - const response = await fetch(`/cache/${hash}.zip`) - return response.ok ? 
response.arrayBuffer() : undefined -}) - -// Decode with cache support -const mesh = await Mesh.decode(zipData, cacheHandler) +// Decoder function for Packable types +type PackableDecoder = (data: Uint8Array | ArrayBuffer) => Promise | T + +// Schema for a single field +type FieldSchema = + | { type: 'array'; element?: FieldSchema } // TypedArray or Array of items + | { type: 'packable'; decode: PackableDecoder } // Nested Packable + | { type: 'dict'; value?: FieldSchema } // Dict with uniform value type + | { type: 'object'; fields?: ReconstructSchema } // Object with known field types + +// Schema mapping field names to their types +type ReconstructSchema = Record + +// Result of Python's Packable.extract() +interface SerializedPackableData { + data: Record // Serializable dict with $ref references + assets: Record // Map of checksum -> encoded bytes +} ``` -**DataHandler examples:** +### Reconstruct Example ```typescript -// From IndexedDB using createDataHandler -const idbHandler = createDataHandler(async (hash) => { - const db = await openDB('meshly-cache') - return db.get('packables', hash) -}) - -// From Map (in-memory) -const memoryCache = new Map() -const memoryHandler = createDataHandler(async (hash) => memoryCache.get(hash)) - -// Custom class implementing DataHandler interface -class ServerCacheHandler implements DataHandler { - constructor(private baseUrl: string) {} - - async readBinary(path: string): Promise { - const response = await fetch(`${this.baseUrl}/${path}`) - return response.ok ? response.arrayBuffer() : undefined +import { Packable, CachedAssetLoader, ReconstructSchema } from 'meshly' + +// Simple case - all $refs are arrays +const result = await Packable.reconstruct(data, assets) + +// With nested Packables - define schema for type hints +const schema: ReconstructSchema = { + mesh: { type: 'packable', decode: (bytes) => Mesh.decode(bytes) }, + snapshots: { + type: 'array', + element: { type: 'packable', decode: (bytes) => Mesh.decode(bytes) } } } - -const serverHandler = new ServerCacheHandler('/api/cache') -const mesh = await Mesh.decode(zipData, serverHandler) -``` +const result = await Packable.reconstruct(data, assets, schema) + +// With CachedAssetLoader for disk caching +const loader = new CachedAssetLoader( + async (checksum) => fetch(`/api/assets/${checksum}`).then(r => r.arrayBuffer()), + myDataHandler +) +const result = await Packable.reconstruct(data, loader, schema) ``` ### Utility Classes diff --git a/typescript/package.json b/typescript/package.json index 7d606d8..85d27d3 100644 --- a/typescript/package.json +++ b/typescript/package.json @@ -1,6 +1,6 @@ { "name": "meshly", - "version": "2.4.0-alpha", + "version": "2.5.0-alpha", "type": "commonjs", "description": "TypeScript library to decode Python meshoptimizer zip files into THREE.js geometries", "main": "dist/index.js", diff --git a/typescript/src/__tests__/loadArray.test.ts b/typescript/src/__tests__/loadArray.test.ts index 851a046..073acda 100644 --- a/typescript/src/__tests__/loadArray.test.ts +++ b/typescript/src/__tests__/loadArray.test.ts @@ -21,8 +21,6 @@ async function createTestMeshZip(): Promise { // Add metadata zip.file('metadata.json', JSON.stringify({ - class_name: 'Mesh', - module_name: 'meshly.mesh', mesh_size: { vertex_count: 3, vertex_size: 12, @@ -67,8 +65,6 @@ async function createTestMeshWithMarkersZip(): Promise { // Add metadata zip.file('metadata.json', JSON.stringify({ - class_name: 'Mesh', - module_name: 'meshly.mesh', mesh_size: { vertex_count: 4, vertex_size: 12, diff 
--git a/typescript/src/__tests__/reconstruct.test.ts b/typescript/src/__tests__/reconstruct.test.ts new file mode 100644 index 0000000..2bad2fb --- /dev/null +++ b/typescript/src/__tests__/reconstruct.test.ts @@ -0,0 +1,322 @@ +import { MeshoptEncoder } from 'meshoptimizer' +import { describe, expect, it } from 'vitest' +import { ArrayMetadata } from '../array' +import { AssetProvider, CachedAssetLoader, DataHandler } from '../data-handler' +import { Packable, ReconstructSchema, SerializedPackableData } from '../packable' + +/** + * Helper to encode an array using meshoptimizer and pack with metadata. + * Matches Python's packed array format: [4 bytes metadata length][metadata json][array data] + */ +async function packArray( + values: Float32Array | Uint32Array | Int32Array, + dtype: string +): Promise { + await MeshoptEncoder.ready + + const itemsize = values.BYTES_PER_ELEMENT + const count = values.length + const shape = [count] + + // Encode with meshoptimizer + const encoded = MeshoptEncoder.encodeVertexBuffer( + new Uint8Array(values.buffer, values.byteOffset, values.byteLength), + count, + itemsize + ) + + // Create metadata + const metadata: ArrayMetadata = { shape, dtype, itemsize } + const metadataJson = JSON.stringify(metadata) + const metadataBytes = new TextEncoder().encode(metadataJson) + + // Pack: [4 bytes len][metadata][data] + const packed = new Uint8Array(4 + metadataBytes.length + encoded.length) + const view = new DataView(packed.buffer) + view.setUint32(0, metadataBytes.length, true) // little-endian + packed.set(metadataBytes, 4) + packed.set(encoded, 4 + metadataBytes.length) + + return packed +} + +/** + * Simple SHA256 hash (first 16 chars) for deterministic checksums + */ +async function sha256(data: Uint8Array): Promise { + const hashBuffer = await crypto.subtle.digest('SHA-256', data) + const hashArray = Array.from(new Uint8Array(hashBuffer)) + return hashArray.map((b) => b.toString(16).padStart(2, '0')).join('').slice(0, 16) +} + +/** + * Helper to create extracted data format (simulating Python's Packable.extract output) + */ +async function createExtractedData( + fields: Record, + arrays: Record +): Promise { + const data: Record = { ...fields } + const assets: Record = {} + + for (const [name, values] of Object.entries(arrays)) { + const dtype = values instanceof Float32Array ? 
'float32' : 'uint32' + const packed = await packArray(values, dtype) + const checksum = await sha256(packed) + data[name] = { $ref: checksum } + assets[checksum] = packed + } + + return { data, assets } +} + +describe('Packable.reconstruct', () => { + describe('with dict assets (eager loading)', () => { + it('reconstructs simple data with arrays', async () => { + const extracted = await createExtractedData( + { name: 'test', time: 0.5 }, + { + temperature: new Float32Array([300.0, 301.0, 302.0]), + velocity: new Float32Array([1.0, 0.0, 0.0, 1.0]) + } + ) + + const result = await Packable.reconstruct<{ + name: string + time: number + temperature: Float32Array + velocity: Float32Array + }>(extracted.data, extracted.assets) + + expect(result.name).toBe('test') + expect(result.time).toBe(0.5) + expect(result.temperature).toBeInstanceOf(Float32Array) + expect(Array.from(result.temperature)).toEqual([300.0, 301.0, 302.0]) + expect(Array.from(result.velocity)).toEqual([1.0, 0.0, 0.0, 1.0]) + }) + + it('throws KeyError for missing asset', async () => { + const data = { name: 'test', values: { $ref: 'nonexistent_checksum' } } + + await expect(Packable.reconstruct(data, {})).rejects.toThrow( + /Missing asset.*nonexistent_checksum/ + ) + }) + + it('preserves primitive fields unchanged', async () => { + const data = { + name: 'simulation_001', + time: 1.5, + active: true, + config: { iterations: 100, tolerance: 1e-6 } + } + + const result = await Packable.reconstruct(data, {}) + + expect(result).toEqual(data) + }) + + it('handles nested objects with refs', async () => { + const tempArray = new Float32Array([100.0, 200.0]) + const tempPacked = await packArray(tempArray, 'float32') + const tempChecksum = await sha256(tempPacked) + + const data = { + name: 'nested', + fields: { + temperature: { $ref: tempChecksum } + } + } + + const result = await Packable.reconstruct(data, { [tempChecksum]: tempPacked }) + + expect(result.name).toBe('nested') + expect(result.fields.temperature).toBeInstanceOf(Float32Array) + expect(Array.from(result.fields.temperature as Float32Array)).toEqual([100.0, 200.0]) + }) + + it('handles arrays of objects with refs', async () => { + const temp1 = new Float32Array([100.0]) + const temp2 = new Float32Array([200.0]) + const packed1 = await packArray(temp1, 'float32') + const packed2 = await packArray(temp2, 'float32') + const checksum1 = await sha256(packed1) + const checksum2 = await sha256(packed2) + + const data = { + snapshots: [ + { time: 0.0, temperature: { $ref: checksum1 } }, + { time: 1.0, temperature: { $ref: checksum2 } } + ] + } + + type Snapshot = { time: number; temperature: Float32Array } + const result = await Packable.reconstruct<{ snapshots: Snapshot[] }>(data, { + [checksum1]: packed1, + [checksum2]: packed2 + }) + + expect(result.snapshots).toHaveLength(2) + expect(result.snapshots[0].time).toBe(0.0) + expect(Array.from(result.snapshots[0].temperature)).toEqual([100.0]) + expect(result.snapshots[1].time).toBe(1.0) + expect(Array.from(result.snapshots[1].temperature)).toEqual([200.0]) + }) + }) + + describe('with callable assets (lazy loading)', () => { + it('defers loading until field access', async () => { + const extracted = await createExtractedData( + { name: 'lazy_test', time: 0.5 }, + { temperature: new Float32Array([300.0, 301.0]) } + ) + + const requestedChecksums: string[] = [] + const loader: AssetProvider = async (checksum: string) => { + requestedChecksums.push(checksum) + return extracted.assets[checksum] + } + + // Access primitive field only + 
const result = await Packable.reconstruct(extracted.data, loader) + + // Primitive field should be available + expect(result.name).toBe('lazy_test') + expect(result.time).toBe(0.5) + + // Array should have been fetched (TypeScript version is eager with callable too) + // Note: Unlike Python's LazyModel, TS reconstruct resolves all refs + expect(requestedChecksums.length).toBe(1) + }) + + it('throws when callable returns missing asset', async () => { + const data = { values: { $ref: 'missing' } } + const failingLoader: AssetProvider = async () => { + throw new Error("Missing asset with checksum 'missing'") + } + + await expect(Packable.reconstruct(data, failingLoader)).rejects.toThrow(/Missing asset/) + }) + }) + + describe('with schema for Packables', () => { + it('decodes nested Packable using schema decoder', async () => { + // Create a mock "packable" that's just raw bytes representing a simple object + const mockPackableBytes = new TextEncoder().encode(JSON.stringify({ type: 'mock', value: 42 })) + const checksum = await sha256(mockPackableBytes) + + // Custom decoder that parses the JSON + const mockDecoder = (data: Uint8Array | ArrayBuffer) => { + const bytes = data instanceof Uint8Array ? data : new Uint8Array(data) + return JSON.parse(new TextDecoder().decode(bytes)) + } + + const data = { + name: 'with_nested', + nested: { $ref: checksum } + } + + const schema: ReconstructSchema = { + nested: { type: 'packable', decode: mockDecoder } + } + + const result = await Packable.reconstruct(data, { [checksum]: mockPackableBytes }, schema) + + expect(result.name).toBe('with_nested') + expect(result.nested).toEqual({ type: 'mock', value: 42 }) + }) + + it('handles array of Packables with element schema', async () => { + const item1Bytes = new TextEncoder().encode(JSON.stringify({ id: 1 })) + const item2Bytes = new TextEncoder().encode(JSON.stringify({ id: 2 })) + const checksum1 = await sha256(item1Bytes) + const checksum2 = await sha256(item2Bytes) + + const mockDecoder = (data: Uint8Array | ArrayBuffer) => { + const bytes = data instanceof Uint8Array ? 
data : new Uint8Array(data) + return JSON.parse(new TextDecoder().decode(bytes)) + } + + const data = { + items: [{ $ref: checksum1 }, { $ref: checksum2 }] + } + + const schema: ReconstructSchema = { + items: { + type: 'array', + element: { type: 'packable', decode: mockDecoder } + } + } + + const result = await Packable.reconstruct(data, { + [checksum1]: item1Bytes, + [checksum2]: item2Bytes + }, schema) + + expect(result.items).toHaveLength(2) + expect(result.items[0]).toEqual({ id: 1 }) + expect(result.items[1]).toEqual({ id: 2 }) + }) + }) + + describe('_decodePackedArray', () => { + it('decodes packed array format correctly', async () => { + const original = new Float32Array([1.0, 2.0, 3.0, 4.0]) + const packed = await packArray(original, 'float32') + + const decoded = Packable._decodePackedArray(packed) + + expect(decoded).toBeInstanceOf(Float32Array) + expect(Array.from(decoded as Float32Array)).toEqual([1.0, 2.0, 3.0, 4.0]) + }) + + it('handles uint32 arrays', async () => { + const original = new Uint32Array([10, 20, 30]) + const packed = await packArray(original, 'uint32') + + const decoded = Packable._decodePackedArray(packed) + + expect(decoded).toBeInstanceOf(Uint32Array) + expect(Array.from(decoded as Uint32Array)).toEqual([10, 20, 30]) + }) + }) +}) + +describe('CachedAssetLoader', () => { + it('caches fetched assets', async () => { + const extracted = await createExtractedData({}, { values: new Float32Array([1.0, 2.0]) }) + const checksum = Object.keys(extracted.assets)[0] + + let fetchCount = 0 + const fetcher = async (c: string) => { + fetchCount++ + return extracted.assets[c] + } + + // Create a simple in-memory cache + const cache: Record = {} + const mockHandler: DataHandler = { + async readBinary(path: string) { + return cache[path] + }, + async writeBinary(path: string, content: Uint8Array | ArrayBuffer) { + cache[path] = content instanceof Uint8Array ? content : new Uint8Array(content) + }, + async exists(path: string) { + return path in cache + } + } + + const loader = new CachedAssetLoader(fetcher, mockHandler) + + // First fetch - should call fetcher + const result1 = await loader.getAsset(checksum) + expect(fetchCount).toBe(1) + expect(result1).toBeDefined() + + // Second fetch - should use cache + const result2 = await loader.getAsset(checksum) + expect(fetchCount).toBe(1) // Still 1, cached + expect(result2).toBeDefined() + }) +}) diff --git a/typescript/src/data-handler.ts b/typescript/src/data-handler.ts index f63427b..bcf164b 100644 --- a/typescript/src/data-handler.ts +++ b/typescript/src/data-handler.ts @@ -15,6 +15,13 @@ export interface DataHandler { */ readBinary(path: string): Promise + /** + * Write binary content to a file. + * @param path - File path + * @param content - Content to write + */ + writeBinary?(path: string, content: Uint8Array | ArrayBuffer): Promise + /** * Check if a file exists. * @param path - File path @@ -24,17 +31,91 @@ export interface DataHandler { } /** - * Create a DataHandler from a simple hash loader function. - * Provides backward compatibility for function-based loaders. 
+ * Asset fetch function type - takes a checksum and returns asset bytes + */ +export type AssetFetcher = (checksum: string) => Promise + +/** + * Asset provider: either a dict of assets or a fetcher function */ -export function createDataHandler( - loader: (hash: string) => Promise -): DataHandler { - return { - readBinary: async (path: string) => { - // Extract hash from path (e.g., "abc123.zip" -> "abc123") - const hash = path.replace(/\.zip$/, '') - return loader(hash) +export type AssetProvider = Record | AssetFetcher + +/** + * Asset loader with optional disk cache for persistence. + * + * Wraps a fetch function with a DataHandler for caching. + * Fetched assets are stored as 'assets/{checksum}.bin' and read + * from cache on subsequent access. + * + * @example + * ```ts + * const loader = new CachedAssetLoader( + * async (checksum) => await fetch(`/api/assets/${checksum}`).then(r => r.arrayBuffer()), + * myDataHandler + * ) + * const model = await Packable.reconstruct(data, loader) + * ``` + */ +export class CachedAssetLoader { + constructor( + /** Function that fetches asset bytes by checksum */ + public readonly fetch: AssetFetcher, + /** DataHandler for caching fetched assets */ + public readonly cache: DataHandler + ) { } + + /** + * Get asset bytes, checking cache first then fetching if needed. + */ + async getAsset(checksum: string): Promise { + const cachePath = `assets/${checksum}.bin` + + // Try cache first + if (this.cache.exists) { + const exists = await this.cache.exists(cachePath) + if (exists) { + const cached = await this.cache.readBinary(cachePath) + if (cached) return cached + } + } else { + // No exists method, try read directly + const cached = await this.cache.readBinary(cachePath) + if (cached) return cached } + + // Fetch from source + const fetched = await this.fetch(checksum) + + // Cache for next time + if (this.cache.writeBinary) { + const data = fetched instanceof Uint8Array ? 
fetched : new Uint8Array(fetched) + await this.cache.writeBinary(cachePath, data) + } + + return fetched + } +} + + +/** + * Helper to get asset bytes from an AssetProvider + */ +export async function getAsset( + assets: AssetProvider | CachedAssetLoader, + checksum: string +): Promise { + if (assets instanceof CachedAssetLoader) { + return assets.getAsset(checksum) + } + + if (typeof assets === 'function') { + return assets(checksum) + } + + // Dict lookup + const asset = assets[checksum] + if (!asset) { + throw new Error(`Missing asset with checksum '${checksum}'`) } + return asset } diff --git a/typescript/src/index.ts b/typescript/src/index.ts index c387376..9e61841 100644 --- a/typescript/src/index.ts +++ b/typescript/src/index.ts @@ -9,12 +9,22 @@ export { CustomDecoder, CustomFieldConfig, + FieldSchema, Packable, - PackableMetadata + PackableDecoder, + PackableMetadata, + ReconstructSchema, + SerializedPackableData } from './packable' // Export from data-handler module -export { DataHandler, createDataHandler } from './data-handler' +export { + AssetFetcher, + AssetProvider, + CachedAssetLoader, + DataHandler, + getAsset +} from './data-handler' // Export from array module export { ArrayMetadata, ArrayType, ArrayUtils, EncodedArray } from './array' diff --git a/typescript/src/packable.ts b/typescript/src/packable.ts index e429005..0fac97b 100644 --- a/typescript/src/packable.ts +++ b/typescript/src/packable.ts @@ -7,8 +7,8 @@ */ import JSZip from "jszip" -import { ArrayUtils, TypedArray } from "./array" -import { DataHandler } from "./data-handler" +import { ArrayMetadata, ArrayUtils, EncodedArray, TypedArray } from "./array" +import { AssetProvider, CachedAssetLoader, getAsset } from "./data-handler" /** @@ -16,14 +16,8 @@ import { DataHandler } from "./data-handler" * Uses snake_case to match Python serialization format. */ export interface PackableMetadata { - /** Name of the class that created this data */ - class_name: string - /** Module where the class is defined */ - module_name: string /** Non-array field values */ field_data?: Record - /** SHA256 hash references for cached packable fields (field_name -> hash) */ - packable_refs?: Record } /** @@ -85,7 +79,8 @@ export class Packable { * Get custom field configurations for this class. * Subclasses override this to define custom decoders. */ - protected static getCustomFields(): Record { + // eslint-disable-next-line @typescript-eslint/no-explicit-any + protected static getCustomFields(): Record> { return {} } @@ -117,89 +112,6 @@ export class Packable { } } - // ============================================================ - // Packable field handling - // ============================================================ - - /** - * Get packable field types for this class. - * Subclasses override this to declare nested Packable fields. - * Returns a map of field names to their Packable subclass constructors. - */ - protected static getPackableFieldTypes(): Record { - return {} - } - - /** - * Get the set of packable field names - */ - protected static getPackableFieldNames(): Set { - return new Set(Object.keys(this.getPackableFieldTypes())) - } - - /** - * Decode packable fields from the zip or cache. - * - * Supports both embedded packables (in packables/ folder) and cached - * packables (referenced by SHA256 hash in metadata.packable_refs). 
- */ - protected static async decodePackableFields( - zip: JSZip, - metadata: PackableMetadata, - data: Record, - cacheHandler?: DataHandler - ): Promise { - const packableFieldTypes = this.getPackableFieldTypes() - const loadedFields = new Set() - - // First, try to load from cache using hash refs - if (cacheHandler && metadata.packable_refs) { - for (const [fieldName, hash] of Object.entries(metadata.packable_refs)) { - const PackableClass = packableFieldTypes[fieldName] - if (!PackableClass) continue - - try { - const cachedData = await cacheHandler.readBinary(`${hash}.zip`) - if (cachedData) { - // Use the specific subclass's decode method with cache support - data[fieldName] = await PackableClass.decode(cachedData, cacheHandler) - loadedFields.add(fieldName) - } - } catch { - // Not in cache, will try embedded - } - } - } - - // Then load any embedded packables (for backward compatibility or no-cache case) - const packablesFolder = zip.folder("packables") - if (!packablesFolder) return - - const packableFiles: string[] = [] - packablesFolder.forEach((relativePath, file) => { - if (relativePath.endsWith(".zip") && !file.dir) { - packableFiles.push(relativePath) - } - }) - - for (const relativePath of packableFiles) { - // Extract field name: "inner_mesh.zip" -> "inner_mesh" - const fieldName = relativePath.slice(0, -4) - - // Skip if already loaded from cache - if (loadedFields.has(fieldName)) continue - - const PackableClass = packableFieldTypes[fieldName] - if (!PackableClass) continue - - const file = packablesFolder.file(relativePath) - if (file) { - const encodedBytes = await file.async('arraybuffer') - data[fieldName] = await PackableClass.decode(encodedBytes, cacheHandler) - } - } - } - // ============================================================ // Standard array loading // ============================================================ @@ -260,21 +172,15 @@ export class Packable { * Decode a Packable from zip data. * * @param zipData - Zip file bytes - * @param cacheHandler - Optional DataHandler to load cached packables by SHA256 hash. - * When provided and metadata contains packable_refs, - * nested packables are loaded from cache. * * Subclasses can override this to handle custom field decoding. */ static async decode( - zipData: ArrayBuffer | Uint8Array, - cacheHandler?: DataHandler + zipData: ArrayBuffer | Uint8Array ): Promise> { const zip = await JSZip.loadAsync(zipData) const metadata = await Packable.loadMetadata(zip) const customFieldNames = this.getCustomFieldNames() - const packableFieldNames = this.getPackableFieldNames() - const skipFields = new Set([...customFieldNames, ...packableFieldNames]) const data: Record = {} @@ -282,10 +188,7 @@ export class Packable { await this.decodeCustomFields(zip, metadata, data) // Load standard arrays - await this.loadStandardArrays(zip, data, skipFields) - - // Decode packable fields - await this.decodePackableFields(zip, metadata, data, cacheHandler) + await this.loadStandardArrays(zip, data, customFieldNames) // Merge non-array fields from metadata if (metadata.field_data) { @@ -363,4 +266,184 @@ export class Packable { const zip = await JSZip.loadAsync(zipData) return ArrayUtils.loadArray(zip, name) } + + // ============================================================ + // Extract / Reconstruct for content-addressable storage + // ============================================================ + + /** + * Decode a packed array asset (metadata + data bytes) to a TypedArray. 
+ * + * Format: [4 bytes metadata length][metadata json][array data] + */ + static _decodePackedArray(packed: Uint8Array | ArrayBuffer): TypedArray { + const bytes = packed instanceof Uint8Array ? packed : new Uint8Array(packed) + + // Read metadata length (4 bytes little-endian) + const metadataLen = bytes[0] | (bytes[1] << 8) | (bytes[2] << 16) | (bytes[3] << 24) + + // Parse metadata JSON + const metadataJson = new TextDecoder().decode(bytes.slice(4, 4 + metadataLen)) + const metadata: ArrayMetadata = JSON.parse(metadataJson) + + // Get array data + const arrayData = bytes.slice(4 + metadataLen) + + const encoded: EncodedArray = { data: arrayData, metadata } + return ArrayUtils.decodeArray(encoded) + } + + /** + * Reconstruct a data object from extracted data and assets. + * + * Since TypeScript doesn't have runtime type information like Python's Pydantic, + * this provides a simpler approach: + * - Resolves $ref references to arrays or nested Packables + * - Uses the optional `schema` to determine which refs are Packables vs arrays + * + * @param data - The data dict from extract(), with $ref references + * @param assets - Asset provider (dict, function, or CachedAssetLoader) + * @param schema - Optional schema defining which fields are Packables + * @returns Reconstructed data object with resolved references + * + * @example + * ```ts + * // Simple case - all $refs are arrays + * const rebuilt = await Packable.reconstruct(data, assets) + * + * // With nested Packables - define schema + * const schema: ReconstructSchema = { + * mesh: { type: 'packable', decode: Mesh.decode }, + * snapshots: { + * type: 'array', + * element: { + * mesh: { type: 'packable', decode: Mesh.decode } + * } + * } + * } + * const rebuilt = await Packable.reconstruct(data, assets, schema) + * ``` + */ + static async reconstruct>( + data: Record, + assets: AssetProvider | CachedAssetLoader, + schema?: ReconstructSchema + ): Promise { + const result: Record = {} + + for (const [key, value] of Object.entries(data)) { + const fieldSchema = schema?.[key] + result[key] = await Packable._resolveValue(value, assets, fieldSchema) + } + + return result as T + } + + /** + * Resolve a single value, handling $ref, nested objects, and arrays. + */ + private static async _resolveValue( + value: unknown, + assets: AssetProvider | CachedAssetLoader, + schema?: FieldSchema + ): Promise { + if (value === null || value === undefined) { + return value + } + + // Handle $ref references + if (isRefObject(value)) { + const checksum = value.$ref + const assetBytes = await getAsset(assets, checksum) + const bytes = assetBytes instanceof Uint8Array ? assetBytes : new Uint8Array(assetBytes) + + // Use schema to determine type, default to array + if (schema?.type === 'packable' && schema.decode) { + return schema.decode(bytes) + } + + // Default: decode as array + return Packable._decodePackedArray(bytes) + } + + // Handle arrays (JS arrays, not TypedArrays) + if (Array.isArray(value)) { + const elementSchema = schema?.type === 'array' ? 
schema.element : undefined + return Promise.all( + value.map(v => Packable._resolveValue(v, assets, elementSchema)) + ) + } + + // Handle nested objects + if (typeof value === 'object' && !ArrayBuffer.isView(value)) { + const obj = value as Record + const result: Record = {} + + for (const [k, v] of Object.entries(obj)) { + // Skip Python model metadata + if (k === '__model_class__' || k === '__model_module__') continue + + // Get nested schema if this is a dict schema + const nestedSchema = schema?.type === 'dict' ? schema.value : + schema?.type === 'object' ? schema.fields?.[k] : undefined + result[k] = await Packable._resolveValue(v, assets, nestedSchema) + } + + return result + } + + // Primitive - return as-is + return value + } +} + + +// ============================================================ +// Reconstruct Schema Types +// ============================================================ + +/** + * Reference object with $ref checksum + */ +interface RefObject { + $ref: string +} + +function isRefObject(value: unknown): value is RefObject { + return typeof value === 'object' && value !== null && '$ref' in value +} + +/** + * Decoder function for Packable types + */ +export type PackableDecoder = (data: Uint8Array | ArrayBuffer) => Promise | T + +/** + * Schema for a single field in reconstruct + */ +export type FieldSchema = + | { type: 'array'; element?: FieldSchema } // TypedArray or Array of items + | { type: 'packable'; decode: PackableDecoder } // Nested Packable + | { type: 'dict'; value?: FieldSchema } // Dict with uniform value type + | { type: 'object'; fields?: ReconstructSchema } // Object with known field types + +/** + * Schema mapping field names to their types for reconstruction. + * + * Without runtime type information, TypeScript needs hints to know + * which $ref values are Packables vs arrays. + */ +export type ReconstructSchema = Record + +/** + * Result of extracting a Packable for serialization. + * + * Contains the serializable data dict with checksum references, + * plus the encoded assets (arrays as bytes). 
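+ *
+ * A minimal illustrative sketch (the checksum 'abc123' and the field names
+ * are made up, and the packed bytes are elided):
+ *
+ * @example
+ * ```ts
+ * // bytes as produced for one encoded array (contents omitted here)
+ * const packedTemperatureBytes = new Uint8Array([])
+ * const extracted: SerializedPackableData = {
+ *   data: { time: 0.5, temperature: { $ref: 'abc123' } },
+ *   assets: { abc123: packedTemperatureBytes }
+ * }
+ * ```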
+ */ +export interface SerializedPackableData { + /** Serializable dict with primitive fields and checksum refs for arrays */ + data: Record + /** Map of checksum -> encoded bytes for all arrays */ + assets: Record } From 3e7321f448e4d8967240a7d5bcc1e46f993fb7bb Mon Sep 17 00:00:00 2001 From: Afshawn Lotfi Date: Sun, 18 Jan 2026 05:52:56 +0000 Subject: [PATCH 2/4] feat: enhance asset fetching with async support and improved error handling in CachedAssetLoader --- python/meshly/data_handler.py | 22 +- python/meshly/packable.py | 507 +++++++++++++++++++--------------- 2 files changed, 307 insertions(+), 222 deletions(-) diff --git a/python/meshly/data_handler.py b/python/meshly/data_handler.py index 69c1cfa..ec7d5e2 100644 --- a/python/meshly/data_handler.py +++ b/python/meshly/data_handler.py @@ -1,6 +1,6 @@ import stat from dataclasses import dataclass -from typing import Callable, Dict, List, Optional, Union +from typing import Awaitable, Callable, Dict, List, Optional, Union import zipfile from io import BytesIO from pathlib import Path @@ -10,7 +10,9 @@ HandlerSource = Union[PathLike, BytesIO] # Type for asset provider: either a dict or a callable that fetches by checksum -AssetFetcher = Callable[[str], bytes] +# Supports both sync and async fetch functions +# The callable can return None to indicate the asset should be read from cache +AssetFetcher = Callable[[str], Union[bytes, None, Awaitable[Optional[bytes]]]] AssetProvider = Union[Dict[str, bytes], AssetFetcher, "CachedAssetLoader"] @@ -21,10 +23,18 @@ class CachedAssetLoader: Wraps a callable asset fetcher with a DataHandler for caching. Fetched assets are stored as 'assets/{checksum}.bin' and read from cache on subsequent access. + + The fetch callable can return None to indicate the asset is not + available from the remote source, in which case the loader will + attempt to read from the cache. If not in cache either, a KeyError + is raised. Example: - def fetch_from_cloud(checksum: str) -> bytes: - return cloud_storage.download(checksum) + def fetch_from_cloud(checksum: str) -> bytes | None: + try: + return cloud_storage.download(checksum) + except NotFoundError: + return None # Will fall back to cache # Create loader with disk cache cache = DataHandler.create(Path("./cache")) @@ -33,9 +43,9 @@ def fetch_from_cloud(checksum: str) -> bytes: lazy = Packable.reconstruct(SimulationCase, data, loader) """ fetch: AssetFetcher - """Callable that fetches asset bytes by checksum""" + """Callable that fetches asset bytes by checksum (can return None to use cache)""" cache: "DataHandler" - """DataHandler for caching fetched assets""" + """DataHandler for caching fetched assets""" class DataHandler: diff --git a/python/meshly/packable.py b/python/meshly/packable.py index 186c81d..4fc3bc2 100644 --- a/python/meshly/packable.py +++ b/python/meshly/packable.py @@ -11,25 +11,29 @@ use the extract() and reconstruct() methods to handle asset management. 
""" +import asyncio import hashlib +import inspect import json +from collections.abc import Callable from dataclasses import dataclass, field from functools import cached_property from io import BytesIO from pathlib import Path from typing import ( - Callable, + Any, Dict, Generic, Optional, Set, Type, - Any, TypeVar, Union, ) + from pydantic import BaseModel, Field, computed_field -from .array import ArrayUtils, ArrayType, Array + +from .array import Array, ArrayType, ArrayUtils from .common import PathLike from .data_handler import AssetProvider, CachedAssetLoader, DataHandler @@ -38,8 +42,8 @@ class PackableMetadata(BaseModel): """Metadata for a Packable saved to zip.""" - field_data: Dict[str, Any] = Field( - default_factory=dict, description="Non-array field values") + + field_data: dict[str, Any] = Field(default_factory=dict, description="Non-array field values") TPackableMetadata = TypeVar("TPackableMetadata", bound=PackableMetadata) @@ -50,132 +54,150 @@ class PackableMetadata(BaseModel): @dataclass class SerializedPackableData: """Result of extracting a Packable for serialization. - + Contains the serializable data dict with checksum references, plus the encoded assets (arrays as bytes). """ - data: Dict[str, Any] + + data: dict[str, Any] """Serializable dict with primitive fields and checksum refs for arrays""" - assets: Dict[str, bytes] + assets: dict[str, bytes] """Map of checksum -> encoded bytes for all arrays""" class LazyModel(Generic[TModel]): """ Lazy proxy for a Pydantic BaseModel that defers asset loading until field access. - + Fields containing $ref references are not resolved until accessed, allowing for truly lazy loading from external storage. - + Example: def fetch_asset(checksum: str) -> bytes: return cloud_storage.download(checksum) - + lazy = Packable.reconstruct(SimulationCase, data, fetch_asset) # No assets loaded yet - + temp = lazy.temperature # NOW the temperature asset is fetched vel = lazy.velocity # NOW the velocity asset is fetched - + # With a cache handler for persistence: cache = DataHandler.create(Path("./cache")) loader = CachedAssetLoader(fetch_asset, cache) lazy = Packable.reconstruct(SimulationCase, data, loader) """ - - __slots__ = ('_model_class', '_data', '_assets', '_array_type', '_cache', '_resolved') - + + __slots__ = ("_model_class", "_data", "_assets", "_array_type", "_cache", "_resolved") + def __init__( self, - model_class: Type[TModel], - data: Dict[str, Any], + model_class: type[TModel], + data: dict[str, Any], assets: AssetProvider, - array_type: Optional[ArrayType] = None, + array_type: ArrayType | None = None, ): - object.__setattr__(self, '_model_class', model_class) - object.__setattr__(self, '_data', data) - object.__setattr__(self, '_assets', assets) - object.__setattr__(self, '_array_type', array_type) - object.__setattr__(self, '_cache', {}) - object.__setattr__(self, '_resolved', None) - + object.__setattr__(self, "_model_class", model_class) + object.__setattr__(self, "_data", data) + object.__setattr__(self, "_assets", assets) + object.__setattr__(self, "_array_type", array_type) + object.__setattr__(self, "_cache", {}) + object.__setattr__(self, "_resolved", None) + def _get_cached_asset(self, checksum: str) -> bytes: """Get asset bytes, using cache if CachedAssetLoader is provided.""" - assets = object.__getattribute__(self, '_assets') - + assets = object.__getattribute__(self, "_assets") + # Handle CachedAssetLoader if isinstance(assets, CachedAssetLoader): cache_path = f"assets/{checksum}.bin" - + # Try to read from 
cache first try: return assets.cache.read_binary(cache_path) except (KeyError, FileNotFoundError): pass - - # Fetch from provider - asset_bytes = assets.fetch(checksum) - + + # Fetch from provider (may be sync or async) + result = assets.fetch(checksum) + if inspect.isawaitable(result): + result = asyncio.get_event_loop().run_until_complete(result) + + # If fetch returned None, try cache again (it might have been populated elsewhere) + # If still not found, raise error + if result is None: + try: + return assets.cache.read_binary(cache_path) + except (KeyError, FileNotFoundError): + raise KeyError(f"Asset '{checksum}' not found in remote or cache") + + asset_bytes = result + # Store in cache assets.cache.write_binary(cache_path, asset_bytes) return asset_bytes - - # Handle plain callable + + # Handle plain callable (may be sync or async) if callable(assets): - return assets(checksum) - + result = assets(checksum) + if inspect.isawaitable(result): + result = asyncio.get_event_loop().run_until_complete(result) + if result is None: + raise KeyError(f"Asset fetcher returned None for checksum '{checksum}'") + return result + # Handle dict if checksum not in assets: raise KeyError(f"Missing asset with checksum '{checksum}'") return assets[checksum] - + def __getattr__(self, name: str) -> Any: # Check cache first - cache = object.__getattribute__(self, '_cache') + cache = object.__getattribute__(self, "_cache") if name in cache: return cache[name] - - model_class = object.__getattribute__(self, '_model_class') - data = object.__getattribute__(self, '_data') - array_type = object.__getattribute__(self, '_array_type') - + + model_class = object.__getattribute__(self, "_model_class") + data = object.__getattribute__(self, "_data") + array_type = object.__getattribute__(self, "_array_type") + # Check if it's a model field if name not in model_class.model_fields: raise AttributeError(f"'{model_class.__name__}' has no attribute '{name}'") - + if name not in data: return None - + field_value = data[name] field_type = model_class.model_fields[name].annotation - + # Resolve this specific field using our caching asset getter resolved = Packable._resolve_value_with_type( field_value, field_type, self._get_cached_asset, array_type ) - + # Cache the resolved value cache[name] = resolved return resolved - + def __setattr__(self, name: str, value: Any) -> None: raise AttributeError("LazyModel is read-only. Use resolve() to get a mutable model.") - + def resolve(self) -> TModel: """ Fully resolve all fields and return the actual Pydantic model. - + This will fetch all remaining assets that haven't been accessed yet. 
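
        Example (sketch, reusing the SimulationCase and fetch_asset names from
        the class docstring above):

            lazy = Packable.reconstruct(SimulationCase, data, fetch_asset)
            temp = lazy.temperature   # fetches only the temperature asset
            case = lazy.resolve()     # fetches the remaining assets
            assert isinstance(case, SimulationCase)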
""" - resolved = object.__getattribute__(self, '_resolved') + resolved = object.__getattribute__(self, "_resolved") if resolved is not None: return resolved - - model_class = object.__getattribute__(self, '_model_class') - data = object.__getattribute__(self, '_data') - array_type = object.__getattribute__(self, '_array_type') - cache = object.__getattribute__(self, '_cache') - + + model_class = object.__getattribute__(self, "_model_class") + data = object.__getattribute__(self, "_data") + array_type = object.__getattribute__(self, "_array_type") + cache = object.__getattribute__(self, "_cache") + # Resolve all fields, using cache where available resolved_data = {} for field_name, field_info in model_class.model_fields.items(): @@ -185,15 +207,15 @@ def resolve(self) -> TModel: resolved_data[field_name] = Packable._resolve_value_with_type( data[field_name], field_info.annotation, self._get_cached_asset, array_type ) - + result = model_class(**resolved_data) - object.__setattr__(self, '_resolved', result) + object.__setattr__(self, "_resolved", result) return result - + def __repr__(self) -> str: - model_class = object.__getattribute__(self, '_model_class') - cache = object.__getattribute__(self, '_cache') - data = object.__getattribute__(self, '_data') + model_class = object.__getattribute__(self, "_model_class") + cache = object.__getattribute__(self, "_cache") + data = object.__getattribute__(self, "_data") loaded = list(cache.keys()) pending = [k for k in data.keys() if k not in cache] return f"LazyModel[{model_class.__name__}](loaded={loaded}, pending={pending})" @@ -202,12 +224,12 @@ def __repr__(self) -> str: @dataclass class CustomFieldConfig(Generic[FieldValue, TPackableMetadata]): """Configuration for custom field encoding/decoding.""" + file_name: str """File name in zip (without .bin extension)""" encode: Callable[[FieldValue, Any], bytes] """Encoder function: (value, instance) -> bytes""" - decode: Callable[[bytes, TPackableMetadata, - Optional[ArrayType]], FieldValue] + decode: Callable[[bytes, TPackableMetadata, ArrayType | None], FieldValue] """Decoder function: (bytes, metadata, array_type) -> value""" optional: bool = False """Whether the field is optional (won't throw if missing)""" @@ -237,10 +259,10 @@ class SimulationResult(Packable): velocity=np.zeros((3, 3)) ) result.save_to_zip("result.zip") - + # Load using the specific class loaded = SimulationResult.load_from_zip("result.zip") - + # Or use extract/reconstruct for custom asset management extracted = result.extract() # extracted.data contains {"time": 0.1, "temperature": {"$ref": "abc123"}, ...} @@ -257,7 +279,7 @@ def __init__(self, **data): def _validate_no_direct_packable_fields(self) -> None: """Validate that this Packable has no direct Packable fields. - + Packables nested inside dicts or other BaseModels are allowed and will be handled by extract(). Only direct Packable fields are prohibited. """ @@ -267,7 +289,7 @@ def _validate_no_direct_packable_fields(self) -> None: value = getattr(self, field_name, None) if value is None: continue - + # Only reject direct Packable fields if isinstance(value, Packable): raise TypeError( @@ -276,18 +298,9 @@ def _validate_no_direct_packable_fields(self) -> None: "inside dicts or other BaseModels, and extract() will handle them." ) - @computed_field - @cached_property - def checksum(self) -> str: - """ - Compute SHA256 checksum of the encoded content. 
- Returns: - 16-character hex string (first 64 bits of SHA256) - """ - return hashlib.sha256(self.encode()).hexdigest()[:16] @property - def array_fields(self) -> Set[str]: + def array_fields(self) -> set[str]: """Get all array field paths, including nested arrays in dicts/BaseModels.""" result = set() for field_name in type(self).model_fields: @@ -295,11 +308,10 @@ def array_fields(self) -> Set[str]: continue value = getattr(self, field_name, None) if value is not None: - result.update(ArrayUtils.extract_nested_arrays( - value, field_name).keys()) + result.update(ArrayUtils.extract_nested_arrays(value, field_name).keys()) return result - def _extract_non_array_fields(self) -> Dict[str, Any]: + def _extract_non_array_fields(self) -> dict[str, Any]: """Extract non-array field values for metadata, preserving BaseModel type info.""" model_data = {} direct_arrays = {f for f in self.array_fields if "." not in f} @@ -313,7 +325,7 @@ def _extract_non_array_fields(self) -> Dict[str, Any]: model_data[name] = extracted return model_data - def _create_metadata(self, field_data: Dict[str, Any]) -> PackableMetadata: + def _create_metadata(self, field_data: dict[str, Any]) -> PackableMetadata: """ Create metadata for this Packable. @@ -331,9 +343,7 @@ def _create_metadata(self, field_data: Dict[str, Any]) -> PackableMetadata: @classmethod def load_metadata( - cls, - handler: DataHandler, - metadata_cls: Type[TPackableMetadata] = PackableMetadata + cls, handler: DataHandler, metadata_cls: type[TPackableMetadata] = PackableMetadata ) -> TPackableMetadata: """ Load and validate metadata using a read handler. @@ -367,9 +377,9 @@ def save_to_zip( @classmethod def load_from_zip( - cls: Type[TPackable], + cls: type[TPackable], source: Union[PathLike, BytesIO], - array_type: Optional[ArrayType] = None, + array_type: ArrayType | None = None, ) -> TPackable: """ Load a Packable from a zip file. @@ -382,7 +392,7 @@ def load_from_zip( Returns: Loaded Packable instance - + Raises: TypeError: If called on base Packable class instead of a subclass @@ -397,7 +407,7 @@ def load_from_zip( return cls.decode(f.read(), array_type) @classmethod - def _get_custom_fields(cls) -> Dict[str, CustomFieldConfig]: + def _get_custom_fields(cls) -> dict[str, CustomFieldConfig]: """ Get custom field configurations for this class. 
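
        Example (illustrative sketch only; "thumbnail" is a hypothetical
        raw-bytes field, shown just to match the CustomFieldConfig signature):

            @classmethod
            def _get_custom_fields(cls) -> dict[str, CustomFieldConfig]:
                return {
                    "thumbnail": CustomFieldConfig(
                        file_name="thumbnail",
                        encode=lambda value, instance: bytes(value),
                        decode=lambda raw, metadata, array_type: raw,
                        optional=True,
                    )
                }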
@@ -409,7 +419,7 @@ def _get_custom_fields(cls) -> Dict[str, CustomFieldConfig]: return {} @classmethod - def _get_custom_field_names(cls) -> Set[str]: + def _get_custom_field_names(cls) -> set[str]: """Get set of field names that have custom encoding/decoding.""" return set(cls._get_custom_fields().keys()) @@ -418,27 +428,27 @@ def _decode_custom_fields( cls, handler: DataHandler, metadata: PackableMetadata, - data: Dict[str, Any], - array_type: Optional[ArrayType] = None + data: dict[str, Any], + array_type: ArrayType | None = None, ) -> None: """Decode fields with custom decoders.""" for field_name, config in cls._get_custom_fields().items(): try: encoded_bytes = handler.read_binary(f"{config.file_name}.bin") - data[field_name] = config.decode( - encoded_bytes, metadata, array_type) + data[field_name] = config.decode(encoded_bytes, metadata, array_type) except (KeyError, FileNotFoundError): if not config.optional: raise ValueError( - f"Required custom field '{field_name}' ({config.file_name}.bin) not found in zip") + f"Required custom field '{field_name}' ({config.file_name}.bin) not found in zip" + ) @classmethod def _load_standard_arrays( cls, handler: DataHandler, - data: Dict[str, Any], - skip_fields: Set[str], - array_type: Optional[ArrayType] = None + data: dict[str, Any], + skip_fields: set[str], + array_type: ArrayType | None = None, ) -> None: """Load standard arrays from arrays/ folder, skipping custom fields.""" try: @@ -475,7 +485,7 @@ def _load_standard_arrays( # Flat array data[name] = decoded - def _encode_standard_arrays(self, skip_fields: Set[str]) -> Dict[str, bytes]: + def _encode_standard_arrays(self, skip_fields: set[str]) -> dict[str, bytes]: """Encode standard arrays, skipping custom fields.""" encoded_arrays = {} @@ -506,8 +516,7 @@ def _encode_standard_arrays(self, skip_fields: Set[str]) -> Dict[str, bytes]: try: array = getattr(self, field_name) if ArrayUtils.is_array(array): - encoded_arrays[field_name] = ArrayUtils.encode_array( - array) + encoded_arrays[field_name] = ArrayUtils.encode_array(array) except AttributeError: pass @@ -559,9 +568,9 @@ def encode(self) -> bytes: @classmethod def decode( - cls: Type[TPackable], + cls: type[TPackable], buf: bytes, - array_type: Optional[ArrayType] = None, + array_type: ArrayType | None = None, ) -> TPackable: """ Deserialize a Packable from bytes. @@ -573,7 +582,7 @@ def decode( Returns: Loaded Packable instance - + Raises: TypeError: If called on base Packable class instead of a subclass """ @@ -582,14 +591,14 @@ def decode( "Cannot decode on base Packable class. " "Use the specific subclass: MyClass.decode(...)" ) - + handler = DataHandler.create(BytesIO(buf)) metadata = cls.load_metadata(handler) # Fields to skip when loading standard arrays skip_fields = cls._get_custom_field_names() - data: Dict[str, Any] = {} + data: dict[str, Any] = {} # Decode custom fields first cls._decode_custom_fields(handler, metadata, data, array_type) @@ -607,22 +616,22 @@ def decode( def extract(obj: BaseModel) -> SerializedPackableData: """ Extract arrays and Packables from a Pydantic BaseModel into serializable data and assets. - + Args: obj: A Pydantic BaseModel instance (including Packable subclasses) - + Returns an ExtractedPackable with: - data: A JSON-serializable dict with `{"$ref": checksum}` for arrays/Packables - assets: A dict mapping checksums to encoded bytes - + Arrays and nested Packables are stored as assets. The type information comes from the Pydantic schema when reconstructing, so no class/module info is stored. 
- + Example: mesh = Mesh(vertices=..., indices=...) extracted = Packable.extract(mesh) # extracted.data = {"vertices": {"$ref": "abc..."}, "indices": {"$ref": "def..."}} - + rebuilt = Mesh.reconstruct(extracted.data, extracted.assets) """ if not isinstance(obj, BaseModel): @@ -630,50 +639,113 @@ def extract(obj: BaseModel) -> SerializedPackableData: f"extract() requires a Pydantic BaseModel, got {type(obj).__name__}. " "Use Pydantic models for type-safe extraction and reconstruction." ) - - assets: Dict[str, bytes] = {} - data: Dict[str, Any] = {} - + + assets: dict[str, bytes] = {} + data: dict[str, Any] = {} + for field_name in type(obj).model_fields: - if hasattr(obj, '__private_attributes__') and field_name in obj.__private_attributes__: + if hasattr(obj, "__private_attributes__") and field_name in obj.__private_attributes__: continue value = getattr(obj, field_name, None) if value is None: continue data[field_name] = Packable._extract_value(value, assets) - + return SerializedPackableData(data=data, assets=assets) - + + + @staticmethod + def compute_checksum( + obj: Union[bytes, "SerializedPackableData", "Packable", BaseModel], + ) -> str: + """ + Compute SHA256 checksum for various types of data. + + Supports: + - bytes: Direct checksum of the bytes + - SerializedPackableData: Checksum of JSON-serialized data + sorted asset checksums + - Packable: Checksum of encoded zip bytes + - BaseModel: Extract to SerializedPackableData and compute checksum + + Args: + obj: The object to compute checksum for + + Returns: + 16-character hex string (first 64 bits of SHA256) + + Example: + # Direct bytes checksum + checksum = compute_checksum(some_bytes) + + # Packable checksum + mesh = Mesh(vertices=..., indices=...) + checksum = compute_checksum(mesh) + + # Extracted data checksum + extracted = Packable.extract(mesh) + checksum = compute_checksum(extracted) + + # Any Pydantic BaseModel + checksum = compute_checksum(my_pydantic_model) + """ + if isinstance(obj, bytes): + return hashlib.sha256(obj).hexdigest()[:16] + + if isinstance(obj, SerializedPackableData): + # Combine data JSON + all asset bytes for deterministic hashing + data_json = json.dumps(obj.data, sort_keys=True).encode("utf-8") + # Hash data first, then add sorted asset contents + hasher = hashlib.sha256() + hasher.update(data_json) + hasher.update(b"\x00") + # Include actual asset bytes in sorted order for content-based hashing + for checksum in sorted(obj.assets.keys()): + hasher.update(obj.assets[checksum]) + return hasher.hexdigest()[:16] + + if isinstance(obj, Packable): + return hashlib.sha256(obj.encode()).hexdigest()[:16] + + if isinstance(obj, BaseModel): + # Extract and compute checksum of the extracted data + extracted = Packable.extract(obj) + return Packable.compute_checksum(extracted) + + raise TypeError( + f"compute_checksum() requires bytes, SerializedPackableData, Packable, or BaseModel, " + f"got {type(obj).__name__}" + ) + @staticmethod - def _extract_value(value: Any, assets: Dict[str, bytes]) -> Any: + def _extract_value(value: Any, assets: dict[str, bytes]) -> Any: """Recursively extract a value, replacing arrays and nested Packables with refs.""" # Handle arrays if ArrayUtils.is_array(value): encoded = ArrayUtils.encode_array(value) # Pack metadata + data together as bytes for the asset - metadata_json = json.dumps(encoded.metadata.model_dump()).encode('utf-8') + metadata_json = json.dumps(encoded.metadata.model_dump()).encode("utf-8") # Format: [4 bytes metadata length][metadata json][array data] - packed = 
len(metadata_json).to_bytes(4, 'little') + metadata_json + encoded.data - checksum = hashlib.sha256(packed).hexdigest()[:16] + packed = len(metadata_json).to_bytes(4, "little") + metadata_json + encoded.data + checksum = Packable.compute_checksum(packed) assets[checksum] = packed return {"$ref": checksum} - + # Handle Packables - extract as encoded zip bytes if isinstance(value, Packable): encoded = value.encode() - checksum = hashlib.sha256(encoded).hexdigest()[:16] + checksum = Packable.compute_checksum(encoded) assets[checksum] = encoded return {"$ref": checksum} - + # Handle dicts if isinstance(value, dict): return {k: Packable._extract_value(v, assets) for k, v in value.items()} - + # Handle lists/tuples if isinstance(value, (list, tuple)): result = [Packable._extract_value(v, assets) for v in value] return result if isinstance(value, list) else tuple(result) - + # Handle non-Packable BaseModels - recursively extract their fields if isinstance(value, BaseModel): extracted = {} @@ -682,36 +754,43 @@ def _extract_value(value: Any, assets: Dict[str, bytes]) -> Any: if field_value is not None: extracted[name] = Packable._extract_value(field_value, assets) return extracted - + # Primitive value - return as-is return value @staticmethod def _get_asset(assets: AssetProvider, checksum: str) -> bytes: - """Get asset bytes from either a dict or callable provider.""" + """Get asset bytes from either a dict or callable provider. + + Supports both sync and async callables - async results are awaited + synchronously using asyncio.get_event_loop().run_until_complete(). + """ if callable(assets): - return assets(checksum) + result = assets(checksum) + if inspect.isawaitable(result): + result = asyncio.get_event_loop().run_until_complete(result) + return result if checksum not in assets: raise KeyError(f"Missing asset with checksum '{checksum}'") return assets[checksum] @staticmethod def reconstruct( - model_class: Type[TModel], - data: Dict[str, Any], + model_class: type[TModel], + data: dict[str, Any], assets: AssetProvider, - array_type: Optional[ArrayType] = None, + array_type: ArrayType | None = None, ) -> Union[TModel, LazyModel[TModel]]: """ Reconstruct a Pydantic BaseModel from extracted data and assets. - + Uses the class's Pydantic schema to determine types for nested fields, so no runtime type information needs to be stored in the data. - + If assets is a dict, all assets are loaded immediately and the actual - model is returned. If assets is a callable or CachedAssetLoader, a + model is returned. If assets is a callable or CachedAssetLoader, a LazyModel proxy is returned that defers asset loading until field access. - + Args: model_class: The Pydantic BaseModel class to reconstruct data: The data dict from extract(), with $ref references @@ -721,59 +800,57 @@ def reconstruct( - CachedAssetLoader with fetch callable and cache handler (lazy + disk cache) array_type: Array backend to use. If None, uses the type stored in each array's metadata. 
- + Returns: - If assets is a dict: Reconstructed BaseModel instance (eager) - If assets is callable/CachedAssetLoader: LazyModel proxy that loads on demand - + Raises: KeyError: If a referenced asset is missing (for dict assets, raised immediately; for callable assets, raised on field access) - + Example: extracted = Packable.extract(simulation_case) - + # Eager loading with dict - returns actual model rebuilt = Packable.reconstruct(SimulationCase, extracted.data, extracted.assets) - + # Lazy loading with callable - returns LazyModel def fetch_asset(checksum: str) -> bytes: return storage.get(checksum) lazy = Packable.reconstruct(SimulationCase, data, fetch_asset) - + # Lazy loading with disk cache cache = DataHandler.create(Path("./cache")) loader = CachedAssetLoader(fetch_asset, cache) lazy = Packable.reconstruct(SimulationCase, data, loader) - + print(lazy.time) # Primitive field, no fetch needed print(lazy.temperature) # Fetches and caches temperature asset model = lazy.resolve() # Get full Pydantic model """ if callable(assets) or isinstance(assets, CachedAssetLoader): return LazyModel(model_class, data, assets, array_type) - - resolved_data = Packable._resolve_refs_with_schema( - model_class, data, assets, array_type - ) + + resolved_data = Packable._resolve_refs_with_schema(model_class, data, assets, array_type) return model_class(**resolved_data) - + @staticmethod - def _decode_packed_array(packed: bytes, array_type: Optional[ArrayType]) -> Any: + def _decode_packed_array(packed: bytes, array_type: ArrayType | None) -> Any: """Decode a packed array asset (metadata + data) back to an array.""" - from .array import EncodedArray, ArrayMetadata - + from .array import ArrayMetadata, EncodedArray + # Unpack: [4 bytes metadata length][metadata json][array data] - metadata_len = int.from_bytes(packed[:4], 'little') - metadata_json = packed[4:4+metadata_len].decode('utf-8') - array_data = packed[4+metadata_len:] - + metadata_len = int.from_bytes(packed[:4], "little") + metadata_json = packed[4 : 4 + metadata_len].decode("utf-8") + array_data = packed[4 + metadata_len :] + metadata_dict = json.loads(metadata_json) metadata = ArrayMetadata(**metadata_dict) encoded = EncodedArray(data=array_data, metadata=metadata) - + decoded = ArrayUtils.decode_array(encoded) - + # Convert to requested array type if specified if array_type is not None: return ArrayUtils.convert_array(decoded, array_type) @@ -783,53 +860,53 @@ def _decode_packed_array(packed: bytes, array_type: Optional[ArrayType]) -> Any: @staticmethod def _resolve_refs_with_schema( - model_class: Type[BaseModel], - data: Dict[str, Any], + model_class: type[BaseModel], + data: dict[str, Any], assets: AssetProvider, - array_type: Optional[ArrayType], - ) -> Dict[str, Any]: + array_type: ArrayType | None, + ) -> dict[str, Any]: """ Resolve $ref references using Pydantic schema for type information. - + Uses model_class.model_fields to determine the expected type for each field, so no class/module information needs to be stored in the data. 
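Editor's note: a sketch of why no runtime type tags need to be stored — the field annotations on the target model are enough to decide whether a `$ref` decodes as a Packable or as a packed array. `Report` below is a hypothetical model, not part of the library:

```python
import numpy as np
from pydantic import BaseModel, ConfigDict

from meshly import Mesh


class Report(BaseModel):
    model_config = ConfigDict(arbitrary_types_allowed=True)

    mesh: Mesh                           # a "$ref" here decodes as a Packable zip
    residuals: np.ndarray | None = None  # a "$ref" here decodes as a packed array


for name, info in Report.model_fields.items():
    print(name, "->", info.annotation)   # the annotation is all reconstruct() needs
```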
""" result = {} - + for field_name, field_info in model_class.model_fields.items(): if field_name not in data: continue - + field_value = data[field_name] field_type = field_info.annotation - + result[field_name] = Packable._resolve_value_with_type( field_value, field_type, assets, array_type ) - + return result - + @staticmethod def _resolve_value_with_type( value: Any, expected_type: Any, assets: AssetProvider, - array_type: Optional[ArrayType], + array_type: ArrayType | None, ) -> Any: """Resolve a value using the expected type from Pydantic schema.""" - from typing import get_origin, get_args, Union - + from typing import Union, get_args, get_origin + if value is None: return None - + # Handle $ref - decode based on expected type if isinstance(value, dict) and "$ref" in value: checksum = value["$ref"] asset_bytes = Packable._get_asset(assets, checksum) - + # Determine if this is a Packable or array based on expected_type origin = get_origin(expected_type) - + # Unwrap Optional[X] -> X if origin is Union: args = get_args(expected_type) @@ -837,18 +914,18 @@ def _resolve_value_with_type( if len(non_none) == 1: expected_type = non_none[0] origin = get_origin(expected_type) - + # Check if expected type is a Packable subclass if isinstance(expected_type, type) and issubclass(expected_type, Packable): return expected_type.decode(asset_bytes, array_type) - + # Otherwise assume it's an array return Packable._decode_packed_array(asset_bytes, array_type) - + # Handle nested BaseModel (non-ref dict that should be a model) if isinstance(value, dict): origin = get_origin(expected_type) - + # Unwrap Optional if origin is Union: args = get_args(expected_type) @@ -856,7 +933,7 @@ def _resolve_value_with_type( if len(non_none) == 1: expected_type = non_none[0] origin = get_origin(expected_type) - + # Dict type - resolve values with value type if origin is dict: key_type, value_type = get_args(expected_type) @@ -864,21 +941,21 @@ def _resolve_value_with_type( k: Packable._resolve_value_with_type(v, value_type, assets, array_type) for k, v in value.items() } - + # BaseModel type - recursively resolve with schema if isinstance(expected_type, type) and issubclass(expected_type, BaseModel): resolved = Packable._resolve_refs_with_schema( expected_type, value, assets, array_type ) return expected_type(**resolved) - + # Unknown dict - return as-is return value - + # Handle lists/tuples if isinstance(value, (list, tuple)): origin = get_origin(expected_type) - + # Unwrap Optional if origin is Union: args = get_args(expected_type) @@ -886,51 +963,50 @@ def _resolve_value_with_type( if len(non_none) == 1: expected_type = non_none[0] origin = get_origin(expected_type) - + # Get element type if origin in (list, tuple): args = get_args(expected_type) elem_type = args[0] if args else Any else: elem_type = Any - + result = [ - Packable._resolve_value_with_type(v, elem_type, assets, array_type) - for v in value + Packable._resolve_value_with_type(v, elem_type, assets, array_type) for v in value ] return result if isinstance(value, list) else tuple(result) - + # Primitive - return as-is return value @staticmethod def _merge_field_data_with_schema( - model_class: Type[BaseModel], - data: Dict[str, Any], - field_data: Dict[str, Any], + model_class: type[BaseModel], + data: dict[str, Any], + field_data: dict[str, Any], ) -> None: """ Merge metadata field_data into data, using Pydantic schema for type info. 
- + This handles the reconstruction of nested BaseModel instances without needing __model_class__/__model_module__ markers. """ - from typing import get_origin, get_args, Union - + from typing import Union, get_args, get_origin + for key, value in field_data.items(): if key in ("__model_class__", "__model_module__"): # Skip legacy markers continue - + if key not in model_class.model_fields: # Unknown field - store as-is data[key] = value continue - + field_type = model_class.model_fields[key].annotation merged = Packable._merge_value_with_schema(value, field_type, data.get(key)) data[key] = merged - + @staticmethod def _merge_value_with_schema( metadata_value: Any, @@ -938,11 +1014,11 @@ def _merge_value_with_schema( existing_value: Any, ) -> Any: """Merge a metadata value with existing data using the schema type.""" - from typing import get_origin, get_args, Union - + from typing import Union, get_args, get_origin + if metadata_value is None: return existing_value - + # Unwrap Optional origin = get_origin(expected_type) if origin is Union: @@ -951,7 +1027,7 @@ def _merge_value_with_schema( if len(non_none) == 1: expected_type = non_none[0] origin = get_origin(expected_type) - + # Handle dict type if origin is dict: key_type, value_type = get_args(expected_type) @@ -973,14 +1049,17 @@ def _merge_value_with_schema( if k not in ("__model_class__", "__model_module__") } return metadata_value - + # Handle BaseModel type if isinstance(expected_type, type) and issubclass(expected_type, BaseModel): if isinstance(metadata_value, dict): # Filter out legacy markers - filtered = {k: v for k, v in metadata_value.items() - if k not in ("__model_class__", "__model_module__")} - + filtered = { + k: v + for k, v in metadata_value.items() + if k not in ("__model_class__", "__model_module__") + } + if isinstance(existing_value, dict): # Merge with existing dict data merged = dict(existing_value) @@ -992,19 +1071,18 @@ def _merge_value_with_schema( Packable._merge_field_data_with_schema(expected_type, data, filtered) return expected_type(**data) return metadata_value - - # Handle list type + + # Handle list type if origin in (list, tuple): if isinstance(metadata_value, (list, tuple)): args = get_args(expected_type) elem_type = args[0] if args else Any result = [ - Packable._merge_value_with_schema(v, elem_type, None) - for v in metadata_value + Packable._merge_value_with_schema(v, elem_type, None) for v in metadata_value ] return result if origin is list else tuple(result) return metadata_value - + # Primitive - use metadata value return metadata_value @@ -1021,9 +1099,7 @@ def __reduce__(self): @staticmethod def load_array( - source: Union[PathLike, BytesIO], - name: str, - array_type: Optional[ArrayType] = None + source: Union[PathLike, BytesIO], name: str, array_type: ArrayType | None = None ) -> Array: """ Load a single array from a zip file without loading the entire object. @@ -1078,4 +1154,3 @@ def convert_to(self: TPackable, array_type: ArrayType) -> TPackable: pass return data_copy - From f1c20ad275e88fd026ef9d2bfa399ce3e859fe88 Mon Sep 17 00:00:00 2001 From: Afshawn Lotfi Date: Sun, 18 Jan 2026 08:19:23 +0000 Subject: [PATCH 3/4] feat: add checksum and serialization utilities - Introduced ChecksumUtils for computing checksums of files, directories, and data structures. - Added SchemaUtils for resolving Pydantic types and merging field data. - Implemented SerializationUtils for packing and unpacking arrays and assets. - Updated __init__.py to include new utility modules. 
- Added tests for ChecksumUtils to ensure functionality and correctness. - Removed unnecessary metadata handling in Packable class. --- .../extract_reconstruct_example.ipynb | 56 +- python/examples/mesh_example.ipynb | 45 +- python/examples/reconstruct_example.ipynb | 743 +++++++++++++++++ python/meshly/array.py | 61 +- python/meshly/packable.py | 745 ++---------------- python/meshly/utils/__init__.py | 10 +- python/meshly/utils/checksum_utils.py | 152 ++++ python/meshly/utils/schema_utils.py | 224 ++++++ python/meshly/utils/serialization_utils.py | 177 +++++ python/tests/test_checksum_utils.py | 166 ++++ typescript/src/packable.ts | 26 - 11 files changed, 1609 insertions(+), 796 deletions(-) create mode 100644 python/examples/reconstruct_example.ipynb create mode 100644 python/meshly/utils/checksum_utils.py create mode 100644 python/meshly/utils/schema_utils.py create mode 100644 python/meshly/utils/serialization_utils.py create mode 100644 python/tests/test_checksum_utils.py diff --git a/python/examples/extract_reconstruct_example.ipynb b/python/examples/extract_reconstruct_example.ipynb index ca3c788..840192a 100644 --- a/python/examples/extract_reconstruct_example.ipynb +++ b/python/examples/extract_reconstruct_example.ipynb @@ -22,9 +22,8 @@ "outputs": [], "source": [ "import numpy as np\n", - "from typing import Optional, Dict, List\n", - "from pydantic import BaseModel, Field, ConfigDict\n", - "from meshly import Mesh, Packable" + "from meshly import Mesh, Packable\n", + "from pydantic import BaseModel, ConfigDict, Field" ] }, { @@ -58,12 +57,12 @@ "class FieldData(BaseModel):\n", " \"\"\"A field defined on mesh nodes or cells.\"\"\"\n", " model_config = ConfigDict(arbitrary_types_allowed=True)\n", - " \n", + "\n", " name: str = Field(..., description=\"Field name (e.g., 'temperature', 'velocity')\")\n", " field_type: str = Field(..., description=\"'scalar', 'vector', or 'tensor'\")\n", " location: str = Field(\"node\", description=\"'node' or 'cell' centered\")\n", " data: np.ndarray = Field(..., description=\"Field values\")\n", - " units: Optional[str] = Field(None, description=\"Physical units\")\n", + " units: str | None = Field(None, description=\"Physical units\")\n", "\n", "\n", "class SimulationSnapshot(BaseModel):\n", @@ -73,23 +72,23 @@ " a Mesh (which IS a Packable). 
This tests the nested Packable extraction.\n", " \"\"\"\n", " model_config = ConfigDict(arbitrary_types_allowed=True)\n", - " \n", + "\n", " time: float = Field(..., description=\"Simulation time\")\n", " iteration: int = Field(..., description=\"Iteration number\")\n", " mesh: Mesh = Field(..., description=\"Computational mesh\")\n", - " fields: Dict[str, FieldData] = Field(default_factory=dict, description=\"Field data\")\n", - " residuals: Optional[np.ndarray] = Field(None, description=\"Solver residuals\")\n", + " fields: dict[str, FieldData] = Field(default_factory=dict, description=\"Field data\")\n", + " residuals: np.ndarray | None = Field(None, description=\"Solver residuals\")\n", "\n", "\n", "class SimulationCase(BaseModel):\n", " \"\"\"Complete simulation case with multiple snapshots.\"\"\"\n", " model_config = ConfigDict(arbitrary_types_allowed=True)\n", - " \n", + "\n", " name: str = Field(..., description=\"Case name\")\n", " description: str = Field(\"\", description=\"Case description\")\n", " solver: str = Field(..., description=\"Solver name\")\n", - " parameters: Dict[str, float] = Field(default_factory=dict, description=\"Solver parameters\")\n", - " snapshots: List[SimulationSnapshot] = Field(default_factory=list, description=\"Time snapshots\")\n", + " parameters: dict[str, float] = Field(default_factory=dict, description=\"Solver parameters\")\n", + " snapshots: list[SimulationSnapshot] = Field(default_factory=list, description=\"Time snapshots\")\n", "\n", "print(\"Data structures defined\")" ] @@ -166,20 +165,20 @@ " \"\"\"Create a snapshot with temperature and velocity fields.\"\"\"\n", " n_nodes = mesh.vertex_count\n", " coords = mesh.vertices[:, :2] # x, y coordinates\n", - " \n", + "\n", " # Temperature: diffusing heat from center\n", " center = np.array([0.5, 0.5])\n", " r = np.linalg.norm(coords - center, axis=1)\n", - " temperature = 300 + 100 * np.exp(-r**2 / (0.1 + time)) \n", - " \n", + " temperature = 300 + 100 * np.exp(-r**2 / (0.1 + time))\n", + "\n", " # Velocity: rotating flow\n", " vx = -(coords[:, 1] - 0.5)\n", " vy = (coords[:, 0] - 0.5)\n", " velocity = np.column_stack([vx, vy, np.zeros(n_nodes)]).astype(np.float32)\n", - " \n", + "\n", " # Residuals (solver convergence)\n", " residuals = np.array([1e-3 / (iteration + 1), 1e-4 / (iteration + 1)], dtype=np.float32)\n", - " \n", + "\n", " return SimulationSnapshot(\n", " time=time,\n", " iteration=iteration,\n", @@ -296,7 +295,7 @@ "\n", "print(f\"Extracted data keys: {list(extracted.data.keys())}\")\n", "print(f\"\\nTotal assets: {len(extracted.assets)}\")\n", - "print(f\"\\nAsset sizes:\")\n", + "print(\"\\nAsset sizes:\")\n", "for checksum, data in extracted.assets.items():\n", " print(f\" {checksum}: {len(data):,} bytes\")" ] @@ -540,8 +539,9 @@ ], "source": [ "from pathlib import Path\n", - "from meshly.packable import CachedAssetLoader\n", + "\n", "from meshly.data_handler import DataHandler\n", + "from meshly.packable import CachedAssetLoader\n", "\n", "# Simulate fetching assets from remote storage\n", "fetch_count = [0]\n", @@ -660,25 +660,25 @@ "\n", "with tempfile.TemporaryDirectory() as tmpdir:\n", " cache_path = Path(tmpdir) / \"asset_cache\"\n", - " \n", + "\n", " # Create cache handler and loader\n", " cache_handler = DataHandler.create(cache_path)\n", " loader = CachedAssetLoader(fetch=fetch_from_storage, cache=cache_handler)\n", - " \n", + "\n", " print(\"=== First run: fetching and caching ===\")\n", " lazy1 = Packable.reconstruct(SimulationCase, extracted.data, loader)\n", " _ = 
lazy1.resolve() # Fetch all assets\n", " print(f\"Assets fetched: {fetch_count[0]}\")\n", - " \n", + "\n", " # Finalize to persist cache\n", " cache_handler.finalize()\n", - " \n", + "\n", " # Second run with same cache location\n", " print(\"\\n=== Second run: reading from cache ===\")\n", " fetch_count[0] = 0\n", " cache_handler2 = DataHandler.create(cache_path)\n", " loader2 = CachedAssetLoader(fetch=fetch_from_storage, cache=cache_handler2)\n", - " \n", + "\n", " lazy2 = Packable.reconstruct(SimulationCase, extracted.data, loader2)\n", " resolved2 = lazy2.resolve()\n", " print(f\"Assets fetched from remote: {fetch_count[0]} (all served from cache!)\")\n", @@ -697,9 +697,9 @@ "| Input | Handling |\n", "|-------|----------|\n", "| Top-level Packable | Expands fields, arrays → refs |\n", - "| Nested Packable (in dict/list/BaseModel) | Becomes `{\"$ref\": ..., \"$type\": \"packable\"}` |\n", - "| NumPy arrays | Becomes `{\"$ref\": ..., \"$type\": \"array\"}` |\n", - "| BaseModel | Preserves structure with `__model_class__` |\n", + "| Nested Packable (in dict/list/BaseModel) | Becomes `{\"$ref\": checksum}` |\n", + "| NumPy arrays | Becomes `{\"$ref\": checksum}` |\n", + "| BaseModel | Recursively extracts fields |\n", "| Primitives | Passed through unchanged |\n", "\n", "`Packable.reconstruct()` supports three modes:\n", @@ -721,7 +721,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "base", "language": "python", "name": "python3" }, @@ -735,7 +735,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.12.11" + "version": "3.12.2" } }, "nbformat": 4, diff --git a/python/examples/mesh_example.ipynb b/python/examples/mesh_example.ipynb index cd1a3b2..f9e6cda 100644 --- a/python/examples/mesh_example.ipynb +++ b/python/examples/mesh_example.ipynb @@ -20,13 +20,13 @@ "metadata": {}, "outputs": [], "source": [ - "import os\n", + "from pathlib import Path\n", + "\n", "import numpy as np\n", - "from typing import Optional, List\n", - "from pydantic import Field\n", "\n", "# Import the Mesh class\n", - "from meshly import Mesh" + "from meshly import Mesh\n", + "from pydantic import Field" ] }, { @@ -44,13 +44,14 @@ "metadata": {}, "outputs": [], "source": [ - "from pydantic import BaseModel, ConfigDict\n", "from meshly import Packable\n", + "from pydantic import BaseModel, ConfigDict\n", + "\n", "\n", "class MaterialProperties(BaseModel):\n", " \"\"\"Material properties with numpy arrays - demonstrates BaseModel in dict edge case.\"\"\"\n", " model_config = ConfigDict(arbitrary_types_allowed=True)\n", - " \n", + "\n", " name: str = Field(..., description=\"Material name\")\n", " diffuse: np.ndarray = Field(..., description=\"Diffuse color array\")\n", " specular: np.ndarray = Field(..., description=\"Specular color array\")\n", @@ -75,11 +76,11 @@ " \"\"\"\n", " # Add texture coordinates and normals as additional numpy arrays\n", " texture_coords: np.ndarray = Field(..., description=\"Texture coordinates\")\n", - " normals: Optional[np.ndarray] = Field(None, description=\"Vertex normals\")\n", - " \n", + " normals: np.ndarray | None = Field(None, description=\"Vertex normals\")\n", + "\n", " # Add non-array attributes\n", " material_name: str = Field(\"default\", description=\"Material name\")\n", - " tags: List[str] = Field(default_factory=list, description=\"Tags for the mesh\")\n", + " tags: list[str] = Field(default_factory=list, description=\"Tags for the mesh\")\n", "\n", " # Dictionary containing nested 
dictionaries with arrays\n", " material_data: dict[str, dict[str, np.ndarray]] = Field(\n", @@ -295,9 +296,9 @@ ], "source": [ "# Save the mesh to a zip file\n", - "zip_path = \"textured_cube.zip\"\n", + "zip_path = Path(\"textured_cube.zip\")\n", "mesh.save_to_zip(zip_path)\n", - "assert os.path.exists(zip_path)\n", + "assert zip_path.exists()\n", "print(f\"Saved mesh to {zip_path} has {mesh.vertex_count} vertices and {mesh.index_count} indices\")\n", "\n", "\n", @@ -361,7 +362,7 @@ "print(f\"Material colors: {loaded_mesh.material_colors}\")\n", "\n", "# Verify the dict[str, BaseModel] edge case was loaded correctly\n", - "print(f\"\\n--- BaseModel dict edge case ---\")\n", + "print(\"\\n--- BaseModel dict edge case ---\")\n", "print(f\"Materials keys: {list(loaded_mesh.materials.keys())}\")\n", "for mat_name, mat in loaded_mesh.materials.items():\n", " print(f\" {mat_name}:\")\n", @@ -405,10 +406,10 @@ " # Add bone weights and indices as additional numpy arrays\n", " bone_weights: np.ndarray = Field(..., description=\"Bone weights for each vertex\")\n", " bone_indices: np.ndarray = Field(..., description=\"Bone indices for each vertex\")\n", - " \n", + "\n", " # Add non-array attributes\n", " skeleton_name: str = Field(\"default\", description=\"Skeleton name\")\n", - " animation_names: List[str] = Field(default_factory=list, description=\"Animation names\")\n", + " animation_names: list[str] = Field(default_factory=list, description=\"Animation names\")\n", "\n", "# Create a simple skinned mesh\n", "skinned_mesh = SkinnedMesh(\n", @@ -445,7 +446,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "Saved skinned mesh to skinned_cube.zip, file size: 2475 bytes\n", + "Saved skinned mesh to skinned_cube.zip, file size: 2477 bytes\n", "\n", "Loaded skinned mesh: 8 vertices, 36 indices\n", "Skeleton name: human_skeleton\n", @@ -457,9 +458,9 @@ ], "source": [ "# Save the skinned mesh to a zip file\n", - "skinned_zip_path = \"skinned_cube.zip\"\n", + "skinned_zip_path = Path(\"skinned_cube.zip\")\n", "skinned_mesh.save_to_zip(skinned_zip_path)\n", - "print(f\"Saved skinned mesh to {skinned_zip_path}, file size: {os.path.getsize(skinned_zip_path)} bytes\")\n", + "print(f\"Saved skinned mesh to {skinned_zip_path}, file size: {skinned_zip_path.stat().st_size} bytes\")\n", "\n", "# Load the skinned mesh from the zip file\n", "loaded_skinned_mesh = SkinnedMesh.load_from_zip(skinned_zip_path)\n", @@ -498,8 +499,8 @@ "source": [ "# Clean up\n", "for path in [zip_path, skinned_zip_path]:\n", - " if os.path.exists(path):\n", - " os.remove(path)\n", + " if Path(path).exists():\n", + " Path(path).unlink()\n", " print(f\"Removed {path}\")\n", "\n", "print(\"\\nExample completed successfully!\")" @@ -543,7 +544,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "Converted skinned mesh to JAX arrays, vertex dtype: float32\n" + "JAX not available - skipping conversion example\n" ] } ], @@ -572,7 +573,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "base", "language": "python", "name": "python3" }, @@ -586,7 +587,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.12.11" + "version": "3.12.2" } }, "nbformat": 4, diff --git a/python/examples/reconstruct_example.ipynb b/python/examples/reconstruct_example.ipynb new file mode 100644 index 0000000..840192a --- /dev/null +++ b/python/examples/reconstruct_example.ipynb @@ -0,0 +1,743 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "ba736f7c", + 
"metadata": {}, + "source": [ + "# Extract and Reconstruct: Scientific Simulation Example\n", + "\n", + "This notebook demonstrates `Packable.extract()` and `reconstruct()` with a realistic scientific computing scenario:\n", + "\n", + "- A CFD simulation with mesh geometry and field data\n", + "- Nested Pydantic classes containing Packables (Mesh)\n", + "- Content-addressable storage for deduplication" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "6f850881", + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "from meshly import Mesh, Packable\n", + "from pydantic import BaseModel, ConfigDict, Field" + ] + }, + { + "cell_type": "markdown", + "id": "d3ae1bf6", + "metadata": {}, + "source": [ + "## 1. Define Scientific Data Structures\n", + "\n", + "We'll model a CFD simulation with:\n", + "- `FieldData`: Scalar/vector field on the mesh (temperature, velocity, etc.)\n", + "- `SimulationSnapshot`: A single timestep with mesh + fields\n", + "- `SimulationCase`: Complete case with metadata and multiple snapshots" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "349483ca", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Data structures defined\n" + ] + } + ], + "source": [ + "class FieldData(BaseModel):\n", + " \"\"\"A field defined on mesh nodes or cells.\"\"\"\n", + " model_config = ConfigDict(arbitrary_types_allowed=True)\n", + "\n", + " name: str = Field(..., description=\"Field name (e.g., 'temperature', 'velocity')\")\n", + " field_type: str = Field(..., description=\"'scalar', 'vector', or 'tensor'\")\n", + " location: str = Field(\"node\", description=\"'node' or 'cell' centered\")\n", + " data: np.ndarray = Field(..., description=\"Field values\")\n", + " units: str | None = Field(None, description=\"Physical units\")\n", + "\n", + "\n", + "class SimulationSnapshot(BaseModel):\n", + " \"\"\"A single timestep of simulation data.\n", + " \n", + " Note: This is a regular Pydantic BaseModel (not Packable) that contains\n", + " a Mesh (which IS a Packable). This tests the nested Packable extraction.\n", + " \"\"\"\n", + " model_config = ConfigDict(arbitrary_types_allowed=True)\n", + "\n", + " time: float = Field(..., description=\"Simulation time\")\n", + " iteration: int = Field(..., description=\"Iteration number\")\n", + " mesh: Mesh = Field(..., description=\"Computational mesh\")\n", + " fields: dict[str, FieldData] = Field(default_factory=dict, description=\"Field data\")\n", + " residuals: np.ndarray | None = Field(None, description=\"Solver residuals\")\n", + "\n", + "\n", + "class SimulationCase(BaseModel):\n", + " \"\"\"Complete simulation case with multiple snapshots.\"\"\"\n", + " model_config = ConfigDict(arbitrary_types_allowed=True)\n", + "\n", + " name: str = Field(..., description=\"Case name\")\n", + " description: str = Field(\"\", description=\"Case description\")\n", + " solver: str = Field(..., description=\"Solver name\")\n", + " parameters: dict[str, float] = Field(default_factory=dict, description=\"Solver parameters\")\n", + " snapshots: list[SimulationSnapshot] = Field(default_factory=list, description=\"Time snapshots\")\n", + "\n", + "print(\"Data structures defined\")" + ] + }, + { + "cell_type": "markdown", + "id": "bcb88dff", + "metadata": {}, + "source": [ + "## 2. Create Sample Simulation Data\n", + "\n", + "Let's create a simple 2D heat transfer simulation on a quad mesh." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "be109c7d", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Created mesh: 25 vertices, 16 quads\n" + ] + } + ], + "source": [ + "# Create a simple 2D quad mesh (5x5 grid = 25 nodes, 16 quads)\n", + "nx, ny = 5, 5\n", + "x = np.linspace(0, 1, nx)\n", + "y = np.linspace(0, 1, ny)\n", + "xx, yy = np.meshgrid(x, y)\n", + "\n", + "vertices = np.column_stack([xx.ravel(), yy.ravel(), np.zeros(nx * ny)]).astype(np.float32)\n", + "\n", + "# Create quad indices\n", + "quads = []\n", + "for j in range(ny - 1):\n", + " for i in range(nx - 1):\n", + " n0 = j * nx + i\n", + " n1 = n0 + 1\n", + " n2 = n0 + nx + 1\n", + " n3 = n0 + nx\n", + " quads.append([n0, n1, n2, n3])\n", + "\n", + "indices = np.array(quads, dtype=np.uint32)\n", + "\n", + "mesh = Mesh(vertices=vertices, indices=indices)\n", + "print(f\"Created mesh: {mesh.vertex_count} vertices, {len(indices)} quads\")" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "c7588b21", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Created 3 snapshots\n", + " t=0.0: ['temperature', 'velocity']\n", + " t=0.1: ['temperature', 'velocity']\n", + " t=0.2: ['temperature', 'velocity']\n" + ] + } + ], + "source": [ + "# Create simulation snapshots at different times\n", + "def create_snapshot(time: float, iteration: int, mesh: Mesh) -> SimulationSnapshot:\n", + " \"\"\"Create a snapshot with temperature and velocity fields.\"\"\"\n", + " n_nodes = mesh.vertex_count\n", + " coords = mesh.vertices[:, :2] # x, y coordinates\n", + "\n", + " # Temperature: diffusing heat from center\n", + " center = np.array([0.5, 0.5])\n", + " r = np.linalg.norm(coords - center, axis=1)\n", + " temperature = 300 + 100 * np.exp(-r**2 / (0.1 + time))\n", + "\n", + " # Velocity: rotating flow\n", + " vx = -(coords[:, 1] - 0.5)\n", + " vy = (coords[:, 0] - 0.5)\n", + " velocity = np.column_stack([vx, vy, np.zeros(n_nodes)]).astype(np.float32)\n", + "\n", + " # Residuals (solver convergence)\n", + " residuals = np.array([1e-3 / (iteration + 1), 1e-4 / (iteration + 1)], dtype=np.float32)\n", + "\n", + " return SimulationSnapshot(\n", + " time=time,\n", + " iteration=iteration,\n", + " mesh=mesh,\n", + " fields={\n", + " \"temperature\": FieldData(\n", + " name=\"temperature\",\n", + " field_type=\"scalar\",\n", + " location=\"node\",\n", + " data=temperature.astype(np.float32),\n", + " units=\"K\"\n", + " ),\n", + " \"velocity\": FieldData(\n", + " name=\"velocity\",\n", + " field_type=\"vector\",\n", + " location=\"node\",\n", + " data=velocity,\n", + " units=\"m/s\"\n", + " )\n", + " },\n", + " residuals=residuals\n", + " )\n", + "\n", + "# Create snapshots at t=0, 0.1, 0.2\n", + "snapshots = [\n", + " create_snapshot(0.0, 0, mesh),\n", + " create_snapshot(0.1, 100, mesh),\n", + " create_snapshot(0.2, 200, mesh),\n", + "]\n", + "\n", + "print(f\"Created {len(snapshots)} snapshots\")\n", + "for s in snapshots:\n", + " print(f\" t={s.time}: {list(s.fields.keys())}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "93568d04", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Simulation case: heat_transfer_2d\n", + " Solver: simpleFoam\n", + " Parameters: {'dt': 0.001, 'nu': 1e-05, 'alpha': 0.0001}\n", + " Snapshots: 3\n" + ] + } + ], + "source": [ + "# Create the complete simulation case\n", + "case = SimulationCase(\n", + " 
name=\"heat_transfer_2d\",\n", + " description=\"2D heat transfer with rotating flow\",\n", + " solver=\"simpleFoam\",\n", + " parameters={\n", + " \"dt\": 0.001,\n", + " \"nu\": 1e-5,\n", + " \"alpha\": 1e-4,\n", + " },\n", + " snapshots=snapshots\n", + ")\n", + "\n", + "print(f\"Simulation case: {case.name}\")\n", + "print(f\" Solver: {case.solver}\")\n", + "print(f\" Parameters: {case.parameters}\")\n", + "print(f\" Snapshots: {len(case.snapshots)}\")" + ] + }, + { + "cell_type": "markdown", + "id": "9c7048da", + "metadata": {}, + "source": [ + "## 3. Extract the Simulation Data\n", + "\n", + "`Packable.extract()` recursively processes the nested structure:\n", + "- Arrays → `{\"$ref\": checksum, \"$type\": \"array\"}`\n", + "- Nested Mesh (Packable) → `{\"$ref\": checksum, \"$type\": \"packable\", ...}`" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "95533188", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Extracted data keys: ['name', 'description', 'solver', 'parameters', 'snapshots']\n", + "\n", + "Total assets: 8\n", + "\n", + "Asset sizes:\n", + " 4e71a79c2d0fa381: 1,467 bytes\n", + " 28dc719a0c8c1387: 200 bytes\n", + " 59ffdd6bfac7876a: 250 bytes\n", + " 0c345962a52e7e2c: 133 bytes\n", + " 292cfc23f6777b02: 200 bytes\n", + " 17b38a2f2cbdd0a7: 133 bytes\n", + " 145838c08771e6ef: 201 bytes\n", + " ea37b2590dba4b31: 132 bytes\n" + ] + } + ], + "source": [ + "# Extract the entire simulation case\n", + "extracted = Packable.extract(case)\n", + "\n", + "print(f\"Extracted data keys: {list(extracted.data.keys())}\")\n", + "print(f\"\\nTotal assets: {len(extracted.assets)}\")\n", + "print(\"\\nAsset sizes:\")\n", + "for checksum, data in extracted.assets.items():\n", + " print(f\" {checksum}: {len(data):,} bytes\")" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "ba82742d", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Extracted data structure:\n", + "{\n", + " \"name\": \"heat_transfer_2d\",\n", + " \"description\": \"2D heat transfer with rotating flow\",\n", + " \"solver\": \"simpleFoam\",\n", + " \"parameters\": {\n", + " \"dt\": 0.001,\n", + " \"nu\": 1e-05,\n", + " \"alpha\": 0.0001\n", + " },\n", + " \"snapshots\": [\n", + " {\n", + " \"time\": 0.0,\n", + " \"iteration\": 0,\n", + " \"mesh\": {\n", + " \"$ref\": \"4e71a79c2d0fa381\"\n", + " },\n", + " \"fields\": {\n", + " \"temperature\": {\n", + " \"name\": \"temperature\",\n", + " \"field_type\": \"scalar\",\n", + " \"location\": \"node\",\n", + " \"data\": {\n", + " \"$ref\": \"28dc719a0c8c1387\"\n", + " },\n", + " \"units\": \"K\"\n", + " },\n", + " \"velocity\": {\n", + " \"name\": \"velocity\",\n", + " \"field_type\": \"vector\",\n", + " \"location\": \"node\",\n", + " \"data\": {\n", + " \"$ref\": \"59ffdd6bfac7876a\"\n", + " },\n", + " \"units\": \"m/s\"\n", + " }\n", + " },\n", + " \"residuals\": {\n", + " \"$ref\": \"0c345962a52e7e2c\"\n", + " }\n", + " },\n", + " {\n", + " \"time\": 0.1,\n", + " \"iteration\": 100,\n", + " \"mesh\": {\n", + " \"$ref\": \"4e71a79c2d0fa381\"\n", + " },\n", + " \"fields\": {\n", + " \"temperature\": {\n", + " \"name\": \"temperature\",\n", + " \"field_type\": \"scalar\",\n", + " \"location\": \"node\",\n", + " \"data\": {\n", + " \"$ref\": \"292cfc23f6777b02\"\n", + " },\n", + " \"units\": \"K\"\n", + " },\n", + " \"velocity\": {\n", + " \"name\": \"velocity\",\n", + " \"field_type\": \"vector\",\n", + " \"location\": \"node\",\n", + " 
\"data\": {\n", + " \"$ref\": \"59ffdd6bfac7876a\"\n", + " },\n", + " \"units\": \"m/s\"\n", + " }\n", + " },\n", + " \"residuals\": {\n", + " \"$ref\": \"17b38a2f2cbdd0a7\"\n", + " }\n", + " },\n", + " {\n", + " \"time\": 0.2,\n", + " \"iteration\": 200,\n", + " \"mesh\": {\n", + " \"$ref\": \"4e71a79c2d0fa381\"\n", + " },\n", + " \"fields\": {\n", + " \"temperature\": {\n", + " \"name\": \"temperature\",\n", + " \"field_type\": \"scalar\",\n", + " \"location\": \"node\",\n", + " \"data\": {\n", + " \"$ref\": \"145838c08771e6ef\"\n", + " },\n", + " \"units\": \"K\"\n", + " },\n", + " \"velocity\": {\n", + " \"name\": \"velocity\",\n", + " \"field_type\": \"vector\",\n", + " \"location\": \"node\",\n", + " \n", + "...\n" + ] + } + ], + "source": [ + "# Examine the extracted data structure\n", + "import json\n", + "\n", + "# Pretty print the extracted data (it's JSON-serializable!)\n", + "print(\"Extracted data structure:\")\n", + "print(json.dumps(extracted.data, indent=2)[:2000] + \"\\n...\")" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "6977cb53", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Mesh reference: {'$ref': '4e71a79c2d0fa381'}\n" + ] + } + ], + "source": [ + "# Look at the first snapshot's mesh reference\n", + "mesh_ref = extracted.data[\"snapshots\"][0][\"mesh\"]\n", + "print(f\"Mesh reference: {mesh_ref}\")\n" + ] + }, + { + "cell_type": "markdown", + "id": "bc82716a", + "metadata": {}, + "source": [ + "## 4. Asset Deduplication\n", + "\n", + "Since all snapshots share the same mesh, it's only stored once!" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "a251ef65", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Mesh checksums: ['4e71a79c2d0fa381', '4e71a79c2d0fa381', '4e71a79c2d0fa381']\n", + "\n", + "All same? True\n", + "\n", + "The mesh is stored only ONCE in assets, saving 2,934 bytes!\n" + ] + } + ], + "source": [ + "# Check mesh references across snapshots\n", + "mesh_refs = [s[\"mesh\"][\"$ref\"] for s in extracted.data[\"snapshots\"]]\n", + "print(f\"Mesh checksums: {mesh_refs}\")\n", + "print(f\"\\nAll same? {len(set(mesh_refs)) == 1}\")\n", + "print(f\"\\nThe mesh is stored only ONCE in assets, saving {(len(mesh_refs)-1) * len(extracted.assets[mesh_refs[0]]):,} bytes!\")" + ] + }, + { + "cell_type": "markdown", + "id": "b732526c", + "metadata": {}, + "source": [ + "## 5. Reconstruct back to SimulationCase" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "5c3761f7", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Reconstructed case: heat_transfer_2d with 3 snapshots\n", + "Decoded mesh from reconstructed case: 25 vertices, 64 indices\n" + ] + } + ], + "source": [ + "reconstructed_case = Packable.reconstruct(SimulationCase, extracted.data, extracted.assets)\n", + "print(f\"\\nReconstructed case: {reconstructed_case.name} with {len(reconstructed_case.snapshots)} snapshots\")\n", + "\n", + "decoded_mesh = Mesh.decode(reconstructed_case.snapshots[0].mesh.encode())\n", + "print(f\"Decoded mesh from reconstructed case: {decoded_mesh.vertex_count} vertices, {len(decoded_mesh.indices)} indices\")" + ] + }, + { + "cell_type": "markdown", + "id": "ccaf56b9", + "metadata": {}, + "source": [ + "## 6. 
Lazy Loading with CachedAssetLoader\n", + "\n", + "When working with large datasets, you may want to:\n", + "- Load assets on-demand (lazy loading)\n", + "- Cache fetched assets to disk for subsequent runs\n", + "\n", + "`Packable.reconstruct()` supports this via `CachedAssetLoader`." + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "ac9c08e1", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "=== Lazy loading with callable ===\n", + "\n", + "LazyModel created, no assets fetched yet. Fetch count: 0\n", + "Type: \n", + "\n", + "Case name: heat_transfer_2d\n", + "Fetch count after accessing name: 0\n" + ] + } + ], + "source": [ + "from pathlib import Path\n", + "\n", + "from meshly.data_handler import DataHandler\n", + "from meshly.packable import CachedAssetLoader\n", + "\n", + "# Simulate fetching assets from remote storage\n", + "fetch_count = [0]\n", + "\n", + "def fetch_from_storage(checksum: str) -> bytes:\n", + " \"\"\"Simulate fetching from cloud/remote storage.\"\"\"\n", + " fetch_count[0] += 1\n", + " print(f\" Fetching asset {checksum[:8]}... (fetch #{fetch_count[0]})\")\n", + " return extracted.assets[checksum]\n", + "\n", + "# Using a plain callable - lazy loading, assets fetched on field access\n", + "print(\"=== Lazy loading with callable ===\")\n", + "lazy_case = Packable.reconstruct(SimulationCase, extracted.data, fetch_from_storage)\n", + "\n", + "print(f\"\\nLazyModel created, no assets fetched yet. Fetch count: {fetch_count[0]}\")\n", + "print(f\"Type: {type(lazy_case)}\")\n", + "\n", + "# Access primitive fields - no fetch needed\n", + "print(f\"\\nCase name: {lazy_case.name}\")\n", + "print(f\"Fetch count after accessing name: {fetch_count[0]}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "38bd4003", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "=== Accessing first snapshot ===\n", + " Fetching asset 4e71a79c... (fetch #1)\n", + " Fetching asset 28dc719a... (fetch #2)\n", + " Fetching asset 59ffdd6b... (fetch #3)\n", + " Fetching asset 0c345962... (fetch #4)\n", + " Fetching asset 4e71a79c... (fetch #5)\n", + " Fetching asset 292cfc23... (fetch #6)\n", + " Fetching asset 59ffdd6b... (fetch #7)\n", + " Fetching asset 17b38a2f... (fetch #8)\n", + " Fetching asset 4e71a79c... (fetch #9)\n", + " Fetching asset 145838c0... (fetch #10)\n", + " Fetching asset 59ffdd6b... (fetch #11)\n", + " Fetching asset ea37b259... 
(fetch #12)\n", + "Fetch count after accessing snapshots: 12\n", + "\n", + "Snapshot time: 0.0\n", + "Mesh vertices shape: (25, 3)\n", + "\n", + "=== Resolving to full model ===\n", + "Final fetch count: 12\n", + "Resolved type: \n" + ] + } + ], + "source": [ + "# Access a snapshot - this triggers fetching of nested assets\n", + "print(\"=== Accessing first snapshot ===\")\n", + "snapshot = lazy_case.snapshots[0]\n", + "print(f\"Fetch count after accessing snapshots: {fetch_count[0]}\")\n", + "\n", + "# The mesh is fetched when we access it\n", + "print(f\"\\nSnapshot time: {snapshot.time}\")\n", + "print(f\"Mesh vertices shape: {snapshot.mesh.vertices.shape}\")\n", + "\n", + "# To fully resolve and get the actual Pydantic model:\n", + "print(\"\\n=== Resolving to full model ===\")\n", + "resolved_case = lazy_case.resolve()\n", + "print(f\"Final fetch count: {fetch_count[0]}\")\n", + "print(f\"Resolved type: {type(resolved_case)}\")" + ] + }, + { + "cell_type": "markdown", + "id": "46d7b7c0", + "metadata": {}, + "source": [ + "### CachedAssetLoader: Persistent Disk Cache\n", + "\n", + "For repeated access, use `CachedAssetLoader` to cache fetched assets to disk:" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "88d9c7be", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "=== First run: fetching and caching ===\n", + " Fetching asset 4e71a79c... (fetch #1)\n", + " Fetching asset 28dc719a... (fetch #2)\n", + " Fetching asset 59ffdd6b... (fetch #3)\n", + " Fetching asset 0c345962... (fetch #4)\n", + " Fetching asset 292cfc23... (fetch #5)\n", + " Fetching asset 17b38a2f... (fetch #6)\n", + " Fetching asset 145838c0... (fetch #7)\n", + " Fetching asset ea37b259... (fetch #8)\n", + "Assets fetched: 8\n", + "\n", + "=== Second run: reading from cache ===\n", + "Assets fetched from remote: 0 (all served from cache!)\n", + "Resolved case: heat_transfer_2d with 3 snapshots\n" + ] + } + ], + "source": [ + "import tempfile\n", + "\n", + "# Reset fetch counter\n", + "fetch_count[0] = 0\n", + "\n", + "with tempfile.TemporaryDirectory() as tmpdir:\n", + " cache_path = Path(tmpdir) / \"asset_cache\"\n", + "\n", + " # Create cache handler and loader\n", + " cache_handler = DataHandler.create(cache_path)\n", + " loader = CachedAssetLoader(fetch=fetch_from_storage, cache=cache_handler)\n", + "\n", + " print(\"=== First run: fetching and caching ===\")\n", + " lazy1 = Packable.reconstruct(SimulationCase, extracted.data, loader)\n", + " _ = lazy1.resolve() # Fetch all assets\n", + " print(f\"Assets fetched: {fetch_count[0]}\")\n", + "\n", + " # Finalize to persist cache\n", + " cache_handler.finalize()\n", + "\n", + " # Second run with same cache location\n", + " print(\"\\n=== Second run: reading from cache ===\")\n", + " fetch_count[0] = 0\n", + " cache_handler2 = DataHandler.create(cache_path)\n", + " loader2 = CachedAssetLoader(fetch=fetch_from_storage, cache=cache_handler2)\n", + "\n", + " lazy2 = Packable.reconstruct(SimulationCase, extracted.data, loader2)\n", + " resolved2 = lazy2.resolve()\n", + " print(f\"Assets fetched from remote: {fetch_count[0]} (all served from cache!)\")\n", + " print(f\"Resolved case: {resolved2.name} with {len(resolved2.snapshots)} snapshots\")" + ] + }, + { + "cell_type": "markdown", + "id": "1a54dcde", + "metadata": {}, + "source": [ + "## Summary\n", + "\n", + "`Packable.extract()` is a **static method** that handles:\n", + "\n", + "| Input | Handling |\n", + "|-------|----------|\n", + "| Top-level 
Packable | Expands fields, arrays → refs |\n", + "| Nested Packable (in dict/list/BaseModel) | Becomes `{\"$ref\": checksum}` |\n", + "| NumPy arrays | Becomes `{\"$ref\": checksum}` |\n", + "| BaseModel | Recursively extracts fields |\n", + "| Primitives | Passed through unchanged |\n", + "\n", + "`Packable.reconstruct()` supports three modes:\n", + "\n", + "| AssetProvider | Result | Use Case |\n", + "|--------------|--------|----------|\n", + "| `Dict[str, bytes]` | `TModel` | Eager loading, all assets in memory |\n", + "| `AssetFetcher` | `LazyModel[TModel]` | Lazy per-field loading |\n", + "| `CachedAssetLoader` | `LazyModel[TModel]` | Lazy loading with disk cache |\n", + "\n", + "Key benefits for scientific computing:\n", + "- **Deduplication**: Shared meshes/arrays stored once\n", + "- **Lazy loading**: Load only the fields you need with `LazyModel`\n", + "- **Persistent caching**: `CachedAssetLoader` caches fetched assets to disk\n", + "- **JSON metadata**: Easy to query/index simulation cases\n", + "- **Version control friendly**: Small metadata files, large binary assets" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "base", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.2" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/python/meshly/array.py b/python/meshly/array.py index 4bccdab..54a85d0 100644 --- a/python/meshly/array.py +++ b/python/meshly/array.py @@ -5,15 +5,16 @@ encoding functions and storing/loading them as encoded data. """ import ctypes -from io import BytesIO import json -from typing import Any, Dict, List, Literal, Optional, Union +from io import BytesIO +from typing import Any, Literal, Optional, Union + import numpy as np -from pydantic import BaseModel, Field from meshoptimizer._loader import lib +from pydantic import BaseModel, Field -from .data_handler import DataHandler from .common import PathLike +from .data_handler import DataHandler # Optional JAX support try: @@ -37,7 +38,7 @@ class ArrayMetadata(BaseModel): Used in zip files to store array metadata. """ - shape: List[int] = Field(..., description="Shape of the array") + shape: list[int] = Field(..., description="Shape of the array") dtype: str = Field(..., description="Data type of the array as string") itemsize: int = Field(..., description="Size of each item in bytes") array_type: ArrayType = Field( @@ -133,57 +134,59 @@ def convert_recursive(obj, array_type: ArrayType): return obj @staticmethod - def extract_nested_arrays(obj, prefix: str = "") -> Dict[str, Array]: + def extract_nested_arrays( + obj, + prefix: str = "", + skip: Optional[callable] = None, + ) -> dict[str, Array]: """Recursively extract arrays from nested dicts and BaseModel instances. - Note: Packable instances are skipped - they handle their own encoding. 
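Editor's note: the `skip` predicate introduced in this hunk replaces the hard-coded Packable check inside `ArrayUtils`. A small sketch of the new call shape (the `payload` dict is made up for illustration):

```python
import numpy as np

from meshly import Packable
from meshly.array import ArrayUtils

payload = {
    "fields": {"temperature": np.zeros(8, dtype=np.float32)},
    "scale": 2.0,
}

# Packables are excluded by the caller-supplied predicate, not by ArrayUtils itself
arrays = ArrayUtils.extract_nested_arrays(
    payload, skip=lambda value: isinstance(value, Packable)
)
print(sorted(arrays))  # ['fields.temperature']
```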
+ Args: + obj: Object to extract arrays from + prefix: Path prefix for nested keys + skip: Optional predicate - if skip(obj) is True, skip this object """ - from pydantic import BaseModel - from .packable import Packable arrays = {} - if ArrayUtils.is_array(obj): - arrays[prefix] = obj - elif isinstance(obj, Packable): - # Skip Packable instances - they encode themselves + if skip and skip(obj): pass + elif ArrayUtils.is_array(obj): + arrays[prefix] = obj elif isinstance(obj, BaseModel): for name in type(obj).model_fields: value = getattr(obj, name, None) if value is not None: key = f"{prefix}.{name}" if prefix else name - arrays.update(ArrayUtils.extract_nested_arrays(value, key)) + arrays.update(ArrayUtils.extract_nested_arrays(value, key, skip)) elif isinstance(obj, dict): for k, v in obj.items(): key = f"{prefix}.{k}" if prefix else k - arrays.update(ArrayUtils.extract_nested_arrays(v, key)) + arrays.update(ArrayUtils.extract_nested_arrays(v, key, skip)) return arrays @staticmethod - def extract_non_arrays(obj): - """Extract non-array values, preserving BaseModel type info for reconstruction. + def extract_non_arrays(obj, skip: Optional[callable] = None): + """Extract non-array values from nested structures. - Note: Packable instances are skipped - they handle their own encoding. + Args: + obj: Object to extract non-arrays from + skip: Optional predicate - if skip(obj) is True, skip this object """ - from pydantic import BaseModel - from .packable import Packable if ArrayUtils.is_array(obj): return None - if isinstance(obj, Packable): - # Skip Packable instances - they encode themselves + if skip and skip(obj): return None if isinstance(obj, BaseModel): - result = {"__model_class__": obj.__class__.__name__, - "__model_module__": obj.__class__.__module__} + result = {} for name in type(obj).model_fields: val = getattr(obj, name, None) - if not ArrayUtils.is_array(val) and not isinstance(val, Packable): - extracted = ArrayUtils.extract_non_arrays(val) + if not ArrayUtils.is_array(val) and not (skip and skip(val)): + extracted = ArrayUtils.extract_non_arrays(val, skip) if extracted is not None: result[name] = extracted - return result if len(result) > 2 else None + return result or None if isinstance(obj, dict): - result = {k: ArrayUtils.extract_non_arrays(v) for k, v in obj.items() - if not ArrayUtils.is_array(v) and not isinstance(v, Packable)} + result = {k: ArrayUtils.extract_non_arrays(v, skip) for k, v in obj.items() + if not ArrayUtils.is_array(v) and not (skip and skip(v))} result = {k: v for k, v in result.items() if v is not None} return result or None return obj diff --git a/python/meshly/packable.py b/python/meshly/packable.py index 4fc3bc2..e7d12cc 100644 --- a/python/meshly/packable.py +++ b/python/meshly/packable.py @@ -11,31 +11,21 @@ use the extract() and reconstruct() methods to handle asset management. 
""" -import asyncio -import hashlib -import inspect import json from collections.abc import Callable -from dataclasses import dataclass, field -from functools import cached_property +from dataclasses import dataclass from io import BytesIO from pathlib import Path -from typing import ( - Any, - Dict, - Generic, - Optional, - Set, - Type, - TypeVar, - Union, -) - -from pydantic import BaseModel, Field, computed_field +from typing import Any, Generic, TypeVar, Union + +from pydantic import BaseModel, Field from .array import Array, ArrayType, ArrayUtils from .common import PathLike from .data_handler import AssetProvider, CachedAssetLoader, DataHandler +from .utils.checksum_utils import ChecksumUtils +from .utils.schema_utils import SchemaUtils +from .utils.serialization_utils import SerializationUtils TModel = TypeVar("TModel", bound=BaseModel) @@ -48,7 +38,7 @@ class PackableMetadata(BaseModel): TPackableMetadata = TypeVar("TPackableMetadata", bound=PackableMetadata) TPackable = TypeVar("TPackable", bound="Packable") -FieldValue = TypeVar("FieldValue") # Value type for custom fields +FieldValue = TypeVar("FieldValue") @dataclass @@ -66,11 +56,7 @@ class SerializedPackableData: class LazyModel(Generic[TModel]): - """ - Lazy proxy for a Pydantic BaseModel that defers asset loading until field access. - - Fields containing $ref references are not resolved until accessed, - allowing for truly lazy loading from external storage. + """Lazy proxy for a Pydantic BaseModel that defers asset loading until field access. Example: def fetch_asset(checksum: str) -> bytes: @@ -81,11 +67,6 @@ def fetch_asset(checksum: str) -> bytes: temp = lazy.temperature # NOW the temperature asset is fetched vel = lazy.velocity # NOW the velocity asset is fetched - - # With a cache handler for persistence: - cache = DataHandler.create(Path("./cache")) - loader = CachedAssetLoader(fetch_asset, cache) - lazy = Packable.reconstruct(SimulationCase, data, loader) """ __slots__ = ("_model_class", "_data", "_assets", "_array_type", "_cache", "_resolved") @@ -106,53 +87,11 @@ def __init__( def _get_cached_asset(self, checksum: str) -> bytes: """Get asset bytes, using cache if CachedAssetLoader is provided.""" - assets = object.__getattribute__(self, "_assets") - - # Handle CachedAssetLoader - if isinstance(assets, CachedAssetLoader): - cache_path = f"assets/{checksum}.bin" - - # Try to read from cache first - try: - return assets.cache.read_binary(cache_path) - except (KeyError, FileNotFoundError): - pass - - # Fetch from provider (may be sync or async) - result = assets.fetch(checksum) - if inspect.isawaitable(result): - result = asyncio.get_event_loop().run_until_complete(result) - - # If fetch returned None, try cache again (it might have been populated elsewhere) - # If still not found, raise error - if result is None: - try: - return assets.cache.read_binary(cache_path) - except (KeyError, FileNotFoundError): - raise KeyError(f"Asset '{checksum}' not found in remote or cache") - - asset_bytes = result - - # Store in cache - assets.cache.write_binary(cache_path, asset_bytes) - return asset_bytes - - # Handle plain callable (may be sync or async) - if callable(assets): - result = assets(checksum) - if inspect.isawaitable(result): - result = asyncio.get_event_loop().run_until_complete(result) - if result is None: - raise KeyError(f"Asset fetcher returned None for checksum '{checksum}'") - return result - - # Handle dict - if checksum not in assets: - raise KeyError(f"Missing asset with checksum '{checksum}'") - return 
assets[checksum] + return SerializationUtils.get_cached_asset( + object.__getattribute__(self, "_assets"), checksum + ) def __getattr__(self, name: str) -> Any: - # Check cache first cache = object.__getattribute__(self, "_cache") if name in cache: return cache[name] @@ -161,7 +100,6 @@ def __getattr__(self, name: str) -> Any: data = object.__getattribute__(self, "_data") array_type = object.__getattribute__(self, "_array_type") - # Check if it's a model field if name not in model_class.model_fields: raise AttributeError(f"'{model_class.__name__}' has no attribute '{name}'") @@ -171,12 +109,10 @@ def __getattr__(self, name: str) -> Any: field_value = data[name] field_type = model_class.model_fields[name].annotation - # Resolve this specific field using our caching asset getter - resolved = Packable._resolve_value_with_type( + resolved = SchemaUtils.resolve_value_with_type( field_value, field_type, self._get_cached_asset, array_type ) - # Cache the resolved value cache[name] = resolved return resolved @@ -184,11 +120,7 @@ def __setattr__(self, name: str, value: Any) -> None: raise AttributeError("LazyModel is read-only. Use resolve() to get a mutable model.") def resolve(self) -> TModel: - """ - Fully resolve all fields and return the actual Pydantic model. - - This will fetch all remaining assets that haven't been accessed yet. - """ + """Fully resolve all fields and return the actual Pydantic model.""" resolved = object.__getattribute__(self, "_resolved") if resolved is not None: return resolved @@ -198,13 +130,12 @@ def resolve(self) -> TModel: array_type = object.__getattribute__(self, "_array_type") cache = object.__getattribute__(self, "_cache") - # Resolve all fields, using cache where available resolved_data = {} for field_name, field_info in model_class.model_fields.items(): if field_name in cache: resolved_data[field_name] = cache[field_name] elif field_name in data: - resolved_data[field_name] = Packable._resolve_value_with_type( + resolved_data[field_name] = SchemaUtils.resolve_value_with_type( data[field_name], field_info.annotation, self._get_cached_asset, array_type ) @@ -236,17 +167,12 @@ class CustomFieldConfig(Generic[FieldValue, TPackableMetadata]): class Packable(BaseModel): - """ - Base class for data containers with automatic array serialization. + """Base class for data containers with automatic array serialization. Subclasses can define numpy array attributes which will be automatically detected, encoded, and saved to zip files. Non-array fields are preserved in metadata. - Packables cannot contain nested Packables. For composite structures, - use extract() to get a serializable dict with asset references, and - reconstruct() to rebuild from the dict and assets. - Example: class SimulationResult(Packable): time: float @@ -259,15 +185,7 @@ class SimulationResult(Packable): velocity=np.zeros((3, 3)) ) result.save_to_zip("result.zip") - - # Load using the specific class loaded = SimulationResult.load_from_zip("result.zip") - - # Or use extract/reconstruct for custom asset management - extracted = result.extract() - # extracted.data contains {"time": 0.1, "temperature": {"$ref": "abc123"}, ...} - # extracted.assets contains {"abc123": , ...} - rebuilt = SimulationResult.reconstruct(extracted.data, extracted.assets) """ class Config: @@ -278,27 +196,18 @@ def __init__(self, **data): self._validate_no_direct_packable_fields() def _validate_no_direct_packable_fields(self) -> None: - """Validate that this Packable has no direct Packable fields. 
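Editor's note: a sketch of what `_validate_no_direct_packable_fields` enforces, using hypothetical `Scene` and `SceneLibrary` models — a Packable placed directly in a field is rejected at construction time, while one nested inside a dict is accepted and handled later by `extract()`:

```python
import numpy as np
from pydantic import Field

from meshly import Mesh, Packable


class Scene(Packable):
    background: Mesh | None = None  # direct Packable field -> rejected


class SceneLibrary(Packable):
    meshes: dict[str, Mesh] = Field(default_factory=dict)  # nested in a dict -> allowed


mesh = Mesh(
    vertices=np.array([[0, 0, 0], [1, 0, 0], [0, 1, 0]], dtype=np.float32),
    indices=np.array([0, 1, 2], dtype=np.uint32),
)

SceneLibrary(meshes={"tri": mesh})   # accepted
try:
    Scene(background=mesh)           # raises TypeError at construction
except TypeError as err:
    print(err)
```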
- - Packables nested inside dicts or other BaseModels are allowed and will - be handled by extract(). Only direct Packable fields are prohibited. - """ + """Validate that this Packable has no direct Packable fields.""" for field_name in type(self).model_fields: if field_name in self.__private_attributes__: continue value = getattr(self, field_name, None) - if value is None: - continue - - # Only reject direct Packable fields - if isinstance(value, Packable): + if value is not None and isinstance(value, Packable): raise TypeError( f"Direct Packable fields are not allowed. Field '{field_name}' " f"contains a {type(value).__name__}. Packables can be nested " "inside dicts or other BaseModels, and extract() will handle them." ) - @property def array_fields(self) -> set[str]: """Get all array field paths, including nested arrays in dicts/BaseModels.""" @@ -308,11 +217,13 @@ def array_fields(self) -> set[str]: continue value = getattr(self, field_name, None) if value is not None: - result.update(ArrayUtils.extract_nested_arrays(value, field_name).keys()) + result.update(ArrayUtils.extract_nested_arrays( + value, field_name, skip=lambda x: isinstance(x, Packable) + ).keys()) return result def _extract_non_array_fields(self) -> dict[str, Any]: - """Extract non-array field values for metadata, preserving BaseModel type info.""" + """Extract non-array field values for metadata.""" model_data = {} direct_arrays = {f for f in self.array_fields if "." not in f} for name in type(self).model_fields: @@ -320,55 +231,26 @@ def _extract_non_array_fields(self) -> dict[str, Any]: continue value = getattr(self, name, None) if value is not None and not ArrayUtils.is_array(value): - extracted = ArrayUtils.extract_non_arrays(value) + extracted = ArrayUtils.extract_non_arrays(value, skip=lambda x: isinstance(x, Packable)) if extracted is not None: model_data[name] = extracted return model_data def _create_metadata(self, field_data: dict[str, Any]) -> PackableMetadata: - """ - Create metadata for this Packable. - - Subclasses can override this to return custom metadata types. - - Args: - field_data: Non-array field values to include in metadata - - Returns: - PackableMetadata (or subclass) instance - """ - return PackableMetadata( - field_data=field_data, - ) + """Create metadata for this Packable. Subclasses can override.""" + return PackableMetadata(field_data=field_data) @classmethod def load_metadata( cls, handler: DataHandler, metadata_cls: type[TPackableMetadata] = PackableMetadata ) -> TPackableMetadata: - """ - Load and validate metadata using a read handler. - - Args: - handler: ReadHandler for reading files - metadata_cls: The metadata class to use for parsing (default: PackableMetadata) - - Returns: - Metadata object of the specified type - """ + """Load and validate metadata using a read handler.""" metadata_text = handler.read_text("metadata.json") metadata_dict = json.loads(metadata_text) return metadata_cls(**metadata_dict) - def save_to_zip( - self, - destination: Union[PathLike, BytesIO], - ) -> None: - """ - Save this container to a zip file. - - Args: - destination: Path to the output zip file or BytesIO buffer - """ + def save_to_zip(self, destination: Union[PathLike, BytesIO]) -> None: + """Save this container to a zip file.""" encoded = self.encode() if isinstance(destination, BytesIO): destination.write(encoded) @@ -381,24 +263,7 @@ def load_from_zip( source: Union[PathLike, BytesIO], array_type: ArrayType | None = None, ) -> TPackable: - """ - Load a Packable from a zip file. 
- - Args: - source: Path to the input zip file or BytesIO object - array_type: Array backend to use ("numpy" or "jax"). If None (default), - uses the array_type stored in each array's metadata, - preserving the original array types that were saved. - - Returns: - Loaded Packable instance - - Raises: - TypeError: If called on base Packable class instead of a subclass - - Example: - mesh = Mesh.load_from_zip("mesh.zip") - """ + """Load a Packable from a zip file.""" if isinstance(source, BytesIO): source.seek(0) return cls.decode(source.read(), array_type) @@ -408,14 +273,7 @@ def load_from_zip( @classmethod def _get_custom_fields(cls) -> dict[str, CustomFieldConfig]: - """ - Get custom field configurations for this class. - - Subclasses override this to define custom encoders/decoders. - - Returns: - Dict mapping field names to CustomFieldConfig objects - """ + """Get custom field configurations. Subclasses override this.""" return {} @classmethod @@ -439,7 +297,7 @@ def _decode_custom_fields( except (KeyError, FileNotFoundError): if not config.optional: raise ValueError( - f"Required custom field '{field_name}' ({config.file_name}.bin) not found in zip" + f"Required custom field '{field_name}' ({config.file_name}.bin) not found" ) @classmethod @@ -461,11 +319,9 @@ def _load_standard_arrays( if not file_str.endswith("/array.bin"): continue - # Extract array name: "arrays/markerIndices/boundary/array.bin" -> "markerIndices.boundary" array_path = file_str[7:-10] # Remove "arrays/" and "/array.bin" name = array_path.replace("/", ".") - # Skip custom fields base_field = name.split(".")[0] if base_field in skip_fields: continue @@ -473,7 +329,6 @@ def _load_standard_arrays( decoded = ArrayUtils.load_array(handler, name, array_type) if "." in name: - # Nested array - build nested structure parts = name.split(".") current = data for part in parts[:-1]: @@ -482,7 +337,6 @@ def _load_standard_arrays( current = current[part] current[parts[-1]] = decoded else: - # Flat array data[name] = decoded def _encode_standard_arrays(self, skip_fields: set[str]) -> dict[str, bytes]: @@ -490,29 +344,18 @@ def _encode_standard_arrays(self, skip_fields: set[str]) -> dict[str, bytes]: encoded_arrays = {} for field_name in self.array_fields: - # Skip fields with custom encoding if field_name in skip_fields: continue - # Handle nested array paths (e.g., "textures.diffuse") if "." in field_name: parts = field_name.split(".") obj = self for part in parts[:-1]: - if isinstance(obj, dict): - obj = obj[part] - else: - obj = getattr(obj, part) - - if isinstance(obj, dict): - array = obj[parts[-1]] - else: - array = getattr(obj, parts[-1]) - + obj = obj[part] if isinstance(obj, dict) else getattr(obj, part) + array = obj[parts[-1]] if isinstance(obj, dict) else getattr(obj, parts[-1]) if ArrayUtils.is_array(array): encoded_arrays[field_name] = ArrayUtils.encode_array(array) else: - # Handle direct array fields try: array = getattr(self, field_name) if ArrayUtils.is_array(array): @@ -531,33 +374,20 @@ def _encode_custom_fields(self, handler: DataHandler) -> None: handler.write_binary(f"{config.file_name}.bin", encoded_bytes) def encode(self) -> bytes: - """ - Serialize this Packable to bytes (zip format). 
- - Returns: - Bytes containing the zip-encoded data - """ + """Serialize this Packable to bytes (zip format).""" custom_field_names = self._get_custom_field_names() - - # Encode standard arrays encoded_arrays = self._encode_standard_arrays(custom_field_names) - - # Create metadata field_data = self._extract_non_array_fields() metadata = self._create_metadata(field_data) - # Write to zip destination = BytesIO() handler = DataHandler.create(destination) - # Save standard arrays for name in sorted(encoded_arrays.keys()): ArrayUtils.save_array(handler, name, encoded_arrays[name]) - # Save custom encoded fields self._encode_custom_fields(handler) - # Save metadata handler.write_text( "metadata.json", json.dumps(metadata.model_dump(), indent=2, sort_keys=True), @@ -572,20 +402,7 @@ def decode( buf: bytes, array_type: ArrayType | None = None, ) -> TPackable: - """ - Deserialize a Packable from bytes. - - Args: - buf: Bytes containing the zip-encoded data - array_type: Array backend to use. If None (default), uses the - array_type stored in each array's metadata. - - Returns: - Loaded Packable instance - - Raises: - TypeError: If called on base Packable class instead of a subclass - """ + """Deserialize a Packable from bytes.""" if cls is Packable: raise TypeError( "Cannot decode on base Packable class. " @@ -594,50 +411,27 @@ def decode( handler = DataHandler.create(BytesIO(buf)) metadata = cls.load_metadata(handler) - - # Fields to skip when loading standard arrays skip_fields = cls._get_custom_field_names() data: dict[str, Any] = {} - - # Decode custom fields first cls._decode_custom_fields(handler, metadata, data, array_type) - - # Load standard arrays cls._load_standard_arrays(handler, data, skip_fields, array_type) - # Merge non-array fields from metadata using schema-aware reconstruction if metadata.field_data: - cls._merge_field_data_with_schema(cls, data, metadata.field_data) + SchemaUtils.merge_field_data_with_schema(cls, data, metadata.field_data) return cls(**data) @staticmethod def extract(obj: BaseModel) -> SerializedPackableData: - """ - Extract arrays and Packables from a Pydantic BaseModel into serializable data and assets. - - Args: - obj: A Pydantic BaseModel instance (including Packable subclasses) - - Returns an ExtractedPackable with: - - data: A JSON-serializable dict with `{"$ref": checksum}` for arrays/Packables - - assets: A dict mapping checksums to encoded bytes - - Arrays and nested Packables are stored as assets. The type information comes - from the Pydantic schema when reconstructing, so no class/module info is stored. + """Extract arrays and Packables from a BaseModel into serializable data and assets. - Example: - mesh = Mesh(vertices=..., indices=...) - extracted = Packable.extract(mesh) - # extracted.data = {"vertices": {"$ref": "abc..."}, "indices": {"$ref": "def..."}} - - rebuilt = Mesh.reconstruct(extracted.data, extracted.assets) + Returns: + SerializedPackableData with data dict (refs for arrays) and assets dict """ if not isinstance(obj, BaseModel): raise TypeError( - f"extract() requires a Pydantic BaseModel, got {type(obj).__name__}. " - "Use Pydantic models for type-safe extraction and reconstruction." + f"extract() requires a Pydantic BaseModel, got {type(obj).__name__}." 
) assets: dict[str, bytes] = {} @@ -647,133 +441,38 @@ def extract(obj: BaseModel) -> SerializedPackableData: if hasattr(obj, "__private_attributes__") and field_name in obj.__private_attributes__: continue value = getattr(obj, field_name, None) - if value is None: - continue - data[field_name] = Packable._extract_value(value, assets) + if value is not None: + data[field_name] = SerializationUtils.extract_value(value, assets) return SerializedPackableData(data=data, assets=assets) - @staticmethod def compute_checksum( obj: Union[bytes, "SerializedPackableData", "Packable", BaseModel], ) -> str: - """ - Compute SHA256 checksum for various types of data. - - Supports: - - bytes: Direct checksum of the bytes - - SerializedPackableData: Checksum of JSON-serialized data + sorted asset checksums - - Packable: Checksum of encoded zip bytes - - BaseModel: Extract to SerializedPackableData and compute checksum - - Args: - obj: The object to compute checksum for + """Compute SHA256 checksum for various types of data. Returns: 16-character hex string (first 64 bits of SHA256) - - Example: - # Direct bytes checksum - checksum = compute_checksum(some_bytes) - - # Packable checksum - mesh = Mesh(vertices=..., indices=...) - checksum = compute_checksum(mesh) - - # Extracted data checksum - extracted = Packable.extract(mesh) - checksum = compute_checksum(extracted) - - # Any Pydantic BaseModel - checksum = compute_checksum(my_pydantic_model) """ if isinstance(obj, bytes): - return hashlib.sha256(obj).hexdigest()[:16] + return ChecksumUtils.compute_bytes_checksum(obj) if isinstance(obj, SerializedPackableData): - # Combine data JSON + all asset bytes for deterministic hashing - data_json = json.dumps(obj.data, sort_keys=True).encode("utf-8") - # Hash data first, then add sorted asset contents - hasher = hashlib.sha256() - hasher.update(data_json) - hasher.update(b"\x00") - # Include actual asset bytes in sorted order for content-based hashing - for checksum in sorted(obj.assets.keys()): - hasher.update(obj.assets[checksum]) - return hasher.hexdigest()[:16] + return ChecksumUtils.compute_dict_checksum(obj.data, obj.assets) if isinstance(obj, Packable): - return hashlib.sha256(obj.encode()).hexdigest()[:16] + return ChecksumUtils.compute_bytes_checksum(obj.encode()) if isinstance(obj, BaseModel): - # Extract and compute checksum of the extracted data extracted = Packable.extract(obj) - return Packable.compute_checksum(extracted) + return ChecksumUtils.compute_dict_checksum(extracted.data, extracted.assets) raise TypeError( f"compute_checksum() requires bytes, SerializedPackableData, Packable, or BaseModel, " f"got {type(obj).__name__}" ) - @staticmethod - def _extract_value(value: Any, assets: dict[str, bytes]) -> Any: - """Recursively extract a value, replacing arrays and nested Packables with refs.""" - # Handle arrays - if ArrayUtils.is_array(value): - encoded = ArrayUtils.encode_array(value) - # Pack metadata + data together as bytes for the asset - metadata_json = json.dumps(encoded.metadata.model_dump()).encode("utf-8") - # Format: [4 bytes metadata length][metadata json][array data] - packed = len(metadata_json).to_bytes(4, "little") + metadata_json + encoded.data - checksum = Packable.compute_checksum(packed) - assets[checksum] = packed - return {"$ref": checksum} - - # Handle Packables - extract as encoded zip bytes - if isinstance(value, Packable): - encoded = value.encode() - checksum = Packable.compute_checksum(encoded) - assets[checksum] = encoded - return {"$ref": checksum} - - # Handle dicts - 
if isinstance(value, dict): - return {k: Packable._extract_value(v, assets) for k, v in value.items()} - - # Handle lists/tuples - if isinstance(value, (list, tuple)): - result = [Packable._extract_value(v, assets) for v in value] - return result if isinstance(value, list) else tuple(result) - - # Handle non-Packable BaseModels - recursively extract their fields - if isinstance(value, BaseModel): - extracted = {} - for name in value.model_fields: - field_value = getattr(value, name, None) - if field_value is not None: - extracted[name] = Packable._extract_value(field_value, assets) - return extracted - - # Primitive value - return as-is - return value - - @staticmethod - def _get_asset(assets: AssetProvider, checksum: str) -> bytes: - """Get asset bytes from either a dict or callable provider. - - Supports both sync and async callables - async results are awaited - synchronously using asyncio.get_event_loop().run_until_complete(). - """ - if callable(assets): - result = assets(checksum) - if inspect.isawaitable(result): - result = asyncio.get_event_loop().run_until_complete(result) - return result - if checksum not in assets: - raise KeyError(f"Missing asset with checksum '{checksum}'") - return assets[checksum] - @staticmethod def reconstruct( model_class: type[TModel], @@ -781,367 +480,35 @@ def reconstruct( assets: AssetProvider, array_type: ArrayType | None = None, ) -> Union[TModel, LazyModel[TModel]]: - """ - Reconstruct a Pydantic BaseModel from extracted data and assets. - - Uses the class's Pydantic schema to determine types for nested fields, - so no runtime type information needs to be stored in the data. - - If assets is a dict, all assets are loaded immediately and the actual - model is returned. If assets is a callable or CachedAssetLoader, a - LazyModel proxy is returned that defers asset loading until field access. + """Reconstruct a Pydantic BaseModel from extracted data and assets. - Args: - model_class: The Pydantic BaseModel class to reconstruct - data: The data dict from extract(), with $ref references - assets: One of: - - Dict mapping checksums to bytes (eager loading) - - Callable that takes a checksum and returns bytes (lazy loading) - - CachedAssetLoader with fetch callable and cache handler (lazy + disk cache) - array_type: Array backend to use. If None, uses the type stored - in each array's metadata. - - Returns: - - If assets is a dict: Reconstructed BaseModel instance (eager) - - If assets is callable/CachedAssetLoader: LazyModel proxy that loads on demand - - Raises: - KeyError: If a referenced asset is missing (for dict assets, raised immediately; - for callable assets, raised on field access) - - Example: - extracted = Packable.extract(simulation_case) - - # Eager loading with dict - returns actual model - rebuilt = Packable.reconstruct(SimulationCase, extracted.data, extracted.assets) - - # Lazy loading with callable - returns LazyModel - def fetch_asset(checksum: str) -> bytes: - return storage.get(checksum) - lazy = Packable.reconstruct(SimulationCase, data, fetch_asset) - - # Lazy loading with disk cache - cache = DataHandler.create(Path("./cache")) - loader = CachedAssetLoader(fetch_asset, cache) - lazy = Packable.reconstruct(SimulationCase, data, loader) - - print(lazy.time) # Primitive field, no fetch needed - print(lazy.temperature) # Fetches and caches temperature asset - model = lazy.resolve() # Get full Pydantic model + If assets is a dict, returns the actual model (eager loading). 
+ If assets is a callable or CachedAssetLoader, returns a LazyModel proxy. """ if callable(assets) or isinstance(assets, CachedAssetLoader): return LazyModel(model_class, data, assets, array_type) - resolved_data = Packable._resolve_refs_with_schema(model_class, data, assets, array_type) + resolved_data = SchemaUtils.resolve_refs_with_schema(model_class, data, assets, array_type) return model_class(**resolved_data) - @staticmethod - def _decode_packed_array(packed: bytes, array_type: ArrayType | None) -> Any: - """Decode a packed array asset (metadata + data) back to an array.""" - from .array import ArrayMetadata, EncodedArray - - # Unpack: [4 bytes metadata length][metadata json][array data] - metadata_len = int.from_bytes(packed[:4], "little") - metadata_json = packed[4 : 4 + metadata_len].decode("utf-8") - array_data = packed[4 + metadata_len :] - - metadata_dict = json.loads(metadata_json) - metadata = ArrayMetadata(**metadata_dict) - encoded = EncodedArray(data=array_data, metadata=metadata) - - decoded = ArrayUtils.decode_array(encoded) - - # Convert to requested array type if specified - if array_type is not None: - return ArrayUtils.convert_array(decoded, array_type) - elif metadata.array_type != "numpy": - return ArrayUtils.convert_array(decoded, metadata.array_type) - return decoded - - @staticmethod - def _resolve_refs_with_schema( - model_class: type[BaseModel], - data: dict[str, Any], - assets: AssetProvider, - array_type: ArrayType | None, - ) -> dict[str, Any]: - """ - Resolve $ref references using Pydantic schema for type information. - - Uses model_class.model_fields to determine the expected type for each field, - so no class/module information needs to be stored in the data. - """ - result = {} - - for field_name, field_info in model_class.model_fields.items(): - if field_name not in data: - continue - - field_value = data[field_name] - field_type = field_info.annotation - - result[field_name] = Packable._resolve_value_with_type( - field_value, field_type, assets, array_type - ) - - return result - - @staticmethod - def _resolve_value_with_type( - value: Any, - expected_type: Any, - assets: AssetProvider, - array_type: ArrayType | None, - ) -> Any: - """Resolve a value using the expected type from Pydantic schema.""" - from typing import Union, get_args, get_origin - - if value is None: - return None - - # Handle $ref - decode based on expected type - if isinstance(value, dict) and "$ref" in value: - checksum = value["$ref"] - asset_bytes = Packable._get_asset(assets, checksum) - - # Determine if this is a Packable or array based on expected_type - origin = get_origin(expected_type) - - # Unwrap Optional[X] -> X - if origin is Union: - args = get_args(expected_type) - non_none = [a for a in args if a is not type(None)] - if len(non_none) == 1: - expected_type = non_none[0] - origin = get_origin(expected_type) - - # Check if expected type is a Packable subclass - if isinstance(expected_type, type) and issubclass(expected_type, Packable): - return expected_type.decode(asset_bytes, array_type) - - # Otherwise assume it's an array - return Packable._decode_packed_array(asset_bytes, array_type) - - # Handle nested BaseModel (non-ref dict that should be a model) - if isinstance(value, dict): - origin = get_origin(expected_type) - - # Unwrap Optional - if origin is Union: - args = get_args(expected_type) - non_none = [a for a in args if a is not type(None)] - if len(non_none) == 1: - expected_type = non_none[0] - origin = get_origin(expected_type) - - # Dict type - resolve 
values with value type - if origin is dict: - key_type, value_type = get_args(expected_type) - return { - k: Packable._resolve_value_with_type(v, value_type, assets, array_type) - for k, v in value.items() - } - - # BaseModel type - recursively resolve with schema - if isinstance(expected_type, type) and issubclass(expected_type, BaseModel): - resolved = Packable._resolve_refs_with_schema( - expected_type, value, assets, array_type - ) - return expected_type(**resolved) - - # Unknown dict - return as-is - return value - - # Handle lists/tuples - if isinstance(value, (list, tuple)): - origin = get_origin(expected_type) - - # Unwrap Optional - if origin is Union: - args = get_args(expected_type) - non_none = [a for a in args if a is not type(None)] - if len(non_none) == 1: - expected_type = non_none[0] - origin = get_origin(expected_type) - - # Get element type - if origin in (list, tuple): - args = get_args(expected_type) - elem_type = args[0] if args else Any - else: - elem_type = Any - - result = [ - Packable._resolve_value_with_type(v, elem_type, assets, array_type) for v in value - ] - return result if isinstance(value, list) else tuple(result) - - # Primitive - return as-is - return value - - @staticmethod - def _merge_field_data_with_schema( - model_class: type[BaseModel], - data: dict[str, Any], - field_data: dict[str, Any], - ) -> None: - """ - Merge metadata field_data into data, using Pydantic schema for type info. - - This handles the reconstruction of nested BaseModel instances without - needing __model_class__/__model_module__ markers. - """ - from typing import Union, get_args, get_origin - - for key, value in field_data.items(): - if key in ("__model_class__", "__model_module__"): - # Skip legacy markers - continue - - if key not in model_class.model_fields: - # Unknown field - store as-is - data[key] = value - continue - - field_type = model_class.model_fields[key].annotation - merged = Packable._merge_value_with_schema(value, field_type, data.get(key)) - data[key] = merged - - @staticmethod - def _merge_value_with_schema( - metadata_value: Any, - expected_type: Any, - existing_value: Any, - ) -> Any: - """Merge a metadata value with existing data using the schema type.""" - from typing import Union, get_args, get_origin - - if metadata_value is None: - return existing_value - - # Unwrap Optional - origin = get_origin(expected_type) - if origin is Union: - args = get_args(expected_type) - non_none = [a for a in args if a is not type(None)] - if len(non_none) == 1: - expected_type = non_none[0] - origin = get_origin(expected_type) - - # Handle dict type - if origin is dict: - key_type, value_type = get_args(expected_type) - if isinstance(metadata_value, dict) and isinstance(existing_value, dict): - # Merge dict entries - result = dict(existing_value) - for k, v in metadata_value.items(): - if k in ("__model_class__", "__model_module__"): - continue - result[k] = Packable._merge_value_with_schema( - v, value_type, existing_value.get(k) - ) - return result - elif isinstance(metadata_value, dict): - # No existing value - reconstruct from metadata - return { - k: Packable._merge_value_with_schema(v, value_type, None) - for k, v in metadata_value.items() - if k not in ("__model_class__", "__model_module__") - } - return metadata_value - - # Handle BaseModel type - if isinstance(expected_type, type) and issubclass(expected_type, BaseModel): - if isinstance(metadata_value, dict): - # Filter out legacy markers - filtered = { - k: v - for k, v in metadata_value.items() - if k not in 
("__model_class__", "__model_module__") - } - - if isinstance(existing_value, dict): - # Merge with existing dict data - merged = dict(existing_value) - Packable._merge_field_data_with_schema(expected_type, merged, filtered) - return expected_type(**merged) - else: - # Reconstruct from metadata - data = {} - Packable._merge_field_data_with_schema(expected_type, data, filtered) - return expected_type(**data) - return metadata_value - - # Handle list type - if origin in (list, tuple): - if isinstance(metadata_value, (list, tuple)): - args = get_args(expected_type) - elem_type = args[0] if args else Any - result = [ - Packable._merge_value_with_schema(v, elem_type, None) for v in metadata_value - ] - return result if origin is list else tuple(result) - return metadata_value - - # Primitive - use metadata value - return metadata_value - def __reduce__(self): - """ - Support for pickle serialization. - - Array types are preserved automatically via the per-array metadata. - """ - return ( - self.__class__.decode, - (self.encode(),), - ) + """Support for pickle serialization.""" + return (self.__class__.decode, (self.encode(),)) @staticmethod def load_array( source: Union[PathLike, BytesIO], name: str, array_type: ArrayType | None = None ) -> Array: - """ - Load a single array from a zip file without loading the entire object. - - Useful for large files where you only need one array. - - Args: - source: Path to the zip file or BytesIO buffer - name: Array name (e.g., "normals" or "markerIndices.boundary") - array_type: Array backend to use ("numpy" or "jax"). If None (default), - uses the array_type stored in the array's metadata. - - Returns: - Decoded array (numpy or JAX) - - Raises: - KeyError: If array not found in zip - - Example: - normals = Mesh.load_array("mesh.zip", "normals") - """ + """Load a single array from a zip file without loading the entire object.""" if isinstance(source, BytesIO): source.seek(0) handler = DataHandler.create(BytesIO(source.read())) else: - with open(source, "rb") as f: - handler = DataHandler.create(BytesIO(f.read())) + handler = DataHandler.create(BytesIO(Path(source).read_bytes())) return ArrayUtils.load_array(handler, name, array_type) def convert_to(self: TPackable, array_type: ArrayType) -> TPackable: - """ - Create a new Packable with all arrays converted to the specified type. - - Args: - array_type: Target array backend ("numpy" or "jax") - - Returns: - A new Packable with all arrays converted - - Raises: - AssertionError: If JAX is requested but not available - """ + """Create a new Packable with all arrays converted to the specified type.""" data_copy = self.model_copy(deep=True) for field_name in data_copy.model_fields_set: diff --git a/python/meshly/utils/__init__.py b/python/meshly/utils/__init__.py index 8c5d72a..7674b30 100644 --- a/python/meshly/utils/__init__.py +++ b/python/meshly/utils/__init__.py @@ -2,14 +2,20 @@ Utility modules for meshly. This package contains utility functions for mesh operations, element handling, -and triangulation. +triangulation, checksums, serialization, and schema operations. 
""" +from .checksum_utils import ChecksumUtils from .element_utils import ElementUtils, TriangulationUtils from .mesh_utils import MeshUtils +from .schema_utils import SchemaUtils +from .serialization_utils import SerializationUtils __all__ = [ + "ChecksumUtils", "ElementUtils", - "TriangulationUtils", "MeshUtils", + "SchemaUtils", + "SerializationUtils", + "TriangulationUtils", ] diff --git a/python/meshly/utils/checksum_utils.py b/python/meshly/utils/checksum_utils.py new file mode 100644 index 0000000..e1af476 --- /dev/null +++ b/python/meshly/utils/checksum_utils.py @@ -0,0 +1,152 @@ +"""Checksum utilities for hashing data, files, and directories.""" + +import hashlib +import json +from pathlib import Path +from typing import Any, Optional + + +class ChecksumUtils: + """Utility class for computing checksums.""" + + # Thresholds for switching to fast checksum strategy + LARGE_FILE_THRESHOLD = 10 * 1024 * 1024 # 10MB + LARGE_DIR_FILE_COUNT_THRESHOLD = 100 + + @staticmethod + def compute_bytes_checksum(data: bytes) -> str: + """Compute SHA256 checksum for bytes. + + Args: + data: Bytes to hash + + Returns: + 16-character hex string (first 64 bits of SHA256) + """ + return hashlib.sha256(data).hexdigest()[:16] + + @staticmethod + def compute_dict_checksum(data: dict[str, Any], assets: dict[str, bytes]) -> str: + """Compute checksum for a data dict with assets. + + Combines data JSON + all asset bytes for deterministic hashing. + + Args: + data: JSON-serializable dict + assets: Map of checksum -> bytes + + Returns: + 16-character hex string + """ + data_json = json.dumps(data, sort_keys=True).encode("utf-8") + hasher = hashlib.sha256() + hasher.update(data_json) + hasher.update(b"\x00") + for checksum in sorted(assets.keys()): + hasher.update(assets[checksum]) + return hasher.hexdigest()[:16] + + @staticmethod + def compute_file_checksum(file_path: Path, fast: bool = False) -> str: + """Compute checksum of a file. + + Args: + file_path: Path to the file + fast: If True, use file metadata (size, mtime) instead of content hash + for large files. This is much faster but less accurate. + + Returns: + Full SHA256 checksum string + """ + file_path = Path(file_path) + if not file_path.exists(): + raise FileNotFoundError(f"File not found: {file_path}") + if not file_path.is_file(): + raise ValueError(f"Path is not a file: {file_path}") + + file_size = file_path.stat().st_size + + if fast and file_size > ChecksumUtils.LARGE_FILE_THRESHOLD: + return ChecksumUtils._compute_file_metadata_checksum(file_path) + + return ChecksumUtils._compute_file_content_checksum(file_path) + + @staticmethod + def _compute_file_content_checksum(file_path: Path) -> str: + """Compute SHA256 checksum of file contents.""" + hasher = hashlib.sha256() + with open(file_path, "rb") as f: + for chunk in iter(lambda: f.read(8192), b""): + hasher.update(chunk) + return hasher.hexdigest() + + @staticmethod + def _compute_file_metadata_checksum(file_path: Path) -> str: + """Compute checksum based on file metadata (path, size, mtime).""" + stat = file_path.stat() + metadata = f"{file_path.resolve()}|{stat.st_size}|{stat.st_mtime}" + return hashlib.sha256(metadata.encode()).hexdigest() + + @staticmethod + def compute_directory_checksum(dir_path: Path, fast: Optional[bool] = None) -> str: + """Compute checksum of a directory. + + Args: + dir_path: Path to the directory + fast: If True, use file metadata instead of content hashes. + If None (default), automatically use fast strategy for large directories. 
+ + Returns: + Full SHA256 checksum string combining all file checksums + """ + dir_path = Path(dir_path) + if not dir_path.exists(): + raise FileNotFoundError(f"Directory not found: {dir_path}") + if not dir_path.is_dir(): + raise ValueError(f"Path is not a directory: {dir_path}") + + all_files = sorted(dir_path.rglob("*")) + file_paths = [f for f in all_files if f.is_file()] + + if fast is None: + fast = len(file_paths) > ChecksumUtils.LARGE_DIR_FILE_COUNT_THRESHOLD + + hasher = hashlib.sha256() + + for file_path in file_paths: + rel_path = file_path.relative_to(dir_path) + hasher.update(str(rel_path).encode()) + + if fast: + file_hash = ChecksumUtils._compute_file_metadata_checksum(file_path) + else: + file_hash = ChecksumUtils._compute_file_content_checksum(file_path) + + hasher.update(file_hash.encode()) + + return hasher.hexdigest() + + @staticmethod + def compute_path_checksum(path: Path, fast: Optional[bool] = None) -> str: + """Compute checksum of a file or directory. + + Args: + path: Path to file or directory + fast: If True, use metadata-based checksums for speed. + If None, automatically use fast strategy for large files/directories. + + Returns: + Full SHA256 checksum string + """ + path = Path(path) + if not path.exists(): + raise FileNotFoundError(f"Path not found: {path}") + + if path.is_file(): + return ChecksumUtils.compute_file_checksum( + path, fast=fast if fast is not None else False + ) + elif path.is_dir(): + return ChecksumUtils.compute_directory_checksum(path, fast=fast) + else: + raise ValueError(f"Path is neither a file nor directory: {path}") diff --git a/python/meshly/utils/schema_utils.py b/python/meshly/utils/schema_utils.py new file mode 100644 index 0000000..5826036 --- /dev/null +++ b/python/meshly/utils/schema_utils.py @@ -0,0 +1,224 @@ +"""Schema utilities for resolving Pydantic types and merging field data.""" + +from typing import Any, Union, get_args, get_origin + +from pydantic import BaseModel + +from ..array import ArrayType +from ..data_handler import AssetProvider +from .serialization_utils import SerializationUtils + + +class SchemaUtils: + """Utility class for Pydantic schema operations.""" + + @staticmethod + def unwrap_optional(expected_type: Any) -> Any: + """Unwrap Optional[X] to X. + + Args: + expected_type: Type annotation, possibly Optional + + Returns: + Inner type if Optional, otherwise unchanged + """ + origin = get_origin(expected_type) + if origin is Union: + args = get_args(expected_type) + non_none = [a for a in args if a is not type(None)] + if len(non_none) == 1: + return non_none[0] + return expected_type + + @staticmethod + def resolve_refs_with_schema( + model_class: type[BaseModel], + data: dict[str, Any], + assets: AssetProvider, + array_type: ArrayType | None, + ) -> dict[str, Any]: + """Resolve $ref references using Pydantic schema for type information. 
+ + Args: + model_class: Pydantic model class with field definitions + data: Data dict with potential $ref values + assets: Asset provider + array_type: Target array type + + Returns: + Resolved data dict + """ + result = {} + + for field_name, field_info in model_class.model_fields.items(): + if field_name not in data: + continue + + result[field_name] = SchemaUtils.resolve_value_with_type( + data[field_name], field_info.annotation, assets, array_type + ) + + return result + + @staticmethod + def resolve_value_with_type( + value: Any, + expected_type: Any, + assets: AssetProvider, + array_type: ArrayType | None, + ) -> Any: + """Resolve a value using the expected type from Pydantic schema. + + Args: + value: Value to resolve + expected_type: Expected type from schema + assets: Asset provider + array_type: Target array type + + Returns: + Resolved value + """ + # Import here to avoid circular imports + from ..packable import Packable + + if value is None: + return None + + # Handle $ref + if isinstance(value, dict) and "$ref" in value: + checksum = value["$ref"] + asset_bytes = SerializationUtils.get_asset(assets, checksum) + + expected_type = SchemaUtils.unwrap_optional(expected_type) + origin = get_origin(expected_type) + + if isinstance(expected_type, type) and issubclass(expected_type, Packable): + return expected_type.decode(asset_bytes, array_type) + + return SerializationUtils.unpack_array(asset_bytes, array_type) + + # Handle nested dict + if isinstance(value, dict): + expected_type = SchemaUtils.unwrap_optional(expected_type) + origin = get_origin(expected_type) + + if origin is dict: + _, value_type = get_args(expected_type) + return { + k: SchemaUtils.resolve_value_with_type(v, value_type, assets, array_type) + for k, v in value.items() + } + + if isinstance(expected_type, type) and issubclass(expected_type, BaseModel): + resolved = SchemaUtils.resolve_refs_with_schema( + expected_type, value, assets, array_type + ) + return expected_type(**resolved) + + return value + + # Handle lists/tuples + if isinstance(value, (list, tuple)): + expected_type = SchemaUtils.unwrap_optional(expected_type) + origin = get_origin(expected_type) + + if origin in (list, tuple): + args = get_args(expected_type) + elem_type = args[0] if args else Any + else: + elem_type = Any + + result = [ + SchemaUtils.resolve_value_with_type(v, elem_type, assets, array_type) + for v in value + ] + return result if isinstance(value, list) else tuple(result) + + return value + + @staticmethod + def merge_field_data_with_schema( + model_class: type[BaseModel], + data: dict[str, Any], + field_data: dict[str, Any], + ) -> None: + """Merge metadata field_data into data using Pydantic schema. + + Args: + model_class: Pydantic model class + data: Target data dict (modified in place) + field_data: Source field data from metadata + """ + for key, value in field_data.items(): + if key not in model_class.model_fields: + data[key] = value + continue + + field_type = model_class.model_fields[key].annotation + data[key] = SchemaUtils.merge_value_with_schema(value, field_type, data.get(key)) + + @staticmethod + def merge_value_with_schema( + metadata_value: Any, + expected_type: Any, + existing_value: Any, + ) -> Any: + """Merge a metadata value with existing data using the schema type. 
+ + Args: + metadata_value: Value from metadata + expected_type: Expected type from schema + existing_value: Existing value in data dict + + Returns: + Merged value + """ + if metadata_value is None: + return existing_value + + expected_type = SchemaUtils.unwrap_optional(expected_type) + origin = get_origin(expected_type) + + # Handle dict type + if origin is dict: + _, value_type = get_args(expected_type) + if isinstance(metadata_value, dict) and isinstance(existing_value, dict): + result = dict(existing_value) + for k, v in metadata_value.items(): + result[k] = SchemaUtils.merge_value_with_schema( + v, value_type, existing_value.get(k) + ) + return result + elif isinstance(metadata_value, dict): + return { + k: SchemaUtils.merge_value_with_schema(v, value_type, None) + for k, v in metadata_value.items() + } + return metadata_value + + # Handle BaseModel type + if isinstance(expected_type, type) and issubclass(expected_type, BaseModel): + if isinstance(metadata_value, dict): + if isinstance(existing_value, dict): + merged = dict(existing_value) + SchemaUtils.merge_field_data_with_schema(expected_type, merged, metadata_value) + return expected_type(**merged) + else: + data = {} + SchemaUtils.merge_field_data_with_schema(expected_type, data, metadata_value) + return expected_type(**data) + return metadata_value + + # Handle list type + if origin in (list, tuple): + if isinstance(metadata_value, (list, tuple)): + args = get_args(expected_type) + elem_type = args[0] if args else Any + result = [ + SchemaUtils.merge_value_with_schema(v, elem_type, None) + for v in metadata_value + ] + return result if origin is list else tuple(result) + return metadata_value + + return metadata_value diff --git a/python/meshly/utils/serialization_utils.py b/python/meshly/utils/serialization_utils.py new file mode 100644 index 0000000..a078ee5 --- /dev/null +++ b/python/meshly/utils/serialization_utils.py @@ -0,0 +1,177 @@ +"""Serialization utilities for packing/unpacking arrays and assets.""" + +import asyncio +import inspect +import json +from typing import Any + +from pydantic import BaseModel + +from ..array import ArrayMetadata, ArrayType, ArrayUtils, EncodedArray +from ..data_handler import AssetProvider, CachedAssetLoader +from .checksum_utils import ChecksumUtils + + +class SerializationUtils: + """Utility class for serialization operations.""" + + @staticmethod + def pack_array(encoded: EncodedArray) -> bytes: + """Pack an encoded array into bytes with metadata. + + Format: [4 bytes metadata length][metadata json][array data] + + Args: + encoded: EncodedArray with metadata and data + + Returns: + Packed bytes + """ + metadata_json = json.dumps(encoded.metadata.model_dump()).encode("utf-8") + return len(metadata_json).to_bytes(4, "little") + metadata_json + encoded.data + + @staticmethod + def unpack_array(packed: bytes, array_type: ArrayType | None = None) -> Any: + """Unpack bytes back to an array. 
+ + Args: + packed: Packed bytes from pack_array + array_type: Target array type, or None to use stored type + + Returns: + Decoded array (numpy or JAX) + """ + metadata_len = int.from_bytes(packed[:4], "little") + metadata_json = packed[4 : 4 + metadata_len].decode("utf-8") + array_data = packed[4 + metadata_len :] + + metadata_dict = json.loads(metadata_json) + metadata = ArrayMetadata(**metadata_dict) + encoded = EncodedArray(data=array_data, metadata=metadata) + + decoded = ArrayUtils.decode_array(encoded) + + if array_type is not None: + return ArrayUtils.convert_array(decoded, array_type) + elif metadata.array_type != "numpy": + return ArrayUtils.convert_array(decoded, metadata.array_type) + return decoded + + @staticmethod + def get_asset(assets: AssetProvider, checksum: str) -> bytes: + """Get asset bytes from a provider (dict, callable, or CachedAssetLoader). + + Supports both sync and async callables. + + Args: + assets: Asset provider (dict, callable, or CachedAssetLoader) + checksum: Asset checksum to fetch + + Returns: + Asset bytes + + Raises: + KeyError: If asset not found + """ + if callable(assets): + result = assets(checksum) + if inspect.isawaitable(result): + result = asyncio.get_event_loop().run_until_complete(result) + return result + if checksum not in assets: + raise KeyError(f"Missing asset with checksum '{checksum}'") + return assets[checksum] + + @staticmethod + def get_cached_asset( + assets: AssetProvider, + checksum: str, + ) -> bytes: + """Get asset bytes with caching support for CachedAssetLoader. + + Args: + assets: Asset provider + checksum: Asset checksum + + Returns: + Asset bytes + + Raises: + KeyError: If asset not found + """ + if isinstance(assets, CachedAssetLoader): + cache_path = f"assets/{checksum}.bin" + + try: + return assets.cache.read_binary(cache_path) + except (KeyError, FileNotFoundError): + pass + + result = assets.fetch(checksum) + if inspect.isawaitable(result): + result = asyncio.get_event_loop().run_until_complete(result) + + if result is None: + try: + return assets.cache.read_binary(cache_path) + except (KeyError, FileNotFoundError): + raise KeyError(f"Asset '{checksum}' not found in remote or cache") + + assets.cache.write_binary(cache_path, result) + return result + + if callable(assets): + result = assets(checksum) + if inspect.isawaitable(result): + result = asyncio.get_event_loop().run_until_complete(result) + if result is None: + raise KeyError(f"Asset fetcher returned None for checksum '{checksum}'") + return result + + if checksum not in assets: + raise KeyError(f"Missing asset with checksum '{checksum}'") + return assets[checksum] + + @staticmethod + def extract_value(value: Any, assets: dict[str, bytes]) -> Any: + """Recursively extract a value, replacing arrays and Packables with refs. 
+ + Args: + value: Value to extract + assets: Dict to populate with encoded assets + + Returns: + Extracted value with $ref for arrays/Packables + """ + # Import here to avoid circular imports + from ..packable import Packable + + if ArrayUtils.is_array(value): + encoded = ArrayUtils.encode_array(value) + packed = SerializationUtils.pack_array(encoded) + checksum = ChecksumUtils.compute_bytes_checksum(packed) + assets[checksum] = packed + return {"$ref": checksum} + + if isinstance(value, Packable): + encoded = value.encode() + checksum = ChecksumUtils.compute_bytes_checksum(encoded) + assets[checksum] = encoded + return {"$ref": checksum} + + if isinstance(value, dict): + return {k: SerializationUtils.extract_value(v, assets) for k, v in value.items()} + + if isinstance(value, (list, tuple)): + result = [SerializationUtils.extract_value(v, assets) for v in value] + return result if isinstance(value, list) else tuple(result) + + if isinstance(value, BaseModel): + extracted = {} + for name in value.model_fields: + field_value = getattr(value, name, None) + if field_value is not None: + extracted[name] = SerializationUtils.extract_value(field_value, assets) + return extracted + + return value diff --git a/python/tests/test_checksum_utils.py b/python/tests/test_checksum_utils.py new file mode 100644 index 0000000..f968f72 --- /dev/null +++ b/python/tests/test_checksum_utils.py @@ -0,0 +1,166 @@ +"""Tests for ChecksumUtils.""" + +import shutil +import tempfile +from pathlib import Path + +import pytest +from meshly.utils.checksum_utils import ChecksumUtils + + +@pytest.fixture +def temp_dir(): + """Create and clean up a temporary directory.""" + d = tempfile.mkdtemp() + yield Path(d) + shutil.rmtree(d) + + +@pytest.fixture +def test_file(temp_dir): + """Create a simple test file.""" + f = temp_dir / "test_file.txt" + f.write_text("Hello, World!") + return f + + +@pytest.fixture +def test_subdir(temp_dir): + """Create a test directory with multiple files.""" + subdir = temp_dir / "subdir" + subdir.mkdir() + (subdir / "file1.txt").write_text("Content 1") + (subdir / "file2.txt").write_text("Content 2") + + nested = subdir / "nested" + nested.mkdir() + (nested / "file3.txt").write_text("Content 3") + + return subdir + + +class TestFileChecksum: + """Tests for file checksum computation.""" + + def test_returns_string(self, test_file): + """Test that file checksum returns a hex string.""" + result = ChecksumUtils.compute_file_checksum(test_file) + assert isinstance(result, str) + assert len(result) == 64 # SHA256 produces 64 hex chars + + def test_is_deterministic(self, test_file): + """Test that same file produces same checksum.""" + result1 = ChecksumUtils.compute_file_checksum(test_file) + result2 = ChecksumUtils.compute_file_checksum(test_file) + assert result1 == result2 + + def test_differs_for_different_content(self, temp_dir): + """Test that different content produces different checksum.""" + file1 = temp_dir / "a.txt" + file2 = temp_dir / "b.txt" + file1.write_text("Content A") + file2.write_text("Content B") + + assert ChecksumUtils.compute_file_checksum(file1) != ChecksumUtils.compute_file_checksum(file2) + + def test_not_found_raises(self, temp_dir): + """Test that missing file raises FileNotFoundError.""" + with pytest.raises(FileNotFoundError): + ChecksumUtils.compute_file_checksum(temp_dir / "nonexistent.txt") + + def test_on_directory_raises_error(self, test_subdir): + """Test that passing a directory raises ValueError.""" + with pytest.raises(ValueError): + 
ChecksumUtils.compute_file_checksum(test_subdir) + + def test_fast_mode_uses_metadata(self, temp_dir): + """Test that fast mode produces valid checksum.""" + large_file = temp_dir / "large.txt" + large_file.write_text("Some content") + + result_fast = ChecksumUtils.compute_file_checksum(large_file, fast=True) + result_normal = ChecksumUtils.compute_file_checksum(large_file, fast=False) + + assert isinstance(result_fast, str) + assert len(result_fast) == 64 + # For small files, fast=True still uses content hash + assert result_fast == result_normal + + +class TestDirectoryChecksum: + """Tests for directory checksum computation.""" + + def test_returns_string(self, test_subdir): + """Test that directory checksum returns a hex string.""" + result = ChecksumUtils.compute_directory_checksum(test_subdir) + assert isinstance(result, str) + assert len(result) == 64 + + def test_is_deterministic(self, test_subdir): + """Test that same directory produces same checksum.""" + result1 = ChecksumUtils.compute_directory_checksum(test_subdir) + result2 = ChecksumUtils.compute_directory_checksum(test_subdir) + assert result1 == result2 + + def test_changes_with_content(self, test_subdir): + """Test that modifying a file changes directory checksum.""" + checksum_before = ChecksumUtils.compute_directory_checksum(test_subdir) + (test_subdir / "file1.txt").write_text("Modified content") + checksum_after = ChecksumUtils.compute_directory_checksum(test_subdir) + assert checksum_before != checksum_after + + def test_not_found_raises(self, temp_dir): + """Test that missing directory raises FileNotFoundError.""" + with pytest.raises(FileNotFoundError): + ChecksumUtils.compute_directory_checksum(temp_dir / "nonexistent_dir") + + def test_on_file_raises_error(self, test_file): + """Test that passing a file raises ValueError.""" + with pytest.raises(ValueError): + ChecksumUtils.compute_directory_checksum(test_file) + + def test_fast_mode(self, test_subdir): + """Test that fast mode works for directories.""" + result_fast = ChecksumUtils.compute_directory_checksum(test_subdir, fast=True) + assert isinstance(result_fast, str) + assert len(result_fast) == 64 + + def test_empty_directory(self, temp_dir): + """Test checksum of an empty directory.""" + empty_dir = temp_dir / "empty" + empty_dir.mkdir() + + result = ChecksumUtils.compute_directory_checksum(empty_dir) + assert isinstance(result, str) + assert len(result) == 64 + + def test_includes_structure(self, temp_dir): + """Test that directory structure affects checksum.""" + dir1 = temp_dir / "dir1" + dir2 = temp_dir / "dir2" + dir1.mkdir() + dir2.mkdir() + + (dir1 / "a.txt").write_text("content") + (dir2 / "b.txt").write_text("content") # Same content, different name + + assert ChecksumUtils.compute_directory_checksum(dir1) != ChecksumUtils.compute_directory_checksum(dir2) + + +class TestPathChecksum: + """Tests for unified path checksum.""" + + def test_file(self, test_file): + """Test that compute_path_checksum works for files.""" + result = ChecksumUtils.compute_path_checksum(test_file) + assert result == ChecksumUtils.compute_file_checksum(test_file) + + def test_directory(self, test_subdir): + """Test that compute_path_checksum works for directories.""" + result = ChecksumUtils.compute_path_checksum(test_subdir) + assert result == ChecksumUtils.compute_directory_checksum(test_subdir) + + def test_not_found_raises(self, temp_dir): + """Test that missing path raises FileNotFoundError.""" + with pytest.raises(FileNotFoundError): + 
ChecksumUtils.compute_path_checksum(temp_dir / "nonexistent") diff --git a/typescript/src/packable.ts b/typescript/src/packable.ts index 0fac97b..526929a 100644 --- a/typescript/src/packable.ts +++ b/typescript/src/packable.ts @@ -210,11 +210,6 @@ export class Packable { fieldData: Record ): void { for (const [key, value] of Object.entries(fieldData)) { - // Skip Python BaseModel reconstruction metadata - if (key === "__model_class__" || key === "__model_module__") { - continue - } - const existing = data[key] if ( @@ -229,33 +224,12 @@ export class Packable { existing as Record, value as Record ) - } else if (typeof value === "object" && value !== null && !ArrayBuffer.isView(value)) { - // Value is an object that might contain Python metadata - clean it - data[key] = Packable._stripModelMetadata(value as Record) } else { data[key] = value } } } - /** - * Recursively strip Python BaseModel metadata keys from an object. - */ - private static _stripModelMetadata(obj: Record): Record { - const result: Record = {} - for (const [key, value] of Object.entries(obj)) { - if (key === "__model_class__" || key === "__model_module__") { - continue - } - if (typeof value === "object" && value !== null && !ArrayBuffer.isView(value)) { - result[key] = Packable._stripModelMetadata(value as Record) - } else { - result[key] = value - } - } - return result - } - /** * Load a single array from a zip file without loading the entire object. */ From b70df46d9715981ffe69e30d9dd4f32a9c65ad38 Mon Sep 17 00:00:00 2001 From: Afshawn Lotfi Date: Mon, 19 Jan 2026 08:17:36 +0000 Subject: [PATCH 4/4] refactor: improve readability of array extraction methods in Packable class --- python/meshly/packable.py | 22 +++++++++++++++------- 1 file changed, 15 insertions(+), 7 deletions(-) diff --git a/python/meshly/packable.py b/python/meshly/packable.py index e7d12cc..0e34997 100644 --- a/python/meshly/packable.py +++ b/python/meshly/packable.py @@ -217,9 +217,11 @@ def array_fields(self) -> set[str]: continue value = getattr(self, field_name, None) if value is not None: - result.update(ArrayUtils.extract_nested_arrays( - value, field_name, skip=lambda x: isinstance(x, Packable) - ).keys()) + result.update( + ArrayUtils.extract_nested_arrays( + value, field_name, skip=lambda x: isinstance(x, Packable) + ).keys() + ) return result def _extract_non_array_fields(self) -> dict[str, Any]: @@ -231,7 +233,9 @@ def _extract_non_array_fields(self) -> dict[str, Any]: continue value = getattr(self, name, None) if value is not None and not ArrayUtils.is_array(value): - extracted = ArrayUtils.extract_non_arrays(value, skip=lambda x: isinstance(x, Packable)) + extracted = ArrayUtils.extract_non_arrays( + value, skip=lambda x: isinstance(x, Packable) + ) if extracted is not None: model_data[name] = extracted return model_data @@ -430,9 +434,7 @@ def extract(obj: BaseModel) -> SerializedPackableData: SerializedPackableData with data dict (refs for arrays) and assets dict """ if not isinstance(obj, BaseModel): - raise TypeError( - f"extract() requires a Pydantic BaseModel, got {type(obj).__name__}." 
- ) + raise TypeError(f"extract() requires a Pydantic BaseModel, got {type(obj).__name__}.") assets: dict[str, bytes] = {} data: dict[str, Any] = {} @@ -444,6 +446,12 @@ def extract(obj: BaseModel) -> SerializedPackableData: if value is not None: data[field_name] = SerializationUtils.extract_value(value, assets) + # Include computed fields (Pydantic v2) + for field_name in type(obj).model_computed_fields: + value = getattr(obj, field_name, None) + if value is not None: + data[field_name] = SerializationUtils.extract_value(value, assets) + return SerializedPackableData(data=data, assets=assets) @staticmethod
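
To make the computed-field handling added in the final hunk concrete, the sketch below walks the full extract/reconstruct round trip. It assumes Pydantic v2 and the public `meshly` exports described in the README diff; `SensorFrame`, its fields, and the values are illustrative only and not part of the library.

```python
import numpy as np
from pydantic import computed_field

from meshly import Packable


class SensorFrame(Packable):
    """Illustrative model: the class and field names are hypothetical."""
    label: str
    samples: np.ndarray

    @computed_field  # Pydantic v2 computed field, now picked up by extract()
    @property
    def sample_count(self) -> int:
        return int(self.samples.shape[0])


frame = SensorFrame(label="probe-1", samples=np.arange(6, dtype=np.float32))

extracted = Packable.extract(frame)
# extracted.data holds primitives plus {"$ref": <checksum>} for the array,
# and after this patch also the computed value, e.g.
# {"label": "probe-1", "samples": {"$ref": "..."}, "sample_count": 6}
assert extracted.data["sample_count"] == 6

# Eager reconstruction from a plain dict of assets
rebuilt = Packable.reconstruct(SensorFrame, extracted.data, extracted.assets)
assert rebuilt.label == "probe-1"
# The computed entry in data is ignored on reconstruction (it is not a model
# field), so the count is recomputed from the restored array.
assert rebuilt.sample_count == 6
```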
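
The same data can be reconstructed lazily by passing a callable asset provider instead of a dict, which yields a `LazyModel` proxy that only fetches assets on field access. A minimal sketch under the same assumptions; `Telemetry` and `fetch_asset` are hypothetical names, and the `CachedAssetLoader` lines are left as comments because its exact constructor is not shown in this hunk.

```python
import numpy as np

from meshly import Packable


class Telemetry(Packable):
    """Illustrative model for the lazy-loading path."""
    step: int
    values: np.ndarray


extracted = Packable.extract(
    Telemetry(step=3, values=np.linspace(0.0, 1.0, 16, dtype=np.float32))
)
remote = dict(extracted.assets)  # stand-in for object storage keyed by checksum


def fetch_asset(checksum: str) -> bytes:
    """Pretend remote fetch; sync and async callables are both accepted."""
    return remote[checksum]


lazy = Packable.reconstruct(Telemetry, extracted.data, fetch_asset)
print(lazy.step)           # primitive field, served from data with no fetch
print(lazy.values[:4])     # first access fetches and caches the array asset
telemetry = lazy.resolve() # materialize the full Telemetry model

# With a disk cache (per the CachedAssetLoader API in this patch):
#   cache = DataHandler.create(Path("./cache"))
#   lazy = Packable.reconstruct(
#       Telemetry, extracted.data, CachedAssetLoader(fetch_asset, cache)
#   )
```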
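
The checksum helpers introduced here can also be used on their own: asset keys are 16 hex characters (the first 64 bits of SHA-256), while file and directory checksums are full SHA-256 digests. `fast=True`, or the automatic default for directories with more than 100 files, hashes per-file metadata (size/mtime) instead of content, and for single files only applies above the 10 MB threshold. A small sketch of that surface, assuming `ChecksumUtils` is exported from `meshly.utils` as in the updated `__init__.py`:

```python
import tempfile
from pathlib import Path

from meshly.utils import ChecksumUtils

# Short content-addressed key, as used for extracted assets
key = ChecksumUtils.compute_bytes_checksum(b"example payload")
assert len(key) == 16

with tempfile.TemporaryDirectory() as tmp:
    path = Path(tmp) / "data.txt"
    path.write_text("hello")
    # Full SHA-256 digests for a file and for the directory tree around it
    assert len(ChecksumUtils.compute_file_checksum(path)) == 64
    assert len(ChecksumUtils.compute_path_checksum(Path(tmp))) == 64
```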
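
The array assets behind each `{"$ref": ...}` use the packed layout implemented by `SerializationUtils.pack_array`: a 4-byte little-endian metadata length, the `ArrayMetadata` JSON, then the encoded array bytes. A sketch of that layout, assuming `ArrayUtils.encode_array` accepts a 1-D float32 array as in the README examples:

```python
import numpy as np

from meshly.array import ArrayUtils
from meshly.utils import SerializationUtils

arr = np.linspace(0.0, 1.0, 8, dtype=np.float32)
packed = SerializationUtils.pack_array(ArrayUtils.encode_array(arr))

# Layout: [4-byte little-endian metadata length][metadata JSON][array data]
meta_len = int.from_bytes(packed[:4], "little")
print(packed[4:4 + meta_len].decode("utf-8"))  # the ArrayMetadata JSON

restored = SerializationUtils.unpack_array(packed)
assert np.allclose(restored, arr)
```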