diff --git a/Cargo.lock b/Cargo.lock new file mode 100644 index 0000000..6077999 --- /dev/null +++ b/Cargo.lock @@ -0,0 +1,1090 @@ +# This file is automatically @generated by Cargo. +# It is not intended for manual editing. +version = 4 + +[[package]] +name = "aho-corasick" +version = "1.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ddd31a130427c27518df266943a5308ed92d4b226cc639f5a8f1002816174301" +dependencies = [ + "memchr", +] + +[[package]] +name = "anyhow" +version = "1.0.102" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7f202df86484c868dbad7eaa557ef785d5c66295e41b460ef922eca0723b842c" + +[[package]] +name = "ascii" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d92bec98840b8f03a5ff5413de5293bfcd8bf96467cf5452609f939ec6f5de16" + +[[package]] +name = "autocfg" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c08606f8c3cbf4ce6ec8e28fb0014a2c086708fe954eaa885384a6165172e7e8" + +[[package]] +name = "bitflags" +version = "2.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "843867be96c8daad0d758b57df9392b6d8d271134fce549de6ce169ff98a92af" + +[[package]] +name = "block-buffer" +version = "0.10.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3078c7629b62d3f0439517fa394996acacc5cbc91c5a20d8c658e77abd503a71" +dependencies = [ + "generic-array", +] + +[[package]] +name = "blosc-src" +version = "0.3.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a9046dd58971db0226346fde214143d16a6eb12f535b5320d0ea94fcea420631" +dependencies = [ + "cc", + "libz-sys", + "lz4-sys", + "snappy_src", + "zstd-sys", +] + +[[package]] +name = "cc" +version = "1.2.56" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "aebf35691d1bfb0ac386a69bac2fde4dd276fb618cf8bf4f5318fe285e821bb2" +dependencies = [ 
+ "find-msvc-tools", + "jobserver", + "libc", + "shlex", +] + +[[package]] +name = "cfg-if" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9330f8b2ff13f34540b44e946ef35111825727b38d33286ef986142615121801" + +[[package]] +name = "cmake" +version = "0.1.57" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "75443c44cd6b379beb8c5b45d85d0773baf31cce901fe7bb252f4eff3008ef7d" +dependencies = [ + "cc", +] + +[[package]] +name = "cpufeatures" +version = "0.2.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "59ed5838eebb26a2bb2e58f6d5b5316989ae9d08bab10e0e6d103e656d1b0280" +dependencies = [ + "libc", +] + +[[package]] +name = "crypto-common" +version = "0.1.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "78c8292055d1c1df0cce5d180393dc8cce0abec0a7102adb6c7b1eef6016d60a" +dependencies = [ + "generic-array", + "typenum", +] + +[[package]] +name = "digest" +version = "0.10.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9ed9a281f7bc9b7576e61468ba615a66a5c8cfdff42420a70aa82701a3b1e292" +dependencies = [ + "block-buffer", + "crypto-common", +] + +[[package]] +name = "equivalent" +version = "1.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "877a4ace8713b0bcf2a4e7eec82529c029f1d0619886d18145fea96c3ffe5c0f" + +[[package]] +name = "errno" +version = "0.3.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "39cab71617ae0d63f51a36d69f866391735b51691dbda63cf6f96d042b63efeb" +dependencies = [ + "libc", + "windows-sys 0.61.2", +] + +[[package]] +name = "fastrand" +version = "2.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "37909eebbb50d72f9059c3b6d82c0463f2ff062c9e95845c43a6c9c0355411be" + +[[package]] +name = "fd5" +version = "0.1.0" +dependencies = [ + "hdf5-metno", + "hdf5-metno-sys", + "serde", + 
"serde_json", + "sha2", + "tempfile", + "thiserror", +] + +[[package]] +name = "find-msvc-tools" +version = "0.1.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5baebc0774151f905a1a2cc41989300b1e6fbb29aff0ceffa1064fdd3088d582" + +[[package]] +name = "foldhash" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d9c4f5dac5e15c24eb999c26181a6ca40b39fe946cbe4c263c7209467bc83af2" + +[[package]] +name = "generic-array" +version = "0.14.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "85649ca51fd72272d7821adaf274ad91c288277713d9c18820d8499a7ff69e9a" +dependencies = [ + "typenum", + "version_check", +] + +[[package]] +name = "getrandom" +version = "0.3.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "899def5c37c4fd7b2664648c28120ecec138e4d395b459e5ca34f9cce2dd77fd" +dependencies = [ + "cfg-if", + "libc", + "r-efi", + "wasip2", +] + +[[package]] +name = "getrandom" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "139ef39800118c7683f2fd3c98c1b23c09ae076556b435f8e9064ae108aaeeec" +dependencies = [ + "cfg-if", + "libc", + "r-efi", + "wasip2", + "wasip3", +] + +[[package]] +name = "hashbrown" +version = "0.15.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9229cfe53dfd69f0609a49f65461bd93001ea1ef889cd5529dd176593f5338a1" +dependencies = [ + "foldhash", +] + +[[package]] +name = "hashbrown" +version = "0.16.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "841d1cc9bed7f9236f321df977030373f4a4163ae1a7dbfe1a51a2c1a51d9100" + +[[package]] +name = "hdf5-metno" +version = "0.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9a6c90397db1fe43273705a49b49e0595bd0d49f4aac990b116103ae5bf52961" +dependencies = [ + "bitflags", + "blosc-src", + "cfg-if", + "errno", + "hdf5-metno-derive", + 
"hdf5-metno-sys", + "hdf5-metno-types", + "libc", + "lzf-sys", + "ndarray", + "paste", +] + +[[package]] +name = "hdf5-metno-derive" +version = "0.9.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "205c825c5140aa2791cec795068e8aa8d299862009b7fbd59bd4d876b47842c5" +dependencies = [ + "proc-macro-crate", + "proc-macro-error2", + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "hdf5-metno-src" +version = "0.9.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "36b0303729f84fb0f2dc510d28b64cb716fb13e6a139a17e88db329123ecff82" +dependencies = [ + "cmake", + "libz-sys", +] + +[[package]] +name = "hdf5-metno-sys" +version = "0.10.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "de20d5ba22c244493bdfefb91d8e9de08e3e58d96a792532da5e0df545aed279" +dependencies = [ + "hdf5-metno-src", + "libc", + "libloading", + "libz-sys", + "parking_lot", + "pkg-config", + "regex", + "serde", + "serde_derive", + "winreg", +] + +[[package]] +name = "hdf5-metno-types" +version = "0.10.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1698f197367c277fac6c3a35ad12397941b57f7554778d925ceb54c3fe754723" +dependencies = [ + "ascii", + "cfg-if", + "hdf5-metno-sys", + "libc", +] + +[[package]] +name = "heck" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea" + +[[package]] +name = "id-arena" +version = "2.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3d3067d79b975e8844ca9eb072e16b31c3c1c36928edf9c6789548c524d0d954" + +[[package]] +name = "indexmap" +version = "2.13.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7714e70437a7dc3ac8eb7e6f8df75fd8eb422675fc7678aff7364301092b1017" +dependencies = [ + "equivalent", + "hashbrown 0.16.1", + "serde", + "serde_core", +] + +[[package]] +name 
= "itoa" +version = "1.0.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "92ecc6618181def0457392ccd0ee51198e065e016d1d527a7ac1b6dc7c1f09d2" + +[[package]] +name = "jobserver" +version = "0.1.34" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9afb3de4395d6b3e67a780b6de64b51c978ecf11cb9a462c66be7d4ca9039d33" +dependencies = [ + "getrandom 0.3.4", + "libc", +] + +[[package]] +name = "leb128fmt" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "09edd9e8b54e49e587e4f6295a7d29c3ea94d469cb40ab8ca70b288248a81db2" + +[[package]] +name = "libc" +version = "0.2.182" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6800badb6cb2082ffd7b6a67e6125bb39f18782f793520caee8cb8846be06112" + +[[package]] +name = "libloading" +version = "0.8.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d7c4b02199fee7c5d21a5ae7d8cfa79a6ef5bb2fc834d6e9058e89c825efdc55" +dependencies = [ + "cfg-if", + "windows-link", +] + +[[package]] +name = "libz-sys" +version = "1.1.24" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4735e9cbde5aac84a5ce588f6b23a90b9b0b528f6c5a8db8a4aff300463a0839" +dependencies = [ + "cc", + "libc", + "pkg-config", + "vcpkg", +] + +[[package]] +name = "link-cplusplus" +version = "1.0.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7f78c730aaa7d0b9336a299029ea49f9ee53b0ed06e9202e8cb7db9bae7b8c82" +dependencies = [ + "cc", +] + +[[package]] +name = "linux-raw-sys" +version = "0.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "32a66949e030da00e8c7d4434b251670a91556f4144941d37452769c25d58a53" + +[[package]] +name = "lock_api" +version = "0.4.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "224399e74b87b5f3557511d98dff8b14089b3dadafcab6bb93eab67d3aace965" +dependencies = [ + 
"scopeguard", +] + +[[package]] +name = "log" +version = "0.4.29" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5e5032e24019045c762d3c0f28f5b6b8bbf38563a65908389bf7978758920897" + +[[package]] +name = "lz4-sys" +version = "1.11.1+lz4-1.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6bd8c0d6c6ed0cd30b3652886bb8711dc4bb01d637a68105a3d5158039b418e6" +dependencies = [ + "cc", + "libc", +] + +[[package]] +name = "lzf-sys" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0798d023ce0905e2c77ed96de92aab929ff9db2036cbef4edfee0daf33582aec" +dependencies = [ + "cc", +] + +[[package]] +name = "matrixmultiply" +version = "0.3.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a06de3016e9fae57a36fd14dba131fccf49f74b40b7fbdb472f96e361ec71a08" +dependencies = [ + "autocfg", + "rawpointer", +] + +[[package]] +name = "memchr" +version = "2.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f8ca58f447f06ed17d5fc4043ce1b10dd205e060fb3ce5b979b8ed8e59ff3f79" + +[[package]] +name = "ndarray" +version = "0.17.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "520080814a7a6b4a6e9070823bb24b4531daac8c4627e08ba5de8c5ef2f2752d" +dependencies = [ + "matrixmultiply", + "num-complex", + "num-integer", + "num-traits", + "portable-atomic", + "portable-atomic-util", + "rawpointer", +] + +[[package]] +name = "num-complex" +version = "0.4.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "73f88a1307638156682bada9d7604135552957b7818057dcef22705b4d509495" +dependencies = [ + "num-traits", +] + +[[package]] +name = "num-integer" +version = "0.1.46" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7969661fd2958a5cb096e56c8e1ad0444ac2bbcd0061bd28660485a44879858f" +dependencies = [ + "num-traits", +] + +[[package]] +name = 
"num-traits" +version = "0.2.19" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "071dfc062690e90b734c0b2273ce72ad0ffa95f0c74596bc250dcfd960262841" +dependencies = [ + "autocfg", +] + +[[package]] +name = "once_cell" +version = "1.21.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "42f5e15c9953c5e4ccceeb2e7382a716482c34515315f7b03532b8b4e8393d2d" + +[[package]] +name = "parking_lot" +version = "0.12.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "93857453250e3077bd71ff98b6a65ea6621a19bb0f559a85248955ac12c45a1a" +dependencies = [ + "lock_api", + "parking_lot_core", +] + +[[package]] +name = "parking_lot_core" +version = "0.9.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2621685985a2ebf1c516881c026032ac7deafcda1a2c9b7850dc81e3dfcb64c1" +dependencies = [ + "cfg-if", + "libc", + "redox_syscall", + "smallvec", + "windows-link", +] + +[[package]] +name = "paste" +version = "1.0.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "57c0d7b74b563b49d38dae00a0c37d4d6de9b432382b2892f0574ddcae73fd0a" + +[[package]] +name = "pkg-config" +version = "0.3.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7edddbd0b52d732b21ad9a5fab5c704c14cd949e5e9a1ec5929a24fded1b904c" + +[[package]] +name = "portable-atomic" +version = "1.13.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c33a9471896f1c69cecef8d20cbe2f7accd12527ce60845ff44c153bb2a21b49" + +[[package]] +name = "portable-atomic-util" +version = "0.2.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7a9db96d7fa8782dd8c15ce32ffe8680bbd1e978a43bf51a34d39483540495f5" +dependencies = [ + "portable-atomic", +] + +[[package]] +name = "prettyplease" +version = "0.2.37" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"479ca8adacdd7ce8f1fb39ce9ecccbfe93a3f1344b3d0d97f20bc0196208f62b" +dependencies = [ + "proc-macro2", + "syn", +] + +[[package]] +name = "proc-macro-crate" +version = "3.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "219cb19e96be00ab2e37d6e299658a0cfa83e52429179969b0f0121b4ac46983" +dependencies = [ + "toml_edit", +] + +[[package]] +name = "proc-macro-error-attr2" +version = "2.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "96de42df36bb9bba5542fe9f1a054b8cc87e172759a1868aa05c1f3acc89dfc5" +dependencies = [ + "proc-macro2", + "quote", +] + +[[package]] +name = "proc-macro-error2" +version = "2.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "11ec05c52be0a07b08061f7dd003e7d7092e0472bc731b4af7bb1ef876109802" +dependencies = [ + "proc-macro-error-attr2", + "proc-macro2", + "quote", +] + +[[package]] +name = "proc-macro2" +version = "1.0.106" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8fd00f0bb2e90d81d1044c2b32617f68fcb9fa3bb7640c23e9c748e53fb30934" +dependencies = [ + "unicode-ident", +] + +[[package]] +name = "quote" +version = "1.0.44" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "21b2ebcf727b7760c461f091f9f0f539b77b8e87f2fd88131e7f1b433b3cece4" +dependencies = [ + "proc-macro2", +] + +[[package]] +name = "r-efi" +version = "5.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "69cdb34c158ceb288df11e18b4bd39de994f6657d83847bdffdbd7f346754b0f" + +[[package]] +name = "rawpointer" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "60a357793950651c4ed0f3f52338f53b2f809f32d83a07f72909fa13e4c6c1e3" + +[[package]] +name = "redox_syscall" +version = "0.5.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ed2bf2547551a7053d6fdfafda3f938979645c44812fbfcda098faae3f1a362d" +dependencies = [ + 
"bitflags", +] + +[[package]] +name = "regex" +version = "1.12.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e10754a14b9137dd7b1e3e5b0493cc9171fdd105e0ab477f51b72e7f3ac0e276" +dependencies = [ + "aho-corasick", + "memchr", + "regex-automata", + "regex-syntax", +] + +[[package]] +name = "regex-automata" +version = "0.4.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6e1dd4122fc1595e8162618945476892eefca7b88c52820e74af6262213cae8f" +dependencies = [ + "aho-corasick", + "memchr", + "regex-syntax", +] + +[[package]] +name = "regex-syntax" +version = "0.8.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dc897dd8d9e8bd1ed8cdad82b5966c3e0ecae09fb1907d58efaa013543185d0a" + +[[package]] +name = "rustix" +version = "1.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b6fe4565b9518b83ef4f91bb47ce29620ca828bd32cb7e408f0062e9930ba190" +dependencies = [ + "bitflags", + "errno", + "libc", + "linux-raw-sys", + "windows-sys 0.61.2", +] + +[[package]] +name = "scopeguard" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49" + +[[package]] +name = "semver" +version = "1.0.27" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d767eb0aabc880b29956c35734170f26ed551a859dbd361d140cdbeca61ab1e2" + +[[package]] +name = "serde" +version = "1.0.228" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9a8e94ea7f378bd32cbbd37198a4a91436180c5bb472411e48b5ec2e2124ae9e" +dependencies = [ + "serde_core", + "serde_derive", +] + +[[package]] +name = "serde_core" +version = "1.0.228" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "41d385c7d4ca58e59fc732af25c3983b67ac852c1a25000afe1175de458b67ad" +dependencies = [ + "serde_derive", +] + +[[package]] +name = "serde_derive" 
+version = "1.0.228" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d540f220d3187173da220f885ab66608367b6574e925011a9353e4badda91d79" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "serde_json" +version = "1.0.149" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "83fc039473c5595ace860d8c4fafa220ff474b3fc6bfdb4293327f1a37e94d86" +dependencies = [ + "itoa", + "memchr", + "serde", + "serde_core", + "zmij", +] + +[[package]] +name = "sha2" +version = "0.10.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a7507d819769d01a365ab707794a4084392c824f54a7a6a7862f8c3d0892b283" +dependencies = [ + "cfg-if", + "cpufeatures", + "digest", +] + +[[package]] +name = "shlex" +version = "1.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0fda2ff0d084019ba4d7c6f371c95d8fd75ce3524c3cb8fb653a3023f6323e64" + +[[package]] +name = "smallvec" +version = "1.15.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "67b1b7a3b5fe4f1376887184045fcf45c69e92af734b7aaddc05fb777b6fbd03" + +[[package]] +name = "snappy_src" +version = "0.2.5+snappy.1.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4e1432067a55bcfb1fd522d2aca6537a4fcea32bba87ea86921226d14f9bad53" +dependencies = [ + "cc", + "link-cplusplus", +] + +[[package]] +name = "syn" +version = "2.0.117" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e665b8803e7b1d2a727f4023456bbbbe74da67099c585258af0ad9c5013b9b99" +dependencies = [ + "proc-macro2", + "quote", + "unicode-ident", +] + +[[package]] +name = "tempfile" +version = "3.26.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "82a72c767771b47409d2345987fda8628641887d5466101319899796367354a0" +dependencies = [ + "fastrand", + "getrandom 0.4.1", + "once_cell", + "rustix", + "windows-sys 0.61.2", +] + 
+[[package]] +name = "thiserror" +version = "2.0.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4288b5bcbc7920c07a1149a35cf9590a2aa808e0bc1eafaade0b80947865fbc4" +dependencies = [ + "thiserror-impl", +] + +[[package]] +name = "thiserror-impl" +version = "2.0.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ebc4ee7f67670e9b64d05fa4253e753e016c6c95ff35b89b7941d6b856dec1d5" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "toml_datetime" +version = "0.7.5+spec-1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "92e1cfed4a3038bc5a127e35a2d360f145e1f4b971b551a2ba5fd7aedf7e1347" +dependencies = [ + "serde_core", +] + +[[package]] +name = "toml_edit" +version = "0.23.10+spec-1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "84c8b9f757e028cee9fa244aea147aab2a9ec09d5325a9b01e0a49730c2b5269" +dependencies = [ + "indexmap", + "toml_datetime", + "toml_parser", + "winnow", +] + +[[package]] +name = "toml_parser" +version = "1.0.9+spec-1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "702d4415e08923e7e1ef96cd5727c0dfed80b4d2fa25db9647fe5eb6f7c5a4c4" +dependencies = [ + "winnow", +] + +[[package]] +name = "typenum" +version = "1.19.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "562d481066bde0658276a35467c4af00bdc6ee726305698a55b86e61d7ad82bb" + +[[package]] +name = "unicode-ident" +version = "1.0.24" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e6e4313cd5fcd3dad5cafa179702e2b244f760991f45397d14d4ebf38247da75" + +[[package]] +name = "unicode-xid" +version = "0.2.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ebc1c04c71510c7f702b52b7c350734c9ff1295c464a03335b00bb84fc54f853" + +[[package]] +name = "vcpkg" +version = "0.2.15" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "accd4ea62f7bb7a82fe23066fb0957d48ef677f6eeb8215f372f52e48bb32426" + +[[package]] +name = "version_check" +version = "0.9.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0b928f33d975fc6ad9f86c8f283853ad26bdd5b10b7f1542aa2fa15e2289105a" + +[[package]] +name = "wasip2" +version = "1.0.2+wasi-0.2.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9517f9239f02c069db75e65f174b3da828fe5f5b945c4dd26bd25d89c03ebcf5" +dependencies = [ + "wit-bindgen", +] + +[[package]] +name = "wasip3" +version = "0.4.0+wasi-0.3.0-rc-2026-01-06" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5428f8bf88ea5ddc08faddef2ac4a67e390b88186c703ce6dbd955e1c145aca5" +dependencies = [ + "wit-bindgen", +] + +[[package]] +name = "wasm-encoder" +version = "0.244.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "990065f2fe63003fe337b932cfb5e3b80e0b4d0f5ff650e6985b1048f62c8319" +dependencies = [ + "leb128fmt", + "wasmparser", +] + +[[package]] +name = "wasm-metadata" +version = "0.244.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bb0e353e6a2fbdc176932bbaab493762eb1255a7900fe0fea1a2f96c296cc909" +dependencies = [ + "anyhow", + "indexmap", + "wasm-encoder", + "wasmparser", +] + +[[package]] +name = "wasmparser" +version = "0.244.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "47b807c72e1bac69382b3a6fb3dbe8ea4c0ed87ff5629b8685ae6b9a611028fe" +dependencies = [ + "bitflags", + "hashbrown 0.15.5", + "indexmap", + "semver", +] + +[[package]] +name = "windows-link" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f0805222e57f7521d6a62e36fa9163bc891acd422f971defe97d64e70d0a4fe5" + +[[package]] +name = "windows-sys" +version = "0.48.0" +source = "registry+https://github.com/rust-lang/crates.io-index" 
+checksum = "677d2418bec65e3338edb076e806bc1ec15693c5d0104683f2efe857f61056a9" +dependencies = [ + "windows-targets", +] + +[[package]] +name = "windows-sys" +version = "0.61.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ae137229bcbd6cdf0f7b80a31df61766145077ddf49416a728b02cb3921ff3fc" +dependencies = [ + "windows-link", +] + +[[package]] +name = "windows-targets" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9a2fa6e2155d7247be68c096456083145c183cbbbc2764150dda45a87197940c" +dependencies = [ + "windows_aarch64_gnullvm", + "windows_aarch64_msvc", + "windows_i686_gnu", + "windows_i686_msvc", + "windows_x86_64_gnu", + "windows_x86_64_gnullvm", + "windows_x86_64_msvc", +] + +[[package]] +name = "windows_aarch64_gnullvm" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2b38e32f0abccf9987a4e3079dfb67dcd799fb61361e53e2882c3cbaf0d905d8" + +[[package]] +name = "windows_aarch64_msvc" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dc35310971f3b2dbbf3f0690a219f40e2d9afcf64f9ab7cc1be722937c26b4bc" + +[[package]] +name = "windows_i686_gnu" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a75915e7def60c94dcef72200b9a8e58e5091744960da64ec734a6c6e9b3743e" + +[[package]] +name = "windows_i686_msvc" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8f55c233f70c4b27f66c523580f78f1004e8b5a8b659e05a4eb49d4166cca406" + +[[package]] +name = "windows_x86_64_gnu" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "53d40abd2583d23e4718fddf1ebec84dbff8381c07cae67ff7768bbf19c6718e" + +[[package]] +name = "windows_x86_64_gnullvm" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"0b7b52767868a23d5bab768e390dc5f5c55825b6d30b86c844ff2dc7414044cc" + +[[package]] +name = "windows_x86_64_msvc" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ed94fce61571a4006852b7389a063ab983c02eb1bb37b47f8272ce92d06d9538" + +[[package]] +name = "winnow" +version = "0.7.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5a5364e9d77fcdeeaa6062ced926ee3381faa2ee02d3eb83a5c27a8825540829" +dependencies = [ + "memchr", +] + +[[package]] +name = "winreg" +version = "0.52.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a277a57398d4bfa075df44f501a17cfdf8542d224f0d36095a2adc7aee4ef0a5" +dependencies = [ + "cfg-if", + "serde", + "windows-sys 0.48.0", +] + +[[package]] +name = "wit-bindgen" +version = "0.51.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d7249219f66ced02969388cf2bb044a09756a083d0fab1e566056b04d9fbcaa5" +dependencies = [ + "wit-bindgen-rust-macro", +] + +[[package]] +name = "wit-bindgen-core" +version = "0.51.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ea61de684c3ea68cb082b7a88508a8b27fcc8b797d738bfc99a82facf1d752dc" +dependencies = [ + "anyhow", + "heck", + "wit-parser", +] + +[[package]] +name = "wit-bindgen-rust" +version = "0.51.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b7c566e0f4b284dd6561c786d9cb0142da491f46a9fbed79ea69cdad5db17f21" +dependencies = [ + "anyhow", + "heck", + "indexmap", + "prettyplease", + "syn", + "wasm-metadata", + "wit-bindgen-core", + "wit-component", +] + +[[package]] +name = "wit-bindgen-rust-macro" +version = "0.51.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0c0f9bfd77e6a48eccf51359e3ae77140a7f50b1e2ebfe62422d8afdaffab17a" +dependencies = [ + "anyhow", + "prettyplease", + "proc-macro2", + "quote", + "syn", + "wit-bindgen-core", + "wit-bindgen-rust", +] + +[[package]] 
+name = "wit-component" +version = "0.244.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9d66ea20e9553b30172b5e831994e35fbde2d165325bec84fc43dbf6f4eb9cb2" +dependencies = [ + "anyhow", + "bitflags", + "indexmap", + "log", + "serde", + "serde_derive", + "serde_json", + "wasm-encoder", + "wasm-metadata", + "wasmparser", + "wit-parser", +] + +[[package]] +name = "wit-parser" +version = "0.244.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ecc8ac4bc1dc3381b7f59c34f00b67e18f910c2c0f50015669dde7def656a736" +dependencies = [ + "anyhow", + "id-arena", + "indexmap", + "log", + "semver", + "serde", + "serde_derive", + "serde_json", + "unicode-xid", + "wasmparser", +] + +[[package]] +name = "zmij" +version = "1.0.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b8848ee67ecc8aedbaf3e4122217aff892639231befc6a1b58d29fff4c2cabaa" + +[[package]] +name = "zstd-sys" +version = "2.0.16+zstd.1.5.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "91e19ebc2adc8f83e43039e79776e3fda8ca919132d68a1fed6a5faca2683748" +dependencies = [ + "cc", + "pkg-config", +] diff --git a/Cargo.toml b/Cargo.toml new file mode 100644 index 0000000..fac59bc --- /dev/null +++ b/Cargo.toml @@ -0,0 +1,9 @@ +[workspace] +members = ["crates/fd5"] +resolver = "2" + +[workspace.dependencies] +hdf5-metno = { version = "0.11.0", features = ["blosc-all", "lzf", "static", "zlib"] } +sha2 = "0.10" +serde = { version = "1", features = ["derive"] } +serde_json = "1.0" diff --git a/crates/fd5/Cargo.toml b/crates/fd5/Cargo.toml new file mode 100644 index 0000000..40c1e79 --- /dev/null +++ b/crates/fd5/Cargo.toml @@ -0,0 +1,17 @@ +[package] +name = "fd5" +version = "0.1.0" +edition = "2021" +description = "Rust implementation of fd5 Merkle-tree hashing, verification, and editing" +license = "Apache-2.0" + +[dependencies] +hdf5-metno = { workspace = true } +hdf5-metno-sys = "0.10.1" +sha2 = { 
workspace = true } +serde = { workspace = true } +serde_json = { workspace = true } +thiserror = "2" + +[dev-dependencies] +tempfile = "3" diff --git a/crates/fd5/src/attr_ser.rs b/crates/fd5/src/attr_ser.rs new file mode 100644 index 0000000..e3b1b0b --- /dev/null +++ b/crates/fd5/src/attr_ser.rs @@ -0,0 +1,154 @@ +//! Deterministic attribute-to-bytes serialization. +//! +//! Must produce byte-identical output to Python's `_serialize_attr` (hash.py L76-85): +//! +//! - `str` → `.encode("utf-8")` +//! - `bytes` → as-is +//! - `np.ndarray` → `.tobytes()` (row-major C-order) +//! - `np.generic` → `np.array(value).tobytes()` +//! - fallback → `str(value).encode("utf-8")` +//! +//! In HDF5-metno, attributes arrive as typed values. We read raw bytes for +//! numeric types and UTF-8 for strings to match Python exactly. +//! +//! **Important**: Uses `read_raw` (not `read_1d`) for arrays because +//! attributes can be multi-dimensional (e.g. 4×4 affine matrices). + +use hdf5_metno::types::{FloatSize, IntSize, TypeDescriptor, VarLenAscii, VarLenUnicode}; +use hdf5_metno::Attribute; + +use crate::error::Fd5Result; + +/// Serialize an HDF5 attribute value to bytes, matching Python's `_serialize_attr`. 
+pub fn serialize_attr(attr: &Attribute) -> Fd5Result> { + let td = attr.dtype()?.to_descriptor()?; + + if attr.is_scalar() { + serialize_scalar(attr, &td) + } else { + serialize_array(attr, &td) + } +} + +fn serialize_scalar(attr: &Attribute, td: &TypeDescriptor) -> Fd5Result> { + match td { + // String types → UTF-8 bytes (matching Python str.encode("utf-8")) + TypeDescriptor::VarLenUnicode => { + let v: VarLenUnicode = attr.read_scalar()?; + Ok(v.as_str().as_bytes().to_vec()) + } + TypeDescriptor::VarLenAscii => { + let v: VarLenAscii = attr.read_scalar()?; + Ok(v.as_str().as_bytes().to_vec()) + } + TypeDescriptor::FixedAscii(_) | TypeDescriptor::FixedUnicode(_) => { + // Read raw, trim trailing nulls, return UTF-8 + let raw = attr.read_raw::()?; + let s = String::from_utf8_lossy(&raw); + let trimmed = s.trim_end_matches('\0'); + Ok(trimmed.as_bytes().to_vec()) + } + + // Numeric scalars → np.array(value).tobytes() + TypeDescriptor::Integer(int_size) => Ok(match int_size { + IntSize::U1 => attr.read_scalar::()?.to_ne_bytes().to_vec(), + IntSize::U2 => attr.read_scalar::()?.to_ne_bytes().to_vec(), + IntSize::U4 => attr.read_scalar::()?.to_ne_bytes().to_vec(), + IntSize::U8 => attr.read_scalar::()?.to_ne_bytes().to_vec(), + }), + TypeDescriptor::Unsigned(int_size) => Ok(match int_size { + IntSize::U1 => attr.read_scalar::()?.to_ne_bytes().to_vec(), + IntSize::U2 => attr.read_scalar::()?.to_ne_bytes().to_vec(), + IntSize::U4 => attr.read_scalar::()?.to_ne_bytes().to_vec(), + IntSize::U8 => attr.read_scalar::()?.to_ne_bytes().to_vec(), + }), + TypeDescriptor::Float(float_size) => Ok(match float_size { + FloatSize::U4 => attr.read_scalar::()?.to_ne_bytes().to_vec(), + FloatSize::U8 => attr.read_scalar::()?.to_ne_bytes().to_vec(), + }), + TypeDescriptor::Boolean => { + let v: bool = attr.read_scalar()?; + Ok(vec![v as u8]) + } + + // Fallback: str(value).encode("utf-8") + _ => { + let raw = attr.read_raw::()?; + Ok(raw) + } + } +} + +/// Serialize a non-scalar 
attribute to bytes. +/// +/// Uses `read_raw` to handle any dimensionality (1D, 2D, etc.). +fn serialize_array(attr: &Attribute, td: &TypeDescriptor) -> Fd5Result> { + match td { + TypeDescriptor::Integer(int_size) => Ok(match int_size { + IntSize::U1 => { + let v = attr.read_raw::()?; + v.iter().flat_map(|x| x.to_ne_bytes()).collect() + } + IntSize::U2 => { + let v = attr.read_raw::()?; + v.iter().flat_map(|x| x.to_ne_bytes()).collect() + } + IntSize::U4 => { + let v = attr.read_raw::()?; + v.iter().flat_map(|x| x.to_ne_bytes()).collect() + } + IntSize::U8 => { + let v = attr.read_raw::()?; + v.iter().flat_map(|x| x.to_ne_bytes()).collect() + } + }), + TypeDescriptor::Unsigned(int_size) => Ok(match int_size { + IntSize::U1 => attr.read_raw::()?, + IntSize::U2 => { + let v = attr.read_raw::()?; + v.iter().flat_map(|x| x.to_ne_bytes()).collect() + } + IntSize::U4 => { + let v = attr.read_raw::()?; + v.iter().flat_map(|x| x.to_ne_bytes()).collect() + } + IntSize::U8 => { + let v = attr.read_raw::()?; + v.iter().flat_map(|x| x.to_ne_bytes()).collect() + } + }), + TypeDescriptor::Float(float_size) => Ok(match float_size { + FloatSize::U4 => { + let v = attr.read_raw::()?; + v.iter().flat_map(|x| x.to_ne_bytes()).collect() + } + FloatSize::U8 => { + let v = attr.read_raw::()?; + v.iter().flat_map(|x| x.to_ne_bytes()).collect() + } + }), + TypeDescriptor::Boolean => { + let v = attr.read_raw::()?; + Ok(v.iter().map(|&b| b as u8).collect()) + } + // For string arrays in attributes, concatenate UTF-8 bytes + TypeDescriptor::VarLenUnicode => { + let v = attr.read_raw::()?; + let mut buf = Vec::new(); + for s in &v { + buf.extend_from_slice(s.as_str().as_bytes()); + } + Ok(buf) + } + TypeDescriptor::VarLenAscii => { + let v = attr.read_raw::()?; + let mut buf = Vec::new(); + for s in &v { + buf.extend_from_slice(s.as_str().as_bytes()); + } + Ok(buf) + } + // Fallback: try reading raw bytes + _ => Ok(attr.read_raw::()?), + } +} diff --git a/crates/fd5/src/builder.rs 
b/crates/fd5/src/builder.rs new file mode 100644 index 0000000..fc0ad53 --- /dev/null +++ b/crates/fd5/src/builder.rs @@ -0,0 +1,479 @@ +//! fd5 builder — creates sealed HDF5 files with inline Merkle-tree hashing. +//! +//! Mirrors Python's `fd5.create` context-manager API. The builder: +//! 1. Opens a temp HDF5 file and writes root attributes +//! 2. Delegates product-specific writes via `ProductSchema` +//! 3. Seals the file: embeds schema, computes id + content_hash, renames +//! +//! Data hashes are computed inline during `create_dataset` calls (tee pattern) +//! and cached so that `compute_content_hash` can skip re-reading datasets. + +use std::cell::RefCell; +use std::collections::{BTreeMap, HashMap}; +use std::path::{Path, PathBuf}; +use std::rc::Rc; + +use hdf5_metno::types::VarLenUnicode; +use hdf5_metno::{File, Group}; +use sha2::{Digest, Sha256}; + +use crate::error::{Fd5Error, Fd5Result}; +use crate::h5io::{dict_to_h5, write_attr_i64, write_attr_str}; +use crate::hash::{compute_content_hash, compute_id}; +use crate::naming::generate_filename; +use crate::product::{get_schema, ProductSchema}; + +/// Shared hash caches between `Fd5Builder` and `HashTrackingGroup` instances. +type DataHashCache = Rc>>; +type ChunkDigestCache = Rc>>>; + +// --------------------------------------------------------------------------- +// HashTrackingGroup +// --------------------------------------------------------------------------- + +/// Wraps an `hdf5_metno::Group` to compute data hashes inline during writes. +/// +/// Cached data hashes (`sha256(data_bytes)`) and per-chunk digests are stored +/// in shared caches keyed by the dataset's absolute HDF5 path. 
+pub struct HashTrackingGroup { + group: Group, + data_hash_cache: DataHashCache, + chunk_digest_cache: ChunkDigestCache, +} + +impl HashTrackingGroup { + fn new( + group: Group, + data_hash_cache: DataHashCache, + chunk_digest_cache: ChunkDigestCache, + ) -> Self { + Self { + group, + data_hash_cache, + chunk_digest_cache, + } + } + + /// Create a sub-group, returning a wrapped `HashTrackingGroup`. + pub fn create_group(&self, name: &str) -> Fd5Result { + let grp = self.group.create_group(name)?; + Ok(HashTrackingGroup::new( + grp, + Rc::clone(&self.data_hash_cache), + Rc::clone(&self.chunk_digest_cache), + )) + } + + /// Create a dataset, write data, and cache the SHA-256 hash of the raw bytes. + fn create_dataset_and_hash( + &self, + name: &str, + data: &[T], + ) -> Fd5Result<()> { + let ds = self + .group + .new_dataset::() + .shape([data.len()]) + .create(name)?; + ds.write(data)?; + + let byte_len = data.len() * std::mem::size_of::(); + let byte_ptr = data.as_ptr() as *const u8; + // SAFETY: &[T] is a contiguous, aligned, initialized buffer. + let bytes = unsafe { std::slice::from_raw_parts(byte_ptr, byte_len) }; + let data_hash = format!("{:x}", Sha256::digest(bytes)); + + self.data_hash_cache + .borrow_mut() + .insert(ds.name(), data_hash); + + Ok(()) + } + + /// Create a dataset of f64 values, hashing inline. + pub fn create_dataset_f64(&self, name: &str, data: &[f64]) -> Fd5Result<()> { + self.create_dataset_and_hash(name, data) + } + + /// Create a dataset of f32 values, hashing inline. + pub fn create_dataset_f32(&self, name: &str, data: &[f32]) -> Fd5Result<()> { + self.create_dataset_and_hash(name, data) + } + + /// Create a dataset of i64 values, hashing inline. + pub fn create_dataset_i64(&self, name: &str, data: &[i64]) -> Fd5Result<()> { + self.create_dataset_and_hash(name, data) + } + + /// Create a dataset of i32 values, hashing inline. 
+ pub fn create_dataset_i32(&self, name: &str, data: &[i32]) -> Fd5Result<()> { + self.create_dataset_and_hash(name, data) + } + + /// Create a dataset of u8 values, hashing inline. + pub fn create_dataset_u8(&self, name: &str, data: &[u8]) -> Fd5Result<()> { + self.create_dataset_and_hash(name, data) + } + + /// Write a string attribute on this group. + pub fn write_attr_str(&self, name: &str, value: &str) -> Fd5Result<()> { + write_attr_str(&self.group, name, value) + } + + /// Write an i64 attribute on this group. + pub fn write_attr_i64(&self, name: &str, value: i64) -> Fd5Result<()> { + write_attr_i64(&self.group, name, value) + } + + /// Access the underlying HDF5 group (for advanced use). + pub fn group(&self) -> &Group { + &self.group + } +} + +// --------------------------------------------------------------------------- +// Fd5Builder +// --------------------------------------------------------------------------- + +/// Builder that orchestrates fd5 file creation. +/// +/// Do not instantiate directly -- use [`create()`]. +pub struct Fd5Builder { + file: File, + tmp_path: PathBuf, + out_dir: PathBuf, + product_type: String, + timestamp: String, + schema: Box, + data_hash_cache: DataHashCache, + chunk_digest_cache: ChunkDigestCache, +} + +impl Fd5Builder { + /// Write product-specific data through the registered schema. + pub fn write_product(&self, data: &serde_json::Value) -> Fd5Result<()> { + let group = self.file.as_group()?; + let tracking = HashTrackingGroup::new( + group, + Rc::clone(&self.data_hash_cache), + Rc::clone(&self.chunk_digest_cache), + ); + self.schema.write(&tracking, data) + } + + /// Write metadata group from a JSON value (nested dict -> HDF5 groups/attrs). + pub fn write_metadata(&self, metadata: &serde_json::Value) -> Fd5Result<()> { + let grp = self.file.create_group("metadata")?; + dict_to_h5(&grp, metadata) + } + + /// Access the underlying HDF5 file (for advanced writes). 
+ pub fn file(&self) -> &File { + &self.file + } + + /// Seal the file: validate, embed schema, compute id + content_hash, rename. + /// + /// Consumes self -- the file cannot be used after sealing. + pub fn seal(self) -> Fd5Result { + self.validate()?; + self.write_chunk_hashes()?; + + let schema_json = self.schema.json_schema(); + let schema_str = serde_json::to_string(&schema_json)?; + let root = self.file.as_group()?; + + let vlu: VarLenUnicode = schema_str + .parse() + .map_err(|e| Fd5Error::Other(format!("{}", e)))?; + root.new_attr::() + .shape(()) + .create("_schema")? + .write_scalar(&vlu)?; + + let id_keys = self.schema.id_inputs(); + let mut id_inputs = BTreeMap::new(); + for key in &id_keys { + let val = read_root_attr_str(&root, key).unwrap_or_default(); + id_inputs.insert(key.clone(), val); + } + let file_id = compute_id(&id_inputs); + + write_attr_str(&root, "id", &file_id)?; + write_attr_str(&root, "id_inputs", &id_keys.join(" + "))?; + + // content_hash is computed from the file directly (data already written) + let content_hash = compute_content_hash(&self.file)?; + write_attr_str(&root, "content_hash", &content_hash)?; + + self.file.flush()?; + drop(root); + self.file.close()?; + + let product_slug = self.product_type.replace('/', "-"); + let filename = generate_filename(&product_slug, &file_id, Some(&self.timestamp)); + let final_path = self.out_dir.join(filename); + std::fs::rename(&self.tmp_path, &final_path)?; + + Ok(final_path) + } + + fn validate(&self) -> Fd5Result<()> { + let root = self.file.as_group()?; + for attr_name in &["name", "description", "timestamp"] { + let val = read_root_attr_str(&root, attr_name).unwrap_or_default(); + if val.is_empty() { + return Err(Fd5Error::Other(format!( + "Required attribute '{}' is missing or empty", + attr_name + ))); + } + } + Ok(()) + } + + fn write_chunk_hashes(&self) -> Fd5Result<()> { + let cache = self.chunk_digest_cache.borrow(); + for (ds_path, digests) in cache.iter() { + let ds = 
self.file.dataset(ds_path)?; + let parent_path = ds_path + .rsplit_once('/') + .map(|(p, _)| if p.is_empty() { "/" } else { p }) + .unwrap_or("/"); + let ds_name = ds_path.rsplit_once('/').map(|(_, n)| n).unwrap_or(ds_path); + let hashes_name = format!("{}_chunk_hashes", ds_name); + + let parent = if parent_path == "/" { + self.file.as_group()? + } else { + self.file.group(parent_path)? + }; + + let vlu_digests: Vec = digests + .iter() + .map(|d| d.parse::().unwrap()) + .collect(); + let chunk_ds = parent + .new_dataset::() + .shape([vlu_digests.len()]) + .create(hashes_name.as_str())?; + chunk_ds.write(&vlu_digests)?; + + write_attr_str(&chunk_ds, "algorithm", "sha256")?; + drop(ds); + } + Ok(()) + } +} + +/// Read a string attribute from a group, returning `None` if not found. +fn read_root_attr_str(group: &Group, name: &str) -> Option { + group + .attr(name) + .ok() + .and_then(|a| { + a.read_scalar::() + .map(|v| v.as_str().to_string()) + .ok() + .or_else(|| { + a.read_scalar::() + .map(|v| v.as_str().to_string()) + .ok() + }) + }) +} + +// --------------------------------------------------------------------------- +// Public entry point +// --------------------------------------------------------------------------- + +/// Create a new fd5 builder -- analogous to Python's `fd5.create()` context manager. +/// +/// Opens a temporary HDF5 file, writes root attributes, and returns a builder +/// that can be used to write product data and metadata. Call `seal()` to finalize. +/// +/// # Errors +/// +/// Returns an error if the product schema is not registered or if the temp file +/// cannot be created. 
pub fn create(
    out_dir: &Path,
    product: &str,
    name: &str,
    description: &str,
    timestamp: &str,
) -> Fd5Result<Fd5Builder> {
    let schema = get_schema(product)?;

    std::fs::create_dir_all(out_dir)?;

    // NOTE(review): fixed tmp name per product — concurrent builds of the
    // same product into the same directory would collide; confirm callers
    // never do that.
    let product_slug = product.replace('/', "_");
    let tmp_name = format!(".fd5_{}.h5.tmp", product_slug);
    let tmp_path = out_dir.join(tmp_name);
    let file = File::create(&tmp_path)?;

    let root = file.as_group()?;
    write_attr_str(&root, "product", product)?;
    write_attr_str(&root, "name", name)?;
    write_attr_str(&root, "description", description)?;
    write_attr_str(&root, "timestamp", timestamp)?;
    write_attr_i64(&root, "_schema_version", 1)?;

    let data_hash_cache: DataHashCache = Rc::new(RefCell::new(HashMap::new()));
    let chunk_digest_cache: ChunkDigestCache = Rc::new(RefCell::new(HashMap::new()));

    Ok(Fd5Builder {
        file,
        tmp_path,
        out_dir: out_dir.to_path_buf(),
        product_type: product.to_string(),
        timestamp: timestamp.to_string(),
        schema,
        data_hash_cache,
        chunk_digest_cache,
    })
}

// ---------------------------------------------------------------------------
// Tests
// ---------------------------------------------------------------------------

#[cfg(test)]
mod tests {
    use super::*;
    use crate::product::{register_schema, TestProductSchema};
    use crate::verify;
    use tempfile::TempDir;

    fn register_test_schema() {
        register_schema(Box::new(TestProductSchema));
    }

    #[test]
    fn test_create_and_seal() {
        register_test_schema();
        let tmp_dir = TempDir::new().unwrap();

        let builder = create(
            tmp_dir.path(),
            "test/product",
            "my-test",
            "A test file",
            "2024-01-15T10:30:00",
        )
        .unwrap();

        let data = serde_json::json!({"values": [1.0, 2.0, 3.0]});
        builder.write_product(&data).unwrap();

        let sealed_path = builder.seal().unwrap();
        assert!(sealed_path.exists());
        assert!(sealed_path
            .file_name()
            .unwrap()
            .to_str()
            .unwrap()
            .ends_with(".h5"));

        // Verify with the existing verify module
        let status = verify::verify(&sealed_path).unwrap();
        match status {
            verify::Fd5Status::Valid(_) => {} // expected
            other => panic!("Expected Valid, got {:?}", other),
        }
    }

    #[test]
    fn test_content_hash_deterministic() {
        register_test_schema();
        let tmp_dir = TempDir::new().unwrap();

        let make_file = |subdir: &str| -> String {
            // Each call needs its own schema registration since get_schema removes it
            register_test_schema();
            let out = tmp_dir.path().join(subdir);
            let builder = create(
                &out,
                "test/product",
                "my-test",
                "A test file",
                "2024-01-15T10:30:00",
            )
            .unwrap();

            let data = serde_json::json!({"values": [1.0, 2.0, 3.0]});
            builder.write_product(&data).unwrap();
            let path = builder.seal().unwrap();

            // Read content_hash from sealed file
            let file = File::open(&path).unwrap();
            let group = file.as_group().unwrap();
            read_root_attr_str(&group, "content_hash").unwrap()
        };

        let hash1 = make_file("a");
        let hash2 = make_file("b");
        assert_eq!(hash1, hash2, "content_hash should be deterministic");
    }

    #[test]
    fn test_missing_required_attr_fails() {
        register_test_schema();
        let tmp_dir = TempDir::new().unwrap();

        let builder = create(
            tmp_dir.path(),
            "test/product",
            "", // empty name
            "A test file",
            "2024-01-15T10:30:00",
        )
        .unwrap();

        let result = builder.seal();
        assert!(result.is_err());
        let err_msg = result.unwrap_err().to_string();
        assert!(
            err_msg.contains("name"),
            "Error should mention 'name': {}",
            err_msg
        );
    }

    #[test]
    fn test_write_metadata() {
        register_test_schema();
        let tmp_dir = TempDir::new().unwrap();

        let builder = create(
            tmp_dir.path(),
            "test/product",
            "my-test",
            "A test file",
            "2024-01-15T10:30:00",
        )
        .unwrap();

        let metadata = serde_json::json!({
            "subject": "test-subject-01",
            "scanner": {
                "model": "Explorer",
                "manufacturer": "United Imaging"
            }
        });
        builder.write_metadata(&metadata).unwrap();

        let data = serde_json::json!({"values": [1.0, 2.0, 3.0]});
        builder.write_product(&data).unwrap();

        let sealed_path = builder.seal().unwrap();
        assert!(sealed_path.exists());

        // Verify the sealed file
        let status = verify::verify(&sealed_path).unwrap();
        match status {
            verify::Fd5Status::Valid(_) => {}
            other => panic!("Expected Valid, got {:?}", other),
        }
    }
}
diff --git a/crates/fd5/src/edit.rs new file mode 100644 index 0000000..289301c
--- /dev/null
+++ b/crates/fd5/src/edit.rs
//! fd5 attribute editing with copy-on-write or in-place modes.
//!
//! After modifying an attribute, the `content_hash` is recomputed and
//! written back, re-sealing the file.

use std::path::{Path, PathBuf};

use hdf5_metno::types::VarLenUnicode;
use hdf5_metno::File;

use crate::error::Fd5Result;
use crate::hash::compute_content_hash;

/// How the edit should be applied.
#[derive(Debug, Clone, Copy, PartialEq)]
pub enum EditMode {
    /// Copy the file first, edit the copy (safe default).
    CopyOnWrite,
    /// Edit the original file in place (dev/expert flag).
    InPlace,
}

/// Typed attribute values for writing.
#[derive(Debug, Clone)]
pub enum AttrValue {
    String(String),
    Int64(i64),
    Float64(f64),
}

/// Description of a planned edit — shown in confirmation dialog before applying.
#[derive(Debug, Clone)]
pub struct EditPlan {
    pub source_path: PathBuf,
    pub attr_path: String,
    pub attr_name: String,
    pub old_value: String,
    pub new_value: AttrValue,
    pub mode: EditMode,
}

/// Result of a completed edit.
#[derive(Debug, Clone)]
pub struct EditResult {
    pub output_path: PathBuf,
    pub old_content_hash: String,
    pub new_content_hash: String,
}

/// Parse a string into a VarLenUnicode; hashes/attr values are plain ASCII.
fn make_vlu(s: &str) -> VarLenUnicode {
    s.parse().expect("content_hash should not contain null bytes")
}

impl EditPlan {
    /// Apply the edit plan: modify the attribute and re-seal with new content_hash.
+ pub fn apply(&self) -> Fd5Result { + let target_path = match self.mode { + EditMode::CopyOnWrite => { + let stem = self + .source_path + .file_stem() + .and_then(|s| s.to_str()) + .unwrap_or("file"); + let ext = self + .source_path + .extension() + .and_then(|s| s.to_str()) + .unwrap_or("h5"); + let parent = self.source_path.parent().unwrap_or(Path::new(".")); + let target = parent.join(format!("{}_edited.{}", stem, ext)); + std::fs::copy(&self.source_path, &target)?; + target + } + EditMode::InPlace => self.source_path.clone(), + }; + + // Open for read-write + let file = File::open_rw(&target_path)?; + let root_group: &hdf5_metno::Group = &*file; + + // Read old content_hash + let old_hash = root_group + .attr("content_hash") + .ok() + .and_then(|a| { + a.read_scalar::() + .map(|v| v.as_str().to_string()) + .ok() + }) + .unwrap_or_default(); + + // Write the new attribute value on the target object + if self.attr_path == "/" { + write_attr(root_group, &self.attr_name, &self.new_value)?; + } else { + let target_group = root_group.group(&self.attr_path)?; + write_attr(&target_group, &self.attr_name, &self.new_value)?; + } + + // Recompute and write new content_hash + let new_hash = compute_content_hash(&file)?; + // Delete old content_hash and write new + if root_group.attr("content_hash").is_ok() { + root_group.delete_attr("content_hash")?; + } + let vlu = make_vlu(&new_hash); + root_group + .new_attr::() + .shape(()) + .create("content_hash")? + .write_scalar(&vlu)?; + + file.flush()?; + + Ok(EditResult { + output_path: target_path, + old_content_hash: old_hash, + new_content_hash: new_hash, + }) + } +} + +/// Write a typed value as an HDF5 attribute, replacing any existing attribute. 
+fn write_attr( + loc: &hdf5_metno::Location, + name: &str, + value: &AttrValue, +) -> Fd5Result<()> { + // Delete existing attribute if present + if loc.attr(name).is_ok() { + loc.delete_attr(name)?; + } + + match value { + AttrValue::String(s) => { + let vlu = make_vlu(s); + loc.new_attr::() + .shape(()) + .create(name)? + .write_scalar(&vlu)?; + } + AttrValue::Int64(v) => { + loc.new_attr::().shape(()).create(name)?.write_scalar(v)?; + } + AttrValue::Float64(v) => { + loc.new_attr::().shape(()).create(name)?.write_scalar(v)?; + } + } + Ok(()) +} diff --git a/crates/fd5/src/error.rs b/crates/fd5/src/error.rs new file mode 100644 index 0000000..afe5012 --- /dev/null +++ b/crates/fd5/src/error.rs @@ -0,0 +1,26 @@ +/// Errors produced by the fd5 crate. +#[derive(Debug, thiserror::Error)] +pub enum Fd5Error { + #[error("HDF5 error: {0}")] + Hdf5(#[from] hdf5_metno::Error), + + #[error("IO error: {0}")] + Io(#[from] std::io::Error), + + #[error("JSON error: {0}")] + Json(#[from] serde_json::Error), + + #[error("missing attribute: {0}")] + MissingAttribute(String), + + #[error("hash mismatch: stored={stored}, computed={computed}")] + HashMismatch { stored: String, computed: String }, + + #[error("not an fd5 file (no content_hash attribute)")] + NotFd5, + + #[error("{0}")] + Other(String), +} + +pub type Fd5Result = std::result::Result; diff --git a/crates/fd5/src/h5io.rs b/crates/fd5/src/h5io.rs new file mode 100644 index 0000000..c20462c --- /dev/null +++ b/crates/fd5/src/h5io.rs @@ -0,0 +1,302 @@ +//! Lossless round-trip between `serde_json::Value` trees and HDF5 groups/attrs. +//! +//! Type mapping follows the Python `fd5.h5io` module: +//! - JSON objects become HDF5 sub-groups +//! - JSON strings become VarLenUnicode attributes +//! - JSON numbers become i64 or f64 attributes +//! - JSON booleans become bool attributes +//! - JSON arrays become array attributes (typed by first element) +//! 
- JSON null values are skipped (absence encodes null) + +use hdf5_metno::types::VarLenUnicode; +use hdf5_metno::{Group, Location}; +use serde_json::Value; + +use crate::error::{Fd5Error, Fd5Result}; + +/// Write a `serde_json::Value` tree to an HDF5 group. +/// +/// Objects become subgroups, scalars become attributes. +/// Keys are written in sorted order for deterministic layout. +pub fn dict_to_h5(group: &Group, data: &Value) -> Fd5Result<()> { + let obj = data + .as_object() + .ok_or_else(|| Fd5Error::Other("dict_to_h5 expects a JSON object".into()))?; + + let mut keys: Vec<&String> = obj.keys().collect(); + keys.sort(); + + for key in keys { + let value = &obj[key]; + if value.is_null() { + continue; + } + write_value(group, key, value)?; + } + Ok(()) +} + +/// Read an HDF5 group tree into a `serde_json::Value`. +/// +/// Reads attributes and sub-groups; datasets are not read. +pub fn h5_to_dict(group: &Group) -> Fd5Result { + let mut map = serde_json::Map::new(); + + let mut attr_names = group.attr_names()?; + attr_names.sort(); + for key in &attr_names { + let attr = group.attr(key)?; + let val = read_attr_value(&attr)?; + map.insert(key.clone(), val); + } + + let mut member_names = group.member_names()?; + member_names.sort(); + for key in &member_names { + if let Ok(child_group) = group.group(key) { + let child_val = h5_to_dict(&child_group)?; + map.insert(key.clone(), child_val); + } + } + + Ok(Value::Object(map)) +} + +// --------------------------------------------------------------------------- +// Internal helpers +// --------------------------------------------------------------------------- + +fn write_value(group: &Group, key: &str, value: &Value) -> Fd5Result<()> { + match value { + Value::Object(_) => { + let sub = group.create_group(key)?; + dict_to_h5(&sub, value)?; + } + Value::Bool(b) => { + write_attr_bool(group, key, *b)?; + } + Value::Number(n) => { + if let Some(i) = n.as_i64() { + write_attr_i64(group, key, i)?; + } else if let Some(f) = 
n.as_f64() { + write_attr_f64(group, key, f)?; + } + } + Value::String(s) => { + write_attr_str(group, key, s)?; + } + Value::Array(arr) => { + write_list(group, key, arr)?; + } + Value::Null => {} + } + Ok(()) +} + +fn write_list(group: &Group, key: &str, arr: &[Value]) -> Fd5Result<()> { + if arr.is_empty() { + let data: Vec = vec![]; + group + .new_attr::() + .shape([0]) + .create(key)? + .write_raw(&data)?; + return Ok(()); + } + + let first = &arr[0]; + if first.is_boolean() { + let bools: Vec = arr + .iter() + .map(|v| v.as_bool().unwrap_or(false)) + .collect(); + group + .new_attr::() + .shape([bools.len()]) + .create(key)? + .write_raw(&bools)?; + } else if first.is_number() { + if first.is_i64() && arr.iter().all(|v| v.is_i64()) { + let ints: Vec = arr + .iter() + .map(|v| v.as_i64().unwrap_or(0)) + .collect(); + group + .new_attr::() + .shape([ints.len()]) + .create(key)? + .write_raw(&ints)?; + } else { + let floats: Vec = arr + .iter() + .map(|v| v.as_f64().unwrap_or(0.0)) + .collect(); + group + .new_attr::() + .shape([floats.len()]) + .create(key)? + .write_raw(&floats)?; + } + } else if first.is_string() { + let strings: Vec = arr + .iter() + .map(|v| { + let s = v.as_str().unwrap_or(""); + s.parse::().expect("valid unicode string") + }) + .collect(); + group + .new_attr::() + .shape([strings.len()]) + .create(key)? + .write_raw(&strings)?; + } else { + return Err(Fd5Error::Other(format!( + "Unsupported array element type for key '{}'", + key + ))); + } + Ok(()) +} + +/// Write a VarLenUnicode string attribute. +pub fn write_attr_str(loc: &Location, name: &str, value: &str) -> Fd5Result<()> { + let vlu: VarLenUnicode = value.parse().map_err(|e| { + Fd5Error::Other(format!("Invalid string for attribute '{}': {}", name, e)) + })?; + loc.new_attr::() + .shape(()) + .create(name)? + .write_scalar(&vlu)?; + Ok(()) +} + +/// Write an i64 attribute. 
+pub fn write_attr_i64(loc: &Location, name: &str, value: i64) -> Fd5Result<()> { + loc.new_attr::() + .shape(()) + .create(name)? + .write_scalar(&value)?; + Ok(()) +} + +/// Write an f64 attribute. +pub fn write_attr_f64(loc: &Location, name: &str, value: f64) -> Fd5Result<()> { + loc.new_attr::() + .shape(()) + .create(name)? + .write_scalar(&value)?; + Ok(()) +} + +/// Write a bool attribute. +pub fn write_attr_bool(loc: &Location, name: &str, value: bool) -> Fd5Result<()> { + loc.new_attr::() + .shape(()) + .create(name)? + .write_scalar(&value)?; + Ok(()) +} + +/// Read an HDF5 attribute into a `serde_json::Value`. +fn read_attr_value(attr: &hdf5_metno::Attribute) -> Fd5Result { + use hdf5_metno::types::{FloatSize, IntSize, TypeDescriptor}; + + let td = attr.dtype()?.to_descriptor()?; + + if attr.is_scalar() { + match &td { + TypeDescriptor::VarLenUnicode => { + let v: VarLenUnicode = attr.read_scalar()?; + Ok(Value::String(v.as_str().to_string())) + } + TypeDescriptor::VarLenAscii => { + let v: hdf5_metno::types::VarLenAscii = attr.read_scalar()?; + Ok(Value::String(v.as_str().to_string())) + } + TypeDescriptor::Integer(int_size) => { + let val: i64 = match int_size { + IntSize::U1 => attr.read_scalar::()? as i64, + IntSize::U2 => attr.read_scalar::()? as i64, + IntSize::U4 => attr.read_scalar::()? as i64, + IntSize::U8 => attr.read_scalar::()?, + }; + Ok(Value::Number(serde_json::Number::from(val))) + } + TypeDescriptor::Unsigned(int_size) => { + let val: u64 = match int_size { + IntSize::U1 => attr.read_scalar::()? as u64, + IntSize::U2 => attr.read_scalar::()? as u64, + IntSize::U4 => attr.read_scalar::()? as u64, + IntSize::U8 => attr.read_scalar::()?, + }; + Ok(Value::Number(serde_json::Number::from(val))) + } + TypeDescriptor::Float(float_size) => { + let val: f64 = match float_size { + FloatSize::U4 => attr.read_scalar::()? 
as f64, + FloatSize::U8 => attr.read_scalar::()?, + }; + Ok(serde_json::json!(val)) + } + TypeDescriptor::Boolean => { + let v: bool = attr.read_scalar()?; + Ok(Value::Bool(v)) + } + _ => { + let raw = attr.read_raw::()?; + Ok(Value::String(String::from_utf8_lossy(&raw).to_string())) + } + } + } else { + // Array attribute + read_array_attr_value(attr, &td) + } +} + +fn read_array_attr_value( + attr: &hdf5_metno::Attribute, + td: &hdf5_metno::types::TypeDescriptor, +) -> Fd5Result { + use hdf5_metno::types::{FloatSize, IntSize, TypeDescriptor}; + + match td { + TypeDescriptor::VarLenUnicode => { + let v = attr.read_raw::()?; + let arr: Vec = v.iter().map(|s| Value::String(s.as_str().to_string())).collect(); + Ok(Value::Array(arr)) + } + TypeDescriptor::VarLenAscii => { + let v = attr.read_raw::()?; + let arr: Vec = v.iter().map(|s| Value::String(s.as_str().to_string())).collect(); + Ok(Value::Array(arr)) + } + TypeDescriptor::Integer(int_size) => { + let vals: Vec = match int_size { + IntSize::U1 => attr.read_raw::()?.iter().map(|&v| v as i64).collect(), + IntSize::U2 => attr.read_raw::()?.iter().map(|&v| v as i64).collect(), + IntSize::U4 => attr.read_raw::()?.iter().map(|&v| v as i64).collect(), + IntSize::U8 => attr.read_raw::()?.to_vec(), + }; + let arr: Vec = vals.into_iter().map(|v| Value::Number(v.into())).collect(); + Ok(Value::Array(arr)) + } + TypeDescriptor::Float(float_size) => { + let vals: Vec = match float_size { + FloatSize::U4 => attr.read_raw::()?.iter().map(|&v| v as f64).collect(), + FloatSize::U8 => attr.read_raw::()?.to_vec(), + }; + let arr: Vec = vals.into_iter().map(|v| serde_json::json!(v)).collect(); + Ok(Value::Array(arr)) + } + TypeDescriptor::Boolean => { + let v = attr.read_raw::()?; + let arr: Vec = v.iter().map(|&b| Value::Bool(b)).collect(); + Ok(Value::Array(arr)) + } + _ => { + let raw = attr.read_raw::()?; + Ok(Value::String(String::from_utf8_lossy(&raw).to_string())) + } + } +} diff --git a/crates/fd5/src/hash.rs 
b/crates/fd5/src/hash.rs new file mode 100644 index 0000000..3a7c882
--- /dev/null
+++ b/crates/fd5/src/hash.rs
//! fd5 Merkle tree hashing — direct port of Python's `hash.py`.
//!
//! Implements the content_hash computation:
//! 1. `sorted_attrs_hash(obj)` — SHA-256 of sorted attributes (skip `content_hash`)
//! 2. `dataset_hash(ds)` — `sha256(attrs_hash + sha256(data_bytes))`
//! 3. `group_hash(group)` — `sha256(attrs_hash + child_hashes)` (recursive)
//! 4. `compute_content_hash(file)` — `"sha256:" + sha256(root_group_hash)`
//! 5. `compute_id(inputs)` — `"sha256:" + sha256(sorted_values.join('\0'))`

use sha2::{Digest, Sha256};

use hdf5_metno::types::TypeDescriptor;
use hdf5_metno::{Dataset, File, Group, Location};

use crate::attr_ser::serialize_attr;
use crate::error::Fd5Result;

const CHUNK_HASHES_SUFFIX: &str = "_chunk_hashes";
const EXCLUDED_ATTRS: &[&str] = &["content_hash"];

/// Check if a dataset name is a chunk-hashes auxiliary dataset.
fn is_chunk_hashes_dataset(name: &str) -> bool {
    name.ends_with(CHUNK_HASHES_SUFFIX)
}

/// Compute `sha256(sha256(key + serialize(val)) for key in sorted(attrs))`.
///
/// Exactly matches Python's `_sorted_attrs_hash`.
fn sorted_attrs_hash(obj: &Location) -> Fd5Result<String> {
    let mut h = Sha256::new();

    let mut attr_names = obj.attr_names()?;
    attr_names.sort();

    for key in &attr_names {
        if EXCLUDED_ATTRS.contains(&key.as_str()) {
            continue;
        }
        let attr = obj.attr(key)?;
        let val_bytes = serialize_attr(&attr)?;

        // inner = sha256(key_utf8 + value_bytes)
        let mut inner = Sha256::new();
        inner.update(key.as_bytes());
        inner.update(&val_bytes);
        let inner_hex = format!("{:x}", inner.finalize());

        // Feed hex digest string into outer hasher
        h.update(inner_hex.as_bytes());
    }

    Ok(format!("{:x}", h.finalize()))
}

/// Hash a dataset: `sha256(attrs_hash + sha256(data.tobytes()))`.
///
/// Reads the entire dataset as contiguous row-major bytes.
fn dataset_hash(ds: &Dataset) -> Fd5Result<String> {
    let attrs_h = sorted_attrs_hash(ds)?;

    // Read dataset data as raw bytes
    let data_bytes = read_dataset_bytes(ds)?;
    let data_hash = format!("{:x}", Sha256::digest(&data_bytes));

    // Concatenate the two hex strings and hash again (matches Python).
    let combined = format!("{}{}", attrs_h, data_hash);
    Ok(format!("{:x}", Sha256::digest(combined.as_bytes())))
}

/// Recursively compute the Merkle hash of a group.
///
/// `sha256(sorted_attrs_hash + child_hashes)` where children are
/// processed in sorted key order; `_chunk_hashes` datasets and
/// external links are excluded.
fn group_hash(group: &Group) -> Fd5Result<String> {
    let mut h = Sha256::new();
    h.update(sorted_attrs_hash(group)?.as_bytes());

    let mut member_names = group.member_names()?;
    member_names.sort();

    for key in &member_names {
        if is_chunk_hashes_dataset(key) {
            continue;
        }

        // Check link type — skip external links
        if is_external_link(group, key) {
            continue;
        }

        // Try as group first, then dataset
        if let Ok(child_group) = group.group(key) {
            h.update(group_hash(&child_group)?.as_bytes());
        } else if let Ok(child_ds) = group.dataset(key) {
            h.update(dataset_hash(&child_ds)?.as_bytes());
        }
        // If neither, skip (broken link)
    }

    Ok(format!("{:x}", h.finalize()))
}

/// Check if a member is an external link using iter_visit.
///
/// NOTE(review): iteration errors are ignored (`let _ =`), so a failed visit
/// reports "not external" — best-effort by design, confirm acceptable.
fn is_external_link(group: &Group, name: &str) -> bool {
    use hdf5_metno::LinkType;
    use std::cell::Cell;

    let is_external = Cell::new(false);
    let _ = group.iter_visit_default((), |_group, link_name, link_info, _| {
        if link_name == name && link_info.link_type == LinkType::External {
            is_external.set(true);
            return false; // stop iteration
        }
        true // continue
    });
    is_external.get()
}

/// Read all data from a dataset as contiguous row-major bytes.
///
/// Matches Python's `ds[...].tobytes()`. Uses `read_raw` to handle
/// datasets of any dimensionality; numerics are emitted native-endian.
fn read_dataset_bytes(ds: &Dataset) -> Fd5Result<Vec<u8>> {
    let td = ds.dtype()?.to_descriptor()?;
    let total_elems: usize = ds.shape().iter().product();

    if total_elems == 0 {
        return Ok(Vec::new());
    }

    let bytes: Vec<u8> = match td {
        TypeDescriptor::Float(hdf5_metno::types::FloatSize::U4) => {
            let data = ds.read_raw::<f32>()?;
            data.iter().flat_map(|x| x.to_ne_bytes()).collect()
        }
        TypeDescriptor::Float(hdf5_metno::types::FloatSize::U8) => {
            let data = ds.read_raw::<f64>()?;
            data.iter().flat_map(|x| x.to_ne_bytes()).collect()
        }
        TypeDescriptor::Integer(int_size) => match int_size {
            hdf5_metno::types::IntSize::U1 => {
                let data = ds.read_raw::<i8>()?;
                data.iter().flat_map(|x| x.to_ne_bytes()).collect()
            }
            hdf5_metno::types::IntSize::U2 => {
                let data = ds.read_raw::<i16>()?;
                data.iter().flat_map(|x| x.to_ne_bytes()).collect()
            }
            hdf5_metno::types::IntSize::U4 => {
                let data = ds.read_raw::<i32>()?;
                data.iter().flat_map(|x| x.to_ne_bytes()).collect()
            }
            hdf5_metno::types::IntSize::U8 => {
                let data = ds.read_raw::<i64>()?;
                data.iter().flat_map(|x| x.to_ne_bytes()).collect()
            }
        },
        TypeDescriptor::Unsigned(int_size) => match int_size {
            hdf5_metno::types::IntSize::U1 => {
                // u8 buffers are already the byte sequence we want.
                let data = ds.read_raw::<u8>()?;
                data
            }
            hdf5_metno::types::IntSize::U2 => {
                let data = ds.read_raw::<u16>()?;
                data.iter().flat_map(|x| x.to_ne_bytes()).collect()
            }
            hdf5_metno::types::IntSize::U4 => {
                let data = ds.read_raw::<u32>()?;
                data.iter().flat_map(|x| x.to_ne_bytes()).collect()
            }
            hdf5_metno::types::IntSize::U8 => {
                let data = ds.read_raw::<u64>()?;
                data.iter().flat_map(|x| x.to_ne_bytes()).collect()
            }
        },
        TypeDescriptor::Boolean => {
            let data = ds.read_raw::<bool>()?;
            data.iter().map(|&b| b as u8).collect()
        }
        // Compound datasets (e.g. event tables) and other types:
        // Read raw bytes using H5Dread with the file's native type.
        _ => read_dataset_raw_bytes(ds, total_elems)?,
    };

    Ok(bytes)
}

/// Read raw bytes from a dataset using the file's native type.
///
/// This handles compound types and any other type where we can't use
/// a typed `read_raw()` call. Uses the HDF5 C API directly.
fn read_dataset_raw_bytes(ds: &Dataset, total_elems: usize) -> Fd5Result<Vec<u8>> {
    use hdf5_metno_sys::h5d::{H5Dget_type, H5Dread};
    use hdf5_metno_sys::h5p::H5P_DEFAULT;
    use hdf5_metno_sys::h5s::H5S_ALL;
    use hdf5_metno_sys::h5t::H5Tclose;

    let elem_size = ds.dtype()?.size();
    let total_bytes = total_elems * elem_size;

    // Get the dataset's file type (not a converted one)
    let file_type_id = unsafe { H5Dget_type(ds.id()) };
    if file_type_id < 0 {
        return Err(crate::error::Fd5Error::Other(
            "H5Dget_type failed".to_string(),
        ));
    }

    let mut buf = vec![0u8; total_bytes];
    let ret = unsafe {
        H5Dread(
            ds.id(),
            file_type_id,
            H5S_ALL,
            H5S_ALL,
            H5P_DEFAULT,
            buf.as_mut_ptr().cast(),
        )
    };

    // Close the type we opened
    unsafe { H5Tclose(file_type_id) };

    if ret < 0 {
        return Err(crate::error::Fd5Error::Other(
            "H5Dread failed for compound/opaque dataset".to_string(),
        ));
    }
    Ok(buf)
}

/// Compute the algorithm-prefixed content hash: `"sha256:<hex64>"`.
///
/// Direct equivalent of Python's `compute_content_hash(root)`.
pub fn compute_content_hash(file: &File) -> Fd5Result<String> {
    let root = file.as_group()?;
    let root_h = group_hash(&root)?;
    let final_hash = format!("{:x}", Sha256::digest(root_h.as_bytes()));
    Ok(format!("sha256:{}", final_hash))
}

/// Compute the algorithm-prefixed content hash from a Group.
pub fn compute_content_hash_from_group(group: &Group) -> Fd5Result<String> {
    let root_h = group_hash(group)?;
    let final_hash = format!("{:x}", Sha256::digest(root_h.as_bytes()));
    Ok(format!("sha256:{}", final_hash))
}

/// Compute `"sha256:" + sha256(sorted_values.join('\0'))`.
///
/// Direct equivalent of Python's `compute_id(inputs, id_inputs_desc)`.
+pub fn compute_id(inputs: &std::collections::BTreeMap) -> String { + let payload: String = inputs + .values() + .cloned() + .collect::>() + .join("\0"); + let digest = format!("{:x}", Sha256::digest(payload.as_bytes())); + format!("sha256:{}", digest) +} + +#[cfg(test)] +mod tests { + use super::*; + use std::collections::BTreeMap; + + #[test] + fn test_compute_id_deterministic() { + let mut inputs = BTreeMap::new(); + inputs.insert("b".to_string(), "val_b".to_string()); + inputs.insert("a".to_string(), "val_a".to_string()); + + let id1 = compute_id(&inputs); + let id2 = compute_id(&inputs); + assert_eq!(id1, id2); + assert!(id1.starts_with("sha256:")); + } + + #[test] + fn test_compute_id_sorted_order() { + // BTreeMap is already sorted, but verify the output matches + // sha256("val_a\0val_b") + let mut inputs = BTreeMap::new(); + inputs.insert("a".to_string(), "val_a".to_string()); + inputs.insert("b".to_string(), "val_b".to_string()); + + let expected_payload = "val_a\0val_b"; + let expected = format!( + "sha256:{:x}", + Sha256::digest(expected_payload.as_bytes()) + ); + assert_eq!(compute_id(&inputs), expected); + } +} diff --git a/crates/fd5/src/lib.rs b/crates/fd5/src/lib.rs new file mode 100644 index 0000000..ba27449 --- /dev/null +++ b/crates/fd5/src/lib.rs @@ -0,0 +1,21 @@ +//! # fd5 +//! +//! Rust implementation of fd5 Merkle-tree hashing, verification, editing, +//! and file creation for immutable HDF5 data products sealed with `content_hash`. 
+ +pub mod attr_ser; +pub mod builder; +pub mod edit; +pub mod error; +pub mod h5io; +pub mod hash; +pub mod naming; +pub mod product; +pub mod schema; +pub mod verify; + +pub use builder::{create, Fd5Builder, HashTrackingGroup}; +pub use error::{Fd5Error, Fd5Result}; +pub use hash::{compute_content_hash, compute_id}; +pub use product::{get_schema, register_schema, ProductSchema}; +pub use verify::{Fd5Status, verify}; diff --git a/crates/fd5/src/naming.rs b/crates/fd5/src/naming.rs new file mode 100644 index 0000000..b968b63 --- /dev/null +++ b/crates/fd5/src/naming.rs @@ -0,0 +1,108 @@ +//! Deterministic filename generation following the fd5 naming convention. +//! +//! Format: `YYYY-MM-DD_HH-MM-SS_-.h5` + +const SHA256_PREFIX: &str = "sha256:"; +const ID_HEX_LENGTH: usize = 8; +const EXTENSION: &str = ".h5"; + +/// Generate an fd5-compliant filename. +/// +/// Format: `YYYY-MM-DD_HH-MM-SS_-.h5` +/// +/// When `timestamp` is `None` the datetime prefix is omitted. +/// The `id_hash` is truncated to the first 8 hex characters; a +/// `sha256:` prefix is stripped automatically if present. +pub fn generate_filename(product: &str, id_hash: &str, timestamp: Option<&str>) -> String { + let short_id = truncate_id(id_hash); + let mut parts: Vec = Vec::new(); + + if let Some(ts) = timestamp { + if let Some(formatted) = format_timestamp(ts) { + parts.push(formatted); + } + } + + parts.push(format!("{}-{}", product, short_id)); + format!("{}{}", parts.join("_"), EXTENSION) +} + +fn truncate_id(id_hash: &str) -> &str { + let raw = id_hash.strip_prefix(SHA256_PREFIX).unwrap_or(id_hash); + &raw[..raw.len().min(ID_HEX_LENGTH)] +} + +/// Best-effort reformat of an ISO 8601 timestamp to `YYYY-MM-DD_HH-MM-SS`. 
+fn format_timestamp(ts: &str) -> Option { + if ts.is_empty() { + return None; + } + let ts = ts.trim(); + + let (date_part, time_part) = if let Some(pos) = ts.find('T') { + (&ts[..pos], Some(&ts[pos + 1..])) + } else if let Some(pos) = ts.find(' ') { + (&ts[..pos], Some(&ts[pos + 1..])) + } else { + (ts, None) + }; + + if date_part.len() < 10 { + return Some(date_part.to_string()); + } + + let date_formatted = &date_part[..10]; + + if let Some(time) = time_part { + // Strip timezone suffix and fractional seconds + let time_clean = time + .split(|c: char| c == '+' || c == 'Z') + .next() + .unwrap_or(time); + let time_clean = time_clean + .split('.') + .next() + .unwrap_or(time_clean); + if time_clean.len() >= 8 { + let hh = &time_clean[0..2]; + let mm = &time_clean[3..5]; + let ss = &time_clean[6..8]; + Some(format!("{}_{}-{}-{}", date_formatted, hh, mm, ss)) + } else { + Some(date_formatted.to_string()) + } + } else { + Some(date_formatted.to_string()) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_generate_filename_with_timestamp() { + let name = generate_filename( + "imaging/recon", + "sha256:abcdef0123456789", + Some("2024-01-15T10:30:00"), + ); + assert_eq!(name, "2024-01-15_10-30-00_imaging-recon-abcdef01.h5"); + } + + #[test] + fn test_generate_filename_no_timestamp() { + let name = generate_filename("test/product", "sha256:deadbeef99", None); + assert_eq!(name, "test-product-deadbeef.h5"); + } + + #[test] + fn test_truncate_id_strips_prefix() { + assert_eq!(truncate_id("sha256:abcdef0123456789"), "abcdef01"); + } + + #[test] + fn test_truncate_id_no_prefix() { + assert_eq!(truncate_id("abcdef0123456789"), "abcdef01"); + } +} diff --git a/crates/fd5/src/product.rs b/crates/fd5/src/product.rs new file mode 100644 index 0000000..ee5d8d2 --- /dev/null +++ b/crates/fd5/src/product.rs @@ -0,0 +1,108 @@ +//! Product schema trait and registry. +//! +//! Each product type (e.g. `imaging/recon`, `imaging/sinogram`) has a schema +//! 
that knows how to write product-specific data and provides the JSON Schema +//! for validation. + +use std::collections::HashMap; +use std::sync::Mutex; + +use serde_json::Value; + +use crate::builder::HashTrackingGroup; +use crate::error::{Fd5Error, Fd5Result}; + +/// Trait that every product schema must implement. +pub trait ProductSchema: Send + Sync { + /// The product type string (e.g. `"imaging/recon"`). + fn product_type(&self) -> &str; + + /// The schema version string (e.g. `"1.0.0"`). + fn schema_version(&self) -> &str; + + /// The JSON Schema as a `serde_json::Value`. + fn json_schema(&self) -> Value; + + /// The list of root attribute keys used to compute the file id. + fn id_inputs(&self) -> Vec; + + /// Write product-specific data through the hash-tracking group. + fn write(&self, target: &HashTrackingGroup, data: &Value) -> Fd5Result<()>; +} + +// --------------------------------------------------------------------------- +// Global registry +// --------------------------------------------------------------------------- + +static REGISTRY: Mutex>>> = Mutex::new(None); + +fn with_registry(f: F) -> R +where + F: FnOnce(&mut HashMap>) -> R, +{ + let mut lock = REGISTRY.lock().unwrap(); + let map = lock.get_or_insert_with(HashMap::new); + f(map) +} + +/// Register a product schema. Overwrites any existing schema for the same product type. +pub fn register_schema(schema: Box) { + let key = schema.product_type().to_string(); + with_registry(|map| { + map.insert(key, schema); + }); +} + +/// Look up the schema for a product type. Returns an error if not found. 
+pub fn get_schema(product: &str) -> Fd5Result> { + with_registry(|map| { + map.remove(product).ok_or_else(|| { + Fd5Error::Other(format!( + "No schema registered for product type '{}'", + product + )) + }) + }) +} + +// --------------------------------------------------------------------------- +// Test product schema +// --------------------------------------------------------------------------- + +/// A simple test product schema for unit tests. +pub struct TestProductSchema; + +impl ProductSchema for TestProductSchema { + fn product_type(&self) -> &str { + "test/product" + } + + fn schema_version(&self) -> &str { + "1.0.0" + } + + fn json_schema(&self) -> Value { + serde_json::json!({ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "type": "object", + "properties": { + "values": {"type": "array", "items": {"type": "number"}} + } + }) + } + + fn id_inputs(&self) -> Vec { + vec!["product".into(), "name".into(), "timestamp".into()] + } + + fn write(&self, target: &HashTrackingGroup, data: &Value) -> Fd5Result<()> { + if let Some(values) = data.get("values").and_then(|v| v.as_array()) { + let floats: Vec = values + .iter() + .map(|v| v.as_f64().unwrap_or(0.0)) + .collect(); + target.create_dataset_f64("values", &floats)?; + } + Ok(()) + } +} diff --git a/crates/fd5/src/schema.rs b/crates/fd5/src/schema.rs new file mode 100644 index 0000000..87afd5f --- /dev/null +++ b/crates/fd5/src/schema.rs @@ -0,0 +1,40 @@ +//! JSON Schema loading, validation, and embedded `_schema` extraction. +//! +//! Mirrors Python's `schema.py`. + +use hdf5_metno::File; +use serde_json::Value; + +use crate::error::{Fd5Error, Fd5Result}; + +/// Extract and parse the `_schema` JSON attribute from an fd5 file. 
+pub fn dump_schema(file: &File) -> Fd5Result { + let group = file.as_group()?; + let attr = group.attr("_schema").map_err(|_| { + Fd5Error::MissingAttribute("_schema".to_string()) + })?; + let raw: String = attr.read_scalar::() + .map(|v| v.as_str().to_string()) + .or_else(|_| attr.read_scalar::().map(|v| v.as_str().to_string())) + .map_err(|e| Fd5Error::Other(format!("Failed to read _schema attribute: {e}")))?; + let schema: Value = serde_json::from_str(&raw)?; + Ok(schema) +} + +/// Read the `_schema_version` attribute (int64). +pub fn schema_version(file: &File) -> Fd5Result { + let group = file.as_group()?; + let attr = group.attr("_schema_version").map_err(|_| { + Fd5Error::MissingAttribute("_schema_version".to_string()) + })?; + let v: i64 = attr.read_scalar()?; + Ok(v) +} + +/// Check if an fd5 file has an embedded schema. +pub fn has_schema(file: &File) -> bool { + file.as_group() + .ok() + .and_then(|g| g.attr("_schema").ok()) + .is_some() +} diff --git a/crates/fd5/src/verify.rs b/crates/fd5/src/verify.rs new file mode 100644 index 0000000..80e1424 --- /dev/null +++ b/crates/fd5/src/verify.rs @@ -0,0 +1,66 @@ +//! fd5 integrity verification. +//! +//! Recomputes the Merkle tree and compares with the stored `content_hash`. + +use std::path::Path; + +use hdf5_metno::File; + +use crate::error::{Fd5Error, Fd5Result}; +use crate::hash::compute_content_hash; + +/// Verification status of an fd5 file. +#[derive(Debug, Clone)] +pub enum Fd5Status { + /// Currently checking (used for UI state). + Checking, + /// Hash verified successfully. + Valid(String), + /// Hash mismatch. + Invalid { stored: String, computed: String }, + /// Not an fd5 file (no content_hash attribute). + NotFd5, + /// Error during verification. + Error(String), +} + +/// Recompute the Merkle tree and compare with the stored `content_hash`. +/// +/// Returns `true` if the hashes match, `false` otherwise (including +/// when `content_hash` is missing). 
+/// +/// Direct equivalent of Python's `verify(path)`. +pub fn verify(path: &Path) -> Fd5Result { + let file = File::open(path)?; + verify_file(&file) +} + +/// Verify an already-opened file. +pub fn verify_file(file: &File) -> Fd5Result { + let group = file.as_group()?; + + // Read stored content_hash + let stored = match group.attr("content_hash") { + Ok(attr) => { + let val: String = attr + .read_scalar::() + .map(|v| v.as_str().to_string()) + .or_else(|_| { + attr.read_scalar::() + .map(|v| v.as_str().to_string()) + }) + .map_err(|e| Fd5Error::Other(format!("Failed to read content_hash: {e}")))?; + val + } + Err(_) => return Ok(Fd5Status::NotFd5), + }; + + // Compute fresh hash + let computed = compute_content_hash(file)?; + + if computed == stored { + Ok(Fd5Status::Valid(stored)) + } else { + Ok(Fd5Status::Invalid { stored, computed }) + } +} diff --git a/crates/fd5/tests/conformance.rs b/crates/fd5/tests/conformance.rs new file mode 100644 index 0000000..6d32e93 --- /dev/null +++ b/crates/fd5/tests/conformance.rs @@ -0,0 +1,268 @@ +//! Cross-language conformance tests for fd5 Merkle tree hashing. +//! +//! Uses fixture files generated by `tests/conformance/generate_fixtures.py` +//! and expected values from `tests/conformance/expected/*.json`. +//! +//! **Note**: Run with `--test-threads=1` if HDF5 is not built with +//! thread-safety enabled, as the raw `H5Dread` calls for compound +//! datasets are not thread-safe. 
+ +use std::path::{Path, PathBuf}; + +use fd5::verify::Fd5Status; +use fd5::{compute_content_hash, verify as verify_fn}; + +fn fixtures_dir() -> PathBuf { + Path::new(env!("CARGO_MANIFEST_DIR")) + .join("../../tests/conformance/fixtures") + .canonicalize() + .expect("conformance fixtures directory must exist — run generate_fixtures.py first") +} + +fn invalid_dir() -> PathBuf { + Path::new(env!("CARGO_MANIFEST_DIR")) + .join("../../tests/conformance/invalid") + .canonicalize() + .expect("conformance invalid directory must exist — run generate_fixtures.py first") +} + +fn expected_dir() -> PathBuf { + Path::new(env!("CARGO_MANIFEST_DIR")) + .join("../../tests/conformance/expected") + .canonicalize() + .expect("conformance expected directory must exist") +} + +fn load_expected(name: &str) -> serde_json::Value { + let path = expected_dir().join(format!("{}.json", name)); + let text = std::fs::read_to_string(&path) + .unwrap_or_else(|e| panic!("Failed to read {}: {}", path.display(), e)); + serde_json::from_str(&text).unwrap() +} + +// ----------------------------------------------------------------------- +// Valid fixtures: verify() returns Valid +// ----------------------------------------------------------------------- + +#[test] +fn verify_minimal() { + let path = fixtures_dir().join("minimal.fd5"); + let expected = load_expected("minimal"); + assert_eq!(expected["verify"], true); + + let status = verify_fn(&path).unwrap(); + assert!( + matches!(status, Fd5Status::Valid(_)), + "minimal.fd5 should verify as Valid, got: {:?}", + status + ); +} + +#[test] +fn verify_sealed() { + let path = fixtures_dir().join("sealed.fd5"); + let expected = load_expected("sealed"); + assert_eq!(expected["verify"], true); + + let status = verify_fn(&path).unwrap(); + assert!( + matches!(status, Fd5Status::Valid(_)), + "sealed.fd5 should verify as Valid, got: {:?}", + status + ); +} + +#[test] +fn verify_complex_metadata() { + let path = fixtures_dir().join("complex-metadata.fd5"); + 
let expected = load_expected("complex-metadata"); + assert_eq!(expected["verify"], true); + + let status = verify_fn(&path).unwrap(); + assert!( + matches!(status, Fd5Status::Valid(_)), + "complex-metadata.fd5 should verify as Valid, got: {:?}", + status + ); +} + +#[test] +fn verify_multiscale() { + let path = fixtures_dir().join("multiscale.fd5"); + let expected = load_expected("multiscale"); + assert_eq!(expected["verify"], true); + + let status = verify_fn(&path).unwrap(); + assert!( + matches!(status, Fd5Status::Valid(_)), + "multiscale.fd5 should verify as Valid, got: {:?}", + status + ); +} + +#[test] +fn verify_tabular() { + let path = fixtures_dir().join("tabular.fd5"); + let expected = load_expected("tabular"); + assert_eq!(expected["verify"], true); + + let status = verify_fn(&path).unwrap(); + assert!( + matches!(status, Fd5Status::Valid(_)), + "tabular.fd5 should verify as Valid, got: {:?}", + status + ); +} + +// ----------------------------------------------------------------------- +// with-provenance: may contain external links, expected verify varies +// ----------------------------------------------------------------------- + +#[test] +fn verify_with_provenance() { + let path = fixtures_dir().join("with-provenance.fd5"); + let expected = load_expected("with-provenance"); + let should_verify = expected["verify"].as_bool().unwrap(); + + let status = verify_fn(&path).unwrap(); + if should_verify { + assert!( + matches!(status, Fd5Status::Valid(_)), + "with-provenance.fd5 should verify as Valid, got: {:?}", + status + ); + } else { + // Expected to not verify (e.g. 
external links change hash) + // The file still has a content_hash, just doesn't match + assert!( + !matches!(status, Fd5Status::NotFd5), + "with-provenance.fd5 should be an fd5 file" + ); + } +} + +// ----------------------------------------------------------------------- +// Invalid fixtures +// ----------------------------------------------------------------------- + +#[test] +fn verify_bad_hash_fails() { + let path = invalid_dir().join("bad-hash.fd5"); + let status = verify_fn(&path).unwrap(); + assert!( + matches!(status, Fd5Status::Invalid { .. }), + "bad-hash.fd5 should verify as Invalid, got: {:?}", + status + ); +} + +#[test] +fn verify_no_schema_still_has_hash() { + // no-schema.fd5 has content_hash but no _schema + // Verification should still work since it only checks content_hash + let path = invalid_dir().join("no-schema.fd5"); + let status = verify_fn(&path).unwrap(); + // This file was created with a valid hash, so verify should pass + assert!( + matches!(status, Fd5Status::Valid(_)), + "no-schema.fd5 has valid content_hash, should verify: {:?}", + status + ); +} + +// ----------------------------------------------------------------------- +// compute_content_hash matches stored hash +// ----------------------------------------------------------------------- + +#[test] +fn content_hash_matches_stored_sealed() { + let path = fixtures_dir().join("sealed.fd5"); + let file = hdf5_metno::File::open(&path).unwrap(); + let stored: String = file + .attr("content_hash") + .unwrap() + .read_scalar::() + .map(|v| v.as_str().to_string()) + .unwrap(); + let computed = compute_content_hash(&file).unwrap(); + assert_eq!( + computed, stored, + "Rust compute_content_hash must match stored hash" + ); +} + +#[test] +fn content_hash_matches_stored_minimal() { + let path = fixtures_dir().join("minimal.fd5"); + let file = hdf5_metno::File::open(&path).unwrap(); + let stored: String = file + .attr("content_hash") + .unwrap() + .read_scalar::() + .map(|v| 
v.as_str().to_string()) + .unwrap(); + let computed = compute_content_hash(&file).unwrap(); + assert_eq!( + computed, stored, + "Rust compute_content_hash must match stored hash for minimal" + ); +} + +#[test] +fn content_hash_prefix() { + let path = fixtures_dir().join("sealed.fd5"); + let file = hdf5_metno::File::open(&path).unwrap(); + let computed = compute_content_hash(&file).unwrap(); + assert!( + computed.starts_with("sha256:"), + "content_hash must start with 'sha256:'" + ); + // 7 for "sha256:" + 64 hex chars + assert_eq!(computed.len(), 71, "content_hash must be sha256: + 64 hex"); +} + +// ----------------------------------------------------------------------- +// Root attributes match expected values +// ----------------------------------------------------------------------- + +#[test] +fn root_attrs_match_sealed() { + let expected = load_expected("sealed"); + let path = fixtures_dir().join("sealed.fd5"); + let file = hdf5_metno::File::open(&path).unwrap(); + + let root_attrs = &expected["root_attrs"]; + for (key, val) in root_attrs.as_object().unwrap() { + let attr = file.attr(key).unwrap_or_else(|_| { + panic!("sealed.fd5 missing expected attribute: {}", key) + }); + if let Some(expected_str) = val.as_str() { + let actual: String = attr + .read_scalar::() + .map(|v| v.as_str().to_string()) + .unwrap(); + assert_eq!(actual, expected_str, "Attribute '{}' mismatch", key); + } + } + + // Check prefixed attributes exist with correct prefix + if let Some(prefixed) = expected.get("root_attrs_prefixed") { + for (key, prefix_val) in prefixed.as_object().unwrap() { + let attr = file.attr(key).unwrap_or_else(|_| { + panic!("sealed.fd5 missing prefixed attribute: {}", key) + }); + let actual: String = attr + .read_scalar::() + .map(|v| v.as_str().to_string()) + .unwrap(); + let prefix = prefix_val.as_str().unwrap(); + assert!( + actual.starts_with(prefix), + "Attribute '{}' should start with '{}', got '{}'", + key, + prefix, + actual + ); + } + } +} diff --git 
a/schemas/_manifest.json b/schemas/_manifest.json new file mode 100644 index 0000000..f78fbf5 --- /dev/null +++ b/schemas/_manifest.json @@ -0,0 +1,120 @@ +{ + "calibration": { + "id_inputs": [ + "calibration_type", + "scanner_model", + "scanner_serial", + "valid_from" + ], + "required_root_attrs": { + "domain": "medical_imaging", + "product": "calibration" + }, + "schema_file": "calibration.schema.json", + "schema_version": "1.0.0" + }, + "device_data": { + "id_inputs": [ + "timestamp", + "scanner", + "device_type" + ], + "required_root_attrs": { + "domain": "medical_imaging", + "product": "device_data" + }, + "schema_file": "device_data.schema.json", + "schema_version": "1.0.0" + }, + "listmode": { + "id_inputs": [ + "timestamp", + "scanner", + "vendor_series_id" + ], + "required_root_attrs": { + "domain": "medical_imaging", + "product": "listmode" + }, + "schema_file": "listmode.schema.json", + "schema_version": "1.0.0" + }, + "recon": { + "id_inputs": [ + "timestamp", + "scanner", + "vendor_series_id" + ], + "required_root_attrs": { + "domain": "medical_imaging", + "product": "recon" + }, + "schema_file": "recon.schema.json", + "schema_version": "1.1.0" + }, + "roi": { + "id_inputs": [ + "timestamp", + "scanner", + "vendor_series_id" + ], + "required_root_attrs": { + "domain": "medical_imaging", + "product": "roi" + }, + "schema_file": "roi.schema.json", + "schema_version": "1.0.0" + }, + "sim": { + "id_inputs": [ + "simulator", + "phantom", + "random_seed" + ], + "required_root_attrs": { + "domain": "medical_imaging", + "product": "sim" + }, + "schema_file": "sim.schema.json", + "schema_version": "1.0.0" + }, + "sinogram": { + "id_inputs": [ + "timestamp", + "scanner", + "vendor_series_id" + ], + "required_root_attrs": { + "domain": "medical_imaging", + "product": "sinogram" + }, + "schema_file": "sinogram.schema.json", + "schema_version": "1.0.0" + }, + "spectrum": { + "id_inputs": [ + "timestamp", + "scanner", + "measurement_id" + ], + "required_root_attrs": 
{ + "domain": "medical_imaging", + "product": "spectrum" + }, + "schema_file": "spectrum.schema.json", + "schema_version": "1.0.0" + }, + "transform": { + "id_inputs": [ + "timestamp", + "source_image_id", + "target_image_id" + ], + "required_root_attrs": { + "domain": "medical_imaging", + "product": "transform" + }, + "schema_file": "transform.schema.json", + "schema_version": "1.0.0" + } +} diff --git a/schemas/calibration.schema.json b/schemas/calibration.schema.json new file mode 100644 index 0000000..ee44da5 --- /dev/null +++ b/schemas/calibration.schema.json @@ -0,0 +1,67 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "type": "object", + "properties": { + "_schema_version": { + "type": "integer" + }, + "product": { + "type": "string", + "const": "calibration" + }, + "calibration_type": { + "type": "string", + "enum": [ + "cross_calibration", + "crystal_map", + "dead_time", + "energy_calibration", + "gain_map", + "normalization", + "sensitivity", + "timing_calibration" + ] + }, + "scanner_model": { + "type": "string" + }, + "scanner_serial": { + "type": "string" + }, + "valid_from": { + "type": "string" + }, + "valid_until": { + "type": "string" + }, + "default": { + "type": "string" + }, + "name": { + "type": "string" + }, + "description": { + "type": "string" + }, + "domain": { + "type": "string" + }, + "metadata": { + "type": "object", + "description": "Calibration metadata including type-specific parameters and conditions" + }, + "data": { + "type": "object", + "description": "Calibration datasets \u2014 structure depends on calibration_type" + } + }, + "required": [ + "_schema_version", + "product", + "calibration_type", + "scanner_model", + "scanner_serial", + "valid_from", + "valid_until" + ] +} diff --git a/schemas/device_data.schema.json b/schemas/device_data.schema.json new file mode 100644 index 0000000..40e80c1 --- /dev/null +++ b/schemas/device_data.schema.json @@ -0,0 +1,68 @@ +{ + "$schema": 
"https://json-schema.org/draft/2020-12/schema", + "type": "object", + "properties": { + "_schema_version": { + "type": "integer" + }, + "product": { + "type": "string", + "const": "device_data" + }, + "name": { + "type": "string" + }, + "description": { + "type": "string" + }, + "domain": { + "type": "string" + }, + "device_type": { + "type": "string", + "enum": [ + "blood_sampler", + "environmental_sensor", + "infusion_pump", + "motion_tracker", + "physiological_monitor" + ] + }, + "device_model": { + "type": "string" + }, + "recording_start": { + "type": "string" + }, + "recording_duration": { + "type": "object", + "properties": { + "value": { + "type": "number" + }, + "units": { + "type": "string", + "const": "s" + }, + "unitSI": { + "type": "number" + } + } + }, + "metadata": { + "type": "object" + }, + "channels": { + "type": "object" + } + }, + "required": [ + "_schema_version", + "product", + "name", + "description", + "device_type", + "device_model", + "recording_start" + ] +} diff --git a/schemas/listmode.schema.json b/schemas/listmode.schema.json new file mode 100644 index 0000000..5aeeab7 --- /dev/null +++ b/schemas/listmode.schema.json @@ -0,0 +1,68 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "type": "object", + "properties": { + "_schema_version": { + "type": "integer" + }, + "product": { + "type": "string", + "const": "listmode" + }, + "name": { + "type": "string" + }, + "description": { + "type": "string" + }, + "domain": { + "type": "string" + }, + "mode": { + "type": "string" + }, + "table_pos": { + "type": "object", + "description": "Table position with units" + }, + "duration": { + "type": "object", + "description": "Acquisition duration with units" + }, + "z_min": { + "type": "object", + "description": "Axial FOV minimum with units" + }, + "z_max": { + "type": "object", + "description": "Axial FOV maximum with units" + }, + "metadata": { + "type": "object", + "properties": { + "daq": { + "type": "object", + 
"description": "Data acquisition system parameters" + } + } + }, + "raw_data": { + "type": "object", + "description": "Raw detector event datasets (compound)" + }, + "proc_data": { + "type": "object", + "description": "Processed event datasets (compound)" + }, + "device_data": { + "type": "object", + "description": "Embedded device streams (ECG, bellows) following NXlog pattern" + } + }, + "required": [ + "_schema_version", + "product", + "name", + "description" + ] +} diff --git a/schemas/recon.schema.json b/schemas/recon.schema.json new file mode 100644 index 0000000..9535c73 --- /dev/null +++ b/schemas/recon.schema.json @@ -0,0 +1,48 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "type": "object", + "properties": { + "_schema_version": { + "type": "integer" + }, + "product": { + "type": "string", + "const": "recon" + }, + "name": { + "type": "string" + }, + "description": { + "type": "string" + }, + "domain": { + "type": "string" + }, + "volume": { + "type": "object", + "description": "Root-level volume dataset (represented as attrs in h5_to_dict)" + }, + "mips": { + "type": "object", + "description": "MIP projections (coronal, sagittal, axial); N-D for dynamic data" + }, + "frames": { + "type": "object", + "description": "Frame timing, gating phase, and trigger data for 4D+ volumes" + }, + "device_data": { + "type": "object", + "description": "Embedded device streams (ECG, bellows) following NXlog pattern" + }, + "provenance": { + "type": "object", + "description": "Original file provenance, DICOM header, per-slice metadata" + } + }, + "required": [ + "_schema_version", + "product", + "name", + "description" + ] +} diff --git a/schemas/roi.schema.json b/schemas/roi.schema.json new file mode 100644 index 0000000..5017899 --- /dev/null +++ b/schemas/roi.schema.json @@ -0,0 +1,31 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "type": "object", + "properties": { + "_schema_version": { + "type": "integer" + }, + "product": 
{ + "type": "string", + "const": "roi" + }, + "name": { + "type": "string" + }, + "description": { + "type": "string" + }, + "domain": { + "type": "string" + }, + "timestamp": { + "type": "string" + } + }, + "required": [ + "_schema_version", + "product", + "name", + "description" + ] +} diff --git a/schemas/sim.schema.json b/schemas/sim.schema.json new file mode 100644 index 0000000..6fa24ca --- /dev/null +++ b/schemas/sim.schema.json @@ -0,0 +1,32 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "type": "object", + "properties": { + "_schema_version": { + "type": "integer" + }, + "product": { + "type": "string", + "const": "sim" + }, + "name": { + "type": "string" + }, + "description": { + "type": "string" + }, + "domain": { + "type": "string" + }, + "ground_truth": { + "type": "object", + "description": "Ground truth distributions (activity, attenuation)" + } + }, + "required": [ + "_schema_version", + "product", + "name", + "description" + ] +} diff --git a/schemas/sinogram.schema.json b/schemas/sinogram.schema.json new file mode 100644 index 0000000..699f9fe --- /dev/null +++ b/schemas/sinogram.schema.json @@ -0,0 +1,52 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "type": "object", + "properties": { + "_schema_version": { + "type": "integer" + }, + "product": { + "type": "string", + "const": "sinogram" + }, + "name": { + "type": "string" + }, + "description": { + "type": "string" + }, + "domain": { + "type": "string" + }, + "n_radial": { + "type": "integer" + }, + "n_angular": { + "type": "integer" + }, + "n_planes": { + "type": "integer" + }, + "span": { + "type": "integer" + }, + "max_ring_diff": { + "type": "integer" + }, + "tof_bins": { + "type": "integer" + } + }, + "required": [ + "_schema_version", + "product", + "name", + "description", + "n_radial", + "n_angular", + "n_planes", + "span", + "max_ring_diff", + "tof_bins" + ] +} diff --git a/schemas/spectrum.schema.json b/schemas/spectrum.schema.json new file 
mode 100644 index 0000000..bcf2a5e --- /dev/null +++ b/schemas/spectrum.schema.json @@ -0,0 +1,28 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "type": "object", + "properties": { + "_schema_version": { + "type": "integer" + }, + "product": { + "type": "string", + "const": "spectrum" + }, + "name": { + "type": "string" + }, + "description": { + "type": "string" + }, + "n_dimensions": { + "type": "integer" + } + }, + "required": [ + "_schema_version", + "product", + "name", + "description" + ] +} diff --git a/schemas/transform.schema.json b/schemas/transform.schema.json new file mode 100644 index 0000000..8ce8128 --- /dev/null +++ b/schemas/transform.schema.json @@ -0,0 +1,43 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "type": "object", + "properties": { + "_schema_version": { + "type": "integer" + }, + "product": { + "type": "string", + "const": "transform" + }, + "name": { + "type": "string" + }, + "description": { + "type": "string" + }, + "transform_type": { + "type": "string", + "enum": [ + "affine", + "bspline", + "deformable", + "rigid" + ] + }, + "direction": { + "type": "string", + "enum": [ + "source_to_target", + "target_to_source" + ] + } + }, + "required": [ + "_schema_version", + "product", + "name", + "description", + "transform_type", + "direction" + ] +} diff --git a/scripts/extract_schemas.py b/scripts/extract_schemas.py new file mode 100644 index 0000000..f29b58f --- /dev/null +++ b/scripts/extract_schemas.py @@ -0,0 +1,57 @@ +"""Extract JSON Schema files from Python product schemas. + +Generates standalone schema files into ``schemas/`` as the +language-agnostic single source of truth for fd5 file validation. 
+ +Usage:: + + uv run python scripts/extract_schemas.py +""" + +from __future__ import annotations + +import json +from pathlib import Path + +from fd5.registry import get_schema, list_schemas + + +def main() -> None: + schemas_dir = Path(__file__).resolve().parent.parent / "schemas" + schemas_dir.mkdir(exist_ok=True) + + manifest: dict[str, dict] = {} + + for product_type in sorted(list_schemas()): + schema = get_schema(product_type) + schema_dict = schema.json_schema() + + # Sanitise product_type for filename (e.g. "device_data" stays, + # but hypothetical "foo/bar" becomes "foo_bar") + safe_name = product_type.replace("/", "_") + schema_file = f"{safe_name}.schema.json" + out_path = schemas_dir / schema_file + + with open(out_path, "w") as f: + json.dump(schema_dict, f, indent=2, sort_keys=False) + f.write("\n") + + manifest[product_type] = { + "schema_file": schema_file, + "schema_version": schema.schema_version, + "id_inputs": schema.id_inputs(), + "required_root_attrs": schema.required_root_attrs(), + } + + print(f" {schema_file}") + + manifest_path = schemas_dir / "_manifest.json" + with open(manifest_path, "w") as f: + json.dump(manifest, f, indent=2, sort_keys=True) + f.write("\n") + + print(f"\nWrote {len(manifest)} schemas + _manifest.json to {schemas_dir}") + + +if __name__ == "__main__": + main() diff --git a/src/fd5/imaging/recon.py b/src/fd5/imaging/recon.py index 4de24e7..f0a2b84 100644 --- a/src/fd5/imaging/recon.py +++ b/src/fd5/imaging/recon.py @@ -20,7 +20,7 @@ import h5py import numpy as np -_SCHEMA_VERSION = "1.0.0" +_SCHEMA_VERSION = "1.1.0" _GZIP_LEVEL = 4 @@ -47,9 +47,9 @@ def json_schema(self) -> dict[str, Any]: "type": "object", "description": "Root-level volume dataset (represented as attrs in h5_to_dict)", }, - "mips_per_frame": { + "mips": { "type": "object", - "description": "Per-frame MIP projections for dynamic (4D+) data", + "description": "MIP projections (coronal, sagittal, axial); N-D for dynamic data", }, "frames": { "type": 
"object", @@ -106,10 +106,7 @@ def write(self, target: h5py.File | h5py.Group, data: dict[str, Any]) -> None: if "pyramid" in data: self._write_pyramid(target, spatial_vol, data) - self._write_mips(target, spatial_vol) - - if data.get("mips_per_frame") and volume.ndim >= 4: - self._write_mips_per_frame(target, volume) + self._write_mips(target, volume) if "device_data" in data: self._write_device_data(target, data["device_data"]) @@ -260,58 +257,45 @@ def _write_pyramid( ds.attrs["description"] = f"{factor}x downsampled volume" # ------------------------------------------------------------------ - # MIP projections + # MIP projections (nested under /mips/ group) # ------------------------------------------------------------------ def _write_mips( self, - target: h5py.File | h5py.Group, - spatial_vol: np.ndarray, - ) -> None: - mip_cor = spatial_vol.max(axis=1).astype(np.float32) - ds_cor = target.create_dataset("mip_coronal", data=mip_cor) - ds_cor.attrs["projection_type"] = "mip" - ds_cor.attrs["axis"] = np.int64(1) - ds_cor.attrs["description"] = "Coronal MIP (summed over all frames if dynamic)" - - mip_sag = spatial_vol.max(axis=2).astype(np.float32) - ds_sag = target.create_dataset("mip_sagittal", data=mip_sag) - ds_sag.attrs["projection_type"] = "mip" - ds_sag.attrs["axis"] = np.int64(2) - ds_sag.attrs["description"] = "Sagittal MIP (summed over all frames if dynamic)" - - # ------------------------------------------------------------------ - # Per-frame MIPs (optional, 4D+ only) - # ------------------------------------------------------------------ - - @staticmethod - def _write_mips_per_frame( target: h5py.File | h5py.Group, volume: np.ndarray, ) -> None: - n_frames = volume.shape[0] - z, y, x = volume.shape[-3], volume.shape[-2], volume.shape[-1] + """Write MIP projections under ``/mips/`` group. 
- cor_stack = np.empty((n_frames, z, x), dtype=np.float32) - sag_stack = np.empty((n_frames, z, y), dtype=np.float32) - for i in range(n_frames): - frame_3d = volume[i] - while frame_3d.ndim > 3: - frame_3d = frame_3d.sum(axis=0) - cor_stack[i] = frame_3d.max(axis=1).astype(np.float32) - sag_stack[i] = frame_3d.max(axis=2).astype(np.float32) + For 3D volumes, MIPs are 2D arrays. For 4D+ volumes, MIPs are + N-D arrays preserving all leading (non-spatial) dimensions. + Spatial axes are assumed to be the last three dimensions (Z, Y, X). + """ + ndim = volume.ndim + grp = target.create_group("mips") - grp = target.create_group("mips_per_frame") + # Spatial axis indices (last 3 dims) + ax_z = ndim - 3 # axial collapses Z + ax_y = ndim - 2 # coronal collapses Y + ax_x = ndim - 1 # sagittal collapses X - ds_cor = grp.create_dataset("coronal", data=cor_stack) + mip_cor = volume.max(axis=ax_y).astype(np.float32) + ds_cor = grp.create_dataset("coronal", data=mip_cor) ds_cor.attrs["projection_type"] = "mip" - ds_cor.attrs["axis"] = np.int64(1) - ds_cor.attrs["description"] = "Per-frame coronal MIPs" + ds_cor.attrs["axis"] = np.int64(ax_y) + ds_cor.attrs["description"] = "Coronal MIP (max along Y)" - ds_sag = grp.create_dataset("sagittal", data=sag_stack) + mip_sag = volume.max(axis=ax_x).astype(np.float32) + ds_sag = grp.create_dataset("sagittal", data=mip_sag) ds_sag.attrs["projection_type"] = "mip" - ds_sag.attrs["axis"] = np.int64(2) - ds_sag.attrs["description"] = "Per-frame sagittal MIPs" + ds_sag.attrs["axis"] = np.int64(ax_x) + ds_sag.attrs["description"] = "Sagittal MIP (max along X)" + + mip_ax = volume.max(axis=ax_z).astype(np.float32) + ds_ax = grp.create_dataset("axial", data=mip_ax) + ds_ax.attrs["projection_type"] = "mip" + ds_ax.attrs["axis"] = np.int64(ax_z) + ds_ax.attrs["description"] = "Axial MIP (max along Z)" # ------------------------------------------------------------------ # Embedded device_data (optional, NXlog pattern) diff --git 
a/tests/test_recon.py b/tests/test_recon.py index ab75614..50e9765 100644 --- a/tests/test_recon.py +++ b/tests/test_recon.py @@ -372,128 +372,146 @@ def test_pyramid_level_compression(self, schema, h5file): class TestWriteMIP: + def test_mips_group_created(self, schema, h5file): + data = _minimal_3d_data() + schema.write(h5file, data) + assert "mips" in h5file + assert isinstance(h5file["mips"], h5py.Group) + def test_mip_coronal_created(self, schema, h5file): data = _minimal_3d_data() schema.write(h5file, data) - assert "mip_coronal" in h5file + assert "mips/coronal" in h5file def test_mip_sagittal_created(self, schema, h5file): data = _minimal_3d_data() schema.write(h5file, data) - assert "mip_sagittal" in h5file + assert "mips/sagittal" in h5file - def test_mip_coronal_shape(self, schema, h5file): + def test_mip_axial_created(self, schema, h5file): data = _minimal_3d_data() schema.write(h5file, data) - ds = h5file["mip_coronal"] + assert "mips/axial" in h5file + + def test_mip_coronal_shape_3d(self, schema, h5file): + data = _minimal_3d_data() + schema.write(h5file, data) + ds = h5file["mips/coronal"] + # 3D (16, 32, 32) → coronal collapses Y (axis 1) → (16, 32) assert ds.shape == (16, 32) - def test_mip_sagittal_shape(self, schema, h5file): + def test_mip_sagittal_shape_3d(self, schema, h5file): data = _minimal_3d_data() schema.write(h5file, data) - ds = h5file["mip_sagittal"] + ds = h5file["mips/sagittal"] + # 3D (16, 32, 32) → sagittal collapses X (axis 2) → (16, 32) assert ds.shape == (16, 32) + def test_mip_axial_shape_3d(self, schema, h5file): + data = _minimal_3d_data() + schema.write(h5file, data) + ds = h5file["mips/axial"] + # 3D (16, 32, 32) → axial collapses Z (axis 0) → (32, 32) + assert ds.shape == (32, 32) + def test_mip_coronal_attrs(self, schema, h5file): data = _minimal_3d_data() schema.write(h5file, data) - ds = h5file["mip_coronal"] + ds = h5file["mips/coronal"] assert ds.attrs["projection_type"] == "mip" - assert ds.attrs["axis"] == 1 
assert "description" in ds.attrs def test_mip_sagittal_attrs(self, schema, h5file): data = _minimal_3d_data() schema.write(h5file, data) - ds = h5file["mip_sagittal"] + ds = h5file["mips/sagittal"] + assert ds.attrs["projection_type"] == "mip" + assert "description" in ds.attrs + + def test_mip_axial_attrs(self, schema, h5file): + data = _minimal_3d_data() + schema.write(h5file, data) + ds = h5file["mips/axial"] assert ds.attrs["projection_type"] == "mip" - assert ds.attrs["axis"] == 2 assert "description" in ds.attrs def test_mip_dtype_float32(self, schema, h5file): data = _minimal_3d_data() schema.write(h5file, data) - assert h5file["mip_coronal"].dtype == np.float32 - assert h5file["mip_sagittal"].dtype == np.float32 + assert h5file["mips/coronal"].dtype == np.float32 + assert h5file["mips/sagittal"].dtype == np.float32 + assert h5file["mips/axial"].dtype == np.float32 - def test_mip_4d_uses_summed_volume(self, schema, h5file): - data = _minimal_4d_data() + def test_mip_3d_coronal_values(self, schema, h5file): + data = _minimal_3d_data() + schema.write(h5file, data) + vol = data["volume"] + expected = vol.max(axis=1).astype(np.float32) + np.testing.assert_array_almost_equal(h5file["mips/coronal"][:], expected) + + def test_mip_3d_axial_values(self, schema, h5file): + data = _minimal_3d_data() schema.write(h5file, data) vol = data["volume"] - summed = vol.sum(axis=0) - expected_coronal = summed.max(axis=1) - np.testing.assert_array_almost_equal(h5file["mip_coronal"][:], expected_coronal) + expected = vol.max(axis=0).astype(np.float32) + np.testing.assert_array_almost_equal(h5file["mips/axial"][:], expected) # --------------------------------------------------------------------------- -# write() — mips_per_frame (optional, 4D+ only) +# write() — MIP N-D arrays for 4D+ data # --------------------------------------------------------------------------- -class TestWriteMipsPerFrame: - def test_mips_per_frame_created_for_4d(self, schema, h5file): - data = 
_minimal_4d_data() - data["mips_per_frame"] = True - schema.write(h5file, data) - assert "mips_per_frame" in h5file - assert isinstance(h5file["mips_per_frame"], h5py.Group) - - def test_mips_per_frame_coronal_shape(self, schema, h5file): +class TestWriteMIP4D: + def test_mip_4d_coronal_shape(self, schema, h5file): data = _minimal_4d_data() - data["mips_per_frame"] = True schema.write(h5file, data) - ds = h5file["mips_per_frame/coronal"] + ds = h5file["mips/coronal"] n_frames, z, y, x = data["volume"].shape + # 4D: coronal collapses Y (axis 2) → (T, Z, X) assert ds.shape == (n_frames, z, x) assert ds.dtype == np.float32 - def test_mips_per_frame_sagittal_shape(self, schema, h5file): + def test_mip_4d_sagittal_shape(self, schema, h5file): data = _minimal_4d_data() - data["mips_per_frame"] = True schema.write(h5file, data) - ds = h5file["mips_per_frame/sagittal"] + ds = h5file["mips/sagittal"] n_frames, z, y, x = data["volume"].shape + # 4D: sagittal collapses X (axis 3) → (T, Z, Y) assert ds.shape == (n_frames, z, y) assert ds.dtype == np.float32 - def test_mips_per_frame_coronal_attrs(self, schema, h5file): + def test_mip_4d_axial_shape(self, schema, h5file): data = _minimal_4d_data() - data["mips_per_frame"] = True schema.write(h5file, data) - ds = h5file["mips_per_frame/coronal"] - assert ds.attrs["projection_type"] == "mip" - assert ds.attrs["axis"] == 1 - assert ds.attrs["description"] == "Per-frame coronal MIPs" + ds = h5file["mips/axial"] + n_frames, z, y, x = data["volume"].shape + # 4D: axial collapses Z (axis 1) → (T, Y, X) + assert ds.shape == (n_frames, y, x) + assert ds.dtype == np.float32 - def test_mips_per_frame_sagittal_attrs(self, schema, h5file): + def test_mip_4d_coronal_values(self, schema, h5file): data = _minimal_4d_data() - data["mips_per_frame"] = True - schema.write(h5file, data) - ds = h5file["mips_per_frame/sagittal"] - assert ds.attrs["projection_type"] == "mip" - assert ds.attrs["axis"] == 2 - assert ds.attrs["description"] == 
"Per-frame sagittal MIPs" - - def test_mips_per_frame_not_created_for_3d(self, schema, h5file): - data = _minimal_3d_data() - data["mips_per_frame"] = True schema.write(h5file, data) - assert "mips_per_frame" not in h5file + vol = data["volume"] + # Coronal = max along Y (axis 2) + expected = vol.max(axis=2).astype(np.float32) + np.testing.assert_array_almost_equal(h5file["mips/coronal"][:], expected) - def test_mips_per_frame_absent_when_not_requested(self, schema, h5file): + def test_mip_4d_per_frame_coronal_matches(self, schema, h5file): data = _minimal_4d_data() schema.write(h5file, data) - assert "mips_per_frame" not in h5file + vol = data["volume"] + # Frame 0 coronal should match vol[0].max(axis=1) + expected_cor_0 = vol[0].max(axis=1).astype(np.float32) + np.testing.assert_array_almost_equal(h5file["mips/coronal"][0], expected_cor_0) - def test_mips_per_frame_values_match_manual(self, schema, h5file): + def test_mip_4d_axial_values(self, schema, h5file): data = _minimal_4d_data() - data["mips_per_frame"] = True schema.write(h5file, data) vol = data["volume"] - expected_cor_0 = vol[0].max(axis=1).astype(np.float32) - np.testing.assert_array_almost_equal( - h5file["mips_per_frame/coronal"][0], expected_cor_0 - ) + expected = vol.max(axis=1).astype(np.float32) + np.testing.assert_array_almost_equal(h5file["mips/axial"][:], expected) # --------------------------------------------------------------------------- @@ -779,9 +797,9 @@ def test_both_provenance_fields(self, schema, h5file): class TestJsonSchemaOptionalProperties: - def test_has_mips_per_frame_property(self, schema): + def test_has_mips_property(self, schema): result = schema.json_schema() - assert "mips_per_frame" in result["properties"] + assert "mips" in result["properties"] def test_has_frames_property(self, schema): result = schema.json_schema() @@ -798,7 +816,7 @@ def test_has_provenance_property(self, schema): def test_optional_properties_not_required(self, schema): result = schema.json_schema() 
required = result.get("required", []) - for prop in ("mips_per_frame", "frames", "device_data", "provenance"): + for prop in ("mips", "frames", "device_data", "provenance"): assert prop not in required