From ed25d7ea607a0e2452ce6a1a6d2ca87930382033 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kai=20M=C3=BChlbauer?= Date: Wed, 6 Aug 2025 12:56:16 +0200 Subject: [PATCH 1/4] Sanitize unlimited_dims when writing `to_netcdf`, raise ValueError when encoding is inconsistent with dataset. --- xarray/backends/api.py | 32 ++++++++++---- xarray/tests/test_backends.py | 83 +++++++++++++++++------------------ 2 files changed, 65 insertions(+), 50 deletions(-) diff --git a/xarray/backends/api.py b/xarray/backends/api.py index 328b7568cdd..909bc2c3d9b 100644 --- a/xarray/backends/api.py +++ b/xarray/backends/api.py @@ -240,6 +240,28 @@ def check_attr(name, value, valid_types): check_attr(k, v, valid_types) +def _sanitize_unlimited_dims(dataset, unlimited_dims): + msg_origin = "unlimited_dims-kwarg" + if unlimited_dims is None: + unlimited_dims = dataset.encoding.get("unlimited_dims", None) + msg_origin = "dataset.encoding" + if unlimited_dims is not None: + if isinstance(unlimited_dims, str) or not isinstance(unlimited_dims, Iterable): + unlimited_dims = [unlimited_dims] + else: + unlimited_dims = list(unlimited_dims) + dataset_dims = set(dataset.dims) + unlimited_dims = set(unlimited_dims) + if undeclared_dims := (unlimited_dims - dataset_dims): + msg = ( + f"Unlimited dimension(s) {undeclared_dims!r} declared in {msg_origin!r}, " + f"but not part of current dataset dimensions. " + f"Consider removing {undeclared_dims!r} from {msg_origin!r}." 
+ ) + raise ValueError(msg) + return unlimited_dims + + def _resolve_decoders_kwargs(decode_cf, open_backend_dataset_parameters, **decoders): for d in list(decoders): if decode_cf is False and d in open_backend_dataset_parameters: @@ -1942,6 +1964,8 @@ def to_netcdf( # validate Dataset keys, DataArray names, and attr keys/values _validate_dataset_names(dataset) _validate_attrs(dataset, engine, invalid_netcdf) + # sanitize unlimited_dims + unlimited_dims = _sanitize_unlimited_dims(dataset, unlimited_dims) try: store_open = WRITEABLE_STORES[engine] @@ -1976,14 +2000,6 @@ def to_netcdf( store = store_open(target, mode, format, group, **kwargs) - if unlimited_dims is None: - unlimited_dims = dataset.encoding.get("unlimited_dims", None) - if unlimited_dims is not None: - if isinstance(unlimited_dims, str) or not isinstance(unlimited_dims, Iterable): - unlimited_dims = [unlimited_dims] - else: - unlimited_dims = list(unlimited_dims) - writer = ArrayWriter() # TODO: figure out how to refactor this logic (here and in save_mfdataset) diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index 2fe57f51d65..aed0aad7752 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -1324,6 +1324,47 @@ def test_encoding_kwarg(self) -> None: with self.roundtrip(ds, save_kwargs=kwargs) as actual: pass + def test_encoding_unlimited_dims(self) -> None: + if isinstance(self, ZarrBase): + pytest.skip("No unlimited_dims handled in zarr.") + ds = Dataset({"x": ("y", np.arange(10.0))}) + with self.roundtrip(ds, save_kwargs=dict(unlimited_dims=["y"])) as actual: + assert actual.encoding["unlimited_dims"] == set("y") + assert_equal(ds, actual) + + # Regression test for https://github.com/pydata/xarray/issues/2134 + with self.roundtrip(ds, save_kwargs=dict(unlimited_dims="y")) as actual: + assert actual.encoding["unlimited_dims"] == set("y") + assert_equal(ds, actual) + + ds.encoding = {"unlimited_dims": ["y"]} + with self.roundtrip(ds) as actual: + 
assert actual.encoding["unlimited_dims"] == set("y") + assert_equal(ds, actual) + + # Regression test for https://github.com/pydata/xarray/issues/2134 + ds.encoding = {"unlimited_dims": "y"} + with self.roundtrip(ds) as actual: + assert actual.encoding["unlimited_dims"] == set("y") + assert_equal(ds, actual) + + # test unlimited_dims validation + # https://github.com/pydata/xarray/issues/10549 + ds.encoding = {"unlimited_dims": "z"} + with pytest.raises( + ValueError, + match=r"Unlimited dimension\(s\) .* declared in 'dataset.encoding'", + ): + with self.roundtrip(ds) as _: + pass + ds.encoding = {} + with pytest.raises( + ValueError, + match=r"Unlimited dimension\(s\) .* declared in 'unlimited_dims-kwarg'", + ): + with self.roundtrip(ds, save_kwargs=dict(unlimited_dims=["z"])) as _: + pass + def test_encoding_kwarg_dates(self) -> None: ds = Dataset({"t": pd.date_range("2000-01-01", periods=3)}) units = "days since 1900-01-01" @@ -1918,16 +1959,6 @@ def test_read_variable_len_strings(self) -> None: with open_dataset(tmp_file, **cast(dict, kwargs)) as actual: assert_identical(expected, actual) - def test_encoding_unlimited_dims(self) -> None: - ds = Dataset({"x": ("y", np.arange(10.0))}) - with self.roundtrip(ds, save_kwargs=dict(unlimited_dims=["y"])) as actual: - assert actual.encoding["unlimited_dims"] == set("y") - assert_equal(ds, actual) - ds.encoding = {"unlimited_dims": ["y"]} - with self.roundtrip(ds) as actual: - assert actual.encoding["unlimited_dims"] == set("y") - assert_equal(ds, actual) - def test_raise_on_forward_slashes_in_names(self) -> None: # test for forward slash in variable names and dimensions # see GH 7943 @@ -4243,28 +4274,6 @@ def test_cross_engine_read_write_netcdf3(self) -> None: for k in data.variables ] - def test_encoding_unlimited_dims(self) -> None: - ds = Dataset({"x": ("y", np.arange(10.0))}) - with self.roundtrip(ds, save_kwargs=dict(unlimited_dims=["y"])) as actual: - assert actual.encoding["unlimited_dims"] == set("y") - 
assert_equal(ds, actual) - - # Regression test for https://github.com/pydata/xarray/issues/2134 - with self.roundtrip(ds, save_kwargs=dict(unlimited_dims="y")) as actual: - assert actual.encoding["unlimited_dims"] == set("y") - assert_equal(ds, actual) - - ds.encoding = {"unlimited_dims": ["y"]} - with self.roundtrip(ds) as actual: - assert actual.encoding["unlimited_dims"] == set("y") - assert_equal(ds, actual) - - # Regression test for https://github.com/pydata/xarray/issues/2134 - ds.encoding = {"unlimited_dims": "y"} - with self.roundtrip(ds) as actual: - assert actual.encoding["unlimited_dims"] == set("y") - assert_equal(ds, actual) - @requires_h5netcdf @requires_netCDF4 @@ -4333,16 +4342,6 @@ def test_read_byte_attrs_as_unicode(self) -> None: expected = Dataset(attrs={"foo": "bar"}) assert_identical(expected, actual) - def test_encoding_unlimited_dims(self) -> None: - ds = Dataset({"x": ("y", np.arange(10.0))}) - with self.roundtrip(ds, save_kwargs=dict(unlimited_dims=["y"])) as actual: - assert actual.encoding["unlimited_dims"] == set("y") - assert_equal(ds, actual) - ds.encoding = {"unlimited_dims": ["y"]} - with self.roundtrip(ds) as actual: - assert actual.encoding["unlimited_dims"] == set("y") - assert_equal(ds, actual) - def test_compression_encoding_h5py(self) -> None: ENCODINGS: tuple[tuple[dict[str, Any], dict[str, Any]], ...] 
= ( # h5py style compression with gzip codec will be converted to From e4572ba673d19d8f8daa2ae28cfffd6b3b856c50 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kai=20M=C3=BChlbauer?= Date: Wed, 6 Aug 2025 14:38:45 +0200 Subject: [PATCH 2/4] deselect unlimited_dims for CFEncodedInMemoryStore --- xarray/tests/test_conventions.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/xarray/tests/test_conventions.py b/xarray/tests/test_conventions.py index 4b3729e2724..a958d5b3d19 100644 --- a/xarray/tests/test_conventions.py +++ b/xarray/tests/test_conventions.py @@ -606,6 +606,10 @@ def test_encoding_kwarg_fixed_width_string(self) -> None: # CFEncodedInMemoryStore doesn't support explicit string encodings. pass + def test_encoding_unlimited_dims(self) -> None: + # CFEncodedInMemoryStore doesn't support unlimited_dims. + pass + class TestDecodeCFVariableWithArrayUnits: def test_decode_cf_variable_with_array_units(self) -> None: From e8ecd598e1c73b648640ecca44f0454507780ade Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kai=20M=C3=BChlbauer?= Date: Wed, 6 Aug 2025 15:34:57 +0200 Subject: [PATCH 3/4] add whats-new.rst entry --- doc/whats-new.rst | 3 +++ 1 file changed, 3 insertions(+) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 06a3c2cb22d..e6a5cf1b2a7 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -46,6 +46,9 @@ Bug fixes By `Deepak Cherian `_. - Fix detection of the ``h5netcdf`` backend. Xarray now selects ``h5netcdf`` if the default ``netCDF4`` engine is not available (:issue:`10401`, :pull:`10557`). By `Scott Staniewicz `_. +- Ensure ``unlimited_dims` passed to :py:meth:`xarray.DataArray.to_netcdf`, :py:meth:`xarray.Dataset.to_netcdf` or :py:meth:`xarray.DataTree.to_netcdf` only contains dimensions present in the object; raise ``ValueError`` otherwise (:issue:`10549`, :pull:`10608`). + By `Kai Mühlbauer `_. 
+ Documentation ~~~~~~~~~~~~~ From 8bd0d0493a283718454a98826e04dedff47ab8e6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kai=20M=C3=BChlbauer?= Date: Mon, 11 Aug 2025 08:36:18 +0200 Subject: [PATCH 4/4] Update doc/whats-new.rst --- doc/whats-new.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index e5e410899d1..81d370766ab 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -50,7 +50,7 @@ Bug fixes By `Deepak Cherian `_. - Fix detection of the ``h5netcdf`` backend. Xarray now selects ``h5netcdf`` if the default ``netCDF4`` engine is not available (:issue:`10401`, :pull:`10557`). By `Scott Staniewicz `_. -- Ensure ``unlimited_dims` passed to :py:meth:`xarray.DataArray.to_netcdf`, :py:meth:`xarray.Dataset.to_netcdf` or :py:meth:`xarray.DataTree.to_netcdf` only contains dimensions present in the object; raise ``ValueError`` otherwise (:issue:`10549`, :pull:`10608`). +- Ensure ``unlimited_dims`` passed to :py:meth:`xarray.DataArray.to_netcdf`, :py:meth:`xarray.Dataset.to_netcdf` or :py:meth:`xarray.DataTree.to_netcdf` only contains dimensions present in the object; raise ``ValueError`` otherwise (:issue:`10549`, :pull:`10608`). By `Kai Mühlbauer `_.