Sanitize unlimited_dims when writing to_netcdf #10608


Merged · 6 commits · Aug 12, 2025
3 changes: 3 additions & 0 deletions doc/whats-new.rst
@@ -50,6 +50,9 @@ Bug fixes
By `Deepak Cherian <https://github.com/dcherian>`_.
- Fix detection of the ``h5netcdf`` backend. Xarray now selects ``h5netcdf`` if the default ``netCDF4`` engine is not available (:issue:`10401`, :pull:`10557`).
By `Scott Staniewicz <https://github.com/scottstanie>`_.
- Ensure ``unlimited_dims`` passed to :py:meth:`xarray.DataArray.to_netcdf`, :py:meth:`xarray.Dataset.to_netcdf` or :py:meth:`xarray.DataTree.to_netcdf` contains only dimensions present in the object; raise ``ValueError`` otherwise (:issue:`10549`, :pull:`10608`).
By `Kai Mühlbauer <https://github.com/kmuehlbauer>`_.


Documentation
~~~~~~~~~~~~~
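A minimal sketch of the user-facing change described in this whats-new entry (the file names here are illustrative):

```python
import numpy as np
import xarray as xr

ds = xr.Dataset({"x": ("y", np.arange(10.0))})

# "y" is a dimension of ds, so this is accepted and written as unlimited.
ds.to_netcdf("ok.nc", unlimited_dims=["y"])

# "z" is not a dimension of ds; with this change, to_netcdf raises
# ValueError up front instead of handing the bogus name to the backend.
ds.to_netcdf("bad.nc", unlimited_dims=["z"])  # raises ValueError
```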
32 changes: 24 additions & 8 deletions xarray/backends/api.py
@@ -244,6 +244,28 @@ def check_attr(name, value, valid_types):
check_attr(k, v, valid_types)


def _sanitize_unlimited_dims(dataset, unlimited_dims):
msg_origin = "unlimited_dims-kwarg"
if unlimited_dims is None:
unlimited_dims = dataset.encoding.get("unlimited_dims", None)
msg_origin = "dataset.encoding"
if unlimited_dims is not None:
if isinstance(unlimited_dims, str) or not isinstance(unlimited_dims, Iterable):
unlimited_dims = [unlimited_dims]
else:
unlimited_dims = list(unlimited_dims)
dataset_dims = set(dataset.dims)
unlimited_dims = set(unlimited_dims)
if undeclared_dims := (unlimited_dims - dataset_dims):
msg = (
f"Unlimited dimension(s) {undeclared_dims!r} declared in {msg_origin!r}, "
f"but not part of current dataset dimensions. "
f"Consider removing {undeclared_dims!r} from {msg_origin!r}."
)
raise ValueError(msg)
return unlimited_dims
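As a quick illustration of what the helper above accepts and rejects, here is a small sketch; importing the private function is for demonstration only:

```python
import numpy as np
import xarray as xr
from xarray.backends.api import _sanitize_unlimited_dims

ds = xr.Dataset({"x": ("y", np.arange(3.0))})

# A bare string is wrapped in a list, not iterated character by character.
assert _sanitize_unlimited_dims(ds, "y") == {"y"}

# None falls back to whatever dataset.encoding declares.
ds.encoding = {"unlimited_dims": ["y"]}
assert _sanitize_unlimited_dims(ds, None) == {"y"}

# Dimensions missing from the dataset raise, and the message names the
# origin of the bad value ('unlimited_dims-kwarg' or 'dataset.encoding').
_sanitize_unlimited_dims(ds, ["z"])  # raises ValueError
```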


def _resolve_decoders_kwargs(decode_cf, open_backend_dataset_parameters, **decoders):
for d in list(decoders):
if decode_cf is False and d in open_backend_dataset_parameters:
@@ -2007,6 +2029,8 @@ def to_netcdf(
# validate Dataset keys, DataArray names, and attr keys/values
_validate_dataset_names(dataset)
_validate_attrs(dataset, engine, invalid_netcdf)
# sanitize unlimited_dims
unlimited_dims = _sanitize_unlimited_dims(dataset, unlimited_dims)

try:
store_open = WRITEABLE_STORES[engine]
@@ -2045,14 +2069,6 @@

store = store_open(target, mode, format, group, **kwargs)

if unlimited_dims is None:
unlimited_dims = dataset.encoding.get("unlimited_dims", None)
if unlimited_dims is not None:
if isinstance(unlimited_dims, str) or not isinstance(unlimited_dims, Iterable):
unlimited_dims = [unlimited_dims]
else:
unlimited_dims = list(unlimited_dims)

writer = ArrayWriter()

# TODO: figure out how to refactor this logic (here and in save_mfdataset)
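One practical consequence of the diff above moving the normalization ahead of the store handling (a sketch; the path name is illustrative):

```python
import os
import numpy as np
import xarray as xr

ds = xr.Dataset({"x": ("y", np.arange(3.0))})
path = "never_created.nc"

try:
    ds.to_netcdf(path, unlimited_dims=["z"])  # "z" is not a dimension
except ValueError:
    pass

# The check now runs before the backend store is opened, so a failed
# write no longer creates or truncates the target file.
assert not os.path.exists(path)
```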
61 changes: 41 additions & 20 deletions xarray/tests/test_backends.py
@@ -1324,6 +1324,47 @@
with self.roundtrip(ds, save_kwargs=kwargs) as actual:
pass

def test_encoding_unlimited_dims(self) -> None:
if isinstance(self, ZarrBase):
pytest.skip("No unlimited_dims handled in zarr.")
ds = Dataset({"x": ("y", np.arange(10.0))})
with self.roundtrip(ds, save_kwargs=dict(unlimited_dims=["y"])) as actual:
assert actual.encoding["unlimited_dims"] == set("y")
assert_equal(ds, actual)

# Regression test for https://github.com/pydata/xarray/issues/2134
with self.roundtrip(ds, save_kwargs=dict(unlimited_dims="y")) as actual:
assert actual.encoding["unlimited_dims"] == set("y")
assert_equal(ds, actual)

ds.encoding = {"unlimited_dims": ["y"]}
with self.roundtrip(ds) as actual:
assert actual.encoding["unlimited_dims"] == set("y")
assert_equal(ds, actual)

# Regression test for https://github.com/pydata/xarray/issues/2134
ds.encoding = {"unlimited_dims": "y"}
with self.roundtrip(ds) as actual:
assert actual.encoding["unlimited_dims"] == set("y")
assert_equal(ds, actual)

# test unlimited_dims validation
# https://github.com/pydata/xarray/issues/10549
ds.encoding = {"unlimited_dims": "z"}
with pytest.raises(
ValueError,
match=r"Unlimited dimension\(s\) .* declared in 'dataset.encoding'",
):
with self.roundtrip(ds) as _:
pass
ds.encoding = {}
with pytest.raises(
ValueError,
match=r"Unlimited dimension\(s\) .* declared in 'unlimited_dims-kwarg'",
):
with self.roundtrip(ds, save_kwargs=dict(unlimited_dims=["z"])) as _:
pass

def test_encoding_kwarg_dates(self) -> None:
ds = Dataset({"t": pd.date_range("2000-01-01", periods=3)})
units = "days since 1900-01-01"
@@ -1918,16 +1959,6 @@
with open_dataset(tmp_file, **cast(dict, kwargs)) as actual:
assert_identical(expected, actual)

def test_encoding_unlimited_dims(self) -> None:
ds = Dataset({"x": ("y", np.arange(10.0))})
with self.roundtrip(ds, save_kwargs=dict(unlimited_dims=["y"])) as actual:
assert actual.encoding["unlimited_dims"] == set("y")
assert_equal(ds, actual)
ds.encoding = {"unlimited_dims": ["y"]}
with self.roundtrip(ds) as actual:
assert actual.encoding["unlimited_dims"] == set("y")
assert_equal(ds, actual)

def test_raise_on_forward_slashes_in_names(self) -> None:
# test for forward slash in variable names and dimensions
# see GH 7943
@@ -2343,7 +2374,7 @@
# Flaky test. Very open to contributions on fixing this
@pytest.mark.flaky
def test_roundtrip_coordinates(self) -> None:
super().test_roundtrip_coordinates()

(CI annotation at line 2377, GitHub Actions ubuntu-latest py3.13 flaky: TestNetCDF4ViaDaskData.test_roundtrip_coordinates Failed: Timeout (>180.0s) from pytest-timeout.)

@requires_cftime
def test_roundtrip_cftime_bnds(self):
@@ -4393,16 +4424,6 @@
expected = Dataset(attrs={"foo": "bar"})
assert_identical(expected, actual)

def test_encoding_unlimited_dims(self) -> None:
ds = Dataset({"x": ("y", np.arange(10.0))})
with self.roundtrip(ds, save_kwargs=dict(unlimited_dims=["y"])) as actual:
assert actual.encoding["unlimited_dims"] == set("y")
assert_equal(ds, actual)
ds.encoding = {"unlimited_dims": ["y"]}
with self.roundtrip(ds) as actual:
assert actual.encoding["unlimited_dims"] == set("y")
assert_equal(ds, actual)

def test_compression_encoding_h5py(self) -> None:
ENCODINGS: tuple[tuple[dict[str, Any], dict[str, Any]], ...] = (
# h5py style compression with gzip codec will be converted to
@@ -5726,7 +5747,7 @@
def test_dask_roundtrip(self) -> None:
with create_tmp_file() as tmp:
data = create_test_data()
data.to_netcdf(tmp)

(CI annotation at line 5750, GitHub Actions ubuntu-latest py3.13 flaky: TestDask.test_dask_roundtrip Failed: Timeout (>180.0s) from pytest-timeout.)
chunks = {"dim1": 4, "dim2": 4, "dim3": 4, "time": 10}
with open_dataset(tmp, chunks=chunks) as dask_ds:
assert_identical(data, dask_ds)
4 changes: 4 additions & 0 deletions xarray/tests/test_conventions.py
@@ -606,6 +606,10 @@ def test_encoding_kwarg_fixed_width_string(self) -> None:
# CFEncodedInMemoryStore doesn't support explicit string encodings.
pass

def test_encoding_unlimited_dims(self) -> None:
# CFEncodedInMemoryStore doesn't support unlimited_dims.
pass


class TestDecodeCFVariableWithArrayUnits:
def test_decode_cf_variable_with_array_units(self) -> None: