From 9f205f6a7dc37aea625661f0049e9ead246ed3f3 Mon Sep 17 00:00:00 2001 From: Julia Signell Date: Tue, 30 Dec 2025 10:54:18 -0500 Subject: [PATCH 01/18] Add an auto mechanism that doesn't split encoded chunks --- xarray/namedarray/daskmanager.py | 68 ++++++++++++++++++++++++++++++++ 1 file changed, 68 insertions(+) diff --git a/xarray/namedarray/daskmanager.py b/xarray/namedarray/daskmanager.py index eb01a150c18..b5739635ce1 100644 --- a/xarray/namedarray/daskmanager.py +++ b/xarray/namedarray/daskmanager.py @@ -43,6 +43,66 @@ def is_chunked_array(self, data: duckarray[Any, Any]) -> bool: def chunks(self, data: Any) -> _NormalizedChunks: return data.chunks # type: ignore[no-any-return] + def meta_chunks(chunks, shape, target, typesize, encoded_chunks): + """Determine meta chunks + + This takes in a chunks value that contains ``"auto"`` values in certain + dimensions and replaces those values with concrete dimension sizes that try + to get chunks to be of a certain size in bytes, provided by the ``limit=`` + keyword. If multiple dimensions are marked as ``"auto"`` then they will + all respond to get close to the byte target, while never splitting + ``encoded_chunks``. + + Parameters + ---------- + chunks: tuple + A tuple of either dimensions or tuples of explicit chunk dimensions + Some entries should be "auto". Any explicit dimensions must match or + be multiple of ``encoded_chunks`` + shape: tuple[int] + The + target: int + The target size of the chunk in bytes. + typesize: int + The size, in bytes, of each element of the chunk. 
+ encoded_chunks: tuple[int] + """ + shape = np.array(shape) + + # "auto" stays as "auto" + # empty tuple means match encoded chunks + # -1 means whole dim is in one chunk + desired_chunks = np.array( + [ + c or encoded_chunks[i] if c != -1 else shape[i] + for i, c in enumerate(chunks) + ] + ) + + auto_chunks = desired_chunks == "auto" + chunks = np.where(auto_chunks, np.array(encoded_chunks), desired_chunks).astype( + int + ) + + while True: + # Repeatedly loop over the ``encoded_chunks``, multiplying them by 2. + # Stop when: + # 1a. we are larger than the target chunk size OR + # 1b. we are within 50% of the target chunk size + + idx = np.argmax(shape / chunks * auto_chunks) + chunk_bytes = np.prod(chunks) * typesize + + if chunk_bytes > target or abs(chunk_bytes - target) / target < 0.5: + break + + if np.prod(chunks) == 1: + break # Element size larger than max_bytes + + chunks[idx] = chunks[idx] * 2 + + return tuple(int(x) for x in chunks) + def normalize_chunks( self, chunks: T_Chunks | _NormalizedChunks, @@ -54,6 +114,14 @@ def normalize_chunks( """Called by open_dataset""" from dask.array.core import normalize_chunks + chunks = self.meta_chunks( + chunks, + shape=shape, + target=128 * 1024 * 1024, + typesize=dtype.itemsize, + encoded_chunks=previous_chunks, + ) # type: ignore[no-untyped-call] + return normalize_chunks( chunks, shape=shape, From c3777ec07f197911ff58bf4b1ad984674d3b4be0 Mon Sep 17 00:00:00 2001 From: Julia Signell Date: Tue, 30 Dec 2025 10:58:12 -0500 Subject: [PATCH 02/18] Forgot self --- xarray/namedarray/daskmanager.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xarray/namedarray/daskmanager.py b/xarray/namedarray/daskmanager.py index b5739635ce1..d89e4a063ff 100644 --- a/xarray/namedarray/daskmanager.py +++ b/xarray/namedarray/daskmanager.py @@ -43,7 +43,7 @@ def is_chunked_array(self, data: duckarray[Any, Any]) -> bool: def chunks(self, data: Any) -> _NormalizedChunks: return data.chunks # type: 
ignore[no-any-return] - def meta_chunks(chunks, shape, target, typesize, encoded_chunks): + def meta_chunks(self, chunks, shape, target, typesize, encoded_chunks): """Determine meta chunks This takes in a chunks value that contains ``"auto"`` values in certain From 6cfdac5069b3094af8ddc2f8d704e03f8a65748c Mon Sep 17 00:00:00 2001 From: Julia Signell Date: Tue, 30 Dec 2025 15:31:59 -0500 Subject: [PATCH 03/18] Change from 'auto' to 'preserve' --- xarray/namedarray/daskmanager.py | 37 ++++++++++++++++++-------------- xarray/namedarray/utils.py | 2 +- 2 files changed, 22 insertions(+), 17 deletions(-) diff --git a/xarray/namedarray/daskmanager.py b/xarray/namedarray/daskmanager.py index d89e4a063ff..23673067882 100644 --- a/xarray/namedarray/daskmanager.py +++ b/xarray/namedarray/daskmanager.py @@ -49,9 +49,8 @@ def meta_chunks(self, chunks, shape, target, typesize, encoded_chunks): This takes in a chunks value that contains ``"auto"`` values in certain dimensions and replaces those values with concrete dimension sizes that try to get chunks to be of a certain size in bytes, provided by the ``limit=`` - keyword. If multiple dimensions are marked as ``"auto"`` then they will - all respond to get close to the byte target, while never splitting - ``encoded_chunks``. + keyword. Any dimensions marked as ``"auto"`` will potentially be multiplied + to get close to the byte target, while never splitting ``encoded_chunks``. Parameters ---------- @@ -79,7 +78,7 @@ def meta_chunks(self, chunks, shape, target, typesize, encoded_chunks): ] ) - auto_chunks = desired_chunks == "auto" + auto_chunks = desired_chunks == "preserve" chunks = np.where(auto_chunks, np.array(encoded_chunks), desired_chunks).astype( int ) @@ -88,18 +87,20 @@ def meta_chunks(self, chunks, shape, target, typesize, encoded_chunks): # Repeatedly loop over the ``encoded_chunks``, multiplying them by 2. # Stop when: # 1a. we are larger than the target chunk size OR - # 1b. 
we are within 50% of the target chunk size + # 1b. we are within 50% of the target chunk size OR + # 2. the size of the auto chunks matches the shape of the array - idx = np.argmax(shape / chunks * auto_chunks) + num_chunks = shape / chunks * auto_chunks + idx = np.argmax(num_chunks) chunk_bytes = np.prod(chunks) * typesize if chunk_bytes > target or abs(chunk_bytes - target) / target < 0.5: break - if np.prod(chunks) == 1: - break # Element size larger than max_bytes + if (num_chunks <= 1).all(): + break - chunks[idx] = chunks[idx] * 2 + chunks[idx] = min(chunks[idx] * 2, shape[idx]) return tuple(int(x) for x in chunks) @@ -114,13 +115,17 @@ def normalize_chunks( """Called by open_dataset""" from dask.array.core import normalize_chunks - chunks = self.meta_chunks( - chunks, - shape=shape, - target=128 * 1024 * 1024, - typesize=dtype.itemsize, - encoded_chunks=previous_chunks, - ) # type: ignore[no-untyped-call] + if any(c == "preserve" for c in chunks) and any(c == "auto" for c in chunks): + raise ValueError('chunks cannot use a combination of "auto" and "preserve"') + + if previous_chunks and any(c == "preserve" for c in chunks): + chunks = self.meta_chunks( + chunks, + shape=shape, + target=96 * 1024 * 1024, + typesize=dtype.itemsize, + encoded_chunks=previous_chunks, + ) # type: ignore[no-untyped-call] return normalize_chunks( chunks, diff --git a/xarray/namedarray/utils.py b/xarray/namedarray/utils.py index 3490a76aa8d..26d99d7d40a 100644 --- a/xarray/namedarray/utils.py +++ b/xarray/namedarray/utils.py @@ -222,7 +222,7 @@ def _get_chunk( # type: ignore[no-untyped-def] preferred_chunk_shape = tuple( itertools.starmap(preferred_chunks.get, zip(dims, shape, strict=True)) ) - if isinstance(chunks, Number) or (chunks == "auto"): + if isinstance(chunks, (Number, str)): chunks = dict.fromkeys(dims, chunks) chunk_shape = tuple( chunks.get(dim, None) or preferred_chunk_sizes From 95b6fddbfba7fc30349f571a584dcfa8f3107363 Mon Sep 17 00:00:00 2001 From: Julia Signell 
Date: Tue, 30 Dec 2025 15:34:01 -0500 Subject: [PATCH 04/18] Make sure api allows chunks='preserve' --- xarray/backends/api.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/xarray/backends/api.py b/xarray/backends/api.py index fd992f3e5d8..29236548df6 100644 --- a/xarray/backends/api.py +++ b/xarray/backends/api.py @@ -292,9 +292,9 @@ def _dataset_from_backend_dataset( create_default_indexes, **extra_tokens, ): - if not isinstance(chunks, int | dict) and chunks not in {None, "auto"}: + if not isinstance(chunks, int | dict) and chunks not in {None, "auto", "preserve"}: raise ValueError( - f"chunks must be an int, dict, 'auto', or None. Instead found {chunks}." + f"chunks must be an int, dict, 'auto', 'preserve', or None. Instead found {chunks}." ) _protect_dataset_variables_inplace(backend_ds, cache) From 9c234b6e812ff29a47e12ae6af3fa2f63b529a8a Mon Sep 17 00:00:00 2001 From: Julia Signell Date: Tue, 30 Dec 2025 15:59:54 -0500 Subject: [PATCH 05/18] Add types --- xarray/namedarray/_typing.py | 2 +- xarray/namedarray/daskmanager.py | 52 ++++++++++++++++++-------------- 2 files changed, 31 insertions(+), 23 deletions(-) diff --git a/xarray/namedarray/_typing.py b/xarray/namedarray/_typing.py index 9610b96d4f9..f9227f7796a 100644 --- a/xarray/namedarray/_typing.py +++ b/xarray/namedarray/_typing.py @@ -74,7 +74,7 @@ def dtype(self) -> _DType_co: ... _NormalizedChunks = tuple[tuple[int, ...], ...] # FYI in some cases we don't allow `None`, which this doesn't take account of. # # FYI the `str` is for a size string, e.g. "16MB", supported by dask. -T_ChunkDim: TypeAlias = str | int | Literal["auto"] | tuple[int, ...] | None # noqa: PYI051 +T_ChunkDim: TypeAlias = str | int | Literal["auto", "preserve"] | tuple[int, ...] 
| None # noqa: PYI051 # We allow the tuple form of this (though arguably we could transition to named dims only) T_Chunks: TypeAlias = T_ChunkDim | Mapping[Any, T_ChunkDim] diff --git a/xarray/namedarray/daskmanager.py b/xarray/namedarray/daskmanager.py index 23673067882..c7bf1230003 100644 --- a/xarray/namedarray/daskmanager.py +++ b/xarray/namedarray/daskmanager.py @@ -43,54 +43,62 @@ def is_chunked_array(self, data: duckarray[Any, Any]) -> bool: def chunks(self, data: Any) -> _NormalizedChunks: return data.chunks # type: ignore[no-any-return] - def meta_chunks(self, chunks, shape, target, typesize, encoded_chunks): + def preserve_chunks( + self, + chunks: T_Chunks, + shape: tuple[int, ...], + target: int, + typesize: int, + previous_chunks: tuple[int], + ) -> tuple[int]: """Determine meta chunks - This takes in a chunks value that contains ``"auto"`` values in certain + This takes in a chunks value that contains ``"preserve"`` values in certain dimensions and replaces those values with concrete dimension sizes that try to get chunks to be of a certain size in bytes, provided by the ``limit=`` - keyword. Any dimensions marked as ``"auto"`` will potentially be multiplied - to get close to the byte target, while never splitting ``encoded_chunks``. + keyword. Any dimensions marked as ``"preserve"`` will potentially be multiplied + to get close to the byte target, while never splitting ``previous_chunks``. Parameters ---------- - chunks: tuple + chunks: tuple[int | str | tuple, ...] A tuple of either dimensions or tuples of explicit chunk dimensions - Some entries should be "auto". Any explicit dimensions must match or - be multiple of ``encoded_chunks`` + Some entries should be "preserve". Any explicit dimensions must match or + be multiple of ``previous_chunks`` shape: tuple[int] - The + The shape of the array target: int The target size of the chunk in bytes. typesize: int The size, in bytes, of each element of the chunk. 
- encoded_chunks: tuple[int] + previous_chunks: tuple[int] + Size of chunks being preserved. Expressed as a tuple of ints which matches + the way chunks are encoded in Zarr. """ shape = np.array(shape) + previous_chunks = np.array(previous_chunks) - # "auto" stays as "auto" - # empty tuple means match encoded chunks + # "preserve" stays as "preserve" + # empty tuple means match previous chunks # -1 means whole dim is in one chunk desired_chunks = np.array( [ - c or encoded_chunks[i] if c != -1 else shape[i] + c or previous_chunks[i] if c != -1 else shape[i] for i, c in enumerate(chunks) ] ) - auto_chunks = desired_chunks == "preserve" - chunks = np.where(auto_chunks, np.array(encoded_chunks), desired_chunks).astype( - int - ) + preserve_chunks = desired_chunks == "preserve" + chunks = np.where(preserve_chunks, previous_chunks, desired_chunks).astype(int) while True: - # Repeatedly loop over the ``encoded_chunks``, multiplying them by 2. + # Repeatedly loop over the ``previous_chunks``, multiplying them by 2. # Stop when: # 1a. we are larger than the target chunk size OR # 1b. we are within 50% of the target chunk size OR - # 2. the size of the auto chunks matches the shape of the array + # 2. 
the chunk covers the entire array - num_chunks = shape / chunks * auto_chunks + num_chunks = shape / chunks * preserve_chunks idx = np.argmax(num_chunks) chunk_bytes = np.prod(chunks) * typesize @@ -119,13 +127,13 @@ def normalize_chunks( raise ValueError('chunks cannot use a combination of "auto" and "preserve"') if previous_chunks and any(c == "preserve" for c in chunks): - chunks = self.meta_chunks( + chunks = self.preserve_chunks( chunks, shape=shape, target=96 * 1024 * 1024, typesize=dtype.itemsize, - encoded_chunks=previous_chunks, - ) # type: ignore[no-untyped-call] + previous_chunks=previous_chunks, + ) return normalize_chunks( chunks, From e1d392df80bb8edf6cb34d9ef00d400f076ac662 Mon Sep 17 00:00:00 2001 From: Julia Signell Date: Wed, 31 Dec 2025 06:40:55 -0500 Subject: [PATCH 06/18] Refactor and add test * Move ``preserve_chunks`` to base ChunkManager class * Get target size from dask config options for DaskManager * Add test for open_zarr --- xarray/namedarray/daskmanager.py | 71 +---------------------------- xarray/namedarray/parallelcompat.py | 69 ++++++++++++++++++++++++++++ xarray/tests/test_backends.py | 38 +++++++++++++++ 3 files changed, 108 insertions(+), 70 deletions(-) diff --git a/xarray/namedarray/daskmanager.py b/xarray/namedarray/daskmanager.py index c7bf1230003..955a58b8a69 100644 --- a/xarray/namedarray/daskmanager.py +++ b/xarray/namedarray/daskmanager.py @@ -43,75 +43,6 @@ def is_chunked_array(self, data: duckarray[Any, Any]) -> bool: def chunks(self, data: Any) -> _NormalizedChunks: return data.chunks # type: ignore[no-any-return] - def preserve_chunks( - self, - chunks: T_Chunks, - shape: tuple[int, ...], - target: int, - typesize: int, - previous_chunks: tuple[int], - ) -> tuple[int]: - """Determine meta chunks - - This takes in a chunks value that contains ``"preserve"`` values in certain - dimensions and replaces those values with concrete dimension sizes that try - to get chunks to be of a certain size in bytes, provided by the 
``limit=`` - keyword. Any dimensions marked as ``"preserve"`` will potentially be multiplied - to get close to the byte target, while never splitting ``previous_chunks``. - - Parameters - ---------- - chunks: tuple[int | str | tuple, ...] - A tuple of either dimensions or tuples of explicit chunk dimensions - Some entries should be "preserve". Any explicit dimensions must match or - be multiple of ``previous_chunks`` - shape: tuple[int] - The shape of the array - target: int - The target size of the chunk in bytes. - typesize: int - The size, in bytes, of each element of the chunk. - previous_chunks: tuple[int] - Size of chunks being preserved. Expressed as a tuple of ints which matches - the way chunks are encoded in Zarr. - """ - shape = np.array(shape) - previous_chunks = np.array(previous_chunks) - - # "preserve" stays as "preserve" - # empty tuple means match previous chunks - # -1 means whole dim is in one chunk - desired_chunks = np.array( - [ - c or previous_chunks[i] if c != -1 else shape[i] - for i, c in enumerate(chunks) - ] - ) - - preserve_chunks = desired_chunks == "preserve" - chunks = np.where(preserve_chunks, previous_chunks, desired_chunks).astype(int) - - while True: - # Repeatedly loop over the ``previous_chunks``, multiplying them by 2. - # Stop when: - # 1a. we are larger than the target chunk size OR - # 1b. we are within 50% of the target chunk size OR - # 2. 
the chunk covers the entire array - - num_chunks = shape / chunks * preserve_chunks - idx = np.argmax(num_chunks) - chunk_bytes = np.prod(chunks) * typesize - - if chunk_bytes > target or abs(chunk_bytes - target) / target < 0.5: - break - - if (num_chunks <= 1).all(): - break - - chunks[idx] = min(chunks[idx] * 2, shape[idx]) - - return tuple(int(x) for x in chunks) - def normalize_chunks( self, chunks: T_Chunks | _NormalizedChunks, @@ -130,7 +61,7 @@ def normalize_chunks( chunks = self.preserve_chunks( chunks, shape=shape, - target=96 * 1024 * 1024, + target=self.get_auto_chunk_size(), typesize=dtype.itemsize, previous_chunks=previous_chunks, ) diff --git a/xarray/namedarray/parallelcompat.py b/xarray/namedarray/parallelcompat.py index 8a68f5e9562..f55009a7017 100644 --- a/xarray/namedarray/parallelcompat.py +++ b/xarray/namedarray/parallelcompat.py @@ -784,3 +784,72 @@ def get_auto_chunk_size( raise NotImplementedError( "For 'auto' rechunking of cftime arrays, get_auto_chunk_size must be implemented by the chunk manager" ) + + @staticmethod + def preserve_chunks( + chunks: T_Chunks, + shape: tuple[int, ...], + target: int, + typesize: int, + previous_chunks: tuple[int], + ) -> tuple[int]: + """Determine meta chunks + + This takes in a chunks value that contains ``"preserve"`` values in certain + dimensions and replaces those values with concrete dimension sizes that try + to get chunks to be of a certain size in bytes, provided by the ``target`` + argument. Any dimensions marked as ``"preserve"`` will potentially be multiplied + to get close to the byte target, while never splitting ``previous_chunks``. + + Parameters + ---------- + chunks: tuple[int | str | tuple, ...] + A tuple of either dimensions or tuples of explicit chunk dimensions + Some entries should be "preserve". Any explicit dimensions must match or + be a multiple of ``previous_chunks`` + shape: tuple[int] + The shape of the array + target: int + The target size of the chunk in bytes.
+ typesize: int + The size, in bytes, of each element of the chunk. + previous_chunks: tuple[int] + Size of chunks being preserved. Expressed as a tuple of ints which matches + the way chunks are encoded in Zarr. + """ + shape = np.array(shape) + previous_chunks = np.array(previous_chunks) + + # "preserve" stays as "preserve" + # empty tuple means match previous chunks + # -1 means whole dim is in one chunk + desired_chunks = np.array( + [ + c or previous_chunks[i] if c != -1 else shape[i] + for i, c in enumerate(chunks) + ] + ) + + preserve_chunks = desired_chunks == "preserve" + chunks = np.where(preserve_chunks, previous_chunks, desired_chunks).astype(int) + + while True: + # Repeatedly loop over the ``previous_chunks``, multiplying them by 2. + # Stop when: + # 1a. we are larger than the target chunk size OR + # 1b. we are within 50% of the target chunk size OR + # 2. the chunk covers the entire array + + num_chunks = shape / chunks * preserve_chunks + idx = np.argmax(num_chunks) + chunk_bytes = np.prod(chunks) * typesize + + if chunk_bytes > target or abs(chunk_bytes - target) / target < 0.5: + break + + if (num_chunks <= 1).all(): + break + + chunks[idx] = min(chunks[idx] * 2, shape[idx]) + + return tuple(int(x) for x in chunks) diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index e42bfc2cd9f..3a197cfedc5 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -7434,6 +7434,44 @@ def test_chunking_consistency(chunks, tmp_path: Path) -> None: xr.testing.assert_chunks_equal(actual, expected) +@requires_zarr +@requires_dask +@pytest.mark.parametrize( + "chunks,expected", + [ + ("preserve", (320, 320)), + (-1, (500, 500)), + ({}, (10, 10)), + ({"x": "preserve"}, (500, 10)), + ({"x": -1}, (500, 10)), + ({"x": "preserve", "y": -1}, (160, 500)), + ], +) +def test_open_dataset_chunking_zarr_with_preserve( + chunks, expected, tmp_path: Path +) -> None: + encoded_chunks = 10 + dask_arr = da.from_array( + 
np.ones((500, 500), dtype="float64"), chunks=encoded_chunks + ) + ds = xr.Dataset( + { + "test": xr.DataArray( + dask_arr, + dims=("x", "y"), + ) + } + ) + ds["test"].encoding["chunks"] = encoded_chunks + ds.to_zarr(tmp_path / "test.zarr") + + with dask.config.set({"array.chunk-size": "1MiB"}): + with open_dataset( + tmp_path / "test.zarr", engine="zarr", chunks=chunks + ) as actual: + assert (actual.chunks["x"][0], actual.chunks["y"][0]) == expected + + def _check_guess_can_open_and_open(entrypoint, obj, engine, expected): assert entrypoint.guess_can_open(obj) with open_dataset(obj, engine=engine) as actual: From 882c1739858c51168534327fbe9769c6f6928e0e Mon Sep 17 00:00:00 2001 From: Julia Signell Date: Wed, 31 Dec 2025 08:49:47 -0500 Subject: [PATCH 07/18] Add hypothesis testing --- properties/test_parallelcompat.py | 71 +++++++++++++++++++++++++++++++ xarray/testing/strategies.py | 68 +++++++++++++++++++++++++++++ 2 files changed, 139 insertions(+) create mode 100644 properties/test_parallelcompat.py diff --git a/properties/test_parallelcompat.py b/properties/test_parallelcompat.py new file mode 100644 index 00000000000..91325578619 --- /dev/null +++ b/properties/test_parallelcompat.py @@ -0,0 +1,71 @@ +import numpy as np +import pytest + +pytest.importorskip("hypothesis") +# isort: split + +from hypothesis import given + +import xarray.testing.strategies as xrst +from xarray.namedarray.parallelcompat import ChunkManagerEntrypoint + + +class TestPreserveChunks: + @given(xrst.shape_and_chunks()) + def test_preserve_all_chunks( + self, shape_and_chunks: tuple[tuple[int, ...], tuple[int, ...]] + ) -> None: + shape, previous_chunks = shape_and_chunks + typesize = 8 + target = 1024 * 1024 + + actual = ChunkManagerEntrypoint.preserve_chunks( + chunks=("preserve",) * len(shape), + shape=shape, + target=target, + typesize=typesize, + previous_chunks=previous_chunks, + ) + for i, chunk in enumerate(actual): + if chunk != shape[i]: + assert chunk >= previous_chunks[i] + 
assert chunk % previous_chunks[i] == 0 + assert chunk <= shape[i] + + if actual != shape: + assert np.prod(actual) * typesize >= 0.5 * target + + @pytest.mark.parametrize("first_chunk", [-1, (), 1]) + @given(xrst.shape_and_chunks(min_dims=2)) + def test_preserve_some_chunks( + self, + first_chunk: int | tuple[int, ...], + shape_and_chunks: tuple[tuple[int, ...], tuple[int, ...]], + ) -> None: + shape, previous_chunks = shape_and_chunks + typesize = 4 + target = 2 * 1024 * 1024 + + actual = ChunkManagerEntrypoint.preserve_chunks( + chunks=(first_chunk, *["preserve" for _ in range(len(shape) - 1)]), + shape=shape, + target=target, + typesize=typesize, + previous_chunks=previous_chunks, + ) + for i, chunk in enumerate(actual): + if i == 0: + if first_chunk == 1: + assert chunk == 1 + elif first_chunk == -1: + assert chunk == shape[i] + elif first_chunk == (): + assert chunk == previous_chunks[i] + elif chunk != shape[i]: + assert chunk >= previous_chunks[i] + assert chunk % previous_chunks[i] == 0 + assert chunk <= shape[i] + + # if we have more than one chunk, make sure the chunks are big enough + if actual[1:] != shape[1:]: + assert np.prod(actual) * typesize >= 0.5 * target diff --git a/xarray/testing/strategies.py b/xarray/testing/strategies.py index 9f6bb8110e8..af974a144be 100644 --- a/xarray/testing/strategies.py +++ b/xarray/testing/strategies.py @@ -31,6 +31,7 @@ "names", "outer_array_indexers", "pandas_index_dtypes", + "shape_and_chunks", "supported_dtypes", "unique_subset_of", "variables", @@ -210,6 +211,73 @@ def dimension_sizes( ) +@st.composite +def shape_and_chunks( + draw: st.DrawFn, + *, + min_dims: int = 1, + max_dims: int = 4, + min_size: int = 1, + max_size: int = 900, +) -> tuple[tuple[int, ...], tuple[int, ...]]: + """ + Generate a shape tuple and corresponding chunks tuple. + + Each element in the chunks tuple is smaller than or equal to the + corresponding element in the shape tuple. + + Requires the hypothesis package to be installed. 
+ + Parameters + ---------- + min_dims : int, optional + Minimum number of dimensions. Default is 1. + max_dims : int, optional + Maximum number of dimensions. Default is 4. + min_size : int, optional + Minimum size for each dimension. Default is 1. + max_size : int, optional + Maximum size for each dimension. Default is 900. + + Returns + ------- + tuple[tuple[int, ...], tuple[int, ...]] + A tuple containing (shape, chunks) where: + - shape is a tuple of positive integers + - chunks is a tuple where each element is an integer <= corresponding shape element + + Examples + -------- + >>> shape_and_chunks().example() # doctest: +SKIP + ((5, 3, 8), (2, 3, 4)) + >>> shape_and_chunks().example() # doctest: +SKIP + ((10, 7), (10, 3)) + >>> shape_and_chunks(min_dims=2, max_dims=3).example() # doctest: +SKIP + ((4, 6, 2), (2, 3, 1)) + + See Also + -------- + :ref:`testing.hypothesis`_ + """ + # Generate the shape tuple + ndim = draw(st.integers(min_value=min_dims, max_value=max_dims)) + shape = draw( + st.tuples( + *[st.integers(min_value=min_size, max_value=max_size) for _ in range(ndim)] + ) + ) + + # Generate chunks tuple with each element <= corresponding shape element + chunks_elements = [] + for size in shape: + # Each chunk is an integer between 1 and the size of that dimension + chunk_element = draw(st.integers(min_value=1, max_value=size)) + chunks_elements.append(chunk_element) + + chunks = tuple(chunks_elements) + return shape, chunks + + _readable_strings = st.text( _readable_characters, max_size=5, From 02c3e15bce739a9a71f45528c5f83e234262011f Mon Sep 17 00:00:00 2001 From: Julia Signell Date: Wed, 31 Dec 2025 09:04:39 -0500 Subject: [PATCH 08/18] Tidy up strategy --- xarray/testing/strategies.py | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/xarray/testing/strategies.py b/xarray/testing/strategies.py index af974a144be..9f0a7080936 100644 --- a/xarray/testing/strategies.py +++ b/xarray/testing/strategies.py @@ -266,15 +266,10 @@
def shape_and_chunks( *[st.integers(min_value=min_size, max_value=max_size) for _ in range(ndim)] ) ) - # Generate chunks tuple with each element <= corresponding shape element - chunks_elements = [] - for size in shape: - # Each chunk is an integer between 1 and the size of that dimension - chunk_element = draw(st.integers(min_value=1, max_value=size)) - chunks_elements.append(chunk_element) - - chunks = tuple(chunks_elements) + chunks = draw( + st.tuples(*[st.integers(min_value=1, max_value=size) for size in shape]) + ) return shape, chunks From a0bcd7b347dd137bc09b32bcd625a6106ed562e9 Mon Sep 17 00:00:00 2001 From: Julia Signell Date: Wed, 31 Dec 2025 13:17:18 -0500 Subject: [PATCH 09/18] Fix up typing --- xarray/namedarray/daskmanager.py | 9 ++++---- xarray/namedarray/parallelcompat.py | 34 ++++++++++++++++++----------- 2 files changed, 26 insertions(+), 17 deletions(-) diff --git a/xarray/namedarray/daskmanager.py b/xarray/namedarray/daskmanager.py index 955a58b8a69..6f6a046b684 100644 --- a/xarray/namedarray/daskmanager.py +++ b/xarray/namedarray/daskmanager.py @@ -11,6 +11,7 @@ if TYPE_CHECKING: from xarray.namedarray._typing import ( + T_ChunkDim, T_Chunks, _DType_co, _NormalizedChunks, @@ -45,11 +46,11 @@ def chunks(self, data: Any) -> _NormalizedChunks: def normalize_chunks( self, - chunks: T_Chunks | _NormalizedChunks, + chunks: tuple[T_ChunkDim, ...] | _NormalizedChunks, shape: tuple[int, ...] | None = None, limit: int | None = None, dtype: _DType_co | None = None, - previous_chunks: _NormalizedChunks | None = None, + previous_chunks: tuple[int, ...] 
| _NormalizedChunks | None = None, ) -> Any: """Called by open_dataset""" from dask.array.core import normalize_chunks @@ -57,12 +58,12 @@ def normalize_chunks( if any(c == "preserve" for c in chunks) and any(c == "auto" for c in chunks): raise ValueError('chunks cannot use a combination of "auto" and "preserve"') - if previous_chunks and any(c == "preserve" for c in chunks): + if shape and previous_chunks and any(c == "preserve" for c in chunks): chunks = self.preserve_chunks( chunks, shape=shape, target=self.get_auto_chunk_size(), - typesize=dtype.itemsize, + typesize=getattr(dtype, "itemsize", 8), previous_chunks=previous_chunks, ) diff --git a/xarray/namedarray/parallelcompat.py b/xarray/namedarray/parallelcompat.py index f55009a7017..7aa3448959a 100644 --- a/xarray/namedarray/parallelcompat.py +++ b/xarray/namedarray/parallelcompat.py @@ -20,6 +20,7 @@ if TYPE_CHECKING: from xarray.namedarray._typing import ( + T_ChunkDim, T_Chunks, _Chunks, _DType, @@ -787,12 +788,12 @@ def get_auto_chunk_size( @staticmethod def preserve_chunks( - chunks: T_Chunks, + chunks: tuple[T_ChunkDim, ...], shape: tuple[int, ...], target: int, typesize: int, - previous_chunks: tuple[int], - ) -> tuple[int]: + previous_chunks: tuple[int, ...] | _NormalizedChunks, + ) -> tuple[T_ChunkDim, ...]: """Determine meta chunks This takes in a chunks value that contains ``"preserve"`` values in certain @@ -817,21 +818,28 @@ def preserve_chunks( Size of chunks being preserved. Expressed as a tuple of ints which matches the way chunks are encoded in Zarr. 
""" - shape = np.array(shape) - previous_chunks = np.array(previous_chunks) + # pop the first item off in case it's a tuple of tuples + preferred_chunks = np.array( + [c if isinstance(c, int) else c[0] for c in previous_chunks] + ) # "preserve" stays as "preserve" - # empty tuple means match previous chunks + # None or empty tuple means match previous chunks # -1 means whole dim is in one chunk desired_chunks = np.array( [ - c or previous_chunks[i] if c != -1 else shape[i] + c or preferred_chunks[i] if c != -1 else shape[i] for i, c in enumerate(chunks) ] ) - preserve_chunks = desired_chunks == "preserve" - chunks = np.where(preserve_chunks, previous_chunks, desired_chunks).astype(int) + + if not preserve_chunks.any(): + return chunks + + new_chunks = np.where(preserve_chunks, preferred_chunks, desired_chunks).astype( + int + ) while True: # Repeatedly loop over the ``previous_chunks``, multiplying them by 2. @@ -840,9 +848,9 @@ def preserve_chunks( # 1b. we are within 50% of the target chunk size OR # 2. 
the chunk covers the entire array - num_chunks = shape / chunks * preserve_chunks + num_chunks = np.array(shape) / new_chunks * preserve_chunks idx = np.argmax(num_chunks) - chunk_bytes = np.prod(chunks) * typesize + chunk_bytes = np.prod(new_chunks) * typesize if chunk_bytes > target or abs(chunk_bytes - target) / target < 0.5: break @@ -850,6 +858,6 @@ def preserve_chunks( if (num_chunks <= 1).all(): break - chunks[idx] = min(chunks[idx] * 2, shape[idx]) + new_chunks[idx] = min(new_chunks[idx] * 2, shape[idx]) - return tuple(int(x) for x in chunks) + return tuple(int(x) for x in new_chunks) From 70c4189b4211582d35562f89f135d1f288a846e8 Mon Sep 17 00:00:00 2001 From: Julia Signell Date: Wed, 31 Dec 2025 13:45:52 -0500 Subject: [PATCH 10/18] Move `preserve_chunks` call out of `normalize_chunks` --- xarray/namedarray/daskmanager.py | 12 ------------ xarray/namedarray/utils.py | 14 ++++++++++++++ 2 files changed, 14 insertions(+), 12 deletions(-) diff --git a/xarray/namedarray/daskmanager.py b/xarray/namedarray/daskmanager.py index 6f6a046b684..323beb6a37e 100644 --- a/xarray/namedarray/daskmanager.py +++ b/xarray/namedarray/daskmanager.py @@ -55,18 +55,6 @@ def normalize_chunks( """Called by open_dataset""" from dask.array.core import normalize_chunks - if any(c == "preserve" for c in chunks) and any(c == "auto" for c in chunks): - raise ValueError('chunks cannot use a combination of "auto" and "preserve"') - - if shape and previous_chunks and any(c == "preserve" for c in chunks): - chunks = self.preserve_chunks( - chunks, - shape=shape, - target=self.get_auto_chunk_size(), - typesize=getattr(dtype, "itemsize", 8), - previous_chunks=previous_chunks, - ) - return normalize_chunks( chunks, shape=shape, diff --git a/xarray/namedarray/utils.py b/xarray/namedarray/utils.py index 26d99d7d40a..2a997b4a831 100644 --- a/xarray/namedarray/utils.py +++ b/xarray/namedarray/utils.py @@ -236,6 +236,20 @@ def _get_chunk( # type: ignore[no-untyped-def] limit = None dtype = 
data.dtype + if any(c == "preserve" for c in chunk_shape) and any( + c == "auto" for c in chunk_shape + ): + raise ValueError('chunks cannot use a combination of "auto" and "preserve"') + + if shape and preferred_chunk_shape and any(c == "preserve" for c in chunk_shape): + chunk_shape = chunkmanager.preserve_chunks( + chunk_shape, + shape=shape, + target=chunkmanager.get_auto_chunk_size(), + typesize=getattr(dtype, "itemsize", 8), + previous_chunks=preferred_chunk_shape, + ) + chunk_shape = chunkmanager.normalize_chunks( chunk_shape, shape=shape, From a16707577c8757053da7e903665de6094b661b2c Mon Sep 17 00:00:00 2001 From: Julia Signell Date: Wed, 11 Mar 2026 17:18:43 -0400 Subject: [PATCH 11/18] Add docs --- doc/whats-new.rst | 4 ++++ xarray/backends/api.py | 22 +++++++++++++++++----- xarray/backends/zarr.py | 5 ++++- 3 files changed, 25 insertions(+), 6 deletions(-) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 93f335e625b..dcbc76cf2ec 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -14,6 +14,10 @@ v2026.05.0 (unreleased) New Features ~~~~~~~~~~~~ +- Adds a new option ``chunks="preserve"`` when opening a dataset. This option + guarantees that chunks in xarray match on-disk chunks or multiples of them. + No chunk splitting allowed. (:pull:`11060`). + By `Julia Signell `_ Breaking Changes ~~~~~~~~~~~~~~~~ diff --git a/xarray/backends/api.py b/xarray/backends/api.py index 29236548df6..89c1a6efc5d 100644 --- a/xarray/backends/api.py +++ b/xarray/backends/api.py @@ -430,11 +430,14 @@ def open_dataset( "netcdf4" over "h5netcdf" over "scipy" (customizable via ``netcdf_engine_order`` in ``xarray.set_options()``). A custom backend class (a subclass of ``BackendEntrypoint``) can also be used. - chunks : int, dict, 'auto' or None, default: None + chunks : int, dict, 'auto', 'preserve' or None, default: None If provided, used to load the data into dask arrays. 
- ``chunks="auto"`` will use dask ``auto`` chunking taking into account the engine preferred chunks. + - ``chunks="preserve"`` will use a chunking scheme that never splits encoded + chunks. If encoded chunks are small then "preserve" takes multiples of them + over the largest dimension. - ``chunks=None`` skips using dask. This uses xarray's internally private :ref:`lazy indexing classes `, but data is eagerly loaded into memory as numpy arrays when accessed. @@ -674,11 +677,14 @@ def open_dataarray( "netcdf4" over "h5netcdf" over "scipy" (customizable via ``netcdf_engine_order`` in ``xarray.set_options()``). A custom backend class (a subclass of ``BackendEntrypoint``) can also be used. - chunks : int, dict, 'auto' or None, default: None + chunks : int, dict, 'auto', 'preserve', or None, default: None If provided, used to load the data into dask arrays. - ``chunks='auto'`` will use dask ``auto`` chunking taking into account the engine preferred chunks. + - ``chunks="preserve"`` will use a chunking scheme that never splits encoded + chunks. If encoded chunks are small then "preserve" takes multiples of them + over the largest dimension. - ``chunks=None`` skips using dask. This uses xarray's internally private :ref:`lazy indexing classes `, but data is eagerly loaded into memory as numpy arrays when accessed. @@ -900,11 +906,14 @@ def open_datatree( "h5netcdf" over "netcdf4" (customizable via ``netcdf_engine_order`` in ``xarray.set_options()``). A custom backend class (a subclass of ``BackendEntrypoint``) can also be used. - chunks : int, dict, 'auto' or None, default: None + chunks : int, dict, 'auto', preserve, or None, default: None If provided, used to load the data into dask arrays. - ``chunks="auto"`` will use dask ``auto`` chunking taking into account the engine preferred chunks. + - ``chunks="preserve"`` will use a chunking scheme that never splits encoded + chunks. 
If encoded chunks are small then "preserve" takes multiples of them + over the largest dimension. - ``chunks=None`` skips using dask. This uses xarray's internally private :ref:`lazy indexing classes `, but data is eagerly loaded into memory as numpy arrays when accessed. @@ -1146,11 +1155,14 @@ def open_groups( ``xarray.set_options()``). A custom backend class (a subclass of ``BackendEntrypoint``) can also be used. can also be used. - chunks : int, dict, 'auto' or None, default: None + chunks : int, dict, 'auto', 'preserve', or None, default: None If provided, used to load the data into dask arrays. - ``chunks="auto"`` will use dask ``auto`` chunking taking into account the engine preferred chunks. + - ``chunks="preserve"`` will use a chunking scheme that never splits encoded + chunks. If encoded chunks are small then "preserve" takes multiples of them + over the largest dimension. - ``chunks=None`` skips using dask. This uses xarray's internally private :ref:`lazy indexing classes `, but data is eagerly loaded into memory as numpy arrays when accessed. @@ -1418,7 +1430,7 @@ def open_mfdataset( concatenation along more than one dimension is desired, then ``paths`` must be a nested list-of-lists (see ``combine_nested`` for details). (A string glob will be expanded to a 1-dimensional list.) - chunks : int, dict, 'auto' or None, optional + chunks : int, dict, 'auto', 'preserve', or None, optional Dictionary with keys given by dimension names and values given by chunk sizes. In general, these should divide the dimensions of each dataset. If int, chunk each dimension by ``chunks``. By default, chunks will be chosen to match the diff --git a/xarray/backends/zarr.py b/xarray/backends/zarr.py index d9279dc2de9..67a4bcffb8c 100644 --- a/xarray/backends/zarr.py +++ b/xarray/backends/zarr.py @@ -1499,12 +1499,15 @@ def open_zarr( Array synchronizer provided to zarr group : str, optional Group path. (a.k.a. `path` in zarr terminology.) 
- chunks : int, dict, "auto" or None, optional + chunks : int, dict, "auto", "preserve", or None, optional Used to load the data into dask arrays. Default behavior is to use ``chunks={}`` if dask is available, otherwise ``chunks=None``. - ``chunks='auto'`` will use dask ``auto`` chunking taking into account the engine preferred chunks. + - ``chunks="preserve"`` will use a chunking scheme that never splits encoded + chunks. If encoded chunks are small then "preserve" takes multiples of them + over the largest dimension. - ``chunks=None`` skips using dask. This uses xarray's internally private :ref:`lazy indexing classes `, but data is eagerly loaded into memory as numpy arrays when accessed. From d7c964456e226fa40c578dd9d9a5e979ee353828 Mon Sep 17 00:00:00 2001 From: Julia Signell Date: Fri, 13 Mar 2026 15:18:38 -0400 Subject: [PATCH 12/18] Improve docs --- xarray/namedarray/parallelcompat.py | 37 +++++++++++++++++++++++------ 1 file changed, 30 insertions(+), 7 deletions(-) diff --git a/xarray/namedarray/parallelcompat.py b/xarray/namedarray/parallelcompat.py index 7aa3448959a..4f81a7826ad 100644 --- a/xarray/namedarray/parallelcompat.py +++ b/xarray/namedarray/parallelcompat.py @@ -794,13 +794,36 @@ def preserve_chunks( typesize: int, previous_chunks: tuple[int, ...] | _NormalizedChunks, ) -> tuple[T_ChunkDim, ...]: - """Determine meta chunks + """Quickly determine optimal chunks close to target size but never splitting + previous_chunks. - This takes in a chunks value that contains ``"preserve"`` values in certain - dimensions and replaces those values with concrete dimension sizes that try - to get chunks to be of a certain size in bytes, provided by the ``limit=`` + This takes in a chunks argument potentially containing ``"preserve"`` for all + dimensions (if scalar) or several dimensions (if tuple). 
This function + replaces ``"preserver"`` with concrete dimension sizes that try + to get chunks to be close to certain size in bytes, provided by the ``target=`` keyword. Any dimensions marked as ``"preserve"`` will potentially be multiplied - to get close to the byte target, while never splitting ``previous_chunks``. + by some factor to get close to the byte target, while never splitting + ``previous_chunks``. + + Examples + -------- + >>> ChunkManagerEntrypoint.preserve_chunks( + ... chunks=("preserve", "preserve", "preserve"), + ... shape=(1280, 1280, 20), + ... target=500 * 1024, + ... typesize=8, + ... previous_chunks=(128, 128, 1), + ... ) + (128, 128, 2) + + >>> ChunkManagerEntrypoint.preserve_chunks( + ... chunks=("preserve", "preserve", 1), + ... shape=(1280, 1280, 20), + ... target=1 * 1024 * 1024, + ... typesize=8, + ... previous_chunks=(128, 128, 1), + ... ) + (512, 256, 1) Parameters ---------- @@ -824,7 +847,7 @@ def preserve_chunks( ) # "preserve" stays as "preserve" - # None or empty tuple means match previous chunks + # None or empty tuple means match preferred_chunks # -1 means whole dim is in one chunk desired_chunks = np.array( [ @@ -842,7 +865,7 @@ def preserve_chunks( ) while True: - # Repeatedly loop over the ``previous_chunks``, multiplying them by 2. + # Repeatedly look for the dim with the most chunks and multiply it by 2. # Stop when: # 1a. we are larger than the target chunk size OR # 1b. 
we are within 50% of the target chunk size OR From fce8cd7cceab8c01b95c77d61deddb3678f421a3 Mon Sep 17 00:00:00 2001 From: Julia Signell Date: Mon, 16 Mar 2026 15:05:58 -0400 Subject: [PATCH 13/18] For non-uniform chunks just pass them back as is --- xarray/namedarray/parallelcompat.py | 88 +++++++++++++++++------------ 1 file changed, 52 insertions(+), 36 deletions(-) diff --git a/xarray/namedarray/parallelcompat.py b/xarray/namedarray/parallelcompat.py index 4f81a7826ad..94fe3003a77 100644 --- a/xarray/namedarray/parallelcompat.py +++ b/xarray/namedarray/parallelcompat.py @@ -797,13 +797,13 @@ def preserve_chunks( """Quickly determine optimal chunks close to target size but never splitting previous_chunks. - This takes in a chunks argument potentially containing ``"preserve"`` for all - dimensions (if scalar) or several dimensions (if tuple). This function - replaces ``"preserver"`` with concrete dimension sizes that try - to get chunks to be close to certain size in bytes, provided by the ``target=`` + This takes in a chunks argument potentially containing ``"preserve"`` for several + dimensions. This function replaces ``"preserve"`` with concrete dimension sizes that + try to get chunks to be close to certain size in bytes, provided by the ``target=`` keyword. Any dimensions marked as ``"preserve"`` will potentially be multiplied by some factor to get close to the byte target, while never splitting - ``previous_chunks``. + ``previous_chunks``. If chunks are non-uniform along a particular dimension + then that dimension will always use exactly ``previous_chunks``. Examples -------- @@ -825,45 +825,60 @@ def preserve_chunks( ... ) (512, 256, 1) + >>> ChunkManagerEntrypoint.preserve_chunks( + ... chunks=("preserve", "preserve", 1), + ... shape=(1280, 1280, 20), + ... target=1 * 1024 * 1024, + ... typesize=8, + ... previous_chunks=((128,) * 10, (128, 256, 256, 512), (1,) * 20), + ... 
) + (256, (128, 256, 256, 512), 1) + Parameters ---------- - chunks: tuple[int | str | tuple, ...] + chunks: tuple[int | str | tuple[int], ...] A tuple of either dimensions or tuples of explicit chunk dimensions - Some entries should be "preserve". Any explicit dimensions must match or - be multiple of ``previous_chunks`` + Some entries should be "preserve". shape: tuple[int] The shape of the array target: int The target size of the chunk in bytes. typesize: int The size, in bytes, of each element of the chunk. - previous_chunks: tuple[int] - Size of chunks being preserved. Expressed as a tuple of ints which matches - the way chunks are encoded in Zarr. - """ - # pop the first item off in case it's a tuple of tuples - preferred_chunks = np.array( - [c if isinstance(c, int) else c[0] for c in previous_chunks] - ) - - # "preserve" stays as "preserve" - # None or empty tuple means match preferred_chunks - # -1 means whole dim is in one chunk - desired_chunks = np.array( - [ - c or preferred_chunks[i] if c != -1 else shape[i] - for i, c in enumerate(chunks) - ] - ) - preserve_chunks = desired_chunks == "preserve" - - if not preserve_chunks.any(): + previous_chunks: tuple[int | tuple[int], ...] + Size of chunks being preserved. Expressed as a tuple of ints or tuple + of tuple of ints. 
+ """ + new_chunks = [*previous_chunks] + auto_dims = [c == "preserve" for c in chunks] + max_chunks = np.array(shape) + for i, previous_chunk in enumerate(previous_chunks): + chunk = chunks[i] + if chunk == -1: + # -1 means whole dim is in one chunk + new_chunks[i] = shape[i] + else: + if isinstance(previous_chunk, tuple): + # For uniform chunks just take the first item + if previous_chunk[1:-1] == previous_chunk[:-2]: + previous_chunk = previous_chunk[0] + # For non-uniform chunks, leave them alone + else: + auto_dims[i] = False + max_chunks[i] = max(previous_chunk) + + if isinstance(previous_chunk, int): + # preserve, None or () means we want to track previous chunk + if chunk == "preserve" or not chunk: + max_chunks[i] = previous_chunk + # otherwise use the explicitly provided chunk + else: + new_chunks[i] = chunk + max_chunks[i] = chunk if isinstance(chunk, int) else max(chunk) + + if not any(auto_dims): return chunks - new_chunks = np.where(preserve_chunks, preferred_chunks, desired_chunks).astype( - int - ) - while True: # Repeatedly look for the dim with the most chunks and multiply it by 2. # Stop when: @@ -871,9 +886,9 @@ def preserve_chunks( # 1b. we are within 50% of the target chunk size OR # 2. 
the chunk covers the entire array - num_chunks = np.array(shape) / new_chunks * preserve_chunks + num_chunks = np.array(shape) / max_chunks * auto_dims idx = np.argmax(num_chunks) - chunk_bytes = np.prod(new_chunks) * typesize + chunk_bytes = np.prod(max_chunks) * typesize if chunk_bytes > target or abs(chunk_bytes - target) / target < 0.5: break @@ -882,5 +897,6 @@ def preserve_chunks( break new_chunks[idx] = min(new_chunks[idx] * 2, shape[idx]) + max_chunks[idx] = new_chunks[idx] - return tuple(int(x) for x in new_chunks) + return tuple(new_chunks) From dd146c0407ce52050dd8d8d23c3dbbdf712a34f0 Mon Sep 17 00:00:00 2001 From: Julia Signell Date: Mon, 16 Mar 2026 15:52:33 -0400 Subject: [PATCH 14/18] Use the last dim first to take advantage of c-ordered linearization --- xarray/namedarray/parallelcompat.py | 8 +++++--- xarray/tests/test_backends.py | 2 +- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/xarray/namedarray/parallelcompat.py b/xarray/namedarray/parallelcompat.py index 94fe3003a77..5c86e8c579d 100644 --- a/xarray/namedarray/parallelcompat.py +++ b/xarray/namedarray/parallelcompat.py @@ -823,7 +823,7 @@ def preserve_chunks( ... typesize=8, ... previous_chunks=(128, 128, 1), ... ) - (512, 256, 1) + (128, 1024, 1) >>> ChunkManagerEntrypoint.preserve_chunks( ... chunks=("preserve", "preserve", 1), @@ -861,6 +861,7 @@ def preserve_chunks( if isinstance(previous_chunk, tuple): # For uniform chunks just take the first item if previous_chunk[1:-1] == previous_chunk[:-2]: + new_chunks[i] = previous_chunk[0] previous_chunk = previous_chunk[0] # For non-uniform chunks, leave them alone else: @@ -880,14 +881,13 @@ def preserve_chunks( return chunks while True: - # Repeatedly look for the dim with the most chunks and multiply it by 2. + # Repeatedly look for the last dim with more than one chunk and multiply it by 2. # Stop when: # 1a. we are larger than the target chunk size OR # 1b. we are within 50% of the target chunk size OR # 2. 
the chunk covers the entire array num_chunks = np.array(shape) / max_chunks * auto_dims - idx = np.argmax(num_chunks) chunk_bytes = np.prod(max_chunks) * typesize if chunk_bytes > target or abs(chunk_bytes - target) / target < 0.5: @@ -896,6 +896,8 @@ def preserve_chunks( if (num_chunks <= 1).all(): break + idx = int(np.nonzero(num_chunks > 1)[0][-1]) + new_chunks[idx] = min(new_chunks[idx] * 2, shape[idx]) max_chunks[idx] = new_chunks[idx] diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index 3a197cfedc5..a6b87c29f6b 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -7439,7 +7439,7 @@ def test_chunking_consistency(chunks, tmp_path: Path) -> None: @pytest.mark.parametrize( "chunks,expected", [ - ("preserve", (320, 320)), + ("preserve", (160, 500)), (-1, (500, 500)), ({}, (10, 10)), ({"x": "preserve"}, (500, 10)), From 67ff2f3dca05900664718e5f4014a4f5be272003 Mon Sep 17 00:00:00 2001 From: Alfonso Ladino Date: Wed, 25 Mar 2026 11:08:00 -0500 Subject: [PATCH 15/18] Add `inherit='all_coords'` option to `DataTree.to_dataset()` (#11230) * Add inherit='all' option to DataTree.to_dataset() * Add whats-new entry for inherit='all' (#11230) * Fix prune() signature accidentally modified by ruff-format * Fix mypy errors: remove unused type-ignore, add typing.cast in test * Rename inherit='all' to inherit='all_coords' per review feedback --- doc/whats-new.rst | 3 +++ 1 file changed, 3 insertions(+) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index dcbc76cf2ec..c91af80aa09 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -18,6 +18,9 @@ New Features guarantees that chunks in xarray match on-disk chunks or multiples of them. No chunk splitting allowed. (:pull:`11060`). By `Julia Signell `_ +- Added ``inherit='all_coords'`` option to :py:meth:`DataTree.to_dataset` to inherit + all parent coordinates, not just indexed ones (:issue:`10812`, :pull:`11230`). + By `Alfonso Ladino `_. 
Breaking Changes ~~~~~~~~~~~~~~~~ From 2aa143f91eb74c97eb6ef3405e81975435d05bb5 Mon Sep 17 00:00:00 2001 From: Julia Signell Date: Wed, 15 Apr 2026 15:48:35 -0400 Subject: [PATCH 16/18] Change the meaning of `auto` to default `preserve` behavior --- properties/test_parallelcompat.py | 4 +-- xarray/backends/api.py | 46 ++++++++++++++--------------- xarray/backends/zarr.py | 10 +++---- xarray/core/dataset.py | 2 +- xarray/namedarray/_typing.py | 2 +- xarray/namedarray/parallelcompat.py | 6 ++-- xarray/namedarray/utils.py | 7 +---- xarray/structure/chunks.py | 27 ++++++++++++----- xarray/tests/test_backends.py | 7 ++--- 9 files changed, 59 insertions(+), 52 deletions(-) diff --git a/properties/test_parallelcompat.py b/properties/test_parallelcompat.py index 91325578619..ffc21f6f9ee 100644 --- a/properties/test_parallelcompat.py +++ b/properties/test_parallelcompat.py @@ -20,7 +20,7 @@ def test_preserve_all_chunks( target = 1024 * 1024 actual = ChunkManagerEntrypoint.preserve_chunks( - chunks=("preserve",) * len(shape), + chunks=("auto",) * len(shape), shape=shape, target=target, typesize=typesize, @@ -47,7 +47,7 @@ def test_preserve_some_chunks( target = 2 * 1024 * 1024 actual = ChunkManagerEntrypoint.preserve_chunks( - chunks=(first_chunk, *["preserve" for _ in range(len(shape) - 1)]), + chunks=(first_chunk, *["auto" for _ in range(len(shape) - 1)]), shape=shape, target=target, typesize=typesize, diff --git a/xarray/backends/api.py b/xarray/backends/api.py index 89c1a6efc5d..d1b901b41e4 100644 --- a/xarray/backends/api.py +++ b/xarray/backends/api.py @@ -259,11 +259,11 @@ def _chunk_ds( name, var, var_chunks, + chunkmanager, overwrite_encoded_chunks=overwrite_encoded_chunks, name_prefix=name_prefix, token=token, inline_array=inline_array, - chunked_array_type=chunkmanager, from_array_kwargs=from_array_kwargs.copy(), just_use_token=True, ) @@ -292,9 +292,9 @@ def _dataset_from_backend_dataset( create_default_indexes, **extra_tokens, ): - if not isinstance(chunks, int 
| dict) and chunks not in {None, "auto", "preserve"}: + if not isinstance(chunks, int | dict) and chunks not in {None, "auto"}: raise ValueError( - f"chunks must be an int, dict, 'auto', 'preserve', or None. Instead found {chunks}." + f"chunks must be an int, dict, 'auto', or None. Instead found {chunks}." ) _protect_dataset_variables_inplace(backend_ds, cache) @@ -430,14 +430,14 @@ def open_dataset( "netcdf4" over "h5netcdf" over "scipy" (customizable via ``netcdf_engine_order`` in ``xarray.set_options()``). A custom backend class (a subclass of ``BackendEntrypoint``) can also be used. - chunks : int, dict, 'auto', 'preserve' or None, default: None + chunks : int, dict, 'auto', 'dask-auto' or None, default: None If provided, used to load the data into dask arrays. - - ``chunks="auto"`` will use dask ``auto`` chunking taking into account the - engine preferred chunks. - - ``chunks="preserve"`` will use a chunking scheme that never splits encoded - chunks. If encoded chunks are small then "preserve" takes multiples of them + - ``chunks="auto"`` will use a chunking scheme that never splits encoded + chunks. If encoded chunks are small then "auto" takes multiples of them over the largest dimension. + - ``chunks="dask-auto"`` will use dask ``auto`` chunking taking into account the + engine preferred chunks. - ``chunks=None`` skips using dask. This uses xarray's internally private :ref:`lazy indexing classes `, but data is eagerly loaded into memory as numpy arrays when accessed. @@ -677,14 +677,14 @@ def open_dataarray( "netcdf4" over "h5netcdf" over "scipy" (customizable via ``netcdf_engine_order`` in ``xarray.set_options()``). A custom backend class (a subclass of ``BackendEntrypoint``) can also be used. - chunks : int, dict, 'auto', 'preserve', or None, default: None + chunks : int, dict, 'auto', 'dask-auto', or None, default: None If provided, used to load the data into dask arrays. 
- - ``chunks='auto'`` will use dask ``auto`` chunking taking into account the - engine preferred chunks. - - ``chunks="preserve"`` will use a chunking scheme that never splits encoded - chunks. If encoded chunks are small then "preserve" takes multiples of them + - ``chunks="auto"`` will use a chunking scheme that never splits encoded + chunks. If encoded chunks are small then "auto" takes multiples of them over the largest dimension. + - ``chunks='dask-auto'`` will use dask ``auto`` chunking taking into account the + engine preferred chunks. - ``chunks=None`` skips using dask. This uses xarray's internally private :ref:`lazy indexing classes `, but data is eagerly loaded into memory as numpy arrays when accessed. @@ -906,13 +906,13 @@ def open_datatree( "h5netcdf" over "netcdf4" (customizable via ``netcdf_engine_order`` in ``xarray.set_options()``). A custom backend class (a subclass of ``BackendEntrypoint``) can also be used. - chunks : int, dict, 'auto', preserve, or None, default: None + chunks : int, dict, 'auto', 'dask-auto', or None, default: None If provided, used to load the data into dask arrays. - - ``chunks="auto"`` will use dask ``auto`` chunking taking into account the + - ``chunks="dask-auto"`` will use dask ``auto`` chunking taking into account the engine preferred chunks. - - ``chunks="preserve"`` will use a chunking scheme that never splits encoded - chunks. If encoded chunks are small then "preserve" takes multiples of them + - ``chunks="auto"`` will use a chunking scheme that never splits encoded + chunks. If encoded chunks are small then "auto" takes multiples of them over the largest dimension. - ``chunks=None`` skips using dask. This uses xarray's internally private :ref:`lazy indexing classes `, @@ -1155,14 +1155,14 @@ def open_groups( ``xarray.set_options()``). A custom backend class (a subclass of ``BackendEntrypoint``) can also be used. can also be used. 
- chunks : int, dict, 'auto', 'preserve', or None, default: None + chunks : int, dict, 'auto', 'dask-auto', or None, default: None If provided, used to load the data into dask arrays. - - ``chunks="auto"`` will use dask ``auto`` chunking taking into account the - engine preferred chunks. - - ``chunks="preserve"`` will use a chunking scheme that never splits encoded - chunks. If encoded chunks are small then "preserve" takes multiples of them + - ``chunks="auto"`` will use a chunking scheme that never splits encoded + chunks. If encoded chunks are small then "auto" takes multiples of them over the largest dimension. + - ``chunks="dask-auto"`` will use dask ``auto`` chunking taking into account the + engine preferred chunks. - ``chunks=None`` skips using dask. This uses xarray's internally private :ref:`lazy indexing classes `, but data is eagerly loaded into memory as numpy arrays when accessed. @@ -1430,7 +1430,7 @@ def open_mfdataset( concatenation along more than one dimension is desired, then ``paths`` must be a nested list-of-lists (see ``combine_nested`` for details). (A string glob will be expanded to a 1-dimensional list.) - chunks : int, dict, 'auto', 'preserve', or None, optional + chunks : int, dict, 'auto', 'dask-auto', or None, optional Dictionary with keys given by dimension names and values given by chunk sizes. In general, these should divide the dimensions of each dataset. If int, chunk each dimension by ``chunks``. By default, chunks will be chosen to match the diff --git a/xarray/backends/zarr.py b/xarray/backends/zarr.py index 67a4bcffb8c..cdc1bb1d568 100644 --- a/xarray/backends/zarr.py +++ b/xarray/backends/zarr.py @@ -1499,15 +1499,15 @@ def open_zarr( Array synchronizer provided to zarr group : str, optional Group path. (a.k.a. `path` in zarr terminology.) - chunks : int, dict, "auto", "preserve", or None, optional + chunks : int, dict, "auto", "dask-auto", or None, optional Used to load the data into dask arrays. 
Default behavior is to use ``chunks={}`` if dask is available, otherwise ``chunks=None``. - - ``chunks='auto'`` will use dask ``auto`` chunking taking into account the - engine preferred chunks. - - ``chunks="preserve"`` will use a chunking scheme that never splits encoded - chunks. If encoded chunks are small then "preserve" takes multiples of them + - ``chunks="auto"`` will use a chunking scheme that never splits encoded + chunks. If encoded chunks are small then "auto" takes multiples of them over the largest dimension. + - ``chunks='dask-auto'`` will use dask ``auto`` chunking taking into account the + engine preferred chunks. - ``chunks=None`` skips using dask. This uses xarray's internally private :ref:`lazy indexing classes `, but data is eagerly loaded into memory as numpy arrays when accessed. diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index a046c5d0f9e..e44e8f3aa14 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -2639,11 +2639,11 @@ def _resolve_resampler(name: Hashable, resampler: Resampler) -> tuple[int, ...]: k, v, chunks_mapping_ints, + chunkmanager, token, lock, name_prefix, inline_array=inline_array, - chunked_array_type=chunkmanager, from_array_kwargs=from_array_kwargs.copy(), ) for k, v in self.variables.items() diff --git a/xarray/namedarray/_typing.py b/xarray/namedarray/_typing.py index f9227f7796a..9610b96d4f9 100644 --- a/xarray/namedarray/_typing.py +++ b/xarray/namedarray/_typing.py @@ -74,7 +74,7 @@ def dtype(self) -> _DType_co: ... _NormalizedChunks = tuple[tuple[int, ...], ...] # FYI in some cases we don't allow `None`, which this doesn't take account of. # # FYI the `str` is for a size string, e.g. "16MB", supported by dask. -T_ChunkDim: TypeAlias = str | int | Literal["auto", "preserve"] | tuple[int, ...] | None # noqa: PYI051 +T_ChunkDim: TypeAlias = str | int | Literal["auto"] | tuple[int, ...] 
| None # noqa: PYI051 # We allow the tuple form of this (though arguably we could transition to named dims only) T_Chunks: TypeAlias = T_ChunkDim | Mapping[Any, T_ChunkDim] diff --git a/xarray/namedarray/parallelcompat.py b/xarray/namedarray/parallelcompat.py index 5c86e8c579d..e6302f378c9 100644 --- a/xarray/namedarray/parallelcompat.py +++ b/xarray/namedarray/parallelcompat.py @@ -850,7 +850,7 @@ def preserve_chunks( of tuple of ints. """ new_chunks = [*previous_chunks] - auto_dims = [c == "preserve" for c in chunks] + auto_dims = [c == "auto" for c in chunks] max_chunks = np.array(shape) for i, previous_chunk in enumerate(previous_chunks): chunk = chunks[i] @@ -869,8 +869,8 @@ def preserve_chunks( max_chunks[i] = max(previous_chunk) if isinstance(previous_chunk, int): - # preserve, None or () means we want to track previous chunk - if chunk == "preserve" or not chunk: + # auto, None or () means we want to track previous chunk + if chunk == "auto" or not chunk: max_chunks[i] = previous_chunk # otherwise use the explicitly provided chunk else: diff --git a/xarray/namedarray/utils.py b/xarray/namedarray/utils.py index 2a997b4a831..598fa567e6f 100644 --- a/xarray/namedarray/utils.py +++ b/xarray/namedarray/utils.py @@ -236,12 +236,7 @@ def _get_chunk( # type: ignore[no-untyped-def] limit = None dtype = data.dtype - if any(c == "preserve" for c in chunk_shape) and any( - c == "auto" for c in chunk_shape - ): - raise ValueError('chunks cannot use a combination of "auto" and "preserve"') - - if shape and preferred_chunk_shape and any(c == "preserve" for c in chunk_shape): + if shape and preferred_chunk_shape and any(c == "auto" for c in chunk_shape): chunk_shape = chunkmanager.preserve_chunks( chunk_shape, shape=shape, diff --git a/xarray/structure/chunks.py b/xarray/structure/chunks.py index 1fba975399d..e2cb40a330d 100644 --- a/xarray/structure/chunks.py +++ b/xarray/structure/chunks.py @@ -14,7 +14,6 @@ from xarray.namedarray.parallelcompat import ( 
ChunkManagerEntrypoint, get_chunked_array_type, - guess_chunkmanager, ) if TYPE_CHECKING: @@ -65,12 +64,12 @@ def _maybe_chunk( name: Hashable, var: Variable, chunks: Mapping[Any, T_ChunkDim] | None, + chunkmanager: ChunkManagerEntrypoint, token=None, lock=None, name_prefix: str = "xarray-", overwrite_encoded_chunks: bool = False, inline_array: bool = False, - chunked_array_type: str | ChunkManagerEntrypoint | None = None, from_array_kwargs=None, just_use_token=False, ) -> Variable: @@ -80,10 +79,24 @@ def _maybe_chunk( chunks = {dim: chunks[dim] for dim in var.dims if dim in chunks} if var.ndim: - chunked_array_type = guess_chunkmanager( - chunked_array_type - ) # coerce string to ChunkManagerEntrypoint type - if isinstance(chunked_array_type, DaskManager): + if ( + var.shape + and var.chunks + and chunks + and any(c == "auto" for c in chunks.values()) + ): + chunk_shape = chunkmanager.preserve_chunks( + tuple(chunks.get(dim, ()) for dim in var.dims), + shape=var.shape, + target=chunkmanager.get_auto_chunk_size(), + typesize=getattr(var.dtype, "itemsize", 8), + previous_chunks=var.chunks, + ) + chunks = { + dim: chunk_shape[i] for i, dim in enumerate(var.dims) if dim in chunks + } + + if isinstance(chunkmanager, DaskManager): if not just_use_token: from dask.base import tokenize @@ -104,7 +117,7 @@ def _maybe_chunk( var = var.chunk( chunks, - chunked_array_type=chunked_array_type, + chunked_array_type=chunkmanager, from_array_kwargs=from_array_kwargs, ) diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index a6b87c29f6b..5f76c8cc1f0 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -7405,7 +7405,6 @@ def test_open_dataset_chunking_zarr(chunks, tmp_path: Path) -> None: @pytest.mark.parametrize( "chunks", ["auto", -1, {}, {"x": "auto"}, {"x": -1}, {"x": "auto", "y": -1}] ) -@pytest.mark.filterwarnings("ignore:The specified chunks separate") def test_chunking_consistency(chunks, tmp_path: Path) -> None: 
encoded_chunks: dict[str, Any] = {} dask_arr = da.from_array( @@ -7439,12 +7438,12 @@ def test_chunking_consistency(chunks, tmp_path: Path) -> None: @pytest.mark.parametrize( "chunks,expected", [ - ("preserve", (160, 500)), + ("auto", (160, 500)), (-1, (500, 500)), ({}, (10, 10)), - ({"x": "preserve"}, (500, 10)), + ({"x": "auto"}, (500, 10)), ({"x": -1}, (500, 10)), - ({"x": "preserve", "y": -1}, (160, 500)), + ({"x": "auto", "y": -1}, (160, 500)), ], ) def test_open_dataset_chunking_zarr_with_preserve( From 4bea2b35b3f1d63385dab5260d8824134a256b3e Mon Sep 17 00:00:00 2001 From: Julia Signell Date: Wed, 15 Apr 2026 16:12:17 -0400 Subject: [PATCH 17/18] Update docs --- doc/whats-new.rst | 9 +++------ xarray/backends/api.py | 18 +++++------------- xarray/backends/zarr.py | 4 +--- xarray/namedarray/parallelcompat.py | 14 +++++++------- 4 files changed, 16 insertions(+), 29 deletions(-) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index c91af80aa09..43660ae6a7a 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -14,13 +14,10 @@ v2026.05.0 (unreleased) New Features ~~~~~~~~~~~~ -- Adds a new option ``chunks="preserve"`` when opening a dataset. This option - guarantees that chunks in xarray match on-disk chunks or multiples of them. - No chunk splitting allowed. (:pull:`11060`). +- Change behavior of ``chunks="auto"`` to guarantee that chunks in xarray + match on-disk chunks or multiples of them. No automatic chunk splitting allowed. + (:pull:`11060`). By `Julia Signell `_ -- Added ``inherit='all_coords'`` option to :py:meth:`DataTree.to_dataset` to inherit - all parent coordinates, not just indexed ones (:issue:`10812`, :pull:`11230`). - By `Alfonso Ladino `_. 
Breaking Changes ~~~~~~~~~~~~~~~~ diff --git a/xarray/backends/api.py b/xarray/backends/api.py index d1b901b41e4..c5167eb6bc2 100644 --- a/xarray/backends/api.py +++ b/xarray/backends/api.py @@ -430,14 +430,12 @@ def open_dataset( "netcdf4" over "h5netcdf" over "scipy" (customizable via ``netcdf_engine_order`` in ``xarray.set_options()``). A custom backend class (a subclass of ``BackendEntrypoint``) can also be used. - chunks : int, dict, 'auto', 'dask-auto' or None, default: None + chunks : int, dict, 'auto', or None, default: None If provided, used to load the data into dask arrays. - ``chunks="auto"`` will use a chunking scheme that never splits encoded chunks. If encoded chunks are small then "auto" takes multiples of them over the largest dimension. - - ``chunks="dask-auto"`` will use dask ``auto`` chunking taking into account the - engine preferred chunks. - ``chunks=None`` skips using dask. This uses xarray's internally private :ref:`lazy indexing classes `, but data is eagerly loaded into memory as numpy arrays when accessed. @@ -677,14 +675,12 @@ def open_dataarray( "netcdf4" over "h5netcdf" over "scipy" (customizable via ``netcdf_engine_order`` in ``xarray.set_options()``). A custom backend class (a subclass of ``BackendEntrypoint``) can also be used. - chunks : int, dict, 'auto', 'dask-auto', or None, default: None + chunks : int, dict, 'auto', or None, default: None If provided, used to load the data into dask arrays. - ``chunks="auto"`` will use a chunking scheme that never splits encoded chunks. If encoded chunks are small then "auto" takes multiples of them over the largest dimension. - - ``chunks='dask-auto'`` will use dask ``auto`` chunking taking into account the - engine preferred chunks. - ``chunks=None`` skips using dask. This uses xarray's internally private :ref:`lazy indexing classes `, but data is eagerly loaded into memory as numpy arrays when accessed. 
@@ -906,11 +902,9 @@ def open_datatree( "h5netcdf" over "netcdf4" (customizable via ``netcdf_engine_order`` in ``xarray.set_options()``). A custom backend class (a subclass of ``BackendEntrypoint``) can also be used. - chunks : int, dict, 'auto', 'dask-auto', or None, default: None + chunks : int, dict, 'auto', or None, default: None If provided, used to load the data into dask arrays. - - ``chunks="dask-auto"`` will use dask ``auto`` chunking taking into account the - engine preferred chunks. - ``chunks="auto"`` will use a chunking scheme that never splits encoded chunks. If encoded chunks are small then "auto" takes multiples of them over the largest dimension. @@ -1155,14 +1149,12 @@ def open_groups( ``xarray.set_options()``). A custom backend class (a subclass of ``BackendEntrypoint``) can also be used. can also be used. - chunks : int, dict, 'auto', 'dask-auto', or None, default: None + chunks : int, dict, 'auto', or None, default: None If provided, used to load the data into dask arrays. - ``chunks="auto"`` will use a chunking scheme that never splits encoded chunks. If encoded chunks are small then "auto" takes multiples of them over the largest dimension. - - ``chunks="dask-auto"`` will use dask ``auto`` chunking taking into account the - engine preferred chunks. - ``chunks=None`` skips using dask. This uses xarray's internally private :ref:`lazy indexing classes `, but data is eagerly loaded into memory as numpy arrays when accessed. @@ -1430,7 +1422,7 @@ def open_mfdataset( concatenation along more than one dimension is desired, then ``paths`` must be a nested list-of-lists (see ``combine_nested`` for details). (A string glob will be expanded to a 1-dimensional list.) - chunks : int, dict, 'auto', 'dask-auto', or None, optional + chunks : int, dict, 'auto', or None, optional Dictionary with keys given by dimension names and values given by chunk sizes. In general, these should divide the dimensions of each dataset. 
If int, chunk each dimension by ``chunks``. By default, chunks will be chosen to match the diff --git a/xarray/backends/zarr.py b/xarray/backends/zarr.py index cdc1bb1d568..7adb9641de4 100644 --- a/xarray/backends/zarr.py +++ b/xarray/backends/zarr.py @@ -1499,15 +1499,13 @@ def open_zarr( Array synchronizer provided to zarr group : str, optional Group path. (a.k.a. `path` in zarr terminology.) - chunks : int, dict, "auto", "dask-auto", or None, optional + chunks : int, dict, "auto", or None, optional Used to load the data into dask arrays. Default behavior is to use ``chunks={}`` if dask is available, otherwise ``chunks=None``. - ``chunks="auto"`` will use a chunking scheme that never splits encoded chunks. If encoded chunks are small then "auto" takes multiples of them over the largest dimension. - - ``chunks='dask-auto'`` will use dask ``auto`` chunking taking into account the - engine preferred chunks. - ``chunks=None`` skips using dask. This uses xarray's internally private :ref:`lazy indexing classes `, but data is eagerly loaded into memory as numpy arrays when accessed. diff --git a/xarray/namedarray/parallelcompat.py b/xarray/namedarray/parallelcompat.py index e6302f378c9..bfc12bc3180 100644 --- a/xarray/namedarray/parallelcompat.py +++ b/xarray/namedarray/parallelcompat.py @@ -797,10 +797,10 @@ def preserve_chunks( """Quickly determine optimal chunks close to target size but never splitting previous_chunks. - This takes in a chunks argument potentially containing ``"preserve"`` for several - dimensions. This function replaces ``"preserve"`` with concrete dimension sizes that + This takes in a chunks argument potentially containing ``"auto"`` for several + dimensions. This function replaces ``"auto"`` with concrete dimension sizes that try to get chunks to be close to certain size in bytes, provided by the ``target=`` - keyword. Any dimensions marked as ``"preserve"`` will potentially be multiplied + keyword. 
Any dimensions marked as ``"auto"`` will potentially be multiplied by some factor to get close to the byte target, while never splitting ``previous_chunks``. If chunks are non-uniform along a particular dimension then that dimension will always use exactly ``previous_chunks``. @@ -808,7 +808,7 @@ def preserve_chunks( Examples -------- >>> ChunkManagerEntrypoint.preserve_chunks( - ... chunks=("preserve", "preserve", "preserve"), + ... chunks=("auto", "auto", "auto"), ... shape=(1280, 1280, 20), ... target=500 * 1024, ... typesize=8, @@ -817,7 +817,7 @@ def preserve_chunks( (128, 128, 2) >>> ChunkManagerEntrypoint.preserve_chunks( - ... chunks=("preserve", "preserve", 1), + ... chunks=("auto", "auto", 1), ... shape=(1280, 1280, 20), ... target=1 * 1024 * 1024, ... typesize=8, @@ -826,7 +826,7 @@ def preserve_chunks( (128, 1024, 1) >>> ChunkManagerEntrypoint.preserve_chunks( - ... chunks=("preserve", "preserve", 1), + ... chunks=("auto", "auto", 1), ... shape=(1280, 1280, 20), ... target=1 * 1024 * 1024, ... typesize=8, @@ -838,7 +838,7 @@ def preserve_chunks( ---------- chunks: tuple[int | str | tuple[int], ...] A tuple of either dimensions or tuples of explicit chunk dimensions - Some entries should be "preserve". + Some entries should be "auto". shape: tuple[int] The shape of the array target: int From ba3b385efd9e5d76d63d389b83dc99165b5aedcf Mon Sep 17 00:00:00 2001 From: Julia Signell Date: Wed, 15 Apr 2026 16:13:44 -0400 Subject: [PATCH 18/18] No oxford comma for or --- xarray/backends/api.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/xarray/backends/api.py b/xarray/backends/api.py index c5167eb6bc2..ba242fa78bc 100644 --- a/xarray/backends/api.py +++ b/xarray/backends/api.py @@ -294,7 +294,7 @@ def _dataset_from_backend_dataset( ): if not isinstance(chunks, int | dict) and chunks not in {None, "auto"}: raise ValueError( - f"chunks must be an int, dict, 'auto', or None. Instead found {chunks}." 
+ f"chunks must be an int, dict, 'auto' or None. Instead found {chunks}." ) _protect_dataset_variables_inplace(backend_ds, cache) @@ -344,7 +344,7 @@ def _datatree_from_backend_datatree( ): if not isinstance(chunks, int | dict) and chunks not in {None, "auto"}: raise ValueError( - f"chunks must be an int, dict, 'auto', or None. Instead found {chunks}." + f"chunks must be an int, dict, 'auto' or None. Instead found {chunks}." ) _protect_datatree_variables_inplace(backend_tree, cache) @@ -430,7 +430,7 @@ def open_dataset( "netcdf4" over "h5netcdf" over "scipy" (customizable via ``netcdf_engine_order`` in ``xarray.set_options()``). A custom backend class (a subclass of ``BackendEntrypoint``) can also be used. - chunks : int, dict, 'auto', or None, default: None + chunks : int, dict, 'auto' or None, default: None If provided, used to load the data into dask arrays. - ``chunks="auto"`` will use a chunking scheme that never splits encoded @@ -675,7 +675,7 @@ def open_dataarray( "netcdf4" over "h5netcdf" over "scipy" (customizable via ``netcdf_engine_order`` in ``xarray.set_options()``). A custom backend class (a subclass of ``BackendEntrypoint``) can also be used. - chunks : int, dict, 'auto', or None, default: None + chunks : int, dict, 'auto' or None, default: None If provided, used to load the data into dask arrays. - ``chunks="auto"`` will use a chunking scheme that never splits encoded @@ -902,7 +902,7 @@ def open_datatree( "h5netcdf" over "netcdf4" (customizable via ``netcdf_engine_order`` in ``xarray.set_options()``). A custom backend class (a subclass of ``BackendEntrypoint``) can also be used. - chunks : int, dict, 'auto', or None, default: None + chunks : int, dict, 'auto' or None, default: None If provided, used to load the data into dask arrays. - ``chunks="auto"`` will use a chunking scheme that never splits encoded @@ -1149,7 +1149,7 @@ def open_groups( ``xarray.set_options()``). A custom backend class (a subclass of ``BackendEntrypoint``) can also be used. 
can also be used. - chunks : int, dict, 'auto', or None, default: None + chunks : int, dict, 'auto' or None, default: None If provided, used to load the data into dask arrays. - ``chunks="auto"`` will use a chunking scheme that never splits encoded @@ -1422,7 +1422,7 @@ def open_mfdataset( concatenation along more than one dimension is desired, then ``paths`` must be a nested list-of-lists (see ``combine_nested`` for details). (A string glob will be expanded to a 1-dimensional list.) - chunks : int, dict, 'auto', or None, optional + chunks : int, dict, 'auto' or None, optional Dictionary with keys given by dimension names and values given by chunk sizes. In general, these should divide the dimensions of each dataset. If int, chunk each dimension by ``chunks``. By default, chunks will be chosen to match the