diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 93f335e625b..43660ae6a7a 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -14,6 +14,10 @@ v2026.05.0 (unreleased) New Features ~~~~~~~~~~~~ +- Change behavior of ``chunks="auto"`` to guarantee that chunks in xarray + match on-disk chunks or multiples of them. No automatic chunk splitting allowed. + (:pull:`11060`). + By `Julia Signell `_ Breaking Changes ~~~~~~~~~~~~~~~~ diff --git a/properties/test_parallelcompat.py b/properties/test_parallelcompat.py new file mode 100644 index 00000000000..ffc21f6f9ee --- /dev/null +++ b/properties/test_parallelcompat.py @@ -0,0 +1,71 @@ +import numpy as np +import pytest + +pytest.importorskip("hypothesis") +# isort: split + +from hypothesis import given + +import xarray.testing.strategies as xrst +from xarray.namedarray.parallelcompat import ChunkManagerEntrypoint + + +class TestPreserveChunks: + @given(xrst.shape_and_chunks()) + def test_preserve_all_chunks( + self, shape_and_chunks: tuple[tuple[int, ...], tuple[int, ...]] + ) -> None: + shape, previous_chunks = shape_and_chunks + typesize = 8 + target = 1024 * 1024 + + actual = ChunkManagerEntrypoint.preserve_chunks( + chunks=("auto",) * len(shape), + shape=shape, + target=target, + typesize=typesize, + previous_chunks=previous_chunks, + ) + for i, chunk in enumerate(actual): + if chunk != shape[i]: + assert chunk >= previous_chunks[i] + assert chunk % previous_chunks[i] == 0 + assert chunk <= shape[i] + + if actual != shape: + assert np.prod(actual) * typesize >= 0.5 * target + + @pytest.mark.parametrize("first_chunk", [-1, (), 1]) + @given(xrst.shape_and_chunks(min_dims=2)) + def test_preserve_some_chunks( + self, + first_chunk: int | tuple[int, ...], + shape_and_chunks: tuple[tuple[int, ...], tuple[int, ...]], + ) -> None: + shape, previous_chunks = shape_and_chunks + typesize = 4 + target = 2 * 1024 * 1024 + + actual = ChunkManagerEntrypoint.preserve_chunks( + chunks=(first_chunk, *["auto" for _ in range(len(shape) - 1)]), + shape=shape, + target=target, + typesize=typesize, + previous_chunks=previous_chunks, + ) + for i, chunk in enumerate(actual): + if i == 0: + if first_chunk == 1: + assert chunk == 1 + elif first_chunk == -1: + assert chunk == shape[i] + elif first_chunk == (): + assert chunk == previous_chunks[i] + elif chunk != shape[i]: + assert chunk >= previous_chunks[i] + assert chunk % previous_chunks[i] == 0 + assert chunk <= shape[i] + + # if we have more than one chunk, make sure the chunks are big enough + if actual[1:] != shape[1:]: + assert np.prod(actual) * typesize >= 0.5 * target diff --git a/xarray/backends/api.py b/xarray/backends/api.py index fd992f3e5d8..ba242fa78bc 100644 --- a/xarray/backends/api.py +++ b/xarray/backends/api.py @@ -259,11 +259,11 @@ def _chunk_ds( name, var, var_chunks, + chunkmanager, overwrite_encoded_chunks=overwrite_encoded_chunks, name_prefix=name_prefix, token=token, inline_array=inline_array, - chunked_array_type=chunkmanager, from_array_kwargs=from_array_kwargs.copy(), just_use_token=True, ) @@ -294,7 +294,7 @@ def _dataset_from_backend_dataset( ): if not isinstance(chunks, int | dict) and chunks not in {None, "auto"}: raise ValueError( - f"chunks must be an int, dict, 'auto', or None. Instead found {chunks}." + f"chunks must be an int, dict, 'auto' or None. Instead found {chunks}." ) _protect_dataset_variables_inplace(backend_ds, cache) @@ -344,7 +344,7 @@ def _datatree_from_backend_datatree( ): if not isinstance(chunks, int | dict) and chunks not in {None, "auto"}: raise ValueError( - f"chunks must be an int, dict, 'auto', or None. Instead found {chunks}." + f"chunks must be an int, dict, 'auto' or None. Instead found {chunks}." ) _protect_datatree_variables_inplace(backend_tree, cache) @@ -433,8 +433,9 @@ class (a subclass of ``BackendEntrypoint``) can also be used. chunks : int, dict, 'auto' or None, default: None If provided, used to load the data into dask arrays. - - ``chunks="auto"`` will use dask ``auto`` chunking taking into account the - engine preferred chunks. + - ``chunks="auto"`` will use a chunking scheme that never splits encoded + chunks. If encoded chunks are small then "auto" takes multiples of them + over the largest dimension. - ``chunks=None`` skips using dask. This uses xarray's internally private :ref:`lazy indexing classes `, but data is eagerly loaded into memory as numpy arrays when accessed. @@ -677,8 +678,9 @@ class (a subclass of ``BackendEntrypoint``) can also be used. chunks : int, dict, 'auto' or None, default: None If provided, used to load the data into dask arrays. - - ``chunks='auto'`` will use dask ``auto`` chunking taking into account the - engine preferred chunks. + - ``chunks="auto"`` will use a chunking scheme that never splits encoded + chunks. If encoded chunks are small then "auto" takes multiples of them + over the largest dimension. - ``chunks=None`` skips using dask. This uses xarray's internally private :ref:`lazy indexing classes `, but data is eagerly loaded into memory as numpy arrays when accessed. @@ -903,8 +905,9 @@ def open_datatree( chunks : int, dict, 'auto' or None, default: None If provided, used to load the data into dask arrays. - - ``chunks="auto"`` will use dask ``auto`` chunking taking into account the - engine preferred chunks. + - ``chunks="auto"`` will use a chunking scheme that never splits encoded + chunks. If encoded chunks are small then "auto" takes multiples of them + over the largest dimension. - ``chunks=None`` skips using dask. This uses xarray's internally private :ref:`lazy indexing classes `, but data is eagerly loaded into memory as numpy arrays when accessed. @@ -1149,8 +1152,9 @@ def open_groups( chunks : int, dict, 'auto' or None, default: None If provided, used to load the data into dask arrays. - - ``chunks="auto"`` will use dask ``auto`` chunking taking into account the - engine preferred chunks. + - ``chunks="auto"`` will use a chunking scheme that never splits encoded + chunks. If encoded chunks are small then "auto" takes multiples of them + over the largest dimension. - ``chunks=None`` skips using dask. This uses xarray's internally private :ref:`lazy indexing classes `, but data is eagerly loaded into memory as numpy arrays when accessed. diff --git a/xarray/backends/zarr.py b/xarray/backends/zarr.py index d9279dc2de9..7adb9641de4 100644 --- a/xarray/backends/zarr.py +++ b/xarray/backends/zarr.py @@ -1499,12 +1499,13 @@ def open_zarr( Array synchronizer provided to zarr group : str, optional Group path. (a.k.a. `path` in zarr terminology.) - chunks : int, dict, "auto" or None, optional + chunks : int, dict, "auto", or None, optional Used to load the data into dask arrays. Default behavior is to use ``chunks={}`` if dask is available, otherwise ``chunks=None``. - - ``chunks='auto'`` will use dask ``auto`` chunking taking into account the - engine preferred chunks. + - ``chunks="auto"`` will use a chunking scheme that never splits encoded + chunks. If encoded chunks are small then "auto" takes multiples of them + over the largest dimension. - ``chunks=None`` skips using dask. This uses xarray's internally private :ref:`lazy indexing classes `, but data is eagerly loaded into memory as numpy arrays when accessed. diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index a046c5d0f9e..e44e8f3aa14 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -2639,11 +2639,11 @@ def _resolve_resampler(name: Hashable, resampler: Resampler) -> tuple[int, ...]: k, v, chunks_mapping_ints, + chunkmanager, token, lock, name_prefix, inline_array=inline_array, - chunked_array_type=chunkmanager, from_array_kwargs=from_array_kwargs.copy(), ) for k, v in self.variables.items() diff --git a/xarray/namedarray/daskmanager.py b/xarray/namedarray/daskmanager.py index eb01a150c18..323beb6a37e 100644 --- a/xarray/namedarray/daskmanager.py +++ b/xarray/namedarray/daskmanager.py @@ -11,6 +11,7 @@ if TYPE_CHECKING: from xarray.namedarray._typing import ( + T_ChunkDim, T_Chunks, _DType_co, _NormalizedChunks, @@ -45,11 +46,11 @@ def chunks(self, data: Any) -> _NormalizedChunks: def normalize_chunks( self, - chunks: T_Chunks | _NormalizedChunks, + chunks: tuple[T_ChunkDim, ...] | _NormalizedChunks, shape: tuple[int, ...] | None = None, limit: int | None = None, dtype: _DType_co | None = None, - previous_chunks: _NormalizedChunks | None = None, + previous_chunks: tuple[int, ...] | _NormalizedChunks | None = None, ) -> Any: """Called by open_dataset""" from dask.array.core import normalize_chunks diff --git a/xarray/namedarray/parallelcompat.py b/xarray/namedarray/parallelcompat.py index 8a68f5e9562..bfc12bc3180 100644 --- a/xarray/namedarray/parallelcompat.py +++ b/xarray/namedarray/parallelcompat.py @@ -20,6 +20,7 @@ if TYPE_CHECKING: from xarray.namedarray._typing import ( + T_ChunkDim, T_Chunks, _Chunks, _DType, @@ -784,3 +785,120 @@ def get_auto_chunk_size( raise NotImplementedError( "For 'auto' rechunking of cftime arrays, get_auto_chunk_size must be implemented by the chunk manager" ) + + @staticmethod + def preserve_chunks( + chunks: tuple[T_ChunkDim, ...], + shape: tuple[int, ...], + target: int, + typesize: int, + previous_chunks: tuple[int, ...] | _NormalizedChunks, + ) -> tuple[T_ChunkDim, ...]: + """Quickly determine optimal chunks close to target size but never splitting + previous_chunks. + + This takes in a chunks argument potentially containing ``"auto"`` for several + dimensions. This function replaces ``"auto"`` with concrete dimension sizes that + try to get chunks to be close to certain size in bytes, provided by the ``target=`` + keyword. Any dimensions marked as ``"auto"`` will potentially be multiplied + by some factor to get close to the byte target, while never splitting + ``previous_chunks``. If chunks are non-uniform along a particular dimension + then that dimension will always use exactly ``previous_chunks``. + + Examples + -------- + >>> ChunkManagerEntrypoint.preserve_chunks( + ... chunks=("auto", "auto", "auto"), + ... shape=(1280, 1280, 20), + ... target=500 * 1024, + ... typesize=8, + ... previous_chunks=(128, 128, 1), + ... ) + (128, 128, 2) + + >>> ChunkManagerEntrypoint.preserve_chunks( + ... chunks=("auto", "auto", 1), + ... shape=(1280, 1280, 20), + ... target=1 * 1024 * 1024, + ... typesize=8, + ... previous_chunks=(128, 128, 1), + ... ) + (128, 1024, 1) + + >>> ChunkManagerEntrypoint.preserve_chunks( + ... chunks=("auto", "auto", 1), + ... shape=(1280, 1280, 20), + ... target=1 * 1024 * 1024, + ... typesize=8, + ... previous_chunks=((128,) * 10, (128, 256, 256, 512), (1,) * 20), + ... ) + (256, (128, 256, 256, 512), 1) + + Parameters + ---------- + chunks: tuple[int | str | tuple[int], ...] + A tuple of either dimensions or tuples of explicit chunk dimensions + Some entries should be "auto". + shape: tuple[int] + The shape of the array + target: int + The target size of the chunk in bytes. + typesize: int + The size, in bytes, of each element of the chunk. + previous_chunks: tuple[int | tuple[int], ...] + Size of chunks being preserved. Expressed as a tuple of ints or tuple + of tuple of ints. + """ + new_chunks = [*previous_chunks] + auto_dims = [c == "auto" for c in chunks] + max_chunks = np.array(shape) + for i, previous_chunk in enumerate(previous_chunks): + chunk = chunks[i] + if chunk == -1: + # -1 means whole dim is in one chunk + new_chunks[i] = shape[i] + else: + if isinstance(previous_chunk, tuple): + # For uniform chunks just take the first item + if previous_chunk[1:-1] == previous_chunk[:-2]: + new_chunks[i] = previous_chunk[0] + previous_chunk = previous_chunk[0] + # For non-uniform chunks, leave them alone + else: + auto_dims[i] = False + max_chunks[i] = max(previous_chunk) + + if isinstance(previous_chunk, int): + # auto, None or () means we want to track previous chunk + if chunk == "auto" or not chunk: + max_chunks[i] = previous_chunk + # otherwise use the explicitly provided chunk + else: + new_chunks[i] = chunk + max_chunks[i] = chunk if isinstance(chunk, int) else max(chunk) + + if not any(auto_dims): + return chunks + + while True: + # Repeatedly look for the last dim with more than one chunk and multiply it by 2. + # Stop when: + # 1a. we are larger than the target chunk size OR + # 1b. we are within 50% of the target chunk size OR + # 2. the chunk covers the entire array + + num_chunks = np.array(shape) / max_chunks * auto_dims + chunk_bytes = np.prod(max_chunks) * typesize + + if chunk_bytes > target or abs(chunk_bytes - target) / target < 0.5: + break + + if (num_chunks <= 1).all(): + break + + idx = int(np.nonzero(num_chunks > 1)[0][-1]) + + new_chunks[idx] = min(new_chunks[idx] * 2, shape[idx]) + max_chunks[idx] = new_chunks[idx] + + return tuple(new_chunks) diff --git a/xarray/namedarray/utils.py b/xarray/namedarray/utils.py index 3490a76aa8d..598fa567e6f 100644 --- a/xarray/namedarray/utils.py +++ b/xarray/namedarray/utils.py @@ -222,7 +222,7 @@ def _get_chunk( # type: ignore[no-untyped-def] preferred_chunk_shape = tuple( itertools.starmap(preferred_chunks.get, zip(dims, shape, strict=True)) ) - if isinstance(chunks, Number) or (chunks == "auto"): + if isinstance(chunks, (Number, str)): chunks = dict.fromkeys(dims, chunks) chunk_shape = tuple( chunks.get(dim, None) or preferred_chunk_sizes @@ -236,6 +236,15 @@ def _get_chunk( # type: ignore[no-untyped-def] limit = None dtype = data.dtype + if shape and preferred_chunk_shape and any(c == "auto" for c in chunk_shape): + chunk_shape = chunkmanager.preserve_chunks( + chunk_shape, + shape=shape, + target=chunkmanager.get_auto_chunk_size(), + typesize=getattr(dtype, "itemsize", 8), + previous_chunks=preferred_chunk_shape, + ) + chunk_shape = chunkmanager.normalize_chunks( chunk_shape, shape=shape, diff --git a/xarray/structure/chunks.py b/xarray/structure/chunks.py index 1fba975399d..e2cb40a330d 100644 --- a/xarray/structure/chunks.py +++ b/xarray/structure/chunks.py @@ -14,7 +14,6 @@ from xarray.namedarray.parallelcompat import ( ChunkManagerEntrypoint, get_chunked_array_type, - guess_chunkmanager, ) if TYPE_CHECKING: @@ -65,12 +64,12 @@ def _maybe_chunk( name: Hashable, var: Variable, chunks: Mapping[Any, T_ChunkDim] | None, + chunkmanager: ChunkManagerEntrypoint, token=None, lock=None, name_prefix: str = "xarray-", overwrite_encoded_chunks: bool = False, inline_array: bool = False, - chunked_array_type: str | ChunkManagerEntrypoint | None = None, from_array_kwargs=None, just_use_token=False, ) -> Variable: @@ -80,10 +79,24 @@ def _maybe_chunk( chunks = {dim: chunks[dim] for dim in var.dims if dim in chunks} if var.ndim: - chunked_array_type = guess_chunkmanager( - chunked_array_type - ) # coerce string to ChunkManagerEntrypoint type - if isinstance(chunked_array_type, DaskManager): + if ( + var.shape + and var.chunks + and chunks + and any(c == "auto" for c in chunks.values()) + ): + chunk_shape = chunkmanager.preserve_chunks( + tuple(chunks.get(dim, ()) for dim in var.dims), + shape=var.shape, + target=chunkmanager.get_auto_chunk_size(), + typesize=getattr(var.dtype, "itemsize", 8), + previous_chunks=var.chunks, + ) + chunks = { + dim: chunk_shape[i] for i, dim in enumerate(var.dims) if dim in chunks + } + + if isinstance(chunkmanager, DaskManager): if not just_use_token: from dask.base import tokenize @@ -104,7 +117,7 @@ def _maybe_chunk( var = var.chunk( chunks, - chunked_array_type=chunked_array_type, + chunked_array_type=chunkmanager, from_array_kwargs=from_array_kwargs, ) diff --git a/xarray/testing/strategies.py b/xarray/testing/strategies.py index 9f6bb8110e8..9f0a7080936 100644 --- a/xarray/testing/strategies.py +++ b/xarray/testing/strategies.py @@ -31,6 +31,7 @@ "names", "outer_array_indexers", "pandas_index_dtypes", + "shape_and_chunks", "supported_dtypes", "unique_subset_of", "variables", @@ -210,6 +211,68 @@ def dimension_sizes( ) +@st.composite +def shape_and_chunks( + draw: st.DrawFn, + *, + min_dims: int = 1, + max_dims: int = 4, + min_size: int = 1, + max_size: int = 900, +) -> tuple[tuple[int, ...], tuple[int, ...]]: + """ + Generate a shape tuple and corresponding chunks tuple. + + Each element in the chunks tuple is smaller than or equal to the + corresponding element in the shape tuple. + + Requires the hypothesis package to be installed. + + Parameters + ---------- + min_dims : int, optional + Minimum number of dimensions. Default is 1. + max_dims : int, optional + Maximum number of dimensions. Default is 4. + min_size : int, optional + Minimum size for each dimension. Default is 1. + max_size : int, optional + Maximum size for each dimension. Default is 100. + + Returns + ------- + tuple[tuple[int, ...], tuple[int, ...]] + A tuple containing (shape, chunks) where: + - shape is a tuple of positive integers + - chunks is a tuple where each element is an integer <= corresponding shape element + + Examples + -------- + >>> shape_and_chunks().example() # doctest: +SKIP + ((5, 3, 8), (2, 3, 4)) + >>> shape_and_chunks().example() # doctest: +SKIP + ((10, 7), (10, 3)) + >>> shape_and_chunks(min_dims=2, max_dims=3).example() # doctest: +SKIP + ((4, 6, 2), (2, 3, 1)) + + See Also + -------- + :ref:`testing.hypothesis`_ + """ + # Generate the shape tuple + ndim = draw(st.integers(min_value=min_dims, max_value=max_dims)) + shape = draw( + st.tuples( + *[st.integers(min_value=min_size, max_value=max_size) for _ in range(ndim)] + ) + ) + # Generate chunks tuple with each element <= corresponding shape element + chunks = draw( + st.tuples(*[st.integers(min_value=1, max_value=size) for size in shape]) + ) + return shape, chunks + + _readable_strings = st.text( _readable_characters, max_size=5, diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index e42bfc2cd9f..5f76c8cc1f0 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -7405,7 +7405,6 @@ def test_open_dataset_chunking_zarr(chunks, tmp_path: Path) -> None: @pytest.mark.parametrize( "chunks", ["auto", -1, {}, {"x": "auto"}, {"x": -1}, {"x": "auto", "y": -1}] ) -@pytest.mark.filterwarnings("ignore:The specified chunks separate") def test_chunking_consistency(chunks, tmp_path: Path) -> None: encoded_chunks: dict[str, Any] = {} dask_arr = da.from_array( @@ -7434,6 +7433,44 @@ def test_chunking_consistency(chunks, tmp_path: Path) -> None: xr.testing.assert_chunks_equal(actual, expected) +@requires_zarr +@requires_dask +@pytest.mark.parametrize( + "chunks,expected", + [ + ("auto", (160, 500)), + (-1, (500, 500)), + ({}, (10, 10)), + ({"x": "auto"}, (500, 10)), + ({"x": -1}, (500, 10)), + ({"x": "auto", "y": -1}, (160, 500)), + ], +) +def test_open_dataset_chunking_zarr_with_preserve( + chunks, expected, tmp_path: Path +) -> None: + encoded_chunks = 10 + dask_arr = da.from_array( + np.ones((500, 500), dtype="float64"), chunks=encoded_chunks + ) + ds = xr.Dataset( + { + "test": xr.DataArray( + dask_arr, + dims=("x", "y"), + ) + } + ) + ds["test"].encoding["chunks"] = encoded_chunks + ds.to_zarr(tmp_path / "test.zarr") + + with dask.config.set({"array.chunk-size": "1MiB"}): + with open_dataset( + tmp_path / "test.zarr", engine="zarr", chunks=chunks + ) as actual: + assert (actual.chunks["x"][0], actual.chunks["y"][0]) == expected + + def _check_guess_can_open_and_open(entrypoint, obj, engine, expected): assert entrypoint.guess_can_open(obj) with open_dataset(obj, engine=engine) as actual: