diff --git a/changes/3422.bugfix.rst b/changes/3422.bugfix.rst new file mode 100644 index 0000000000..ed4b8c266d --- /dev/null +++ b/changes/3422.bugfix.rst @@ -0,0 +1,4 @@ +Fix a potential race condition when using :func:`zarr.create_array` with the ``data`` parameter +set to a NumPy array. Previously Zarr iterated over the newly created array chunk by chunk, which +is too fine-grained for sharded arrays. Now Zarr iterates with a granularity that matches the size +of the stored objects for that array. \ No newline at end of file diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py index ce19f99ba0..f31b0cc0a4 100644 --- a/src/zarr/core/array.py +++ b/src/zarr/core/array.py @@ -4421,7 +4421,7 @@ async def _copy_arraylike_region(chunk_coords: slice, _data: NDArrayLike) -> Non # Stream data from the source array to the new array await concurrent_map( - [(region, data) for region in result._iter_chunk_regions()], + [(region, data) for region in result._iter_shard_regions()], _copy_arraylike_region, zarr.core.config.config.get("async.concurrency"), ) diff --git a/tests/test_array.py b/tests/test_array.py index 97aef9319b..cf201ce0c7 100644 --- a/tests/test_array.py +++ b/tests/test_array.py @@ -74,6 +74,7 @@ ZarrUserWarning, ) from zarr.storage import LocalStore, MemoryStore, StorePath +from zarr.storage._logging import LoggingStore from .test_dtype.conftest import zdtype_examples @@ -2119,3 +2120,28 @@ def test_iter_chunk_regions( assert observed == expected assert observed == tuple(arr._iter_chunk_regions()) assert observed == tuple(arr._async_array._iter_chunk_regions()) + + +@pytest.mark.parametrize("num_shards", [1, 3]) +@pytest.mark.parametrize("array_type", ["numpy", "zarr"]) +def test_create_array_with_data_num_gets( + num_shards: int, array_type: Literal["numpy", "zarr"] +) -> None: + """ + Test that creating an array with data only invokes a single get request per stored object + """ + store = LoggingStore(store=MemoryStore()) + + chunk_shape = (1,) + shard_shape = (100,) + 
shape = (shard_shape[0] * num_shards,) + data: Array | npt.NDArray[np.int64] + if array_type == "numpy": + data = np.zeros(shape[0], dtype="int64") + else: + data = zarr.zeros(shape, dtype="int64") + + zarr.create_array(store, data=data, chunks=chunk_shape, shards=shard_shape, fill_value=-1) # type: ignore[arg-type] + # one get for the metadata and one per shard. + # Note: we don't actually need one get per shard, but this is the current behavior + assert store.counter["get"] == 1 + num_shards