Skip to content

Commit 591b895

Browse files
authored
fix/simpler blosc tuning (#3553)
* only autotune when the default parameters were chosen, or they were set to None * better refactoring of blosc codec * changelog
1 parent b0e5c69 commit 591b895

File tree

4 files changed

+35
-102
lines changed

4 files changed

+35
-102
lines changed

changes/3545.misc.md

Lines changed: 0 additions & 1 deletion
This file was deleted.

changes/3553.misc.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
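Changes the internal logic of the `BloscCodec` class to ensure that the `typesize` and `shuffle` attributes are never left as `None` on a new instance of `BloscCodec`. Passing `None` for either parameter (now the default) substitutes a valid default value and marks that attribute as tunable, so it can be adjusted later based on the array's data type.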

src/zarr/codecs/blosc.py

Lines changed: 18 additions & 86 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,7 @@
11
from __future__ import annotations
22

33
import asyncio
4-
import warnings
5-
from dataclasses import dataclass, replace
4+
from dataclasses import dataclass, field, replace
65
from enum import Enum
76
from functools import cached_property
87
from typing import TYPE_CHECKING, Final, Literal, NotRequired, TypedDict
@@ -15,7 +14,6 @@
1514
from zarr.core.buffer.cpu import as_numpy_array_wrapper
1615
from zarr.core.common import JSON, NamedRequiredConfig, parse_enum, parse_named_configuration
1716
from zarr.core.dtype.common import HasItemSize
18-
from zarr.errors import ZarrDeprecationWarning
1917

2018
if TYPE_CHECKING:
2119
from typing import Self
@@ -131,12 +129,6 @@ class BloscCodec(BytesBytesCodec):
131129
132130
Attributes
133131
----------
134-
tunable_attrs : set of {'typesize', 'shuffle'}
135-
Attributes that will be automatically tuned when `evolve_from_array_spec()`
136-
is called. By default, contains {'typesize', 'shuffle'}. When either
137-
`typesize` or `shuffle` is explicitly set to None during initialization,
138-
the corresponding attribute is added to this set (if not already present),
139-
allowing it to be overridden based on the array's dtype.
140132
is_fixed_size : bool
141133
Always False for Blosc codec, as compression produces variable-sized output.
142134
typesize : int
@@ -154,8 +146,8 @@ class BloscCodec(BytesBytesCodec):
154146
----------
155147
typesize : int, optional
156148
The data type size in bytes. This affects how the shuffle filter processes
157-
the data. If None (deprecated), defaults to 1 and the attribute is marked
158-
as tunable. Default: 1.
149+
the data. If None, defaults to 1 and the attribute is marked as tunable.
150+
Default: None.
159151
cname : BloscCname or {'lz4', 'lz4hc', 'blosclz', 'snappy', 'zlib', 'zstd'}, optional
160152
The compression algorithm to use. Default: 'zstd'.
161153
clevel : int, optional
@@ -168,47 +160,17 @@ class BloscCodec(BytesBytesCodec):
168160
- 'shuffle': Byte shuffling (better for typesize > 1)
169161
- 'bitshuffle': Bit shuffling (better for typesize == 1)
170162
171-
If None (deprecated), defaults to 'bitshuffle' and the attribute is marked
163+
If None, defaults to 'bitshuffle' and the attribute is marked
172164
as tunable. Default: None.
173165
blocksize : int, optional
174166
The requested size of compressed blocks in bytes. A value of 0 means
175167
automatic block size selection. Default: 0.
176-
tunable_attrs : set of {'typesize', 'shuffle'}, optional
177-
Names of attributes that can be automatically adjusted by
178-
`evolve_from_array_spec()`. This allows the codec to adapt its parameters
179-
based on the array's data type when the array is created. If None, defaults
180-
to {'typesize', 'shuffle'}.
181168
182169
Notes
183170
-----
184-
**Tunable Attributes Logic**:
185-
186-
The `tunable_attrs` mechanism allows codec parameters to be automatically
187-
adjusted based on the array's data type:
188-
189-
1. **Initialization**: During `__init__`, if `tunable_attrs` is None, it
190-
defaults to {'typesize', 'shuffle'}. This means both attributes can be
191-
tuned by default.
192-
193-
2. **Deprecated None Values**: If `typesize` or `shuffle` is explicitly set
194-
to None:
195-
196-
- A deprecation warning is issued
197-
- The parameter is set to a default value (1 for typesize, 'bitshuffle'
198-
for shuffle)
199-
- The attribute name is added to `tunable_attrs`
200-
201-
3. **Evolution**: When `evolve_from_array_spec()` is called (typically during
202-
array creation), it creates a new codec instance with updated parameters:
203-
204-
- If 'typesize' is in `tunable_attrs`, it's set to the array dtype's
205-
item size
206-
- If 'shuffle' is in `tunable_attrs`, it's set to 'bitshuffle' if
207-
item_size == 1, otherwise 'shuffle'
208-
209-
4. **Explicit Values**: If you explicitly set `typesize=4` or
210-
`shuffle='noshuffle'`, these values are NOT in `tunable_attrs` by default
211-
and will not be changed by `evolve_from_array_spec()`.
171+
**Tunable attributes**: If `typesize` or `shuffle` are set to None during
172+
initialization, they are marked as tunable attributes. This means they can be
173+
adjusted later based on the data type of the array being compressed.
212174
213175
**Thread Safety**: This codec sets `numcodecs.blosc.use_threads = False` at
214176
module import time to avoid threading issues in asyncio contexts.
@@ -229,28 +191,14 @@ class BloscCodec(BytesBytesCodec):
229191
>>> codec.cname
230192
<BloscCname.zstd: 'zstd'>
231193
232-
Use deprecated None values (will be tuned automatically):
233-
234-
>>> codec = BloscCodec(typesize=None, shuffle=None) # doctest: +SKIP
235-
DeprecationWarning: The typesize parameter was set to None...
236-
>>> 'typesize' in codec.tunable_attrs
237-
True
238-
>>> 'shuffle' in codec.tunable_attrs
239-
True
240-
241-
Prevent automatic tuning:
242-
243-
>>> codec = BloscCodec(typesize=4, shuffle='noshuffle', tunable_attrs=set())
244-
>>> codec.tunable_attrs
245-
set()
246-
247194
See Also
248195
--------
249196
BloscShuffle : Enum for shuffle filter options
250197
BloscCname : Enum for compression algorithm options
251198
"""
252199

253-
tunable_attrs: set[Literal["typesize", "shuffle"]]
200+
# This attribute tracks which parameters were set to None at init time and are thus tunable
201+
_tunable_attrs: set[Literal["typesize", "shuffle"]] = field(init=False)
254202
is_fixed_size = False
255203

256204
typesize: int
@@ -262,41 +210,25 @@ class BloscCodec(BytesBytesCodec):
262210
def __init__(
263211
self,
264212
*,
265-
typesize: int | None = 1,
213+
typesize: int | None = None,
266214
cname: BloscCname | CName = BloscCname.zstd,
267215
clevel: int = 5,
268-
shuffle: BloscShuffle | Shuffle | None = "bitshuffle",
216+
shuffle: BloscShuffle | Shuffle | None = None,
269217
blocksize: int = 0,
270-
tunable_attrs: set[Literal["typesize", "shuffle"]] | None = None,
271218
) -> None:
272-
# set default value of tunable_attrs
273-
if tunable_attrs is None:
274-
object.__setattr__(self, "tunable_attrs", {"typesize", "shuffle"})
275-
else:
276-
object.__setattr__(self, "tunable_attrs", tunable_attrs)
219+
object.__setattr__(self, "_tunable_attrs", set())
277220

278-
# If typesize was set to None: warn, replace it with a valid typesize
221+
# If typesize was set to None, replace it with a valid typesize
279222
# and flag the typesize attribute as safe to replace later
280223
if typesize is None:
281-
msg = (
282-
"The typesize parameter was set to None. This is deprecated. "
283-
"Provide a positive int for the typesize parameter instead. "
284-
)
285-
warnings.warn(msg, ZarrDeprecationWarning, stacklevel=2)
286224
typesize = 1
287-
self.tunable_attrs.update({"typesize"})
225+
self._tunable_attrs.update({"typesize"})
288226

289-
# If shuffle was set to None: warn, replace it with a valid typesize
227+
# If shuffle was set to None, replace it with a valid shuffle
290228
# and flag the shuffle attribute as safe to replace later
291229
if shuffle is None:
292-
msg = (
293-
"The shuffle parameter was set to None. This is deprecated. "
294-
"Provide a valid shuffle literal string -- "
295-
f"one of {SHUFFLE!r} -- instead."
296-
)
297-
warnings.warn(msg, ZarrDeprecationWarning, stacklevel=2)
298230
shuffle = BloscShuffle.bitshuffle
299-
self.tunable_attrs.update({"shuffle"})
231+
self._tunable_attrs.update({"shuffle"})
300232

301233
typesize_parsed = parse_typesize(typesize)
302234
cname_parsed = parse_enum(cname, BloscCname)
@@ -339,9 +271,9 @@ def evolve_from_array_spec(self, array_spec: ArraySpec) -> Self:
339271
if isinstance(array_spec.dtype, HasItemSize):
340272
item_size = array_spec.dtype.item_size
341273
new_codec = self
342-
if "typesize" in self.tunable_attrs:
274+
if "typesize" in self._tunable_attrs:
343275
new_codec = replace(new_codec, typesize=item_size)
344-
if "shuffle" in self.tunable_attrs:
276+
if "shuffle" in self._tunable_attrs:
345277
new_codec = replace(
346278
new_codec,
347279
shuffle=(BloscShuffle.bitshuffle if item_size == 1 else BloscShuffle.shuffle),

tests/test_codecs/test_blosc.py

Lines changed: 16 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,4 @@
11
import json
2-
from typing import Literal
32

43
import numcodecs
54
import numpy as np
@@ -8,11 +7,10 @@
87

98
import zarr
109
from zarr.codecs import BloscCodec
11-
from zarr.codecs.blosc import BloscShuffle
10+
from zarr.codecs.blosc import BloscShuffle, Shuffle
1211
from zarr.core.array_spec import ArraySpec
1312
from zarr.core.buffer import default_buffer_prototype
1413
from zarr.core.dtype import UInt16
15-
from zarr.errors import ZarrDeprecationWarning
1614
from zarr.storage import MemoryStore, StorePath
1715

1816

@@ -62,12 +60,21 @@ async def test_blosc_evolve(dtype: str) -> None:
6260
assert blosc_configuration_json["shuffle"] == "shuffle"
6361

6462

65-
@pytest.mark.parametrize("tunable_attrs", [{"typesize"}, {"shuffle"}, {"typesize", "shuffle"}])
66-
def test_tunable_attrs(tunable_attrs: set[Literal["typesize", "shuffle"]]) -> None:
63+
@pytest.mark.parametrize("shuffle", [None, "bitshuffle", BloscShuffle.shuffle])
64+
@pytest.mark.parametrize("typesize", [None, 1, 2])
65+
def test_tunable_attrs_param(shuffle: None | Shuffle | BloscShuffle, typesize: None | int) -> None:
6766
"""
68-
Test that the tunable_attrs parameter is respected when calling evolve_from_array_spec
67+
Test that the `_tunable_attrs` attribute is set as expected when creating a BloscCodec, and that it is respected when calling evolve_from_array_spec.
6968
"""
70-
codec = BloscCodec(tunable_attrs=tunable_attrs)
69+
codec = BloscCodec(typesize=typesize, shuffle=shuffle)
70+
71+
if shuffle is None:
72+
assert codec.shuffle == BloscShuffle.bitshuffle # default shuffle
73+
assert "shuffle" in codec._tunable_attrs
74+
if typesize is None:
75+
assert codec.typesize == 1 # default typesize
76+
assert "typesize" in codec._tunable_attrs
77+
7178
new_dtype = UInt16()
7279
array_spec = ArraySpec(
7380
shape=(1,),
@@ -78,22 +85,16 @@ def test_tunable_attrs(tunable_attrs: set[Literal["typesize", "shuffle"]]) -> No
7885
)
7986

8087
evolved_codec = codec.evolve_from_array_spec(array_spec=array_spec)
81-
if "typesize" in tunable_attrs:
88+
if typesize is None:
8289
assert evolved_codec.typesize == new_dtype.item_size
8390
else:
8491
assert evolved_codec.typesize == codec.typesize
85-
if "shuffle" in tunable_attrs:
92+
if shuffle is None:
8693
assert evolved_codec.shuffle == BloscShuffle.shuffle
8794
else:
8895
assert evolved_codec.shuffle == codec.shuffle
8996

9097

91-
@pytest.mark.parametrize("kwargs", [{"typesize": None}, {"shuffle": None}])
92-
def test_invalid_parameters_warns(kwargs: dict[str, object]) -> None:
93-
with pytest.warns(ZarrDeprecationWarning, match="The .* parameter was set to None."):
94-
BloscCodec(**kwargs) # type: ignore[arg-type]
95-
96-
9798
async def test_typesize() -> None:
9899
a = np.arange(1000000, dtype=np.uint64)
99100
codecs = [zarr.codecs.BytesCodec(), zarr.codecs.BloscCodec()]

0 commit comments

Comments
 (0)