From 675481c4d1d855f8c9e808fb427edca15f9e382e Mon Sep 17 00:00:00 2001
From: Thomas Kluyver
Date: Thu, 9 Jun 2022 18:03:39 +0200
Subject: [PATCH] Let Dask pick automatic chunk size along train/time axis

---
 extra_data/keydata.py | 15 ++++-----------
 1 file changed, 4 insertions(+), 11 deletions(-)

diff --git a/extra_data/keydata.py b/extra_data/keydata.py
index 2101cd94..3706dfe9 100644
--- a/extra_data/keydata.py
+++ b/extra_data/keydata.py
@@ -373,17 +373,10 @@ def dask_array(self, labelled=False):
             chunk_shape = (chunk_dim0,) + chunk.dataset.shape[1:]
 
             itemsize = chunk.dataset.dtype.itemsize
-            # Find chunk size of maximum 2 GB. This is largely arbitrary:
-            # we want chunks small enough that each worker can have at least
-            # a couple in memory (Maxwell nodes have 256-768 GB in late 2019).
-            # But bigger chunks means less overhead.
-            # Empirically, making chunks 4 times bigger/smaller didn't seem to
-            # affect speed dramatically - but this could depend on many factors.
-            # TODO: optional user control of chunking
-            limit = 2 * 1024 ** 3
-            while np.product(chunk_shape) * itemsize > limit and chunk_dim0 > 1:
-                chunk_dim0 //= 2
-                chunk_shape = (chunk_dim0,) + chunk.dataset.shape[1:]
+            # Let Dask pick chunk size along the first (trains/time) dimension.
+            # It will aim for a chunk size of 128 MiB by default, though users can
+            # configure this with a setting called 'array.chunk-size'.
+            chunk_shape = ("auto",) + chunk.dataset.shape[1:]
 
             chunks_darrs.append(
                 da.from_array(
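
Usage sketch (not part of the patch): the new comment relies on Dask's "auto" chunking, which targets the
'array.chunk-size' configuration value (128 MiB by default) when sizing the first axis. Below is a minimal
illustration of that behaviour; the array shape and the 512 MiB value are arbitrary examples, and a plain
NumPy array stands in for the HDF5 dataset that keydata.py actually wraps.

    import dask
    import dask.array as da
    import numpy as np

    # Stand-in for an HDF5 dataset: ~655 MB of float64 data, trains along axis 0.
    data = np.zeros((20000, 256, 16), dtype=np.float64)

    # With "auto", Dask sizes the first-axis chunks to target 128 MiB by default.
    arr = da.from_array(data, chunks=("auto",) + data.shape[1:])
    print(arr.chunks[0])

    # Raising the target to 512 MiB gives fewer, larger chunks along the same axis.
    with dask.config.set({"array.chunk-size": "512MiB"}):
        arr = da.from_array(data, chunks=("auto",) + data.shape[1:])
        print(arr.chunks[0])

Deferring to Dask's heuristic removes the hard-coded 2 GB halving loop and lets the same code respect
whatever chunk-size limit the user has configured.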