From 675481c4d1d855f8c9e808fb427edca15f9e382e Mon Sep 17 00:00:00 2001
From: Thomas Kluyver
Date: Thu, 9 Jun 2022 18:03:39 +0200
Subject: [PATCH] Let Dask pick automatic chunk size along train/time axis

---
 extra_data/keydata.py | 15 ++++-----------
 1 file changed, 4 insertions(+), 11 deletions(-)

diff --git a/extra_data/keydata.py b/extra_data/keydata.py
index 2101cd94..3706dfe9 100644
--- a/extra_data/keydata.py
+++ b/extra_data/keydata.py
@@ -373,17 +373,10 @@ def dask_array(self, labelled=False):
             chunk_shape = (chunk_dim0,) + chunk.dataset.shape[1:]
 
             itemsize = chunk.dataset.dtype.itemsize
-            # Find chunk size of maximum 2 GB. This is largely arbitrary:
-            # we want chunks small enough that each worker can have at least
-            # a couple in memory (Maxwell nodes have 256-768 GB in late 2019).
-            # But bigger chunks means less overhead.
-            # Empirically, making chunks 4 times bigger/smaller didn't seem to
-            # affect speed dramatically - but this could depend on many factors.
-            # TODO: optional user control of chunking
-            limit = 2 * 1024 ** 3
-            while np.product(chunk_shape) * itemsize > limit and chunk_dim0 > 1:
-                chunk_dim0 //= 2
-                chunk_shape = (chunk_dim0,) + chunk.dataset.shape[1:]
+            # Let Dask pick chunk size along the first (trains/time) dimension.
+            # It will aim for a chunk size of 128 MiB by default, though users can
+            # configure this with a setting called 'array.chunk-size'.
+            chunk_shape = ("auto",) + chunk.dataset.shape[1:]
 
             chunks_darrs.append(
                 da.from_array(
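
Usage sketch (not part of the patch): the new comment relies on Dask's "auto" chunking, which targets the
'array.chunk-size' configuration value (128 MiB by default) when sizing the first axis. Below is a minimal
illustration of that behaviour; the array shape and the 512 MiB value are arbitrary examples, and a plain
NumPy array stands in for the HDF5 dataset that keydata.py actually wraps.

    import dask
    import dask.array as da
    import numpy as np

    # Stand-in for an HDF5 dataset: ~655 MB of float64 data, trains along axis 0.
    data = np.zeros((20000, 256, 16), dtype=np.float64)

    # With "auto", Dask sizes the first-axis chunks to target 128 MiB by default.
    arr = da.from_array(data, chunks=("auto",) + data.shape[1:])
    print(arr.chunks[0])

    # Raising the target to 512 MiB gives fewer, larger chunks along the same axis.
    with dask.config.set({"array.chunk-size": "512MiB"}):
        arr = da.from_array(data, chunks=("auto",) + data.shape[1:])
        print(arr.chunks[0])

Deferring to Dask's heuristic removes the hard-coded 2 GB halving loop and lets the same code respect
whatever chunk-size limit the user has configured.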