From 646c3cd869afa307b36ea3328f6b069e6089b3c3 Mon Sep 17 00:00:00 2001 From: maitry63 Date: Thu, 13 Nov 2025 16:11:37 +0000 Subject: [PATCH 1/5] =?UTF-8?q?=F0=9F=93=9Ddocs:=20clarify=20shuffle=20beh?= =?UTF-8?q?avior=20and=20example=20in=20PyDataset?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../data_adapters/py_dataset_adapter.py | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/keras/src/trainers/data_adapters/py_dataset_adapter.py b/keras/src/trainers/data_adapters/py_dataset_adapter.py index 18865af026cf..e74b02d8a4fd 100644 --- a/keras/src/trainers/data_adapters/py_dataset_adapter.py +++ b/keras/src/trainers/data_adapters/py_dataset_adapter.py @@ -40,6 +40,12 @@ class PyDataset: multiprocessed setting. Reduce this value to reduce the CPU memory consumption of your dataset. Defaults to 10. + shuffle: Whether to shuffle the sample ordering at the end of + each epoch.This argument passed to `model.fit()`. when + `model.fit(.., shuffle=True)`, the training loop + automatically calls `on_epoch_end()` at each epoch + boundary, allowing datasets to implement custom + shuffling logic. Defaults to False. Notes: @@ -52,6 +58,9 @@ class PyDataset: over the dataset. They are not being used by the `PyDataset` class directly. When you are manually iterating over a `PyDataset`, no parallelism is applied. + - `shuffle=False` keeps the sample order fixed across epochs. + For distributed or deterministic training prefer + `shuffle=False` and manage the order externally. Example: @@ -66,10 +75,12 @@ class PyDataset: class CIFAR10PyDataset(keras.utils.PyDataset): - def __init__(self, x_set, y_set, batch_size, **kwargs): + def __init__(self, x_set, y_set, batch_size,shuffle=False, **kwargs): super().__init__(**kwargs) self.x, self.y = x_set, y_set self.batch_size = batch_size + self.shuffle = shuffle + self.indices = np.arrange(len(self.x)) def __len__(self): # Return number of batches. 
@@ -87,6 +98,12 @@ def __getitem__(self, idx): return np.array([ resize(imread(file_name), (200, 200)) for file_name in batch_x]), np.array(batch_y) + + def on_epoch_end(self): + # Called automatically by model.fit() when shuffle=True + # + if self.shuffle: + np.random.shuffle(self.indices) ``` """ From 16058c464d6e635c9b8371489eb456e14357742a Mon Sep 17 00:00:00 2001 From: maitry63 Date: Thu, 13 Nov 2025 21:57:31 +0530 Subject: [PATCH 2/5] Update keras/src/trainers/data_adapters/py_dataset_adapter.py Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> --- keras/src/trainers/data_adapters/py_dataset_adapter.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/keras/src/trainers/data_adapters/py_dataset_adapter.py b/keras/src/trainers/data_adapters/py_dataset_adapter.py index e74b02d8a4fd..7c8c14152586 100644 --- a/keras/src/trainers/data_adapters/py_dataset_adapter.py +++ b/keras/src/trainers/data_adapters/py_dataset_adapter.py @@ -41,11 +41,11 @@ class PyDataset: Reduce this value to reduce the CPU memory consumption of your dataset. Defaults to 10. shuffle: Whether to shuffle the sample ordering at the end of - each epoch.This argument passed to `model.fit()`. when - `model.fit(.., shuffle=True)`, the training loop - automatically calls `on_epoch_end()` at each epoch - boundary, allowing datasets to implement custom - shuffling logic. Defaults to False. + each epoch. This argument is passed to `model.fit()`. When + `model.fit(..., shuffle=True)`, the training loop + automatically calls `on_epoch_end()` at each epoch + boundary, allowing datasets to implement custom + shuffling logic. Defaults to `False`. 
Notes: From fa549b91e011c029ecb8680bf037ea324d79a9c7 Mon Sep 17 00:00:00 2001 From: maitry63 Date: Thu, 13 Nov 2025 22:00:31 +0530 Subject: [PATCH 3/5] Update keras/src/trainers/data_adapters/py_dataset_adapter.py Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> --- keras/src/trainers/data_adapters/py_dataset_adapter.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/keras/src/trainers/data_adapters/py_dataset_adapter.py b/keras/src/trainers/data_adapters/py_dataset_adapter.py index 7c8c14152586..e12756184d2e 100644 --- a/keras/src/trainers/data_adapters/py_dataset_adapter.py +++ b/keras/src/trainers/data_adapters/py_dataset_adapter.py @@ -75,12 +75,12 @@ class PyDataset: class CIFAR10PyDataset(keras.utils.PyDataset): - def __init__(self, x_set, y_set, batch_size,shuffle=False, **kwargs): + def __init__(self, x_set, y_set, batch_size, shuffle=False, **kwargs): super().__init__(**kwargs) self.x, self.y = x_set, y_set self.batch_size = batch_size self.shuffle = shuffle - self.indices = np.arrange(len(self.x)) + self.indices = np.arange(len(self.x)) def __len__(self): # Return number of batches. 
From 78ca394040d4c0ab39eb3708be2dff42e75bf5c9 Mon Sep 17 00:00:00 2001 From: maitry63 Date: Thu, 13 Nov 2025 16:57:49 +0000 Subject: [PATCH 4/5] =?UTF-8?q?=F0=9F=93=9Ddocs:=20Fix=20PyDataset=20examp?= =?UTF-8?q?le=20to=20use=20self.indices=20for=20correct=20shuffling?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- keras/src/trainers/data_adapters/py_dataset_adapter.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/keras/src/trainers/data_adapters/py_dataset_adapter.py b/keras/src/trainers/data_adapters/py_dataset_adapter.py index e12756184d2e..be0d610874ed 100644 --- a/keras/src/trainers/data_adapters/py_dataset_adapter.py +++ b/keras/src/trainers/data_adapters/py_dataset_adapter.py @@ -92,16 +92,17 @@ def __getitem__(self, idx): # Cap upper bound at array length; the last batch may be smaller # if the total number of items is not a multiple of batch size. high = min(low + self.batch_size, len(self.x)) - batch_x = self.x[low:high] - batch_y = self.y[low:high] + # Retrieve a batch of data by index + batch_indices = self.indices[low:high] + batch_x = self.x[batch_indices] + batch_y = self.y[batch_indices] return np.array([ resize(imread(file_name), (200, 200)) for file_name in batch_x]), np.array(batch_y) def on_epoch_end(self): - # Called automatically by model.fit() when shuffle=True - # + # Shuffle indices at the end of each epoch if enabled if self.shuffle: np.random.shuffle(self.indices) ``` From 54cae659d4d682df2d8163ca83a47cb7bea009a3 Mon Sep 17 00:00:00 2001 From: maitry63 Date: Fri, 14 Nov 2025 17:08:49 +0000 Subject: [PATCH 5/5] =?UTF-8?q?=F0=9F=93=9D=20Docs:=20updated=20example=20?= =?UTF-8?q?so=20correctly=20use=20shuffled=20indices=20&=20initial=20shuff?= =?UTF-8?q?le.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../src/trainers/data_adapters/py_dataset_adapter.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) 
diff --git a/keras/src/trainers/data_adapters/py_dataset_adapter.py b/keras/src/trainers/data_adapters/py_dataset_adapter.py index be0d610874ed..88177a39eca3 100644 --- a/keras/src/trainers/data_adapters/py_dataset_adapter.py +++ b/keras/src/trainers/data_adapters/py_dataset_adapter.py @@ -42,7 +42,7 @@ class PyDataset: your dataset. Defaults to 10. shuffle: Whether to shuffle the sample ordering at the end of each epoch. This argument is passed to `model.fit()`. When - `model.fit(..., shuffle=True)`, the training loop + calling `model.fit(..., shuffle=True)`, the training loop automatically calls `on_epoch_end()` at each epoch boundary, allowing datasets to implement custom shuffling logic. Defaults to `False`. @@ -80,7 +80,12 @@ def __init__(self, x_set, y_set, batch_size, shuffle=False, **kwargs): self.x, self.y = x_set, y_set self.batch_size = batch_size self.shuffle = shuffle + # Create index array for shuffling self.indices = np.arange(len(self.x)) + # Shuffle once at initialization when shuffle=True + if self.shuffle: + np.random.shuffle(self.indices) + def __len__(self): # Return number of batches. @@ -92,7 +97,7 @@ def __getitem__(self, idx): # Cap upper bound at array length; the last batch may be smaller # if the total number of items is not a multiple of batch size. high = min(low + self.batch_size, len(self.x)) - # Retrieve a batch of data by index + # Retrieve a batch using shuffled indices batch_indices = self.indices[low:high] batch_x = self.x[batch_indices] batch_y = self.y[batch_indices] @@ -100,7 +105,7 @@ def __getitem__(self, idx): return np.array([ resize(imread(file_name), (200, 200)) for file_name in batch_x]), np.array(batch_y) - + def on_epoch_end(self): # Shuffle indices at the end of each epoch if enabled if self.shuffle: