From 646c3cd869afa307b36ea3328f6b069e6089b3c3 Mon Sep 17 00:00:00 2001
From: maitry63 <maitry@google.com>
Date: Thu, 13 Nov 2025 16:11:37 +0000
Subject: [PATCH 1/5] =?UTF-8?q?=F0=9F=93=9Ddocs:=20clarify=20shuffle=20beh?=
 =?UTF-8?q?avior=20and=20example=20in=20PyDataset?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../data_adapters/py_dataset_adapter.py       | 19 ++++++++++++++++++-
 1 file changed, 18 insertions(+), 1 deletion(-)

diff --git a/keras/src/trainers/data_adapters/py_dataset_adapter.py b/keras/src/trainers/data_adapters/py_dataset_adapter.py
index 18865af026cf..e74b02d8a4fd 100644
--- a/keras/src/trainers/data_adapters/py_dataset_adapter.py
+++ b/keras/src/trainers/data_adapters/py_dataset_adapter.py
@@ -40,6 +40,12 @@ class PyDataset:
             multiprocessed setting.
             Reduce this value to reduce the CPU memory consumption of
             your dataset. Defaults to 10.
+        shuffle: Whether to shuffle the sample ordering at the end of
+            each epoch.This argument passed to `model.fit()`. when 
+            `model.fit(.., shuffle=True)`, the training loop 
+            automatically calls `on_epoch_end()` at each epoch 
+            boundary, allowing datasets to implement custom 
+            shuffling logic. Defaults to False.
 
     Notes:
 
@@ -52,6 +58,9 @@ class PyDataset:
         over the dataset. They are not being used by the `PyDataset` class
         directly. When you are manually iterating over a `PyDataset`,
         no parallelism is applied.
+    - `shuffle=False` keeps the sample order fixed across epochs.
+        For distributed or deterministic training, prefer
+        `shuffle=False` and manage the order externally.
 
     Example:
 
@@ -66,10 +75,12 @@ class PyDataset:
 
     class CIFAR10PyDataset(keras.utils.PyDataset):
 
-        def __init__(self, x_set, y_set, batch_size, **kwargs):
+        def __init__(self, x_set, y_set, batch_size,shuffle=False, **kwargs):
             super().__init__(**kwargs)
             self.x, self.y = x_set, y_set
             self.batch_size = batch_size
+            self.shuffle = shuffle
+            self.indices = np.arrange(len(self.x))
 
         def __len__(self):
             # Return number of batches.
@@ -87,6 +98,12 @@ def __getitem__(self, idx):
             return np.array([
                 resize(imread(file_name), (200, 200))
                    for file_name in batch_x]), np.array(batch_y)
+        
+        def on_epoch_end(self):
+            # Called automatically by model.fit() when shuffle=True
+            #
+            if self.shuffle:
+                np.random.shuffle(self.indices)
     ```
     """
 

From 16058c464d6e635c9b8371489eb456e14357742a Mon Sep 17 00:00:00 2001
From: maitry63 <maitry@google.com>
Date: Thu, 13 Nov 2025 21:57:31 +0530
Subject: [PATCH 2/5] Update
 keras/src/trainers/data_adapters/py_dataset_adapter.py

Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
---
 keras/src/trainers/data_adapters/py_dataset_adapter.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/keras/src/trainers/data_adapters/py_dataset_adapter.py b/keras/src/trainers/data_adapters/py_dataset_adapter.py
index e74b02d8a4fd..7c8c14152586 100644
--- a/keras/src/trainers/data_adapters/py_dataset_adapter.py
+++ b/keras/src/trainers/data_adapters/py_dataset_adapter.py
@@ -41,11 +41,11 @@ class PyDataset:
             Reduce this value to reduce the CPU memory consumption of
             your dataset. Defaults to 10.
         shuffle: Whether to shuffle the sample ordering at the end of
-            each epoch.This argument passed to `model.fit()`. when 
-            `model.fit(.., shuffle=True)`, the training loop 
-            automatically calls `on_epoch_end()` at each epoch 
-            boundary, allowing datasets to implement custom 
-            shuffling logic. Defaults to False.
+            each epoch. This argument is passed to `model.fit()`. When
+            `model.fit(..., shuffle=True)`, the training loop
+            automatically calls `on_epoch_end()` at each epoch
+            boundary, allowing datasets to implement custom
+            shuffling logic. Defaults to `False`.
 
     Notes:
 

From fa549b91e011c029ecb8680bf037ea324d79a9c7 Mon Sep 17 00:00:00 2001
From: maitry63 <maitry@google.com>
Date: Thu, 13 Nov 2025 22:00:31 +0530
Subject: [PATCH 3/5] Update
 keras/src/trainers/data_adapters/py_dataset_adapter.py

Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
---
 keras/src/trainers/data_adapters/py_dataset_adapter.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/keras/src/trainers/data_adapters/py_dataset_adapter.py b/keras/src/trainers/data_adapters/py_dataset_adapter.py
index 7c8c14152586..e12756184d2e 100644
--- a/keras/src/trainers/data_adapters/py_dataset_adapter.py
+++ b/keras/src/trainers/data_adapters/py_dataset_adapter.py
@@ -75,12 +75,12 @@ class PyDataset:
 
     class CIFAR10PyDataset(keras.utils.PyDataset):
 
-        def __init__(self, x_set, y_set, batch_size,shuffle=False, **kwargs):
+        def __init__(self, x_set, y_set, batch_size, shuffle=False, **kwargs):
             super().__init__(**kwargs)
             self.x, self.y = x_set, y_set
             self.batch_size = batch_size
             self.shuffle = shuffle
-            self.indices = np.arrange(len(self.x))
+            self.indices = np.arange(len(self.x))
 
         def __len__(self):
             # Return number of batches.

From 78ca394040d4c0ab39eb3708be2dff42e75bf5c9 Mon Sep 17 00:00:00 2001
From: maitry63 <maitry@google.com>
Date: Thu, 13 Nov 2025 16:57:49 +0000
Subject: [PATCH 4/5] =?UTF-8?q?=F0=9F=93=9Ddocs:=20Fix=20PyDataset=20examp?=
 =?UTF-8?q?le=20to=20use=20self.indices=20for=20correct=20shuffling?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 keras/src/trainers/data_adapters/py_dataset_adapter.py | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/keras/src/trainers/data_adapters/py_dataset_adapter.py b/keras/src/trainers/data_adapters/py_dataset_adapter.py
index e12756184d2e..be0d610874ed 100644
--- a/keras/src/trainers/data_adapters/py_dataset_adapter.py
+++ b/keras/src/trainers/data_adapters/py_dataset_adapter.py
@@ -92,16 +92,17 @@ def __getitem__(self, idx):
             # Cap upper bound at array length; the last batch may be smaller
             # if the total number of items is not a multiple of batch size.
             high = min(low + self.batch_size, len(self.x))
-            batch_x = self.x[low:high]
-            batch_y = self.y[low:high]
+            # Retrieve a batch of data by index 
+            batch_indices = self.indices[low:high]
+            batch_x = self.x[batch_indices]
+            batch_y = self.y[batch_indices]
 
             return np.array([
                 resize(imread(file_name), (200, 200))
                    for file_name in batch_x]), np.array(batch_y)
         
         def on_epoch_end(self):
-            # Called automatically by model.fit() when shuffle=True
-            #
+            # Shuffle indices at the end of each epoch if enabled
             if self.shuffle:
                 np.random.shuffle(self.indices)
     ```

From 54cae659d4d682df2d8163ca83a47cb7bea009a3 Mon Sep 17 00:00:00 2001
From: maitry63 <maitry@google.com>
Date: Fri, 14 Nov 2025 17:08:49 +0000
Subject: [PATCH 5/5] =?UTF-8?q?=F0=9F=93=9D=20Docs:=20updated=20example=20?=
 =?UTF-8?q?so=20correctly=20use=20shuffled=20indices=20&=20initial=20shuff?=
 =?UTF-8?q?le.?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../src/trainers/data_adapters/py_dataset_adapter.py  | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/keras/src/trainers/data_adapters/py_dataset_adapter.py b/keras/src/trainers/data_adapters/py_dataset_adapter.py
index be0d610874ed..88177a39eca3 100644
--- a/keras/src/trainers/data_adapters/py_dataset_adapter.py
+++ b/keras/src/trainers/data_adapters/py_dataset_adapter.py
@@ -42,7 +42,7 @@ class PyDataset:
             your dataset. Defaults to 10.
         shuffle: Whether to shuffle the sample ordering at the end of
             each epoch. This argument is passed to `model.fit()`. When
-            `model.fit(..., shuffle=True)`, the training loop
+            calling `model.fit(..., shuffle=True)`, the training loop
             automatically calls `on_epoch_end()` at each epoch
             boundary, allowing datasets to implement custom
             shuffling logic. Defaults to `False`.
@@ -80,7 +80,12 @@ def __init__(self, x_set, y_set, batch_size, shuffle=False, **kwargs):
             self.x, self.y = x_set, y_set
             self.batch_size = batch_size
             self.shuffle = shuffle
+            # Create index array for shuffling
             self.indices = np.arange(len(self.x))
+            # Shuffle once at initialization when shuffle=True
+            if self.shuffle:
+                np.random.shuffle(self.indices)
+
 
         def __len__(self):
             # Return number of batches.
@@ -92,7 +97,7 @@ def __getitem__(self, idx):
             # Cap upper bound at array length; the last batch may be smaller
             # if the total number of items is not a multiple of batch size.
             high = min(low + self.batch_size, len(self.x))
-            # Retrieve a batch of data by index 
+            # Retrieve a batch using shuffled indices
             batch_indices = self.indices[low:high]
             batch_x = self.x[batch_indices]
             batch_y = self.y[batch_indices]
@@ -100,7 +105,7 @@ def __getitem__(self, idx):
             return np.array([
                 resize(imread(file_name), (200, 200))
                    for file_name in batch_x]), np.array(batch_y)
-        
+
         def on_epoch_end(self):
             # Shuffle indices at the end of each epoch if enabled
             if self.shuffle: