From 8c5497b7bfc6e0cbfd9459b411220e768ecd141b Mon Sep 17 00:00:00 2001 From: Marcos Villagra Date: Tue, 22 Jul 2025 16:19:29 +0000 Subject: [PATCH 001/168] Implement TimedGRPOTrainer to log roll-out batch durations --- python/spotlight_prj/fedllm/custom_trainer.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/python/spotlight_prj/fedllm/custom_trainer.py b/python/spotlight_prj/fedllm/custom_trainer.py index fd5fab12e..5e1e0aa85 100644 --- a/python/spotlight_prj/fedllm/custom_trainer.py +++ b/python/spotlight_prj/fedllm/custom_trainer.py @@ -26,6 +26,16 @@ from run_fedllm import LLMTrainer, LLMAggregator, save_checkpoint, load_checkpoint from src.peft_utils import set_peft_model_state_dict from src.modeling_utils import load_state_dict +import time, logging + +class TimedGRPOTrainer(GRPOTrainer): + def _make_experience(self, *args, **kwargs): + + t0 = time.perf_counter() + result = super()._make_experience(*args, **kwargs) + self.log(f"roll-out batch {self.state.global_step} : " + f"{time.perf_counter() - t0:.3f}s") + return result class FullModelLLMTrainer(LLMTrainer): @@ -186,7 +196,7 @@ def train(self, train_data, device, args): self.log(f"GRPO Config - max_completion_length: 1024, num_generations: {num_generations}") # Create GRPO trainer with fresh model and tokenizer - grpo_trainer = GRPOTrainer( + grpo_trainer = TimedGRPOTrainer( model=fresh_model, # Use fresh model args=cfg, train_dataset=ds.shuffle(seed=cfg.seed), From e91d56a53a614a950c554e6186a7497925955045 Mon Sep 17 00:00:00 2001 From: Marcos Villagra Date: Tue, 22 Jul 2025 19:16:13 +0000 Subject: [PATCH 002/168] Add venv to .gitignore to exclude virtual environment files --- .gitignore | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.gitignore b/.gitignore index a4b62842b..b8c7ee47a 100644 --- a/.gitignore +++ b/.gitignore @@ -277,3 +277,6 @@ python/examples/launch/hello_world/fedml_job_entry_pack.bat **mpi_host_file /python/fedml/workflow/driver_example/customized_job_example/train_job/bootstrap.bat /python/fedml/workflow/driver_example/customized_job_example/train_job/fedml_job_entry_pack.bat + + +venv \ No newline at end of file From b990ba28c2d63f7d264f8c3c6ba3da784b716a69 Mon Sep 17 00:00:00 2001 From: Marcos Villagra Date: Tue, 22 Jul 2025 19:21:38 +0000 Subject: [PATCH 003/168] Add timing logs for set_model_params in FullModelLLMTrainer and FullModelLLMAggregator --- python/spotlight_prj/fedllm/custom_trainer.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/python/spotlight_prj/fedllm/custom_trainer.py b/python/spotlight_prj/fedllm/custom_trainer.py index 5e1e0aa85..e191bce33 100644 --- a/python/spotlight_prj/fedllm/custom_trainer.py +++ b/python/spotlight_prj/fedllm/custom_trainer.py @@ -261,6 +261,8 @@ def on_after_local_training(self, train_data, device, args): def set_model_params(self, model_parameters) -> None: self.log("start") + t0 = time.perf_counter() + model_parameters = to_device(model_parameters, device="cpu") barrier() @@ -283,6 +285,9 @@ def set_model_params(self, model_parameters) -> None: state_dict=model_parameters, synchronize=True ) + + elapsed = time.perf_counter() - t0 + self.log(f"set_model_params (client) took {elapsed:.3f}s") self.log("finished") @@ -318,6 +323,8 @@ class FullModelLLMAggregator(LLMAggregator): def set_model_params(self, model_parameters) -> None: self.log("start") + t0 = time.perf_counter() + model_parameters = to_device(model_parameters, device="cpu") barrier() @@ -340,5 +347,9 @@ def set_model_params(self, 
model_parameters) -> None: state_dict=model_parameters, synchronize=True ) + + elapsed = time.perf_counter() - t0 + self.log(f"set_model_params (server) took {elapsed:.3f}s") + self.log("finished") \ No newline at end of file From 26f9668be25536f31cd402a4dae984b7fc04bece Mon Sep 17 00:00:00 2001 From: Marcos Villagra Date: Tue, 22 Jul 2025 20:00:04 +0000 Subject: [PATCH 004/168] Update run_fedml_client_custom.sh and run_fedml_server_custom.sh to include timeout for launch_fedllm_custom.py execution --- python/spotlight_prj/fedllm/scripts/run_fedml_client_custom.sh | 2 +- python/spotlight_prj/fedllm/scripts/run_fedml_server_custom.sh | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/python/spotlight_prj/fedllm/scripts/run_fedml_client_custom.sh b/python/spotlight_prj/fedllm/scripts/run_fedml_client_custom.sh index 2d4be7678..29288d35d 100755 --- a/python/spotlight_prj/fedllm/scripts/run_fedml_client_custom.sh +++ b/python/spotlight_prj/fedllm/scripts/run_fedml_client_custom.sh @@ -25,7 +25,7 @@ LAUNCHER="${6:-"auto"}" CONFIG_PATH="${7:-"fedml_config/grpo_gsm8k_test_config.yaml"}" # Use the custom launcher that properly handles non-PEFT models -python3 launch_fedllm_custom.py \ +timeout --signal=SIGINT --kill-after=30s 21600 python3 launch_fedllm_custom.py \ --cf "${CONFIG_PATH}" \ --rank "${RANK}" \ --role client \ diff --git a/python/spotlight_prj/fedllm/scripts/run_fedml_server_custom.sh b/python/spotlight_prj/fedllm/scripts/run_fedml_server_custom.sh index f32a85e9f..e50d58f85 100755 --- a/python/spotlight_prj/fedllm/scripts/run_fedml_server_custom.sh +++ b/python/spotlight_prj/fedllm/scripts/run_fedml_server_custom.sh @@ -26,7 +26,7 @@ LAUNCHER="${6:-"auto"}" CONFIG_PATH="${7:-"fedml_config/fedml_config.yaml"}" # Use the custom launcher that properly handles non-PEFT models -python3 launch_fedllm_custom.py \ +timeout --signal=SIGINT --kill-after=30s 21600 python3 launch_fedllm_custom.py \ --cf "${CONFIG_PATH}" \ --rank "${RANK}" \ --role server \ From 74a395fb9b4df73c1f107a1e2922996c40d17df8 Mon Sep 17 00:00:00 2001 From: Marcos Villagra Date: Tue, 22 Jul 2025 20:12:28 +0000 Subject: [PATCH 005/168] Add periodic checkpointing and per-round checkpoint configuration to FullModelLLMTrainer and FullModelLLMAggregator --- python/spotlight_prj/fedllm/custom_trainer.py | 100 +++++++++++++++--- 1 file changed, 86 insertions(+), 14 deletions(-) diff --git a/python/spotlight_prj/fedllm/custom_trainer.py b/python/spotlight_prj/fedllm/custom_trainer.py index e191bce33..d93946c5c 100644 --- a/python/spotlight_prj/fedllm/custom_trainer.py +++ b/python/spotlight_prj/fedllm/custom_trainer.py @@ -27,6 +27,7 @@ from src.peft_utils import set_peft_model_state_dict from src.modeling_utils import load_state_dict import time, logging +import threading class TimedGRPOTrainer(GRPOTrainer): def _make_experience(self, *args, **kwargs): @@ -49,6 +50,14 @@ def __init__(self, *args, **kwargs): self.DATASET_ANS = re.compile(r"####\s*([-+]?\d+\.?\d*)") # Regex for model completion format (\boxed{}) self.MODEL_ANS = re.compile(r"\\boxed\{([^}]*)\}") + + # ------------------------------------------------------------------ + # Configuration: enable or disable per-round checkpoints + # ------------------------------------------------------------------ + + # Default: omit per-round checkpoints unless user explicitly enables + # them via the FedML YAML (enable_round_checkpoints: true) + self._enable_round_ckpt = getattr(self.args, "enable_round_checkpoints", False) def reward_fn(self, completions, 
answer, **_): """Reward function for GSM8K that checks if the predicted answer matches the true answer.""" @@ -232,17 +241,17 @@ def train(self, train_data, device, args): else: self.model.load_state_dict(trained_state, strict=False) - # Save the trained model in FedML's expected location - self.latest_checkpoint_dir = self.checkpoint_dir / f"round_{self.round_idx}_before_agg" - self.log(f"Saving GRPO-trained model to \"{self.latest_checkpoint_dir}\"") - - # Save checkpoint using FedML's model - save_checkpoint( - self.model, - self.latest_checkpoint_dir, - is_saving_process=self.training_args.should_save, - synchronize=True - ) + # Optionally save a pre-aggregation checkpoint for this round + if self._enable_round_ckpt: + self.latest_checkpoint_dir = self.checkpoint_dir / f"round_{self.round_idx}_before_agg" + self.log(f"[round-ckpt] Saving GRPO-trained model to \"{self.latest_checkpoint_dir}\"") + + save_checkpoint( + self.model, + self.latest_checkpoint_dir, + is_saving_process=self.training_args.should_save, + synchronize=True + ) # Clean up fresh model to free memory del fresh_model @@ -274,7 +283,7 @@ def set_model_params(self, model_parameters) -> None: load_state_dict(self.model, model_parameters, strict=False) barrier() - if self.round_idx >= 0 and self.should_save: + if self._enable_round_ckpt and self.round_idx >= 0 and self.should_save: # save aggregated model checkpoint self.latest_checkpoint_dir = self.checkpoint_dir / f"round_{self.round_idx}_after_agg" self.log(f"saving aggregated model to \"{self.latest_checkpoint_dir}\"") @@ -320,6 +329,69 @@ def await_sync_process_group(self, from_process: int = 0) -> list: class FullModelLLMAggregator(LLMAggregator): """Custom aggregator that properly handles both PEFT and non-PEFT models.""" + # ------------------------------------------------------------------ + # Periodic checkpointing setup + # ------------------------------------------------------------------ + + def __init__(self, *args, **kwargs): + """Extend parent init and start a background thread that creates a + checkpoint every ``server_checkpoint_interval_minutes`` (default 30). + + Notes + ----- + * Only the main process (``self.is_main_process()``) actually writes the + checkpoint to avoid race conditions. + * Checkpoints are written under + ``{self.checkpoint_dir}/wallclock_{unix_ts}`` so they will not + collide with the per-round checkpoints that already exist. + """ + super().__init__(*args, **kwargs) + + # Determine interval (seconds) + interval_min = getattr(self.args, "server_checkpoint_interval_minutes", 30) + if interval_min <= 0: + # Disable if user passes 0 or negative value + self._checkpoint_interval = None + return + + self._checkpoint_interval = interval_min * 60 + + # Background thread is only needed on the main process + if self.is_main_process(): + self._stop_checkpoint_evt = threading.Event() + self._checkpoint_thread = threading.Thread( + target=self._periodic_checkpoint_loop, + name="periodic_ckpt_thread", + daemon=True, + ) + self._checkpoint_thread.start() + + # Whether to save per-round checkpoints (default False) + self._enable_round_ckpt = getattr(self.args, "enable_round_checkpoints", False) + + # ------------------------------------------------------------------ + # Internal helpers + # ------------------------------------------------------------------ + + def _periodic_checkpoint_loop(self): + """Loop that sleeps ``_checkpoint_interval`` seconds then writes a + checkpoint until ``_stop_checkpoint_evt`` is set (i.e., program exit). 
+ """ + while not self._stop_checkpoint_evt.wait(self._checkpoint_interval): + try: + ts = int(time.time()) + ckpt_dir = self.checkpoint_dir / f"wallclock_{ts}" + self.log(f"Periodic checkpoint → {ckpt_dir}") + save_checkpoint( + self.model, + checkpoint_dir=ckpt_dir, + is_saving_process=self.training_args.should_save, + synchronize=True, + ) + except Exception as e: + # Log and continue – do not crash training due to checkpoint failure + self.log(f"[WARN] Periodic checkpoint failed: {e}") + def set_model_params(self, model_parameters) -> None: self.log("start") @@ -336,7 +408,7 @@ def set_model_params(self, model_parameters) -> None: load_state_dict(self.model, model_parameters, strict=False) barrier() - if self.round_idx >= 0 and self.should_save: + if self._enable_round_ckpt and self.round_idx >= 0 and self.should_save: # save aggregated model checkpoint self.latest_checkpoint_dir = self.checkpoint_dir / f"round_{self.round_idx}_after_agg" self.log(f"saving aggregated model to \"{self.latest_checkpoint_dir}\"") @@ -352,4 +424,4 @@ def set_model_params(self, model_parameters) -> None: self.log(f"set_model_params (server) took {elapsed:.3f}s") - self.log("finished") \ No newline at end of file + self.log("finished") \ No newline at end of file From 6c3ea9c70ad1caf64d1146a5bccc36884abad650 Mon Sep 17 00:00:00 2001 From: Marcos Villagra Date: Tue, 22 Jul 2025 21:04:05 +0000 Subject: [PATCH 006/168] Remove commented-out code blocks in FullModelLLMTrainer for clarity --- python/spotlight_prj/fedllm/custom_trainer.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/python/spotlight_prj/fedllm/custom_trainer.py b/python/spotlight_prj/fedllm/custom_trainer.py index d93946c5c..7e1ca1436 100644 --- a/python/spotlight_prj/fedllm/custom_trainer.py +++ b/python/spotlight_prj/fedllm/custom_trainer.py @@ -242,6 +242,7 @@ def train(self, train_data, device, args): self.model.load_state_dict(trained_state, strict=False) # Optionally save a pre-aggregation checkpoint for this round + """ if self._enable_round_ckpt: self.latest_checkpoint_dir = self.checkpoint_dir / f"round_{self.round_idx}_before_agg" self.log(f"[round-ckpt] Saving GRPO-trained model to \"{self.latest_checkpoint_dir}\"") @@ -252,6 +253,7 @@ def train(self, train_data, device, args): is_saving_process=self.training_args.should_save, synchronize=True ) + """ # Clean up fresh model to free memory del fresh_model @@ -283,6 +285,7 @@ def set_model_params(self, model_parameters) -> None: load_state_dict(self.model, model_parameters, strict=False) barrier() + """ if self._enable_round_ckpt and self.round_idx >= 0 and self.should_save: # save aggregated model checkpoint self.latest_checkpoint_dir = self.checkpoint_dir / f"round_{self.round_idx}_after_agg" @@ -294,6 +297,7 @@ def set_model_params(self, model_parameters) -> None: state_dict=model_parameters, synchronize=True ) + """ elapsed = time.perf_counter() - t0 self.log(f"set_model_params (client) took {elapsed:.3f}s") From 499104a0dd8fd670b9ac16b8781eeec80ea6fa8d Mon Sep 17 00:00:00 2001 From: Marcos Villagra Date: Wed, 23 Jul 2025 00:07:12 +0000 Subject: [PATCH 007/168] Add logging for global update frequency in FedMLServerManager --- .../cross_silo/server/fedml_server_manager.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/python/fedml/cross_silo/server/fedml_server_manager.py b/python/fedml/cross_silo/server/fedml_server_manager.py index 9639f4c0e..3d30f4db7 100644 --- a/python/fedml/cross_silo/server/fedml_server_manager.py +++ 
b/python/fedml/cross_silo/server/fedml_server_manager.py @@ -246,6 +246,21 @@ def handle_message_receive_model_from_client(self, msg_params): if self.is_main_process(): mlops.log_aggregated_model_info(self.args.round_idx, model_url=global_model_url) + # -------------------------------------------------- + # Log global-update frequency in wall-clock terms + # -------------------------------------------------- + current_ts = time.time() + # Compute and print only if this is not the very first round + if hasattr(self, "_last_round_end_ts") and self._last_round_end_ts is not None: + delta = current_ts - self._last_round_end_ts + if delta > 0: + freq = 1.0 / delta + logging.info( + f"Global update frequency: {freq:.4f} updates/sec ({delta:.2f} s per round)" + ) + # Record timestamp for the next round + self._last_round_end_ts = current_ts + logging.info("\n\n==========end {}-th round training===========\n".format(self.args.round_idx)) if self.args.round_idx < self.round_num: mlops.event("server.wait", event_started=True, event_value=str(self.args.round_idx)) From a2e576e9560d0170a40e937497c03739526f0bf7 Mon Sep 17 00:00:00 2001 From: Marcos Villagra Date: Wed, 23 Jul 2025 01:41:09 +0000 Subject: [PATCH 008/168] Add Nesterov momentum support in FullModelLLMAggregator and update logging in FedMLServerManager --- .../cross_silo/server/fedml_server_manager.py | 1 + python/spotlight_prj/fedllm/custom_trainer.py | 69 ++++++++++++++++++- .../fedml_config/grpo_gsm8k_test_config.yaml | 2 +- 3 files changed, 70 insertions(+), 2 deletions(-) diff --git a/python/fedml/cross_silo/server/fedml_server_manager.py b/python/fedml/cross_silo/server/fedml_server_manager.py index 3d30f4db7..1905c7f1d 100644 --- a/python/fedml/cross_silo/server/fedml_server_manager.py +++ b/python/fedml/cross_silo/server/fedml_server_manager.py @@ -262,6 +262,7 @@ def handle_message_receive_model_from_client(self, msg_params): self._last_round_end_ts = current_ts logging.info("\n\n==========end {}-th round training===========\n".format(self.args.round_idx)) + logging.info(f"Number of rounds: {self.round_num}") if self.args.round_idx < self.round_num: mlops.event("server.wait", event_started=True, event_value=str(self.args.round_idx)) diff --git a/python/spotlight_prj/fedllm/custom_trainer.py b/python/spotlight_prj/fedllm/custom_trainer.py index 7e1ca1436..3720ec74e 100644 --- a/python/spotlight_prj/fedllm/custom_trainer.py +++ b/python/spotlight_prj/fedllm/custom_trainer.py @@ -22,6 +22,7 @@ from fedml.train.llm.distributed import barrier from peft import PeftModel from trl import GRPOTrainer, GRPOConfig +from fedml.ml.aggregator.agg_operator import FedMLAggOperator from run_fedllm import LLMTrainer, LLMAggregator, save_checkpoint, load_checkpoint from src.peft_utils import set_peft_model_state_dict @@ -373,6 +374,18 @@ def __init__(self, *args, **kwargs): # Whether to save per-round checkpoints (default False) self._enable_round_ckpt = getattr(self.args, "enable_round_checkpoints", False) + # ------------------ Nesterov Momentum Setup (NEW) ------------------ + # Learning rate for the server optimizer (default 1.0 so the server fully + # applies the aggregated update when momentum=0) + self._server_lr = getattr(self.args, "server_lr", 1.0) + # Momentum coefficient. 
Typical values are 0.9 or 0.99 + self._momentum = getattr(self.args, "server_momentum", 0.9) + # Enable / disable Nesterov variant (default=True) + self._nesterov = getattr(self.args, "server_nesterov", True) + # Momentum buffer for each parameter + self._velocity: OrderedDict = OrderedDict() + # ------------------------------------------------------------------- + # ------------------------------------------------------------------ # Internal helpers # ------------------------------------------------------------------ @@ -428,4 +441,58 @@ def set_model_params(self, model_parameters) -> None: self.log(f"set_model_params (server) took {elapsed:.3f}s") - self.log("finished") \ No newline at end of file + self.log("finished") + + def aggregate(self, raw_client_model_list): + """Aggregate client models with Nesterov momentum. + + Steps + ----- + 1. Compute the FedAvg-style weighted average of client models (same as the + default FedML behaviour). + 2. Treat the *difference* between the current global model and the + aggregated model as the (negative) gradient. + 3. Perform an SGD update with momentum on the server side. If + ``self._nesterov`` is ``True``, use the Nesterov variant. + 4. Save the updated parameters via ``set_model_params`` and return them. + """ + self.log("aggregate: start") + + # Step-1: FedAvg aggregation (reuse FedMLAggOperator) + aggregated_params: OrderedDict = FedMLAggOperator.agg(self.args, raw_client_model_list) + + # Step-2: Load current global params (on CPU) + global_params: OrderedDict = self.get_model_params() + + # Step-3: Momentum update + updated_params: OrderedDict = OrderedDict() + for name, global_tensor in global_params.items(): + # Non-floating tensors (e.g. buffers) are copied directly + if not torch.is_floating_point(global_tensor): + updated_params[name] = aggregated_params[name] + continue + + device = global_tensor.device # cuda:0 (or cpu) + agg_tensor = aggregated_params[name].to(device) + grad = global_tensor - agg_tensor + + # Initialise velocity buffer if first time + if name not in self._velocity: + self._velocity[name] = torch.zeros_like(grad) + + # Momentum accumulation + self._velocity[name] = self._momentum * self._velocity[name] + grad + + # Nesterov look-ahead + if self._nesterov: + update = self._momentum * self._velocity[name] + grad + else: + update = self._velocity[name] + + # Parameter update (SGD step) + updated_params[name] = global_tensor - self._server_lr * update + + # Step-4: Push new params to the model & return + self.set_model_params(updated_params) + self.log("aggregate: finished") + return updated_params \ No newline at end of file diff --git a/python/spotlight_prj/fedllm/fedml_config/grpo_gsm8k_test_config.yaml b/python/spotlight_prj/fedllm/fedml_config/grpo_gsm8k_test_config.yaml index f2c3549db..e4dd0dfdd 100644 --- a/python/spotlight_prj/fedllm/fedml_config/grpo_gsm8k_test_config.yaml +++ b/python/spotlight_prj/fedllm/fedml_config/grpo_gsm8k_test_config.yaml @@ -28,7 +28,7 @@ train_args: server_optimizer: "FedAvg" client_num_in_total: 1 # Single client setup client_num_per_round: 1 # Single client setup - comm_round: 3 # Reduced to 3 rounds for testing + comm_round: 1 # Reduced to 3 rounds for testing # GRPO-specific settings for testing grpo_max_steps: 10 # Only 10 training steps per round for quick testing grpo_num_epochs: 1 # Ignored when grpo_max_steps > 0 From 579dd31ddd6cc3fc2b6155b9bfc95f949cf027d7 Mon Sep 17 00:00:00 2001 From: Marcos Villagra Date: Wed, 23 Jul 2025 01:41:51 +0000 Subject: [PATCH 009/168] 
Refactor checkpoint saving logic in FullModelLLMTrainer and FullModelLLMAggregator by removing commented-out code and ensuring consistent execution of checkpointing after aggregation. --- python/spotlight_prj/fedllm/custom_trainer.py | 69 +++++++++---------- 1 file changed, 33 insertions(+), 36 deletions(-) diff --git a/python/spotlight_prj/fedllm/custom_trainer.py b/python/spotlight_prj/fedllm/custom_trainer.py index 7e1ca1436..ffcd104ad 100644 --- a/python/spotlight_prj/fedllm/custom_trainer.py +++ b/python/spotlight_prj/fedllm/custom_trainer.py @@ -242,18 +242,17 @@ def train(self, train_data, device, args): self.model.load_state_dict(trained_state, strict=False) # Optionally save a pre-aggregation checkpoint for this round - """ - if self._enable_round_ckpt: - self.latest_checkpoint_dir = self.checkpoint_dir / f"round_{self.round_idx}_before_agg" - self.log(f"[round-ckpt] Saving GRPO-trained model to \"{self.latest_checkpoint_dir}\"") - - save_checkpoint( - self.model, - self.latest_checkpoint_dir, - is_saving_process=self.training_args.should_save, - synchronize=True - ) - """ + + self.latest_checkpoint_dir = self.checkpoint_dir / f"round_{self.round_idx}_before_agg" + self.log(f"[round-ckpt] Saving GRPO-trained model to \"{self.latest_checkpoint_dir}\"") + + save_checkpoint( + self.model, + self.latest_checkpoint_dir, + is_saving_process=self.training_args.should_save, + synchronize=True + ) + # Clean up fresh model to free memory del fresh_model @@ -285,19 +284,18 @@ def set_model_params(self, model_parameters) -> None: load_state_dict(self.model, model_parameters, strict=False) barrier() - """ - if self._enable_round_ckpt and self.round_idx >= 0 and self.should_save: - # save aggregated model checkpoint - self.latest_checkpoint_dir = self.checkpoint_dir / f"round_{self.round_idx}_after_agg" - self.log(f"saving aggregated model to \"{self.latest_checkpoint_dir}\"") - save_checkpoint( - self.model, - self.latest_checkpoint_dir, - is_saving_process=self.training_args.should_save, - state_dict=model_parameters, - synchronize=True - ) - """ + + + # save aggregated model checkpoint + self.latest_checkpoint_dir = self.checkpoint_dir / f"round_{self.round_idx}_after_agg" + self.log(f"saving aggregated model to \"{self.latest_checkpoint_dir}\"") + save_checkpoint( + self.model, + self.latest_checkpoint_dir, + is_saving_process=self.training_args.should_save, + state_dict=model_parameters, + synchronize=True + ) elapsed = time.perf_counter() - t0 self.log(f"set_model_params (client) took {elapsed:.3f}s") @@ -412,17 +410,16 @@ def set_model_params(self, model_parameters) -> None: load_state_dict(self.model, model_parameters, strict=False) barrier() - if self._enable_round_ckpt and self.round_idx >= 0 and self.should_save: - # save aggregated model checkpoint - self.latest_checkpoint_dir = self.checkpoint_dir / f"round_{self.round_idx}_after_agg" - self.log(f"saving aggregated model to \"{self.latest_checkpoint_dir}\"") - save_checkpoint( - self.model, - self.latest_checkpoint_dir, - is_saving_process=self.training_args.should_save, - state_dict=model_parameters, - synchronize=True - ) + # save aggregated model checkpoint + self.latest_checkpoint_dir = self.checkpoint_dir / f"round_{self.round_idx}_after_agg" + self.log(f"saving aggregated model to \"{self.latest_checkpoint_dir}\"") + save_checkpoint( + self.model, + self.latest_checkpoint_dir, + is_saving_process=self.training_args.should_save, + state_dict=model_parameters, + synchronize=True + ) elapsed = time.perf_counter() - t0 
self.log(f"set_model_params (server) took {elapsed:.3f}s") From b2c987e708762e35ef532320659363d528c6b5ce Mon Sep 17 00:00:00 2001 From: Marcos Villagra Date: Wed, 23 Jul 2025 15:29:16 +0000 Subject: [PATCH 010/168] Add RewardFunction class for evaluating model responses with correctness and format compliance scoring. Introduce DataFormatting and Evaluation classes for data handling and numerical extraction. Update FullModelLLMTrainer to utilize the new reward function. --- python/spotlight_prj/fedllm/custom_trainer.py | 157 ++++++++++++- .../spotlight_prj/fedllm/data_formatting.py | 158 ++++++++++++++ python/spotlight_prj/fedllm/evaluation.py | 206 ++++++++++++++++++ 3 files changed, 520 insertions(+), 1 deletion(-) create mode 100644 python/spotlight_prj/fedllm/data_formatting.py create mode 100644 python/spotlight_prj/fedllm/evaluation.py diff --git a/python/spotlight_prj/fedllm/custom_trainer.py b/python/spotlight_prj/fedllm/custom_trainer.py index fee1c1952..efdd4f32e 100644 --- a/python/spotlight_prj/fedllm/custom_trainer.py +++ b/python/spotlight_prj/fedllm/custom_trainer.py @@ -30,6 +30,156 @@ import time, logging import threading +from data_formatting import DataFormatting +from evaluation import Evaluation + + +class RewardFunction: + + def __init__(self, exact_match_reward, numeric_equivalence_reward, incorrect_answer_reward): + + self.exact_match_reward = exact_match_reward + self.numeric_equivalence_reward = numeric_equivalence_reward + self.incorrect_answer_reward = incorrect_answer_reward + self.dat_fmt = DataFormatting() + self.eval = Evaluation() + + + pass + + + def correctness_reward(self, prompts, completions, answer, **kwargs): + + """ + Assings a reward based on the correctness of the model's answer. + + Args: + prompts (list): A list of input prompts. + completons (list): List of model completions, each containing content. + answer (list): List of expected answers. + **kwargs**: Additional keyword arguments. + + Returns: + list: List of numerical rewards for each completion. + + Explanation: + 1. Extracts content from each completion. + 2. Extracts the answer portion from each response using extrac_answer_from_response + 3. Assigns rewards based on matching criteria: + - 2.0 points for an exact match + - 1.5 points for numeric equivalence (when values match but format differs) + - 0.0 points for incorrect answers + 4. Tracks completion lengths for analysis. + """ + + + + responses = [completion[0] ['content'] for completion in completions] + + extracted = [self.dat_fmt.extract_answer_from_model_output(r) for r in responses] + + rewards = [] + + for r, a in zip(extracted, answer): + + if r==a: # exact match case + rewards.append(self.exact_match_reward) + + else: + #Try numeric equivalence + r_num = self.eval.extract_single_number(str(r)) + a_num = self.eval.extract_single_number(str(a)) + + if r_num is not None and a_num is not None and r_num==a_num: + + rewards.append(self.numeric_equivalence_reward) + + else: + rewards.append(self.incorrect_answer_reward) + + completion_lengths = [len(response.split()) for response in responses] + + return rewards + + + def format_reward(self, completions, **kwargs): + + """ + Assigns a reward for adhering to the XML format. + + Args: + completions (list): List of model completions, each containing content. + + **kwargs** Additional keyward arguments + + Returns: + list: List of format compliace scores for each completion. + + Explanations: + 1. Extracts the content from each completions. + 2. 
Evaluates format compliance by checking for required XML tags: + - 0.2 points for each tag present (, , , ) + - Maximum score of 0.8 for perfect format compliance + 3. Stores and returns the format compliance scores. + """ + + responses = [completion[0]['content'] for completion in completions] + + rewards = [] + + format_scores = [] + + for response in responses: + + score = 0.0 + + if "" in response: score +=0.2 + if "" in response: score +=0.2 + if "" in response: score +=0.2 + if "" in response: score += 0.2 + + rewards.append(score) + return rewards + + + + def combined_reward(self, prompts, completions, answer): + + """ + Combines correctness and format rewards. + + Args: + prompts (list[str]): List of prompt texts + completions (list[list[dict]]): List of completion dictionaries. + answer (list[str]): List of expected answers + + Returns: + list[float]:Combined rewards for each prompt-completion pair + + Explanation: + 1. Calculates separate reward for correctness and format compliance. + 2. Combines the rewards with the following weights: + - correctness score range: 0.0 to 2.0 + - Format score range 0.0 to 0.8 + - Total possible range: 0.0 to 2.8 + 3. Returns the combined reward for each example. + """ + + # Get individual rewards + + correctness_scores = self.correctness_reward(prompts=prompts, completions=completions,answer=answer) + + format_scores = self.format_reward(completions=completions) + + combined_reward = [] + + for c_score, f_score in zip(correctness_scores, format_scores): + + combined_reward.append(c_score + f_score) + + + return combined_reward + class TimedGRPOTrainer(GRPOTrainer): def _make_experience(self, *args, **kwargs): @@ -59,6 +209,11 @@ def __init__(self, *args, **kwargs): # Default: omit per-round checkpoints unless user explicitly enables # them via the FedML YAML (enable_round_checkpoints: true) self._enable_round_ckpt = getattr(self.args, "enable_round_checkpoints", False) + + exact_match_reward = 2.0 + numeric_equivalence_reward=1.5 + incorrect_answer_reward=0.0 + self.rwdfn = RewardFunction(exact_match_reward, numeric_equivalence_reward, incorrect_answer_reward) def reward_fn(self, completions, answer, **_): """Reward function for GSM8K that checks if the predicted answer matches the true answer.""" @@ -211,7 +366,7 @@ def train(self, train_data, device, args): args=cfg, train_dataset=ds.shuffle(seed=cfg.seed), processing_class=fresh_tokenizer, # Use fresh tokenizer - reward_funcs=self.reward_fn, + reward_funcs=self.rwdfn, ) # **FIX: Set generation parameters for numerical stability** diff --git a/python/spotlight_prj/fedllm/data_formatting.py b/python/spotlight_prj/fedllm/data_formatting.py new file mode 100644 index 000000000..047892cc3 --- /dev/null +++ b/python/spotlight_prj/fedllm/data_formatting.py @@ -0,0 +1,158 @@ + +from datasets import load_dataset + + +class DataFormatting: + + def __init__(self): + + self.system_prompt = """ + + Respond in the following format: + + + ... + + + + + ... + + + """ + + + + def extract_answer_from_model_output(self, text): + + """ + Extracts the value from the last tag in the text. + + Args: + text (str): The model generated containing XML-style tags. + + Returs: + str or None: The content inside the tags, or None if no valid answer is found + + Explanation: + 1. Splits the text on the tag to isolate content after the tag. + 2. Checks if at least one tag exists in the text. + 3. For the last segment: + - Verifies it contains a closing + - Extracts only the content between the tags. + 4. 
Returns None if the answer is empty (just "...") or if tags are missing + """ + + + #split on and take everything after the last occurane. + parts = text.split("") + + if len(parts)<2: # No tag found + + return None + + last_part = parts[-1] + + #Extract the content up to + + if "" not in last_part: + return None + + answer = last_part.split("")[0].strip() + + return None if answer =="..." else answer + + + def extract_answer_from_dataset(self, text): + + """ + Extracts the answer from the GSM8K dataset examples. + + Args: + text(str): The dataset example text containing a question and answer + + Returns: + str or None: The extracted answer part after the '####' delimiter, or None + + + Explanation: + + 1. Checks if the text contains the '####' delimiter that separates questions from answers + 2. If found, splits the text at this delimiter and returns the second part + 3. The answer is stripped of leading or trailing white spaces. + 4. Returns None if no delimiter is present. + + """ + + if "####" not in text: + return None + + return text.split("####")[1].strip() + + + + def prepare_dataset(self, split="train"): + + """ + Load and prepare GSM8K dataset for training with string prompts. + + Args: + split(str): The dataset split to load("train" or "test"), Defaults to "train" + + Returns: + list: A list of formatted examples, each containing a prompt string and the role + + Explanation: + 1. Loads GSM8K dataset from Hugging Face dataset hub. + 2. For each example in the dataset: + - Creates a list of messages with system prompt and the question. + - Converts this list into a single string prompt using build_prompt() + - Extracts the answer from the dataset example. + - Creates a list of formatted examples with prompt and answer. + 3. Returns the list of formatted examples ready for model training or evaluation. + """ + + data = load_dataset('openai/gsm8k', 'main')[split] + + formatted_data = [] + + for example in data: + + # convert the list of messages to a single string prompt + + prompt_str = self.build_prompt([ + {"role": "system", "content": self.system_prompt}, + {"role":"user", "content": example["question"]} + ]) + + + formatted_example = { + "prompt":prompt_str, # string rather than a list + "answer": self.extract_answer_from_dataset(example["answer"]) + } + formatted_data.append(formatted_example) + + return formatted_data + + + + def build_prompt(self,messages): + + """ + Build a single prompt string from a list of messages. + + Args: + messages(list): A list of message dictionaries, each with 'role' and 'content' + + Returns: + str: A concatenated string of all message content. + + Explanation: + 1. Takes a list of message dictionaries in typical chat format. + 2. Extracts the 'content' field from each message and strips whitespace. + 3. Joins all content strings with newlines to create a single prompt. + 4. This preserves the training format while converting from structures messages. 
+ """ + + return "\n".join(msg["content"].strip() for msg in messages) + diff --git a/python/spotlight_prj/fedllm/evaluation.py b/python/spotlight_prj/fedllm/evaluation.py new file mode 100644 index 000000000..c1cfa94f3 --- /dev/null +++ b/python/spotlight_prj/fedllm/evaluation.py @@ -0,0 +1,206 @@ + +import re + +import torch +from data_formatting import DataFormatting + + +class Evaluation: + + def __init__(self): + self.dat_fmt = DataFormatting() + + + + + def extract_last_number(self, text): + + """ + Extracts the last number appearing in the text + + Args: + text (str): The text to extract a number from. + + Returns: + float or None: The last number in the text, or None if no number is found + + + Explanation: + 1. Removes dollar signs and percentage symbols from text. + 2. Users regex to find a number that appeares at the end of the text. + 3. The pattern matches numbers that appear at the end of the string. + 4 Return the found number as float, or None if no match is found. + """ + + text = text.replace('$', '').replace('%','') + + pattern = r'(?:^|\s|=)\s*(-?\d*\.?\d+)\s*$' + + match = re.search(pattern, text) + + return float(match.group(1)) if match else None + + + + + def extract_single_number(self, text): + + """ + Extracts a single number from text if exactly one number is present. + + Args: + text (str): The text to extract number from. + + Returns: + float or None: The single number in the text, or None if zero or multiple numbers. + + Explanation: + 1. Uses regex to find all numbers in the text including the negative numebers. + 2. If exactly one number if found, returns it as float. + 3. If zero or multiple numbers are found, returns None. + + """ + + numbers =re.findall(r'-?\d*\.?\d+', text) + #print("NUMBERS ARE:::", numbers) + + if len(numbers)==0: + return None + elif len(numbers)==1: + return float(numbers[0]) + + else: + return None + + + + def evaluate_model(self, model, tokenizer, eval_samples, device): + + """ + Evaluates the model on a set of examples and prints detailed results. + + Args: + model: The language model to evaluate. + tokenizer: The tokenizer for encoding inputs and decoding outputs. + eval_samples (list): List of evaluation examples each containing "prompt" and "answer" + device: The device (CPU or GPU) to run evaluation on + + Return: + float: The accuracy percentage (correct predictions / total examples * 100) + + + Explanation: + 1. Sets the model to evaluation mode. + 2. For each example in the evaluation set: + - Encodes the prompt and generates a respnse using the model + - Extracts the predicted answer from the generated response + - Compares the predicted answer with the expected answer using multiple methods + + a. Extract string matching + b. Single number extraction and comparion. + c. Last number extraction and comparison + -Prints detailed information about each example + 3. Calculates and returns the overall accuracy. + 4. Returns the model to training mode. 
+ + """ + + + model.eval() + + correct = 0 + + total = len(eval_samples) + + print("\n" + "="*50) + print("EVALUATION ON", total, "EXAMPLES") + print("="*50) + + + for example in eval_samples: + + #get the prompt and expected answer + + full_prompt = example["prompt"] + expected = example["answer"] + + #Tokenize and generate response + + inputs = tokenizer(full_prompt, return_tensors='pt', padding=False, truncation=False, return_attention_mask=True).to(device) + + with torch.no_grad(): + + outputs = model.generate( + input_ids = inputs["input_ids"], + attention_mask=inputs["attention_mask"], + max_new_tokens=512, + temperature=0.7, + num_return_sequences=1, + pad_token_id = tokenizer.pad_token_id, + eos_token_id = tokenizer.eos_token_id, + forced_eos_token_id = tokenizer.eos_token_id, + early_stopping = False, + ) + + response = tokenizer.decode(outputs[0], skip_special_tokens=True) + + try: + #Extract answers and check correctness + predicted = self.dat_fmt.extract_answer_from_model_output(response) + + #Try different matching method + + if predicted == expected : # Exact match + + is_correct = True + + else: + # Try single number matchin + pred_num = self.extract_single_number(str(predicted)) + exp_num = self.extract_single_number(str(expected)) + + if pred_num is not None and exp_num is not None and pred_num==exp_num: + + is_correct = True + else: + #Try the last number matchin + pre_num = self.extract_last_number(str(predicted)) + exp_num = self.extract_last_number(str(expected)) + + is_correct = (pred_num is not None and exp_num is not None and pred_num == exp_num) + + if is_correct: + correct+=1 + + + # Print evaluation results + + print("\nPrompt:") + print(full_prompt) + print("\nExpected Answer:") + print(expected) + print("\nExtracted Answer:") + print(predicted) + print("\nFull Generated Response:") + print(response) + print("\nCorrect:", "✓" if is_correct else "✗") + print("--"*50) + + except Exception as e: + + print("\nFailed to parse the model output from prompt:") + print(full_prompt) + print('Error:',e) + print('-'*50) + + + accuracy = (correct / total) * 100 + + print(f"\nAccuracy: {accuracy:.2f}% ({correct}/{total})" ) + + # return the model to training mode + model.train() + + return accuracy + + From 37ad46e0d5f00afc7dd11eb5ec98bb2afeb2f499 Mon Sep 17 00:00:00 2001 From: Marcos Villagra Date: Wed, 23 Jul 2025 15:39:06 +0000 Subject: [PATCH 011/168] Update logging format in FedMLServerManager to include total rounds and adjust communication rounds in grpo_gsm8k_test_config.yaml from 1 to 2 for testing. 
--- python/fedml/cross_silo/server/fedml_server_manager.py | 3 +-- .../fedllm/fedml_config/grpo_gsm8k_test_config.yaml | 2 +- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/python/fedml/cross_silo/server/fedml_server_manager.py b/python/fedml/cross_silo/server/fedml_server_manager.py index 1905c7f1d..eb934508d 100644 --- a/python/fedml/cross_silo/server/fedml_server_manager.py +++ b/python/fedml/cross_silo/server/fedml_server_manager.py @@ -261,8 +261,7 @@ def handle_message_receive_model_from_client(self, msg_params): # Record timestamp for the next round self._last_round_end_ts = current_ts - logging.info("\n\n==========end {}-th round training===========\n".format(self.args.round_idx)) - logging.info(f"Number of rounds: {self.round_num}") + logging.info("\n\n==========end {}/{}-th round training===========\n".format(self.args.round_idx, self.round_num)) if self.args.round_idx < self.round_num: mlops.event("server.wait", event_started=True, event_value=str(self.args.round_idx)) diff --git a/python/spotlight_prj/fedllm/fedml_config/grpo_gsm8k_test_config.yaml b/python/spotlight_prj/fedllm/fedml_config/grpo_gsm8k_test_config.yaml index e4dd0dfdd..733b2aa16 100644 --- a/python/spotlight_prj/fedllm/fedml_config/grpo_gsm8k_test_config.yaml +++ b/python/spotlight_prj/fedllm/fedml_config/grpo_gsm8k_test_config.yaml @@ -28,7 +28,7 @@ train_args: server_optimizer: "FedAvg" client_num_in_total: 1 # Single client setup client_num_per_round: 1 # Single client setup - comm_round: 1 # Reduced to 3 rounds for testing + comm_round: 2 # Reduced to 3 rounds for testing # GRPO-specific settings for testing grpo_max_steps: 10 # Only 10 training steps per round for quick testing grpo_num_epochs: 1 # Ignored when grpo_max_steps > 0 From b5fb68652faf35b39e97e066ad98c61ae54cc639 Mon Sep 17 00:00:00 2001 From: Marcos Villagra Date: Wed, 23 Jul 2025 18:17:15 +0000 Subject: [PATCH 012/168] Refactor correctness_reward and combined_reward methods in RewardFunction class to streamline input parameters and improve clarity. Update FullModelLLMTrainer to utilize the revised combined_reward function. --- python/spotlight_prj/fedllm/custom_trainer.py | 28 +++++++------------ 1 file changed, 10 insertions(+), 18 deletions(-) diff --git a/python/spotlight_prj/fedllm/custom_trainer.py b/python/spotlight_prj/fedllm/custom_trainer.py index efdd4f32e..171331aae 100644 --- a/python/spotlight_prj/fedllm/custom_trainer.py +++ b/python/spotlight_prj/fedllm/custom_trainer.py @@ -48,7 +48,7 @@ def __init__(self, exact_match_reward, numeric_equivalence_reward, incorrect_ans pass - def correctness_reward(self, prompts, completions, answer, **kwargs): + def correctness_reward(self, completions, answer, **kwargs): """ Assings a reward based on the correctness of the model's answer. @@ -72,33 +72,25 @@ def correctness_reward(self, prompts, completions, answer, **kwargs): 4. Tracks completion lengths for analysis. 
""" - - - responses = [completion[0] ['content'] for completion in completions] - - extracted = [self.dat_fmt.extract_answer_from_model_output(r) for r in responses] - rewards = [] - for r, a in zip(extracted, answer): + for c, a in zip(completions, answer): - if r==a: # exact match case + if c==a: # exact match case rewards.append(self.exact_match_reward) else: #Try numeric equivalence - r_num = self.eval.extract_single_number(str(r)) + c_num = self.eval.extract_single_number(str(c)) a_num = self.eval.extract_single_number(str(a)) - if r_num is not None and a_num is not None and r_num==a_num: + if c_num is not None and a_num is not None and c_num==a_num: rewards.append(self.numeric_equivalence_reward) else: rewards.append(self.incorrect_answer_reward) - completion_lengths = [len(response.split()) for response in responses] - return rewards @@ -141,9 +133,9 @@ def format_reward(self, completions, **kwargs): rewards.append(score) return rewards + - - def combined_reward(self, prompts, completions, answer): + def combined_reward(self, completions, answer, **_): """ Combines correctness and format rewards. @@ -167,7 +159,7 @@ def combined_reward(self, prompts, completions, answer): # Get individual rewards - correctness_scores = self.correctness_reward(prompts=prompts, completions=completions,answer=answer) + correctness_scores = self.correctness_reward(completions=completions,answer=answer) format_scores = self.format_reward(completions=completions) @@ -175,7 +167,7 @@ def combined_reward(self, prompts, completions, answer): for c_score, f_score in zip(correctness_scores, format_scores): - combined_reward.append(c_score + f_score) + combined_reward.append(c_score) return combined_reward @@ -366,7 +358,7 @@ def train(self, train_data, device, args): args=cfg, train_dataset=ds.shuffle(seed=cfg.seed), processing_class=fresh_tokenizer, # Use fresh tokenizer - reward_funcs=self.rwdfn, + reward_funcs=self.rwdfn.combined_reward, ) # **FIX: Set generation parameters for numerical stability** From 91642a41a42cf28732421ae8a1465cca81e5d3ea Mon Sep 17 00:00:00 2001 From: Marcos Villagra Date: Wed, 23 Jul 2025 18:56:33 +0000 Subject: [PATCH 013/168] Remove the format_reward method from the RewardFunction class to streamline reward calculation, focusing solely on correctness scoring in the combined_reward method. --- python/spotlight_prj/fedllm/custom_trainer.py | 44 +------------------ 1 file changed, 1 insertion(+), 43 deletions(-) diff --git a/python/spotlight_prj/fedllm/custom_trainer.py b/python/spotlight_prj/fedllm/custom_trainer.py index 171331aae..a4bdd99bb 100644 --- a/python/spotlight_prj/fedllm/custom_trainer.py +++ b/python/spotlight_prj/fedllm/custom_trainer.py @@ -93,46 +93,6 @@ def correctness_reward(self, completions, answer, **kwargs): return rewards - - def format_reward(self, completions, **kwargs): - - """ - Assigns a reward for adhering to the XML format. - - Args: - completions (list): List of model completions, each containing content. - - **kwargs** Additional keyward arguments - - Returns: - list: List of format compliace scores for each completion. - - Explanations: - 1. Extracts the content from each completions. - 2. Evaluates format compliance by checking for required XML tags: - - 0.2 points for each tag present (, , , ) - - Maximum score of 0.8 for perfect format compliance - 3. Stores and returns the format compliance scores. 
- """ - - responses = [completion[0]['content'] for completion in completions] - - rewards = [] - - format_scores = [] - - for response in responses: - - score = 0.0 - - if "" in response: score +=0.2 - if "" in response: score +=0.2 - if "" in response: score +=0.2 - if "" in response: score += 0.2 - - rewards.append(score) - return rewards - def combined_reward(self, completions, answer, **_): @@ -161,11 +121,9 @@ def combined_reward(self, completions, answer, **_): correctness_scores = self.correctness_reward(completions=completions,answer=answer) - format_scores = self.format_reward(completions=completions) - combined_reward = [] - for c_score, f_score in zip(correctness_scores, format_scores): + for c_score in correctness_scores: combined_reward.append(c_score) From eaa734421307f7f26029c39d12a9ba05a56d6777 Mon Sep 17 00:00:00 2001 From: Marcos Villagra Date: Wed, 23 Jul 2025 23:32:25 +0000 Subject: [PATCH 014/168] Increase max_new_tokens in FullModelLLMTrainer from 512 to 1024 for improved model output capacity. Update grpo_gsm8k_test_config.yaml to change client setup from 1 to 2 clients for enhanced testing scenarios. --- python/spotlight_prj/fedllm/custom_trainer.py | 2 +- .../fedllm/fedml_config/grpo_gsm8k_test_config.yaml | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/python/spotlight_prj/fedllm/custom_trainer.py b/python/spotlight_prj/fedllm/custom_trainer.py index a4bdd99bb..c80852d21 100644 --- a/python/spotlight_prj/fedllm/custom_trainer.py +++ b/python/spotlight_prj/fedllm/custom_trainer.py @@ -327,7 +327,7 @@ def train(self, train_data, device, args): "top_k": 50, "pad_token_id": fresh_tokenizer.eos_token_id, "eos_token_id": fresh_tokenizer.eos_token_id, - "max_new_tokens": 512, + "max_new_tokens": 1024, "repetition_penalty": 1.1, # Prevent repetition "length_penalty": 1.0, # Neutral length penalty } diff --git a/python/spotlight_prj/fedllm/fedml_config/grpo_gsm8k_test_config.yaml b/python/spotlight_prj/fedllm/fedml_config/grpo_gsm8k_test_config.yaml index 733b2aa16..c94d8af72 100644 --- a/python/spotlight_prj/fedllm/fedml_config/grpo_gsm8k_test_config.yaml +++ b/python/spotlight_prj/fedllm/fedml_config/grpo_gsm8k_test_config.yaml @@ -26,8 +26,8 @@ train_args: federated_optimizer: "FedAvg" client_optimizer: "adamw_torch" server_optimizer: "FedAvg" - client_num_in_total: 1 # Single client setup - client_num_per_round: 1 # Single client setup + client_num_in_total: 2 # Single client setup + client_num_per_round: 2 # Single client setup comm_round: 2 # Reduced to 3 rounds for testing # GRPO-specific settings for testing grpo_max_steps: 10 # Only 10 training steps per round for quick testing From afc697cb5ca95dbeae8f30017dbfb2c29695abdb Mon Sep 17 00:00:00 2001 From: Marcos Villagra Date: Wed, 23 Jul 2025 23:38:20 +0000 Subject: [PATCH 015/168] Add report_to parameter for Weights & Biases integration in FullModelLLMTrainer to enhance experiment tracking. 
--- python/spotlight_prj/fedllm/custom_trainer.py | 1 + 1 file changed, 1 insertion(+) diff --git a/python/spotlight_prj/fedllm/custom_trainer.py b/python/spotlight_prj/fedllm/custom_trainer.py index c80852d21..7b0750913 100644 --- a/python/spotlight_prj/fedllm/custom_trainer.py +++ b/python/spotlight_prj/fedllm/custom_trainer.py @@ -305,6 +305,7 @@ def train(self, train_data, device, args): save_steps=grpo_max_steps if grpo_max_steps > 0 else 500, # Save at the end if using max_steps # Add seed for reproducibility in federated setting seed=42 + self.round_idx * 100 + args.rank, # Different seed per round and client + report_to="wandb", ) self.log(f"GRPO Config - bf16: {use_bf16}, fp16: {not use_bf16}, batch_size: {grpo_batch_size}") From 99434eddbce9b44e72ed08ae1c482c2f14ba41cc Mon Sep 17 00:00:00 2001 From: Marcos Villagra Date: Thu, 24 Jul 2025 00:31:09 +0000 Subject: [PATCH 016/168] Refactor reward function usage in FullModelLLMTrainer to utilize reward_fn instead of combined_reward, enhancing clarity and consistency in reward calculation. --- python/spotlight_prj/fedllm/custom_trainer.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/python/spotlight_prj/fedllm/custom_trainer.py b/python/spotlight_prj/fedllm/custom_trainer.py index 7b0750913..d375a6e12 100644 --- a/python/spotlight_prj/fedllm/custom_trainer.py +++ b/python/spotlight_prj/fedllm/custom_trainer.py @@ -47,7 +47,6 @@ def __init__(self, exact_match_reward, numeric_equivalence_reward, incorrect_ans pass - def correctness_reward(self, completions, answer, **kwargs): """ @@ -317,7 +316,7 @@ def train(self, train_data, device, args): args=cfg, train_dataset=ds.shuffle(seed=cfg.seed), processing_class=fresh_tokenizer, # Use fresh tokenizer - reward_funcs=self.rwdfn.combined_reward, + reward_funcs=self.reward_fn, ) # **FIX: Set generation parameters for numerical stability** From ee38a160cf652df091b7e114ac8e8664f51b44a8 Mon Sep 17 00:00:00 2001 From: Marcos Villagra Date: Thu, 24 Jul 2025 01:01:35 +0000 Subject: [PATCH 017/168] Refactor reward function parameters in FullModelLLMTrainer to enhance clarity and maintainability. Update reward_fn to utilize class attributes for reward values, improving consistency in reward calculations. 
--- python/spotlight_prj/fedllm/custom_trainer.py | 35 ++++++++++--------- 1 file changed, 19 insertions(+), 16 deletions(-) diff --git a/python/spotlight_prj/fedllm/custom_trainer.py b/python/spotlight_prj/fedllm/custom_trainer.py index d375a6e12..70303e406 100644 --- a/python/spotlight_prj/fedllm/custom_trainer.py +++ b/python/spotlight_prj/fedllm/custom_trainer.py @@ -159,28 +159,31 @@ def __init__(self, *args, **kwargs): # them via the FedML YAML (enable_round_checkpoints: true) self._enable_round_ckpt = getattr(self.args, "enable_round_checkpoints", False) - exact_match_reward = 2.0 - numeric_equivalence_reward=1.5 - incorrect_answer_reward=0.0 - self.rwdfn = RewardFunction(exact_match_reward, numeric_equivalence_reward, incorrect_answer_reward) + self.exact_match_reward = 2.0 + self.numeric_equivalence_reward=1.5 + self.incorrect_answer_reward=0.0 + self.rwdfn = RewardFunction(self.exact_match_reward, self.numeric_equivalence_reward, self.incorrect_answer_reward) def reward_fn(self, completions, answer, **_): """Reward function for GSM8K that checks if the predicted answer matches the true answer.""" out = [] for c, ans in zip(completions, answer): - # Extract from dataset answer (GSM8K format) - tru = self.DATASET_ANS.search(ans) - # Extract from model completion (boxed format, fallback to GSM8K format) - pred = self.MODEL_ANS.search(c) - if not pred: - pred = self.DATASET_ANS.search(c) - - if pred and tru: - pred_num = pred.group(1) - tru_num = tru.group(1) - out.append(1.0 if pred_num == tru_num else -0.2) + if c==ans: + out.append(self.exact_match_reward) else: - out.append(-0.2) + # Extract from dataset answer (GSM8K format) + tru = self.DATASET_ANS.search(ans) + # Extract from model completion (boxed format, fallback to GSM8K format) + pred = self.MODEL_ANS.search(c) + if not pred: + pred = self.DATASET_ANS.search(c) + + if pred and tru: + pred_num = pred.group(1) + tru_num = tru.group(1) + out.append(self.numeric_equivalence_reward if pred_num == tru_num else self.incorrect_answer_reward) + else: + out.append(self.incorrect_answer_reward) return out def train(self, train_data, device, args): From f4b4f1d515da23de7d9855250b883f66b252fd6b Mon Sep 17 00:00:00 2001 From: Marcos Villagra Date: Thu, 24 Jul 2025 01:32:25 +0000 Subject: [PATCH 018/168] Update gradient accumulation steps and batch sizes in grpo_gsm8k_test_config.yaml for improved testing. Adjust timeout duration in run_fedml_client_custom.sh and run_fedml_server_custom.sh scripts to accommodate longer execution times. 
--- python/spotlight_prj/fedllm/custom_trainer.py | 2 +- .../fedllm/fedml_config/grpo_gsm8k_test_config.yaml | 8 ++++---- .../fedllm/scripts/run_fedml_client_custom.sh | 2 +- .../fedllm/scripts/run_fedml_server_custom.sh | 2 +- 4 files changed, 7 insertions(+), 7 deletions(-) diff --git a/python/spotlight_prj/fedllm/custom_trainer.py b/python/spotlight_prj/fedllm/custom_trainer.py index 70303e406..92a7cf76c 100644 --- a/python/spotlight_prj/fedllm/custom_trainer.py +++ b/python/spotlight_prj/fedllm/custom_trainer.py @@ -201,7 +201,7 @@ def train(self, train_data, device, args): # Calculate effective batch size for GRPO constraint # effective_batch_size = num_gpus * per_device_batch_size * gradient_accumulation_steps - gradient_accumulation_steps = 4 + gradient_accumulation_steps = getattr(args, 'gradient_accumulation_steps', 2) effective_batch_size = 1 * grpo_batch_size * gradient_accumulation_steps # Num generations must evenly divide the effective batch size diff --git a/python/spotlight_prj/fedllm/fedml_config/grpo_gsm8k_test_config.yaml b/python/spotlight_prj/fedllm/fedml_config/grpo_gsm8k_test_config.yaml index c94d8af72..3b18e235e 100644 --- a/python/spotlight_prj/fedllm/fedml_config/grpo_gsm8k_test_config.yaml +++ b/python/spotlight_prj/fedllm/fedml_config/grpo_gsm8k_test_config.yaml @@ -28,11 +28,11 @@ train_args: server_optimizer: "FedAvg" client_num_in_total: 2 # Single client setup client_num_per_round: 2 # Single client setup - comm_round: 2 # Reduced to 3 rounds for testing + comm_round: 200 # Reduced to 3 rounds for testing # GRPO-specific settings for testing - grpo_max_steps: 10 # Only 10 training steps per round for quick testing + grpo_max_steps: 150 # Only 10 training steps per round for quick testing grpo_num_epochs: 1 # Ignored when grpo_max_steps > 0 - grpo_batch_size: 2 # Smaller batch size for faster testing + grpo_batch_size: 32 # Smaller batch size for faster testing # FedML training settings (ignored when using GRPO) local_num_train_epochs: 1 local_max_steps: -1 @@ -45,7 +45,7 @@ train_args: gradient_checkpointing: False # Match GRPO config per_device_train_batch_size: 4 # Will be overridden by GRPO per_device_eval_batch_size: 8 - gradient_accumulation_steps: 1 # Will be overridden by GRPO + gradient_accumulation_steps: 2 # Will be overridden by GRPO eval_accumulation_steps: 4 learning_rate: 5e-6 # Will be overridden by GRPO warmup_steps: 0 diff --git a/python/spotlight_prj/fedllm/scripts/run_fedml_client_custom.sh b/python/spotlight_prj/fedllm/scripts/run_fedml_client_custom.sh index 29288d35d..e878a2053 100755 --- a/python/spotlight_prj/fedllm/scripts/run_fedml_client_custom.sh +++ b/python/spotlight_prj/fedllm/scripts/run_fedml_client_custom.sh @@ -25,7 +25,7 @@ LAUNCHER="${6:-"auto"}" CONFIG_PATH="${7:-"fedml_config/grpo_gsm8k_test_config.yaml"}" # Use the custom launcher that properly handles non-PEFT models -timeout --signal=SIGINT --kill-after=30s 21600 python3 launch_fedllm_custom.py \ +timeout --signal=SIGINT --kill-after=30s 28800 python3 launch_fedllm_custom.py \ --cf "${CONFIG_PATH}" \ --rank "${RANK}" \ --role client \ diff --git a/python/spotlight_prj/fedllm/scripts/run_fedml_server_custom.sh b/python/spotlight_prj/fedllm/scripts/run_fedml_server_custom.sh index e50d58f85..d8f708991 100755 --- a/python/spotlight_prj/fedllm/scripts/run_fedml_server_custom.sh +++ b/python/spotlight_prj/fedllm/scripts/run_fedml_server_custom.sh @@ -26,7 +26,7 @@ LAUNCHER="${6:-"auto"}" CONFIG_PATH="${7:-"fedml_config/fedml_config.yaml"}" # Use the custom launcher that 
properly handles non-PEFT models -timeout --signal=SIGINT --kill-after=30s 21600 python3 launch_fedllm_custom.py \ +timeout --signal=SIGINT --kill-after=30s 28800 python3 launch_fedllm_custom.py \ --cf "${CONFIG_PATH}" \ --rank "${RANK}" \ --role server \ From 740eed761221d8a00f5f65aacbfc49e91dcc12c9 Mon Sep 17 00:00:00 2001 From: Marcos Villagra Date: Thu, 24 Jul 2025 01:49:11 +0000 Subject: [PATCH 019/168] Update grpo_batch_size in grpo_gsm8k_test_config.yaml from 32 to 2 for faster testing iterations. --- .../fedllm/fedml_config/grpo_gsm8k_test_config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/spotlight_prj/fedllm/fedml_config/grpo_gsm8k_test_config.yaml b/python/spotlight_prj/fedllm/fedml_config/grpo_gsm8k_test_config.yaml index 3b18e235e..7b370489e 100644 --- a/python/spotlight_prj/fedllm/fedml_config/grpo_gsm8k_test_config.yaml +++ b/python/spotlight_prj/fedllm/fedml_config/grpo_gsm8k_test_config.yaml @@ -32,7 +32,7 @@ train_args: # GRPO-specific settings for testing grpo_max_steps: 150 # Only 10 training steps per round for quick testing grpo_num_epochs: 1 # Ignored when grpo_max_steps > 0 - grpo_batch_size: 32 # Smaller batch size for faster testing + grpo_batch_size: 2 # Smaller batch size for faster testing # FedML training settings (ignored when using GRPO) local_num_train_epochs: 1 local_max_steps: -1 From cd16d020a3f534b6475aeaf3cf6da7a15a7abbfd Mon Sep 17 00:00:00 2001 From: Marcos Villagra Date: Thu, 24 Jul 2025 14:17:11 +0000 Subject: [PATCH 020/168] Update FullModelLLMTrainer to correctly handle model state dict loading and improve logging of missing/unexpected keys. Adjust grpo_gsm8k_test_config.yaml for testing parameters, reducing communication rounds and training steps for quicker iterations. --- python/spotlight_prj/fedllm/custom_trainer.py | 5 +++-- .../fedllm/fedml_config/grpo_gsm8k_test_config.yaml | 6 +++--- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/python/spotlight_prj/fedllm/custom_trainer.py b/python/spotlight_prj/fedllm/custom_trainer.py index 92a7cf76c..87a245fcc 100644 --- a/python/spotlight_prj/fedllm/custom_trainer.py +++ b/python/spotlight_prj/fedllm/custom_trainer.py @@ -263,12 +263,13 @@ def train(self, train_data, device, args): self.log("Copying trained weights to fresh model") # Get the current model state dict (handling potential PEFT wrapping) if isinstance(self.model, PeftModel): - current_state = self.model.base_model.state_dict() + current_state = self.model.state_dict() else: current_state = self.model.state_dict() # Load into fresh model - fresh_model.load_state_dict(current_state, strict=False) + incompatible = fresh_model.load_state_dict(current_state, strict=True) + logging.info("missing:", incompatible.missing_keys, "unexpected:", incompatible.unexpected_keys) # Move fresh model to correct device fresh_model.to(device) diff --git a/python/spotlight_prj/fedllm/fedml_config/grpo_gsm8k_test_config.yaml b/python/spotlight_prj/fedllm/fedml_config/grpo_gsm8k_test_config.yaml index 7b370489e..13cecf5e1 100644 --- a/python/spotlight_prj/fedllm/fedml_config/grpo_gsm8k_test_config.yaml +++ b/python/spotlight_prj/fedllm/fedml_config/grpo_gsm8k_test_config.yaml @@ -28,11 +28,11 @@ train_args: server_optimizer: "FedAvg" client_num_in_total: 2 # Single client setup client_num_per_round: 2 # Single client setup - comm_round: 200 # Reduced to 3 rounds for testing + comm_round: 3 # Reduced to 3 rounds for testing # GRPO-specific settings for testing - grpo_max_steps: 150 # Only 10 training steps per 
round for quick testing + grpo_max_steps: 10 # Only 10 training steps per round for quick testing grpo_num_epochs: 1 # Ignored when grpo_max_steps > 0 - grpo_batch_size: 2 # Smaller batch size for faster testing + grpo_batch_size: 4 # Smaller batch size for faster testing # FedML training settings (ignored when using GRPO) local_num_train_epochs: 1 local_max_steps: -1 From f8261bb2d5ef484bb411bf8c59a5b682c6a6d3e9 Mon Sep 17 00:00:00 2001 From: Marcos Villagra Date: Thu, 24 Jul 2025 16:07:40 +0000 Subject: [PATCH 021/168] Refactor state dict loading in FullModelLLMTrainer to correctly access base model state. Enhance logging to provide clearer output of missing and unexpected keys during model loading for improved debugging. --- python/spotlight_prj/fedllm/custom_trainer.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/python/spotlight_prj/fedllm/custom_trainer.py b/python/spotlight_prj/fedllm/custom_trainer.py index 87a245fcc..a012ad6c9 100644 --- a/python/spotlight_prj/fedllm/custom_trainer.py +++ b/python/spotlight_prj/fedllm/custom_trainer.py @@ -263,13 +263,16 @@ def train(self, train_data, device, args): self.log("Copying trained weights to fresh model") # Get the current model state dict (handling potential PEFT wrapping) if isinstance(self.model, PeftModel): - current_state = self.model.state_dict() + current_state = self.model.base_model.state_dict() else: current_state = self.model.state_dict() # Load into fresh model incompatible = fresh_model.load_state_dict(current_state, strict=True) - logging.info("missing:", incompatible.missing_keys, "unexpected:", incompatible.unexpected_keys) + # Log any keys that failed to load for easier debugging + self.log( + f"missing keys: {incompatible.missing_keys}, unexpected keys: {incompatible.unexpected_keys}" + ) # Move fresh model to correct device fresh_model.to(device) From c7848a38a5c0c081ce65afbe0e237d6d0a8af00e Mon Sep 17 00:00:00 2001 From: Marcos Villagra Date: Thu, 24 Jul 2025 16:17:47 +0000 Subject: [PATCH 022/168] Enhance checkpoint saving in FullModelLLMAggregator to utilize HuggingFace's save_pretrained method for improved compatibility. Implement fallback mechanism for models lacking this method, ensuring robust checkpointing during training. --- python/spotlight_prj/fedllm/custom_trainer.py | 25 ++++++++++++++----- 1 file changed, 19 insertions(+), 6 deletions(-) diff --git a/python/spotlight_prj/fedllm/custom_trainer.py b/python/spotlight_prj/fedllm/custom_trainer.py index a012ad6c9..36c6663b7 100644 --- a/python/spotlight_prj/fedllm/custom_trainer.py +++ b/python/spotlight_prj/fedllm/custom_trainer.py @@ -509,12 +509,25 @@ def _periodic_checkpoint_loop(self): ts = int(time.time()) ckpt_dir = self.checkpoint_dir / f"wallclock_{ts}" self.log(f"Periodic checkpoint → {ckpt_dir}") - save_checkpoint( - self.model, - checkpoint_dir=ckpt_dir, - is_saving_process=self.training_args.should_save, - synchronize=True, - ) + # Always save checkpoints in the standard HuggingFace format so that + # the resulting directory can be loaded with `from_pretrained`. + # Only the main process writes the checkpoint to avoid race conditions + # (the background thread is spawned exclusively on the main process). + if self.training_args.should_save: + ckpt_dir.mkdir(parents=True, exist_ok=True) + try: + # Try the native HuggingFace save. + # For `PeftModel` this will also persist the adapter weights. 
+ self.model.save_pretrained(str(ckpt_dir), state_dict=self.model.state_dict()) + except AttributeError: + # Fallback to the generic helper if the model doesn't implement + # `save_pretrained` (unlikely for LLMs but safe-guard regardless). + save_checkpoint( + self.model, + checkpoint_dir=ckpt_dir, + is_saving_process=True, + synchronize=False, + ) except Exception as e: # Log and continue – do not crash training due to checkpoint failure self.log(f"[WARN] Periodic checkpoint failed: {e}") From 8732a294af601fa313477e206f1605b1066bb2fd Mon Sep 17 00:00:00 2001 From: Marcos Villagra Date: Thu, 24 Jul 2025 16:32:34 +0000 Subject: [PATCH 023/168] Update communication rounds in grpo_gsm8k_test_config.yaml from 3 to 30 for extended testing iterations. --- .../fedllm/fedml_config/grpo_gsm8k_test_config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/spotlight_prj/fedllm/fedml_config/grpo_gsm8k_test_config.yaml b/python/spotlight_prj/fedllm/fedml_config/grpo_gsm8k_test_config.yaml index 13cecf5e1..7f7733ad5 100644 --- a/python/spotlight_prj/fedllm/fedml_config/grpo_gsm8k_test_config.yaml +++ b/python/spotlight_prj/fedllm/fedml_config/grpo_gsm8k_test_config.yaml @@ -28,7 +28,7 @@ train_args: server_optimizer: "FedAvg" client_num_in_total: 2 # Single client setup client_num_per_round: 2 # Single client setup - comm_round: 3 # Reduced to 3 rounds for testing + comm_round: 30 # Reduced to 3 rounds for testing # GRPO-specific settings for testing grpo_max_steps: 10 # Only 10 training steps per round for quick testing grpo_num_epochs: 1 # Ignored when grpo_max_steps > 0 From 85220ca5669b907c8346350076dad76d877daaf3 Mon Sep 17 00:00:00 2001 From: Marcos Villagra Date: Thu, 24 Jul 2025 17:33:06 +0000 Subject: [PATCH 024/168] Add debugging breakpoint in reward function of FullModelLLMTrainer for improved troubleshooting during answer validation. --- python/spotlight_prj/fedllm/custom_trainer.py | 1 + 1 file changed, 1 insertion(+) diff --git a/python/spotlight_prj/fedllm/custom_trainer.py b/python/spotlight_prj/fedllm/custom_trainer.py index 36c6663b7..f81c5ce4e 100644 --- a/python/spotlight_prj/fedllm/custom_trainer.py +++ b/python/spotlight_prj/fedllm/custom_trainer.py @@ -168,6 +168,7 @@ def reward_fn(self, completions, answer, **_): """Reward function for GSM8K that checks if the predicted answer matches the true answer.""" out = [] for c, ans in zip(completions, answer): + import ipdb; ipdb.set_trace() if c==ans: out.append(self.exact_match_reward) else: From dd8a8960468f29ff6780b5cc12cc114e12dd796e Mon Sep 17 00:00:00 2001 From: Marcos Villagra Date: Thu, 24 Jul 2025 17:33:47 +0000 Subject: [PATCH 025/168] Update client configuration in grpo_gsm8k_test_config.yaml to use a single client setup for testing, adjusting client_num_in_total and client_num_per_round to 1. 
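Side note on the single-client setting: with client_num_per_round set to 1, the FedAvg step on the server reduces to an identity copy of that client's weights. An illustrative sketch of the sample-weighted average (not the FedML implementation; the helper below is hypothetical):

from collections import OrderedDict
import torch

def fedavg(client_states, sample_counts):
    """Sample-weighted average of state dicts; with one client it returns its state unchanged."""
    total = float(sum(sample_counts))
    avg = OrderedDict()
    for key in client_states[0]:
        avg[key] = sum((n / total) * sd[key] for n, sd in zip(sample_counts, client_states))
    return avg

single = [OrderedDict(weight=torch.tensor([1.0, 2.0]))]
print(fedavg(single, [10]))   # identical to the lone client's parameters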
--- .../fedllm/fedml_config/grpo_gsm8k_test_config.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/spotlight_prj/fedllm/fedml_config/grpo_gsm8k_test_config.yaml b/python/spotlight_prj/fedllm/fedml_config/grpo_gsm8k_test_config.yaml index 7f7733ad5..5b20bfe91 100644 --- a/python/spotlight_prj/fedllm/fedml_config/grpo_gsm8k_test_config.yaml +++ b/python/spotlight_prj/fedllm/fedml_config/grpo_gsm8k_test_config.yaml @@ -26,8 +26,8 @@ train_args: federated_optimizer: "FedAvg" client_optimizer: "adamw_torch" server_optimizer: "FedAvg" - client_num_in_total: 2 # Single client setup - client_num_per_round: 2 # Single client setup + client_num_in_total: 1 # Single client setup + client_num_per_round: 1 # Single client setup comm_round: 30 # Reduced to 3 rounds for testing # GRPO-specific settings for testing grpo_max_steps: 10 # Only 10 training steps per round for quick testing From 304b809961d0c325239df19ef8c4a4252a69a345 Mon Sep 17 00:00:00 2001 From: Marcos Villagra Date: Thu, 24 Jul 2025 17:49:23 +0000 Subject: [PATCH 026/168] Enhance debugging in reward function of FullModelLLMTrainer by adding print statements to display completions and answers for better troubleshooting during answer validation. --- python/spotlight_prj/fedllm/custom_trainer.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/python/spotlight_prj/fedllm/custom_trainer.py b/python/spotlight_prj/fedllm/custom_trainer.py index f81c5ce4e..883cf2979 100644 --- a/python/spotlight_prj/fedllm/custom_trainer.py +++ b/python/spotlight_prj/fedllm/custom_trainer.py @@ -168,7 +168,8 @@ def reward_fn(self, completions, answer, **_): """Reward function for GSM8K that checks if the predicted answer matches the true answer.""" out = [] for c, ans in zip(completions, answer): - import ipdb; ipdb.set_trace() + print("completions:", c) + print("answer:", ans) if c==ans: out.append(self.exact_match_reward) else: From 97e3a904a627ab4f9178c46d4f282a18afb51d95 Mon Sep 17 00:00:00 2001 From: Marcos Villagra Date: Thu, 24 Jul 2025 18:47:15 +0000 Subject: [PATCH 027/168] Add methods to handle boxed content and convert strings to numbers in FullModelLLMTrainer. Refactor reward function to improve answer validation by incorporating numeric equivalence checks for better accuracy. 
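A minimal, self-contained sketch of the two helpers introduced below, using the same regex and Fraction-based parsing as the patch, to show what inputs they accept:

import re
from fractions import Fraction
from typing import Optional

BOXED_RE = re.compile(r"\\boxed\{([^}]*)\}")        # capture content inside \boxed{...}

def extract_boxed(text: str) -> str:
    """Return the first \\boxed{...} contents, or '' if none."""
    m = BOXED_RE.search(text)
    return m.group(1) if m else ""

def to_number(text: str) -> Optional[float]:
    """Convert a string to float, tolerating commas and simple fractions."""
    text = text.replace(",", "").strip()
    if "/" in text:
        try:
            return float(Fraction(text))
        except (ValueError, ZeroDivisionError):
            pass
    try:
        return float(text)
    except ValueError:
        return None

print(extract_boxed(r"So the answer is \boxed{3/4}."))          # -> 3/4
print(to_number("3/4"), to_number("1,000"), to_number("n/a"))   # -> 0.75 1000.0 None

The reward function below then compares the captured strings first (exact match) and only falls back to numeric equivalence within 1e-4 when the strings differ.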
--- python/spotlight_prj/fedllm/custom_trainer.py | 60 +++++++++++++------ 1 file changed, 43 insertions(+), 17 deletions(-) diff --git a/python/spotlight_prj/fedllm/custom_trainer.py b/python/spotlight_prj/fedllm/custom_trainer.py index 883cf2979..a0d5fddfe 100644 --- a/python/spotlight_prj/fedllm/custom_trainer.py +++ b/python/spotlight_prj/fedllm/custom_trainer.py @@ -33,6 +33,8 @@ from data_formatting import DataFormatting from evaluation import Evaluation +from fractions import Fraction + class RewardFunction: @@ -151,6 +153,8 @@ def __init__(self, *args, **kwargs): # Regex for model completion format (\boxed{}) self.MODEL_ANS = re.compile(r"\\boxed\{([^}]*)\}") + self.BOXED_RE = re.compile(r"\\boxed\{([^}]*)\}") # capture content inside \boxed{…} + # ------------------------------------------------------------------ # Configuration: enable or disable per-round checkpoints # ------------------------------------------------------------------ @@ -164,28 +168,50 @@ def __init__(self, *args, **kwargs): self.incorrect_answer_reward=0.0 self.rwdfn = RewardFunction(self.exact_match_reward, self.numeric_equivalence_reward, self.incorrect_answer_reward) + def to_number(self, text: str) -> Optional[float]: + """Convert string to float if possible, handling simple fractions.""" + text = text.replace(",", "").strip() + # Fractions like 3/4 + if "/" in text: + try: + return float(Fraction(text)) + except (ValueError, ZeroDivisionError): + pass + try: + return float(text) + except ValueError: + return None + + + def extract_boxed(self, text: str) -> str: + """Return first \\boxed{...} contents; '' if none.""" + m = self.BOXED_RE.search(text) + return m.group(1) if m else "" + def reward_fn(self, completions, answer, **_): """Reward function for GSM8K that checks if the predicted answer matches the true answer.""" out = [] for c, ans in zip(completions, answer): - print("completions:", c) - print("answer:", ans) - if c==ans: - out.append(self.exact_match_reward) - else: - # Extract from dataset answer (GSM8K format) - tru = self.DATASET_ANS.search(ans) - # Extract from model completion (boxed format, fallback to GSM8K format) - pred = self.MODEL_ANS.search(c) - if not pred: - pred = self.DATASET_ANS.search(c) - - if pred and tru: - pred_num = pred.group(1) - tru_num = tru.group(1) - out.append(self.numeric_equivalence_reward if pred_num == tru_num else self.incorrect_answer_reward) + # Extract from dataset answer (GSM8K format) + tru = self.DATASET_ANS.search(ans) + # Extract from model completion (boxed format, fallback to GSM8K format) + pred = self.MODEL_ANS.search(c) + if not pred: + pred = self.DATASET_ANS.search(c) + + if pred and tru: + pred_num = pred.group(1) + tru_num = tru.group(1) + if pred_num == tru_num: + out.append(self.exact_match_reward) else: - out.append(self.incorrect_answer_reward) + p_num, g_num = self.to_number(pred), self.to_number(tru) + if (p_num is not None and g_num is not None and abs(p_num - g_num) < 1e-4): + out.append(self.numeric_equivalence_reward) + else: + out.append(self.incorrect_answer_reward) + else: + out.append(self.incorrect_answer_reward) return out def train(self, train_data, device, args): From 09291eef4aca9a8a66141b4bf35706b2affd3340 Mon Sep 17 00:00:00 2001 From: Marcos Villagra Date: Thu, 24 Jul 2025 19:07:55 +0000 Subject: [PATCH 028/168] Fix reward function in FullModelLLMTrainer by updating variable names for clarity in numeric equivalence checks, enhancing accuracy in answer validation. 
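Context for the one-line fix below: to_number expects a string, but the previous revision passed the re.Match objects (pred, tru) instead of their captured groups, so the numeric-equivalence branch would raise an AttributeError (a Match has no .replace) rather than compare the two values. A small illustration:

import re

DATASET_ANS = re.compile(r"####\s*([-+]?\d+\.?\d*)")

tru = DATASET_ANS.search("#### 42")
print(type(tru).__name__)   # Match -- not something str.replace() or float() understands
print(tru.group(1))         # '42' -- the string the numeric comparison actually needs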
--- python/spotlight_prj/fedllm/custom_trainer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/spotlight_prj/fedllm/custom_trainer.py b/python/spotlight_prj/fedllm/custom_trainer.py index a0d5fddfe..fec3d48aa 100644 --- a/python/spotlight_prj/fedllm/custom_trainer.py +++ b/python/spotlight_prj/fedllm/custom_trainer.py @@ -205,7 +205,7 @@ def reward_fn(self, completions, answer, **_): if pred_num == tru_num: out.append(self.exact_match_reward) else: - p_num, g_num = self.to_number(pred), self.to_number(tru) + p_num, g_num = self.to_number(pred_num), self.to_number(tru_num) if (p_num is not None and g_num is not None and abs(p_num - g_num) < 1e-4): out.append(self.numeric_equivalence_reward) else: From 8c1ed2d155aa164b14a895a813183739eee071f5 Mon Sep 17 00:00:00 2001 From: Marcos Villagra Date: Thu, 24 Jul 2025 19:25:37 +0000 Subject: [PATCH 029/168] Update grpo_gsm8k_test_config.yaml to adjust client configuration and training parameters for improved testing. Increase client_num_in_total and client_num_per_round to 2, extend comm_round to 300, and raise grpo_max_steps to 150 for more comprehensive evaluation. --- .../fedllm/fedml_config/grpo_gsm8k_test_config.yaml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/python/spotlight_prj/fedllm/fedml_config/grpo_gsm8k_test_config.yaml b/python/spotlight_prj/fedllm/fedml_config/grpo_gsm8k_test_config.yaml index 5b20bfe91..075ec0912 100644 --- a/python/spotlight_prj/fedllm/fedml_config/grpo_gsm8k_test_config.yaml +++ b/python/spotlight_prj/fedllm/fedml_config/grpo_gsm8k_test_config.yaml @@ -26,11 +26,11 @@ train_args: federated_optimizer: "FedAvg" client_optimizer: "adamw_torch" server_optimizer: "FedAvg" - client_num_in_total: 1 # Single client setup - client_num_per_round: 1 # Single client setup - comm_round: 30 # Reduced to 3 rounds for testing + client_num_in_total: 2 # Single client setup + client_num_per_round: 2 # Single client setup + comm_round: 300 # Reduced to 3 rounds for testing # GRPO-specific settings for testing - grpo_max_steps: 10 # Only 10 training steps per round for quick testing + grpo_max_steps: 150 # Only 10 training steps per round for quick testing grpo_num_epochs: 1 # Ignored when grpo_max_steps > 0 grpo_batch_size: 4 # Smaller batch size for faster testing # FedML training settings (ignored when using GRPO) From 7d9a745b4eac257bfd31a2a1324a1bf934a96b5e Mon Sep 17 00:00:00 2001 From: Marcos Villagra Date: Thu, 24 Jul 2025 22:48:20 +0000 Subject: [PATCH 030/168] Refactor docstring in FullModelLLMAggregator's aggregate method for clarity. Remove unnecessary comment and ensure proper formatting with a newline at the end of the file. --- python/spotlight_prj/fedllm/custom_trainer.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/python/spotlight_prj/fedllm/custom_trainer.py b/python/spotlight_prj/fedllm/custom_trainer.py index fec3d48aa..a751a234e 100644 --- a/python/spotlight_prj/fedllm/custom_trainer.py +++ b/python/spotlight_prj/fedllm/custom_trainer.py @@ -593,8 +593,9 @@ def set_model_params(self, model_parameters) -> None: self.log("finished") + """ def aggregate(self, raw_client_model_list): - """Aggregate client models with Nesterov momentum. + Aggregate client models with Nesterov momentum. Steps ----- @@ -605,7 +606,7 @@ def aggregate(self, raw_client_model_list): 3. Perform an SGD update with momentum on the server side. If ``self._nesterov`` is ``True``, use the Nesterov variant. 4. 
Save the updated parameters via ``set_model_params`` and return them. - """ + self.log("aggregate: start") # Step-1: FedAvg aggregation (reuse FedMLAggOperator) @@ -645,4 +646,5 @@ def aggregate(self, raw_client_model_list): # Step-4: Push new params to the model & return self.set_model_params(updated_params) self.log("aggregate: finished") - return updated_params \ No newline at end of file + return updated_params + """ \ No newline at end of file From 30426705b8b4317cd4e174df5478ec598f3f0066 Mon Sep 17 00:00:00 2001 From: Marcos Villagra Date: Fri, 25 Jul 2025 02:28:22 +0000 Subject: [PATCH 031/168] Implement KL divergence logging in TimedGRPOTrainer and adjust learning rate and generation parameters in FullModelLLMTrainer for improved training dynamics. --- python/spotlight_prj/fedllm/custom_trainer.py | 27 ++++++++++++++++--- 1 file changed, 23 insertions(+), 4 deletions(-) diff --git a/python/spotlight_prj/fedllm/custom_trainer.py b/python/spotlight_prj/fedllm/custom_trainer.py index a751a234e..c2676e645 100644 --- a/python/spotlight_prj/fedllm/custom_trainer.py +++ b/python/spotlight_prj/fedllm/custom_trainer.py @@ -132,12 +132,31 @@ def combined_reward(self, completions, answer, **_): return combined_reward class TimedGRPOTrainer(GRPOTrainer): + def _record_step_stats(self, stats): + # first let the parent push its metrics + super()._record_step_stats(stats) + + # add / overwrite any extra metrics and push once more + stats["kl_divergence"] = stats["kl"].mean().item() + self.accelerator.log(stats, step=self.state.global_step) + def _make_experience(self, *args, **kwargs): t0 = time.perf_counter() result = super()._make_experience(*args, **kwargs) - self.log(f"roll-out batch {self.state.global_step} : " + self.accelerator.log(f"roll-out batch {self.state.global_step} : " f"{time.perf_counter() - t0:.3f}s") + + # `out["kl"]` is a 1-D tensor of per-token KL values + kl_mean = result["kl"].mean().item() + + # push to the FedML / accelerate logger – it will end up in client?.log + self.log({"kl_divergence": kl_mean}) + + self.log( + f"roll-out batch {self.state.global_step} " + f"(elapsed {time.perf_counter() - t0:.3f}s, kl={kl_mean:.4f})" + ) return result @@ -330,7 +349,7 @@ def train(self, train_data, device, args): num_generations=num_generations, # Adjusted based on effective batch size num_train_epochs=grpo_num_epochs if grpo_max_steps <= 0 else 1, # Use 1 epoch if max_steps is set max_steps=grpo_max_steps if grpo_max_steps > 0 else -1, # Override epochs with max_steps - learning_rate=5e-6, + learning_rate=3e-5, bf16=use_bf16, # Match model precision fp16=not use_bf16, # Use fp16 if not bf16 gradient_checkpointing=False, # Keep consistent with config @@ -357,8 +376,8 @@ def train(self, train_data, device, args): # **FIX: Set generation parameters for numerical stability** grpo_trainer.generation_kwargs = { "do_sample": True, - "temperature": 1.0, - "top_p": 0.9, + "temperature": 1.2, + "top_p": 0.97, "top_k": 50, "pad_token_id": fresh_tokenizer.eos_token_id, "eos_token_id": fresh_tokenizer.eos_token_id, From 0b28e4d9207e951403c3331f0ae16731a523d50c Mon Sep 17 00:00:00 2001 From: Marcos Villagra Date: Fri, 25 Jul 2025 02:42:18 +0000 Subject: [PATCH 032/168] Update generation parameters and add scale_rewards option in FullModelLLMTrainer for enhanced training stability and control. 
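For reference on the new scale_rewards flag: it controls whether the group-relative advantages are additionally divided by the per-group reward standard deviation. A minimal sketch of the idea, following trl's documented behaviour for GRPOConfig.scale_rewards (the exact internals may differ between trl versions):

import torch

def group_advantages(rewards: torch.Tensor, scale_rewards: bool) -> torch.Tensor:
    adv = rewards - rewards.mean()             # group-relative baseline
    if scale_rewards:
        adv = adv / (rewards.std() + 1e-4)     # extra normalization skipped when False
    return adv

r = torch.tensor([2.0, 0.0, 1.5, 0.0])         # rewards for one prompt's completions
print(group_advantages(r, scale_rewards=False))
print(group_advantages(r, scale_rewards=True))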
--- python/spotlight_prj/fedllm/custom_trainer.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/python/spotlight_prj/fedllm/custom_trainer.py b/python/spotlight_prj/fedllm/custom_trainer.py index c2676e645..4875daa43 100644 --- a/python/spotlight_prj/fedllm/custom_trainer.py +++ b/python/spotlight_prj/fedllm/custom_trainer.py @@ -359,6 +359,7 @@ def train(self, train_data, device, args): # Add seed for reproducibility in federated setting seed=42 + self.round_idx * 100 + args.rank, # Different seed per round and client report_to="wandb", + scale_rewards=False, ) self.log(f"GRPO Config - bf16: {use_bf16}, fp16: {not use_bf16}, batch_size: {grpo_batch_size}") @@ -376,7 +377,7 @@ def train(self, train_data, device, args): # **FIX: Set generation parameters for numerical stability** grpo_trainer.generation_kwargs = { "do_sample": True, - "temperature": 1.2, + "temperature": 1.0, "top_p": 0.97, "top_k": 50, "pad_token_id": fresh_tokenizer.eos_token_id, From 3c9bff70f07b97ac2a898fc1af53b849b2069535 Mon Sep 17 00:00:00 2001 From: Marcos Villagra Date: Fri, 25 Jul 2025 03:48:05 +0000 Subject: [PATCH 033/168] Adjust temperature parameter in generation settings of FullModelLLMTrainer to enhance numerical stability during training. --- python/spotlight_prj/fedllm/custom_trainer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/spotlight_prj/fedllm/custom_trainer.py b/python/spotlight_prj/fedllm/custom_trainer.py index 4875daa43..4c377fe9d 100644 --- a/python/spotlight_prj/fedllm/custom_trainer.py +++ b/python/spotlight_prj/fedllm/custom_trainer.py @@ -377,7 +377,7 @@ def train(self, train_data, device, args): # **FIX: Set generation parameters for numerical stability** grpo_trainer.generation_kwargs = { "do_sample": True, - "temperature": 1.0, + "temperature": 1.2, "top_p": 0.97, "top_k": 50, "pad_token_id": fresh_tokenizer.eos_token_id, From d0d5c5aefeb064e5f07d49c396b48bfaf6e439be Mon Sep 17 00:00:00 2001 From: Marcos Villagra Date: Fri, 25 Jul 2025 03:52:46 +0000 Subject: [PATCH 034/168] Adjust top_p parameter in generation settings of FullModelLLMTrainer to improve sampling behavior during training. --- python/spotlight_prj/fedllm/custom_trainer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/spotlight_prj/fedllm/custom_trainer.py b/python/spotlight_prj/fedllm/custom_trainer.py index 4c377fe9d..ab4b18e92 100644 --- a/python/spotlight_prj/fedllm/custom_trainer.py +++ b/python/spotlight_prj/fedllm/custom_trainer.py @@ -378,7 +378,7 @@ def train(self, train_data, device, args): grpo_trainer.generation_kwargs = { "do_sample": True, "temperature": 1.2, - "top_p": 0.97, + "top_p": 0.95, "top_k": 50, "pad_token_id": fresh_tokenizer.eos_token_id, "eos_token_id": fresh_tokenizer.eos_token_id, From 0700f726323b6bc0566f2d7f2e0aee376707b53a Mon Sep 17 00:00:00 2001 From: Marcos Villagra Date: Fri, 25 Jul 2025 04:23:58 +0000 Subject: [PATCH 035/168] Refactor generation parameters in FullModelLLMTrainer by adjusting temperature and top_p values for improved sampling behavior, and comment out scale_rewards option for clarity. 
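Background for the sampling tweaks in this and the neighbouring patches: temperature rescales the logits before softmax, so values above 1.0 flatten the next-token distribution and values below 1.0 sharpen it, while top_p then truncates sampling to the smallest set of tokens whose probabilities sum to that threshold. A tiny illustration of the temperature effect:

import torch

logits = torch.tensor([2.0, 1.0, 0.0])
for t in (0.7, 1.0, 1.2):
    probs = torch.softmax(logits / t, dim=-1)
    print(t, [round(p, 3) for p in probs.tolist()])
# lower temperature -> more peaked, higher temperature -> closer to uniform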
--- python/spotlight_prj/fedllm/custom_trainer.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/python/spotlight_prj/fedllm/custom_trainer.py b/python/spotlight_prj/fedllm/custom_trainer.py index ab4b18e92..3a6b6e08e 100644 --- a/python/spotlight_prj/fedllm/custom_trainer.py +++ b/python/spotlight_prj/fedllm/custom_trainer.py @@ -359,7 +359,7 @@ def train(self, train_data, device, args): # Add seed for reproducibility in federated setting seed=42 + self.round_idx * 100 + args.rank, # Different seed per round and client report_to="wandb", - scale_rewards=False, + #scale_rewards=False, ) self.log(f"GRPO Config - bf16: {use_bf16}, fp16: {not use_bf16}, batch_size: {grpo_batch_size}") @@ -377,8 +377,8 @@ def train(self, train_data, device, args): # **FIX: Set generation parameters for numerical stability** grpo_trainer.generation_kwargs = { "do_sample": True, - "temperature": 1.2, - "top_p": 0.95, + "temperature": 1.0, + "top_p": 0.9, "top_k": 50, "pad_token_id": fresh_tokenizer.eos_token_id, "eos_token_id": fresh_tokenizer.eos_token_id, From 01aae9653fcbd034544d20e5b7d73ab203d1efa4 Mon Sep 17 00:00:00 2001 From: Marcos Villagra Date: Fri, 25 Jul 2025 05:09:39 +0000 Subject: [PATCH 036/168] Update learning rate in FullModelLLMTrainer to 5e-6 and uncomment scale_rewards option for improved training configuration. --- python/spotlight_prj/fedllm/custom_trainer.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/spotlight_prj/fedllm/custom_trainer.py b/python/spotlight_prj/fedllm/custom_trainer.py index 3a6b6e08e..a1fd1b0fb 100644 --- a/python/spotlight_prj/fedllm/custom_trainer.py +++ b/python/spotlight_prj/fedllm/custom_trainer.py @@ -349,7 +349,7 @@ def train(self, train_data, device, args): num_generations=num_generations, # Adjusted based on effective batch size num_train_epochs=grpo_num_epochs if grpo_max_steps <= 0 else 1, # Use 1 epoch if max_steps is set max_steps=grpo_max_steps if grpo_max_steps > 0 else -1, # Override epochs with max_steps - learning_rate=3e-5, + learning_rate=5e-6, bf16=use_bf16, # Match model precision fp16=not use_bf16, # Use fp16 if not bf16 gradient_checkpointing=False, # Keep consistent with config @@ -359,7 +359,7 @@ def train(self, train_data, device, args): # Add seed for reproducibility in federated setting seed=42 + self.round_idx * 100 + args.rank, # Different seed per round and client report_to="wandb", - #scale_rewards=False, + scale_rewards=False, ) self.log(f"GRPO Config - bf16: {use_bf16}, fp16: {not use_bf16}, batch_size: {grpo_batch_size}") From 4da64e2fe5bc4904a767929810811a374c94f07e Mon Sep 17 00:00:00 2001 From: Marcos Villagra Date: Fri, 25 Jul 2025 12:55:22 +0000 Subject: [PATCH 037/168] Update generation parameters in FullModelLLMTrainer to include temperature, top_p, top_k, and repetition_penalty for enhanced sampling control and numerical stability. 
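The intent of the change below is to express the sampling knobs on the GRPO config itself rather than on an ad-hoc generation_kwargs attribute, so the trainer's own generation path picks them up. A minimal sketch, assuming cfg is trl's GRPOConfig (which exposes these sampling fields in recent versions) with the same values as the diff:

from trl import GRPOConfig

cfg = GRPOConfig(
    output_dir="grpo",
    temperature=1.0,         # sampling controls now live on the config ...
    top_p=0.9,
    top_k=50,
    repetition_penalty=1.1,  # ... while generation_kwargs keeps only tokenizer/length settings
)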
--- python/spotlight_prj/fedllm/custom_trainer.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/python/spotlight_prj/fedllm/custom_trainer.py b/python/spotlight_prj/fedllm/custom_trainer.py index a1fd1b0fb..2be72e465 100644 --- a/python/spotlight_prj/fedllm/custom_trainer.py +++ b/python/spotlight_prj/fedllm/custom_trainer.py @@ -360,6 +360,10 @@ def train(self, train_data, device, args): seed=42 + self.round_idx * 100 + args.rank, # Different seed per round and client report_to="wandb", scale_rewards=False, + temperature=1.0, + top_p=0.9, + top_k=50, + repetition_penalty=1.1, ) self.log(f"GRPO Config - bf16: {use_bf16}, fp16: {not use_bf16}, batch_size: {grpo_batch_size}") @@ -377,13 +381,9 @@ def train(self, train_data, device, args): # **FIX: Set generation parameters for numerical stability** grpo_trainer.generation_kwargs = { "do_sample": True, - "temperature": 1.0, - "top_p": 0.9, - "top_k": 50, "pad_token_id": fresh_tokenizer.eos_token_id, "eos_token_id": fresh_tokenizer.eos_token_id, "max_new_tokens": 1024, - "repetition_penalty": 1.1, # Prevent repetition "length_penalty": 1.0, # Neutral length penalty } From 682fb559d5a900d636d703d233f4d5a2aada1529 Mon Sep 17 00:00:00 2001 From: Marcos Villagra Date: Fri, 25 Jul 2025 12:59:06 +0000 Subject: [PATCH 038/168] Adjust max_completion_length in FullModelLLMTrainer to 100 and add epsilon and beta parameters for improved generation control and stability. --- python/spotlight_prj/fedllm/custom_trainer.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/python/spotlight_prj/fedllm/custom_trainer.py b/python/spotlight_prj/fedllm/custom_trainer.py index 2be72e465..f880a8821 100644 --- a/python/spotlight_prj/fedllm/custom_trainer.py +++ b/python/spotlight_prj/fedllm/custom_trainer.py @@ -345,7 +345,7 @@ def train(self, train_data, device, args): output_dir=str(self.checkpoint_dir / "grpo"), per_device_train_batch_size=grpo_batch_size, gradient_accumulation_steps=gradient_accumulation_steps, - max_completion_length=1024, + max_completion_length=100, num_generations=num_generations, # Adjusted based on effective batch size num_train_epochs=grpo_num_epochs if grpo_max_steps <= 0 else 1, # Use 1 epoch if max_steps is set max_steps=grpo_max_steps if grpo_max_steps > 0 else -1, # Override epochs with max_steps @@ -364,6 +364,8 @@ def train(self, train_data, device, args): top_p=0.9, top_k=50, repetition_penalty=1.1, + epsilon=0.2, + beta=0.1, ) self.log(f"GRPO Config - bf16: {use_bf16}, fp16: {not use_bf16}, batch_size: {grpo_batch_size}") From 850cdc236fe165946719944cc02638b61dde385a Mon Sep 17 00:00:00 2001 From: Marcos Villagra Date: Fri, 25 Jul 2025 15:28:53 +0000 Subject: [PATCH 039/168] Refactor custom trainer to integrate TrainingMetricsLogger and GRPOMetricsCallback for enhanced logging during GRPO training. Update grpo_gsm8k_test_config.yaml to modify client configuration and reduce training steps for quicker testing. 
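The integration below leans on the Hugging Face callback API: every call to Trainer.log fires TrainerCallback.on_log with the metrics dict and the trainer state, so a small callback can forward each GRPO step's metrics to an external logger. A stripped-down sketch of that pattern (the names here are illustrative, not the classes added in this patch):

from transformers import TrainerCallback

class ForwardLogsCallback(TrainerCallback):
    """Forward every Trainer.log payload to an external sink."""

    def __init__(self, sink):
        self.sink = sink                  # any callable taking (metrics_dict, step)

    def on_log(self, args, state, control, logs=None, **kwargs):
        if logs:                          # fires once per Trainer.log call
            self.sink(dict(logs), state.global_step)

# hypothetical usage: trainer.add_callback(ForwardLogsCallback(lambda m, s: print(s, m)))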
--- python/spotlight_prj/fedllm/custom_trainer.py | 444 ++++++++++++++---- .../fedml_config/grpo_gsm8k_test_config.yaml | 8 +- 2 files changed, 350 insertions(+), 102 deletions(-) diff --git a/python/spotlight_prj/fedllm/custom_trainer.py b/python/spotlight_prj/fedllm/custom_trainer.py index f880a8821..edf9305db 100644 --- a/python/spotlight_prj/fedllm/custom_trainer.py +++ b/python/spotlight_prj/fedllm/custom_trainer.py @@ -30,106 +30,14 @@ import time, logging import threading -from data_formatting import DataFormatting -from evaluation import Evaluation - from fractions import Fraction +# New import for TrainerCallback +from transformers import TrainerCallback -class RewardFunction: - - def __init__(self, exact_match_reward, numeric_equivalence_reward, incorrect_answer_reward): - - self.exact_match_reward = exact_match_reward - self.numeric_equivalence_reward = numeric_equivalence_reward - self.incorrect_answer_reward = incorrect_answer_reward - self.dat_fmt = DataFormatting() - self.eval = Evaluation() - - - pass - - def correctness_reward(self, completions, answer, **kwargs): - - """ - Assings a reward based on the correctness of the model's answer. - - Args: - prompts (list): A list of input prompts. - completons (list): List of model completions, each containing content. - answer (list): List of expected answers. - **kwargs**: Additional keyword arguments. - - Returns: - list: List of numerical rewards for each completion. - - Explanation: - 1. Extracts content from each completion. - 2. Extracts the answer portion from each response using extrac_answer_from_response - 3. Assigns rewards based on matching criteria: - - 2.0 points for an exact match - - 1.5 points for numeric equivalence (when values match but format differs) - - 0.0 points for incorrect answers - 4. Tracks completion lengths for analysis. - """ - - rewards = [] - - for c, a in zip(completions, answer): - - if c==a: # exact match case - rewards.append(self.exact_match_reward) - - else: - #Try numeric equivalence - c_num = self.eval.extract_single_number(str(c)) - a_num = self.eval.extract_single_number(str(a)) - - if c_num is not None and a_num is not None and c_num==a_num: - - rewards.append(self.numeric_equivalence_reward) - - else: - rewards.append(self.incorrect_answer_reward) - - return rewards - - +import wandb +import json - def combined_reward(self, completions, answer, **_): - - """ - Combines correctness and format rewards. - - Args: - prompts (list[str]): List of prompt texts - completions (list[list[dict]]): List of completion dictionaries. - answer (list[str]): List of expected answers - - Returns: - list[float]:Combined rewards for each prompt-completion pair - - Explanation: - 1. Calculates separate reward for correctness and format compliance. - 2. Combines the rewards with the following weights: - - correctness score range: 0.0 to 2.0 - - Format score range 0.0 to 0.8 - - Total possible range: 0.0 to 2.8 - 3. Returns the combined reward for each example. 
- """ - - # Get individual rewards - - correctness_scores = self.correctness_reward(completions=completions,answer=answer) - - combined_reward = [] - - for c_score in correctness_scores: - - combined_reward.append(c_score) - - - return combined_reward class TimedGRPOTrainer(GRPOTrainer): def _record_step_stats(self, stats): @@ -139,6 +47,12 @@ def _record_step_stats(self, stats): # add / overwrite any extra metrics and push once more stats["kl_divergence"] = stats["kl"].mean().item() self.accelerator.log(stats, step=self.state.global_step) + + # NEW: forward stats to Trainer's logging system so that callbacks + # like GRPOMetricsCallback can record them via the TrainingMetricsLogger. + # This ensures that after every GRPO step the metrics are properly + # captured by the custom logger. + self.log(stats) def _make_experience(self, *args, **kwargs): @@ -185,7 +99,16 @@ def __init__(self, *args, **kwargs): self.exact_match_reward = 2.0 self.numeric_equivalence_reward=1.5 self.incorrect_answer_reward=0.0 - self.rwdfn = RewardFunction(self.exact_match_reward, self.numeric_equivalence_reward, self.incorrect_answer_reward) + + # Instantiate the training metrics logger and keep as an attribute so + # it can be accessed by callbacks. + self.logger = TrainingMetricsLogger( + log_dir=os.path.join(self.args.output_dir, "wandb_logs"), + run_name=f"fedml-grpo-training", + enable_wandb=True, + wandb_project="grpo-training", + wandb_entity="grpo-training", + ) def to_number(self, text: str) -> Optional[float]: """Convert string to float if possible, handling simple fractions.""" @@ -389,6 +312,9 @@ def train(self, train_data, device, args): "length_penalty": 1.0, # Neutral length penalty } + # Attach our logging callback so that metrics are recorded every step. + grpo_trainer.add_callback(GRPOMetricsCallback(self.logger)) + self.log(f"Set generation parameters: {grpo_trainer.generation_kwargs}") # Run GRPO training @@ -669,4 +595,326 @@ def aggregate(self, raw_client_model_list): self.set_model_params(updated_params) self.log("aggregate: finished") return updated_params - """ \ No newline at end of file + """ + + + +class TrainingMetricsLogger: + """Comprehensive logging for GRPO training with WandB support""" + + def __init__(self, log_dir: str, run_name: Optional[str] = None, + enable_wandb: bool = False, + wandb_project: Optional[str] = None, wandb_entity: Optional[str] = None, + wandb_config: Optional[dict] = None): + self.log_dir = log_dir + self.run_name = run_name or f"grpo_training_{int(time.time())}" + self.enable_wandb = enable_wandb + + # WandB setup + self.wandb_run = None + if self.enable_wandb: + self.wandb_run = wandb.init( + project=wandb_project or "grpo-training", + entity=wandb_entity, + name=self.run_name, + config=wandb_config or {}, + reinit=True + ) + print(f"WandB logging initialized. 
Project: {wandb_project or 'grpo-training'}") + + # Metrics tracking + self.step_count = 0 + self.training_start_time = time.time() + self.last_log_time = time.time() + + # Accumulated metrics for averaging + self.accumulated_metrics = { + 'losses': [], + 'rewards': [], + 'kl_divergences': [], + 'policy_losses': [], + 'value_losses': [], + 'advantages': [], + 'rollout_lengths': [] + } + + def log_training_step(self, step_id: str, train_result: dict, global_step: int): + """Log metrics for a single training step""" + + # Prepare metrics dict for wandb + wandb_metrics = {} + + # Core training metrics + if 'loss' in train_result: + wandb_metrics['training/loss'] = train_result['loss'] + self.accumulated_metrics['losses'].append(train_result['loss']) + + if 'avg_reward' in train_result: + wandb_metrics['training/avg_reward'] = train_result['avg_reward'] + self.accumulated_metrics['rewards'].append(train_result['avg_reward']) + + # Advanced GRPO metrics + if 'kl_divergence' in train_result: + wandb_metrics['training/kl_divergence'] = train_result['kl_divergence'] + self.accumulated_metrics['kl_divergences'].append(train_result['kl_divergence']) + + if 'policy_loss' in train_result: + wandb_metrics['training/policy_loss'] = train_result['policy_loss'] + self.accumulated_metrics['policy_losses'].append(train_result['policy_loss']) + + if 'value_loss' in train_result: + wandb_metrics['training/value_loss'] = train_result['value_loss'] + self.accumulated_metrics['value_losses'].append(train_result['value_loss']) + + if 'advantage_mean' in train_result: + wandb_metrics['training/advantage_mean'] = train_result['advantage_mean'] + self.accumulated_metrics['advantages'].append(train_result['advantage_mean']) + + # Rollout statistics + if 'rollout_count' in train_result: + wandb_metrics['rollouts/count_per_step'] = train_result['rollout_count'] + + if 'avg_rollout_length' in train_result: + wandb_metrics['rollouts/avg_length'] = train_result['avg_rollout_length'] + self.accumulated_metrics['rollout_lengths'].append(train_result['avg_rollout_length']) + + if 'rollout_time' in train_result: + wandb_metrics['performance/rollout_time'] = train_result['rollout_time'] + + if 'training_time' in train_result: + wandb_metrics['performance/training_step_time'] = train_result['training_time'] + + # Weight update timing metrics + if 'weight_update_time' in train_result: + wandb_metrics['performance/weight_update_time'] = train_result['weight_update_time'] + + if 'backward_time' in train_result: + wandb_metrics['performance/backward_pass_time'] = train_result['backward_time'] + + if 'optimizer_time' in train_result: + wandb_metrics['performance/optimizer_step_time'] = train_result['optimizer_time'] + + if 'wait_time' in train_result: + wandb_metrics['performance/batch_wait_time'] = train_result['wait_time'] + + # Gradient metrics + if 'grad_norm' in train_result: + wandb_metrics['training/grad_norm'] = train_result['grad_norm'] + + # Learning rate + if 'learning_rate' in train_result: + wandb_metrics['training/learning_rate'] = train_result['learning_rate'] + + # Log to wandb + if self.enable_wandb and self.wandb_run and wandb_metrics: + wandb_metrics['global_step'] = global_step + self.wandb_run.log(wandb_metrics, step=global_step) + + self.step_count += 1 + + def log_server_statistics(self, stats: dict, global_step: int): + """Log server and system statistics""" + wandb_metrics = {} + + if 'server_statistics' in stats: + server_stats = stats['server_statistics'] + + # Handle double nesting + if 
'server_statistics' in server_stats: + server_stats = server_stats['server_statistics'] + + # Active workers + if 'active_workers' in server_stats: + wandb_metrics['system/active_workers'] = server_stats['active_workers'] + + # Model subscribers + if 'model_subscribers' in server_stats: + inference_workers = [w for w in server_stats['model_subscribers'] if 'trainer' not in w.lower()] + wandb_metrics['system/inference_workers'] = len(inference_workers) + wandb_metrics['system/total_subscribers'] = len(server_stats['model_subscribers']) + + # Service status + if 'service_status' in server_stats: + service_status = server_stats['service_status'] + + # Buffer statistics + if 'buffer_statistics' in service_status: + buffer_stats = service_status['buffer_statistics'] + + if 'pending_steps' in buffer_stats: + wandb_metrics['system/pending_steps'] = buffer_stats['pending_steps'] + + if 'ready_batches' in buffer_stats: + wandb_metrics['system/ready_batches'] = buffer_stats['ready_batches'] + + if 'total_rollouts_received' in buffer_stats: + wandb_metrics['system/total_rollouts_received'] = buffer_stats['total_rollouts_received'] + + # Model version tracking + if 'current_model_version' in service_status: + wandb_metrics['system/current_model_version'] = service_status['current_model_version'] + + # Pipeline statistics + if 'current_pipeline_depth' in stats: + wandb_metrics['system/pipeline_depth'] = stats['current_pipeline_depth'] + + if 'model_broadcasts' in stats: + wandb_metrics['system/model_broadcasts'] = stats['model_broadcasts'] + + # Log to wandb + if self.enable_wandb and self.wandb_run and wandb_metrics: + self.wandb_run.log(wandb_metrics, step=global_step) + + def log_performance_metrics(self, global_step: int, training_rate: Optional[float] = None): + """Log performance and timing metrics""" + wandb_metrics = {} + + current_time = time.time() + elapsed_time = current_time - self.training_start_time + + # Training rate + if training_rate is not None: + wandb_metrics['performance/training_rate_steps_per_hour'] = training_rate + + # Overall training time + wandb_metrics['performance/elapsed_time_hours'] = elapsed_time / 3600 + + # Steps per second (recent) + time_since_last_log = current_time - self.last_log_time + if time_since_last_log > 0 and hasattr(self, 'last_step_count'): + steps_since_last = global_step - self.last_step_count + steps_per_second = steps_since_last / time_since_last_log + wandb_metrics['performance/steps_per_second'] = steps_per_second + + # Log to wandb + if self.enable_wandb and wandb_metrics: + self.wandb_run.log(wandb_metrics, step=global_step) + + self.last_log_time = current_time + self.last_step_count = global_step + + def log_moving_averages(self, global_step: int, window_size: int = 100): + """Log moving averages of key metrics""" + wandb_metrics = {} + + def get_moving_average(values, window): + if len(values) == 0: + return 0 + window = min(window, len(values)) + return sum(values[-window:]) / window + + # Moving averages + if self.accumulated_metrics['losses']: + avg_loss = get_moving_average(self.accumulated_metrics['losses'], window_size) + wandb_metrics[f'moving_avg/loss_{window_size}'] = avg_loss + + if self.accumulated_metrics['rewards']: + avg_reward = get_moving_average(self.accumulated_metrics['rewards'], window_size) + wandb_metrics[f'moving_avg/reward_{window_size}'] = avg_reward + + if self.accumulated_metrics['kl_divergences']: + avg_kl = get_moving_average(self.accumulated_metrics['kl_divergences'], window_size) + 
wandb_metrics[f'moving_avg/kl_divergence_{window_size}'] = avg_kl + + if self.accumulated_metrics['rollout_lengths']: + avg_length = get_moving_average(self.accumulated_metrics['rollout_lengths'], window_size) + wandb_metrics[f'moving_avg/rollout_length_{window_size}'] = avg_length + + # Log to wandb + if self.enable_wandb and wandb_metrics: + self.wandb_run.log(wandb_metrics, step=global_step) + + def log_hyperparameters(self, hparams: dict): + """Log hyperparameters""" + # Convert all values to scalars for TensorBoard + scalar_hparams = {} + for key, value in hparams.items(): + if isinstance(value, (int, float)): + scalar_hparams[key] = value + elif isinstance(value, (str,list)): + # TensorBoard doesn't handle strings well, so we'll just log them as text + continue + else: + scalar_hparams[key] = float(value) if value is not None else 0.0 + + # Log to wandb (wandb handles different types better) + if self.enable_wandb: + # Update wandb config with hyperparameters + self.wandb_run.config.update(hparams) + + def log_model_statistics(self, model, global_step: int): + """Log model-specific statistics""" + wandb_metrics = {} + + # Model parameter statistics + total_params = sum(p.numel() for p in model.parameters()) + trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad) + + wandb_metrics['model/total_parameters'] = total_params + wandb_metrics['model/trainable_parameters'] = trainable_params + + # Parameter norms + total_norm = 0 + for p in model.parameters(): + if p.grad is not None: + total_norm += p.grad.data.norm(2).item() ** 2 + total_norm = total_norm ** 0.5 + + if total_norm > 0: + wandb_metrics['model/gradient_norm'] = total_norm + + # Weight norms by layer (sample a few to avoid too many metrics) + layer_count = 0 + for name, param in model.named_parameters(): + if param.requires_grad and param.data is not None: + # Only log first few layers to wandb to avoid clutter + if layer_count < 10: + wandb_metrics[f'model_weights/{name}_norm'] = param.data.norm().item() + layer_count += 1 + + # Log to wandb + if self.enable_wandb and wandb_metrics: + self.wandb_run.log(wandb_metrics, step=global_step) + + def log_reward_distribution(self, rewards: list, global_step: int): + """Log reward distribution""" + if rewards: + if self.enable_wandb: + wandb_metrics = { + 'rewards/min': min(rewards), + 'rewards/max': max(rewards), + 'rewards/std': torch.tensor(rewards).std().item(), + 'rewards/mean': sum(rewards) / len(rewards) + } + # Create histogram for wandb + wandb_metrics['rewards/histogram'] = wandb.Histogram(rewards) + self.wandb_run.log(wandb_metrics, step=global_step) + + def save_training_config(self, config: dict): + """Save training configuration to file""" + config_path = os.path.join(self.log_dir, "training_config.json") + with open(config_path, 'w') as f: + json.dump(config, f, indent=2, default=str) + print(f"Training configuration saved to: {config_path}") + + def close(self): + """Close logging connections""" + if self.enable_wandb and self.wandb_run: + self.wandb_run.finish() + print("WandB logging closed") + +# -------------------- New Callback -------------------- +class GRPOMetricsCallback(TrainerCallback): + """HuggingFace Trainer callback that forwards log events to our + TrainingMetricsLogger instance so that each GRPO step is recorded.""" + + def __init__(self, logger: "TrainingMetricsLogger"): + super().__init__() + self.logger = logger + + def on_log(self, args, state, control, logs=None, **kwargs): + # Forward the metrics dictionary to the 
TrainingMetricsLogger. This + # fires after every call to `Trainer.log`, i.e. after each GRPO step. + if logs: + # Use a generic step_id; users can differentiate by global_step. + self.logger.log_training_step("grpo_step", logs, state.global_step) \ No newline at end of file diff --git a/python/spotlight_prj/fedllm/fedml_config/grpo_gsm8k_test_config.yaml b/python/spotlight_prj/fedllm/fedml_config/grpo_gsm8k_test_config.yaml index 075ec0912..278f67550 100644 --- a/python/spotlight_prj/fedllm/fedml_config/grpo_gsm8k_test_config.yaml +++ b/python/spotlight_prj/fedllm/fedml_config/grpo_gsm8k_test_config.yaml @@ -26,11 +26,11 @@ train_args: federated_optimizer: "FedAvg" client_optimizer: "adamw_torch" server_optimizer: "FedAvg" - client_num_in_total: 2 # Single client setup - client_num_per_round: 2 # Single client setup - comm_round: 300 # Reduced to 3 rounds for testing + client_num_in_total: 1 # Single client setup + client_num_per_round: 1 # Single client setup + comm_round: 3 # Reduced to 3 rounds for testing # GRPO-specific settings for testing - grpo_max_steps: 150 # Only 10 training steps per round for quick testing + grpo_max_steps: 10 # Only 10 training steps per round for quick testing grpo_num_epochs: 1 # Ignored when grpo_max_steps > 0 grpo_batch_size: 4 # Smaller batch size for faster testing # FedML training settings (ignored when using GRPO) From 035704b9d85afd9d27d5d67eb21ec7d180d91dc6 Mon Sep 17 00:00:00 2001 From: Marcos Villagra Date: Fri, 25 Jul 2025 15:53:58 +0000 Subject: [PATCH 040/168] Remove wandb_entity parameter from FullModelLLMTrainer configuration to streamline logging setup. --- python/spotlight_prj/fedllm/custom_trainer.py | 1 - 1 file changed, 1 deletion(-) diff --git a/python/spotlight_prj/fedllm/custom_trainer.py b/python/spotlight_prj/fedllm/custom_trainer.py index edf9305db..08254d78c 100644 --- a/python/spotlight_prj/fedllm/custom_trainer.py +++ b/python/spotlight_prj/fedllm/custom_trainer.py @@ -107,7 +107,6 @@ def __init__(self, *args, **kwargs): run_name=f"fedml-grpo-training", enable_wandb=True, wandb_project="grpo-training", - wandb_entity="grpo-training", ) def to_number(self, text: str) -> Optional[float]: From f90c4ecfd9ccecf7ab4cc512712d5bc1f3e4e53b Mon Sep 17 00:00:00 2001 From: Marcos Villagra Date: Fri, 25 Jul 2025 16:30:44 +0000 Subject: [PATCH 041/168] Increase max_completion_length in FullModelLLMTrainer from 100 to 256 for enhanced generation capacity. 
--- python/spotlight_prj/fedllm/custom_trainer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/spotlight_prj/fedllm/custom_trainer.py b/python/spotlight_prj/fedllm/custom_trainer.py index 08254d78c..74e7fdf5d 100644 --- a/python/spotlight_prj/fedllm/custom_trainer.py +++ b/python/spotlight_prj/fedllm/custom_trainer.py @@ -267,7 +267,7 @@ def train(self, train_data, device, args): output_dir=str(self.checkpoint_dir / "grpo"), per_device_train_batch_size=grpo_batch_size, gradient_accumulation_steps=gradient_accumulation_steps, - max_completion_length=100, + max_completion_length=256, num_generations=num_generations, # Adjusted based on effective batch size num_train_epochs=grpo_num_epochs if grpo_max_steps <= 0 else 1, # Use 1 epoch if max_steps is set max_steps=grpo_max_steps if grpo_max_steps > 0 else -1, # Override epochs with max_steps From f9d0833af91d058857a4c0b32dd141bacc1f3237 Mon Sep 17 00:00:00 2001 From: Marcos Villagra Date: Fri, 25 Jul 2025 16:55:10 +0000 Subject: [PATCH 042/168] Comment out report_to parameter in FullModelLLMTrainer to disable Weights & Biases logging for improved configuration flexibility. --- python/spotlight_prj/fedllm/custom_trainer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/spotlight_prj/fedllm/custom_trainer.py b/python/spotlight_prj/fedllm/custom_trainer.py index 74e7fdf5d..c4e36f53a 100644 --- a/python/spotlight_prj/fedllm/custom_trainer.py +++ b/python/spotlight_prj/fedllm/custom_trainer.py @@ -280,7 +280,7 @@ def train(self, train_data, device, args): save_steps=grpo_max_steps if grpo_max_steps > 0 else 500, # Save at the end if using max_steps # Add seed for reproducibility in federated setting seed=42 + self.round_idx * 100 + args.rank, # Different seed per round and client - report_to="wandb", + #report_to="wandb", scale_rewards=False, temperature=1.0, top_p=0.9, From 91eda1d4c4c8e7401316440e848d776e9317af7f Mon Sep 17 00:00:00 2001 From: Marcos Villagra Date: Fri, 25 Jul 2025 18:16:05 +0000 Subject: [PATCH 043/168] Update FullModelLLMTrainer to dynamically set run_name based on client rank and run_id, and modify TrainingMetricsLogger to use a monotonically-increasing global_step for accurate WandB logging. --- python/spotlight_prj/fedllm/custom_trainer.py | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/python/spotlight_prj/fedllm/custom_trainer.py b/python/spotlight_prj/fedllm/custom_trainer.py index c4e36f53a..2b10017fb 100644 --- a/python/spotlight_prj/fedllm/custom_trainer.py +++ b/python/spotlight_prj/fedllm/custom_trainer.py @@ -104,7 +104,7 @@ def __init__(self, *args, **kwargs): # it can be accessed by callbacks. self.logger = TrainingMetricsLogger( log_dir=os.path.join(self.args.output_dir, "wandb_logs"), - run_name=f"fedml-grpo-training", + run_name=f"client{getattr(self.args, 'rank', 'unknown')}_run{getattr(self.args, 'run_id', os.getenv('FEDML_CURRENT_RUN_ID', '0'))}", enable_wandb=True, wandb_project="grpo-training", ) @@ -706,9 +706,16 @@ def log_training_step(self, step_id: str, train_result: dict, global_step: int): # Log to wandb if self.enable_wandb and self.wandb_run and wandb_metrics: - wandb_metrics['global_step'] = global_step - self.wandb_run.log(wandb_metrics, step=global_step) - + # Replace the Trainer-provided ``global_step`` (which resets every + # round) with an internal monotonically-increasing counter so + # that WandB treats each update as a new step instead of + # overwriting previous values. 
+ wandb_step = self.step_count # 0-based running counter + wandb_metrics['global_step'] = wandb_step + self.wandb_run.log(wandb_metrics, step=wandb_step) + + # Advance our own monotonically-increasing counter by exactly one + # because this method is invoked once per call to `Trainer.log`. self.step_count += 1 def log_server_statistics(self, stats: dict, global_step: int): From 1a478c4d5696987be4937116d1a5291cb38348d7 Mon Sep 17 00:00:00 2001 From: Marcos Villagra Date: Fri, 25 Jul 2025 18:17:40 +0000 Subject: [PATCH 044/168] Update FullModelLLMTrainer to modify run_name format and change wandb_project name for better clarity in logging. --- python/spotlight_prj/fedllm/custom_trainer.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/spotlight_prj/fedllm/custom_trainer.py b/python/spotlight_prj/fedllm/custom_trainer.py index 2b10017fb..ecd6ceb8a 100644 --- a/python/spotlight_prj/fedllm/custom_trainer.py +++ b/python/spotlight_prj/fedllm/custom_trainer.py @@ -104,9 +104,9 @@ def __init__(self, *args, **kwargs): # it can be accessed by callbacks. self.logger = TrainingMetricsLogger( log_dir=os.path.join(self.args.output_dir, "wandb_logs"), - run_name=f"client{getattr(self.args, 'rank', 'unknown')}_run{getattr(self.args, 'run_id', os.getenv('FEDML_CURRENT_RUN_ID', '0'))}", + run_name=f"fl-client{getattr(self.args, 'rank', 'unknown')}_run{getattr(self.args, 'run_id', os.getenv('FEDML_CURRENT_RUN_ID', '0'))}", enable_wandb=True, - wandb_project="grpo-training", + wandb_project="fedllm-grpo-training", ) def to_number(self, text: str) -> Optional[float]: From 5e53db231dcfe1d620671aa4c42deab9a83aa903 Mon Sep 17 00:00:00 2001 From: Marcos Villagra Date: Fri, 25 Jul 2025 18:40:46 +0000 Subject: [PATCH 045/168] Update FullModelLLMTrainer and grpo_gsm8k_test_config.yaml to enhance training parameters: increase max_completion_length to 512, adjust temperature to 0.7 and top_p to 0.95 for better generation quality, and modify grpo_max_steps to 150 for extended training during testing. 
--- python/spotlight_prj/fedllm/custom_trainer.py | 12 +++++++----- .../fedllm/fedml_config/grpo_gsm8k_test_config.yaml | 2 +- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/python/spotlight_prj/fedllm/custom_trainer.py b/python/spotlight_prj/fedllm/custom_trainer.py index ecd6ceb8a..f4b47d7cc 100644 --- a/python/spotlight_prj/fedllm/custom_trainer.py +++ b/python/spotlight_prj/fedllm/custom_trainer.py @@ -267,7 +267,7 @@ def train(self, train_data, device, args): output_dir=str(self.checkpoint_dir / "grpo"), per_device_train_batch_size=grpo_batch_size, gradient_accumulation_steps=gradient_accumulation_steps, - max_completion_length=256, + max_completion_length=512, num_generations=num_generations, # Adjusted based on effective batch size num_train_epochs=grpo_num_epochs if grpo_max_steps <= 0 else 1, # Use 1 epoch if max_steps is set max_steps=grpo_max_steps if grpo_max_steps > 0 else -1, # Override epochs with max_steps @@ -282,8 +282,8 @@ def train(self, train_data, device, args): seed=42 + self.round_idx * 100 + args.rank, # Different seed per round and client #report_to="wandb", scale_rewards=False, - temperature=1.0, - top_p=0.9, + temperature=0.7, + top_p=0.95, top_k=50, repetition_penalty=1.1, epsilon=0.2, @@ -307,7 +307,7 @@ def train(self, train_data, device, args): "do_sample": True, "pad_token_id": fresh_tokenizer.eos_token_id, "eos_token_id": fresh_tokenizer.eos_token_id, - "max_new_tokens": 1024, + "max_new_tokens": 512, "length_penalty": 1.0, # Neutral length penalty } @@ -923,4 +923,6 @@ def on_log(self, args, state, control, logs=None, **kwargs): # fires after every call to `Trainer.log`, i.e. after each GRPO step. if logs: # Use a generic step_id; users can differentiate by global_step. - self.logger.log_training_step("grpo_step", logs, state.global_step) \ No newline at end of file + self.logger.log_training_step("grpo_step", logs, state.global_step) + + self.logger.log_moving_averages(state.global_step, window_size=100) \ No newline at end of file diff --git a/python/spotlight_prj/fedllm/fedml_config/grpo_gsm8k_test_config.yaml b/python/spotlight_prj/fedllm/fedml_config/grpo_gsm8k_test_config.yaml index 278f67550..2993ce63e 100644 --- a/python/spotlight_prj/fedllm/fedml_config/grpo_gsm8k_test_config.yaml +++ b/python/spotlight_prj/fedllm/fedml_config/grpo_gsm8k_test_config.yaml @@ -30,7 +30,7 @@ train_args: client_num_per_round: 1 # Single client setup comm_round: 3 # Reduced to 3 rounds for testing # GRPO-specific settings for testing - grpo_max_steps: 10 # Only 10 training steps per round for quick testing + grpo_max_steps: 150 # Only 10 training steps per round for quick testing grpo_num_epochs: 1 # Ignored when grpo_max_steps > 0 grpo_batch_size: 4 # Smaller batch size for faster testing # FedML training settings (ignored when using GRPO) From 6791c4a1a62a4f6f4d5febe76517faa9ed6245f6 Mon Sep 17 00:00:00 2001 From: Marcos Villagra Date: Fri, 25 Jul 2025 18:55:28 +0000 Subject: [PATCH 046/168] Update grpo_gsm8k_test_config.yaml to increase comm_round from 3 to 10 and reduce grpo_max_steps from 150 to 50 for optimized testing. Adjust timeout duration in run_fedml_client_custom.sh and run_fedml_server_custom.sh scripts from 28800s to 21600s for improved execution time. 
--- .../fedllm/fedml_config/grpo_gsm8k_test_config.yaml | 4 ++-- .../spotlight_prj/fedllm/scripts/run_fedml_client_custom.sh | 2 +- .../spotlight_prj/fedllm/scripts/run_fedml_server_custom.sh | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/python/spotlight_prj/fedllm/fedml_config/grpo_gsm8k_test_config.yaml b/python/spotlight_prj/fedllm/fedml_config/grpo_gsm8k_test_config.yaml index 2993ce63e..6401f0a4f 100644 --- a/python/spotlight_prj/fedllm/fedml_config/grpo_gsm8k_test_config.yaml +++ b/python/spotlight_prj/fedllm/fedml_config/grpo_gsm8k_test_config.yaml @@ -28,9 +28,9 @@ train_args: server_optimizer: "FedAvg" client_num_in_total: 1 # Single client setup client_num_per_round: 1 # Single client setup - comm_round: 3 # Reduced to 3 rounds for testing + comm_round: 10 # Reduced to 3 rounds for testing # GRPO-specific settings for testing - grpo_max_steps: 150 # Only 10 training steps per round for quick testing + grpo_max_steps: 50 # Only 10 training steps per round for quick testing grpo_num_epochs: 1 # Ignored when grpo_max_steps > 0 grpo_batch_size: 4 # Smaller batch size for faster testing # FedML training settings (ignored when using GRPO) diff --git a/python/spotlight_prj/fedllm/scripts/run_fedml_client_custom.sh b/python/spotlight_prj/fedllm/scripts/run_fedml_client_custom.sh index e878a2053..29288d35d 100755 --- a/python/spotlight_prj/fedllm/scripts/run_fedml_client_custom.sh +++ b/python/spotlight_prj/fedllm/scripts/run_fedml_client_custom.sh @@ -25,7 +25,7 @@ LAUNCHER="${6:-"auto"}" CONFIG_PATH="${7:-"fedml_config/grpo_gsm8k_test_config.yaml"}" # Use the custom launcher that properly handles non-PEFT models -timeout --signal=SIGINT --kill-after=30s 28800 python3 launch_fedllm_custom.py \ +timeout --signal=SIGINT --kill-after=30s 21600 python3 launch_fedllm_custom.py \ --cf "${CONFIG_PATH}" \ --rank "${RANK}" \ --role client \ diff --git a/python/spotlight_prj/fedllm/scripts/run_fedml_server_custom.sh b/python/spotlight_prj/fedllm/scripts/run_fedml_server_custom.sh index d8f708991..e50d58f85 100755 --- a/python/spotlight_prj/fedllm/scripts/run_fedml_server_custom.sh +++ b/python/spotlight_prj/fedllm/scripts/run_fedml_server_custom.sh @@ -26,7 +26,7 @@ LAUNCHER="${6:-"auto"}" CONFIG_PATH="${7:-"fedml_config/fedml_config.yaml"}" # Use the custom launcher that properly handles non-PEFT models -timeout --signal=SIGINT --kill-after=30s 28800 python3 launch_fedllm_custom.py \ +timeout --signal=SIGINT --kill-after=30s 21600 python3 launch_fedllm_custom.py \ --cf "${CONFIG_PATH}" \ --rank "${RANK}" \ --role server \ From 2bd0c7398c10ae2cdf8ec0149d1400ac09fc4dde Mon Sep 17 00:00:00 2001 From: Marcos Villagra Date: Fri, 25 Jul 2025 20:19:08 +0000 Subject: [PATCH 047/168] Enhance FullModelLLMAggregator with WandB logging for server statistics and update grpo_gsm8k_test_config.yaml to set comm_round to 3 for optimized testing. This includes initializing a TrainingMetricsLogger for aggregator-level metrics and logging model broadcasts. 
--- python/spotlight_prj/fedllm/custom_trainer.py | 59 ++++++++++++++++++- .../fedml_config/grpo_gsm8k_test_config.yaml | 2 +- 2 files changed, 59 insertions(+), 2 deletions(-) diff --git a/python/spotlight_prj/fedllm/custom_trainer.py b/python/spotlight_prj/fedllm/custom_trainer.py index f4b47d7cc..2462b19cf 100644 --- a/python/spotlight_prj/fedllm/custom_trainer.py +++ b/python/spotlight_prj/fedllm/custom_trainer.py @@ -437,6 +437,18 @@ def __init__(self, *args, **kwargs): """ super().__init__(*args, **kwargs) + # ------------------------------------------------------------- + # WandB logger for aggregator-level (server) statistics – initialize + # EARLY so that it exists even when periodic checkpointing is disabled. + # ------------------------------------------------------------- + self.logger = TrainingMetricsLogger( + log_dir=os.path.join(self.args.output_dir, "wandb_logs"), + run_name=f"fl-server_run{getattr(self.args, 'run_id', os.getenv('FEDML_CURRENT_RUN_ID', '0'))}", + enable_wandb=True, + wandb_project="fedllm-grpo-training", + ) + self.model_broadcasts = 0 + # Determine interval (seconds) interval_min = getattr(self.args, "server_checkpoint_interval_minutes", 30) if interval_min <= 0: @@ -471,6 +483,21 @@ def __init__(self, *args, **kwargs): self._velocity: OrderedDict = OrderedDict() # ------------------------------------------------------------------- + # ----- WandB server-side logger (NEW) ----- + # Create a standalone TrainingMetricsLogger so that aggregator-level + # system statistics (e.g. active workers, model broadcasts) are also + # recorded in the same WandB project as the clients. + if not hasattr(self, "logger"): + self.logger = TrainingMetricsLogger( + log_dir=os.path.join(self.args.output_dir, "wandb_logs"), + run_name=f"fl-server_run{getattr(self.args, 'run_id', os.getenv('FEDML_CURRENT_RUN_ID', '0'))}", + enable_wandb=True, + wandb_project="fedllm-grpo-training", + ) + # Counter for how many times the global model has been broadcast to + # clients – useful for monitoring server throughput. + self.model_broadcasts = 0 + # ------------------------------------------------------------------ # Internal helpers # ------------------------------------------------------------------ @@ -486,6 +513,7 @@ def _periodic_checkpoint_loop(self): self.log(f"Periodic checkpoint → {ckpt_dir}") # Always save checkpoints in the standard HuggingFace format so that # the resulting directory can be loaded with `from_pretrained`. + # For `PeftModel` this will also persist the adapter weights. # Only the main process writes the checkpoint to avoid race conditions # (the background thread is spawned exclusively on the main process). 
if self.training_args.should_save: @@ -537,6 +565,25 @@ def set_model_params(self, model_parameters) -> None: elapsed = time.perf_counter() - t0 self.log(f"set_model_params (server) took {elapsed:.3f}s") + # ------------------------------------------------------------- + # NEW: push aggregator-level system statistics to WandB + # ------------------------------------------------------------- + self.model_broadcasts += 1 + self.logger.log_server_statistics( + stats={ + "server_statistics": { + "active_workers": getattr(self.args, "client_num_in_total", 0), + "model_subscribers": [], + "service_status": { + "current_model_version": self.round_idx, + "buffer_statistics": {}, + }, + }, + "current_pipeline_depth": 0, # placeholder – update if pipeline depth is tracked elsewhere + "model_broadcasts": self.model_broadcasts, + }, + global_step=self.round_idx, + ) self.log("finished") @@ -828,7 +875,10 @@ def get_moving_average(values, window): # Log to wandb if self.enable_wandb and wandb_metrics: - self.wandb_run.log(wandb_metrics, step=global_step) + # Use our internal monotonically-increasing counter so that these + # points are not overwritten when `global_step` resets each round. + wandb_step = max(0, self.step_count - 1) + self.wandb_run.log(wandb_metrics, step=wandb_step) def log_hyperparameters(self, hparams: dict): """Log hyperparameters""" @@ -918,6 +968,7 @@ def __init__(self, logger: "TrainingMetricsLogger"): super().__init__() self.logger = logger + """" def on_log(self, args, state, control, logs=None, **kwargs): # Forward the metrics dictionary to the TrainingMetricsLogger. This # fires after every call to `Trainer.log`, i.e. after each GRPO step. @@ -925,4 +976,10 @@ def on_log(self, args, state, control, logs=None, **kwargs): # Use a generic step_id; users can differentiate by global_step. self.logger.log_training_step("grpo_step", logs, state.global_step) + self.logger.log_moving_averages(state.global_step, window_size=100) + """ + def on_step_end(self, args, state, control, logs=None,**kwargs): + # Always emit a point – even if HF wouldn't have logged this step + if logs and self.logger.step_count % 10 == 0: + self.logger.log_training_step("on_step_end", logs, state.global_step) self.logger.log_moving_averages(state.global_step, window_size=100) \ No newline at end of file diff --git a/python/spotlight_prj/fedllm/fedml_config/grpo_gsm8k_test_config.yaml b/python/spotlight_prj/fedllm/fedml_config/grpo_gsm8k_test_config.yaml index 6401f0a4f..0901093f0 100644 --- a/python/spotlight_prj/fedllm/fedml_config/grpo_gsm8k_test_config.yaml +++ b/python/spotlight_prj/fedllm/fedml_config/grpo_gsm8k_test_config.yaml @@ -28,7 +28,7 @@ train_args: server_optimizer: "FedAvg" client_num_in_total: 1 # Single client setup client_num_per_round: 1 # Single client setup - comm_round: 10 # Reduced to 3 rounds for testing + comm_round: 3 # Reduced to 3 rounds for testing # GRPO-specific settings for testing grpo_max_steps: 50 # Only 10 training steps per round for quick testing grpo_num_epochs: 1 # Ignored when grpo_max_steps > 0 From 086c4bbdb24f920d7bf733e14e9ab04f9b3a9d74 Mon Sep 17 00:00:00 2001 From: Marcos Villagra Date: Fri, 25 Jul 2025 22:33:41 +0000 Subject: [PATCH 048/168] Update FullModelLLMTrainer to set logging_steps to 1 for more frequent logging and clean up commented code in GRPOMetricsCallback for improved readability. 
--- python/spotlight_prj/fedllm/custom_trainer.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/python/spotlight_prj/fedllm/custom_trainer.py b/python/spotlight_prj/fedllm/custom_trainer.py index 2462b19cf..1209ce01e 100644 --- a/python/spotlight_prj/fedllm/custom_trainer.py +++ b/python/spotlight_prj/fedllm/custom_trainer.py @@ -275,7 +275,8 @@ def train(self, train_data, device, args): bf16=use_bf16, # Match model precision fp16=not use_bf16, # Use fp16 if not bf16 gradient_checkpointing=False, # Keep consistent with config - logging_steps=5 if grpo_max_steps > 0 and grpo_max_steps < 50 else 25, # More frequent logging for short runs + #logging_steps=5 if grpo_max_steps > 0 and grpo_max_steps < 50 else 25, # More frequent logging for short runs + logging_steps=1, log_completions=True, save_steps=grpo_max_steps if grpo_max_steps > 0 else 500, # Save at the end if using max_steps # Add seed for reproducibility in federated setting @@ -968,7 +969,7 @@ def __init__(self, logger: "TrainingMetricsLogger"): super().__init__() self.logger = logger - """" + def on_log(self, args, state, control, logs=None, **kwargs): # Forward the metrics dictionary to the TrainingMetricsLogger. This # fires after every call to `Trainer.log`, i.e. after each GRPO step. @@ -982,4 +983,5 @@ def on_step_end(self, args, state, control, logs=None,**kwargs): # Always emit a point – even if HF wouldn't have logged this step if logs and self.logger.step_count % 10 == 0: self.logger.log_training_step("on_step_end", logs, state.global_step) - self.logger.log_moving_averages(state.global_step, window_size=100) \ No newline at end of file + self.logger.log_moving_averages(state.global_step, window_size=100) + """ From 4ac8dd020438cab1424463fa42198d0774046a4c Mon Sep 17 00:00:00 2001 From: Marcos Villagra Date: Fri, 25 Jul 2025 23:31:30 +0000 Subject: [PATCH 049/168] Refactor TrainingMetricsLogger to use instance method for moving average calculation and add run_id to WandB logging configuration for improved tracking. 
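A quick sketch of what this refactor is aiming for. Note that the hunk below declares the helper as `def get_moving_average(values, window)` without an explicit `self`, so invoking it as `self.get_moving_average(...)` would shift the arguments; a true instance-method form would look roughly like this (same logic as the original nested function):

    def get_moving_average(self, values, window):
        # Average of the most recent `window` entries; an empty list yields 0.
        if not values:
            return 0
        window = min(window, len(values))
        return sum(values[-window:]) / window
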
--- python/spotlight_prj/fedllm/custom_trainer.py | 31 +++++++------------ 1 file changed, 12 insertions(+), 19 deletions(-) diff --git a/python/spotlight_prj/fedllm/custom_trainer.py b/python/spotlight_prj/fedllm/custom_trainer.py index 1209ce01e..895f2b8be 100644 --- a/python/spotlight_prj/fedllm/custom_trainer.py +++ b/python/spotlight_prj/fedllm/custom_trainer.py @@ -664,6 +664,7 @@ def __init__(self, log_dir: str, run_name: Optional[str] = None, project=wandb_project or "grpo-training", entity=wandb_entity, name=self.run_name, + id=getattr(self.args, 'run_id'), config=wandb_config or {}, reinit=True ) @@ -847,31 +848,32 @@ def log_performance_metrics(self, global_step: int, training_rate: Optional[floa self.last_log_time = current_time self.last_step_count = global_step + + def get_moving_average(values, window): + if len(values) == 0: + return 0 + window = min(window, len(values)) + return sum(values[-window:]) / window + def log_moving_averages(self, global_step: int, window_size: int = 100): """Log moving averages of key metrics""" wandb_metrics = {} - def get_moving_average(values, window): - if len(values) == 0: - return 0 - window = min(window, len(values)) - return sum(values[-window:]) / window - # Moving averages if self.accumulated_metrics['losses']: - avg_loss = get_moving_average(self.accumulated_metrics['losses'], window_size) + avg_loss = self.get_moving_average(self.accumulated_metrics['losses'], window_size) wandb_metrics[f'moving_avg/loss_{window_size}'] = avg_loss if self.accumulated_metrics['rewards']: - avg_reward = get_moving_average(self.accumulated_metrics['rewards'], window_size) + avg_reward = self.get_moving_average(self.accumulated_metrics['rewards'], window_size) wandb_metrics[f'moving_avg/reward_{window_size}'] = avg_reward if self.accumulated_metrics['kl_divergences']: - avg_kl = get_moving_average(self.accumulated_metrics['kl_divergences'], window_size) + avg_kl = self.get_moving_average(self.accumulated_metrics['kl_divergences'], window_size) wandb_metrics[f'moving_avg/kl_divergence_{window_size}'] = avg_kl if self.accumulated_metrics['rollout_lengths']: - avg_length = get_moving_average(self.accumulated_metrics['rollout_lengths'], window_size) + avg_length = self.get_moving_average(self.accumulated_metrics['rollout_lengths'], window_size) wandb_metrics[f'moving_avg/rollout_length_{window_size}'] = avg_length # Log to wandb @@ -976,12 +978,3 @@ def on_log(self, args, state, control, logs=None, **kwargs): if logs: # Use a generic step_id; users can differentiate by global_step. self.logger.log_training_step("grpo_step", logs, state.global_step) - - self.logger.log_moving_averages(state.global_step, window_size=100) - """ - def on_step_end(self, args, state, control, logs=None,**kwargs): - # Always emit a point – even if HF wouldn't have logged this step - if logs and self.logger.step_count % 10 == 0: - self.logger.log_training_step("on_step_end", logs, state.global_step) - self.logger.log_moving_averages(state.global_step, window_size=100) - """ From f2ccd9c001e2ee624cf6fbe6120b2099ea56dc6d Mon Sep 17 00:00:00 2001 From: Marcos Villagra Date: Fri, 25 Jul 2025 23:47:17 +0000 Subject: [PATCH 050/168] Enhance TrainingMetricsLogger and FullModelLLMAggregator with improved WandB logging configuration by adding args parameter for unique run IDs and grouping. Update logging mechanism to ensure accurate step tracking and prevent metric overwrites in the WandB UI. 
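The essence of the change, sketched with illustrative values (`run_id` and `rank` here are placeholders standing in for the FedML arguments; the full, defensive construction is in the hunk below):

    import wandb

    run_id, rank = "20250725", 1  # illustrative placeholders
    run = wandb.init(
        project="fedllm-grpo-training",
        group=str(run_id),                # one group collates server + client runs
        id=f"{run_id}-client{rank}",      # unique id per process, so runs never overwrite each other
        name=f"fl-client{rank}_run{run_id}",
        reinit=True,
    )
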
--- python/spotlight_prj/fedllm/custom_trainer.py | 83 +++++++++++++++---- 1 file changed, 69 insertions(+), 14 deletions(-) diff --git a/python/spotlight_prj/fedllm/custom_trainer.py b/python/spotlight_prj/fedllm/custom_trainer.py index 895f2b8be..4782cbff9 100644 --- a/python/spotlight_prj/fedllm/custom_trainer.py +++ b/python/spotlight_prj/fedllm/custom_trainer.py @@ -107,6 +107,7 @@ def __init__(self, *args, **kwargs): run_name=f"fl-client{getattr(self.args, 'rank', 'unknown')}_run{getattr(self.args, 'run_id', os.getenv('FEDML_CURRENT_RUN_ID', '0'))}", enable_wandb=True, wandb_project="fedllm-grpo-training", + args=self.args, ) def to_number(self, text: str) -> Optional[float]: @@ -447,6 +448,7 @@ def __init__(self, *args, **kwargs): run_name=f"fl-server_run{getattr(self.args, 'run_id', os.getenv('FEDML_CURRENT_RUN_ID', '0'))}", enable_wandb=True, wandb_project="fedllm-grpo-training", + args=self.args, ) self.model_broadcasts = 0 @@ -494,6 +496,7 @@ def __init__(self, *args, **kwargs): run_name=f"fl-server_run{getattr(self.args, 'run_id', os.getenv('FEDML_CURRENT_RUN_ID', '0'))}", enable_wandb=True, wandb_project="fedllm-grpo-training", + args=self.args, ) # Counter for how many times the global model has been broadcast to # clients – useful for monitoring server throughput. @@ -649,26 +652,78 @@ def aggregate(self, raw_client_model_list): class TrainingMetricsLogger: """Comprehensive logging for GRPO training with WandB support""" - def __init__(self, log_dir: str, run_name: Optional[str] = None, - enable_wandb: bool = False, - wandb_project: Optional[str] = None, wandb_entity: Optional[str] = None, - wandb_config: Optional[dict] = None): + def __init__( + self, + log_dir: str, + run_name: Optional[str] = None, + enable_wandb: bool = False, + wandb_project: Optional[str] = None, + wandb_entity: Optional[str] = None, + wandb_config: Optional[dict] = None, + args: Optional[Any] = None, + ): + """Parameters + ---------- + log_dir : str + Directory where auxiliary JSON / txt logs will be written. + run_name : str, optional + Human-readable name that will appear in the WandB UI. + enable_wandb : bool, default False + If ``True`` a WandB run is initialised, otherwise the logger will + operate in offline mode and simply discard `.log*()` calls. + wandb_project, wandb_entity, wandb_config : Optional[str | dict] + Passed through to :pyfunc:`wandb.init` unchanged. + args : Any, optional + (FedML) *args* namespace used throughout the project. We only + use it to derive a *unique* WandB run *id* so that the server and + every client write to **separate** runs instead of clobbering one + another. + """ + self.log_dir = log_dir self.run_name = run_name or f"grpo_training_{int(time.time())}" self.enable_wandb = enable_wandb + self.args = args # may be ``None`` for unit tests / offline runs - # WandB setup + # ------------------------------------------------------------------ + # WandB setup – ensure that each process (server / client-rank-N) gets + # its *own* run. Re-using the same run *id* from multiple processes + # causes metrics to silently overwrite each other and leads to exactly + # the "not everything we log shows up" behaviour that we observed on + # the dashboard. 
+ # ------------------------------------------------------------------ self.wandb_run = None if self.enable_wandb: - self.wandb_run = wandb.init( - project=wandb_project or "grpo-training", - entity=wandb_entity, - name=self.run_name, - id=getattr(self.args, 'run_id'), - config=wandb_config or {}, - reinit=True + wandb_kwargs = { + "project": wandb_project or "grpo-training", + "entity": wandb_entity, + "name": self.run_name, + "config": wandb_config or {}, + "reinit": True, + } + + # Use a *group* so that the server run and all client runs are + # nicely collated in the WandB UI, while still receiving unique + # run IDs. + if args is not None and hasattr(args, "run_id"): + wandb_kwargs["group"] = str(args.run_id) + + # Derive a UNIQUE id: "-server" or "-client" + role_suffix = ( + "-server" + if getattr(args, "role", "server") == "server" + else f"-client{getattr(args, 'rank', '0')}" + ) + wandb_kwargs["id"] = f"{args.run_id}{role_suffix}" + + # Remove None entries so wandb.init does not complain. + wandb_kwargs = {k: v for k, v in wandb_kwargs.items() if v is not None} + + self.wandb_run = wandb.init(**wandb_kwargs) + print( + f"[WandB] Logging initialised → " + f"project={wandb_kwargs.get('project')}, run_name={self.run_name}" ) - print(f"WandB logging initialized. Project: {wandb_project or 'grpo-training'}") # Metrics tracking self.step_count = 0 @@ -818,7 +873,7 @@ def log_server_statistics(self, stats: dict, global_step: int): # Log to wandb if self.enable_wandb and self.wandb_run and wandb_metrics: - self.wandb_run.log(wandb_metrics, step=global_step) + self.wandb_run.log(wandb_metrics, step=self.step_count) def log_performance_metrics(self, global_step: int, training_rate: Optional[float] = None): """Log performance and timing metrics""" From 8692e473d4b1f592901b1075a22544916cc14a5b Mon Sep 17 00:00:00 2001 From: Marcos Villagra Date: Sat, 26 Jul 2025 00:30:26 +0000 Subject: [PATCH 051/168] Update FullModelLLMTrainer to disable log_completions for cleaner logging and modify grpo_gsm8k_test_config.yaml to adjust client and communication settings for improved testing efficiency. 
--- python/spotlight_prj/fedllm/custom_trainer.py | 2 +- .../fedllm/fedml_config/grpo_gsm8k_test_config.yaml | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/python/spotlight_prj/fedllm/custom_trainer.py b/python/spotlight_prj/fedllm/custom_trainer.py index 4782cbff9..adc83c8df 100644 --- a/python/spotlight_prj/fedllm/custom_trainer.py +++ b/python/spotlight_prj/fedllm/custom_trainer.py @@ -278,7 +278,7 @@ def train(self, train_data, device, args): gradient_checkpointing=False, # Keep consistent with config #logging_steps=5 if grpo_max_steps > 0 and grpo_max_steps < 50 else 25, # More frequent logging for short runs logging_steps=1, - log_completions=True, + log_completions=False, save_steps=grpo_max_steps if grpo_max_steps > 0 else 500, # Save at the end if using max_steps # Add seed for reproducibility in federated setting seed=42 + self.round_idx * 100 + args.rank, # Different seed per round and client diff --git a/python/spotlight_prj/fedllm/fedml_config/grpo_gsm8k_test_config.yaml b/python/spotlight_prj/fedllm/fedml_config/grpo_gsm8k_test_config.yaml index 0901093f0..84eb0db2e 100644 --- a/python/spotlight_prj/fedllm/fedml_config/grpo_gsm8k_test_config.yaml +++ b/python/spotlight_prj/fedllm/fedml_config/grpo_gsm8k_test_config.yaml @@ -26,9 +26,9 @@ train_args: federated_optimizer: "FedAvg" client_optimizer: "adamw_torch" server_optimizer: "FedAvg" - client_num_in_total: 1 # Single client setup - client_num_per_round: 1 # Single client setup - comm_round: 3 # Reduced to 3 rounds for testing + client_num_in_total: 2 # Single client setup + client_num_per_round: 2 # Single client setup + comm_round: 30 # Reduced to 3 rounds for testing # GRPO-specific settings for testing grpo_max_steps: 50 # Only 10 training steps per round for quick testing grpo_num_epochs: 1 # Ignored when grpo_max_steps > 0 From 8c57b887030324914c882d552947c76cd8fb0657 Mon Sep 17 00:00:00 2001 From: Marcos Villagra Date: Sat, 26 Jul 2025 01:44:03 +0000 Subject: [PATCH 052/168] Add checkpoint cleanup functionality to FullModelLLMTrainer and FullModelLLMAggregator Implement a method to delete old round checkpoints while retaining the most recent ones. This enhancement helps manage storage and improves training efficiency by cleaning up unnecessary files after each training round. Ensure that wall-clock checkpoints are preserved during the cleanup process. --- python/spotlight_prj/fedllm/custom_trainer.py | 31 +++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/python/spotlight_prj/fedllm/custom_trainer.py b/python/spotlight_prj/fedllm/custom_trainer.py index adc83c8df..4a2e6eb9e 100644 --- a/python/spotlight_prj/fedllm/custom_trainer.py +++ b/python/spotlight_prj/fedllm/custom_trainer.py @@ -29,6 +29,7 @@ from src.modeling_utils import load_state_dict import time, logging import threading +import shutil # for deleting old checkpoints from fractions import Fraction @@ -343,6 +344,9 @@ def train(self, train_data, device, args): synchronize=True ) + # After saving the current round checkpoint, clean up older round_* checkpoints + if self.training_args.should_save: + self._cleanup_old_round_checkpoints() # Clean up fresh model to free memory del fresh_model @@ -417,6 +421,29 @@ def await_sync_process_group(self, from_process: int = 0) -> list: self.log("finished") return outputs + def _cleanup_old_round_checkpoints(self, keep_last: int = 1): + """Delete old round_* checkpoints but keep the most recent `keep_last`. + + Wall-clock checkpoints (wallclock_*) are never removed. 
+ """ + pattern = re.compile(r"round_(\d+)_(before|after)_agg") + # Collect candidate directories and their round numbers + ckpts = [] + for d in self.checkpoint_dir.iterdir(): + m = pattern.fullmatch(d.name) + if m and d != self.latest_checkpoint_dir: + ckpts.append((int(m.group(1)), d)) + + # Sort by round number so oldest come first + ckpts.sort(key=lambda x: x[0]) + + # Remove all but the newest `keep_last` checkpoints + for _, d in ckpts[:-keep_last]: + try: + shutil.rmtree(d, ignore_errors=True) + except Exception as e: + self.log(f"[WARN] Failed to delete old checkpoint {d}: {e}") + class FullModelLLMAggregator(LLMAggregator): """Custom aggregator that properly handles both PEFT and non-PEFT models.""" @@ -565,6 +592,10 @@ def set_model_params(self, model_parameters) -> None: state_dict=model_parameters, synchronize=True ) + + # Clean up old round checkpoints on the server as well + if self.training_args.should_save: + self._cleanup_old_round_checkpoints() elapsed = time.perf_counter() - t0 self.log(f"set_model_params (server) took {elapsed:.3f}s") From 262f45eeaa9c4a606b936334d8e4af262f7663ae Mon Sep 17 00:00:00 2001 From: Marcos Villagra Date: Sat, 26 Jul 2025 16:53:36 +0000 Subject: [PATCH 053/168] Update FullModelLLMTrainer to use a time-based seed for reproducibility and adjust timeout duration in run_fedml_client_custom.sh and run_fedml_server_custom.sh scripts from 21600s to 22200s for extended execution time. --- python/spotlight_prj/fedllm/custom_trainer.py | 2 +- python/spotlight_prj/fedllm/scripts/run_fedml_client_custom.sh | 2 +- python/spotlight_prj/fedllm/scripts/run_fedml_server_custom.sh | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/python/spotlight_prj/fedllm/custom_trainer.py b/python/spotlight_prj/fedllm/custom_trainer.py index 4a2e6eb9e..cc17252be 100644 --- a/python/spotlight_prj/fedllm/custom_trainer.py +++ b/python/spotlight_prj/fedllm/custom_trainer.py @@ -282,7 +282,7 @@ def train(self, train_data, device, args): log_completions=False, save_steps=grpo_max_steps if grpo_max_steps > 0 else 500, # Save at the end if using max_steps # Add seed for reproducibility in federated setting - seed=42 + self.round_idx * 100 + args.rank, # Different seed per round and client + seed=int(time.perf_counter_ns() % (2**32)) #report_to="wandb", scale_rewards=False, temperature=0.7, diff --git a/python/spotlight_prj/fedllm/scripts/run_fedml_client_custom.sh b/python/spotlight_prj/fedllm/scripts/run_fedml_client_custom.sh index 29288d35d..09fa207cb 100755 --- a/python/spotlight_prj/fedllm/scripts/run_fedml_client_custom.sh +++ b/python/spotlight_prj/fedllm/scripts/run_fedml_client_custom.sh @@ -25,7 +25,7 @@ LAUNCHER="${6:-"auto"}" CONFIG_PATH="${7:-"fedml_config/grpo_gsm8k_test_config.yaml"}" # Use the custom launcher that properly handles non-PEFT models -timeout --signal=SIGINT --kill-after=30s 21600 python3 launch_fedllm_custom.py \ +timeout --signal=SIGINT --kill-after=30s 22200 python3 launch_fedllm_custom.py \ --cf "${CONFIG_PATH}" \ --rank "${RANK}" \ --role client \ diff --git a/python/spotlight_prj/fedllm/scripts/run_fedml_server_custom.sh b/python/spotlight_prj/fedllm/scripts/run_fedml_server_custom.sh index e50d58f85..8dd277898 100755 --- a/python/spotlight_prj/fedllm/scripts/run_fedml_server_custom.sh +++ b/python/spotlight_prj/fedllm/scripts/run_fedml_server_custom.sh @@ -26,7 +26,7 @@ LAUNCHER="${6:-"auto"}" CONFIG_PATH="${7:-"fedml_config/fedml_config.yaml"}" # Use the custom launcher that properly handles non-PEFT models -timeout 
--signal=SIGINT --kill-after=30s 21600 python3 launch_fedllm_custom.py \ +timeout --signal=SIGINT --kill-after=30s 22200 python3 launch_fedllm_custom.py \ --cf "${CONFIG_PATH}" \ --rank "${RANK}" \ --role server \ From af28fb68231c2dd1e8b6fbcf0bb2ae475a72f7f5 Mon Sep 17 00:00:00 2001 From: Marcos Villagra Date: Sat, 26 Jul 2025 17:03:46 +0000 Subject: [PATCH 054/168] Fix seed assignment in FullModelLLMTrainer to ensure proper formatting and maintain reproducibility in federated training. --- python/spotlight_prj/fedllm/custom_trainer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/spotlight_prj/fedllm/custom_trainer.py b/python/spotlight_prj/fedllm/custom_trainer.py index cc17252be..780352be5 100644 --- a/python/spotlight_prj/fedllm/custom_trainer.py +++ b/python/spotlight_prj/fedllm/custom_trainer.py @@ -282,7 +282,7 @@ def train(self, train_data, device, args): log_completions=False, save_steps=grpo_max_steps if grpo_max_steps > 0 else 500, # Save at the end if using max_steps # Add seed for reproducibility in federated setting - seed=int(time.perf_counter_ns() % (2**32)) + seed=int(time.perf_counter_ns() % (2**32)), #report_to="wandb", scale_rewards=False, temperature=0.7, From 1adc9ed6a6914f4fbe61ec2d08529fdd73481f9a Mon Sep 17 00:00:00 2001 From: Marcos Villagra Date: Sun, 27 Jul 2025 00:53:38 +0000 Subject: [PATCH 055/168] Refactor aggregate method documentation in FullModelLLMAggregator to clarify Nesterov momentum steps and improve code readability. --- python/spotlight_prj/fedllm/custom_trainer.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/python/spotlight_prj/fedllm/custom_trainer.py b/python/spotlight_prj/fedllm/custom_trainer.py index 780352be5..b984058c9 100644 --- a/python/spotlight_prj/fedllm/custom_trainer.py +++ b/python/spotlight_prj/fedllm/custom_trainer.py @@ -622,8 +622,9 @@ def set_model_params(self, model_parameters) -> None: self.log("finished") - """ + def aggregate(self, raw_client_model_list): + """ Aggregate client models with Nesterov momentum. Steps @@ -635,7 +636,7 @@ def aggregate(self, raw_client_model_list): 3. Perform an SGD update with momentum on the server side. If ``self._nesterov`` is ``True``, use the Nesterov variant. 4. Save the updated parameters via ``set_model_params`` and return them. - + """ self.log("aggregate: start") # Step-1: FedAvg aggregation (reuse FedMLAggOperator) @@ -676,7 +677,6 @@ def aggregate(self, raw_client_model_list): self.set_model_params(updated_params) self.log("aggregate: finished") return updated_params - """ From 77222e19b46065631c5ae9bfaf69e6f78cc14fbc Mon Sep 17 00:00:00 2001 From: Marcos Villagra Date: Sun, 27 Jul 2025 01:35:34 +0000 Subject: [PATCH 056/168] Add method to cleanup old round checkpoints in FullModelLLMAggregator Implement _cleanup_old_round_checkpoints to delete outdated round checkpoints while retaining the specified number of recent ones. This enhancement aids in managing storage and improving training efficiency by removing unnecessary files after each training round, while ensuring wall-clock checkpoints remain intact. 
--- python/spotlight_prj/fedllm/custom_trainer.py | 23 +++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/python/spotlight_prj/fedllm/custom_trainer.py b/python/spotlight_prj/fedllm/custom_trainer.py index b984058c9..4042b55a4 100644 --- a/python/spotlight_prj/fedllm/custom_trainer.py +++ b/python/spotlight_prj/fedllm/custom_trainer.py @@ -533,6 +533,29 @@ def __init__(self, *args, **kwargs): # Internal helpers # ------------------------------------------------------------------ + def _cleanup_old_round_checkpoints(self, keep_last: int = 1): + """Delete old round_* checkpoints but keep the most recent `keep_last`. + + Wall-clock checkpoints (wallclock_*) are never removed. + """ + pattern = re.compile(r"round_(\d+)_(before|after)_agg") + # Collect candidate directories and their round numbers + ckpts = [] + for d in self.checkpoint_dir.iterdir(): + m = pattern.fullmatch(d.name) + if m and d != self.latest_checkpoint_dir: + ckpts.append((int(m.group(1)), d)) + + # Sort by round number so oldest come first + ckpts.sort(key=lambda x: x[0]) + + # Remove all but the newest `keep_last` checkpoints + for _, d in ckpts[:-keep_last]: + try: + shutil.rmtree(d, ignore_errors=True) + except Exception as e: + self.log(f"[WARN] Failed to delete old checkpoint {d}: {e}") + def _periodic_checkpoint_loop(self): """Loop that sleeps ``_checkpoint_interval`` seconds then writes a checkpoint until ``_stop_checkpoint_evt`` is set (i.e., program exit). From 09fb8b67f150048f9ae230d564dd41fc5dc0831f Mon Sep 17 00:00:00 2001 From: Marcos Villagra Date: Sun, 27 Jul 2025 01:36:13 +0000 Subject: [PATCH 057/168] Add evaluation script for Qwen3-0.6B on GSM8K test split Introduce a new script, validatation.py, to evaluate the Qwen3-0.6B model on the GSM8K dataset using vLLM. The script includes functionality for model loading, batch processing, and reward calculation based on prediction accuracy. It supports command-line arguments for customization of evaluation parameters such as rollouts, batch size, and total examples. --- python/spotlight_prj/fedllm/validatation.py | 177 ++++++++++++++++++++ 1 file changed, 177 insertions(+) create mode 100644 python/spotlight_prj/fedllm/validatation.py diff --git a/python/spotlight_prj/fedllm/validatation.py b/python/spotlight_prj/fedllm/validatation.py new file mode 100644 index 000000000..5dfce0e9d --- /dev/null +++ b/python/spotlight_prj/fedllm/validatation.py @@ -0,0 +1,177 @@ +#!/usr/bin/env python +""" +Evaluate Qwen3‑0.6B on GSM8K (test split) with vLLM. 
+ +Usage examples +-------------- +# Default: download model, 1 rollout, 8‑question batches, 100 examples +python eval_qwen3_gsm8k.py + +# Local checkpoint, 4 rollouts, 16 questions per batch, 200 examples +python eval_qwen3_gsm8k.py \ + --model /path/to/Qwen3-0.6B-local \ + --rollouts 4 \ + --batch-examples 16 \ + --num-examples 200 +""" + +import argparse +import re +import time +from fractions import Fraction +from typing import Optional, List + +from datasets import load_dataset # pip install datasets +from vllm import LLM, SamplingParams # pip install vllm + +# --------------------------- reward configuration --------------------------- + +BOXED_RE = re.compile(r"\\boxed\{([^}]*)\}") # capture content inside \boxed{…} + +EXACT_MATCH_REWARD = 2.0 +NUM_EQ_REWARD = 1.5 +INCORRECT_REWARD = 0.0 + + +# ------------------------------- utilities --------------------------------- + +def to_number(text: str) -> Optional[float]: + """Convert string to float if possible, handling simple fractions.""" + text = text.replace(",", "").strip() + # Fractions like 3/4 + if "/" in text: + try: + return float(Fraction(text)) + except (ValueError, ZeroDivisionError): + pass + try: + return float(text) + except ValueError: + return None + + +def extract_boxed(text: str) -> str: + """Return first \\boxed{...} contents; '' if none.""" + m = BOXED_RE.search(text) + return m.group(1) if m else "" + + +def reward(pred: str, gold: str) -> float: + """Assign reward based on exact match or numeric equivalence.""" + pred, gold = pred.strip(), gold.strip() + if pred == gold: + return EXACT_MATCH_REWARD + p_num, g_num = to_number(pred), to_number(gold) + if (p_num is not None and g_num is not None + and abs(p_num - g_num) < 1e-4): + return NUM_EQ_REWARD + return INCORRECT_REWARD + + +def batched(lst: List, n: int): + """Yield successive n‑sized chunks from *lst*.""" + for i in range(0, len(lst), n): + yield lst[i:i + n] + + +# ------------------------------- main -------------------------------------- + +def parse_args() -> argparse.Namespace: + p = argparse.ArgumentParser() + p.add_argument("--rollouts", type=int, default=4, + help="completions per example (default: 4)") + p.add_argument("--batch-examples", type=int, default=16, + help="examples per vLLM inference call (default: 2)") + p.add_argument("--num-examples", type=int, default=100, + help="total GSM8K test examples to evaluate (default: 100)") + p.add_argument( + "--model", + default=None, + help=("HF repo ID or local checkpoint dir. " + "If omitted, downloads Qwen/Qwen3-0.6B automatically."), + ) + + # When loading a locally fine-tuned checkpoint, the tokenizer files are + # often *not* included in the output directory. Allow the user to point + # to an existing tokenizer (typically the original base model on the HF + # Hub) to avoid the `vocab_file is None` error coming from + # `transformers`. + p.add_argument("--tokenizer", + default="Qwen/Qwen3-0.6B", + help=("Tokenizer repo / path (default: Qwen/Qwen3-0.6B). 
" + "Override if you need a different tokenizer.")) + p.add_argument("--max-tokens", type=int, default=1024, + help="generation length cap (tokens)") + p.add_argument("--temperature", type=float, default=0.7) + p.add_argument("--top-p", type=float, default=0.95) + return p.parse_args() + + +def main() -> None: + args = parse_args() + + # ------------------------- resolve model path -------------------------- + if args.model is None: + args.model = "Qwen/Qwen3-0.6B" + print(f"[INFO] No --model given → downloading '{args.model}' " + "from Hugging Face Hub…") + + # ----------------------- initialize LLM & sampler ---------------------- + # Use a fallback tokenizer path if the user provided one; otherwise rely on + # the model path itself. This prevents crashes when the checkpoint + # directory does not contain tokenizer artifacts. + llm = LLM(model=args.model, + tokenizer=args.tokenizer, + trust_remote_code=True, # Qwen uses custom code + dtype="auto") # let vLLM choose BF16 / FP16 / FP32 + + sampler = SamplingParams( + temperature=args.temperature, + top_p=args.top_p, + max_tokens=args.max_tokens, + n=args.rollouts + ) + + # --------------------------- load dataset ----------------------------- + ds = load_dataset("openai/gsm8k", "main", split="test") + ds = ds.shuffle(seed=42).select(range(min(args.num_examples, len(ds)))) + + total_reward = 0.0 + total_completions = len(ds) * args.rollouts + + # --------------------------- evaluation ------------------------------- + print(f"[INFO] Starting generation for {len(ds)} examples in batches of {args.batch_examples}...") + start_time = time.time() + + batch_count = 0 + for batch in batched(list(ds), args.batch_examples): + batch_start = time.time() + prompts = [ex["question"] for ex in batch] # **raw questions only** + outputs = llm.generate(prompts, sampler) + batch_end = time.time() + + batch_count += 1 + batch_time = batch_end - batch_start + print(f"[TIMING] Batch {batch_count} ({len(batch)} examples): {batch_time:.2f}s") + + for ex, gen in zip(batch, outputs): + gold = ex["answer"].split("####")[-1].strip() + for out in gen.outputs: + pred = extract_boxed(out.text) + total_reward += reward(pred, gold) + + end_time = time.time() + total_generation_time = end_time - start_time + + avg_reward = total_reward / total_completions + print(f"\n[TIMING] Total generation time: {total_generation_time:.2f}s") + print(f"[TIMING] Average time per batch: {total_generation_time / batch_count:.2f}s") + print(f"[TIMING] Average time per example: {total_generation_time / len(ds):.2f}s") + print(f"[TIMING] Average time per completion: {total_generation_time / total_completions:.3f}s") + print(f"\nEvaluated {len(ds)} examples × {args.rollouts} rollouts " + f"(batch size = {args.batch_examples}).") + print(f"Average reward: {avg_reward:.4f}") + + +if __name__ == "__main__": + main() \ No newline at end of file From daef41604ac4113bc1f758610444628b5758db46 Mon Sep 17 00:00:00 2001 From: Marcos Villagra Date: Sun, 27 Jul 2025 14:15:51 +0000 Subject: [PATCH 058/168] Refactor aggregate method documentation in FullModelLLMAggregator and update model name in grpo_gsm8k_test_config.yaml to Qwen3-1.7B --- python/spotlight_prj/fedllm/custom_trainer.py | 8 ++++---- .../fedllm/fedml_config/grpo_gsm8k_test_config.yaml | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/python/spotlight_prj/fedllm/custom_trainer.py b/python/spotlight_prj/fedllm/custom_trainer.py index 4042b55a4..4ce261ed4 100644 --- a/python/spotlight_prj/fedllm/custom_trainer.py +++ 
b/python/spotlight_prj/fedllm/custom_trainer.py @@ -645,9 +645,9 @@ def set_model_params(self, model_parameters) -> None: self.log("finished") - + """ def aggregate(self, raw_client_model_list): - """ + Aggregate client models with Nesterov momentum. Steps @@ -659,7 +659,7 @@ def aggregate(self, raw_client_model_list): 3. Perform an SGD update with momentum on the server side. If ``self._nesterov`` is ``True``, use the Nesterov variant. 4. Save the updated parameters via ``set_model_params`` and return them. - """ + self.log("aggregate: start") # Step-1: FedAvg aggregation (reuse FedMLAggOperator) @@ -700,7 +700,7 @@ def aggregate(self, raw_client_model_list): self.set_model_params(updated_params) self.log("aggregate: finished") return updated_params - + """ class TrainingMetricsLogger: diff --git a/python/spotlight_prj/fedllm/fedml_config/grpo_gsm8k_test_config.yaml b/python/spotlight_prj/fedllm/fedml_config/grpo_gsm8k_test_config.yaml index 84eb0db2e..1849a20d2 100644 --- a/python/spotlight_prj/fedllm/fedml_config/grpo_gsm8k_test_config.yaml +++ b/python/spotlight_prj/fedllm/fedml_config/grpo_gsm8k_test_config.yaml @@ -18,7 +18,7 @@ data_args: model_args: skip_log_model_net: True - model_name_or_path: "Qwen/Qwen3-0.6B" + model_name_or_path: "Qwen/Qwen3-1.7B" peft_type: "none" # Full model fine-tuning use_flash_attention: False From 1da9db9c6ba80e4410362c5b49842a39ab2c0701 Mon Sep 17 00:00:00 2001 From: Marcos Villagra Date: Sun, 27 Jul 2025 14:40:53 +0000 Subject: [PATCH 059/168] Update model name in save_initial_checkpoint.py from Qwen3-0.6B to Qwen3-1.7B for consistency with recent changes. --- python/spotlight_prj/fedllm/scripts/save_initial_checkpoint.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/spotlight_prj/fedllm/scripts/save_initial_checkpoint.py b/python/spotlight_prj/fedllm/scripts/save_initial_checkpoint.py index 4b7ef2147..a178b6e58 100644 --- a/python/spotlight_prj/fedllm/scripts/save_initial_checkpoint.py +++ b/python/spotlight_prj/fedllm/scripts/save_initial_checkpoint.py @@ -9,7 +9,7 @@ # Configuration RUN_ID = os.environ.get("RUN_ID", "test_run") -MODEL_NAME = "Qwen/Qwen3-0.6B" +MODEL_NAME = "Qwen/Qwen3-1.7B" OUTPUT_DIR = f"/workspace/FedML/python/spotlight_prj/fedllm/.logs/FedML/{RUN_ID}/node_0/init" print(f"Saving initial checkpoint for model: {MODEL_NAME}") From cf97042f60af18713e72b7f0c6e0112ac0674698 Mon Sep 17 00:00:00 2001 From: Marcos Villagra Date: Sun, 27 Jul 2025 15:21:03 +0000 Subject: [PATCH 060/168] Adjust GRPO batch size in grpo_gsm8k_test_config.yaml from 4 to 2 for faster testing. 
--- .../fedllm/fedml_config/grpo_gsm8k_test_config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/spotlight_prj/fedllm/fedml_config/grpo_gsm8k_test_config.yaml b/python/spotlight_prj/fedllm/fedml_config/grpo_gsm8k_test_config.yaml index 1849a20d2..e38f9ee7c 100644 --- a/python/spotlight_prj/fedllm/fedml_config/grpo_gsm8k_test_config.yaml +++ b/python/spotlight_prj/fedllm/fedml_config/grpo_gsm8k_test_config.yaml @@ -32,7 +32,7 @@ train_args: # GRPO-specific settings for testing grpo_max_steps: 50 # Only 10 training steps per round for quick testing grpo_num_epochs: 1 # Ignored when grpo_max_steps > 0 - grpo_batch_size: 4 # Smaller batch size for faster testing + grpo_batch_size: 2 # Smaller batch size for faster testing # FedML training settings (ignored when using GRPO) local_num_train_epochs: 1 local_max_steps: -1 From 8ee82b4b66ab47f5791fd78b4481bb447ad8b5de Mon Sep 17 00:00:00 2001 From: Marcos Villagra Date: Sun, 27 Jul 2025 15:33:29 +0000 Subject: [PATCH 061/168] Update gradient_checkpointing settings in FullModelLLMTrainer and grpo_gsm8k_test_config.yaml for consistency with configuration --- python/spotlight_prj/fedllm/custom_trainer.py | 2 +- .../fedllm/fedml_config/grpo_gsm8k_test_config.yaml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/python/spotlight_prj/fedllm/custom_trainer.py b/python/spotlight_prj/fedllm/custom_trainer.py index 4ce261ed4..59e586bcf 100644 --- a/python/spotlight_prj/fedllm/custom_trainer.py +++ b/python/spotlight_prj/fedllm/custom_trainer.py @@ -276,7 +276,7 @@ def train(self, train_data, device, args): learning_rate=5e-6, bf16=use_bf16, # Match model precision fp16=not use_bf16, # Use fp16 if not bf16 - gradient_checkpointing=False, # Keep consistent with config + gradient_checkpointing=getattr(args, 'gradient_checkpointing', False), #logging_steps=5 if grpo_max_steps > 0 and grpo_max_steps < 50 else 25, # More frequent logging for short runs logging_steps=1, log_completions=False, diff --git a/python/spotlight_prj/fedllm/fedml_config/grpo_gsm8k_test_config.yaml b/python/spotlight_prj/fedllm/fedml_config/grpo_gsm8k_test_config.yaml index e38f9ee7c..34cccb009 100644 --- a/python/spotlight_prj/fedllm/fedml_config/grpo_gsm8k_test_config.yaml +++ b/python/spotlight_prj/fedllm/fedml_config/grpo_gsm8k_test_config.yaml @@ -42,7 +42,7 @@ train_args: seed: 1234 fp16: True # Use fp16 instead of bf16 for GPU compatibility bf16: False - gradient_checkpointing: False # Match GRPO config + gradient_checkpointing: True # Match GRPO config per_device_train_batch_size: 4 # Will be overridden by GRPO per_device_eval_batch_size: 8 gradient_accumulation_steps: 2 # Will be overridden by GRPO From 852d836ede98b0a860db2cc28410a74c838ae15b Mon Sep 17 00:00:00 2001 From: Marcos Villagra Date: Sun, 27 Jul 2025 15:52:44 +0000 Subject: [PATCH 062/168] Add optimizer configuration in FullModelLLMTrainer Set the optimizer to "adamw_bnb_8bit" in the FullModelLLMTrainer class to enhance training performance and efficiency. 
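For context (not shown in the diff): the Hugging Face Trainer resolves optim="adamw_bnb_8bit" to bitsandbytes' 8-bit AdamW, which keeps the exp_avg / exp_avg_sq state quantized to 8 bits and so cuts optimizer memory roughly 4x versus full-precision AdamW, a useful saving when full-model GRPO fine-tuning already fills the GPU. A standalone sketch, assuming bitsandbytes is installed and `model` is an already loaded policy model:

    import bitsandbytes as bnb

    # 8-bit optimizer states instead of fp32; gradients and weights are unchanged
    optimizer = bnb.optim.AdamW8bit(model.parameters(), lr=5e-6)
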
--- python/spotlight_prj/fedllm/custom_trainer.py | 1 + 1 file changed, 1 insertion(+) diff --git a/python/spotlight_prj/fedllm/custom_trainer.py b/python/spotlight_prj/fedllm/custom_trainer.py index 59e586bcf..7fd9fd52b 100644 --- a/python/spotlight_prj/fedllm/custom_trainer.py +++ b/python/spotlight_prj/fedllm/custom_trainer.py @@ -291,6 +291,7 @@ def train(self, train_data, device, args): repetition_penalty=1.1, epsilon=0.2, beta=0.1, + optim="adamw_bnb_8bit", ) self.log(f"GRPO Config - bf16: {use_bf16}, fp16: {not use_bf16}, batch_size: {grpo_batch_size}") From 36bae8c2d91750ba299621f4e88cd6aa3bdf1099 Mon Sep 17 00:00:00 2001 From: Marcos Villagra Date: Sun, 27 Jul 2025 16:57:35 +0000 Subject: [PATCH 063/168] Add initial checkpoint saving in run_fedml_server_custom.sh Integrate a call to save_initial_checkpoint.py in the run_fedml_server_custom.sh script to ensure that an initial model checkpoint is saved before launching the FedML server. This addition enhances the training workflow by preserving the starting state of the model. --- python/spotlight_prj/fedllm/scripts/run_fedml_server_custom.sh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/python/spotlight_prj/fedllm/scripts/run_fedml_server_custom.sh b/python/spotlight_prj/fedllm/scripts/run_fedml_server_custom.sh index 8dd277898..08f7d83bd 100755 --- a/python/spotlight_prj/fedllm/scripts/run_fedml_server_custom.sh +++ b/python/spotlight_prj/fedllm/scripts/run_fedml_server_custom.sh @@ -25,6 +25,8 @@ LAUNCHER="${6:-"auto"}" # FedML config CONFIG_PATH="${7:-"fedml_config/fedml_config.yaml"}" +python scripts/save_initial_checkpoint.py + # Use the custom launcher that properly handles non-PEFT models timeout --signal=SIGINT --kill-after=30s 22200 python3 launch_fedllm_custom.py \ --cf "${CONFIG_PATH}" \ From fed4829879b484c1086befb578793256b1ad5361 Mon Sep 17 00:00:00 2001 From: Marcos Villagra Date: Sun, 27 Jul 2025 17:05:25 +0000 Subject: [PATCH 064/168] Update training configuration and model parameters in FullModelLLMTrainer and grpo_gsm8k_test_config.yaml - Set TRANSFORMERS_VERBOSITY to "error" to reduce logging noise. - Adjusted max_completion_length and max_new_tokens from 512 to 256 for improved performance. - Modified client_num_in_total and client_num_per_round to 1 for single client testing. - Increased grpo_batch_size from 2 to 4 for faster testing. 
--- python/spotlight_prj/fedllm/custom_trainer.py | 8 +++++--- .../fedllm/fedml_config/grpo_gsm8k_test_config.yaml | 6 +++--- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/python/spotlight_prj/fedllm/custom_trainer.py b/python/spotlight_prj/fedllm/custom_trainer.py index 7fd9fd52b..7edbc51aa 100644 --- a/python/spotlight_prj/fedllm/custom_trainer.py +++ b/python/spotlight_prj/fedllm/custom_trainer.py @@ -110,6 +110,8 @@ def __init__(self, *args, **kwargs): wandb_project="fedllm-grpo-training", args=self.args, ) + + os.environ["TRANSFORMERS_VERBOSITY"] = "error" def to_number(self, text: str) -> Optional[float]: """Convert string to float if possible, handling simple fractions.""" @@ -269,7 +271,7 @@ def train(self, train_data, device, args): output_dir=str(self.checkpoint_dir / "grpo"), per_device_train_batch_size=grpo_batch_size, gradient_accumulation_steps=gradient_accumulation_steps, - max_completion_length=512, + max_completion_length=256, num_generations=num_generations, # Adjusted based on effective batch size num_train_epochs=grpo_num_epochs if grpo_max_steps <= 0 else 1, # Use 1 epoch if max_steps is set max_steps=grpo_max_steps if grpo_max_steps > 0 else -1, # Override epochs with max_steps @@ -279,7 +281,7 @@ def train(self, train_data, device, args): gradient_checkpointing=getattr(args, 'gradient_checkpointing', False), #logging_steps=5 if grpo_max_steps > 0 and grpo_max_steps < 50 else 25, # More frequent logging for short runs logging_steps=1, - log_completions=False, + log_completions=True, save_steps=grpo_max_steps if grpo_max_steps > 0 else 500, # Save at the end if using max_steps # Add seed for reproducibility in federated setting seed=int(time.perf_counter_ns() % (2**32)), @@ -311,7 +313,7 @@ def train(self, train_data, device, args): "do_sample": True, "pad_token_id": fresh_tokenizer.eos_token_id, "eos_token_id": fresh_tokenizer.eos_token_id, - "max_new_tokens": 512, + "max_new_tokens": 256, "length_penalty": 1.0, # Neutral length penalty } diff --git a/python/spotlight_prj/fedllm/fedml_config/grpo_gsm8k_test_config.yaml b/python/spotlight_prj/fedllm/fedml_config/grpo_gsm8k_test_config.yaml index 34cccb009..0c0235a33 100644 --- a/python/spotlight_prj/fedllm/fedml_config/grpo_gsm8k_test_config.yaml +++ b/python/spotlight_prj/fedllm/fedml_config/grpo_gsm8k_test_config.yaml @@ -26,13 +26,13 @@ train_args: federated_optimizer: "FedAvg" client_optimizer: "adamw_torch" server_optimizer: "FedAvg" - client_num_in_total: 2 # Single client setup - client_num_per_round: 2 # Single client setup + client_num_in_total: 1 # Single client setup + client_num_per_round: 1 # Single client setup comm_round: 30 # Reduced to 3 rounds for testing # GRPO-specific settings for testing grpo_max_steps: 50 # Only 10 training steps per round for quick testing grpo_num_epochs: 1 # Ignored when grpo_max_steps > 0 - grpo_batch_size: 2 # Smaller batch size for faster testing + grpo_batch_size: 4 # Smaller batch size for faster testing # FedML training settings (ignored when using GRPO) local_num_train_epochs: 1 local_max_steps: -1 From 5870af86b487239f93c7732bef7fd3f37235a08b Mon Sep 17 00:00:00 2001 From: Marcos Villagra Date: Sun, 27 Jul 2025 17:22:21 +0000 Subject: [PATCH 065/168] Enhance custom trainer and configuration for improved logging and performance - Suppress HF Transformers advisory warnings by setting environment variables before importing transformers. - Enable flash attention in grpo_gsm8k_test_config.yaml for better model performance during training. 
--- python/spotlight_prj/fedllm/custom_trainer.py | 7 ++++++- .../fedllm/fedml_config/grpo_gsm8k_test_config.yaml | 2 +- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/python/spotlight_prj/fedllm/custom_trainer.py b/python/spotlight_prj/fedllm/custom_trainer.py index 7edbc51aa..a54d5bab2 100644 --- a/python/spotlight_prj/fedllm/custom_trainer.py +++ b/python/spotlight_prj/fedllm/custom_trainer.py @@ -6,6 +6,10 @@ This version also integrates GRPO training for GSM8K dataset. """ +# Silence HF Transformers advisory warnings about caching vs gradient checkpointing – must be set BEFORE importing transformers +os.environ["TRANSFORMERS_NO_ADVISORY_WARNINGS"] = "1" +os.environ["TRANSFORMERS_VERBOSITY"] = "error" + import sys import os sys.path.insert(0, os.path.dirname(os.path.abspath(__file__))) @@ -40,6 +44,8 @@ import json + + class TimedGRPOTrainer(GRPOTrainer): def _record_step_stats(self, stats): # first let the parent push its metrics @@ -111,7 +117,6 @@ def __init__(self, *args, **kwargs): args=self.args, ) - os.environ["TRANSFORMERS_VERBOSITY"] = "error" def to_number(self, text: str) -> Optional[float]: """Convert string to float if possible, handling simple fractions.""" diff --git a/python/spotlight_prj/fedllm/fedml_config/grpo_gsm8k_test_config.yaml b/python/spotlight_prj/fedllm/fedml_config/grpo_gsm8k_test_config.yaml index 0c0235a33..d182fd0a4 100644 --- a/python/spotlight_prj/fedllm/fedml_config/grpo_gsm8k_test_config.yaml +++ b/python/spotlight_prj/fedllm/fedml_config/grpo_gsm8k_test_config.yaml @@ -20,7 +20,7 @@ model_args: skip_log_model_net: True model_name_or_path: "Qwen/Qwen3-1.7B" peft_type: "none" # Full model fine-tuning - use_flash_attention: False + use_flash_attention: True train_args: federated_optimizer: "FedAvg" From 5f4fd91d5f0b8c63bb36410ed33cbd3adde6f862 Mon Sep 17 00:00:00 2001 From: Marcos Villagra Date: Sun, 27 Jul 2025 17:25:24 +0000 Subject: [PATCH 066/168] Reorganize environment variable settings for HF Transformers in custom trainer - Moved the suppression of HF Transformers advisory warnings to occur before importing transformers to ensure proper functionality. --- python/spotlight_prj/fedllm/custom_trainer.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/python/spotlight_prj/fedllm/custom_trainer.py b/python/spotlight_prj/fedllm/custom_trainer.py index a54d5bab2..e1fc9ecef 100644 --- a/python/spotlight_prj/fedllm/custom_trainer.py +++ b/python/spotlight_prj/fedllm/custom_trainer.py @@ -6,14 +6,16 @@ This version also integrates GRPO training for GSM8K dataset. """ -# Silence HF Transformers advisory warnings about caching vs gradient checkpointing – must be set BEFORE importing transformers -os.environ["TRANSFORMERS_NO_ADVISORY_WARNINGS"] = "1" -os.environ["TRANSFORMERS_VERBOSITY"] = "error" + import sys import os sys.path.insert(0, os.path.dirname(os.path.abspath(__file__))) +# Silence HF Transformers advisory warnings about caching vs gradient checkpointing – must be set BEFORE importing transformers +os.environ["TRANSFORMERS_NO_ADVISORY_WARNINGS"] = "1" +os.environ["TRANSFORMERS_VERBOSITY"] = "error" + import re import torch from collections import OrderedDict From c4bfc951985e7645c6053a1af36d999ea3fa2fe1 Mon Sep 17 00:00:00 2001 From: Marcos Villagra Date: Sun, 27 Jul 2025 17:28:47 +0000 Subject: [PATCH 067/168] Disable flash attention in grpo_gsm8k_test_config.yaml to revert to standard model training settings. 
--- .../fedllm/fedml_config/grpo_gsm8k_test_config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/spotlight_prj/fedllm/fedml_config/grpo_gsm8k_test_config.yaml b/python/spotlight_prj/fedllm/fedml_config/grpo_gsm8k_test_config.yaml index d182fd0a4..0c0235a33 100644 --- a/python/spotlight_prj/fedllm/fedml_config/grpo_gsm8k_test_config.yaml +++ b/python/spotlight_prj/fedllm/fedml_config/grpo_gsm8k_test_config.yaml @@ -20,7 +20,7 @@ model_args: skip_log_model_net: True model_name_or_path: "Qwen/Qwen3-1.7B" peft_type: "none" # Full model fine-tuning - use_flash_attention: True + use_flash_attention: False train_args: federated_optimizer: "FedAvg" From ad6c7793b9640a098e04833330e838077b4301f3 Mon Sep 17 00:00:00 2001 From: Marcos Villagra Date: Sun, 27 Jul 2025 18:04:25 +0000 Subject: [PATCH 068/168] Enhance model loading and tokenizer initialization in custom trainer and checkpoint script - Added trust_remote_code=True to model and tokenizer loading in custom_trainer.py and save_initial_checkpoint.py for improved security and functionality. - Included a demonstration of model inference in custom_trainer.py to verify model behavior after loading. --- python/spotlight_prj/fedllm/custom_trainer.py | 17 +++++++++++++---- .../fedllm/scripts/save_initial_checkpoint.py | 4 ++-- 2 files changed, 15 insertions(+), 6 deletions(-) diff --git a/python/spotlight_prj/fedllm/custom_trainer.py b/python/spotlight_prj/fedllm/custom_trainer.py index e1fc9ecef..c41ea386f 100644 --- a/python/spotlight_prj/fedllm/custom_trainer.py +++ b/python/spotlight_prj/fedllm/custom_trainer.py @@ -220,23 +220,32 @@ def train(self, train_data, device, args): fresh_model = AutoModelForCausalLM.from_pretrained( model_name, torch_dtype=torch.bfloat16, - use_cache=False + use_cache=False, + trust_remote_code=True ) else: fresh_model = AutoModelForCausalLM.from_pretrained( model_name, torch_dtype=torch.float32, # Use float32 for better stability - use_cache=False + use_cache=False, + trust_remote_code=True ) except Exception as e: self.log(f"Failed to load with requested precision, falling back to float32: {e}") fresh_model = AutoModelForCausalLM.from_pretrained( model_name, torch_dtype=torch.float32, # Fallback to float32 - use_cache=False + use_cache=False, + trust_remote_code=True ) - fresh_tokenizer = AutoTokenizer.from_pretrained(model_name) + fresh_tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) fresh_tokenizer.pad_token = fresh_tokenizer.eos_token + + print("\n=========================") + ids = fresh_tokenizer("1 + 1 =", return_tensors="pt").to(fresh_model.device) + out = fresh_model.generate(**ids, max_new_tokens=3) + print(fresh_tokenizer.decode(out[0], skip_special_tokens=True)) + print("=========================\n") # Copy current model state to fresh model (to preserve any training from previous rounds) if self.round_idx > 0: diff --git a/python/spotlight_prj/fedllm/scripts/save_initial_checkpoint.py b/python/spotlight_prj/fedllm/scripts/save_initial_checkpoint.py index a178b6e58..2b00909dd 100644 --- a/python/spotlight_prj/fedllm/scripts/save_initial_checkpoint.py +++ b/python/spotlight_prj/fedllm/scripts/save_initial_checkpoint.py @@ -20,8 +20,8 @@ # Load model and tokenizer print("Loading model...") -model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, torch_dtype=torch.float16) -tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME) +model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, torch_dtype=torch.float16, trust_remote_code=True) +tokenizer = 
AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True) # Save model in the format expected by FedML (pytorch_model.bin) print("Saving model checkpoint...") From b672c1a4efdde045bc2d82493148e70f10e15fc9 Mon Sep 17 00:00:00 2001 From: Marcos Villagra Date: Sun, 27 Jul 2025 18:12:24 +0000 Subject: [PATCH 069/168] Add warnings filter in custom trainer to suppress advisory messages - Imported the warnings module and set it to ignore warnings in custom_trainer.py to enhance clarity during model training. --- python/spotlight_prj/fedllm/custom_trainer.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/python/spotlight_prj/fedllm/custom_trainer.py b/python/spotlight_prj/fedllm/custom_trainer.py index c41ea386f..67f9f5cce 100644 --- a/python/spotlight_prj/fedllm/custom_trainer.py +++ b/python/spotlight_prj/fedllm/custom_trainer.py @@ -45,7 +45,8 @@ import wandb import json - +import warnings +warnings.filterwarnings("ignore") class TimedGRPOTrainer(GRPOTrainer): From a6930a9b53b9949aa8727ca6fa5b03045997d0de Mon Sep 17 00:00:00 2001 From: Marcos Villagra Date: Sun, 27 Jul 2025 18:19:24 +0000 Subject: [PATCH 070/168] Add gradient check for NaN/Inf values in training process - Implemented a method in FullModelLLMTrainer to check for NaN or Inf values in model parameters after training updates. - Added a warning message to alert users if such values are detected, enhancing model training reliability. --- python/spotlight_prj/fedllm/custom_trainer.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/python/spotlight_prj/fedllm/custom_trainer.py b/python/spotlight_prj/fedllm/custom_trainer.py index 67f9f5cce..e35fa0343 100644 --- a/python/spotlight_prj/fedllm/custom_trainer.py +++ b/python/spotlight_prj/fedllm/custom_trainer.py @@ -341,6 +341,16 @@ def train(self, train_data, device, args): # Run GRPO training grpo_trainer.train() + + def _check_grad_nan(self, trainer): + for n, p in trainer.model.named_parameters(): + if torch.isnan(p).any() or torch.isinf(p).any(): + return True + return False + + # in your training loop, right after `grpo_trainer.step()` or similar + if self._check_grad_nan(grpo_trainer): + print("***‼ Detected NaN/Inf after update!***") # **Copy trained weights back to FedML's model** self.log("Copying GRPO-trained weights back to FedML model") From 4f9731137ed104b0b0378c85b48be0912033094e Mon Sep 17 00:00:00 2001 From: Marcos Villagra Date: Mon, 28 Jul 2025 01:05:22 +0000 Subject: [PATCH 071/168] Refactor training configuration and model handling in custom trainer - Removed environment variable settings for suppressing HF Transformers warnings from custom_trainer.py. - Enhanced model handling by offloading the FedML copy before allocating a fresh model and increasing max_completion_length and max_new_tokens to 512. - Updated grpo_gsm8k_test_config.yaml to support a two-client setup and adjusted batch size and gradient accumulation steps for testing efficiency. 
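The off-loading step called out above exists because VRAM held by the aggregation copy is not returned while its tensors stay on the GPU; a minimal sketch of the pattern follows, with illustrative names (held_model is not an identifier from the patch).

import torch
from transformers import AutoModelForCausalLM

def swap_in_fresh_model(held_model, model_name: str):
    # Park the copy kept for FedML aggregation on the host first, then ask the
    # caching allocator to actually release the freed VRAM.
    held_model.to("cpu")
    torch.cuda.empty_cache()

    # Only now allocate the second copy used for GRPO roll-outs.
    return AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype=torch.bfloat16,
        use_cache=False,
    )
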
--- python/spotlight_prj/fedllm/custom_trainer.py | 31 ++++++------------- .../fedml_config/grpo_gsm8k_test_config.yaml | 16 +++++----- 2 files changed, 17 insertions(+), 30 deletions(-) diff --git a/python/spotlight_prj/fedllm/custom_trainer.py b/python/spotlight_prj/fedllm/custom_trainer.py index e35fa0343..04db24a18 100644 --- a/python/spotlight_prj/fedllm/custom_trainer.py +++ b/python/spotlight_prj/fedllm/custom_trainer.py @@ -12,10 +12,6 @@ import os sys.path.insert(0, os.path.dirname(os.path.abspath(__file__))) -# Silence HF Transformers advisory warnings about caching vs gradient checkpointing – must be set BEFORE importing transformers -os.environ["TRANSFORMERS_NO_ADVISORY_WARNINGS"] = "1" -os.environ["TRANSFORMERS_VERBOSITY"] = "error" - import re import torch from collections import OrderedDict @@ -170,6 +166,7 @@ def reward_fn(self, completions, answer, **_): def train(self, train_data, device, args): """Override train to use GRPO training on GSM8K dataset.""" self.log("Starting GRPO training on GSM8K") + # Load GSM8K dataset ds = load_dataset("openai/gsm8k", "main", split="train") @@ -209,6 +206,10 @@ def train(self, train_data, device, args): # **FIX: Load fresh model and tokenizer for GRPO to avoid FedML state corruption** from transformers import AutoModelForCausalLM, AutoTokenizer import torch + + # ↓↓↓ off-load the FedML copy BEFORE allocating fresh_model + self.model.to("cpu") + torch.cuda.empty_cache() # actually releases the VRAM # Get model name from model_args model_name = self.model_args.model_name_or_path @@ -241,12 +242,6 @@ def train(self, train_data, device, args): ) fresh_tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) fresh_tokenizer.pad_token = fresh_tokenizer.eos_token - - print("\n=========================") - ids = fresh_tokenizer("1 + 1 =", return_tensors="pt").to(fresh_model.device) - out = fresh_model.generate(**ids, max_new_tokens=3) - print(fresh_tokenizer.decode(out[0], skip_special_tokens=True)) - print("=========================\n") # Copy current model state to fresh model (to preserve any training from previous rounds) if self.round_idx > 0: @@ -288,7 +283,7 @@ def train(self, train_data, device, args): output_dir=str(self.checkpoint_dir / "grpo"), per_device_train_batch_size=grpo_batch_size, gradient_accumulation_steps=gradient_accumulation_steps, - max_completion_length=256, + max_completion_length=512, num_generations=num_generations, # Adjusted based on effective batch size num_train_epochs=grpo_num_epochs if grpo_max_steps <= 0 else 1, # Use 1 epoch if max_steps is set max_steps=grpo_max_steps if grpo_max_steps > 0 else -1, # Override epochs with max_steps @@ -298,7 +293,7 @@ def train(self, train_data, device, args): gradient_checkpointing=getattr(args, 'gradient_checkpointing', False), #logging_steps=5 if grpo_max_steps > 0 and grpo_max_steps < 50 else 25, # More frequent logging for short runs logging_steps=1, - log_completions=True, + log_completions=False, save_steps=grpo_max_steps if grpo_max_steps > 0 else 500, # Save at the end if using max_steps # Add seed for reproducibility in federated setting seed=int(time.perf_counter_ns() % (2**32)), @@ -330,7 +325,8 @@ def train(self, train_data, device, args): "do_sample": True, "pad_token_id": fresh_tokenizer.eos_token_id, "eos_token_id": fresh_tokenizer.eos_token_id, - "max_new_tokens": 256, + "bos_token_id": fresh_tokenizer.bos_token_id, + "max_new_tokens": 512, "length_penalty": 1.0, # Neutral length penalty } @@ -342,16 +338,7 @@ def train(self, train_data, 
device, args): # Run GRPO training grpo_trainer.train() - def _check_grad_nan(self, trainer): - for n, p in trainer.model.named_parameters(): - if torch.isnan(p).any() or torch.isinf(p).any(): - return True - return False - # in your training loop, right after `grpo_trainer.step()` or similar - if self._check_grad_nan(grpo_trainer): - print("***‼ Detected NaN/Inf after update!***") - # **Copy trained weights back to FedML's model** self.log("Copying GRPO-trained weights back to FedML model") trained_state = fresh_model.state_dict() diff --git a/python/spotlight_prj/fedllm/fedml_config/grpo_gsm8k_test_config.yaml b/python/spotlight_prj/fedllm/fedml_config/grpo_gsm8k_test_config.yaml index 0c0235a33..0368a3d3e 100644 --- a/python/spotlight_prj/fedllm/fedml_config/grpo_gsm8k_test_config.yaml +++ b/python/spotlight_prj/fedllm/fedml_config/grpo_gsm8k_test_config.yaml @@ -26,13 +26,13 @@ train_args: federated_optimizer: "FedAvg" client_optimizer: "adamw_torch" server_optimizer: "FedAvg" - client_num_in_total: 1 # Single client setup - client_num_per_round: 1 # Single client setup + client_num_in_total: 2 # Single client setup + client_num_per_round: 2 # Single client setup comm_round: 30 # Reduced to 3 rounds for testing # GRPO-specific settings for testing - grpo_max_steps: 50 # Only 10 training steps per round for quick testing + grpo_max_steps: 50 grpo_num_epochs: 1 # Ignored when grpo_max_steps > 0 - grpo_batch_size: 4 # Smaller batch size for faster testing + grpo_batch_size: 2 # Smaller batch size for faster testing # FedML training settings (ignored when using GRPO) local_num_train_epochs: 1 local_max_steps: -1 @@ -42,12 +42,12 @@ train_args: seed: 1234 fp16: True # Use fp16 instead of bf16 for GPU compatibility bf16: False - gradient_checkpointing: True # Match GRPO config - per_device_train_batch_size: 4 # Will be overridden by GRPO + gradient_checkpointing: False # Match GRPO config + per_device_train_batch_size: 2 # Will be overridden by GRPO per_device_eval_batch_size: 8 - gradient_accumulation_steps: 2 # Will be overridden by GRPO + gradient_accumulation_steps: 1 # Will be overridden by GRPO eval_accumulation_steps: 4 - learning_rate: 5e-6 # Will be overridden by GRPO + learning_rate: 5e-6 # Will be overridden by GRPO warmup_steps: 0 output_dir: ".logs/FedML/{run_id}" logging_steps: 5 # Frequent logging for testing From 79b980ff27be230f0f95575ccf0d95a3a441ec0b Mon Sep 17 00:00:00 2001 From: Marcos Villagra Date: Mon, 28 Jul 2025 01:48:27 +0000 Subject: [PATCH 072/168] Refactor checkpoint saving logic in run_fedllm.py - Updated the checkpoint saving mechanism to ensure a single weight file is stored for reliable loading of models. - Differentiated filename handling for PEFT models and standard Hugging Face models to maintain compatibility with downstream logic. 
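The pay-off of the single-file convention is on the loading side, which no longer has to deal with sharded or safetensors checkpoints; a sketch of that consumer, assuming the PEFT and standard filenames the series settles on ("adapter_model.bin" and "pytorch_model.bin"):

from pathlib import Path
import torch

def load_single_file_checkpoint(checkpoint_dir, map_location="cpu"):
    # Exactly one weight file is expected per checkpoint directory, so no
    # shard-index or safetensors handling is required here.
    ckpt = Path(checkpoint_dir)
    for name in ("adapter_model.bin", "pytorch_model.bin"):
        weight_file = ckpt / name
        if weight_file.exists():
            return torch.load(weight_file, map_location=map_location)
    raise FileNotFoundError(f"no single-file checkpoint found in {ckpt}")
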
--- python/spotlight_prj/fedllm/run_fedllm.py | 23 +++++++++++++++-------- 1 file changed, 15 insertions(+), 8 deletions(-) diff --git a/python/spotlight_prj/fedllm/run_fedllm.py b/python/spotlight_prj/fedllm/run_fedllm.py index b726450ca..0b3836dde 100644 --- a/python/spotlight_prj/fedllm/run_fedllm.py +++ b/python/spotlight_prj/fedllm/run_fedllm.py @@ -157,15 +157,22 @@ def _save_checkpoint( if state_dict is None: state_dict = model.state_dict() - if isinstance(model, (PeftModel, PreTrainedModel)): - model.save_pretrained( - save_directory=str(checkpoint_dir), - state_dict=state_dict - ) + # Always store a **single** weight file so that downstream logic can + # reliably load it without having to handle Hugging Face sharded + # checkpoints. For PEFT (LoRA/Adapter) models we keep the original + # filename expected by `load_checkpoint()` (``adapter_model.bin``), + # otherwise we save using the standard Hugging Face filename + # ``pytorch_model.bin``. + + checkpoint_dir = Path(checkpoint_dir) + checkpoint_dir.mkdir(parents=True, exist_ok=True) + + if isinstance(model, PeftModel): + filename = PEFT_WEIGHTS_NAME # "adapter_model.bin" else: - checkpoint_dir = Path(checkpoint_dir) - checkpoint_dir.mkdir(parents=True, exist_ok=True) - torch.save(state_dict, str(checkpoint_dir / HF_WEIGHTS_NAME)) + filename = HF_WEIGHTS_NAME # "pytorch_model.bin" + + torch.save(state_dict, str(checkpoint_dir / filename)) def save_checkpoint( From 7667d9926938767fd35c8478ca1db010438fdb5c Mon Sep 17 00:00:00 2001 From: Marcos Villagra Date: Mon, 28 Jul 2025 01:59:25 +0000 Subject: [PATCH 073/168] Refactor checkpoint saving logic in run_fedllm.py - Improved the checkpoint saving process for Hugging Face Trainer to persist a single weight file, simplifying downstream logic. - Enhanced handling of state_dict to prioritize caller-provided weights during the saving process, ensuring more reliable model checkpoints. --- python/spotlight_prj/fedllm/run_fedllm.py | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/python/spotlight_prj/fedllm/run_fedllm.py b/python/spotlight_prj/fedllm/run_fedllm.py index 0b3836dde..10ceb7d8a 100644 --- a/python/spotlight_prj/fedllm/run_fedllm.py +++ b/python/spotlight_prj/fedllm/run_fedllm.py @@ -215,9 +215,24 @@ def save_checkpoint( f" \"{type(model_or_trainer)}\"." ) - # save model checkpoint + # Save model checkpoint if isinstance(model_or_trainer, HFTrainer): - model_or_trainer.save_checkpoint(checkpoint_dir) + # Hugging Face Trainer normally creates sharded checkpoints. To keep + # downstream logic simple we instead persist a **single** weight file + # for the underlying model, re-using the same helper that `Module` + # path employs. + + underlying_model = model_or_trainer.model + + if is_saving_process: + # Prefer caller-provided `state_dict` when given (e.g. aggregated + # weights from the server); otherwise pull fresh weights from the + # model. + _save_checkpoint( + underlying_model, + checkpoint_dir, + state_dict or underlying_model.state_dict() + ) elif isinstance(model_or_trainer, Module): if is_saving_process: From 4481c0dde502f3bcbb88ead7a274d97a1dcd7cfe Mon Sep 17 00:00:00 2001 From: Marcos Villagra Date: Mon, 28 Jul 2025 02:11:50 +0000 Subject: [PATCH 074/168] Enhance checkpoint saving logic in run_fedllm_custom.py - Updated the checkpoint saving process to always produce a single-file checkpoint, simplifying downstream loading logic. 
- Differentiated filename handling for PEFT models and standard models to ensure compatibility with Hugging Face's requirements. - Ensured checkpoint creation occurs even when the save strategy is set to "no" in the training configuration. --- python/spotlight_prj/fedllm/run_fedllm.py | 7 +++++- .../spotlight_prj/fedllm/run_fedllm_custom.py | 22 ++++++++++--------- 2 files changed, 18 insertions(+), 11 deletions(-) diff --git a/python/spotlight_prj/fedllm/run_fedllm.py b/python/spotlight_prj/fedllm/run_fedllm.py index 10ceb7d8a..c09268e2a 100644 --- a/python/spotlight_prj/fedllm/run_fedllm.py +++ b/python/spotlight_prj/fedllm/run_fedllm.py @@ -403,7 +403,12 @@ def on_after_local_training(self, train_data, device, args: Arguments) -> None: self.latest_checkpoint_dir = self.checkpoint_dir / f"round_{self.round_idx}_before_agg" self.log(f"saving model to \"{self.latest_checkpoint_dir}\"") - save_checkpoint(self.trainer, self.latest_checkpoint_dir) + # Force checkpoint creation even if TrainingArguments.save_strategy == "no" + save_checkpoint( + self.trainer, + self.latest_checkpoint_dir, + is_saving_process=True, + ) self.log("finished") return outputs diff --git a/python/spotlight_prj/fedllm/run_fedllm_custom.py b/python/spotlight_prj/fedllm/run_fedllm_custom.py index 1ea370696..825c10163 100644 --- a/python/spotlight_prj/fedllm/run_fedllm_custom.py +++ b/python/spotlight_prj/fedllm/run_fedllm_custom.py @@ -44,17 +44,19 @@ def _save_checkpoint( if state_dict is None: state_dict = model.state_dict() - if isinstance(model, (PeftModel, PreTrainedModel)): - # Force safe_serialization=False to get pytorch_model.bin instead of model.safetensors - model.save_pretrained( - save_directory=str(checkpoint_dir), - state_dict=state_dict, - safe_serialization=False # This ensures pytorch_model.bin is created - ) + # Always produce a single-file checkpoint so that downstream loading logic + # can simply look for ``adapter_model.bin`` (PEFT) or ``pytorch_model.bin`` + # without worrying about Hugging Face sharding. + + checkpoint_dir = Path(checkpoint_dir) + checkpoint_dir.mkdir(parents=True, exist_ok=True) + + if isinstance(model, PeftModel): + filename = "adapter_model.bin" else: - checkpoint_dir = Path(checkpoint_dir) - checkpoint_dir.mkdir(parents=True, exist_ok=True) - torch.save(state_dict, str(checkpoint_dir / HF_WEIGHTS_NAME)) + filename = HF_WEIGHTS_NAME # "pytorch_model.bin" + + torch.save(state_dict, str(checkpoint_dir / filename)) # Monkey patch the _save_checkpoint function in the imported module From 6b119548c30f3c5ac32b6d9895243430d0d619f2 Mon Sep 17 00:00:00 2001 From: Marcos Villagra Date: Mon, 28 Jul 2025 19:26:05 +0000 Subject: [PATCH 075/168] Update GRPO configuration in grpo_gsm8k_test_config.yaml for testing - Reduced grpo_max_steps from 50 to 2 to streamline testing. - Maintained other training parameters for consistency in the testing environment. 
--- .../fedllm/fedml_config/grpo_gsm8k_test_config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/spotlight_prj/fedllm/fedml_config/grpo_gsm8k_test_config.yaml b/python/spotlight_prj/fedllm/fedml_config/grpo_gsm8k_test_config.yaml index 0368a3d3e..3bc6a989a 100644 --- a/python/spotlight_prj/fedllm/fedml_config/grpo_gsm8k_test_config.yaml +++ b/python/spotlight_prj/fedllm/fedml_config/grpo_gsm8k_test_config.yaml @@ -30,7 +30,7 @@ train_args: client_num_per_round: 2 # Single client setup comm_round: 30 # Reduced to 3 rounds for testing # GRPO-specific settings for testing - grpo_max_steps: 50 + grpo_max_steps: 2 grpo_num_epochs: 1 # Ignored when grpo_max_steps > 0 grpo_batch_size: 2 # Smaller batch size for faster testing # FedML training settings (ignored when using GRPO) From c358d9227f7255d18449641ff35f8352f6f4862f Mon Sep 17 00:00:00 2001 From: Marcos Villagra Date: Mon, 28 Jul 2025 23:57:07 +0000 Subject: [PATCH 076/168] Ensure model parameters are on CPU and clear CUDA cache in FullModelLLMTrainer - Added a line to move model parameters to CPU for consistency. - Implemented a call to clear the CUDA cache to optimize memory usage during training. --- python/spotlight_prj/fedllm/custom_trainer.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/python/spotlight_prj/fedllm/custom_trainer.py b/python/spotlight_prj/fedllm/custom_trainer.py index 04db24a18..33b5a7b79 100644 --- a/python/spotlight_prj/fedllm/custom_trainer.py +++ b/python/spotlight_prj/fedllm/custom_trainer.py @@ -426,6 +426,8 @@ def sync_process_group( if round_idx is None: round_idx = self.round_idx + model_params = to_device(model_params, "cpu") # ensure params live on CPU + torch.cuda.empty_cache() broadcast_object_list([round_idx, model_params, client_index], from_process=from_process) self.log("finished") From 134196df33fab1c9f2e72ec18ef2f27a18f4bef2 Mon Sep 17 00:00:00 2001 From: Marcos Villagra Date: Tue, 29 Jul 2025 00:31:48 +0000 Subject: [PATCH 077/168] Enhance timing and logging in TimedGRPOTrainer and FullModelLLMTrainer - Added measurement of inter-step wall-clock time in TimedGRPOTrainer to capture the full duration of GRPO optimization steps. - Implemented logging of average completion time per generation and GRPO step time for better performance tracking. - Updated TrainingMetricsLogger to accumulate and log new metrics related to completion and step times for improved analysis. --- python/spotlight_prj/fedllm/custom_trainer.py | 134 +++++++++--------- 1 file changed, 68 insertions(+), 66 deletions(-) diff --git a/python/spotlight_prj/fedllm/custom_trainer.py b/python/spotlight_prj/fedllm/custom_trainer.py index 33b5a7b79..fa511109b 100644 --- a/python/spotlight_prj/fedllm/custom_trainer.py +++ b/python/spotlight_prj/fedllm/custom_trainer.py @@ -47,12 +47,28 @@ class TimedGRPOTrainer(GRPOTrainer): def _record_step_stats(self, stats): - # first let the parent push its metrics + # ------------------------------------------------------------- + # Measure *inter-step* wall-clock time: difference between the start + # of this stats call and the previous. This captures the full time + # spent in the GRPO optimisation step (generation + backward pass, + # etc.) rather than just the duration of this method. 
+ # ------------------------------------------------------------- + t_now = time.perf_counter() + step_elapsed = None + if hasattr(self, "_prev_step_t"): + step_elapsed = t_now - self._prev_step_t + self._prev_step_t = t_now # update for next call + + # Call parent implementation *after* timing start so that we include + # all work done before stats are returned. super()._record_step_stats(stats) - # add / overwrite any extra metrics and push once more + # ------------------------------------------------------------- + # Compute additional metrics + # ------------------------------------------------------------- stats["kl_divergence"] = stats["kl"].mean().item() - self.accelerator.log(stats, step=self.state.global_step) + if step_elapsed is not None: + stats["grpo_step_time"] = step_elapsed # seconds # NEW: forward stats to Trainer's logging system so that callbacks # like GRPOMetricsCallback can record them via the TrainingMetricsLogger. @@ -64,8 +80,17 @@ def _make_experience(self, *args, **kwargs): t0 = time.perf_counter() result = super()._make_experience(*args, **kwargs) - self.accelerator.log(f"roll-out batch {self.state.global_step} : " - f"{time.perf_counter() - t0:.3f}s") + # ------------------------------------------------------------------ + # Compute and log average completion time per generation + # ------------------------------------------------------------------ + elapsed = time.perf_counter() - t0 # total time for this roll-out batch + num_gens = max(1, getattr(self.args, "num_generations", 1)) + avg_completion_time = elapsed / num_gens + + # Log the metric so that it is captured by both Accelerate and + # the TrainingMetricsLogger (via GRPOMetricsCallback). + self.accelerator.log({"avg_completion_time": avg_completion_time}, step=self.state.global_step) + self.log({"avg_completion_time": avg_completion_time}) # `out["kl"]` is a 1-D tensor of per-token KL values kl_mean = result["kl"].mean().item() @@ -73,9 +98,9 @@ def _make_experience(self, *args, **kwargs): # push to the FedML / accelerate logger – it will end up in client?.log self.log({"kl_divergence": kl_mean}) - self.log( - f"roll-out batch {self.state.global_step} " - f"(elapsed {time.perf_counter() - t0:.3f}s, kl={kl_mean:.4f})" + # Human-readable string message (kept for completeness) + self.accelerator.log( + f"roll-out batch {self.state.global_step} : {elapsed:.3f}s" ) return result @@ -435,8 +460,22 @@ def sync_process_group( def await_sync_process_group(self, from_process: int = 0) -> list: self.log("start") + # ---------------------- Timing start ---------------------- + t0 = time.perf_counter() outputs = broadcast_object_list([None, None, None], from_process=from_process) + download_elapsed = time.perf_counter() - t0 + + # ---------------------- WandB log ------------------------ + if getattr(self, "logger", None) and self.logger.enable_wandb and self.logger.wandb_run: + # Step keyed by federated round so uploads and downloads align. + self.logger.wandb_run.log({ + "performance/model_download_time": download_elapsed + }, step=self.round_idx) + # Store for optional moving-average statistics. + self.logger.accumulated_metrics.setdefault("model_download_times", []).append(download_elapsed) + + self.log(f"model download took {download_elapsed:.3f}s") self.log("finished") return outputs @@ -664,63 +703,6 @@ def set_model_params(self, model_parameters) -> None: self.log("finished") - """ - def aggregate(self, raw_client_model_list): - - Aggregate client models with Nesterov momentum. - - Steps - ----- - 1. 
Compute the FedAvg-style weighted average of client models (same as the - default FedML behaviour). - 2. Treat the *difference* between the current global model and the - aggregated model as the (negative) gradient. - 3. Perform an SGD update with momentum on the server side. If - ``self._nesterov`` is ``True``, use the Nesterov variant. - 4. Save the updated parameters via ``set_model_params`` and return them. - - self.log("aggregate: start") - - # Step-1: FedAvg aggregation (reuse FedMLAggOperator) - aggregated_params: OrderedDict = FedMLAggOperator.agg(self.args, raw_client_model_list) - - # Step-2: Load current global params (on CPU) - global_params: OrderedDict = self.get_model_params() - - # Step-3: Momentum update - updated_params: OrderedDict = OrderedDict() - for name, global_tensor in global_params.items(): - # Non-floating tensors (e.g. buffers) are copied directly - if not torch.is_floating_point(global_tensor): - updated_params[name] = aggregated_params[name] - continue - - device = global_tensor.device # cuda:0 (or cpu) - agg_tensor = aggregated_params[name].to(device) - grad = global_tensor - agg_tensor - - # Initialise velocity buffer if first time - if name not in self._velocity: - self._velocity[name] = torch.zeros_like(grad) - - # Momentum accumulation - self._velocity[name] = self._momentum * self._velocity[name] + grad - - # Nesterov look-ahead - if self._nesterov: - update = self._momentum * self._velocity[name] + grad - else: - update = self._velocity[name] - - # Parameter update (SGD step) - updated_params[name] = global_tensor - self._server_lr * update - - # Step-4: Push new params to the model & return - self.set_model_params(updated_params) - self.log("aggregate: finished") - return updated_params - """ - class TrainingMetricsLogger: """Comprehensive logging for GRPO training with WandB support""" @@ -811,7 +793,9 @@ def __init__( 'policy_losses': [], 'value_losses': [], 'advantages': [], - 'rollout_lengths': [] + 'rollout_lengths': [], + 'completion_times': [], + 'step_times': [], } def log_training_step(self, step_id: str, train_result: dict, global_step: int): @@ -854,6 +838,11 @@ def log_training_step(self, step_id: str, train_result: dict, global_step: int): wandb_metrics['rollouts/avg_length'] = train_result['avg_rollout_length'] self.accumulated_metrics['rollout_lengths'].append(train_result['avg_rollout_length']) + # Average completion time (per generation) + if 'avg_completion_time' in train_result: + wandb_metrics['performance/avg_completion_time'] = train_result['avg_completion_time'] + self.accumulated_metrics['completion_times'].append(train_result['avg_completion_time']) + if 'rollout_time' in train_result: wandb_metrics['performance/rollout_time'] = train_result['rollout_time'] @@ -881,6 +870,11 @@ def log_training_step(self, step_id: str, train_result: dict, global_step: int): if 'learning_rate' in train_result: wandb_metrics['training/learning_rate'] = train_result['learning_rate'] + # GRPO step time + if 'grpo_step_time' in train_result: + wandb_metrics['performance/grpo_step_time'] = train_result['grpo_step_time'] + self.accumulated_metrics['step_times'].append(train_result['grpo_step_time']) + # Log to wandb if self.enable_wandb and self.wandb_run and wandb_metrics: # Replace the Trainer-provided ``global_step`` (which resets every @@ -1004,6 +998,14 @@ def log_moving_averages(self, global_step: int, window_size: int = 100): avg_length = self.get_moving_average(self.accumulated_metrics['rollout_lengths'], window_size) 
wandb_metrics[f'moving_avg/rollout_length_{window_size}'] = avg_length + if self.accumulated_metrics['completion_times']: + avg_ct = self.get_moving_average(self.accumulated_metrics['completion_times'], window_size) + wandb_metrics[f'moving_avg/completion_time_{window_size}'] = avg_ct + + if self.accumulated_metrics['step_times']: + avg_st = self.get_moving_average(self.accumulated_metrics['step_times'], window_size) + wandb_metrics[f'moving_avg/step_time_{window_size}'] = avg_st + # Log to wandb if self.enable_wandb and wandb_metrics: # Use our internal monotonically-increasing counter so that these From 56a701b3deec7cd6c63f55884951473a64121bf2 Mon Sep 17 00:00:00 2001 From: Marcos Villagra Date: Tue, 29 Jul 2025 00:46:31 +0000 Subject: [PATCH 078/168] Add evaluation script for Qwen3-0.6B on GSM8K and enhance logging in TimedGRPOTrainer - Introduced a new validation script to evaluate the Qwen3-0.6B model on the GSM8K test split, including command-line arguments for customization. - Enhanced logging in TimedGRPOTrainer to print average completion time for better performance insights. - Updated GRPO configuration to increase max steps from 2 to 50 for more comprehensive testing. --- python/spotlight_prj/fedllm/custom_trainer.py | 1 + .../fedllm/fedml_config/grpo_gsm8k_test_config.yaml | 2 +- python/spotlight_prj/fedllm/{validatation.py => validation.py} | 0 3 files changed, 2 insertions(+), 1 deletion(-) rename python/spotlight_prj/fedllm/{validatation.py => validation.py} (100%) diff --git a/python/spotlight_prj/fedllm/custom_trainer.py b/python/spotlight_prj/fedllm/custom_trainer.py index fa511109b..059be726f 100644 --- a/python/spotlight_prj/fedllm/custom_trainer.py +++ b/python/spotlight_prj/fedllm/custom_trainer.py @@ -91,6 +91,7 @@ def _make_experience(self, *args, **kwargs): # the TrainingMetricsLogger (via GRPOMetricsCallback). self.accelerator.log({"avg_completion_time": avg_completion_time}, step=self.state.global_step) self.log({"avg_completion_time": avg_completion_time}) + print(f"avg_completion_time: {avg_completion_time}") # `out["kl"]` is a 1-D tensor of per-token KL values kl_mean = result["kl"].mean().item() diff --git a/python/spotlight_prj/fedllm/fedml_config/grpo_gsm8k_test_config.yaml b/python/spotlight_prj/fedllm/fedml_config/grpo_gsm8k_test_config.yaml index 3bc6a989a..0368a3d3e 100644 --- a/python/spotlight_prj/fedllm/fedml_config/grpo_gsm8k_test_config.yaml +++ b/python/spotlight_prj/fedllm/fedml_config/grpo_gsm8k_test_config.yaml @@ -30,7 +30,7 @@ train_args: client_num_per_round: 2 # Single client setup comm_round: 30 # Reduced to 3 rounds for testing # GRPO-specific settings for testing - grpo_max_steps: 2 + grpo_max_steps: 50 grpo_num_epochs: 1 # Ignored when grpo_max_steps > 0 grpo_batch_size: 2 # Smaller batch size for faster testing # FedML training settings (ignored when using GRPO) diff --git a/python/spotlight_prj/fedllm/validatation.py b/python/spotlight_prj/fedllm/validation.py similarity index 100% rename from python/spotlight_prj/fedllm/validatation.py rename to python/spotlight_prj/fedllm/validation.py From a3a32409df5a5514599bcae3e67e48927eae5fe1 Mon Sep 17 00:00:00 2001 From: Marcos Villagra Date: Tue, 29 Jul 2025 01:15:15 +0000 Subject: [PATCH 079/168] Update GRPO configuration in grpo_gsm8k_test_config.yaml to switch fp16 and bf16 settings - Changed fp16 from True to False and bf16 from False to True for improved GPU compatibility. - Ensured consistency with GRPO configuration requirements. 
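The diff below only shows the rename of the validation script, not its body; the core of such an evaluation loop is roughly the following, with the usual GSM8K "#### <answer>" gold extraction and a \boxed{} prediction format assumed (prompt wording and exact-match scoring are illustrative, not taken from the script).

import re
import torch
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer

BOXED = re.compile(r"\\boxed\{([^}]*)\}")
GOLD = re.compile(r"####\s*([-+]?\d+\.?\d*)")

def evaluate(model_path, tokenizer_path="Qwen/Qwen3-0.6B", max_new_tokens=512, limit=100):
    tok = AutoTokenizer.from_pretrained(tokenizer_path, trust_remote_code=True)
    model = AutoModelForCausalLM.from_pretrained(
        model_path, torch_dtype=torch.bfloat16, trust_remote_code=True
    ).eval()
    ds = load_dataset("openai/gsm8k", "main", split="test").select(range(limit))

    correct = 0
    for ex in ds:
        prompt = ex["question"] + "\nPut the final answer in \\boxed{}."
        ids = tok(prompt, return_tensors="pt").to(model.device)
        out = model.generate(**ids, max_new_tokens=max_new_tokens, do_sample=False)
        completion = tok.decode(out[0], skip_special_tokens=True)
        pred = BOXED.search(completion)
        gold = GOLD.search(ex["answer"])
        if pred and gold and pred.group(1).strip() == gold.group(1).strip():
            correct += 1
    return correct / len(ds)
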
--- .../fedllm/fedml_config/grpo_gsm8k_test_config.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/spotlight_prj/fedllm/fedml_config/grpo_gsm8k_test_config.yaml b/python/spotlight_prj/fedllm/fedml_config/grpo_gsm8k_test_config.yaml index 0368a3d3e..3d1c8e2fa 100644 --- a/python/spotlight_prj/fedllm/fedml_config/grpo_gsm8k_test_config.yaml +++ b/python/spotlight_prj/fedllm/fedml_config/grpo_gsm8k_test_config.yaml @@ -40,8 +40,8 @@ train_args: deepspeed: null # Disable DeepSpeed for GRPO compatibility ddp_find_unused_parameters: False seed: 1234 - fp16: True # Use fp16 instead of bf16 for GPU compatibility - bf16: False + fp16: False # Use fp16 instead of bf16 for GPU compatibility + bf16: True gradient_checkpointing: False # Match GRPO config per_device_train_batch_size: 2 # Will be overridden by GRPO per_device_eval_batch_size: 8 From ba8ac29b534518b4d0f80d8f86e7d11194041d1c Mon Sep 17 00:00:00 2001 From: Marcos Villagra Date: Tue, 29 Jul 2025 11:13:34 +0000 Subject: [PATCH 080/168] Add checkpoint cleanup functionality in FullModelLLMAggregator - Implemented a method to prune older wallclock checkpoints, retaining only the most recent three to manage disk usage effectively. - Enhanced checkpoint validity checks to ensure only usable checkpoints are kept, improving overall system reliability during long-running training sessions. --- python/spotlight_prj/fedllm/custom_trainer.py | 50 +++++++++++++++++++ 1 file changed, 50 insertions(+) diff --git a/python/spotlight_prj/fedllm/custom_trainer.py b/python/spotlight_prj/fedllm/custom_trainer.py index 059be726f..137c56386 100644 --- a/python/spotlight_prj/fedllm/custom_trainer.py +++ b/python/spotlight_prj/fedllm/custom_trainer.py @@ -644,6 +644,11 @@ def _periodic_checkpoint_loop(self): is_saving_process=True, synchronize=False, ) + # ---------------- New behaviour ---------------- + # After successfully writing the checkpoint, prune older + # wallclock_* checkpoints so that only the latest three are + # kept on disk. + self._cleanup_old_wallclock_checkpoints() except Exception as e: # Log and continue – do not crash training due to checkpoint failure self.log(f"[WARN] Periodic checkpoint failed: {e}") @@ -704,6 +709,51 @@ def set_model_params(self, model_parameters) -> None: self.log("finished") + def _cleanup_old_wallclock_checkpoints(self, keep_last: int = 3): + """Delete old wallclock_* checkpoints but keep the most recent ``keep_last``. + + This complements the round-based checkpoint cleanup by pruning time-based + checkpoints created by the periodic background thread. The newest + ``keep_last`` checkpoints are retained; older ones are removed to avoid + unbounded disk usage on long-running servers. + """ + pattern = re.compile(r"wallclock_(\d+)$") + valid_ckpts = [] # (timestamp, Path) + invalid_ckpts = [] # Path(s) that lack model files + + # Determine candidate checkpoints and group by validity + for d in self.checkpoint_dir.iterdir(): + m = pattern.fullmatch(d.name) + if not m: + continue # skip non-wallclock dirs + + # Heuristic: consider checkpoint *valid* if it contains at least one + # model weight file produced by ``save_pretrained`` or our fallback + # helper (i.e. *.bin or *.safetensors). This covers both HF and PEFT. 
+ has_model_file = any(d.glob("*.bin")) or any(d.glob("*.safetensors")) or any(d.glob("*.pt")) + + if has_model_file: + valid_ckpts.append((int(m.group(1)), d)) + else: + invalid_ckpts.append(d) + + # Remove *all* invalid checkpoints immediately as they are unusable + for d in invalid_ckpts: + try: + shutil.rmtree(d, ignore_errors=True) + except Exception as e: + self.log(f"[WARN] Failed to delete incomplete wallclock checkpoint {d}: {e}") + + # Sort valid checkpoints chronologically (oldest first) + valid_ckpts.sort(key=lambda x: x[0]) + + # Keep only the most recent ``keep_last`` valid checkpoints + for _, d in valid_ckpts[:-keep_last]: + try: + shutil.rmtree(d, ignore_errors=True) + except Exception as e: + self.log(f"[WARN] Failed to delete old wallclock checkpoint {d}: {e}") + class TrainingMetricsLogger: """Comprehensive logging for GRPO training with WandB support""" From c2311f6c5217af0947f729825718617e7960388c Mon Sep 17 00:00:00 2001 From: Marcos Villagra Date: Tue, 29 Jul 2025 11:28:42 +0000 Subject: [PATCH 081/168] Refactor average completion time logging in TimedGRPOTrainer - Updated the calculation and logging of average completion time to use a class attribute for consistency. - Enhanced the TrainingMetricsLogger to accumulate average completion times correctly, improving performance tracking. --- python/spotlight_prj/fedllm/custom_trainer.py | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/python/spotlight_prj/fedllm/custom_trainer.py b/python/spotlight_prj/fedllm/custom_trainer.py index 137c56386..21020a0f5 100644 --- a/python/spotlight_prj/fedllm/custom_trainer.py +++ b/python/spotlight_prj/fedllm/custom_trainer.py @@ -85,13 +85,12 @@ def _make_experience(self, *args, **kwargs): # ------------------------------------------------------------------ elapsed = time.perf_counter() - t0 # total time for this roll-out batch num_gens = max(1, getattr(self.args, "num_generations", 1)) - avg_completion_time = elapsed / num_gens + self.avg_completion_time = elapsed / num_gens # Log the metric so that it is captured by both Accelerate and # the TrainingMetricsLogger (via GRPOMetricsCallback). 
- self.accelerator.log({"avg_completion_time": avg_completion_time}, step=self.state.global_step) - self.log({"avg_completion_time": avg_completion_time}) - print(f"avg_completion_time: {avg_completion_time}") + self.accelerator.log({"avg_completion_time": self.avg_completion_time}, step=self.state.global_step) + self.log({"avg_completion_time": self.avg_completion_time}) # `out["kl"]` is a 1-D tensor of per-token KL values kl_mean = result["kl"].mean().item() @@ -890,9 +889,9 @@ def log_training_step(self, step_id: str, train_result: dict, global_step: int): self.accumulated_metrics['rollout_lengths'].append(train_result['avg_rollout_length']) # Average completion time (per generation) - if 'avg_completion_time' in train_result: - wandb_metrics['performance/avg_completion_time'] = train_result['avg_completion_time'] - self.accumulated_metrics['completion_times'].append(train_result['avg_completion_time']) + if self.avg_completion_time is not None: + wandb_metrics['performance/avg_completion_time'] = self.avg_completion_time + self.accumulated_metrics['avg_completion_times'].append(self.avg_completion_time) if 'rollout_time' in train_result: wandb_metrics['performance/rollout_time'] = train_result['rollout_time'] From 675dae59d5fd246862c9cf216a201c8227344838 Mon Sep 17 00:00:00 2001 From: Marcos Villagra Date: Tue, 29 Jul 2025 11:37:51 +0000 Subject: [PATCH 082/168] Comment out the 'optim' parameter in FullModelLLMTrainer's GRPO configuration for potential optimization adjustments. --- python/spotlight_prj/fedllm/custom_trainer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/spotlight_prj/fedllm/custom_trainer.py b/python/spotlight_prj/fedllm/custom_trainer.py index 21020a0f5..aa55e8c7f 100644 --- a/python/spotlight_prj/fedllm/custom_trainer.py +++ b/python/spotlight_prj/fedllm/custom_trainer.py @@ -330,7 +330,7 @@ def train(self, train_data, device, args): repetition_penalty=1.1, epsilon=0.2, beta=0.1, - optim="adamw_bnb_8bit", + #optim="adamw_bnb_8bit", ) self.log(f"GRPO Config - bf16: {use_bf16}, fp16: {not use_bf16}, batch_size: {grpo_batch_size}") From 078b48b2a39f798c44bf0d23674b5c8394a97993 Mon Sep 17 00:00:00 2001 From: Marcos Villagra Date: Tue, 29 Jul 2025 12:31:40 +0000 Subject: [PATCH 083/168] Enhance average completion time tracking in TrainingMetricsLogger - Added a new attribute to store the most recent average completion time reported by the trainer, initialized to avoid AttributeError. - Updated the logic to cache the average completion time from training results and adjusted the accumulation of completion times for improved performance metrics. --- python/spotlight_prj/fedllm/custom_trainer.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/python/spotlight_prj/fedllm/custom_trainer.py b/python/spotlight_prj/fedllm/custom_trainer.py index aa55e8c7f..bd012745d 100644 --- a/python/spotlight_prj/fedllm/custom_trainer.py +++ b/python/spotlight_prj/fedllm/custom_trainer.py @@ -834,6 +834,10 @@ def __init__( self.step_count = 0 self.training_start_time = time.time() self.last_log_time = time.time() + # Stores the most recent average completion time reported by the trainer. + # Initialised here so that attribute always exists and we avoid AttributeError + # if the metric is accessed before the first value is logged. 
+ self.avg_completion_time: Optional[float] = None # Accumulated metrics for averaging self.accumulated_metrics = { @@ -889,9 +893,13 @@ def log_training_step(self, step_id: str, train_result: dict, global_step: int): self.accumulated_metrics['rollout_lengths'].append(train_result['avg_rollout_length']) # Average completion time (per generation) + # Update the cached value if the trainer provided a fresh measurement. + if 'avg_completion_time' in train_result: + self.avg_completion_time = train_result['avg_completion_time'] + if self.avg_completion_time is not None: wandb_metrics['performance/avg_completion_time'] = self.avg_completion_time - self.accumulated_metrics['avg_completion_times'].append(self.avg_completion_time) + self.accumulated_metrics['completion_times'].append(self.avg_completion_time) if 'rollout_time' in train_result: wandb_metrics['performance/rollout_time'] = train_result['rollout_time'] From bced90b122aed852cc669b147a33f95a6827d7c3 Mon Sep 17 00:00:00 2001 From: Marcos Villagra Date: Tue, 29 Jul 2025 18:37:52 +0000 Subject: [PATCH 084/168] Update model configuration and enhance logging in TrainingMetricsLogger - Changed model_name_or_path from "Qwen/Qwen3-1.7B" to "Qwen/Qwen3-0.6B" in multiple files for consistency. - Increased grpo_batch_size from 2 to 4 and gradient_accumulation_steps from 1 to 2 in the GRPO configuration for improved training efficiency. - Added a print statement in TrainingMetricsLogger to log the average completion time for better performance tracking. --- python/spotlight_prj/fedllm/custom_trainer.py | 2 ++ .../fedllm/fedml_config/grpo_gsm8k_test_config.yaml | 6 +++--- .../spotlight_prj/fedllm/scripts/save_initial_checkpoint.py | 2 +- 3 files changed, 6 insertions(+), 4 deletions(-) diff --git a/python/spotlight_prj/fedllm/custom_trainer.py b/python/spotlight_prj/fedllm/custom_trainer.py index bd012745d..e98c8bf70 100644 --- a/python/spotlight_prj/fedllm/custom_trainer.py +++ b/python/spotlight_prj/fedllm/custom_trainer.py @@ -900,6 +900,8 @@ def log_training_step(self, step_id: str, train_result: dict, global_step: int): if self.avg_completion_time is not None: wandb_metrics['performance/avg_completion_time'] = self.avg_completion_time self.accumulated_metrics['completion_times'].append(self.avg_completion_time) + + print(f"avg_completion_time: {self.avg_completion_time}") if 'rollout_time' in train_result: wandb_metrics['performance/rollout_time'] = train_result['rollout_time'] diff --git a/python/spotlight_prj/fedllm/fedml_config/grpo_gsm8k_test_config.yaml b/python/spotlight_prj/fedllm/fedml_config/grpo_gsm8k_test_config.yaml index 3d1c8e2fa..2d12f01c7 100644 --- a/python/spotlight_prj/fedllm/fedml_config/grpo_gsm8k_test_config.yaml +++ b/python/spotlight_prj/fedllm/fedml_config/grpo_gsm8k_test_config.yaml @@ -18,7 +18,7 @@ data_args: model_args: skip_log_model_net: True - model_name_or_path: "Qwen/Qwen3-1.7B" + model_name_or_path: "Qwen/Qwen3-0.6B" peft_type: "none" # Full model fine-tuning use_flash_attention: False @@ -32,7 +32,7 @@ train_args: # GRPO-specific settings for testing grpo_max_steps: 50 grpo_num_epochs: 1 # Ignored when grpo_max_steps > 0 - grpo_batch_size: 2 # Smaller batch size for faster testing + grpo_batch_size: 4 # Smaller batch size for faster testing # FedML training settings (ignored when using GRPO) local_num_train_epochs: 1 local_max_steps: -1 @@ -45,7 +45,7 @@ train_args: gradient_checkpointing: False # Match GRPO config per_device_train_batch_size: 2 # Will be overridden by GRPO per_device_eval_batch_size: 8 - 
gradient_accumulation_steps: 1 # Will be overridden by GRPO + gradient_accumulation_steps: 2 # Will be overridden by GRPO eval_accumulation_steps: 4 learning_rate: 5e-6 # Will be overridden by GRPO warmup_steps: 0 diff --git a/python/spotlight_prj/fedllm/scripts/save_initial_checkpoint.py b/python/spotlight_prj/fedllm/scripts/save_initial_checkpoint.py index 2b00909dd..be46d17f4 100644 --- a/python/spotlight_prj/fedllm/scripts/save_initial_checkpoint.py +++ b/python/spotlight_prj/fedllm/scripts/save_initial_checkpoint.py @@ -9,7 +9,7 @@ # Configuration RUN_ID = os.environ.get("RUN_ID", "test_run") -MODEL_NAME = "Qwen/Qwen3-1.7B" +MODEL_NAME = "Qwen/Qwen3-0.6B" OUTPUT_DIR = f"/workspace/FedML/python/spotlight_prj/fedllm/.logs/FedML/{RUN_ID}/node_0/init" print(f"Saving initial checkpoint for model: {MODEL_NAME}") From e19acfd6004b2bf2c916797afd348752ee19d679 Mon Sep 17 00:00:00 2001 From: Marcos Villagra Date: Tue, 29 Jul 2025 20:06:31 +0000 Subject: [PATCH 085/168] Update wallclock checkpoint retention policy in FullModelLLMAggregator - Increased the number of retained wallclock checkpoints from three to six to enhance checkpoint management and ensure more historical data is available for analysis. - Adjusted the cleanup method to reflect the new retention policy, improving disk usage efficiency during long-running training sessions. --- python/spotlight_prj/fedllm/custom_trainer.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/spotlight_prj/fedllm/custom_trainer.py b/python/spotlight_prj/fedllm/custom_trainer.py index e98c8bf70..431eb7a6c 100644 --- a/python/spotlight_prj/fedllm/custom_trainer.py +++ b/python/spotlight_prj/fedllm/custom_trainer.py @@ -645,7 +645,7 @@ def _periodic_checkpoint_loop(self): ) # ---------------- New behaviour ---------------- # After successfully writing the checkpoint, prune older - # wallclock_* checkpoints so that only the latest three are + # wallclock_* checkpoints so that only the latest six are # kept on disk. self._cleanup_old_wallclock_checkpoints() except Exception as e: @@ -708,7 +708,7 @@ def set_model_params(self, model_parameters) -> None: self.log("finished") - def _cleanup_old_wallclock_checkpoints(self, keep_last: int = 3): + def _cleanup_old_wallclock_checkpoints(self, keep_last: int = 6): """Delete old wallclock_* checkpoints but keep the most recent ``keep_last``. This complements the round-based checkpoint cleanup by pruning time-based From e8553c9654e79ddc40a228b5456d7ee7709d0e1a Mon Sep 17 00:00:00 2001 From: Marcos Villagra Date: Tue, 29 Jul 2025 20:09:42 +0000 Subject: [PATCH 086/168] Enhance logging of average completion time in TimedGRPOTrainer - Added a print statement to log the average completion time after calculation for better visibility during training. - Removed redundant print statement from TrainingMetricsLogger to streamline output and avoid duplication. 
--- python/spotlight_prj/fedllm/custom_trainer.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/spotlight_prj/fedllm/custom_trainer.py b/python/spotlight_prj/fedllm/custom_trainer.py index 431eb7a6c..0541e9efe 100644 --- a/python/spotlight_prj/fedllm/custom_trainer.py +++ b/python/spotlight_prj/fedllm/custom_trainer.py @@ -87,6 +87,8 @@ def _make_experience(self, *args, **kwargs): num_gens = max(1, getattr(self.args, "num_generations", 1)) self.avg_completion_time = elapsed / num_gens + print(f"avg_completion_time: {self.avg_completion_time}") + # Log the metric so that it is captured by both Accelerate and # the TrainingMetricsLogger (via GRPOMetricsCallback). self.accelerator.log({"avg_completion_time": self.avg_completion_time}, step=self.state.global_step) @@ -900,8 +902,6 @@ def log_training_step(self, step_id: str, train_result: dict, global_step: int): if self.avg_completion_time is not None: wandb_metrics['performance/avg_completion_time'] = self.avg_completion_time self.accumulated_metrics['completion_times'].append(self.avg_completion_time) - - print(f"avg_completion_time: {self.avg_completion_time}") if 'rollout_time' in train_result: wandb_metrics['performance/rollout_time'] = train_result['rollout_time'] From 547cb65789bf777c65d4b251cfe83c07f37af9c1 Mon Sep 17 00:00:00 2001 From: Marcos Villagra Date: Tue, 29 Jul 2025 20:13:21 +0000 Subject: [PATCH 087/168] Improve formatting of average completion time log in TimedGRPOTrainer - Adjusted the print statement for average completion time to include a newline for better readability in the output during training sessions. --- python/spotlight_prj/fedllm/custom_trainer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/spotlight_prj/fedllm/custom_trainer.py b/python/spotlight_prj/fedllm/custom_trainer.py index 0541e9efe..729a17fd3 100644 --- a/python/spotlight_prj/fedllm/custom_trainer.py +++ b/python/spotlight_prj/fedllm/custom_trainer.py @@ -87,7 +87,7 @@ def _make_experience(self, *args, **kwargs): num_gens = max(1, getattr(self.args, "num_generations", 1)) self.avg_completion_time = elapsed / num_gens - print(f"avg_completion_time: {self.avg_completion_time}") + print(f"\navg_completion_time: {self.avg_completion_time}") # Log the metric so that it is captured by both Accelerate and # the TrainingMetricsLogger (via GRPOMetricsCallback). From ba40e91ab27500a088f4c5c0f3628d65da3b271f Mon Sep 17 00:00:00 2001 From: Marcos Villagra Date: Tue, 29 Jul 2025 22:27:23 +0000 Subject: [PATCH 088/168] Update GRPO configuration and model training parameters - Increased num_generations from 2 to 4 in FullModelLLMTrainer for enhanced training output. - Adjusted grpo_batch_size from 4 to 2 in grpo_gsm8k_test_config.yaml for optimized testing performance. 
--- python/spotlight_prj/fedllm/custom_trainer.py | 2 ++ .../fedllm/fedml_config/grpo_gsm8k_test_config.yaml | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/python/spotlight_prj/fedllm/custom_trainer.py b/python/spotlight_prj/fedllm/custom_trainer.py index 729a17fd3..6d17f6a0c 100644 --- a/python/spotlight_prj/fedllm/custom_trainer.py +++ b/python/spotlight_prj/fedllm/custom_trainer.py @@ -222,6 +222,8 @@ def train(self, train_data, device, args): else: num_generations = 2 + num_generations = 4 + # For testing, we can use a very small number of steps if grpo_max_steps > 0: self.log(f"GRPO training for {grpo_max_steps} steps (test mode)") diff --git a/python/spotlight_prj/fedllm/fedml_config/grpo_gsm8k_test_config.yaml b/python/spotlight_prj/fedllm/fedml_config/grpo_gsm8k_test_config.yaml index 2d12f01c7..1cf3b7de1 100644 --- a/python/spotlight_prj/fedllm/fedml_config/grpo_gsm8k_test_config.yaml +++ b/python/spotlight_prj/fedllm/fedml_config/grpo_gsm8k_test_config.yaml @@ -32,7 +32,7 @@ train_args: # GRPO-specific settings for testing grpo_max_steps: 50 grpo_num_epochs: 1 # Ignored when grpo_max_steps > 0 - grpo_batch_size: 4 # Smaller batch size for faster testing + grpo_batch_size: 2 # Smaller batch size for faster testing # FedML training settings (ignored when using GRPO) local_num_train_epochs: 1 local_max_steps: -1 From 6c8c7ec5a6e65ca624ce1587d19433f3406c6a73 Mon Sep 17 00:00:00 2001 From: Marcos Villagra Date: Tue, 29 Jul 2025 23:09:27 +0000 Subject: [PATCH 089/168] Refactor experience generation method in TimedGRPOTrainer - Renamed `_make_experience` to `_generate_and_score_completions` to align with upstream GRPOTrainer implementation, ensuring the method is invoked during training. - Updated the method to measure generation latency per roll-out batch and log average completion time, enhancing performance tracking. --- python/spotlight_prj/fedllm/custom_trainer.py | 22 +++++++------------ 1 file changed, 8 insertions(+), 14 deletions(-) diff --git a/python/spotlight_prj/fedllm/custom_trainer.py b/python/spotlight_prj/fedllm/custom_trainer.py index 6d17f6a0c..9822b9899 100644 --- a/python/spotlight_prj/fedllm/custom_trainer.py +++ b/python/spotlight_prj/fedllm/custom_trainer.py @@ -76,10 +76,16 @@ def _record_step_stats(self, stats): # captured by the custom logger. self.log(stats) - def _make_experience(self, *args, **kwargs): + # Override GRPOTrainer internals to measure generation latency per roll-out batch + # NOTE: Upstream `GRPOTrainer` uses `_generate_and_score_completions` (not + # `_make_experience`). The original override therefore never executed. + # We rename the method accordingly so that it is invoked during training. + + def _generate_and_score_completions(self, *args, **kwargs): t0 = time.perf_counter() - result = super()._make_experience(*args, **kwargs) + # Call upstream implementation + result = super()._generate_and_score_completions(*args, **kwargs) # ------------------------------------------------------------------ # Compute and log average completion time per generation # ------------------------------------------------------------------ @@ -87,23 +93,11 @@ def _make_experience(self, *args, **kwargs): num_gens = max(1, getattr(self.args, "num_generations", 1)) self.avg_completion_time = elapsed / num_gens - print(f"\navg_completion_time: {self.avg_completion_time}") - # Log the metric so that it is captured by both Accelerate and # the TrainingMetricsLogger (via GRPOMetricsCallback). 
self.accelerator.log({"avg_completion_time": self.avg_completion_time}, step=self.state.global_step) self.log({"avg_completion_time": self.avg_completion_time}) - - # `out["kl"]` is a 1-D tensor of per-token KL values - kl_mean = result["kl"].mean().item() - # push to the FedML / accelerate logger – it will end up in client?.log - self.log({"kl_divergence": kl_mean}) - - # Human-readable string message (kept for completeness) - self.accelerator.log( - f"roll-out batch {self.state.global_step} : {elapsed:.3f}s" - ) return result From edf407dd8e7607d0282bbb84f9b17830c7ca94c8 Mon Sep 17 00:00:00 2001 From: Marcos Villagra Date: Tue, 29 Jul 2025 23:38:51 +0000 Subject: [PATCH 090/168] Update model parameters and configurations for training optimization - Reduced num_generations from 4 to 2 in FullModelLLMTrainer to streamline training output. - Updated model_name_or_path from "Qwen/Qwen3-0.6B" to "Qwen/Qwen3-1.7B" in multiple configuration files for consistency and improved model performance. - Adjusted gradient_accumulation_steps from 2 to 1 in grpo_gsm8k_test_config.yaml to enhance training efficiency. --- python/spotlight_prj/fedllm/custom_trainer.py | 4 ++-- .../fedllm/fedml_config/grpo_gsm8k_test_config.yaml | 4 ++-- .../spotlight_prj/fedllm/scripts/save_initial_checkpoint.py | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/python/spotlight_prj/fedllm/custom_trainer.py b/python/spotlight_prj/fedllm/custom_trainer.py index 9822b9899..7df43e197 100644 --- a/python/spotlight_prj/fedllm/custom_trainer.py +++ b/python/spotlight_prj/fedllm/custom_trainer.py @@ -216,7 +216,7 @@ def train(self, train_data, device, args): else: num_generations = 2 - num_generations = 4 + num_generations = 2 # For testing, we can use a very small number of steps if grpo_max_steps > 0: @@ -328,7 +328,7 @@ def train(self, train_data, device, args): repetition_penalty=1.1, epsilon=0.2, beta=0.1, - #optim="adamw_bnb_8bit", + optim="adamw_bnb_8bit", ) self.log(f"GRPO Config - bf16: {use_bf16}, fp16: {not use_bf16}, batch_size: {grpo_batch_size}") diff --git a/python/spotlight_prj/fedllm/fedml_config/grpo_gsm8k_test_config.yaml b/python/spotlight_prj/fedllm/fedml_config/grpo_gsm8k_test_config.yaml index 1cf3b7de1..3d1c8e2fa 100644 --- a/python/spotlight_prj/fedllm/fedml_config/grpo_gsm8k_test_config.yaml +++ b/python/spotlight_prj/fedllm/fedml_config/grpo_gsm8k_test_config.yaml @@ -18,7 +18,7 @@ data_args: model_args: skip_log_model_net: True - model_name_or_path: "Qwen/Qwen3-0.6B" + model_name_or_path: "Qwen/Qwen3-1.7B" peft_type: "none" # Full model fine-tuning use_flash_attention: False @@ -45,7 +45,7 @@ train_args: gradient_checkpointing: False # Match GRPO config per_device_train_batch_size: 2 # Will be overridden by GRPO per_device_eval_batch_size: 8 - gradient_accumulation_steps: 2 # Will be overridden by GRPO + gradient_accumulation_steps: 1 # Will be overridden by GRPO eval_accumulation_steps: 4 learning_rate: 5e-6 # Will be overridden by GRPO warmup_steps: 0 diff --git a/python/spotlight_prj/fedllm/scripts/save_initial_checkpoint.py b/python/spotlight_prj/fedllm/scripts/save_initial_checkpoint.py index be46d17f4..2b00909dd 100644 --- a/python/spotlight_prj/fedllm/scripts/save_initial_checkpoint.py +++ b/python/spotlight_prj/fedllm/scripts/save_initial_checkpoint.py @@ -9,7 +9,7 @@ # Configuration RUN_ID = os.environ.get("RUN_ID", "test_run") -MODEL_NAME = "Qwen/Qwen3-0.6B" +MODEL_NAME = "Qwen/Qwen3-1.7B" OUTPUT_DIR = 
f"/workspace/FedML/python/spotlight_prj/fedllm/.logs/FedML/{RUN_ID}/node_0/init" print(f"Saving initial checkpoint for model: {MODEL_NAME}") From 2d8c1ab8a6333bdd59f92792fdb12958d7f8b713 Mon Sep 17 00:00:00 2001 From: Marcos Villagra Date: Wed, 30 Jul 2025 00:49:05 +0000 Subject: [PATCH 091/168] Update model configurations and training parameters for improved performance - Increased num_generations from 2 to 4 in FullModelLLMTrainer to enhance training output. - Updated model_name_or_path from "Qwen/Qwen3-1.7B" to "Qwen/Qwen3-0.6B" in configuration files for consistency. - Adjusted grpo_batch_size from 2 to 4 in grpo_gsm8k_test_config.yaml for optimized testing performance. --- python/spotlight_prj/fedllm/custom_trainer.py | 2 +- .../fedllm/fedml_config/grpo_gsm8k_test_config.yaml | 4 ++-- .../spotlight_prj/fedllm/scripts/save_initial_checkpoint.py | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/python/spotlight_prj/fedllm/custom_trainer.py b/python/spotlight_prj/fedllm/custom_trainer.py index 7df43e197..7b71b8f46 100644 --- a/python/spotlight_prj/fedllm/custom_trainer.py +++ b/python/spotlight_prj/fedllm/custom_trainer.py @@ -216,7 +216,7 @@ def train(self, train_data, device, args): else: num_generations = 2 - num_generations = 2 + num_generations = 4 # For testing, we can use a very small number of steps if grpo_max_steps > 0: diff --git a/python/spotlight_prj/fedllm/fedml_config/grpo_gsm8k_test_config.yaml b/python/spotlight_prj/fedllm/fedml_config/grpo_gsm8k_test_config.yaml index 3d1c8e2fa..df34b942e 100644 --- a/python/spotlight_prj/fedllm/fedml_config/grpo_gsm8k_test_config.yaml +++ b/python/spotlight_prj/fedllm/fedml_config/grpo_gsm8k_test_config.yaml @@ -18,7 +18,7 @@ data_args: model_args: skip_log_model_net: True - model_name_or_path: "Qwen/Qwen3-1.7B" + model_name_or_path: "Qwen/Qwen3-0.6B" peft_type: "none" # Full model fine-tuning use_flash_attention: False @@ -32,7 +32,7 @@ train_args: # GRPO-specific settings for testing grpo_max_steps: 50 grpo_num_epochs: 1 # Ignored when grpo_max_steps > 0 - grpo_batch_size: 2 # Smaller batch size for faster testing + grpo_batch_size: 4 # Smaller batch size for faster testing # FedML training settings (ignored when using GRPO) local_num_train_epochs: 1 local_max_steps: -1 diff --git a/python/spotlight_prj/fedllm/scripts/save_initial_checkpoint.py b/python/spotlight_prj/fedllm/scripts/save_initial_checkpoint.py index 2b00909dd..be46d17f4 100644 --- a/python/spotlight_prj/fedllm/scripts/save_initial_checkpoint.py +++ b/python/spotlight_prj/fedllm/scripts/save_initial_checkpoint.py @@ -9,7 +9,7 @@ # Configuration RUN_ID = os.environ.get("RUN_ID", "test_run") -MODEL_NAME = "Qwen/Qwen3-1.7B" +MODEL_NAME = "Qwen/Qwen3-0.6B" OUTPUT_DIR = f"/workspace/FedML/python/spotlight_prj/fedllm/.logs/FedML/{RUN_ID}/node_0/init" print(f"Saving initial checkpoint for model: {MODEL_NAME}") From b3e564c279ed1fab0aed6b84528d7824866e8b21 Mon Sep 17 00:00:00 2001 From: Marcos Villagra Date: Wed, 30 Jul 2025 09:28:10 +0000 Subject: [PATCH 092/168] Adjust max completion length and new token limit in FullModelLLMTrainer for optimized training - Reduced max_completion_length from 512 to 256 to improve training efficiency. - Decreased max_new_tokens from 512 to 256 to align with updated training parameters. 
--- python/spotlight_prj/fedllm/custom_trainer.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/spotlight_prj/fedllm/custom_trainer.py b/python/spotlight_prj/fedllm/custom_trainer.py index 7b71b8f46..69b9f21a4 100644 --- a/python/spotlight_prj/fedllm/custom_trainer.py +++ b/python/spotlight_prj/fedllm/custom_trainer.py @@ -306,7 +306,7 @@ def train(self, train_data, device, args): output_dir=str(self.checkpoint_dir / "grpo"), per_device_train_batch_size=grpo_batch_size, gradient_accumulation_steps=gradient_accumulation_steps, - max_completion_length=512, + max_completion_length=256, num_generations=num_generations, # Adjusted based on effective batch size num_train_epochs=grpo_num_epochs if grpo_max_steps <= 0 else 1, # Use 1 epoch if max_steps is set max_steps=grpo_max_steps if grpo_max_steps > 0 else -1, # Override epochs with max_steps @@ -349,7 +349,7 @@ def train(self, train_data, device, args): "pad_token_id": fresh_tokenizer.eos_token_id, "eos_token_id": fresh_tokenizer.eos_token_id, "bos_token_id": fresh_tokenizer.bos_token_id, - "max_new_tokens": 512, + "max_new_tokens": 256, "length_penalty": 1.0, # Neutral length penalty } From eb2d32a60220538b6a995e3598f4ea680cff97df Mon Sep 17 00:00:00 2001 From: Marcos Villagra Date: Wed, 30 Jul 2025 14:00:38 +0000 Subject: [PATCH 093/168] Enhance validation and memory management in training process - Added subprocess call to launch validation after saving checkpoints in FullModelLLMAggregator, improving model evaluation workflow. - Adjusted max-tokens parameter in validation.py from 1024 to 512 for better resource management. - Implemented logging of average reward to a file in validation.py to facilitate performance tracking. - Introduced periodic CUDA cache clearing in TimedGRPOTrainer to optimize memory usage during training. --- python/spotlight_prj/fedllm/custom_trainer.py | 19 ++++++++++++++++++- python/spotlight_prj/fedllm/validation.py | 6 +++++- 2 files changed, 23 insertions(+), 2 deletions(-) diff --git a/python/spotlight_prj/fedllm/custom_trainer.py b/python/spotlight_prj/fedllm/custom_trainer.py index 69b9f21a4..d4590af96 100644 --- a/python/spotlight_prj/fedllm/custom_trainer.py +++ b/python/spotlight_prj/fedllm/custom_trainer.py @@ -31,6 +31,7 @@ from src.modeling_utils import load_state_dict import time, logging import threading +import subprocess # for launching validation after checkpoints import shutil # for deleting old checkpoints from fractions import Fraction @@ -98,6 +99,9 @@ def _generate_and_score_completions(self, *args, **kwargs): self.accelerator.log({"avg_completion_time": self.avg_completion_time}, step=self.state.global_step) self.log({"avg_completion_time": self.avg_completion_time}) + if self.state.global_step % 10 == 0: + torch.cuda.empty_cache() + return result @@ -391,7 +395,7 @@ def train(self, train_data, device, args): # Clean up fresh model to free memory del fresh_model del fresh_tokenizer - torch.cuda.empty_cache() if torch.cuda.is_available() else None + torch.cuda.empty_cache() self.log("GRPO training finished") @@ -646,6 +650,19 @@ def _periodic_checkpoint_loop(self): # wallclock_* checkpoints so that only the latest six are # kept on disk. 
self._cleanup_old_wallclock_checkpoints() + # Run validation on the newly saved checkpoint + try: + script_path = Path(__file__).parent / "validation.py" + log_path = Path(self.args.output_dir) / "validation.log" + with open(log_path, "a") as lf: + subprocess.Popen( + [sys.executable, str(script_path), "--model", str(ckpt_dir)], + stdout=lf, + stderr=subprocess.STDOUT, + close_fds=True, + ) + except Exception as e: + self.log(f"[WARN] Failed to launch validation: {e}") except Exception as e: # Log and continue – do not crash training due to checkpoint failure self.log(f"[WARN] Periodic checkpoint failed: {e}") diff --git a/python/spotlight_prj/fedllm/validation.py b/python/spotlight_prj/fedllm/validation.py index 5dfce0e9d..5ffa8347c 100644 --- a/python/spotlight_prj/fedllm/validation.py +++ b/python/spotlight_prj/fedllm/validation.py @@ -100,7 +100,7 @@ def parse_args() -> argparse.Namespace: default="Qwen/Qwen3-0.6B", help=("Tokenizer repo / path (default: Qwen/Qwen3-0.6B). " "Override if you need a different tokenizer.")) - p.add_argument("--max-tokens", type=int, default=1024, + p.add_argument("--max-tokens", type=int, default=512, help="generation length cap (tokens)") p.add_argument("--temperature", type=float, default=0.7) p.add_argument("--top-p", type=float, default=0.95) @@ -172,6 +172,10 @@ def main() -> None: f"(batch size = {args.batch_examples}).") print(f"Average reward: {avg_reward:.4f}") + # Save average reward to file + with open("avg_reward.txt", "a") as f: + f.write(f"{avg_reward:.4f}\n") + if __name__ == "__main__": main() \ No newline at end of file From 415cbcb6bd60c2921602c33f718ad4052b71808a Mon Sep 17 00:00:00 2001 From: Marcos Villagra Date: Wed, 30 Jul 2025 14:51:32 +0000 Subject: [PATCH 094/168] Update model configurations and training parameters for consistency and optimization - Changed model_name_or_path from "Qwen/Qwen3-0.6B" to "Qwen/Qwen3-1.7B-FP8" in configuration files for improved model performance. - Reduced num_generations from 4 to 2 in FullModelLLMTrainer to streamline training output. - Adjusted grpo_batch_size from 4 to 1 and gradient_accumulation_steps from 1 to 2 in grpo_gsm8k_test_config.yaml for optimized testing performance. 
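Background on why these knobs move together: in the TRL releases used here, GRPOTrainer expects the effective generation batch to be evenly divisible by `num_generations` (the exact formula differs between TRL versions), which is why `grpo_batch_size`, `gradient_accumulation_steps` and `num_generations` keep being adjusted in tandem across these patches. A minimal sketch of that sanity check, assuming a single process and the values from this patch:

    # Hypothetical sanity check mirroring the divisibility rule GRPOTrainer enforces.
    per_device_train_batch_size = 1
    gradient_accumulation_steps = 2
    num_generations = 2

    effective_batch = per_device_train_batch_size * gradient_accumulation_steps
    assert effective_batch % num_generations == 0, (
        f"effective batch {effective_batch} is not divisible by num_generations {num_generations}"
    )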
--- python/spotlight_prj/fedllm/custom_trainer.py | 2 +- .../fedllm/fedml_config/grpo_gsm8k_test_config.yaml | 6 +++--- .../spotlight_prj/fedllm/scripts/save_initial_checkpoint.py | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/python/spotlight_prj/fedllm/custom_trainer.py b/python/spotlight_prj/fedllm/custom_trainer.py index d4590af96..67b70949a 100644 --- a/python/spotlight_prj/fedllm/custom_trainer.py +++ b/python/spotlight_prj/fedllm/custom_trainer.py @@ -220,7 +220,7 @@ def train(self, train_data, device, args): else: num_generations = 2 - num_generations = 4 + num_generations = 2 # For testing, we can use a very small number of steps if grpo_max_steps > 0: diff --git a/python/spotlight_prj/fedllm/fedml_config/grpo_gsm8k_test_config.yaml b/python/spotlight_prj/fedllm/fedml_config/grpo_gsm8k_test_config.yaml index df34b942e..27fae0913 100644 --- a/python/spotlight_prj/fedllm/fedml_config/grpo_gsm8k_test_config.yaml +++ b/python/spotlight_prj/fedllm/fedml_config/grpo_gsm8k_test_config.yaml @@ -18,7 +18,7 @@ data_args: model_args: skip_log_model_net: True - model_name_or_path: "Qwen/Qwen3-0.6B" + model_name_or_path: "Qwen/Qwen3-1.7B-FP8" peft_type: "none" # Full model fine-tuning use_flash_attention: False @@ -32,7 +32,7 @@ train_args: # GRPO-specific settings for testing grpo_max_steps: 50 grpo_num_epochs: 1 # Ignored when grpo_max_steps > 0 - grpo_batch_size: 4 # Smaller batch size for faster testing + grpo_batch_size: 1 # Smaller batch size for faster testing # FedML training settings (ignored when using GRPO) local_num_train_epochs: 1 local_max_steps: -1 @@ -45,7 +45,7 @@ train_args: gradient_checkpointing: False # Match GRPO config per_device_train_batch_size: 2 # Will be overridden by GRPO per_device_eval_batch_size: 8 - gradient_accumulation_steps: 1 # Will be overridden by GRPO + gradient_accumulation_steps: 2 # Will be overridden by GRPO eval_accumulation_steps: 4 learning_rate: 5e-6 # Will be overridden by GRPO warmup_steps: 0 diff --git a/python/spotlight_prj/fedllm/scripts/save_initial_checkpoint.py b/python/spotlight_prj/fedllm/scripts/save_initial_checkpoint.py index be46d17f4..993eb9818 100644 --- a/python/spotlight_prj/fedllm/scripts/save_initial_checkpoint.py +++ b/python/spotlight_prj/fedllm/scripts/save_initial_checkpoint.py @@ -9,7 +9,7 @@ # Configuration RUN_ID = os.environ.get("RUN_ID", "test_run") -MODEL_NAME = "Qwen/Qwen3-0.6B" +MODEL_NAME = "Qwen/Qwen3-1.7B-FP8" OUTPUT_DIR = f"/workspace/FedML/python/spotlight_prj/fedllm/.logs/FedML/{RUN_ID}/node_0/init" print(f"Saving initial checkpoint for model: {MODEL_NAME}") From a2b5b98b2d316c3636af26d15d19d4c3a03f1a1d Mon Sep 17 00:00:00 2001 From: Marcos Villagra Date: Wed, 30 Jul 2025 15:02:19 +0000 Subject: [PATCH 095/168] Update model name in configuration files for consistency - Changed model_name_or_path from "Qwen/Qwen3-1.7B-FP8" to "Qwen/Qwen3-1.7B" in both grpo_gsm8k_test_config.yaml and save_initial_checkpoint.py to ensure uniformity across the project. 
--- .../fedllm/fedml_config/grpo_gsm8k_test_config.yaml | 2 +- python/spotlight_prj/fedllm/scripts/save_initial_checkpoint.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/python/spotlight_prj/fedllm/fedml_config/grpo_gsm8k_test_config.yaml b/python/spotlight_prj/fedllm/fedml_config/grpo_gsm8k_test_config.yaml index 27fae0913..ab84da19e 100644 --- a/python/spotlight_prj/fedllm/fedml_config/grpo_gsm8k_test_config.yaml +++ b/python/spotlight_prj/fedllm/fedml_config/grpo_gsm8k_test_config.yaml @@ -18,7 +18,7 @@ data_args: model_args: skip_log_model_net: True - model_name_or_path: "Qwen/Qwen3-1.7B-FP8" + model_name_or_path: "Qwen/Qwen3-1.7B" peft_type: "none" # Full model fine-tuning use_flash_attention: False diff --git a/python/spotlight_prj/fedllm/scripts/save_initial_checkpoint.py b/python/spotlight_prj/fedllm/scripts/save_initial_checkpoint.py index 993eb9818..2b00909dd 100644 --- a/python/spotlight_prj/fedllm/scripts/save_initial_checkpoint.py +++ b/python/spotlight_prj/fedllm/scripts/save_initial_checkpoint.py @@ -9,7 +9,7 @@ # Configuration RUN_ID = os.environ.get("RUN_ID", "test_run") -MODEL_NAME = "Qwen/Qwen3-1.7B-FP8" +MODEL_NAME = "Qwen/Qwen3-1.7B" OUTPUT_DIR = f"/workspace/FedML/python/spotlight_prj/fedllm/.logs/FedML/{RUN_ID}/node_0/init" print(f"Saving initial checkpoint for model: {MODEL_NAME}") From 2a68cb06f6fcfa7c1e67686e20dde796318be83f Mon Sep 17 00:00:00 2001 From: Marcos Villagra Date: Wed, 30 Jul 2025 19:08:36 +0000 Subject: [PATCH 096/168] Update training parameters and memory management in FullModelLLMTrainer - Increased max_completion_length and max_new_tokens from 256 to 512 to enhance model output capabilities. - Adjusted gradient_accumulation_steps from 1 to 2 in grpo_gsm8k_test_config.yaml for improved training efficiency. - Enhanced memory management by explicitly deleting trainer components and moving the model to CPU after training. 
--- python/spotlight_prj/fedllm/custom_trainer.py | 8 ++++++-- .../fedllm/fedml_config/grpo_gsm8k_test_config.yaml | 2 +- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/python/spotlight_prj/fedllm/custom_trainer.py b/python/spotlight_prj/fedllm/custom_trainer.py index d4590af96..d7041fd76 100644 --- a/python/spotlight_prj/fedllm/custom_trainer.py +++ b/python/spotlight_prj/fedllm/custom_trainer.py @@ -310,7 +310,7 @@ def train(self, train_data, device, args): output_dir=str(self.checkpoint_dir / "grpo"), per_device_train_batch_size=grpo_batch_size, gradient_accumulation_steps=gradient_accumulation_steps, - max_completion_length=256, + max_completion_length=512, num_generations=num_generations, # Adjusted based on effective batch size num_train_epochs=grpo_num_epochs if grpo_max_steps <= 0 else 1, # Use 1 epoch if max_steps is set max_steps=grpo_max_steps if grpo_max_steps > 0 else -1, # Override epochs with max_steps @@ -353,7 +353,7 @@ def train(self, train_data, device, args): "pad_token_id": fresh_tokenizer.eos_token_id, "eos_token_id": fresh_tokenizer.eos_token_id, "bos_token_id": fresh_tokenizer.bos_token_id, - "max_new_tokens": 256, + "max_new_tokens": 512, "length_penalty": 1.0, # Neutral length penalty } @@ -395,6 +395,10 @@ def train(self, train_data, device, args): # Clean up fresh model to free memory del fresh_model del fresh_tokenizer + del grpo_trainer.optimizer + del grpo_trainer.lr_scheduler + del grpo_trainer + self.model.to("cpu") torch.cuda.empty_cache() self.log("GRPO training finished") diff --git a/python/spotlight_prj/fedllm/fedml_config/grpo_gsm8k_test_config.yaml b/python/spotlight_prj/fedllm/fedml_config/grpo_gsm8k_test_config.yaml index df34b942e..2d12f01c7 100644 --- a/python/spotlight_prj/fedllm/fedml_config/grpo_gsm8k_test_config.yaml +++ b/python/spotlight_prj/fedllm/fedml_config/grpo_gsm8k_test_config.yaml @@ -45,7 +45,7 @@ train_args: gradient_checkpointing: False # Match GRPO config per_device_train_batch_size: 2 # Will be overridden by GRPO per_device_eval_batch_size: 8 - gradient_accumulation_steps: 1 # Will be overridden by GRPO + gradient_accumulation_steps: 2 # Will be overridden by GRPO eval_accumulation_steps: 4 learning_rate: 5e-6 # Will be overridden by GRPO warmup_steps: 0 From 411b5ed487fd52838892111980a075ba1879adb0 Mon Sep 17 00:00:00 2001 From: Marcos Villagra Date: Wed, 30 Jul 2025 19:11:32 +0000 Subject: [PATCH 097/168] Update model configurations and training parameters for consistency and optimization - Increased num_generations from 2 to 4 in FullModelLLMTrainer to enhance training output. - Updated model_name_or_path from "Qwen/Qwen3-1.7B" to "Qwen/Qwen3-0.6B" in configuration files for consistency. - Adjusted grpo_batch_size from 1 to 4 in grpo_gsm8k_test_config.yaml for improved testing performance. 
--- python/spotlight_prj/fedllm/custom_trainer.py | 2 +- .../fedllm/fedml_config/grpo_gsm8k_test_config.yaml | 4 ++-- .../spotlight_prj/fedllm/scripts/save_initial_checkpoint.py | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/python/spotlight_prj/fedllm/custom_trainer.py b/python/spotlight_prj/fedllm/custom_trainer.py index 85b5a471b..d7041fd76 100644 --- a/python/spotlight_prj/fedllm/custom_trainer.py +++ b/python/spotlight_prj/fedllm/custom_trainer.py @@ -220,7 +220,7 @@ def train(self, train_data, device, args): else: num_generations = 2 - num_generations = 2 + num_generations = 4 # For testing, we can use a very small number of steps if grpo_max_steps > 0: diff --git a/python/spotlight_prj/fedllm/fedml_config/grpo_gsm8k_test_config.yaml b/python/spotlight_prj/fedllm/fedml_config/grpo_gsm8k_test_config.yaml index ab84da19e..2d12f01c7 100644 --- a/python/spotlight_prj/fedllm/fedml_config/grpo_gsm8k_test_config.yaml +++ b/python/spotlight_prj/fedllm/fedml_config/grpo_gsm8k_test_config.yaml @@ -18,7 +18,7 @@ data_args: model_args: skip_log_model_net: True - model_name_or_path: "Qwen/Qwen3-1.7B" + model_name_or_path: "Qwen/Qwen3-0.6B" peft_type: "none" # Full model fine-tuning use_flash_attention: False @@ -32,7 +32,7 @@ train_args: # GRPO-specific settings for testing grpo_max_steps: 50 grpo_num_epochs: 1 # Ignored when grpo_max_steps > 0 - grpo_batch_size: 1 # Smaller batch size for faster testing + grpo_batch_size: 4 # Smaller batch size for faster testing # FedML training settings (ignored when using GRPO) local_num_train_epochs: 1 local_max_steps: -1 diff --git a/python/spotlight_prj/fedllm/scripts/save_initial_checkpoint.py b/python/spotlight_prj/fedllm/scripts/save_initial_checkpoint.py index 2b00909dd..be46d17f4 100644 --- a/python/spotlight_prj/fedllm/scripts/save_initial_checkpoint.py +++ b/python/spotlight_prj/fedllm/scripts/save_initial_checkpoint.py @@ -9,7 +9,7 @@ # Configuration RUN_ID = os.environ.get("RUN_ID", "test_run") -MODEL_NAME = "Qwen/Qwen3-1.7B" +MODEL_NAME = "Qwen/Qwen3-0.6B" OUTPUT_DIR = f"/workspace/FedML/python/spotlight_prj/fedllm/.logs/FedML/{RUN_ID}/node_0/init" print(f"Saving initial checkpoint for model: {MODEL_NAME}") From 91459893f3a81ab76461de69c324f9db230a4875 Mon Sep 17 00:00:00 2001 From: Marcos Villagra Date: Wed, 30 Jul 2025 19:24:56 +0000 Subject: [PATCH 098/168] Update model configurations and training parameters for consistency - Changed model_name_or_path from "Qwen/Qwen3-0.6B" to "Qwen/Qwen3-1.7B-GPTQ-Int8" in configuration files for improved model performance. - Reduced num_generations from 4 to 2 in FullModelLLMTrainer to streamline training output. - Adjusted grpo_batch_size from 4 to 1 in grpo_gsm8k_test_config.yaml for optimized testing performance. 
--- python/spotlight_prj/fedllm/custom_trainer.py | 2 +- .../fedllm/fedml_config/grpo_gsm8k_test_config.yaml | 4 ++-- .../spotlight_prj/fedllm/scripts/save_initial_checkpoint.py | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/python/spotlight_prj/fedllm/custom_trainer.py b/python/spotlight_prj/fedllm/custom_trainer.py index d7041fd76..85b5a471b 100644 --- a/python/spotlight_prj/fedllm/custom_trainer.py +++ b/python/spotlight_prj/fedllm/custom_trainer.py @@ -220,7 +220,7 @@ def train(self, train_data, device, args): else: num_generations = 2 - num_generations = 4 + num_generations = 2 # For testing, we can use a very small number of steps if grpo_max_steps > 0: diff --git a/python/spotlight_prj/fedllm/fedml_config/grpo_gsm8k_test_config.yaml b/python/spotlight_prj/fedllm/fedml_config/grpo_gsm8k_test_config.yaml index 2d12f01c7..83a0f43b0 100644 --- a/python/spotlight_prj/fedllm/fedml_config/grpo_gsm8k_test_config.yaml +++ b/python/spotlight_prj/fedllm/fedml_config/grpo_gsm8k_test_config.yaml @@ -18,7 +18,7 @@ data_args: model_args: skip_log_model_net: True - model_name_or_path: "Qwen/Qwen3-0.6B" + model_name_or_path: "Qwen/Qwen3-1.7B-GPTQ-Int8" peft_type: "none" # Full model fine-tuning use_flash_attention: False @@ -32,7 +32,7 @@ train_args: # GRPO-specific settings for testing grpo_max_steps: 50 grpo_num_epochs: 1 # Ignored when grpo_max_steps > 0 - grpo_batch_size: 4 # Smaller batch size for faster testing + grpo_batch_size: 1 # Smaller batch size for faster testing # FedML training settings (ignored when using GRPO) local_num_train_epochs: 1 local_max_steps: -1 diff --git a/python/spotlight_prj/fedllm/scripts/save_initial_checkpoint.py b/python/spotlight_prj/fedllm/scripts/save_initial_checkpoint.py index be46d17f4..0ddd1374e 100644 --- a/python/spotlight_prj/fedllm/scripts/save_initial_checkpoint.py +++ b/python/spotlight_prj/fedllm/scripts/save_initial_checkpoint.py @@ -9,7 +9,7 @@ # Configuration RUN_ID = os.environ.get("RUN_ID", "test_run") -MODEL_NAME = "Qwen/Qwen3-0.6B" +MODEL_NAME = "Qwen/Qwen3-1.7B-GPTQ-Int8" OUTPUT_DIR = f"/workspace/FedML/python/spotlight_prj/fedllm/.logs/FedML/{RUN_ID}/node_0/init" print(f"Saving initial checkpoint for model: {MODEL_NAME}") From deef150a8b04d26fffa876b2461ab4dcf604665b Mon Sep 17 00:00:00 2001 From: Marcos Villagra Date: Wed, 30 Jul 2025 21:49:59 +0000 Subject: [PATCH 099/168] Update max completion length and new token limit in FullModelLLMTrainer for enhanced training capacity - Increased max_completion_length from 256 to 512 to allow for longer outputs during training. - Adjusted max_new_tokens from 256 to 512 to align with the updated completion length. - Improved memory management by explicitly deleting trainer components and moving the model to CPU after training. 
--- python/spotlight_prj/fedllm/custom_trainer.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/python/spotlight_prj/fedllm/custom_trainer.py b/python/spotlight_prj/fedllm/custom_trainer.py index d4590af96..d7041fd76 100644 --- a/python/spotlight_prj/fedllm/custom_trainer.py +++ b/python/spotlight_prj/fedllm/custom_trainer.py @@ -310,7 +310,7 @@ def train(self, train_data, device, args): output_dir=str(self.checkpoint_dir / "grpo"), per_device_train_batch_size=grpo_batch_size, gradient_accumulation_steps=gradient_accumulation_steps, - max_completion_length=256, + max_completion_length=512, num_generations=num_generations, # Adjusted based on effective batch size num_train_epochs=grpo_num_epochs if grpo_max_steps <= 0 else 1, # Use 1 epoch if max_steps is set max_steps=grpo_max_steps if grpo_max_steps > 0 else -1, # Override epochs with max_steps @@ -353,7 +353,7 @@ def train(self, train_data, device, args): "pad_token_id": fresh_tokenizer.eos_token_id, "eos_token_id": fresh_tokenizer.eos_token_id, "bos_token_id": fresh_tokenizer.bos_token_id, - "max_new_tokens": 256, + "max_new_tokens": 512, "length_penalty": 1.0, # Neutral length penalty } @@ -395,6 +395,10 @@ def train(self, train_data, device, args): # Clean up fresh model to free memory del fresh_model del fresh_tokenizer + del grpo_trainer.optimizer + del grpo_trainer.lr_scheduler + del grpo_trainer + self.model.to("cpu") torch.cuda.empty_cache() self.log("GRPO training finished") From fe579620c411795a11380f228d49c7111ad58727 Mon Sep 17 00:00:00 2001 From: Marcos Villagra Date: Wed, 30 Jul 2025 21:57:03 +0000 Subject: [PATCH 100/168] Comment out optim parameter in FullModelLLMTrainer to disable 8-bit AdamW optimizer for potential performance adjustments. --- python/spotlight_prj/fedllm/custom_trainer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/spotlight_prj/fedllm/custom_trainer.py b/python/spotlight_prj/fedllm/custom_trainer.py index 85b5a471b..dc145f79b 100644 --- a/python/spotlight_prj/fedllm/custom_trainer.py +++ b/python/spotlight_prj/fedllm/custom_trainer.py @@ -332,7 +332,7 @@ def train(self, train_data, device, args): repetition_penalty=1.1, epsilon=0.2, beta=0.1, - optim="adamw_bnb_8bit", + #optim="adamw_bnb_8bit", ) self.log(f"GRPO Config - bf16: {use_bf16}, fp16: {not use_bf16}, batch_size: {grpo_batch_size}") From 7bd11ba010e7b33262c68cc22bb346c67bd9ca06 Mon Sep 17 00:00:00 2001 From: Marcos Villagra Date: Wed, 30 Jul 2025 22:02:01 +0000 Subject: [PATCH 101/168] Update model configurations and training parameters for improved performance - Increased num_generations from 2 to 4 in FullModelLLMTrainer to enhance training output. - Changed model_name_or_path from "Qwen/Qwen3-1.7B-GPTQ-Int8" to "Qwen/Qwen3-0.6B" in configuration files for consistency. - Adjusted grpo_batch_size from 1 to 2 in grpo_gsm8k_test_config.yaml for optimized testing performance. 
--- python/spotlight_prj/fedllm/custom_trainer.py | 2 +- .../fedllm/fedml_config/grpo_gsm8k_test_config.yaml | 4 ++-- .../spotlight_prj/fedllm/scripts/save_initial_checkpoint.py | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/python/spotlight_prj/fedllm/custom_trainer.py b/python/spotlight_prj/fedllm/custom_trainer.py index dc145f79b..e414ec7a8 100644 --- a/python/spotlight_prj/fedllm/custom_trainer.py +++ b/python/spotlight_prj/fedllm/custom_trainer.py @@ -220,7 +220,7 @@ def train(self, train_data, device, args): else: num_generations = 2 - num_generations = 2 + num_generations = 4 # For testing, we can use a very small number of steps if grpo_max_steps > 0: diff --git a/python/spotlight_prj/fedllm/fedml_config/grpo_gsm8k_test_config.yaml b/python/spotlight_prj/fedllm/fedml_config/grpo_gsm8k_test_config.yaml index 83a0f43b0..1cf3b7de1 100644 --- a/python/spotlight_prj/fedllm/fedml_config/grpo_gsm8k_test_config.yaml +++ b/python/spotlight_prj/fedllm/fedml_config/grpo_gsm8k_test_config.yaml @@ -18,7 +18,7 @@ data_args: model_args: skip_log_model_net: True - model_name_or_path: "Qwen/Qwen3-1.7B-GPTQ-Int8" + model_name_or_path: "Qwen/Qwen3-0.6B" peft_type: "none" # Full model fine-tuning use_flash_attention: False @@ -32,7 +32,7 @@ train_args: # GRPO-specific settings for testing grpo_max_steps: 50 grpo_num_epochs: 1 # Ignored when grpo_max_steps > 0 - grpo_batch_size: 1 # Smaller batch size for faster testing + grpo_batch_size: 2 # Smaller batch size for faster testing # FedML training settings (ignored when using GRPO) local_num_train_epochs: 1 local_max_steps: -1 diff --git a/python/spotlight_prj/fedllm/scripts/save_initial_checkpoint.py b/python/spotlight_prj/fedllm/scripts/save_initial_checkpoint.py index 0ddd1374e..be46d17f4 100644 --- a/python/spotlight_prj/fedllm/scripts/save_initial_checkpoint.py +++ b/python/spotlight_prj/fedllm/scripts/save_initial_checkpoint.py @@ -9,7 +9,7 @@ # Configuration RUN_ID = os.environ.get("RUN_ID", "test_run") -MODEL_NAME = "Qwen/Qwen3-1.7B-GPTQ-Int8" +MODEL_NAME = "Qwen/Qwen3-0.6B" OUTPUT_DIR = f"/workspace/FedML/python/spotlight_prj/fedllm/.logs/FedML/{RUN_ID}/node_0/init" print(f"Saving initial checkpoint for model: {MODEL_NAME}") From df96bf09df7ddbfd43bc6f44640473c68cae5d5c Mon Sep 17 00:00:00 2001 From: Marcos Villagra Date: Wed, 30 Jul 2025 22:04:31 +0000 Subject: [PATCH 102/168] Update client configuration parameters in grpo_gsm8k_test_config.yaml for increased client capacity - Increased client_num_in_total and client_num_per_round from 2 to 4 to support a larger client setup during training. 
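For reference, the `FedAvg` server optimizer named in this config averages the clients' full state dicts (typically weighted by sample count), so aggregation and memory cost grow with `client_num_per_round`. A simplified, illustrative sketch of that aggregation step, not the FedML implementation:

    import torch

    def fedavg(state_dicts, weights):
        """Weighted average of full model state dicts (simplified FedAvg)."""
        total = float(sum(weights))
        averaged = {}
        for key in state_dicts[0]:
            averaged[key] = sum(w * sd[key].float() for sd, w in zip(state_dicts, weights)) / total
        return averaged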
--- .../fedllm/fedml_config/grpo_gsm8k_test_config.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/spotlight_prj/fedllm/fedml_config/grpo_gsm8k_test_config.yaml b/python/spotlight_prj/fedllm/fedml_config/grpo_gsm8k_test_config.yaml index 1cf3b7de1..df84cb81c 100644 --- a/python/spotlight_prj/fedllm/fedml_config/grpo_gsm8k_test_config.yaml +++ b/python/spotlight_prj/fedllm/fedml_config/grpo_gsm8k_test_config.yaml @@ -26,8 +26,8 @@ train_args: federated_optimizer: "FedAvg" client_optimizer: "adamw_torch" server_optimizer: "FedAvg" - client_num_in_total: 2 # Single client setup - client_num_per_round: 2 # Single client setup + client_num_in_total: 4 # Single client setup + client_num_per_round: 4 # Single client setup comm_round: 30 # Reduced to 3 rounds for testing # GRPO-specific settings for testing grpo_max_steps: 50 From 6e44080e00b803251204f4477b0638eb86043f1d Mon Sep 17 00:00:00 2001 From: Marcos Villagra Date: Thu, 31 Jul 2025 15:57:24 +0000 Subject: [PATCH 103/168] Enhance TimedGRPOTrainer initialization and update model configurations - Added initialization logic for torch_dtype in TimedGRPOTrainer to handle various input types. - Created a reference model using AutoConfig for improved model handling. - Updated model_name_or_path from "Qwen/Qwen3-0.6B" to "Qwen/Qwen3-1.7B" in grpo_gsm8k_test_config.yaml and save_initial_checkpoint.py for consistency. - Adjusted grpo_batch_size and client settings in grpo_gsm8k_test_config.yaml for optimized testing performance. --- python/spotlight_prj/fedllm/custom_trainer.py | 34 ++++++++++++++++--- .../fedml_config/grpo_gsm8k_test_config.yaml | 8 ++--- .../fedllm/scripts/save_initial_checkpoint.py | 2 +- 3 files changed, 34 insertions(+), 10 deletions(-) diff --git a/python/spotlight_prj/fedllm/custom_trainer.py b/python/spotlight_prj/fedllm/custom_trainer.py index e414ec7a8..e8e2460fb 100644 --- a/python/spotlight_prj/fedllm/custom_trainer.py +++ b/python/spotlight_prj/fedllm/custom_trainer.py @@ -37,7 +37,8 @@ from fractions import Fraction # New import for TrainerCallback -from transformers import TrainerCallback +from transformers import TrainerCallback, AutoConfig +import transformers import wandb import json @@ -47,6 +48,29 @@ class TimedGRPOTrainer(GRPOTrainer): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + model_init_kwargs = args.model_init_kwargs or {} + torch_dtype = model_init_kwargs.get("torch_dtype") + if isinstance(torch_dtype, torch.dtype) or torch_dtype == "auto" or torch_dtype is None: + pass # torch_dtype is already a torch.dtype or "auto" or None + elif isinstance(torch_dtype, str): # it's a str, but not "auto" + torch_dtype = getattr(torch, torch_dtype) + model_init_kwargs["torch_dtype"] = torch_dtype + else: + raise ValueError( + "Invalid `torch_dtype` passed to `GRPOConfig`. Expected either 'auto' or a string representing " + f"a `torch.dtype` (e.g., 'float32'), but got {torch_dtype}." 
+ ) + + # Reference model + if self.beta != 0.0: + # For deepspeed, fsdp or non-distributed models, create a reference model from scratch + config = AutoConfig.from_pretrained("Qwen/Qwen3-1.7B-FP8") + architecture = getattr(transformers, config.architectures[0]) + self.ref_model = architecture.from_pretrained("Qwen/Qwen3-1.7B-FP8", **model_init_kwargs) + def _record_step_stats(self, stats): # ------------------------------------------------------------- # Measure *inter-step* wall-clock time: difference between the start @@ -220,7 +244,7 @@ def train(self, train_data, device, args): else: num_generations = 2 - num_generations = 4 + num_generations = 2 # For testing, we can use a very small number of steps if grpo_max_steps > 0: @@ -310,7 +334,7 @@ def train(self, train_data, device, args): output_dir=str(self.checkpoint_dir / "grpo"), per_device_train_batch_size=grpo_batch_size, gradient_accumulation_steps=gradient_accumulation_steps, - max_completion_length=512, + max_completion_length=256, num_generations=num_generations, # Adjusted based on effective batch size num_train_epochs=grpo_num_epochs if grpo_max_steps <= 0 else 1, # Use 1 epoch if max_steps is set max_steps=grpo_max_steps if grpo_max_steps > 0 else -1, # Override epochs with max_steps @@ -332,7 +356,7 @@ def train(self, train_data, device, args): repetition_penalty=1.1, epsilon=0.2, beta=0.1, - #optim="adamw_bnb_8bit", + optim="adamw_bnb_8bit", ) self.log(f"GRPO Config - bf16: {use_bf16}, fp16: {not use_bf16}, batch_size: {grpo_batch_size}") @@ -353,7 +377,7 @@ def train(self, train_data, device, args): "pad_token_id": fresh_tokenizer.eos_token_id, "eos_token_id": fresh_tokenizer.eos_token_id, "bos_token_id": fresh_tokenizer.bos_token_id, - "max_new_tokens": 512, + "max_new_tokens": 256, "length_penalty": 1.0, # Neutral length penalty } diff --git a/python/spotlight_prj/fedllm/fedml_config/grpo_gsm8k_test_config.yaml b/python/spotlight_prj/fedllm/fedml_config/grpo_gsm8k_test_config.yaml index df84cb81c..09147167c 100644 --- a/python/spotlight_prj/fedllm/fedml_config/grpo_gsm8k_test_config.yaml +++ b/python/spotlight_prj/fedllm/fedml_config/grpo_gsm8k_test_config.yaml @@ -18,7 +18,7 @@ data_args: model_args: skip_log_model_net: True - model_name_or_path: "Qwen/Qwen3-0.6B" + model_name_or_path: "Qwen/Qwen3-1.7B" peft_type: "none" # Full model fine-tuning use_flash_attention: False @@ -26,13 +26,13 @@ train_args: federated_optimizer: "FedAvg" client_optimizer: "adamw_torch" server_optimizer: "FedAvg" - client_num_in_total: 4 # Single client setup - client_num_per_round: 4 # Single client setup + client_num_in_total: 1 # Single client setup + client_num_per_round: 1 # Single client setup comm_round: 30 # Reduced to 3 rounds for testing # GRPO-specific settings for testing grpo_max_steps: 50 grpo_num_epochs: 1 # Ignored when grpo_max_steps > 0 - grpo_batch_size: 2 # Smaller batch size for faster testing + grpo_batch_size: 1 # Smaller batch size for faster testing # FedML training settings (ignored when using GRPO) local_num_train_epochs: 1 local_max_steps: -1 diff --git a/python/spotlight_prj/fedllm/scripts/save_initial_checkpoint.py b/python/spotlight_prj/fedllm/scripts/save_initial_checkpoint.py index be46d17f4..2b00909dd 100644 --- a/python/spotlight_prj/fedllm/scripts/save_initial_checkpoint.py +++ b/python/spotlight_prj/fedllm/scripts/save_initial_checkpoint.py @@ -9,7 +9,7 @@ # Configuration RUN_ID = os.environ.get("RUN_ID", "test_run") -MODEL_NAME = "Qwen/Qwen3-0.6B" +MODEL_NAME = "Qwen/Qwen3-1.7B" OUTPUT_DIR = 
f"/workspace/FedML/python/spotlight_prj/fedllm/.logs/FedML/{RUN_ID}/node_0/init" print(f"Saving initial checkpoint for model: {MODEL_NAME}") From a09748184bd3c66c4b80ee243f331f2237c4ffca Mon Sep 17 00:00:00 2001 From: Marcos Villagra Date: Thu, 31 Jul 2025 16:29:14 +0000 Subject: [PATCH 104/168] Update broadcast_object_list call in FullModelLLMTrainer to specify device as CPU - Modified the broadcast_object_list function to include the device parameter, ensuring that model parameters are explicitly broadcasted from the CPU. --- python/spotlight_prj/fedllm/custom_trainer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/spotlight_prj/fedllm/custom_trainer.py b/python/spotlight_prj/fedllm/custom_trainer.py index e8e2460fb..9cd6eeb22 100644 --- a/python/spotlight_prj/fedllm/custom_trainer.py +++ b/python/spotlight_prj/fedllm/custom_trainer.py @@ -483,7 +483,7 @@ def sync_process_group( model_params = to_device(model_params, "cpu") # ensure params live on CPU torch.cuda.empty_cache() - broadcast_object_list([round_idx, model_params, client_index], from_process=from_process) + broadcast_object_list([round_idx, model_params, client_index], from_process=from_process, device=torch.device("cpu")) self.log("finished") From a14707e957f335403ac210ca84af57577c86e929 Mon Sep 17 00:00:00 2001 From: Marcos Villagra Date: Thu, 31 Jul 2025 17:49:21 +0000 Subject: [PATCH 105/168] Refactor FullModelLLMTrainer to use float16 and enhance model parameter handling - Updated model loading in FullModelLLMTrainer to use torch.float16 for improved performance. - Added dtype logging for model parameters to verify data types during training. - Ensured reference model in TimedGRPOTrainer is moved to CPU for better resource management. --- python/spotlight_prj/fedllm/custom_trainer.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/python/spotlight_prj/fedllm/custom_trainer.py b/python/spotlight_prj/fedllm/custom_trainer.py index 9cd6eeb22..4bbfb4a87 100644 --- a/python/spotlight_prj/fedllm/custom_trainer.py +++ b/python/spotlight_prj/fedllm/custom_trainer.py @@ -70,6 +70,8 @@ def __init__(self, *args, **kwargs): config = AutoConfig.from_pretrained("Qwen/Qwen3-1.7B-FP8") architecture = getattr(transformers, config.architectures[0]) self.ref_model = architecture.from_pretrained("Qwen/Qwen3-1.7B-FP8", **model_init_kwargs) + + self.ref_model.to('cpu') def _record_step_stats(self, stats): # ------------------------------------------------------------- @@ -279,7 +281,7 @@ def train(self, train_data, device, args): else: fresh_model = AutoModelForCausalLM.from_pretrained( model_name, - torch_dtype=torch.float32, # Use float32 for better stability + torch_dtype=torch.float16, # Use float32 for better stability use_cache=False, trust_remote_code=True ) @@ -287,7 +289,7 @@ def train(self, train_data, device, args): self.log(f"Failed to load with requested precision, falling back to float32: {e}") fresh_model = AutoModelForCausalLM.from_pretrained( model_name, - torch_dtype=torch.float32, # Fallback to float32 + torch_dtype=torch.float16, # Fallback to float32 use_cache=False, trust_remote_code=True ) @@ -482,7 +484,10 @@ def sync_process_group( round_idx = self.round_idx model_params = to_device(model_params, "cpu") # ensure params live on CPU - torch.cuda.empty_cache() + + dtypes = set(t.dtype for t in model_params.values()) + print(f"model_params dtypes: {dtypes}") # Should print torch.float32 if FP32 + broadcast_object_list([round_idx, model_params, client_index], 
from_process=from_process, device=torch.device("cpu")) self.log("finished") From c7e89e7d2f388bbb02bde1ceffd1d478c7815a2a Mon Sep 17 00:00:00 2001 From: Marcos Villagra Date: Thu, 31 Jul 2025 19:53:20 +0000 Subject: [PATCH 106/168] Refactor broadcast_object_list call in FullModelLLMTrainer to remove explicit device parameter - Updated the broadcast_object_list function call to eliminate the device specification, simplifying the code while maintaining functionality. --- python/spotlight_prj/fedllm/custom_trainer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/spotlight_prj/fedllm/custom_trainer.py b/python/spotlight_prj/fedllm/custom_trainer.py index 4bbfb4a87..ea4fed281 100644 --- a/python/spotlight_prj/fedllm/custom_trainer.py +++ b/python/spotlight_prj/fedllm/custom_trainer.py @@ -488,7 +488,7 @@ def sync_process_group( dtypes = set(t.dtype for t in model_params.values()) print(f"model_params dtypes: {dtypes}") # Should print torch.float32 if FP32 - broadcast_object_list([round_idx, model_params, client_index], from_process=from_process, device=torch.device("cpu")) + broadcast_object_list([round_idx, model_params, client_index], from_process=from_process) self.log("finished") From 7f02a2574dc17b31483a7930090057befc962bd4 Mon Sep 17 00:00:00 2001 From: Marcos Villagra Date: Thu, 31 Jul 2025 20:29:27 +0000 Subject: [PATCH 107/168] Comment out CPU transfer for reference model in TimedGRPOTrainer and add model_dtype configuration in grpo_gsm8k_test_config.yaml for improved model handling. --- python/spotlight_prj/fedllm/custom_trainer.py | 2 +- .../fedllm/fedml_config/grpo_gsm8k_test_config.yaml | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/python/spotlight_prj/fedllm/custom_trainer.py b/python/spotlight_prj/fedllm/custom_trainer.py index ea4fed281..1d21d9cc0 100644 --- a/python/spotlight_prj/fedllm/custom_trainer.py +++ b/python/spotlight_prj/fedllm/custom_trainer.py @@ -71,7 +71,7 @@ def __init__(self, *args, **kwargs): architecture = getattr(transformers, config.architectures[0]) self.ref_model = architecture.from_pretrained("Qwen/Qwen3-1.7B-FP8", **model_init_kwargs) - self.ref_model.to('cpu') + #self.ref_model.to('cpu') def _record_step_stats(self, stats): # ------------------------------------------------------------- diff --git a/python/spotlight_prj/fedllm/fedml_config/grpo_gsm8k_test_config.yaml b/python/spotlight_prj/fedllm/fedml_config/grpo_gsm8k_test_config.yaml index 09147167c..74cface24 100644 --- a/python/spotlight_prj/fedllm/fedml_config/grpo_gsm8k_test_config.yaml +++ b/python/spotlight_prj/fedllm/fedml_config/grpo_gsm8k_test_config.yaml @@ -19,6 +19,7 @@ data_args: model_args: skip_log_model_net: True model_name_or_path: "Qwen/Qwen3-1.7B" + model_dtype: "bfloat16" peft_type: "none" # Full model fine-tuning use_flash_attention: False From e9108d4802de90a8cef52522ccc50c4b5f50ba96 Mon Sep 17 00:00:00 2001 From: Marcos Villagra Date: Thu, 31 Jul 2025 20:51:37 +0000 Subject: [PATCH 108/168] Refactor model initialization in TimedGRPOTrainer to retrieve model_init_kwargs from kwargs - Updated the initialization of model_init_kwargs to use kwargs.get('args', GRPOConfig()) for improved flexibility in model configuration. 
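Context for the kwargs lookup introduced above: inside `__init__(self, *args, **kwargs)` the name `args` is the tuple of positional arguments, not the GRPOConfig, so the config has to be fetched from `kwargs`. A hypothetical, slightly more defensive variant that also covers a positionally passed config:

    from trl import GRPOConfig

    def _resolve_grpo_config(args: tuple, kwargs: dict) -> GRPOConfig:
        """Return the GRPOConfig whether it was passed as `args=` or positionally."""
        cfg = kwargs.get("args")
        if cfg is None:
            cfg = next((a for a in args if isinstance(a, GRPOConfig)), None)
        return cfg if cfg is not None else GRPOConfig()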
--- python/spotlight_prj/fedllm/custom_trainer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/spotlight_prj/fedllm/custom_trainer.py b/python/spotlight_prj/fedllm/custom_trainer.py index 1d21d9cc0..e4a5fb330 100644 --- a/python/spotlight_prj/fedllm/custom_trainer.py +++ b/python/spotlight_prj/fedllm/custom_trainer.py @@ -51,7 +51,7 @@ class TimedGRPOTrainer(GRPOTrainer): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) - model_init_kwargs = args.model_init_kwargs or {} + model_init_kwargs = kwargs.get('args', GRPOConfig()).model_init_kwargs or {} torch_dtype = model_init_kwargs.get("torch_dtype") if isinstance(torch_dtype, torch.dtype) or torch_dtype == "auto" or torch_dtype is None: pass # torch_dtype is already a torch.dtype or "auto" or None From e80eedb13bf63d8a0a9e5fdf706d1fc8a6153143 Mon Sep 17 00:00:00 2001 From: Marcos Villagra Date: Thu, 31 Jul 2025 21:08:19 +0000 Subject: [PATCH 109/168] Update reference model configuration in TimedGRPOTrainer to use GPTQ-Int8 variant - Changed model initialization to load the reference model from "Qwen/Qwen3-1.7B-GPTQ-Int8" instead of "Qwen/Qwen3-1.7B-FP8" for improved performance and compatibility. --- python/spotlight_prj/fedllm/custom_trainer.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/spotlight_prj/fedllm/custom_trainer.py b/python/spotlight_prj/fedllm/custom_trainer.py index e4a5fb330..324c1bc87 100644 --- a/python/spotlight_prj/fedllm/custom_trainer.py +++ b/python/spotlight_prj/fedllm/custom_trainer.py @@ -67,9 +67,9 @@ def __init__(self, *args, **kwargs): # Reference model if self.beta != 0.0: # For deepspeed, fsdp or non-distributed models, create a reference model from scratch - config = AutoConfig.from_pretrained("Qwen/Qwen3-1.7B-FP8") + config = AutoConfig.from_pretrained("Qwen/Qwen3-1.7B-GPTQ-Int8") architecture = getattr(transformers, config.architectures[0]) - self.ref_model = architecture.from_pretrained("Qwen/Qwen3-1.7B-FP8", **model_init_kwargs) + self.ref_model = architecture.from_pretrained("Qwen/Qwen3-1.7B-GPTQ-Int8", **model_init_kwargs) #self.ref_model.to('cpu') From f5cc03cd5bffe6646b1a91cddb20a3bfd16d6ac4 Mon Sep 17 00:00:00 2001 From: Marcos Villagra Date: Thu, 31 Jul 2025 21:26:02 +0000 Subject: [PATCH 110/168] Add docstring to TimedGRPOTrainer class for improved documentation --- python/spotlight_prj/fedllm/custom_trainer.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/python/spotlight_prj/fedllm/custom_trainer.py b/python/spotlight_prj/fedllm/custom_trainer.py index 324c1bc87..f52fb9c30 100644 --- a/python/spotlight_prj/fedllm/custom_trainer.py +++ b/python/spotlight_prj/fedllm/custom_trainer.py @@ -48,9 +48,11 @@ class TimedGRPOTrainer(GRPOTrainer): + """ def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) + model_init_kwargs = kwargs.get('args', GRPOConfig()).model_init_kwargs or {} torch_dtype = model_init_kwargs.get("torch_dtype") if isinstance(torch_dtype, torch.dtype) or torch_dtype == "auto" or torch_dtype is None: @@ -72,6 +74,7 @@ def __init__(self, *args, **kwargs): self.ref_model = architecture.from_pretrained("Qwen/Qwen3-1.7B-GPTQ-Int8", **model_init_kwargs) #self.ref_model.to('cpu') + """ def _record_step_stats(self, stats): # ------------------------------------------------------------- From 1b3500db059f6546ec6eb6a3ffa51fec1be9bff0 Mon Sep 17 00:00:00 2001 From: Marcos Villagra Date: Thu, 31 Jul 2025 21:41:18 +0000 Subject: [PATCH 111/168] Enhance TimedGRPOTrainer with dropout control and 
reference model synchronization - Added functionality to disable dropout in models based on the provided arguments. - Implemented preparation of the reference model for DeepSpeed and FSDP if enabled. - Introduced synchronization of the reference model with a callback when specified in the arguments. --- python/spotlight_prj/fedllm/custom_trainer.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/python/spotlight_prj/fedllm/custom_trainer.py b/python/spotlight_prj/fedllm/custom_trainer.py index f52fb9c30..3e4be7a76 100644 --- a/python/spotlight_prj/fedllm/custom_trainer.py +++ b/python/spotlight_prj/fedllm/custom_trainer.py @@ -73,6 +73,23 @@ def __init__(self, *args, **kwargs): architecture = getattr(transformers, config.architectures[0]) self.ref_model = architecture.from_pretrained("Qwen/Qwen3-1.7B-GPTQ-Int8", **model_init_kwargs) + # Disable dropout in the models + if args.disable_dropout: + disable_dropout_in_model(model) + if self.ref_model is not None: + disable_dropout_in_model(self.ref_model) + + if self.ref_model is not None: + if self.is_deepspeed_enabled: + self.ref_model = prepare_deepspeed(self.ref_model, self.accelerator) + elif self.is_fsdp_enabled: + self.ref_model = prepare_fsdp(self.ref_model, self.accelerator) + else: + self.ref_model = self.accelerator.prepare_model(self.ref_model, evaluation_mode=True) + + if args.sync_ref_model: + self.add_callback(SyncRefModelCallback(ref_model=self.ref_model, accelerator=self.accelerator)) + #self.ref_model.to('cpu') """ From f457548ba9757cf6a2a9555e2b3d193c3ba3140d Mon Sep 17 00:00:00 2001 From: Marcos Villagra Date: Thu, 31 Jul 2025 22:33:17 +0000 Subject: [PATCH 112/168] Update beta parameter in FullModelLLMTrainer to improve model performance --- python/spotlight_prj/fedllm/custom_trainer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/spotlight_prj/fedllm/custom_trainer.py b/python/spotlight_prj/fedllm/custom_trainer.py index 3e4be7a76..768d3f965 100644 --- a/python/spotlight_prj/fedllm/custom_trainer.py +++ b/python/spotlight_prj/fedllm/custom_trainer.py @@ -377,7 +377,7 @@ def train(self, train_data, device, args): top_k=50, repetition_penalty=1.1, epsilon=0.2, - beta=0.1, + beta=0.0, optim="adamw_bnb_8bit", ) From 4bf2a064924bc119174d754ea04b1de941a8b7e7 Mon Sep 17 00:00:00 2001 From: Marcos Villagra Date: Fri, 1 Aug 2025 00:17:28 +0000 Subject: [PATCH 113/168] Update client configuration in grpo_gsm8k_test_config.yaml to support multiple clients for testing --- .../fedllm/fedml_config/grpo_gsm8k_test_config.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/spotlight_prj/fedllm/fedml_config/grpo_gsm8k_test_config.yaml b/python/spotlight_prj/fedllm/fedml_config/grpo_gsm8k_test_config.yaml index 74cface24..599c03e8b 100644 --- a/python/spotlight_prj/fedllm/fedml_config/grpo_gsm8k_test_config.yaml +++ b/python/spotlight_prj/fedllm/fedml_config/grpo_gsm8k_test_config.yaml @@ -27,8 +27,8 @@ train_args: federated_optimizer: "FedAvg" client_optimizer: "adamw_torch" server_optimizer: "FedAvg" - client_num_in_total: 1 # Single client setup - client_num_per_round: 1 # Single client setup + client_num_in_total: 4 # Single client setup + client_num_per_round: 4 # Single client setup comm_round: 30 # Reduced to 3 rounds for testing # GRPO-specific settings for testing grpo_max_steps: 50 From 2ab283507f646cb8770cd72ce9b57e7e1f1ef43c Mon Sep 17 00:00:00 2001 From: Marcos Villagra Date: Fri, 1 Aug 2025 10:54:36 +0000 Subject: [PATCH 114/168] Adjust max 
completion length and new tokens in FullModelLLMTrainer for optimized training performance --- python/spotlight_prj/fedllm/custom_trainer.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/python/spotlight_prj/fedllm/custom_trainer.py b/python/spotlight_prj/fedllm/custom_trainer.py index 768d3f965..181ebb590 100644 --- a/python/spotlight_prj/fedllm/custom_trainer.py +++ b/python/spotlight_prj/fedllm/custom_trainer.py @@ -356,7 +356,7 @@ def train(self, train_data, device, args): output_dir=str(self.checkpoint_dir / "grpo"), per_device_train_batch_size=grpo_batch_size, gradient_accumulation_steps=gradient_accumulation_steps, - max_completion_length=256, + max_completion_length=128, num_generations=num_generations, # Adjusted based on effective batch size num_train_epochs=grpo_num_epochs if grpo_max_steps <= 0 else 1, # Use 1 epoch if max_steps is set max_steps=grpo_max_steps if grpo_max_steps > 0 else -1, # Override epochs with max_steps @@ -378,7 +378,7 @@ def train(self, train_data, device, args): repetition_penalty=1.1, epsilon=0.2, beta=0.0, - optim="adamw_bnb_8bit", + #optim="adamw_bnb_8bit", ) self.log(f"GRPO Config - bf16: {use_bf16}, fp16: {not use_bf16}, batch_size: {grpo_batch_size}") @@ -399,7 +399,7 @@ def train(self, train_data, device, args): "pad_token_id": fresh_tokenizer.eos_token_id, "eos_token_id": fresh_tokenizer.eos_token_id, "bos_token_id": fresh_tokenizer.bos_token_id, - "max_new_tokens": 256, + "max_new_tokens": 128, "length_penalty": 1.0, # Neutral length penalty } From 403cac47dc4d459ea2a2c9fbb7c284f3603c5bc3 Mon Sep 17 00:00:00 2001 From: Marcos Villagra Date: Fri, 1 Aug 2025 11:16:51 +0000 Subject: [PATCH 115/168] Update optimizer in FullModelLLMTrainer to use paged_adamw_8bit for enhanced training efficiency --- python/spotlight_prj/fedllm/custom_trainer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/spotlight_prj/fedllm/custom_trainer.py b/python/spotlight_prj/fedllm/custom_trainer.py index 181ebb590..b2a08970e 100644 --- a/python/spotlight_prj/fedllm/custom_trainer.py +++ b/python/spotlight_prj/fedllm/custom_trainer.py @@ -378,7 +378,7 @@ def train(self, train_data, device, args): repetition_penalty=1.1, epsilon=0.2, beta=0.0, - #optim="adamw_bnb_8bit", + optim="paged_adamw_8bit", ) self.log(f"GRPO Config - bf16: {use_bf16}, fp16: {not use_bf16}, batch_size: {grpo_batch_size}") From fc9f3ba7359cff01c8e8268a1aa7349880cfc83e Mon Sep 17 00:00:00 2001 From: Marcos Villagra Date: Fri, 1 Aug 2025 11:52:13 +0000 Subject: [PATCH 116/168] Enhance memory management in FullModelLLMTrainer by adding garbage collection and freeing resources after training - Introduced garbage collection and memory cleanup steps post-training to optimize resource usage. - Ensured model is transferred to CPU and unnecessary variables are deleted to prevent memory leaks. 
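Note on ordering for the cleanup described above: `torch.cuda.empty_cache()` can only release cached blocks whose tensors are no longer referenced, so the deletions and the CPU transfer must happen first. A minimal sketch, assuming the trainer and model references have already been dropped:

    import gc
    import torch

    def flush_cuda_memory() -> None:
        """Call only after trainer/model references were deleted or moved to CPU."""
        gc.collect()                  # reclaim unreferenced tensors first
        if torch.cuda.is_available():
            torch.cuda.empty_cache()  # then hand cached blocks back to the allocator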
--- python/spotlight_prj/fedllm/custom_trainer.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/python/spotlight_prj/fedllm/custom_trainer.py b/python/spotlight_prj/fedllm/custom_trainer.py index b2a08970e..00372fdb3 100644 --- a/python/spotlight_prj/fedllm/custom_trainer.py +++ b/python/spotlight_prj/fedllm/custom_trainer.py @@ -46,6 +46,8 @@ import warnings warnings.filterwarnings("ignore") +import gc + class TimedGRPOTrainer(GRPOTrainer): """ @@ -411,6 +413,7 @@ def train(self, train_data, device, args): # Run GRPO training grpo_trainer.train() + # **Copy trained weights back to FedML's model** self.log("Copying GRPO-trained weights back to FedML model") @@ -421,6 +424,8 @@ def train(self, train_data, device, args): self.model.base_model.load_state_dict(trained_state, strict=False) else: self.model.load_state_dict(trained_state, strict=False) + self.model.to("cpu") + del trained_state # Optionally save a pre-aggregation checkpoint for this round @@ -438,6 +443,12 @@ def train(self, train_data, device, args): if self.training_args.should_save: self._cleanup_old_round_checkpoints() + grpo_trainer.accelerator.end_training() + grpo_trainer.accelerator.free_memory() + grpo_trainer.model = None + gc.collect() + torch.cuda.empty_cache() + # Clean up fresh model to free memory del fresh_model del fresh_tokenizer From d9d6bed92b030e915af1dbfc61a03a8b1af82b87 Mon Sep 17 00:00:00 2001 From: Marcos Villagra Date: Fri, 1 Aug 2025 11:52:39 +0000 Subject: [PATCH 117/168] Enhance TimedGRPOTrainer with fallback utilities and dropout management - Added fallback stubs for prepare_fsdp and SyncRefModelCallback to ensure compatibility with varying TRL versions. - Introduced a function to disable dropout in models, enhancing control over training behavior. - Updated reference model preparation for DeepSpeed with additional parameters for batch size and precision settings. --- python/spotlight_prj/fedllm/custom_trainer.py | 47 +++++++++++++++++-- 1 file changed, 43 insertions(+), 4 deletions(-) diff --git a/python/spotlight_prj/fedllm/custom_trainer.py b/python/spotlight_prj/fedllm/custom_trainer.py index 00372fdb3..e92ce2c51 100644 --- a/python/spotlight_prj/fedllm/custom_trainer.py +++ b/python/spotlight_prj/fedllm/custom_trainer.py @@ -24,6 +24,32 @@ from fedml.train.llm.distributed import barrier from peft import PeftModel from trl import GRPOTrainer, GRPOConfig +from trl.trainer.utils import prepare_deepspeed + +# Fallback stub if prepare_fsdp is unavailable in current TRL version +try: + from trl.trainer.utils import prepare_fsdp # type: ignore +except ImportError: # pragma: no cover + def prepare_fsdp(model, accelerator): + """Minimal FSDP prep fallback – just use accelerator.prepare_model.""" + return accelerator.prepare_model(model, evaluation_mode=True) + +# Optional: stub SyncRefModelCallback if not provided upstream +try: + from trl.trainer.callbacks import SyncRefModelCallback # hypothetical future addition +except Exception: + from transformers import TrainerCallback + class SyncRefModelCallback(TrainerCallback): + """Fallback no-op callback used when TRL doesn't ship one. + Simply keeps reference model on correct device and in eval mode. 
+ """ + def __init__(self, ref_model=None, accelerator=None): + self.ref_model = ref_model + self.accelerator = accelerator + def on_train_begin(self, args, state, control, **kwargs): + if self.ref_model is not None and self.accelerator is not None: + self.ref_model.to(self.accelerator.device) + self.ref_model.eval() from fedml.ml.aggregator.agg_operator import FedMLAggOperator from run_fedllm import LLMTrainer, LLMAggregator, save_checkpoint, load_checkpoint @@ -49,6 +75,17 @@ import gc +def disable_dropout_in_model(model: torch.nn.Module) -> None: + """ + Disable dropout by setting all torch.nn.Dropout modules to eval mode and + zero probability. + """ + for module in model.modules(): + if isinstance(module, torch.nn.Dropout): + module.p = 0.0 + module.eval() + + class TimedGRPOTrainer(GRPOTrainer): """ def __init__(self, *args, **kwargs): @@ -76,20 +113,22 @@ def __init__(self, *args, **kwargs): self.ref_model = architecture.from_pretrained("Qwen/Qwen3-1.7B-GPTQ-Int8", **model_init_kwargs) # Disable dropout in the models - if args.disable_dropout: - disable_dropout_in_model(model) + if getattr(self.args, "disable_dropout", False): + disable_dropout_in_model(self.model) if self.ref_model is not None: disable_dropout_in_model(self.ref_model) if self.ref_model is not None: if self.is_deepspeed_enabled: - self.ref_model = prepare_deepspeed(self.ref_model, self.accelerator) + # Prepare reference model under DeepSpeed when enabled + per_device_bs = getattr(self.args, 'per_device_train_batch_size', 1) + self.ref_model = prepare_deepspeed(self.ref_model, per_device_bs, fp16=getattr(self.args, 'fp16', False), bf16=getattr(self.args, 'bf16', False)) elif self.is_fsdp_enabled: self.ref_model = prepare_fsdp(self.ref_model, self.accelerator) else: self.ref_model = self.accelerator.prepare_model(self.ref_model, evaluation_mode=True) - if args.sync_ref_model: + if getattr(self.args, "sync_ref_model", False): self.add_callback(SyncRefModelCallback(ref_model=self.ref_model, accelerator=self.accelerator)) #self.ref_model.to('cpu') From 315188cff7a7920ea7f05c5e984f0d45d9d03a2a Mon Sep 17 00:00:00 2001 From: Marcos Villagra Date: Fri, 1 Aug 2025 12:24:49 +0000 Subject: [PATCH 118/168] Update optimizer and clean up memory management in FullModelLLMTrainer - Changed optimizer from paged_adamw_8bit to galore_adamw_8bit_layerwise for improved training performance. - Commented out deletion of trained_state to retain state information for debugging. - Added docstring to clarify memory cleanup steps after training. 
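One practical caveat for the optimizer switches in these patches: the `optim` strings are resolved by the transformers `Trainer`, and the 8-bit/paged variants need `bitsandbytes` installed while the GaLore variants additionally need `galore_torch`. A rough, illustrative availability check (hypothetical helper, not part of the patch):

    import importlib.util

    def optimizer_available(optim_name: str) -> bool:
        """Rough check for the extra packages behind the `optim` strings used here."""
        required = []
        if "8bit" in optim_name or "paged" in optim_name:
            required.append("bitsandbytes")
        if optim_name.startswith("galore"):
            required.append("galore_torch")
        return all(importlib.util.find_spec(pkg) is not None for pkg in required)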
--- python/spotlight_prj/fedllm/custom_trainer.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/python/spotlight_prj/fedllm/custom_trainer.py b/python/spotlight_prj/fedllm/custom_trainer.py index e92ce2c51..07e4d0d2e 100644 --- a/python/spotlight_prj/fedllm/custom_trainer.py +++ b/python/spotlight_prj/fedllm/custom_trainer.py @@ -419,7 +419,7 @@ def train(self, train_data, device, args): repetition_penalty=1.1, epsilon=0.2, beta=0.0, - optim="paged_adamw_8bit", + optim="galore_adamw_8bit_layerwise", ) self.log(f"GRPO Config - bf16: {use_bf16}, fp16: {not use_bf16}, batch_size: {grpo_batch_size}") @@ -464,7 +464,7 @@ def train(self, train_data, device, args): else: self.model.load_state_dict(trained_state, strict=False) self.model.to("cpu") - del trained_state + #del trained_state # Optionally save a pre-aggregation checkpoint for this round @@ -481,12 +481,13 @@ def train(self, train_data, device, args): # After saving the current round checkpoint, clean up older round_* checkpoints if self.training_args.should_save: self._cleanup_old_round_checkpoints() - + """ grpo_trainer.accelerator.end_training() grpo_trainer.accelerator.free_memory() grpo_trainer.model = None gc.collect() torch.cuda.empty_cache() + """ # Clean up fresh model to free memory del fresh_model From cd0b61d7343d7c497a02078140c74314524a0e90 Mon Sep 17 00:00:00 2001 From: Marcos Villagra Date: Fri, 1 Aug 2025 12:27:11 +0000 Subject: [PATCH 119/168] Comment out garbage collection import in custom_trainer.py to streamline code and improve readability. --- python/spotlight_prj/fedllm/custom_trainer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/spotlight_prj/fedllm/custom_trainer.py b/python/spotlight_prj/fedllm/custom_trainer.py index 07e4d0d2e..ba1835c5b 100644 --- a/python/spotlight_prj/fedllm/custom_trainer.py +++ b/python/spotlight_prj/fedllm/custom_trainer.py @@ -72,7 +72,7 @@ def on_train_begin(self, args, state, control, **kwargs): import warnings warnings.filterwarnings("ignore") -import gc +#import gc def disable_dropout_in_model(model: torch.nn.Module) -> None: From a03174e8766374b99b6fa775ddf97f5b2b09fc91 Mon Sep 17 00:00:00 2001 From: Marcos Villagra Date: Fri, 1 Aug 2025 13:33:58 +0000 Subject: [PATCH 120/168] Update optimizer in FullModelLLMTrainer to paged_lion_8bit for improved training performance --- python/spotlight_prj/fedllm/custom_trainer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/spotlight_prj/fedllm/custom_trainer.py b/python/spotlight_prj/fedllm/custom_trainer.py index ba1835c5b..f3e5248e2 100644 --- a/python/spotlight_prj/fedllm/custom_trainer.py +++ b/python/spotlight_prj/fedllm/custom_trainer.py @@ -419,7 +419,7 @@ def train(self, train_data, device, args): repetition_penalty=1.1, epsilon=0.2, beta=0.0, - optim="galore_adamw_8bit_layerwise", + optim="paged_lion_8bit"", ) self.log(f"GRPO Config - bf16: {use_bf16}, fp16: {not use_bf16}, batch_size: {grpo_batch_size}") From cf6ecb277fc43e42782c832df9a4fddf047b0577 Mon Sep 17 00:00:00 2001 From: Marcos Villagra Date: Fri, 1 Aug 2025 13:45:34 +0000 Subject: [PATCH 121/168] Fix syntax error in optimizer assignment in FullModelLLMTrainer --- python/spotlight_prj/fedllm/custom_trainer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/spotlight_prj/fedllm/custom_trainer.py b/python/spotlight_prj/fedllm/custom_trainer.py index f3e5248e2..5da192653 100644 --- a/python/spotlight_prj/fedllm/custom_trainer.py +++ 
b/python/spotlight_prj/fedllm/custom_trainer.py @@ -419,7 +419,7 @@ def train(self, train_data, device, args): repetition_penalty=1.1, epsilon=0.2, beta=0.0, - optim="paged_lion_8bit"", + optim="paged_lion_8bit", ) self.log(f"GRPO Config - bf16: {use_bf16}, fp16: {not use_bf16}, batch_size: {grpo_batch_size}") From 9e1291fbca600ea4709233f7352b12c8cae49dcd Mon Sep 17 00:00:00 2001 From: Marcos Villagra Date: Fri, 1 Aug 2025 13:49:07 +0000 Subject: [PATCH 122/168] Update gradient checkpointing settings in FullModelLLMTrainer and configuration file - Added `gradient_checkpointing_kwargs` to `FullModelLLMTrainer` to enhance training efficiency. - Updated `gradient_checkpointing` setting in `grpo_gsm8k_test_config.yaml` to `True` for consistency with GRPO configuration. --- python/spotlight_prj/fedllm/custom_trainer.py | 1 + .../fedllm/fedml_config/grpo_gsm8k_test_config.yaml | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/python/spotlight_prj/fedllm/custom_trainer.py b/python/spotlight_prj/fedllm/custom_trainer.py index 5da192653..7cadedfc3 100644 --- a/python/spotlight_prj/fedllm/custom_trainer.py +++ b/python/spotlight_prj/fedllm/custom_trainer.py @@ -406,6 +406,7 @@ def train(self, train_data, device, args): fp16=not use_bf16, # Use fp16 if not bf16 gradient_checkpointing=getattr(args, 'gradient_checkpointing', False), #logging_steps=5 if grpo_max_steps > 0 and grpo_max_steps < 50 else 25, # More frequent logging for short runs + gradient_checkpointing_kwargs={"use_reentrant": False}, logging_steps=1, log_completions=False, save_steps=grpo_max_steps if grpo_max_steps > 0 else 500, # Save at the end if using max_steps diff --git a/python/spotlight_prj/fedllm/fedml_config/grpo_gsm8k_test_config.yaml b/python/spotlight_prj/fedllm/fedml_config/grpo_gsm8k_test_config.yaml index 599c03e8b..707538340 100644 --- a/python/spotlight_prj/fedllm/fedml_config/grpo_gsm8k_test_config.yaml +++ b/python/spotlight_prj/fedllm/fedml_config/grpo_gsm8k_test_config.yaml @@ -43,7 +43,7 @@ train_args: seed: 1234 fp16: False # Use fp16 instead of bf16 for GPU compatibility bf16: True - gradient_checkpointing: False # Match GRPO config + gradient_checkpointing: True # Match GRPO config per_device_train_batch_size: 2 # Will be overridden by GRPO per_device_eval_batch_size: 8 gradient_accumulation_steps: 2 # Will be overridden by GRPO From 841b6f013020be98d2fc7de74c6c09760fca36c6 Mon Sep 17 00:00:00 2001 From: Marcos Villagra Date: Fri, 1 Aug 2025 14:09:11 +0000 Subject: [PATCH 123/168] Update gradient checkpointing settings in FullModelLLMTrainer and configuration file - Removed `gradient_checkpointing_kwargs` from `FullModelLLMTrainer` to simplify the training configuration. - Updated `gradient_checkpointing` setting in `grpo_gsm8k_test_config.yaml` to `False` for consistency with GRPO configuration. 
--- python/spotlight_prj/fedllm/custom_trainer.py | 1 - .../fedllm/fedml_config/grpo_gsm8k_test_config.yaml | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/python/spotlight_prj/fedllm/custom_trainer.py b/python/spotlight_prj/fedllm/custom_trainer.py index 7cadedfc3..5da192653 100644 --- a/python/spotlight_prj/fedllm/custom_trainer.py +++ b/python/spotlight_prj/fedllm/custom_trainer.py @@ -406,7 +406,6 @@ def train(self, train_data, device, args): fp16=not use_bf16, # Use fp16 if not bf16 gradient_checkpointing=getattr(args, 'gradient_checkpointing', False), #logging_steps=5 if grpo_max_steps > 0 and grpo_max_steps < 50 else 25, # More frequent logging for short runs - gradient_checkpointing_kwargs={"use_reentrant": False}, logging_steps=1, log_completions=False, save_steps=grpo_max_steps if grpo_max_steps > 0 else 500, # Save at the end if using max_steps diff --git a/python/spotlight_prj/fedllm/fedml_config/grpo_gsm8k_test_config.yaml b/python/spotlight_prj/fedllm/fedml_config/grpo_gsm8k_test_config.yaml index 707538340..599c03e8b 100644 --- a/python/spotlight_prj/fedllm/fedml_config/grpo_gsm8k_test_config.yaml +++ b/python/spotlight_prj/fedllm/fedml_config/grpo_gsm8k_test_config.yaml @@ -43,7 +43,7 @@ train_args: seed: 1234 fp16: False # Use fp16 instead of bf16 for GPU compatibility bf16: True - gradient_checkpointing: True # Match GRPO config + gradient_checkpointing: False # Match GRPO config per_device_train_batch_size: 2 # Will be overridden by GRPO per_device_eval_batch_size: 8 gradient_accumulation_steps: 2 # Will be overridden by GRPO From e6c0ebb502b76a07a0ce34de32d284a04ef0530e Mon Sep 17 00:00:00 2001 From: Marcos Villagra Date: Fri, 1 Aug 2025 14:38:59 +0000 Subject: [PATCH 124/168] Update max completion length and batch size in FullModelLLMTrainer and configuration file - Increased `max_completion_length` and `max_new_tokens` from 128 to 256 in `FullModelLLMTrainer` to enhance model output capabilities. - Adjusted `client_num_in_total` and `client_num_per_round` to 1, and updated `grpo_batch_size` to 2 in `grpo_gsm8k_test_config.yaml` for improved testing efficiency. 
--- python/spotlight_prj/fedllm/custom_trainer.py | 4 ++-- .../fedllm/fedml_config/grpo_gsm8k_test_config.yaml | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/python/spotlight_prj/fedllm/custom_trainer.py b/python/spotlight_prj/fedllm/custom_trainer.py index 5da192653..7b4912bf5 100644 --- a/python/spotlight_prj/fedllm/custom_trainer.py +++ b/python/spotlight_prj/fedllm/custom_trainer.py @@ -397,7 +397,7 @@ def train(self, train_data, device, args): output_dir=str(self.checkpoint_dir / "grpo"), per_device_train_batch_size=grpo_batch_size, gradient_accumulation_steps=gradient_accumulation_steps, - max_completion_length=128, + max_completion_length=256, num_generations=num_generations, # Adjusted based on effective batch size num_train_epochs=grpo_num_epochs if grpo_max_steps <= 0 else 1, # Use 1 epoch if max_steps is set max_steps=grpo_max_steps if grpo_max_steps > 0 else -1, # Override epochs with max_steps @@ -440,7 +440,7 @@ def train(self, train_data, device, args): "pad_token_id": fresh_tokenizer.eos_token_id, "eos_token_id": fresh_tokenizer.eos_token_id, "bos_token_id": fresh_tokenizer.bos_token_id, - "max_new_tokens": 128, + "max_new_tokens": 256, "length_penalty": 1.0, # Neutral length penalty } diff --git a/python/spotlight_prj/fedllm/fedml_config/grpo_gsm8k_test_config.yaml b/python/spotlight_prj/fedllm/fedml_config/grpo_gsm8k_test_config.yaml index 599c03e8b..9eae6484c 100644 --- a/python/spotlight_prj/fedllm/fedml_config/grpo_gsm8k_test_config.yaml +++ b/python/spotlight_prj/fedllm/fedml_config/grpo_gsm8k_test_config.yaml @@ -27,13 +27,13 @@ train_args: federated_optimizer: "FedAvg" client_optimizer: "adamw_torch" server_optimizer: "FedAvg" - client_num_in_total: 4 # Single client setup - client_num_per_round: 4 # Single client setup + client_num_in_total: 1 # Single client setup + client_num_per_round: 1 # Single client setup comm_round: 30 # Reduced to 3 rounds for testing # GRPO-specific settings for testing grpo_max_steps: 50 grpo_num_epochs: 1 # Ignored when grpo_max_steps > 0 - grpo_batch_size: 1 # Smaller batch size for faster testing + grpo_batch_size: 2 # Smaller batch size for faster testing # FedML training settings (ignored when using GRPO) local_num_train_epochs: 1 local_max_steps: -1 From 7a9ada313dbc5cb017dfbbe602b79b5c28b25e9f Mon Sep 17 00:00:00 2001 From: Marcos Villagra Date: Fri, 1 Aug 2025 15:54:24 +0000 Subject: [PATCH 125/168] Update max completion length and batch size in FullModelLLMTrainer and configuration file - Reduced `max_completion_length` and `max_new_tokens` from 256 to 128 in `FullModelLLMTrainer` to optimize performance. - Adjusted `grpo_batch_size` from 2 to 1 in `grpo_gsm8k_test_config.yaml` for faster testing efficiency. 
--- python/spotlight_prj/fedllm/custom_trainer.py | 6 +++--- .../fedllm/fedml_config/grpo_gsm8k_test_config.yaml | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/python/spotlight_prj/fedllm/custom_trainer.py b/python/spotlight_prj/fedllm/custom_trainer.py index 7b4912bf5..8f03c3e40 100644 --- a/python/spotlight_prj/fedllm/custom_trainer.py +++ b/python/spotlight_prj/fedllm/custom_trainer.py @@ -397,7 +397,7 @@ def train(self, train_data, device, args): output_dir=str(self.checkpoint_dir / "grpo"), per_device_train_batch_size=grpo_batch_size, gradient_accumulation_steps=gradient_accumulation_steps, - max_completion_length=256, + max_completion_length=128, num_generations=num_generations, # Adjusted based on effective batch size num_train_epochs=grpo_num_epochs if grpo_max_steps <= 0 else 1, # Use 1 epoch if max_steps is set max_steps=grpo_max_steps if grpo_max_steps > 0 else -1, # Override epochs with max_steps @@ -419,7 +419,7 @@ def train(self, train_data, device, args): repetition_penalty=1.1, epsilon=0.2, beta=0.0, - optim="paged_lion_8bit", + optim="sgd", ) self.log(f"GRPO Config - bf16: {use_bf16}, fp16: {not use_bf16}, batch_size: {grpo_batch_size}") @@ -440,7 +440,7 @@ def train(self, train_data, device, args): "pad_token_id": fresh_tokenizer.eos_token_id, "eos_token_id": fresh_tokenizer.eos_token_id, "bos_token_id": fresh_tokenizer.bos_token_id, - "max_new_tokens": 256, + "max_new_tokens": 128, "length_penalty": 1.0, # Neutral length penalty } diff --git a/python/spotlight_prj/fedllm/fedml_config/grpo_gsm8k_test_config.yaml b/python/spotlight_prj/fedllm/fedml_config/grpo_gsm8k_test_config.yaml index 9eae6484c..74cface24 100644 --- a/python/spotlight_prj/fedllm/fedml_config/grpo_gsm8k_test_config.yaml +++ b/python/spotlight_prj/fedllm/fedml_config/grpo_gsm8k_test_config.yaml @@ -33,7 +33,7 @@ train_args: # GRPO-specific settings for testing grpo_max_steps: 50 grpo_num_epochs: 1 # Ignored when grpo_max_steps > 0 - grpo_batch_size: 2 # Smaller batch size for faster testing + grpo_batch_size: 1 # Smaller batch size for faster testing # FedML training settings (ignored when using GRPO) local_num_train_epochs: 1 local_max_steps: -1 From 25c894ac423cadff3b09a8a2902e1ecb43bbc7be Mon Sep 17 00:00:00 2001 From: Marcos Villagra Date: Fri, 1 Aug 2025 16:33:36 +0000 Subject: [PATCH 126/168] Update max completion length and logging settings in FullModelLLMTrainer - Increased `max_completion_length` and `max_new_tokens` from 128 to 256 in `FullModelLLMTrainer` to improve model output capabilities. - Enabled `log_completions` to enhance logging during training. 
--- python/spotlight_prj/fedllm/custom_trainer.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/python/spotlight_prj/fedllm/custom_trainer.py b/python/spotlight_prj/fedllm/custom_trainer.py index 8f03c3e40..2d7bec64e 100644 --- a/python/spotlight_prj/fedllm/custom_trainer.py +++ b/python/spotlight_prj/fedllm/custom_trainer.py @@ -397,7 +397,7 @@ def train(self, train_data, device, args): output_dir=str(self.checkpoint_dir / "grpo"), per_device_train_batch_size=grpo_batch_size, gradient_accumulation_steps=gradient_accumulation_steps, - max_completion_length=128, + max_completion_length=256, num_generations=num_generations, # Adjusted based on effective batch size num_train_epochs=grpo_num_epochs if grpo_max_steps <= 0 else 1, # Use 1 epoch if max_steps is set max_steps=grpo_max_steps if grpo_max_steps > 0 else -1, # Override epochs with max_steps @@ -407,7 +407,7 @@ def train(self, train_data, device, args): gradient_checkpointing=getattr(args, 'gradient_checkpointing', False), #logging_steps=5 if grpo_max_steps > 0 and grpo_max_steps < 50 else 25, # More frequent logging for short runs logging_steps=1, - log_completions=False, + log_completions=True, save_steps=grpo_max_steps if grpo_max_steps > 0 else 500, # Save at the end if using max_steps # Add seed for reproducibility in federated setting seed=int(time.perf_counter_ns() % (2**32)), @@ -440,7 +440,7 @@ def train(self, train_data, device, args): "pad_token_id": fresh_tokenizer.eos_token_id, "eos_token_id": fresh_tokenizer.eos_token_id, "bos_token_id": fresh_tokenizer.bos_token_id, - "max_new_tokens": 128, + "max_new_tokens": 256, "length_penalty": 1.0, # Neutral length penalty } From 3a56c1b5f708b909205e8bb5efe333b5d81a9605 Mon Sep 17 00:00:00 2001 From: Marcos Villagra Date: Fri, 1 Aug 2025 16:34:47 +0000 Subject: [PATCH 127/168] Update GRPO configuration for testing with increased epochs and batch size - Changed `grpo_num_epochs` from 1 to 2 and `grpo_batch_size` from 1 to 2 in `grpo_gsm8k_test_config.yaml` to enhance testing efficiency. --- .../fedllm/fedml_config/grpo_gsm8k_test_config.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/spotlight_prj/fedllm/fedml_config/grpo_gsm8k_test_config.yaml b/python/spotlight_prj/fedllm/fedml_config/grpo_gsm8k_test_config.yaml index 74cface24..96d44e8a1 100644 --- a/python/spotlight_prj/fedllm/fedml_config/grpo_gsm8k_test_config.yaml +++ b/python/spotlight_prj/fedllm/fedml_config/grpo_gsm8k_test_config.yaml @@ -32,8 +32,8 @@ train_args: comm_round: 30 # Reduced to 3 rounds for testing # GRPO-specific settings for testing grpo_max_steps: 50 - grpo_num_epochs: 1 # Ignored when grpo_max_steps > 0 - grpo_batch_size: 1 # Smaller batch size for faster testing + grpo_num_epochs: 2 # Ignored when grpo_max_steps > 0 + grpo_batch_size: 2 # Smaller batch size for faster testing # FedML training settings (ignored when using GRPO) local_num_train_epochs: 1 local_max_steps: -1 From 28db52fffd11bfbaffcc67698cb72d7074073fa7 Mon Sep 17 00:00:00 2001 From: Marcos Villagra Date: Fri, 1 Aug 2025 17:23:13 +0000 Subject: [PATCH 128/168] Update GRPO configuration in FullModelLLMTrainer for enhanced performance - Increased `num_generations` from 2 to 4 to improve model output diversity. - Updated `max_completion_length` and `max_new_tokens` from 256 to 512 for better response quality. - Adjusted `beta` parameter from 0.0 to 0.1 to refine optimization settings. 
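Note on how these values interact (a rough sanity check, not part of the diff): TRL's GRPOTrainer requires the batch of sampled completions to be evenly divisible by num_generations, which is why the code comments say the value is "Adjusted based on effective batch size". The exact rule depends on the installed TRL version; the numbers below are illustrative only.

    per_device_train_batch_size = 2   # illustrative: grpo_batch_size from the YAML
    gradient_accumulation_steps = 2   # illustrative
    num_processes = 1                 # assumption: one GPU per client
    num_generations = 4

    # Completions consumed per optimizer step (approximate; recent TRL versions
    # derive a "generation batch" roughly this way).
    generation_batch = (per_device_train_batch_size
                        * num_processes
                        * gradient_accumulation_steps)
    assert generation_batch % num_generations == 0, (
        "num_generations must evenly divide the generation batch, "
        "otherwise GRPOTrainer rejects the configuration at construction time."
    )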
--- python/spotlight_prj/fedllm/custom_trainer.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/python/spotlight_prj/fedllm/custom_trainer.py b/python/spotlight_prj/fedllm/custom_trainer.py index 2d7bec64e..83da43fd8 100644 --- a/python/spotlight_prj/fedllm/custom_trainer.py +++ b/python/spotlight_prj/fedllm/custom_trainer.py @@ -307,7 +307,7 @@ def train(self, train_data, device, args): else: num_generations = 2 - num_generations = 2 + num_generations = 4 # For testing, we can use a very small number of steps if grpo_max_steps > 0: @@ -397,7 +397,7 @@ def train(self, train_data, device, args): output_dir=str(self.checkpoint_dir / "grpo"), per_device_train_batch_size=grpo_batch_size, gradient_accumulation_steps=gradient_accumulation_steps, - max_completion_length=256, + max_completion_length=512, num_generations=num_generations, # Adjusted based on effective batch size num_train_epochs=grpo_num_epochs if grpo_max_steps <= 0 else 1, # Use 1 epoch if max_steps is set max_steps=grpo_max_steps if grpo_max_steps > 0 else -1, # Override epochs with max_steps @@ -418,8 +418,8 @@ def train(self, train_data, device, args): top_k=50, repetition_penalty=1.1, epsilon=0.2, - beta=0.0, - optim="sgd", + beta=0.1, + #optim="sgd", ) self.log(f"GRPO Config - bf16: {use_bf16}, fp16: {not use_bf16}, batch_size: {grpo_batch_size}") @@ -440,7 +440,7 @@ def train(self, train_data, device, args): "pad_token_id": fresh_tokenizer.eos_token_id, "eos_token_id": fresh_tokenizer.eos_token_id, "bos_token_id": fresh_tokenizer.bos_token_id, - "max_new_tokens": 256, + "max_new_tokens": 512, "length_penalty": 1.0, # Neutral length penalty } From f0555a81b9bf9e0e3944e90ff83f3897b8d23028 Mon Sep 17 00:00:00 2001 From: Marcos Villagra Date: Fri, 1 Aug 2025 17:26:16 +0000 Subject: [PATCH 129/168] Update GRPO configuration for multi-client setup in grpo_gsm8k_test_config.yaml - Increased `client_num_in_total` and `client_num_per_round` from 1 to 4 to enable a multi-client testing environment. --- .../fedllm/fedml_config/grpo_gsm8k_test_config.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/spotlight_prj/fedllm/fedml_config/grpo_gsm8k_test_config.yaml b/python/spotlight_prj/fedllm/fedml_config/grpo_gsm8k_test_config.yaml index 96d44e8a1..ad6a0681e 100644 --- a/python/spotlight_prj/fedllm/fedml_config/grpo_gsm8k_test_config.yaml +++ b/python/spotlight_prj/fedllm/fedml_config/grpo_gsm8k_test_config.yaml @@ -27,8 +27,8 @@ train_args: federated_optimizer: "FedAvg" client_optimizer: "adamw_torch" server_optimizer: "FedAvg" - client_num_in_total: 1 # Single client setup - client_num_per_round: 1 # Single client setup + client_num_in_total: 4 # Single client setup + client_num_per_round: 4 # Single client setup comm_round: 30 # Reduced to 3 rounds for testing # GRPO-specific settings for testing grpo_max_steps: 50 From e84b11f529d3c4cb1fc92706fc6417844df95d70 Mon Sep 17 00:00:00 2001 From: Marcos Villagra Date: Fri, 1 Aug 2025 17:53:49 +0000 Subject: [PATCH 130/168] Update model configuration in GRPO test files to use Qwen3-0.6B - Changed `model_name_or_path` from "Qwen/Qwen3-1.7B" to "Qwen/Qwen3-0.6B" in both `grpo_gsm8k_test_config.yaml` and `save_initial_checkpoint.py` for consistency in model usage. 
--- .../fedllm/fedml_config/grpo_gsm8k_test_config.yaml | 2 +- python/spotlight_prj/fedllm/scripts/save_initial_checkpoint.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/python/spotlight_prj/fedllm/fedml_config/grpo_gsm8k_test_config.yaml b/python/spotlight_prj/fedllm/fedml_config/grpo_gsm8k_test_config.yaml index ad6a0681e..ed0285607 100644 --- a/python/spotlight_prj/fedllm/fedml_config/grpo_gsm8k_test_config.yaml +++ b/python/spotlight_prj/fedllm/fedml_config/grpo_gsm8k_test_config.yaml @@ -18,7 +18,7 @@ data_args: model_args: skip_log_model_net: True - model_name_or_path: "Qwen/Qwen3-1.7B" + model_name_or_path: "Qwen/Qwen3-0.6B" model_dtype: "bfloat16" peft_type: "none" # Full model fine-tuning use_flash_attention: False diff --git a/python/spotlight_prj/fedllm/scripts/save_initial_checkpoint.py b/python/spotlight_prj/fedllm/scripts/save_initial_checkpoint.py index 2b00909dd..be46d17f4 100644 --- a/python/spotlight_prj/fedllm/scripts/save_initial_checkpoint.py +++ b/python/spotlight_prj/fedllm/scripts/save_initial_checkpoint.py @@ -9,7 +9,7 @@ # Configuration RUN_ID = os.environ.get("RUN_ID", "test_run") -MODEL_NAME = "Qwen/Qwen3-1.7B" +MODEL_NAME = "Qwen/Qwen3-0.6B" OUTPUT_DIR = f"/workspace/FedML/python/spotlight_prj/fedllm/.logs/FedML/{RUN_ID}/node_0/init" print(f"Saving initial checkpoint for model: {MODEL_NAME}") From 9a8ce669a5adab585c9b51f5230be4c1a6c468b0 Mon Sep 17 00:00:00 2001 From: Marcos Villagra Date: Fri, 1 Aug 2025 18:21:47 +0000 Subject: [PATCH 131/168] Update model configuration and logging settings for GRPO testing - Changed `model_name_or_path` from "Qwen/Qwen3-0.6B" to "Qwen/Qwen3-1.7B" in `grpo_gsm8k_test_config.yaml` and `save_initial_checkpoint.py` for consistency. - Updated `log_completions` setting in `FullModelLLMTrainer` from `True` to `False` to reduce logging verbosity during training. 
--- python/spotlight_prj/fedllm/custom_trainer.py | 2 +- .../fedllm/fedml_config/grpo_gsm8k_test_config.yaml | 6 +++--- .../spotlight_prj/fedllm/scripts/save_initial_checkpoint.py | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/python/spotlight_prj/fedllm/custom_trainer.py b/python/spotlight_prj/fedllm/custom_trainer.py index 83da43fd8..0c7c923f1 100644 --- a/python/spotlight_prj/fedllm/custom_trainer.py +++ b/python/spotlight_prj/fedllm/custom_trainer.py @@ -407,7 +407,7 @@ def train(self, train_data, device, args): gradient_checkpointing=getattr(args, 'gradient_checkpointing', False), #logging_steps=5 if grpo_max_steps > 0 and grpo_max_steps < 50 else 25, # More frequent logging for short runs logging_steps=1, - log_completions=True, + log_completions=False, save_steps=grpo_max_steps if grpo_max_steps > 0 else 500, # Save at the end if using max_steps # Add seed for reproducibility in federated setting seed=int(time.perf_counter_ns() % (2**32)), diff --git a/python/spotlight_prj/fedllm/fedml_config/grpo_gsm8k_test_config.yaml b/python/spotlight_prj/fedllm/fedml_config/grpo_gsm8k_test_config.yaml index ed0285607..f0995c52d 100644 --- a/python/spotlight_prj/fedllm/fedml_config/grpo_gsm8k_test_config.yaml +++ b/python/spotlight_prj/fedllm/fedml_config/grpo_gsm8k_test_config.yaml @@ -18,7 +18,7 @@ data_args: model_args: skip_log_model_net: True - model_name_or_path: "Qwen/Qwen3-0.6B" + model_name_or_path: "Qwen/Qwen3-1.7B" model_dtype: "bfloat16" peft_type: "none" # Full model fine-tuning use_flash_attention: False @@ -27,8 +27,8 @@ train_args: federated_optimizer: "FedAvg" client_optimizer: "adamw_torch" server_optimizer: "FedAvg" - client_num_in_total: 4 # Single client setup - client_num_per_round: 4 # Single client setup + client_num_in_total: 2 # Single client setup + client_num_per_round: 2 # Single client setup comm_round: 30 # Reduced to 3 rounds for testing # GRPO-specific settings for testing grpo_max_steps: 50 diff --git a/python/spotlight_prj/fedllm/scripts/save_initial_checkpoint.py b/python/spotlight_prj/fedllm/scripts/save_initial_checkpoint.py index be46d17f4..2b00909dd 100644 --- a/python/spotlight_prj/fedllm/scripts/save_initial_checkpoint.py +++ b/python/spotlight_prj/fedllm/scripts/save_initial_checkpoint.py @@ -9,7 +9,7 @@ # Configuration RUN_ID = os.environ.get("RUN_ID", "test_run") -MODEL_NAME = "Qwen/Qwen3-0.6B" +MODEL_NAME = "Qwen/Qwen3-1.7B" OUTPUT_DIR = f"/workspace/FedML/python/spotlight_prj/fedllm/.logs/FedML/{RUN_ID}/node_0/init" print(f"Saving initial checkpoint for model: {MODEL_NAME}") From ba26160ea8251540965c1547063739c4989ad517 Mon Sep 17 00:00:00 2001 From: Marcos Villagra Date: Sat, 2 Aug 2025 00:57:40 +0000 Subject: [PATCH 132/168] Enhance validation script for custom model weights and output handling - Added functions to check for weight files and complete checkpoints. - Implemented setup for temporary model directories with custom weights. - Updated argument parsing to support custom weight files and base model configuration. - Improved output filename generation based on model path. - Enhanced evaluation process to track individual example rewards and save them to a file. 
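Usage note (not part of the diff): the rewards file written at the end of a run has one line per evaluated example and one comma-separated reward per rollout, so the summary numbers printed by the script can be reproduced offline. A minimal sketch; the filename below is a placeholder for whatever get_output_filename produced:

    import numpy as np

    # One line per evaluated example, one reward value per rollout.
    with open("model_rewards.csv") as f:          # placeholder filename
        rows = [np.fromstring(line, sep=",") for line in f if line.strip()]

    mean_reward = float(np.mean([r.mean() for r in rows]))  # matches "Average reward"
    best_of_n   = float(np.mean([r.max() for r in rows]))   # matches "Max reward per example average"
    print(f"examples={len(rows)}  mean={mean_reward:.4f}  best-of-n={best_of_n:.4f}")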
--- python/spotlight_prj/fedllm/validation.py | 273 ++++++++++++++++------ 1 file changed, 196 insertions(+), 77 deletions(-) diff --git a/python/spotlight_prj/fedllm/validation.py b/python/spotlight_prj/fedllm/validation.py index 5ffa8347c..ffae86eb2 100644 --- a/python/spotlight_prj/fedllm/validation.py +++ b/python/spotlight_prj/fedllm/validation.py @@ -13,15 +13,28 @@ --rollouts 4 \ --batch-examples 16 \ --num-examples 200 + +# Using custom weights with base model config/tokenizer +python eval_qwen3_gsm8k.py \ + --model /path/to/custom_weights.safetensors \ + --base-model Qwen/Qwen3-0.6B \ + --rollouts 4 \ + --batch-examples 16 \ + --num-examples 200 """ import argparse +import os import re +import shutil +import tempfile import time from fractions import Fraction +from pathlib import Path from typing import Optional, List from datasets import load_dataset # pip install datasets +from transformers import AutoConfig, AutoTokenizer # pip install transformers from vllm import LLM, SamplingParams # pip install vllm # --------------------------- reward configuration --------------------------- @@ -35,6 +48,55 @@ # ------------------------------- utilities --------------------------------- +def is_weight_file(path: str) -> bool: + """Check if path points to a weight file (.bin, .safetensors, .pt, .pth).""" + if not os.path.isfile(path): + return False + return Path(path).suffix.lower() in {'.bin', '.safetensors', '.pt', '.pth'} + + +def is_complete_checkpoint(path: str) -> bool: + """Check if path is a directory containing config.json (indicating a complete checkpoint).""" + if not os.path.isdir(path): + return False + return os.path.exists(os.path.join(path, 'config.json')) + + +def setup_model_with_custom_weights(weight_path: str, base_model: str) -> str: + """ + Create a temporary directory with base model config/tokenizer and custom weights. + Returns the path to the temporary directory. 
+ """ + # Create temporary directory + temp_dir = tempfile.mkdtemp(prefix="qwen_custom_weights_") + + try: + print(f"[INFO] Setting up temporary model directory at {temp_dir}") + print(f"[INFO] Loading config and tokenizer from base model: {base_model}") + + # Download and save config + config = AutoConfig.from_pretrained(base_model, trust_remote_code=True) + config.save_pretrained(temp_dir) + + # Download and save tokenizer + tokenizer = AutoTokenizer.from_pretrained(base_model, trust_remote_code=True) + tokenizer.save_pretrained(temp_dir) + + # Copy weight file to temporary directory + weight_filename = Path(weight_path).name + dest_weight_path = os.path.join(temp_dir, weight_filename) + print(f"[INFO] Copying weights from {weight_path} to {dest_weight_path}") + shutil.copy2(weight_path, dest_weight_path) + + print(f"[INFO] Custom model setup complete in {temp_dir}") + return temp_dir + + except Exception as e: + # Clean up on error + shutil.rmtree(temp_dir, ignore_errors=True) + raise RuntimeError(f"Failed to setup custom model: {e}") + + def to_number(text: str) -> Optional[float]: """Convert string to float if possible, handling simple fractions.""" text = text.replace(",", "").strip() @@ -74,6 +136,26 @@ def batched(lst: List, n: int): yield lst[i:i + n] +def get_output_filename(model_path: str) -> str: + """Generate a filesystem-safe filename based on the model path.""" + if model_path is None: + return "Qwen_Qwen3-0.6B_rewards.csv" + + # Extract meaningful name from different model path formats + if "/" in model_path: + # HuggingFace model ID (e.g., "Qwen/Qwen3-0.6B") or file path + name = model_path.split("/")[-1] + if "." in name: # Remove file extension for weight files + name = Path(name).stem + else: + name = model_path + + # Replace invalid filename characters + name = re.sub(r'[<>:"/\\|?*]', '_', name) + + return f"{name}_rewards.csv" + + # ------------------------------- main -------------------------------------- def parse_args() -> argparse.Namespace: @@ -82,24 +164,20 @@ def parse_args() -> argparse.Namespace: help="completions per example (default: 4)") p.add_argument("--batch-examples", type=int, default=16, help="examples per vLLM inference call (default: 2)") - p.add_argument("--num-examples", type=int, default=100, - help="total GSM8K test examples to evaluate (default: 100)") + p.add_argument("--num-examples", type=int, default=-1, + help="total GSM8K test examples to evaluate (default: 100, use -1 for full dataset)") p.add_argument( "--model", default=None, - help=("HF repo ID or local checkpoint dir. " + help=("HF repo ID, local checkpoint dir, or path to weight file(s). " "If omitted, downloads Qwen/Qwen3-0.6B automatically."), ) - - # When loading a locally fine-tuned checkpoint, the tokenizer files are - # often *not* included in the output directory. Allow the user to point - # to an existing tokenizer (typically the original base model on the HF - # Hub) to avoid the `vocab_file is None` error coming from - # `transformers`. - p.add_argument("--tokenizer", - default="Qwen/Qwen3-0.6B", - help=("Tokenizer repo / path (default: Qwen/Qwen3-0.6B). " - "Override if you need a different tokenizer.")) + p.add_argument( + "--base-model", + default="Qwen/Qwen3-0.6B", + help=("Base model for config/tokenizer when using custom weight files. 
" + "Ignored when --model is a full checkpoint directory."), + ) p.add_argument("--max-tokens", type=int, default=512, help="generation length cap (tokens)") p.add_argument("--temperature", type=float, default=0.7) @@ -109,72 +187,113 @@ def parse_args() -> argparse.Namespace: def main() -> None: args = parse_args() + temp_model_dir = None - # ------------------------- resolve model path -------------------------- - if args.model is None: - args.model = "Qwen/Qwen3-0.6B" - print(f"[INFO] No --model given → downloading '{args.model}' " - "from Hugging Face Hub…") - - # ----------------------- initialize LLM & sampler ---------------------- - # Use a fallback tokenizer path if the user provided one; otherwise rely on - # the model path itself. This prevents crashes when the checkpoint - # directory does not contain tokenizer artifacts. - llm = LLM(model=args.model, - tokenizer=args.tokenizer, - trust_remote_code=True, # Qwen uses custom code - dtype="auto") # let vLLM choose BF16 / FP16 / FP32 - - sampler = SamplingParams( - temperature=args.temperature, - top_p=args.top_p, - max_tokens=args.max_tokens, - n=args.rollouts - ) - - # --------------------------- load dataset ----------------------------- - ds = load_dataset("openai/gsm8k", "main", split="test") - ds = ds.shuffle(seed=42).select(range(min(args.num_examples, len(ds)))) - - total_reward = 0.0 - total_completions = len(ds) * args.rollouts - - # --------------------------- evaluation ------------------------------- - print(f"[INFO] Starting generation for {len(ds)} examples in batches of {args.batch_examples}...") - start_time = time.time() - - batch_count = 0 - for batch in batched(list(ds), args.batch_examples): - batch_start = time.time() - prompts = [ex["question"] for ex in batch] # **raw questions only** - outputs = llm.generate(prompts, sampler) - batch_end = time.time() + try: + # ------------------------- resolve model path -------------------------- + original_model_arg = args.model # Store original for filename + if args.model is None: + args.model = "Qwen/Qwen3-0.6B" + print(f"[INFO] No --model given → downloading '{args.model}' " + "from Hugging Face Hub…") + elif is_weight_file(args.model): + print(f"[INFO] Detected weight file: {args.model}") + print(f"[INFO] Using base model: {args.base_model}") + temp_model_dir = setup_model_with_custom_weights(args.model, args.base_model) + args.model = temp_model_dir + elif is_complete_checkpoint(args.model): + print(f"[INFO] Using complete checkpoint directory: {args.model}") + else: + # Assume it's a HuggingFace model ID + print(f"[INFO] Using HuggingFace model: {args.model}") + + # ----------------------- initialize LLM & sampler ---------------------- + llm = LLM(model=args.model, + trust_remote_code=True, # Qwen uses custom code + dtype="bfloat16") # let vLLM choose BF16 / FP16 / FP32 + + sampler = SamplingParams( + temperature=args.temperature, + top_p=args.top_p, + max_tokens=args.max_tokens, + n=args.rollouts, + seed=42 + ) + + # --------------------------- load dataset ----------------------------- + ds = load_dataset("openai/gsm8k", "main", split="test") + if args.num_examples == -1: + # Use full dataset without shuffling + print(f"[INFO] Using full dataset ({len(ds)} examples)") + else: + # Shuffle and select specified number of examples + num_to_select = min(args.num_examples, len(ds)) + ds = ds.shuffle(seed=42).select(range(num_to_select)) + print(f"[INFO] Using {len(ds)} examples (shuffled)") + + total_reward = 0.0 + total_completions = len(ds) * args.rollouts + 
all_example_rewards = [] # Track all rollout rewards for each example + + # --------------------------- evaluation ------------------------------- + print(f"[INFO] Starting generation for {len(ds)} examples in batches of {args.batch_examples}...") + start_time = time.time() - batch_count += 1 - batch_time = batch_end - batch_start - print(f"[TIMING] Batch {batch_count} ({len(batch)} examples): {batch_time:.2f}s") - - for ex, gen in zip(batch, outputs): - gold = ex["answer"].split("####")[-1].strip() - for out in gen.outputs: - pred = extract_boxed(out.text) - total_reward += reward(pred, gold) - - end_time = time.time() - total_generation_time = end_time - start_time - - avg_reward = total_reward / total_completions - print(f"\n[TIMING] Total generation time: {total_generation_time:.2f}s") - print(f"[TIMING] Average time per batch: {total_generation_time / batch_count:.2f}s") - print(f"[TIMING] Average time per example: {total_generation_time / len(ds):.2f}s") - print(f"[TIMING] Average time per completion: {total_generation_time / total_completions:.3f}s") - print(f"\nEvaluated {len(ds)} examples × {args.rollouts} rollouts " - f"(batch size = {args.batch_examples}).") - print(f"Average reward: {avg_reward:.4f}") - - # Save average reward to file - with open("avg_reward.txt", "a") as f: - f.write(f"{avg_reward:.4f}\n") + batch_count = 0 + for batch in batched(list(ds), args.batch_examples): + batch_start = time.time() + prompts = [ex["question"] for ex in batch] # **raw questions only** + outputs = llm.generate(prompts, sampler) + batch_end = time.time() + + batch_count += 1 + batch_time = batch_end - batch_start + print(f"[TIMING] Batch {batch_count} ({len(batch)} examples): {batch_time:.2f}s") + + for ex, gen in zip(batch, outputs): + gold = ex["answer"].split("####")[-1].strip() + example_rollout_rewards = [] + for out in gen.outputs: + pred = extract_boxed(out.text) + rollout_reward = reward(pred, gold) + total_reward += rollout_reward + example_rollout_rewards.append(rollout_reward) + + # Store all rollout rewards for this example + all_example_rewards.append(example_rollout_rewards) + + end_time = time.time() + total_generation_time = end_time - start_time + + avg_reward = total_reward / total_completions + print(f"\n[TIMING] Total generation time: {total_generation_time:.2f}s") + print(f"[TIMING] Average time per batch: {total_generation_time / batch_count:.2f}s") + print(f"[TIMING] Average time per example: {total_generation_time / len(ds):.2f}s") + print(f"[TIMING] Average time per completion: {total_generation_time / total_completions:.3f}s") + print(f"\nEvaluated {len(ds)} examples × {args.rollouts} rollouts " + f"(batch size = {args.batch_examples}).") + print(f"Average reward: {avg_reward:.4f}") + + # ------------------------ write rewards to file ------------------------- + output_filename = get_output_filename(original_model_arg) + + with open(output_filename, 'w') as f: + for example_rewards in all_example_rewards: + line = ",".join(str(r) for r in example_rewards) + f.write(line + "\n") + + print(f"[INFO] Individual example rewards written to: {output_filename}") + print(f"[INFO] File contains {len(all_example_rewards)} lines, one per example") + + # Calculate max reward per example for summary stats + max_rewards_per_example = [max(rewards) for rewards in all_example_rewards] + print(f"[INFO] Max reward per example average: {sum(max_rewards_per_example) / len(max_rewards_per_example):.4f}") + + finally: + # Clean up temporary directory if it was created + if 
temp_model_dir and os.path.exists(temp_model_dir): + print(f"[INFO] Cleaning up temporary directory: {temp_model_dir}") + shutil.rmtree(temp_model_dir, ignore_errors=True) if __name__ == "__main__": From 5f19d21df250723200a1d721754ff4d97045eef0 Mon Sep 17 00:00:00 2001 From: Marcos Villagra Date: Sat, 2 Aug 2025 12:48:04 +0000 Subject: [PATCH 133/168] Add paired permutation test script for model reward evaluation - Introduced `stat_test.py` to perform paired permutation tests on model rewards. - Implemented functions to load rewards from files and calculate aggregated values. - Added command-line interface for usage with two model reward files. - Included error handling for file reading and data parsing to ensure robustness. --- python/spotlight_prj/fedllm/stat_test.py | 101 +++++++++++++++++++++++ 1 file changed, 101 insertions(+) create mode 100644 python/spotlight_prj/fedllm/stat_test.py diff --git a/python/spotlight_prj/fedllm/stat_test.py b/python/spotlight_prj/fedllm/stat_test.py new file mode 100644 index 000000000..d9a97b2c9 --- /dev/null +++ b/python/spotlight_prj/fedllm/stat_test.py @@ -0,0 +1,101 @@ +#!/usr/bin/env python3 +""" +perm_test_files.py – Paired permutation test for model rewards (multi-rollout format) + +File format +----------- +Each line = one evaluation question. +Each line contains comma-separated rewards (0, 1.5, 2) – one per rollout. + +Example (Three questions, two rollouts each): +2,1.5 +2,2 +0,0 + +Usage +----- +python perm_test_files.py +""" +from __future__ import annotations +import sys +import pathlib +import numpy as np +from typing import Callable + +# ---------------------------------------------------------------------- +# CONFIGURATION: choose how to collapse multiple roll-outs into one value. +# ---------------------------------------------------------------------- +AGG_FUNC: Callable[[np.ndarray], float] = np.mean # or np.max, etc. + +# ---------------------------------------------------------------------- +def load_rewards(path: str | pathlib.Path) -> np.ndarray: + """ + Read *path* and return a 1-D array of per-question aggregated rewards. + Each line is split on commas, converted to floats, then collapsed with AGG_FUNC. + """ + try: + lines = pathlib.Path(path).read_text().strip().splitlines() + except OSError as err: + sys.exit(f"Error reading '{path}': {err}") + + if not lines: + sys.exit(f"Error: '{path}' is empty.") + + per_question = [] + for lineno, line in enumerate(lines, start=1): + if not line.strip(): + sys.exit(f"Error: blank line at {path}:{lineno}.") + try: + values = np.fromstring(line, sep=",", dtype=float) + except ValueError as err: + sys.exit(f"Error parsing numbers in '{path}' line {lineno}: {err}") + if values.size == 0: + sys.exit(f"Error: no numeric values in '{path}' line {lineno}.") + per_question.append(AGG_FUNC(values)) + + return np.asarray(per_question, dtype=float) + + +def permutation_test(rA: np.ndarray, + rB: np.ndarray, + B: int = 100_000, + seed: int = 42) -> tuple[float, float]: + """ + Paired permutation test on per-question reward differences. 
+ + Returns + ------- + gap : float mean(rA − rB) + p_two_sided : float permutation p-value + """ + d = rA - rB + gap = d.mean() + + rng = np.random.default_rng(seed) + signs = rng.choice([1, -1], size=(B, d.size)) + perm_gaps = (signs * d).mean(axis=1) + p_two_sided = (np.abs(perm_gaps) >= abs(gap)).mean() + return gap, p_two_sided + + +def main() -> None: + if len(sys.argv) != 3: + print("Usage: python perm_test_files.py ") + sys.exit(1) + + rA = load_rewards(sys.argv[1]) + rB = load_rewards(sys.argv[2]) + + if rA.size != rB.size: + sys.exit("Error: the two files contain different numbers of questions.") + + gap, p = permutation_test(rA, rB) + + print(f"# questions : {rA.size}") + print(f"Aggregation over roll-outs : {AGG_FUNC.__name__}") + print(f"Mean reward difference (A-B) : {gap:.6f}") + print(f"Two-sided permutation p-value : {p:.6g}") + + +if __name__ == "__main__": + main() \ No newline at end of file From 655bf9e6aea208fe452eb325a12aaaf6230319ab Mon Sep 17 00:00:00 2001 From: Marcos Villagra Date: Sat, 2 Aug 2025 12:48:40 +0000 Subject: [PATCH 134/168] Update GRPO configuration for reduced batch size and completion length - Changed `num_generations` from 4 to 2 in `FullModelLLMTrainer` to optimize training. - Reduced `max_completion_length` and `max_new_tokens` from 512 to 256 for improved response handling. - Updated `grpo_batch_size` from 2 to 1 in `grpo_gsm8k_test_config.yaml` for faster testing efficiency. - Adjusted `client_num_in_total` and `client_num_per_round` from 2 to 1 for a single client setup. --- python/spotlight_prj/fedllm/custom_trainer.py | 6 +++--- .../fedllm/fedml_config/grpo_gsm8k_test_config.yaml | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/python/spotlight_prj/fedllm/custom_trainer.py b/python/spotlight_prj/fedllm/custom_trainer.py index 0c7c923f1..05e340f24 100644 --- a/python/spotlight_prj/fedllm/custom_trainer.py +++ b/python/spotlight_prj/fedllm/custom_trainer.py @@ -307,7 +307,7 @@ def train(self, train_data, device, args): else: num_generations = 2 - num_generations = 4 + num_generations = 2 # For testing, we can use a very small number of steps if grpo_max_steps > 0: @@ -397,7 +397,7 @@ def train(self, train_data, device, args): output_dir=str(self.checkpoint_dir / "grpo"), per_device_train_batch_size=grpo_batch_size, gradient_accumulation_steps=gradient_accumulation_steps, - max_completion_length=512, + max_completion_length=256, num_generations=num_generations, # Adjusted based on effective batch size num_train_epochs=grpo_num_epochs if grpo_max_steps <= 0 else 1, # Use 1 epoch if max_steps is set max_steps=grpo_max_steps if grpo_max_steps > 0 else -1, # Override epochs with max_steps @@ -440,7 +440,7 @@ def train(self, train_data, device, args): "pad_token_id": fresh_tokenizer.eos_token_id, "eos_token_id": fresh_tokenizer.eos_token_id, "bos_token_id": fresh_tokenizer.bos_token_id, - "max_new_tokens": 512, + "max_new_tokens": 256, "length_penalty": 1.0, # Neutral length penalty } diff --git a/python/spotlight_prj/fedllm/fedml_config/grpo_gsm8k_test_config.yaml b/python/spotlight_prj/fedllm/fedml_config/grpo_gsm8k_test_config.yaml index f0995c52d..d0c3108bc 100644 --- a/python/spotlight_prj/fedllm/fedml_config/grpo_gsm8k_test_config.yaml +++ b/python/spotlight_prj/fedllm/fedml_config/grpo_gsm8k_test_config.yaml @@ -27,13 +27,13 @@ train_args: federated_optimizer: "FedAvg" client_optimizer: "adamw_torch" server_optimizer: "FedAvg" - client_num_in_total: 2 # Single client setup - client_num_per_round: 2 # Single client 
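Related usage note (not part of the diff): the per-example reward files produced by validation.py are exactly the input format expected by the stat_test.py script added above, so two checkpoints can be compared directly. A minimal sketch, run from the fedllm directory; both filenames are placeholders:

    from stat_test import load_rewards, permutation_test

    r_a = load_rewards("baseline_rewards.csv")    # placeholder: rewards for model A
    r_b = load_rewards("finetuned_rewards.csv")   # placeholder: rewards for model B

    gap, p_value = permutation_test(r_a, r_b, B=100_000, seed=42)
    print(f"mean reward difference (A-B): {gap:.6f}, two-sided p-value: {p_value:.6g}")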
setup + client_num_in_total: 1 # Single client setup + client_num_per_round: 1 # Single client setup comm_round: 30 # Reduced to 3 rounds for testing # GRPO-specific settings for testing grpo_max_steps: 50 grpo_num_epochs: 2 # Ignored when grpo_max_steps > 0 - grpo_batch_size: 2 # Smaller batch size for faster testing + grpo_batch_size: 1 # Smaller batch size for faster testing # FedML training settings (ignored when using GRPO) local_num_train_epochs: 1 local_max_steps: -1 From 77cd14bd996540ef5a7646852ae356a80978588f Mon Sep 17 00:00:00 2001 From: Marcos Villagra Date: Sat, 2 Aug 2025 12:52:25 +0000 Subject: [PATCH 135/168] Enable SGD optimization in GRPO configuration for FullModelLLMTrainer --- python/spotlight_prj/fedllm/custom_trainer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/spotlight_prj/fedllm/custom_trainer.py b/python/spotlight_prj/fedllm/custom_trainer.py index 05e340f24..51715b879 100644 --- a/python/spotlight_prj/fedllm/custom_trainer.py +++ b/python/spotlight_prj/fedllm/custom_trainer.py @@ -419,7 +419,7 @@ def train(self, train_data, device, args): repetition_penalty=1.1, epsilon=0.2, beta=0.1, - #optim="sgd", + optim="sgd", ) self.log(f"GRPO Config - bf16: {use_bf16}, fp16: {not use_bf16}, batch_size: {grpo_batch_size}") From 5622116278ae868d578b5333f650848fc09d2dfe Mon Sep 17 00:00:00 2001 From: Marcos Villagra Date: Sat, 2 Aug 2025 13:13:21 +0000 Subject: [PATCH 136/168] Reduce max completion length and new tokens in FullModelLLMTrainer from 256 to 200 for optimized response handling. --- python/spotlight_prj/fedllm/custom_trainer.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/spotlight_prj/fedllm/custom_trainer.py b/python/spotlight_prj/fedllm/custom_trainer.py index 51715b879..69a50743b 100644 --- a/python/spotlight_prj/fedllm/custom_trainer.py +++ b/python/spotlight_prj/fedllm/custom_trainer.py @@ -397,7 +397,7 @@ def train(self, train_data, device, args): output_dir=str(self.checkpoint_dir / "grpo"), per_device_train_batch_size=grpo_batch_size, gradient_accumulation_steps=gradient_accumulation_steps, - max_completion_length=256, + max_completion_length=200, num_generations=num_generations, # Adjusted based on effective batch size num_train_epochs=grpo_num_epochs if grpo_max_steps <= 0 else 1, # Use 1 epoch if max_steps is set max_steps=grpo_max_steps if grpo_max_steps > 0 else -1, # Override epochs with max_steps @@ -440,7 +440,7 @@ def train(self, train_data, device, args): "pad_token_id": fresh_tokenizer.eos_token_id, "eos_token_id": fresh_tokenizer.eos_token_id, "bos_token_id": fresh_tokenizer.bos_token_id, - "max_new_tokens": 256, + "max_new_tokens": 200, "length_penalty": 1.0, # Neutral length penalty } From aca1d325f7f12a4943a3756a6ddb0e34c22175a8 Mon Sep 17 00:00:00 2001 From: Marcos Villagra Date: Sat, 2 Aug 2025 13:23:41 +0000 Subject: [PATCH 137/168] Reduce max completion length and new tokens in FullModelLLMTrainer from 200 to 128 for improved response handling. 
--- python/spotlight_prj/fedllm/custom_trainer.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/spotlight_prj/fedllm/custom_trainer.py b/python/spotlight_prj/fedllm/custom_trainer.py index 69a50743b..278fbeace 100644 --- a/python/spotlight_prj/fedllm/custom_trainer.py +++ b/python/spotlight_prj/fedllm/custom_trainer.py @@ -397,7 +397,7 @@ def train(self, train_data, device, args): output_dir=str(self.checkpoint_dir / "grpo"), per_device_train_batch_size=grpo_batch_size, gradient_accumulation_steps=gradient_accumulation_steps, - max_completion_length=200, + max_completion_length=128, num_generations=num_generations, # Adjusted based on effective batch size num_train_epochs=grpo_num_epochs if grpo_max_steps <= 0 else 1, # Use 1 epoch if max_steps is set max_steps=grpo_max_steps if grpo_max_steps > 0 else -1, # Override epochs with max_steps @@ -440,7 +440,7 @@ def train(self, train_data, device, args): "pad_token_id": fresh_tokenizer.eos_token_id, "eos_token_id": fresh_tokenizer.eos_token_id, "bos_token_id": fresh_tokenizer.bos_token_id, - "max_new_tokens": 200, + "max_new_tokens": 128, "length_penalty": 1.0, # Neutral length penalty } From 140fbdab19aa35ce744deed49e8fe2c4ac036aca Mon Sep 17 00:00:00 2001 From: Marcos Villagra Date: Sat, 2 Aug 2025 13:26:16 +0000 Subject: [PATCH 138/168] Update max completion length and new tokens in FullModelLLMTrainer from 128 to 256 for improved response handling. --- python/spotlight_prj/fedllm/custom_trainer.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/python/spotlight_prj/fedllm/custom_trainer.py b/python/spotlight_prj/fedllm/custom_trainer.py index 278fbeace..042759637 100644 --- a/python/spotlight_prj/fedllm/custom_trainer.py +++ b/python/spotlight_prj/fedllm/custom_trainer.py @@ -87,7 +87,7 @@ def disable_dropout_in_model(model: torch.nn.Module) -> None: class TimedGRPOTrainer(GRPOTrainer): - """ + def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) @@ -132,7 +132,7 @@ def __init__(self, *args, **kwargs): self.add_callback(SyncRefModelCallback(ref_model=self.ref_model, accelerator=self.accelerator)) #self.ref_model.to('cpu') - """ + def _record_step_stats(self, stats): # ------------------------------------------------------------- @@ -397,7 +397,7 @@ def train(self, train_data, device, args): output_dir=str(self.checkpoint_dir / "grpo"), per_device_train_batch_size=grpo_batch_size, gradient_accumulation_steps=gradient_accumulation_steps, - max_completion_length=128, + max_completion_length=256, num_generations=num_generations, # Adjusted based on effective batch size num_train_epochs=grpo_num_epochs if grpo_max_steps <= 0 else 1, # Use 1 epoch if max_steps is set max_steps=grpo_max_steps if grpo_max_steps > 0 else -1, # Override epochs with max_steps @@ -440,7 +440,7 @@ def train(self, train_data, device, args): "pad_token_id": fresh_tokenizer.eos_token_id, "eos_token_id": fresh_tokenizer.eos_token_id, "bos_token_id": fresh_tokenizer.bos_token_id, - "max_new_tokens": 128, + "max_new_tokens": 256, "length_penalty": 1.0, # Neutral length penalty } From 52f61224550b7536ab8cb1b80bf4a1cffee0e9a4 Mon Sep 17 00:00:00 2001 From: Marcos Villagra Date: Mon, 4 Aug 2025 19:52:45 +0000 Subject: [PATCH 139/168] Increase max completion length and new tokens in FullModelLLMTrainer from 256 to 512 for enhanced response handling. 
--- python/spotlight_prj/fedllm/custom_trainer.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/spotlight_prj/fedllm/custom_trainer.py b/python/spotlight_prj/fedllm/custom_trainer.py index 042759637..68bc8fa25 100644 --- a/python/spotlight_prj/fedllm/custom_trainer.py +++ b/python/spotlight_prj/fedllm/custom_trainer.py @@ -397,7 +397,7 @@ def train(self, train_data, device, args): output_dir=str(self.checkpoint_dir / "grpo"), per_device_train_batch_size=grpo_batch_size, gradient_accumulation_steps=gradient_accumulation_steps, - max_completion_length=256, + max_completion_length=512, num_generations=num_generations, # Adjusted based on effective batch size num_train_epochs=grpo_num_epochs if grpo_max_steps <= 0 else 1, # Use 1 epoch if max_steps is set max_steps=grpo_max_steps if grpo_max_steps > 0 else -1, # Override epochs with max_steps @@ -440,7 +440,7 @@ def train(self, train_data, device, args): "pad_token_id": fresh_tokenizer.eos_token_id, "eos_token_id": fresh_tokenizer.eos_token_id, "bos_token_id": fresh_tokenizer.bos_token_id, - "max_new_tokens": 256, + "max_new_tokens": 512, "length_penalty": 1.0, # Neutral length penalty } From 133bb781ec134578af140c301421cb53651ad30c Mon Sep 17 00:00:00 2001 From: Marcos Villagra Date: Mon, 4 Aug 2025 20:41:52 +0000 Subject: [PATCH 140/168] Refactor reference model loading in TimedGRPOTrainer to use AutoModelForCausalLM for improved flexibility and evaluation. Removed deprecated torch_dtype handling and ensured dropout is disabled for the reference model. --- python/spotlight_prj/fedllm/custom_trainer.py | 46 +++---------------- 1 file changed, 7 insertions(+), 39 deletions(-) diff --git a/python/spotlight_prj/fedllm/custom_trainer.py b/python/spotlight_prj/fedllm/custom_trainer.py index 68bc8fa25..3923f60e1 100644 --- a/python/spotlight_prj/fedllm/custom_trainer.py +++ b/python/spotlight_prj/fedllm/custom_trainer.py @@ -63,7 +63,7 @@ def on_train_begin(self, args, state, control, **kwargs): from fractions import Fraction # New import for TrainerCallback -from transformers import TrainerCallback, AutoConfig +from transformers import TrainerCallback, AutoConfig, AutoModelForCausalLM import transformers import wandb @@ -91,45 +91,13 @@ class TimedGRPOTrainer(GRPOTrainer): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) - - model_init_kwargs = kwargs.get('args', GRPOConfig()).model_init_kwargs or {} - torch_dtype = model_init_kwargs.get("torch_dtype") - if isinstance(torch_dtype, torch.dtype) or torch_dtype == "auto" or torch_dtype is None: - pass # torch_dtype is already a torch.dtype or "auto" or None - elif isinstance(torch_dtype, str): # it's a str, but not "auto" - torch_dtype = getattr(torch, torch_dtype) - model_init_kwargs["torch_dtype"] = torch_dtype - else: - raise ValueError( - "Invalid `torch_dtype` passed to `GRPOConfig`. Expected either 'auto' or a string representing " - f"a `torch.dtype` (e.g., 'float32'), but got {torch_dtype}." 
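Background note (not part of the diff): the frozen reference model only matters because beta was set to a non-zero value earlier in this series; GRPO-style trainers add a per-token KL penalty between the policy and the reference. The sketch below shows one common estimator for that term and is illustrative only; the exact form used is whatever the installed TRL version implements.

    import torch

    def per_token_kl(policy_logps: torch.Tensor, ref_logps: torch.Tensor) -> torch.Tensor:
        # k3 estimator of KL(policy || ref): exp(ref - pol) - (ref - pol) - 1.
        # Non-negative, low variance, computed per completion token.
        diff = ref_logps - policy_logps
        return torch.exp(diff) - diff - 1.0

    # The trainer subtracts beta * per_token_kl(...) from the advantage-weighted
    # objective (beta = 0.1 in this configuration), which is why the reference
    # model is kept frozen, in eval mode, and with dropout disabled.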
- ) - - # Reference model - if self.beta != 0.0: - # For deepspeed, fsdp or non-distributed models, create a reference model from scratch - config = AutoConfig.from_pretrained("Qwen/Qwen3-1.7B-GPTQ-Int8") - architecture = getattr(transformers, config.architectures[0]) - self.ref_model = architecture.from_pretrained("Qwen/Qwen3-1.7B-GPTQ-Int8", **model_init_kwargs) - - # Disable dropout in the models - if getattr(self.args, "disable_dropout", False): - disable_dropout_in_model(self.model) - if self.ref_model is not None: - disable_dropout_in_model(self.ref_model) - if self.ref_model is not None: - if self.is_deepspeed_enabled: - # Prepare reference model under DeepSpeed when enabled - per_device_bs = getattr(self.args, 'per_device_train_batch_size', 1) - self.ref_model = prepare_deepspeed(self.ref_model, per_device_bs, fp16=getattr(self.args, 'fp16', False), bf16=getattr(self.args, 'bf16', False)) - elif self.is_fsdp_enabled: - self.ref_model = prepare_fsdp(self.ref_model, self.accelerator) - else: - self.ref_model = self.accelerator.prepare_model(self.ref_model, evaluation_mode=True) - - if getattr(self.args, "sync_ref_model", False): - self.add_callback(SyncRefModelCallback(ref_model=self.ref_model, accelerator=self.accelerator)) + # Load any model you like as the reference baseline + self.ref_model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen3-1.7B-GPTQ-Int8") + self.ref_model.eval() + disable_dropout_in_model(self.ref_model) + for p in self.ref_model.parameters(): + p.requires_grad_(False) #self.ref_model.to('cpu') From 83f7051c6b07b9969073f1157a8580ae096b8426 Mon Sep 17 00:00:00 2001 From: Marcos Villagra Date: Mon, 4 Aug 2025 20:54:16 +0000 Subject: [PATCH 141/168] =?UTF-8?q?Move=20reference=20model=20to=20the=20s?= =?UTF-8?q?ame=20device=20as=20the=20policy=20in=20TimedGRPOTrainer=20to?= =?UTF-8?q?=20avoid=20CPU=E2=86=94GPU=20mismatch.=20Retain=20commented=20l?= =?UTF-8?q?ine=20for=20CPU=20off-loading=20during=20debugging.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- python/spotlight_prj/fedllm/custom_trainer.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/python/spotlight_prj/fedllm/custom_trainer.py b/python/spotlight_prj/fedllm/custom_trainer.py index 3923f60e1..1b25d2059 100644 --- a/python/spotlight_prj/fedllm/custom_trainer.py +++ b/python/spotlight_prj/fedllm/custom_trainer.py @@ -96,10 +96,16 @@ def __init__(self, *args, **kwargs): self.ref_model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen3-1.7B-GPTQ-Int8") self.ref_model.eval() disable_dropout_in_model(self.ref_model) + # Move reference model to the same device as the policy so that + # inputs and weights reside on a single device (avoids CPU↔GPU mismatch). + # `Trainer` already initialises an `accelerator` attribute so we can + # rely on `self.accelerator.device` to pick the correct target. + self.ref_model.to(self.accelerator.device) for p in self.ref_model.parameters(): p.requires_grad_(False) - #self.ref_model.to('cpu') + # Keep the commented line for quick CPU off-loading during debugging + # self.ref_model.to('cpu') def _record_step_stats(self, stats): From 8da9d858869e9077de3dfa81a41423720f8862f7 Mon Sep 17 00:00:00 2001 From: Marcos Villagra Date: Mon, 4 Aug 2025 22:10:27 +0000 Subject: [PATCH 142/168] Update reference model in TimedGRPOTrainer to use Qwen/Qwen3-0.6B for improved performance and compatibility. 
--- python/spotlight_prj/fedllm/custom_trainer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/spotlight_prj/fedllm/custom_trainer.py b/python/spotlight_prj/fedllm/custom_trainer.py index 1b25d2059..1d3dfa0e6 100644 --- a/python/spotlight_prj/fedllm/custom_trainer.py +++ b/python/spotlight_prj/fedllm/custom_trainer.py @@ -93,7 +93,7 @@ def __init__(self, *args, **kwargs): if self.ref_model is not None: # Load any model you like as the reference baseline - self.ref_model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen3-1.7B-GPTQ-Int8") + self.ref_model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen3-0.6B") self.ref_model.eval() disable_dropout_in_model(self.ref_model) # Move reference model to the same device as the policy so that From 0417ee8eab622e3b53ecfbee9c50f8441cfc14d9 Mon Sep 17 00:00:00 2001 From: Marcos Villagra Date: Mon, 4 Aug 2025 22:19:49 +0000 Subject: [PATCH 143/168] Update reference model in TimedGRPOTrainer to use Qwen/Qwen3-0.6B-GPTQ-Int8 for enhanced performance and compatibility. --- python/spotlight_prj/fedllm/custom_trainer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/spotlight_prj/fedllm/custom_trainer.py b/python/spotlight_prj/fedllm/custom_trainer.py index 1d3dfa0e6..ae46091d0 100644 --- a/python/spotlight_prj/fedllm/custom_trainer.py +++ b/python/spotlight_prj/fedllm/custom_trainer.py @@ -93,7 +93,7 @@ def __init__(self, *args, **kwargs): if self.ref_model is not None: # Load any model you like as the reference baseline - self.ref_model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen3-0.6B") + self.ref_model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen3-0.6B-GPTQ-Int8") self.ref_model.eval() disable_dropout_in_model(self.ref_model) # Move reference model to the same device as the policy so that From 2329b2ccb583c738238418d1eea1dd6bef97aac5 Mon Sep 17 00:00:00 2001 From: Marcos Villagra Date: Mon, 4 Aug 2025 22:30:06 +0000 Subject: [PATCH 144/168] Reduce max completion length and new tokens in FullModelLLMTrainer from 512 to 256 for optimized response handling. --- python/spotlight_prj/fedllm/custom_trainer.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/spotlight_prj/fedllm/custom_trainer.py b/python/spotlight_prj/fedllm/custom_trainer.py index ae46091d0..8a777f359 100644 --- a/python/spotlight_prj/fedllm/custom_trainer.py +++ b/python/spotlight_prj/fedllm/custom_trainer.py @@ -371,7 +371,7 @@ def train(self, train_data, device, args): output_dir=str(self.checkpoint_dir / "grpo"), per_device_train_batch_size=grpo_batch_size, gradient_accumulation_steps=gradient_accumulation_steps, - max_completion_length=512, + max_completion_length=256, num_generations=num_generations, # Adjusted based on effective batch size num_train_epochs=grpo_num_epochs if grpo_max_steps <= 0 else 1, # Use 1 epoch if max_steps is set max_steps=grpo_max_steps if grpo_max_steps > 0 else -1, # Override epochs with max_steps @@ -414,7 +414,7 @@ def train(self, train_data, device, args): "pad_token_id": fresh_tokenizer.eos_token_id, "eos_token_id": fresh_tokenizer.eos_token_id, "bos_token_id": fresh_tokenizer.bos_token_id, - "max_new_tokens": 512, + "max_new_tokens": 256, "length_penalty": 1.0, # Neutral length penalty } From f5344ad5acf267c3d1dafbf3109ffc594d8c1085 Mon Sep 17 00:00:00 2001 From: Marcos Villagra Date: Mon, 4 Aug 2025 22:54:34 +0000 Subject: [PATCH 145/168] Increase max completion length and new tokens in FullModelLLMTrainer from 256 to 512 for enhanced response handling. 
--- python/spotlight_prj/fedllm/custom_trainer.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/spotlight_prj/fedllm/custom_trainer.py b/python/spotlight_prj/fedllm/custom_trainer.py index 8a777f359..ae46091d0 100644 --- a/python/spotlight_prj/fedllm/custom_trainer.py +++ b/python/spotlight_prj/fedllm/custom_trainer.py @@ -371,7 +371,7 @@ def train(self, train_data, device, args): output_dir=str(self.checkpoint_dir / "grpo"), per_device_train_batch_size=grpo_batch_size, gradient_accumulation_steps=gradient_accumulation_steps, - max_completion_length=256, + max_completion_length=512, num_generations=num_generations, # Adjusted based on effective batch size num_train_epochs=grpo_num_epochs if grpo_max_steps <= 0 else 1, # Use 1 epoch if max_steps is set max_steps=grpo_max_steps if grpo_max_steps > 0 else -1, # Override epochs with max_steps @@ -414,7 +414,7 @@ def train(self, train_data, device, args): "pad_token_id": fresh_tokenizer.eos_token_id, "eos_token_id": fresh_tokenizer.eos_token_id, "bos_token_id": fresh_tokenizer.bos_token_id, - "max_new_tokens": 256, + "max_new_tokens": 512, "length_penalty": 1.0, # Neutral length penalty } From 9d3c9056928179667786112b01f6567e906df6b1 Mon Sep 17 00:00:00 2001 From: Marcos Villagra Date: Tue, 5 Aug 2025 16:54:15 +0000 Subject: [PATCH 146/168] Remove deprecated SyncRefModelCallback implementation from custom_trainer.py and ensure reference model is moved to CPU for consistency in TimedGRPOTrainer. --- python/spotlight_prj/fedllm/custom_trainer.py | 18 +----------------- 1 file changed, 1 insertion(+), 17 deletions(-) diff --git a/python/spotlight_prj/fedllm/custom_trainer.py b/python/spotlight_prj/fedllm/custom_trainer.py index ae46091d0..f85ffbf14 100644 --- a/python/spotlight_prj/fedllm/custom_trainer.py +++ b/python/spotlight_prj/fedllm/custom_trainer.py @@ -34,22 +34,6 @@ def prepare_fsdp(model, accelerator): """Minimal FSDP prep fallback – just use accelerator.prepare_model.""" return accelerator.prepare_model(model, evaluation_mode=True) -# Optional: stub SyncRefModelCallback if not provided upstream -try: - from trl.trainer.callbacks import SyncRefModelCallback # hypothetical future addition -except Exception: - from transformers import TrainerCallback - class SyncRefModelCallback(TrainerCallback): - """Fallback no-op callback used when TRL doesn't ship one. - Simply keeps reference model on correct device and in eval mode. - """ - def __init__(self, ref_model=None, accelerator=None): - self.ref_model = ref_model - self.accelerator = accelerator - def on_train_begin(self, args, state, control, **kwargs): - if self.ref_model is not None and self.accelerator is not None: - self.ref_model.to(self.accelerator.device) - self.ref_model.eval() from fedml.ml.aggregator.agg_operator import FedMLAggOperator from run_fedllm import LLMTrainer, LLMAggregator, save_checkpoint, load_checkpoint @@ -105,7 +89,7 @@ def __init__(self, *args, **kwargs): p.requires_grad_(False) # Keep the commented line for quick CPU off-loading during debugging - # self.ref_model.to('cpu') + self.ref_model.to('cpu') def _record_step_stats(self, stats): From efd2e1884505c890836efbd023f3d620f034ac10 Mon Sep 17 00:00:00 2001 From: Marcos Villagra Date: Tue, 5 Aug 2025 18:08:11 +0000 Subject: [PATCH 147/168] Remove fallback stub for prepare_fsdp in custom_trainer.py to streamline imports and improve code clarity. 
--- python/spotlight_prj/fedllm/custom_trainer.py | 8 -------- 1 file changed, 8 deletions(-) diff --git a/python/spotlight_prj/fedllm/custom_trainer.py b/python/spotlight_prj/fedllm/custom_trainer.py index f85ffbf14..6f9f48d59 100644 --- a/python/spotlight_prj/fedllm/custom_trainer.py +++ b/python/spotlight_prj/fedllm/custom_trainer.py @@ -26,14 +26,6 @@ from trl import GRPOTrainer, GRPOConfig from trl.trainer.utils import prepare_deepspeed -# Fallback stub if prepare_fsdp is unavailable in current TRL version -try: - from trl.trainer.utils import prepare_fsdp # type: ignore -except ImportError: # pragma: no cover - def prepare_fsdp(model, accelerator): - """Minimal FSDP prep fallback – just use accelerator.prepare_model.""" - return accelerator.prepare_model(model, evaluation_mode=True) - from fedml.ml.aggregator.agg_operator import FedMLAggOperator from run_fedllm import LLMTrainer, LLMAggregator, save_checkpoint, load_checkpoint From 48ec150e9634a7c9a24a22d6e98038aaecda996a Mon Sep 17 00:00:00 2001 From: Marcos Villagra Date: Tue, 5 Aug 2025 18:33:59 +0000 Subject: [PATCH 148/168] Update reference model in TimedGRPOTrainer to use Qwen/Qwen3-1.7B for improved performance and compatibility. --- python/spotlight_prj/fedllm/custom_trainer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/spotlight_prj/fedllm/custom_trainer.py b/python/spotlight_prj/fedllm/custom_trainer.py index 6f9f48d59..207587d85 100644 --- a/python/spotlight_prj/fedllm/custom_trainer.py +++ b/python/spotlight_prj/fedllm/custom_trainer.py @@ -69,7 +69,7 @@ def __init__(self, *args, **kwargs): if self.ref_model is not None: # Load any model you like as the reference baseline - self.ref_model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen3-0.6B-GPTQ-Int8") + self.ref_model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen3-1.7B") self.ref_model.eval() disable_dropout_in_model(self.ref_model) # Move reference model to the same device as the policy so that From 44e075f938901ebca6cac5eeb18c06e5019c8e96 Mon Sep 17 00:00:00 2001 From: Marcos Villagra Date: Tue, 5 Aug 2025 18:47:09 +0000 Subject: [PATCH 149/168] =?UTF-8?q?Add=20device=20compatibility=20in=20Tim?= =?UTF-8?q?edGRPOTrainer=20by=20moving=20batch=20tensors=20to=20model's=20?= =?UTF-8?q?device=20in=20=5Fget=5Fper=5Ftoken=5Flogps=5Fand=5Fentropies=20?= =?UTF-8?q?method=20to=20prevent=20CPU=E2=86=94GPU=20mismatch=20errors.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- python/spotlight_prj/fedllm/custom_trainer.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/python/spotlight_prj/fedllm/custom_trainer.py b/python/spotlight_prj/fedllm/custom_trainer.py index 207587d85..53fbdebfc 100644 --- a/python/spotlight_prj/fedllm/custom_trainer.py +++ b/python/spotlight_prj/fedllm/custom_trainer.py @@ -141,6 +141,24 @@ def _generate_and_score_completions(self, *args, **kwargs): return result + + + def _get_per_token_logps_and_entropies(self, model, batch, *args, **kwargs): + """Ensure inputs and model are on the same device before delegating to parent impl. + + This override fixes CPU↔GPU mismatch errors when the reference model is kept + on CPU while the policy lives on GPU. We simply move the tensor inputs in + ``batch`` to the device of ``model`` before invoking the upstream helper. 
+ """ + import torch + target_device = next(model.parameters()).device + # Move all tensor values in the batch to the model's device + batch = { + k: (v.to(target_device) if torch.is_tensor(v) else v) + for k, v in batch.items() + } + return super()._get_per_token_logps_and_entropies(model, batch, *args, **kwargs) + class FullModelLLMTrainer(LLMTrainer): """Custom trainer that properly handles both PEFT and non-PEFT models with GRPO training.""" From c08bc3b3388d08bbecf24dfaab32b8dbaaf5a9b8 Mon Sep 17 00:00:00 2001 From: Marcos Villagra Date: Tue, 5 Aug 2025 19:03:34 +0000 Subject: [PATCH 150/168] Enhance batch handling in TimedGRPOTrainer by adding support for single tensor inputs in _get_per_token_logps_and_entropies method, ensuring compatibility with both tensor and mapping types. --- python/spotlight_prj/fedllm/custom_trainer.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/python/spotlight_prj/fedllm/custom_trainer.py b/python/spotlight_prj/fedllm/custom_trainer.py index 53fbdebfc..5d73fd4b7 100644 --- a/python/spotlight_prj/fedllm/custom_trainer.py +++ b/python/spotlight_prj/fedllm/custom_trainer.py @@ -157,6 +157,15 @@ def _get_per_token_logps_and_entropies(self, model, batch, *args, **kwargs): k: (v.to(target_device) if torch.is_tensor(v) else v) for k, v in batch.items() } + if torch.is_tensor(batch): + # Upstream may forward a single Tensor instead of a mapping + batch = batch.to(target_device) + else: + # Standard case: mapping of tensors / non-tensor objects + batch = { + k: (v.to(target_device) if torch.is_tensor(v) else v) + for k, v in batch.items() + } return super()._get_per_token_logps_and_entropies(model, batch, *args, **kwargs) From bfcf77cce27f83cd8b5879cc89f4aa41bf943f78 Mon Sep 17 00:00:00 2001 From: Marcos Villagra Date: Tue, 5 Aug 2025 19:32:18 +0000 Subject: [PATCH 151/168] Refactor batch tensor handling in TimedGRPOTrainer by removing unnecessary tensor conversion logic, simplifying the process of moving inputs to the model's device. --- python/spotlight_prj/fedllm/custom_trainer.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/python/spotlight_prj/fedllm/custom_trainer.py b/python/spotlight_prj/fedllm/custom_trainer.py index 5d73fd4b7..10f037fc3 100644 --- a/python/spotlight_prj/fedllm/custom_trainer.py +++ b/python/spotlight_prj/fedllm/custom_trainer.py @@ -150,13 +150,8 @@ def _get_per_token_logps_and_entropies(self, model, batch, *args, **kwargs): on CPU while the policy lives on GPU. We simply move the tensor inputs in ``batch`` to the device of ``model`` before invoking the upstream helper. """ - import torch target_device = next(model.parameters()).device # Move all tensor values in the batch to the model's device - batch = { - k: (v.to(target_device) if torch.is_tensor(v) else v) - for k, v in batch.items() - } if torch.is_tensor(batch): # Upstream may forward a single Tensor instead of a mapping batch = batch.to(target_device) From 69194e92335888d6e1e649a675d41f5b7bcfbf09 Mon Sep 17 00:00:00 2001 From: Marcos Villagra Date: Tue, 5 Aug 2025 19:51:32 +0000 Subject: [PATCH 152/168] Enhance tensor handling in TimedGRPOTrainer by ensuring log probabilities and entropies are moved to the appropriate device after computation in _get_per_token_logps_and_entropies method, improving device compatibility. 
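Moving the outputs back matters because GRPO later combines policy and reference per-token log-probabilities element-wise, so both tensors must share a device before the KL estimate is formed. A self-contained toy illustration of that combination (the k3-style KL estimate commonly used in GRPO implementations; toy tensors only, not the trainer's actual code path):

    import torch

    logps = -torch.rand(2, 8)      # stand-in for policy per-token log-probs
    ref_logps = -torch.rand(2, 8)  # stand-in for reference per-token log-probs, same device

    # Element-wise ops like this raise a device-mismatch error if one tensor stays on CPU.
    per_token_kl = torch.exp(ref_logps - logps) - (ref_logps - logps) - 1.0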
--- python/spotlight_prj/fedllm/custom_trainer.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/python/spotlight_prj/fedllm/custom_trainer.py b/python/spotlight_prj/fedllm/custom_trainer.py index 10f037fc3..f31330213 100644 --- a/python/spotlight_prj/fedllm/custom_trainer.py +++ b/python/spotlight_prj/fedllm/custom_trainer.py @@ -161,7 +161,13 @@ def _get_per_token_logps_and_entropies(self, model, batch, *args, **kwargs): k: (v.to(target_device) if torch.is_tensor(v) else v) for k, v in batch.items() } - return super()._get_per_token_logps_and_entropies(model, batch, *args, **kwargs) + logps, entropies = super()._get_per_token_logps_and_entropies(model, batch, *args, **kwargs) + + out_device = self.accelerator.device + logps = logps.to(out_device) + entropies = entropies.to(out_device) + + return logps, entropies class FullModelLLMTrainer(LLMTrainer): From 800f566662892d2dcb0cc75ffa6958d7bf740045 Mon Sep 17 00:00:00 2001 From: Marcos Villagra Date: Tue, 5 Aug 2025 20:28:02 +0000 Subject: [PATCH 153/168] Improve tensor device handling in TimedGRPOTrainer by adding checks for tensor types before moving log probabilities and entropies to the appropriate device, enhancing robustness and preventing potential errors. --- python/spotlight_prj/fedllm/custom_trainer.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/python/spotlight_prj/fedllm/custom_trainer.py b/python/spotlight_prj/fedllm/custom_trainer.py index f31330213..2872cae5c 100644 --- a/python/spotlight_prj/fedllm/custom_trainer.py +++ b/python/spotlight_prj/fedllm/custom_trainer.py @@ -163,9 +163,11 @@ def _get_per_token_logps_and_entropies(self, model, batch, *args, **kwargs): } logps, entropies = super()._get_per_token_logps_and_entropies(model, batch, *args, **kwargs) - out_device = self.accelerator.device - logps = logps.to(out_device) - entropies = entropies.to(out_device) + policy_device = self.accelerator.device + if torch.is_tensor(logps): + logps = logps.to(policy_device) + if entropies is not None and torch.is_tensor(entropies): + entropies = entropies.to(policy_device) return logps, entropies From a3d7b7be93890b3b0d4f6cdb34eb716dd74f08fc Mon Sep 17 00:00:00 2001 From: Marcos Villagra Date: Tue, 5 Aug 2025 20:48:12 +0000 Subject: [PATCH 154/168] Refine tensor device alignment in TimedGRPOTrainer by implementing a strategy to move policy outputs to CPU when the reference model is on CPU, ensuring efficient memory usage and preventing GPU memory spikes during rollouts. --- python/spotlight_prj/fedllm/custom_trainer.py | 25 ++++++++++++++++--- 1 file changed, 21 insertions(+), 4 deletions(-) diff --git a/python/spotlight_prj/fedllm/custom_trainer.py b/python/spotlight_prj/fedllm/custom_trainer.py index 2872cae5c..4de84a0ce 100644 --- a/python/spotlight_prj/fedllm/custom_trainer.py +++ b/python/spotlight_prj/fedllm/custom_trainer.py @@ -163,12 +163,29 @@ def _get_per_token_logps_and_entropies(self, model, batch, *args, **kwargs): } logps, entropies = super()._get_per_token_logps_and_entropies(model, batch, *args, **kwargs) + # ------------------------------------------------------------- + # Align output tensors so both policy and reference tensors live + # on the SAME device. + # Strategy: if the reference model is on CPU (its outputs therefore + # on CPU) and the policy is on GPU, we move the *policy* outputs to + # CPU – they are much smaller than the reference logits. This avoids + # a large GPU memory spike (~300 MB per rollout). 
+ # ------------------------------------------------------------- policy_device = self.accelerator.device if torch.is_tensor(logps): - logps = logps.to(policy_device) - if entropies is not None and torch.is_tensor(entropies): - entropies = entropies.to(policy_device) - + if logps.device == policy_device: + # Policy outputs – move to CPU to match reference tensors + logps = logps.cpu() + if entropies is not None and torch.is_tensor(entropies): + entropies = entropies.cpu() + else: + # Reference outputs already on CPU – leave as-is + pass + elif entropies is not None and torch.is_tensor(entropies): + # Rare edge-case where logps is not a tensor but entropies is + if entropies.device == policy_device: + entropies = entropies.cpu() + return logps, entropies From e6e2c15718d132cb34d4d85f8b2d6cb9de9a0da1 Mon Sep 17 00:00:00 2001 From: Marcos Villagra Date: Tue, 5 Aug 2025 21:06:40 +0000 Subject: [PATCH 155/168] Optimize tensor device management in TimedGRPOTrainer by converting log probabilities and entropies to float16 and ensuring they are moved to the appropriate device, enhancing performance and memory efficiency during training. --- python/spotlight_prj/fedllm/custom_trainer.py | 27 +++++-------------- 1 file changed, 6 insertions(+), 21 deletions(-) diff --git a/python/spotlight_prj/fedllm/custom_trainer.py b/python/spotlight_prj/fedllm/custom_trainer.py index 4de84a0ce..fa5086ab1 100644 --- a/python/spotlight_prj/fedllm/custom_trainer.py +++ b/python/spotlight_prj/fedllm/custom_trainer.py @@ -163,29 +163,14 @@ def _get_per_token_logps_and_entropies(self, model, batch, *args, **kwargs): } logps, entropies = super()._get_per_token_logps_and_entropies(model, batch, *args, **kwargs) - # ------------------------------------------------------------- - # Align output tensors so both policy and reference tensors live - # on the SAME device. - # Strategy: if the reference model is on CPU (its outputs therefore - # on CPU) and the policy is on GPU, we move the *policy* outputs to - # CPU – they are much smaller than the reference logits. This avoids - # a large GPU memory spike (~300 MB per rollout). - # ------------------------------------------------------------- policy_device = self.accelerator.device if torch.is_tensor(logps): - if logps.device == policy_device: - # Policy outputs – move to CPU to match reference tensors - logps = logps.cpu() - if entropies is not None and torch.is_tensor(entropies): - entropies = entropies.cpu() - else: - # Reference outputs already on CPU – leave as-is - pass - elif entropies is not None and torch.is_tensor(entropies): - # Rare edge-case where logps is not a tensor but entropies is - if entropies.device == policy_device: - entropies = entropies.cpu() - + logps = logps.to(torch.float16) + logps = logps.to(policy_device) + if entropies is not None and torch.is_tensor(entropies): + entropies = entropies.to(torch.float16) + entropies = entropies.to(policy_device) + return logps, entropies From 92b942e56336d1ed1b425b5df465239f14d35f67 Mon Sep 17 00:00:00 2001 From: Marcos Villagra Date: Tue, 5 Aug 2025 22:20:34 +0000 Subject: [PATCH 156/168] Adjust max completion length and new tokens in FullModelLLMTrainer to 256 for improved performance and resource management during training. 
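The completion-length reduction also shrinks the reference forward pass. The "~300 MB per rollout" figure quoted in an earlier commit comment is roughly the size of the output logits at 512 completion tokens if one assumes a Qwen3-scale vocabulary of about 152k entries and 2-byte (bf16) values; both numbers are assumptions, not values read from the config. A back-of-the-envelope check:

    # Rough per-rollout logits memory for the reference forward pass (assumed vocab size and dtype width).
    batch, completion_len, vocab, bytes_per_value = 2, 512, 151_936, 2
    logits_bytes = batch * completion_len * vocab * bytes_per_value
    print(f"{logits_bytes / 2**20:.0f} MiB")  # about 297 MiB at 512 tokens, about 148 MiB at 256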
--- python/spotlight_prj/fedllm/custom_trainer.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/spotlight_prj/fedllm/custom_trainer.py b/python/spotlight_prj/fedllm/custom_trainer.py index fa5086ab1..ae91bd703 100644 --- a/python/spotlight_prj/fedllm/custom_trainer.py +++ b/python/spotlight_prj/fedllm/custom_trainer.py @@ -379,7 +379,7 @@ def train(self, train_data, device, args): output_dir=str(self.checkpoint_dir / "grpo"), per_device_train_batch_size=grpo_batch_size, gradient_accumulation_steps=gradient_accumulation_steps, - max_completion_length=512, + max_completion_length=256, num_generations=num_generations, # Adjusted based on effective batch size num_train_epochs=grpo_num_epochs if grpo_max_steps <= 0 else 1, # Use 1 epoch if max_steps is set max_steps=grpo_max_steps if grpo_max_steps > 0 else -1, # Override epochs with max_steps @@ -422,7 +422,7 @@ def train(self, train_data, device, args): "pad_token_id": fresh_tokenizer.eos_token_id, "eos_token_id": fresh_tokenizer.eos_token_id, "bos_token_id": fresh_tokenizer.bos_token_id, - "max_new_tokens": 512, + "max_new_tokens": 256, "length_penalty": 1.0, # Neutral length penalty } From 33f6ed217d8803d5694f2e1694213bdae8003fcd Mon Sep 17 00:00:00 2001 From: Marcos Villagra Date: Tue, 5 Aug 2025 22:38:32 +0000 Subject: [PATCH 157/168] Refactor TimedGRPOTrainer by adding docstrings for improved code documentation and clarity, and update max completion length and new tokens in FullModelLLMTrainer to 512 for enhanced training performance. --- python/spotlight_prj/fedllm/custom_trainer.py | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/python/spotlight_prj/fedllm/custom_trainer.py b/python/spotlight_prj/fedllm/custom_trainer.py index ae91bd703..e1b895faf 100644 --- a/python/spotlight_prj/fedllm/custom_trainer.py +++ b/python/spotlight_prj/fedllm/custom_trainer.py @@ -63,7 +63,7 @@ def disable_dropout_in_model(model: torch.nn.Module) -> None: class TimedGRPOTrainer(GRPOTrainer): - + """ def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) @@ -82,6 +82,7 @@ def __init__(self, *args, **kwargs): # Keep the commented line for quick CPU off-loading during debugging self.ref_model.to('cpu') + """ def _record_step_stats(self, stats): @@ -142,14 +143,14 @@ def _generate_and_score_completions(self, *args, **kwargs): return result - + """ def _get_per_token_logps_and_entropies(self, model, batch, *args, **kwargs): - """Ensure inputs and model are on the same device before delegating to parent impl. + Ensure inputs and model are on the same device before delegating to parent impl. This override fixes CPU↔GPU mismatch errors when the reference model is kept on CPU while the policy lives on GPU. We simply move the tensor inputs in ``batch`` to the device of ``model`` before invoking the upstream helper. 
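Beyond restoring the 512-token budget, this commit wraps the custom __init__ and _get_per_token_logps_and_entropies overrides in triple-quoted string literals, so the stock GRPOTrainer code paths apply for this run, and sets beta to 0.0; with beta at zero the KL penalty drops out of the objective and the reference model is effectively unused. Schematically, as a simplification of the per-token GRPO loss shown only to make the role of beta explicit:

    import torch

    advantages = torch.randn(2, 8)   # toy group-relative advantages
    ratio = torch.ones(2, 8)         # toy policy / old-policy probability ratio
    per_token_kl = torch.rand(2, 8)  # toy KL estimate against the reference policy

    beta = 0.0
    per_token_loss = -(advantages * ratio) + beta * per_token_kl  # beta == 0 removes the KL term
    loss = per_token_loss.mean()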
- """ + target_device = next(model.parameters()).device # Move all tensor values in the batch to the model's device if torch.is_tensor(batch): @@ -172,6 +173,7 @@ def _get_per_token_logps_and_entropies(self, model, batch, *args, **kwargs): entropies = entropies.to(policy_device) return logps, entropies + """ class FullModelLLMTrainer(LLMTrainer): @@ -379,7 +381,7 @@ def train(self, train_data, device, args): output_dir=str(self.checkpoint_dir / "grpo"), per_device_train_batch_size=grpo_batch_size, gradient_accumulation_steps=gradient_accumulation_steps, - max_completion_length=256, + max_completion_length=512, num_generations=num_generations, # Adjusted based on effective batch size num_train_epochs=grpo_num_epochs if grpo_max_steps <= 0 else 1, # Use 1 epoch if max_steps is set max_steps=grpo_max_steps if grpo_max_steps > 0 else -1, # Override epochs with max_steps @@ -400,7 +402,7 @@ def train(self, train_data, device, args): top_k=50, repetition_penalty=1.1, epsilon=0.2, - beta=0.1, + beta=0.0, optim="sgd", ) @@ -422,7 +424,7 @@ def train(self, train_data, device, args): "pad_token_id": fresh_tokenizer.eos_token_id, "eos_token_id": fresh_tokenizer.eos_token_id, "bos_token_id": fresh_tokenizer.bos_token_id, - "max_new_tokens": 256, + "max_new_tokens": 512, "length_penalty": 1.0, # Neutral length penalty } From 760493cd8326df9cfa7920fc98df793a4bf6332c Mon Sep 17 00:00:00 2001 From: Marcos Villagra Date: Tue, 5 Aug 2025 23:03:30 +0000 Subject: [PATCH 158/168] Update reference model initialization in TimedGRPOTrainer to include additional parameters for enhanced performance and compatibility, and clean up commented code for better readability. --- python/spotlight_prj/fedllm/custom_trainer.py | 26 +++++++++---------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/python/spotlight_prj/fedllm/custom_trainer.py b/python/spotlight_prj/fedllm/custom_trainer.py index e1b895faf..22375f64b 100644 --- a/python/spotlight_prj/fedllm/custom_trainer.py +++ b/python/spotlight_prj/fedllm/custom_trainer.py @@ -63,26 +63,25 @@ def disable_dropout_in_model(model: torch.nn.Module) -> None: class TimedGRPOTrainer(GRPOTrainer): - """ + def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) if self.ref_model is not None: # Load any model you like as the reference baseline - self.ref_model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen3-1.7B") + self.ref_model = AutoModelForCausalLM.from_pretrained( + "Qwen/Qwen3-1.7B", + torch_dtype=torch.bfloat16, + device_map="cpu", + trust_remote_code=True, + use_cache=False, + ) self.ref_model.eval() disable_dropout_in_model(self.ref_model) - # Move reference model to the same device as the policy so that - # inputs and weights reside on a single device (avoids CPU↔GPU mismatch). - # `Trainer` already initialises an `accelerator` attribute so we can - # rely on `self.accelerator.device` to pick the correct target. - self.ref_model.to(self.accelerator.device) for p in self.ref_model.parameters(): p.requires_grad_(False) - # Keep the commented line for quick CPU off-loading during debugging - self.ref_model.to('cpu') - """ + def _record_step_stats(self, stats): @@ -143,14 +142,15 @@ def _generate_and_score_completions(self, *args, **kwargs): return result - """ + def _get_per_token_logps_and_entropies(self, model, batch, *args, **kwargs): + """ Ensure inputs and model are on the same device before delegating to parent impl. This override fixes CPU↔GPU mismatch errors when the reference model is kept on CPU while the policy lives on GPU. 
We simply move the tensor inputs in ``batch`` to the device of ``model`` before invoking the upstream helper. - + """ target_device = next(model.parameters()).device # Move all tensor values in the batch to the model's device if torch.is_tensor(batch): @@ -173,7 +173,7 @@ def _get_per_token_logps_and_entropies(self, model, batch, *args, **kwargs): entropies = entropies.to(policy_device) return logps, entropies - """ + class FullModelLLMTrainer(LLMTrainer): From a29d3eb0d1c61ed2d6537c24ce9fca58bee50d37 Mon Sep 17 00:00:00 2001 From: Marcos Villagra Date: Tue, 5 Aug 2025 23:13:19 +0000 Subject: [PATCH 159/168] Update beta parameter in FullModelLLMTrainer to 0.1 for improved training performance and stability. --- python/spotlight_prj/fedllm/custom_trainer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/spotlight_prj/fedllm/custom_trainer.py b/python/spotlight_prj/fedllm/custom_trainer.py index 22375f64b..94dedf837 100644 --- a/python/spotlight_prj/fedllm/custom_trainer.py +++ b/python/spotlight_prj/fedllm/custom_trainer.py @@ -402,7 +402,7 @@ def train(self, train_data, device, args): top_k=50, repetition_penalty=1.1, epsilon=0.2, - beta=0.0, + beta=0.1, optim="sgd", ) From 5cc053714521335c96b3dee63bbe15c1a6df0703 Mon Sep 17 00:00:00 2001 From: Marcos Villagra Date: Wed, 6 Aug 2025 15:56:13 +0000 Subject: [PATCH 160/168] Update generation count in FullModelLLMTrainer to 4 and increase batch size in GRPO test config to 2 for improved testing efficiency. --- python/spotlight_prj/fedllm/custom_trainer.py | 2 +- .../fedllm/fedml_config/grpo_gsm8k_test_config.yaml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/python/spotlight_prj/fedllm/custom_trainer.py b/python/spotlight_prj/fedllm/custom_trainer.py index 94dedf837..6f1634ab5 100644 --- a/python/spotlight_prj/fedllm/custom_trainer.py +++ b/python/spotlight_prj/fedllm/custom_trainer.py @@ -291,7 +291,7 @@ def train(self, train_data, device, args): else: num_generations = 2 - num_generations = 2 + num_generations = 4 # For testing, we can use a very small number of steps if grpo_max_steps > 0: diff --git a/python/spotlight_prj/fedllm/fedml_config/grpo_gsm8k_test_config.yaml b/python/spotlight_prj/fedllm/fedml_config/grpo_gsm8k_test_config.yaml index d0c3108bc..96d44e8a1 100644 --- a/python/spotlight_prj/fedllm/fedml_config/grpo_gsm8k_test_config.yaml +++ b/python/spotlight_prj/fedllm/fedml_config/grpo_gsm8k_test_config.yaml @@ -33,7 +33,7 @@ train_args: # GRPO-specific settings for testing grpo_max_steps: 50 grpo_num_epochs: 2 # Ignored when grpo_max_steps > 0 - grpo_batch_size: 1 # Smaller batch size for faster testing + grpo_batch_size: 2 # Smaller batch size for faster testing # FedML training settings (ignored when using GRPO) local_num_train_epochs: 1 local_max_steps: -1 From 28d7513d2bda92ebb2a3a28195b9bf1aa6c7c85b Mon Sep 17 00:00:00 2001 From: Marcos Villagra Date: Wed, 6 Aug 2025 16:25:19 +0000 Subject: [PATCH 161/168] Update generation count in FullModelLLMTrainer from 4 to 2 for optimized testing configuration. 
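The switch back to 2 generations is most plausibly about batch divisibility: recent TRL releases, as far as I recall, require the global generation batch (num_processes * per_device_train_batch_size) to be a multiple of num_generations and refuse to start otherwise; treat that exact check as an assumption. With one process and a per-device batch of 2, num_generations=4 leaves 2 % 4 != 0, which would explain the revert. A quick pre-flight check along those lines:

    # Pre-flight check for the GRPO batch / num_generations relationship.
    num_processes, per_device_train_batch_size, num_generations = 1, 2, 2  # values as of this commit
    global_batch = num_processes * per_device_train_batch_size
    assert global_batch % num_generations == 0, (
        f"global batch {global_batch} is not divisible by num_generations {num_generations}"
    )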
--- python/spotlight_prj/fedllm/custom_trainer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/spotlight_prj/fedllm/custom_trainer.py b/python/spotlight_prj/fedllm/custom_trainer.py index 6f1634ab5..94dedf837 100644 --- a/python/spotlight_prj/fedllm/custom_trainer.py +++ b/python/spotlight_prj/fedllm/custom_trainer.py @@ -291,7 +291,7 @@ def train(self, train_data, device, args): else: num_generations = 2 - num_generations = 4 + num_generations = 2 # For testing, we can use a very small number of steps if grpo_max_steps > 0: From 5c68b924e30592a895060a33cba53c06a9e612f5 Mon Sep 17 00:00:00 2001 From: Marcos Villagra Date: Wed, 6 Aug 2025 16:36:07 +0000 Subject: [PATCH 162/168] Reduce grpo_max_steps in GRPO test configuration from 50 to 20 for optimized testing efficiency. --- .../fedllm/fedml_config/grpo_gsm8k_test_config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/spotlight_prj/fedllm/fedml_config/grpo_gsm8k_test_config.yaml b/python/spotlight_prj/fedllm/fedml_config/grpo_gsm8k_test_config.yaml index 96d44e8a1..2cf74e34a 100644 --- a/python/spotlight_prj/fedllm/fedml_config/grpo_gsm8k_test_config.yaml +++ b/python/spotlight_prj/fedllm/fedml_config/grpo_gsm8k_test_config.yaml @@ -31,7 +31,7 @@ train_args: client_num_per_round: 1 # Single client setup comm_round: 30 # Reduced to 3 rounds for testing # GRPO-specific settings for testing - grpo_max_steps: 50 + grpo_max_steps: 20 grpo_num_epochs: 2 # Ignored when grpo_max_steps > 0 grpo_batch_size: 2 # Smaller batch size for faster testing # FedML training settings (ignored when using GRPO) From 2a7da13eb418659b9e5466eb5b7ec9649d2ff319 Mon Sep 17 00:00:00 2001 From: Marcos Villagra Date: Wed, 6 Aug 2025 17:24:01 +0000 Subject: [PATCH 163/168] Update client configuration in GRPO test setup to support 4 clients for enhanced testing scalability. --- .../fedllm/fedml_config/grpo_gsm8k_test_config.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/spotlight_prj/fedllm/fedml_config/grpo_gsm8k_test_config.yaml b/python/spotlight_prj/fedllm/fedml_config/grpo_gsm8k_test_config.yaml index 2cf74e34a..01bfc9288 100644 --- a/python/spotlight_prj/fedllm/fedml_config/grpo_gsm8k_test_config.yaml +++ b/python/spotlight_prj/fedllm/fedml_config/grpo_gsm8k_test_config.yaml @@ -27,8 +27,8 @@ train_args: federated_optimizer: "FedAvg" client_optimizer: "adamw_torch" server_optimizer: "FedAvg" - client_num_in_total: 1 # Single client setup - client_num_per_round: 1 # Single client setup + client_num_in_total: 4 # Single client setup + client_num_per_round: 4 # Single client setup comm_round: 30 # Reduced to 3 rounds for testing # GRPO-specific settings for testing grpo_max_steps: 20 From 79929f788aac881478c04ed70da8ca8233f0587a Mon Sep 17 00:00:00 2001 From: Marcos Villagra Date: Wed, 6 Aug 2025 17:52:16 +0000 Subject: [PATCH 164/168] Update GRPO test configuration to reduce batch size from 2 to 1 for faster testing. 
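For a sense of scale, the configuration as of this commit (4 clients per round, grpo_max_steps 20, comm_round 30, per-device batch 1) bounds the total amount of local optimization in a run; a quick tally using the values visible in the surrounding YAML diffs:

    clients_per_round, grpo_max_steps, comm_rounds = 4, 20, 30
    steps_per_round = clients_per_round * grpo_max_steps   # 80 GRPO steps across the 4 clients
    total_local_steps = steps_per_round * comm_rounds      # 2,400 local steps over the whole run
    print(steps_per_round, total_local_steps)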
--- .../fedllm/fedml_config/grpo_gsm8k_test_config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/spotlight_prj/fedllm/fedml_config/grpo_gsm8k_test_config.yaml b/python/spotlight_prj/fedllm/fedml_config/grpo_gsm8k_test_config.yaml index 01bfc9288..1a246f2fc 100644 --- a/python/spotlight_prj/fedllm/fedml_config/grpo_gsm8k_test_config.yaml +++ b/python/spotlight_prj/fedllm/fedml_config/grpo_gsm8k_test_config.yaml @@ -33,7 +33,7 @@ train_args: # GRPO-specific settings for testing grpo_max_steps: 20 grpo_num_epochs: 2 # Ignored when grpo_max_steps > 0 - grpo_batch_size: 2 # Smaller batch size for faster testing + grpo_batch_size: 1 # Smaller batch size for faster testing # FedML training settings (ignored when using GRPO) local_num_train_epochs: 1 local_max_steps: -1 From 8c8739fd6adf233cad5496395341b5138a03bc12 Mon Sep 17 00:00:00 2001 From: Marcos Villagra Date: Wed, 6 Aug 2025 21:06:24 +0000 Subject: [PATCH 165/168] Update reference model in TimedGRPOTrainer from "Qwen/Qwen3-1.7B" to "Qwen/Qwen3-0.6" for improved compatibility and performance. --- python/spotlight_prj/fedllm/custom_trainer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/spotlight_prj/fedllm/custom_trainer.py b/python/spotlight_prj/fedllm/custom_trainer.py index 94dedf837..965ca0717 100644 --- a/python/spotlight_prj/fedllm/custom_trainer.py +++ b/python/spotlight_prj/fedllm/custom_trainer.py @@ -70,7 +70,7 @@ def __init__(self, *args, **kwargs): if self.ref_model is not None: # Load any model you like as the reference baseline self.ref_model = AutoModelForCausalLM.from_pretrained( - "Qwen/Qwen3-1.7B", + "Qwen/Qwen3-0.6", torch_dtype=torch.bfloat16, device_map="cpu", trust_remote_code=True, From 19e0aad3ac3b6238b120913c00e541d70f90862e Mon Sep 17 00:00:00 2001 From: Marcos Villagra Date: Wed, 6 Aug 2025 21:25:47 +0000 Subject: [PATCH 166/168] Update reference model in TimedGRPOTrainer from "Qwen/Qwen3-0.6" to "Qwen/Qwen3-0.6B" for enhanced performance and compatibility. --- python/spotlight_prj/fedllm/custom_trainer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/spotlight_prj/fedllm/custom_trainer.py b/python/spotlight_prj/fedllm/custom_trainer.py index 965ca0717..799d019de 100644 --- a/python/spotlight_prj/fedllm/custom_trainer.py +++ b/python/spotlight_prj/fedllm/custom_trainer.py @@ -70,7 +70,7 @@ def __init__(self, *args, **kwargs): if self.ref_model is not None: # Load any model you like as the reference baseline self.ref_model = AutoModelForCausalLM.from_pretrained( - "Qwen/Qwen3-0.6", + "Qwen/Qwen3-0.6B", torch_dtype=torch.bfloat16, device_map="cpu", trust_remote_code=True, From e57d3e2a92d28b06ffcf8a6f9bdddb1242a7737d Mon Sep 17 00:00:00 2001 From: Marcos Villagra Date: Thu, 7 Aug 2025 23:05:04 +0000 Subject: [PATCH 167/168] Refactor TimedGRPOTrainer to improve code documentation with added docstrings, and update GRPO test configuration by increasing max steps from 20 to 50 and batch size from 1 to 2 for enhanced testing efficiency. 
--- python/spotlight_prj/fedllm/custom_trainer.py | 18 ++++++++---------- .../fedml_config/grpo_gsm8k_test_config.yaml | 6 +++--- 2 files changed, 11 insertions(+), 13 deletions(-) diff --git a/python/spotlight_prj/fedllm/custom_trainer.py b/python/spotlight_prj/fedllm/custom_trainer.py index 799d019de..0e9bc633c 100644 --- a/python/spotlight_prj/fedllm/custom_trainer.py +++ b/python/spotlight_prj/fedllm/custom_trainer.py @@ -48,8 +48,6 @@ import warnings warnings.filterwarnings("ignore") -#import gc - def disable_dropout_in_model(model: torch.nn.Module) -> None: """ @@ -63,7 +61,7 @@ def disable_dropout_in_model(model: torch.nn.Module) -> None: class TimedGRPOTrainer(GRPOTrainer): - + """ def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) @@ -81,7 +79,7 @@ def __init__(self, *args, **kwargs): for p in self.ref_model.parameters(): p.requires_grad_(False) - + """ def _record_step_stats(self, stats): @@ -142,15 +140,15 @@ def _generate_and_score_completions(self, *args, **kwargs): return result - + """ def _get_per_token_logps_and_entropies(self, model, batch, *args, **kwargs): - """ + Ensure inputs and model are on the same device before delegating to parent impl. This override fixes CPU↔GPU mismatch errors when the reference model is kept on CPU while the policy lives on GPU. We simply move the tensor inputs in ``batch`` to the device of ``model`` before invoking the upstream helper. - """ + target_device = next(model.parameters()).device # Move all tensor values in the batch to the model's device if torch.is_tensor(batch): @@ -173,7 +171,7 @@ def _get_per_token_logps_and_entropies(self, model, batch, *args, **kwargs): entropies = entropies.to(policy_device) return logps, entropies - + """ class FullModelLLMTrainer(LLMTrainer): @@ -291,7 +289,7 @@ def train(self, train_data, device, args): else: num_generations = 2 - num_generations = 2 + num_generations = 4 # For testing, we can use a very small number of steps if grpo_max_steps > 0: @@ -403,7 +401,7 @@ def train(self, train_data, device, args): repetition_penalty=1.1, epsilon=0.2, beta=0.1, - optim="sgd", + #optim="sgd", ) self.log(f"GRPO Config - bf16: {use_bf16}, fp16: {not use_bf16}, batch_size: {grpo_batch_size}") diff --git a/python/spotlight_prj/fedllm/fedml_config/grpo_gsm8k_test_config.yaml b/python/spotlight_prj/fedllm/fedml_config/grpo_gsm8k_test_config.yaml index 1a246f2fc..ed0285607 100644 --- a/python/spotlight_prj/fedllm/fedml_config/grpo_gsm8k_test_config.yaml +++ b/python/spotlight_prj/fedllm/fedml_config/grpo_gsm8k_test_config.yaml @@ -18,7 +18,7 @@ data_args: model_args: skip_log_model_net: True - model_name_or_path: "Qwen/Qwen3-1.7B" + model_name_or_path: "Qwen/Qwen3-0.6B" model_dtype: "bfloat16" peft_type: "none" # Full model fine-tuning use_flash_attention: False @@ -31,9 +31,9 @@ train_args: client_num_per_round: 4 # Single client setup comm_round: 30 # Reduced to 3 rounds for testing # GRPO-specific settings for testing - grpo_max_steps: 20 + grpo_max_steps: 50 grpo_num_epochs: 2 # Ignored when grpo_max_steps > 0 - grpo_batch_size: 1 # Smaller batch size for faster testing + grpo_batch_size: 2 # Smaller batch size for faster testing # FedML training settings (ignored when using GRPO) local_num_train_epochs: 1 local_max_steps: -1 From 3b7a416022fc8cd8a4d94215d3b5b963a877329c Mon Sep 17 00:00:00 2001 From: Marcos Villagra Date: Thu, 7 Aug 2025 23:46:45 +0000 Subject: [PATCH 168/168] Increase the number of retained old wallclock checkpoints from 6 to 12 in FullModelLLMAggregator for improved 
checkpoint management. --- python/spotlight_prj/fedllm/custom_trainer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/spotlight_prj/fedllm/custom_trainer.py b/python/spotlight_prj/fedllm/custom_trainer.py index 0e9bc633c..1bd37123d 100644 --- a/python/spotlight_prj/fedllm/custom_trainer.py +++ b/python/spotlight_prj/fedllm/custom_trainer.py @@ -809,7 +809,7 @@ def set_model_params(self, model_parameters) -> None: self.log("finished") - def _cleanup_old_wallclock_checkpoints(self, keep_last: int = 6): + def _cleanup_old_wallclock_checkpoints(self, keep_last: int = 12): """Delete old wallclock_* checkpoints but keep the most recent ``keep_last``. This complements the round-based checkpoint cleanup by pruning time-based