From 09e0124eadfd5fa2a88213c1c50e4aed0fdf725c Mon Sep 17 00:00:00 2001 From: avecplezir Date: Thu, 10 Apr 2025 10:42:16 -0400 Subject: [PATCH 01/15] sm fix --- .../accelerate_configs/deepspeed_zero2.yaml | 21 +++++++++++++++++++ .../accelerate_configs/deepspeed_zero3.yaml | 2 +- benchmarks/dpo/continual_dpo_trainer.py | 7 ++++--- benchmarks/dpo/dpo_continual.py | 15 +++++++------ benchmarks/reward_modeling.py | 1 + 5 files changed, 34 insertions(+), 12 deletions(-) create mode 100644 benchmarks/dpo/accelerate_configs/deepspeed_zero2.yaml diff --git a/benchmarks/dpo/accelerate_configs/deepspeed_zero2.yaml b/benchmarks/dpo/accelerate_configs/deepspeed_zero2.yaml new file mode 100644 index 00000000..877a5b8f --- /dev/null +++ b/benchmarks/dpo/accelerate_configs/deepspeed_zero2.yaml @@ -0,0 +1,21 @@ +compute_environment: LOCAL_MACHINE +debug: false +deepspeed_config: + deepspeed_multinode_launcher: standard + offload_optimizer_device: none + offload_param_device: none + zero3_init_flag: false + zero_stage: 2 +distributed_type: DEEPSPEED +downcast_bf16: 'no' +machine_rank: 0 +main_training_function: main +mixed_precision: 'bf16' +num_machines: 2 +num_processes: 1 +rdzv_backend: static +same_network: true +tpu_env: [] +tpu_use_cluster: false +tpu_use_sudo: false +use_cpu: false diff --git a/benchmarks/dpo/accelerate_configs/deepspeed_zero3.yaml b/benchmarks/dpo/accelerate_configs/deepspeed_zero3.yaml index 7f17a48f..29507c4c 100644 --- a/benchmarks/dpo/accelerate_configs/deepspeed_zero3.yaml +++ b/benchmarks/dpo/accelerate_configs/deepspeed_zero3.yaml @@ -11,7 +11,7 @@ machine_rank: 0 main_training_function: main mixed_precision: bf16 num_machines: 1 -num_processes: 1 # TODO change to whatever number of gpus is used +num_processes: 2 # TODO change to whatever number of gpus is used rdzv_backend: static same_network: true tpu_env: [] diff --git a/benchmarks/dpo/continual_dpo_trainer.py b/benchmarks/dpo/continual_dpo_trainer.py index ee1820ff..47340409 100644 --- a/benchmarks/dpo/continual_dpo_trainer.py +++ b/benchmarks/dpo/continual_dpo_trainer.py @@ -320,7 +320,8 @@ def log( train_eval = 'train' if 'loss' in logs else 'eval' print(f'Logging {train_eval} metrics...') if train_eval == 'eval': - print('Computing policy metrics...') - eval_policy_metrics = self.evaluate_policy() - logs.update(eval_policy_metrics) + if self.reward_model is not None: + print('Computing policy metrics...') + eval_policy_metrics = self.evaluate_policy() + logs.update(eval_policy_metrics) return super().log(logs, start_time) diff --git a/benchmarks/dpo/dpo_continual.py b/benchmarks/dpo/dpo_continual.py index 080d8d51..1b19b2df 100644 --- a/benchmarks/dpo/dpo_continual.py +++ b/benchmarks/dpo/dpo_continual.py @@ -3,11 +3,6 @@ import os import torch -from continual_dpo_trainer import ( - ContinualDPOArguments, - ContinualDPOConfig, - ContinualDPOTrainer, -) from datasets import Dataset from transformers import ( AutoModelForCausalLM, @@ -24,6 +19,8 @@ from trl.trainer.utils import SIMPLE_CHAT_TEMPLATE import wandb as wb +from transformers.trainer_utils import is_main_process + from benchmarks.dataloading import init_continual_dataset from benchmarks.dpo.continual_dpo_trainer import ( ContinualDPOArguments, @@ -104,7 +101,7 @@ def main( # first check the hub if the model is present try: AutoModelForSequenceClassification.from_pretrained( - reward_path, num_labels=1 + reward_path, num_labels=1, use_cache=True ) except: # if not found in the hub, check the local path @@ -152,8 +149,10 @@ def main( print(f'eval/dataset/{i}') 
trainer.log_metrics(f'eval/dataset/{i}', metrics) trainer.save_metrics(f'eval', metrics) - wb.log({'eval': {'last': metrics}}) # type: ignore[attr-defined] - wb.log({f'task/{current_dataset_name}/last': metrics}) # type: ignore[attr-defined] + # if is_main_process(): + if training_args.local_rank in (None, -1, 0): + wb.log({'eval': {'last': metrics}}) # type: ignore[attr-defined] + wb.log({f'task/{current_dataset_name}/last': metrics}) # type: ignore[attr-defined] # Save and push to hub trainer.save_model(os.path.join(training_args.output_dir, 'last')) diff --git a/benchmarks/reward_modeling.py b/benchmarks/reward_modeling.py index 1e06bdf4..a211a740 100644 --- a/benchmarks/reward_modeling.py +++ b/benchmarks/reward_modeling.py @@ -226,6 +226,7 @@ def train_model( except Exception as e: print(f'Job {i + 1} failed with error: {e}') else: + print(f'Running on {script_args.dataset_index} task out of {len(continual_dataset)} tasks') dataset = continual_dataset[script_args.dataset_index] train_model( script_args, training_args, model_args, dataset, script_args.dataset_index From defebfb216b8322953f4893eb30c6f25a2f98ba5 Mon Sep 17 00:00:00 2001 From: avecplezir Date: Thu, 10 Apr 2025 10:42:27 -0400 Subject: [PATCH 02/15] sm fix --- benchmarks/dpo/dpo_continual.py | 4 +--- benchmarks/reward_modeling.py | 4 +++- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/benchmarks/dpo/dpo_continual.py b/benchmarks/dpo/dpo_continual.py index 1b19b2df..31e08e24 100644 --- a/benchmarks/dpo/dpo_continual.py +++ b/benchmarks/dpo/dpo_continual.py @@ -3,6 +3,7 @@ import os import torch +import wandb as wb from datasets import Dataset from transformers import ( AutoModelForCausalLM, @@ -18,9 +19,6 @@ ) from trl.trainer.utils import SIMPLE_CHAT_TEMPLATE -import wandb as wb -from transformers.trainer_utils import is_main_process - from benchmarks.dataloading import init_continual_dataset from benchmarks.dpo.continual_dpo_trainer import ( ContinualDPOArguments, diff --git a/benchmarks/reward_modeling.py b/benchmarks/reward_modeling.py index a211a740..95053f32 100644 --- a/benchmarks/reward_modeling.py +++ b/benchmarks/reward_modeling.py @@ -226,7 +226,9 @@ def train_model( except Exception as e: print(f'Job {i + 1} failed with error: {e}') else: - print(f'Running on {script_args.dataset_index} task out of {len(continual_dataset)} tasks') + print( + f'Running on {script_args.dataset_index} task out of {len(continual_dataset)} tasks' + ) dataset = continual_dataset[script_args.dataset_index] train_model( script_args, training_args, model_args, dataset, script_args.dataset_index From 363b5a9c92de67ffb66bfc53579c066dd5a378e3 Mon Sep 17 00:00:00 2001 From: avecplezir Date: Thu, 24 Apr 2025 20:34:38 +0800 Subject: [PATCH 03/15] upd dataset json --- benchmarks/dataloading.py | 15 +++++++++++++-- .../dpo/accelerate_configs/deepspeed_zero3.yaml | 2 +- benchmarks/reward_modeling.py | 3 ++- 3 files changed, 16 insertions(+), 4 deletions(-) diff --git a/benchmarks/dataloading.py b/benchmarks/dataloading.py index b93d07b2..9dbec4b6 100644 --- a/benchmarks/dataloading.py +++ b/benchmarks/dataloading.py @@ -89,9 +89,20 @@ def init_continual_dataset( data = ContinualAlignmentDataset.from_json(dataset) except OSError: # need to try downloading from hub try: + # json_name = dataset.split('/', )[-1] + # print(f'Downloading {json_name} from Hugging Face Hub...') local_path = hf_hub_download( - repo_id=dataset, filename='dataset.json', repo_type='dataset' - ) + repo_id=f"LifelongAlignment/{dataset}", filename='dataset.json', 
repo_type='dataset' + ) + # local_path = hf_hub_download( + # repo_id=f"LifelongAlignment/{dataset}", filename=f'{dataset}.json', repo_type='dataset' + # ) + # local_path = hf_hub_download( + # repo_id=f"LifelongAlignment/{dataset}", filename=f'{json_name}.json', repo_type='dataset' + # ) + # local_path = hf_hub_download( + # repo_id=dataset, filename='dataset.json', repo_type='dataset' + # ) data = ContinualAlignmentDataset.from_json(local_path) except Exception as e: raise ValueError(f'Error loading dataset: {e}') diff --git a/benchmarks/dpo/accelerate_configs/deepspeed_zero3.yaml b/benchmarks/dpo/accelerate_configs/deepspeed_zero3.yaml index 29507c4c..6b68067b 100644 --- a/benchmarks/dpo/accelerate_configs/deepspeed_zero3.yaml +++ b/benchmarks/dpo/accelerate_configs/deepspeed_zero3.yaml @@ -11,7 +11,7 @@ machine_rank: 0 main_training_function: main mixed_precision: bf16 num_machines: 1 -num_processes: 2 # TODO change to whatever number of gpus is used +num_processes: 8 # TODO change to whatever number of gpus is used rdzv_backend: static same_network: true tpu_env: [] diff --git a/benchmarks/reward_modeling.py b/benchmarks/reward_modeling.py index 95053f32..2f5f7905 100644 --- a/benchmarks/reward_modeling.py +++ b/benchmarks/reward_modeling.py @@ -129,6 +129,7 @@ def train_model( trust_remote_code=model_args.trust_remote_code, **model_kwargs, ) + # Align padding tokens between tokenizer and model model.config.pad_token_id = tokenizer.pad_token_id @@ -227,7 +228,7 @@ def train_model( print(f'Job {i + 1} failed with error: {e}') else: print( - f'Running on {script_args.dataset_index} task out of {len(continual_dataset)} tasks' + f'Running on {script_args.dataset_index+1} task out of {len(continual_dataset)} tasks' ) dataset = continual_dataset[script_args.dataset_index] train_model( From d6540331a234a19ffd077e0f0db8dee49790e341 Mon Sep 17 00:00:00 2001 From: avecplezir Date: Thu, 24 Apr 2025 20:35:14 +0800 Subject: [PATCH 04/15] upd dataset json --- benchmarks/dataloading.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/benchmarks/dataloading.py b/benchmarks/dataloading.py index 9dbec4b6..837942f2 100644 --- a/benchmarks/dataloading.py +++ b/benchmarks/dataloading.py @@ -92,8 +92,10 @@ def init_continual_dataset( # json_name = dataset.split('/', )[-1] # print(f'Downloading {json_name} from Hugging Face Hub...') local_path = hf_hub_download( - repo_id=f"LifelongAlignment/{dataset}", filename='dataset.json', repo_type='dataset' - ) + repo_id=f'LifelongAlignment/{dataset}', + filename='dataset.json', + repo_type='dataset', + ) # local_path = hf_hub_download( # repo_id=f"LifelongAlignment/{dataset}", filename=f'{dataset}.json', repo_type='dataset' # ) From a1d0cdae2548acac57af53f39ffca3a3f0f63818 Mon Sep 17 00:00:00 2001 From: avecplezir Date: Sun, 27 Apr 2025 10:15:45 +0800 Subject: [PATCH 05/15] upd --- benchmarks/dataloading.py | 2 +- benchmarks/dpo/dpo_continual.py | 4 +++- benchmarks/dpo_ewc/dpo_EWC_continual.py | 10 +++++++--- benchmarks/ppo/ppo_continual.py | 11 ++++++++--- benchmarks/ppo_ewc/ppo_EWC_continual.py | 7 ++++--- 5 files changed, 23 insertions(+), 11 deletions(-) diff --git a/benchmarks/dataloading.py b/benchmarks/dataloading.py index 837942f2..65b5dd7b 100644 --- a/benchmarks/dataloading.py +++ b/benchmarks/dataloading.py @@ -93,7 +93,7 @@ def init_continual_dataset( # print(f'Downloading {json_name} from Hugging Face Hub...') local_path = hf_hub_download( repo_id=f'LifelongAlignment/{dataset}', - filename='dataset.json', + filename='data.json', 
repo_type='dataset', ) # local_path = hf_hub_download( diff --git a/benchmarks/dpo/dpo_continual.py b/benchmarks/dpo/dpo_continual.py index 31e08e24..f1edf2fe 100644 --- a/benchmarks/dpo/dpo_continual.py +++ b/benchmarks/dpo/dpo_continual.py @@ -132,6 +132,9 @@ def main( peft_config=peft_config, ) + if i == 0: + trainer.save_model(os.path.join(training_args.output_dir, 'checkpoint-0')) + # TODO will throw Invalidate trace cache @ step 10: expected module 11, but got module 19 # https://github.com/deepspeedai/DeepSpeed/issues/6870 # Fix with deepspeed fix release @@ -147,7 +150,6 @@ def main( print(f'eval/dataset/{i}') trainer.log_metrics(f'eval/dataset/{i}', metrics) trainer.save_metrics(f'eval', metrics) - # if is_main_process(): if training_args.local_rank in (None, -1, 0): wb.log({'eval': {'last': metrics}}) # type: ignore[attr-defined] wb.log({f'task/{current_dataset_name}/last': metrics}) # type: ignore[attr-defined] diff --git a/benchmarks/dpo_ewc/dpo_EWC_continual.py b/benchmarks/dpo_ewc/dpo_EWC_continual.py index 35a00b1e..547e87c0 100644 --- a/benchmarks/dpo_ewc/dpo_EWC_continual.py +++ b/benchmarks/dpo_ewc/dpo_EWC_continual.py @@ -3,6 +3,7 @@ import os import torch +import wandb as wb from continual_dpo_EWC_trainer import ( ContinualDPOEWCArguments, ContinualDPOEWCConfig, @@ -23,7 +24,6 @@ ) from trl.trainer.utils import SIMPLE_CHAT_TEMPLATE -import wandb as wb from benchmarks.dataloading import init_continual_dataset @@ -132,6 +132,9 @@ def main( peft_config=peft_config, ) + if i == 0: + trainer.save_model(os.path.join(training_args.output_dir, 'checkpoint-0')) + # TODO will throw Invalidate trace cache @ step 10: expected module 11, but got module 19 # https://github.com/deepspeedai/DeepSpeed/issues/6870 # Fix with deepspeed fix release @@ -147,8 +150,9 @@ def main( print(f'eval/dataset/{i}') trainer.log_metrics(f'eval/dataset/{i}', metrics) trainer.save_metrics(f'eval', metrics) - wb.log({'eval': {'last': metrics}}) # type: ignore[attr-defined] - wb.log({f'task/{current_dataset_name}/last': metrics}) # type: ignore[attr-defined] + if training_args.local_rank in (None, -1, 0): + wb.log({'eval': {'last': metrics}}) # type: ignore[attr-defined] + wb.log({f'task/{current_dataset_name}/last': metrics}) # type: ignore[attr-defined] # Save and push to hub trainer.save_model(os.path.join(training_args.output_dir, 'last')) diff --git a/benchmarks/ppo/ppo_continual.py b/benchmarks/ppo/ppo_continual.py index 5fe18513..f2cfa1c3 100644 --- a/benchmarks/ppo/ppo_continual.py +++ b/benchmarks/ppo/ppo_continual.py @@ -3,6 +3,7 @@ import os import torch +import wandb as wb from continual_ppo_trainer import ( ContinualPPOArguments, ContinualPPOConfig, @@ -23,7 +24,6 @@ ) from trl.trainer.utils import SIMPLE_CHAT_TEMPLATE -import wandb as wb from benchmarks.dataloading import init_continual_dataset @@ -143,6 +143,10 @@ def main( eval_dataset=dataset[script_args.dataset_test_split], peft_config=peft_config, ) + + if i == 0: + trainer.save_model(os.path.join(training_args.output_dir, 'checkpoint-0')) + # Set current task in trainer for task-based logging trainer.set_task(f'task_{i}') @@ -164,8 +168,9 @@ def main( trainer.save_metrics('eval', metrics) # Log metrics to WandB - wb.log({'eval': {'last': metrics}}) # type: ignore[attr-defined] - wb.log({f'task/{custom_repo_name}/last': metrics}) # type: ignore[attr-defined] + if training_args.local_rank in (None, -1, 0): + wb.log({'eval': {'last': metrics}}) # type: ignore[attr-defined] + wb.log({f'task/{custom_repo_name}/last': metrics}) # type: 
ignore[attr-defined] # Save model checkpoint and optionally push if not training_args.push_to_hub: diff --git a/benchmarks/ppo_ewc/ppo_EWC_continual.py b/benchmarks/ppo_ewc/ppo_EWC_continual.py index 211bc56a..c71e90e6 100644 --- a/benchmarks/ppo_ewc/ppo_EWC_continual.py +++ b/benchmarks/ppo_ewc/ppo_EWC_continual.py @@ -3,6 +3,7 @@ import os import torch +import wandb as wb from datasets import Dataset from transformers import ( AutoModelForCausalLM, @@ -18,7 +19,6 @@ ) from trl.trainer.utils import SIMPLE_CHAT_TEMPLATE -import wandb as wb from benchmarks.dataloading import init_continual_dataset from benchmarks.ppo_ewc.continual_ppo_EWC_trainer import ( ContinualPPOEWCArguments, @@ -176,8 +176,9 @@ def main( trainer.save_metrics('eval', metrics) # Log metrics to WandB - wb.log({'eval': {'last': metrics}}) # type: ignore[attr-defined] - wb.log({f'task/{custom_repo_name}/last': metrics}) # type: ignore[attr-defined] + if training_args.local_rank in (None, -1, 0): + wb.log({'eval': {'last': metrics}}) # type: ignore[attr-defined] + wb.log({f'task/{custom_repo_name}/last': metrics}) # type: ignore[attr-defined] # Save model checkpoint and optionally push if not training_args.push_to_hub: From 07342977b076ebfb277b604cc1422755a3349d83 Mon Sep 17 00:00:00 2001 From: avecplezir Date: Thu, 1 May 2025 23:34:20 -0400 Subject: [PATCH 06/15] sm fixes, hf upload --- benchmarks/dpo/dpo_continual.py | 4 +-- benchmarks/dpo_ewc/dpo_EWC_continual.py | 4 +-- benchmarks/hf_upload_models.py | 28 +++++++++++++++++++ benchmarks/ppo/README.md | 4 +-- .../accelerate_configs/deepspeed_zero2.yaml | 2 +- benchmarks/ppo/ppo_continual.py | 4 +-- pyproject.toml | 3 +- 7 files changed, 39 insertions(+), 10 deletions(-) create mode 100644 benchmarks/hf_upload_models.py diff --git a/benchmarks/dpo/dpo_continual.py b/benchmarks/dpo/dpo_continual.py index f1edf2fe..7ff9bd4a 100644 --- a/benchmarks/dpo/dpo_continual.py +++ b/benchmarks/dpo/dpo_continual.py @@ -132,8 +132,8 @@ def main( peft_config=peft_config, ) - if i == 0: - trainer.save_model(os.path.join(training_args.output_dir, 'checkpoint-0')) + # if i == 0: + # trainer.save_model(os.path.join(training_args.output_dir, 'checkpoint-0')) # TODO will throw Invalidate trace cache @ step 10: expected module 11, but got module 19 # https://github.com/deepspeedai/DeepSpeed/issues/6870 diff --git a/benchmarks/dpo_ewc/dpo_EWC_continual.py b/benchmarks/dpo_ewc/dpo_EWC_continual.py index 547e87c0..021a4e71 100644 --- a/benchmarks/dpo_ewc/dpo_EWC_continual.py +++ b/benchmarks/dpo_ewc/dpo_EWC_continual.py @@ -132,8 +132,8 @@ def main( peft_config=peft_config, ) - if i == 0: - trainer.save_model(os.path.join(training_args.output_dir, 'checkpoint-0')) + # if i == 0: + # trainer.save_model(os.path.join(training_args.output_dir, 'checkpoint-0')) # TODO will throw Invalidate trace cache @ step 10: expected module 11, but got module 19 # https://github.com/deepspeedai/DeepSpeed/issues/6870 diff --git a/benchmarks/hf_upload_models.py b/benchmarks/hf_upload_models.py new file mode 100644 index 00000000..ebca79c2 --- /dev/null +++ b/benchmarks/hf_upload_models.py @@ -0,0 +1,28 @@ +from huggingface_hub import HfApi, upload_folder + +datasets="aifgen-long-piecewise aifgen-lipschitz aifgen-piecewise-preference-shift aifgen-domain-preference-shift aifgen-short-piecewise CPPO-REWARD" +dataset_indices="0 1 2 3 4 5 6 7 8 9" +# datasets="aifgen-long-piecewise" +# dataset_indices="0" + +for dataset_name in datasets.split(): + for dataset_index in dataset_indices.split(): + # Upload the model to the 
Hugging Face Hub + try: + repo_id = f"LifelongAlignment/{dataset_name}-{dataset_index}-reward-model" + api = HfApi() + api.create_repo(repo_id, repo_type="model", exist_ok=True, private=False) + + path = f"/lustre/orion/bif151/scratch/ivan.anokhin/AIF-Gen/{dataset_name}/Qwen2.5-0.5B-Reward-8gpus/Qwen2.5-0.5B-Instruct_{dataset_name}_REWARD_{dataset_index}" + print('path', path) + + upload_folder( + repo_id=repo_id, + # path_in_repo=f"{dataset_name}-{dataset_index}/reward-model", + folder_path=path, + commit_message="Upload AIFGen reward model", + repo_type="model", + ) + except: + print(f"Failed to upload {dataset_name}-{dataset_index} reward model") + continue \ No newline at end of file diff --git a/benchmarks/ppo/README.md b/benchmarks/ppo/README.md index 21928fb2..63fa30b3 100644 --- a/benchmarks/ppo/README.md +++ b/benchmarks/ppo/README.md @@ -32,7 +32,7 @@ uv run benchmarks/ppo/ppo_continual.py \ --use_peft \ --lora_r 32 \ --lora_alpha 16 \ - --push_to_hub True + --push_to_hub False ``` ### Using accelerate launch (with DeepSpeed / multi-GPU) @@ -62,7 +62,7 @@ accelerate launch --config_file benchmarks/ppo/accelerate_configs/deepspeed_zero --use_peft \ --lora_r 32 \ --lora_alpha 16 \ - --push_to_hub True + --push_to_hub False ``` *Make sure you do not add the dataset index to the reward model name as the script itself iterates over the dataset indices.* diff --git a/benchmarks/ppo/accelerate_configs/deepspeed_zero2.yaml b/benchmarks/ppo/accelerate_configs/deepspeed_zero2.yaml index 8046cccc..239b14ac 100644 --- a/benchmarks/ppo/accelerate_configs/deepspeed_zero2.yaml +++ b/benchmarks/ppo/accelerate_configs/deepspeed_zero2.yaml @@ -12,7 +12,7 @@ machine_rank: 0 main_training_function: main mixed_precision: 'bf16' num_machines: 1 -num_processes: 1 +num_processes: 8 rdzv_backend: static same_network: true tpu_env: [] diff --git a/benchmarks/ppo/ppo_continual.py b/benchmarks/ppo/ppo_continual.py index f2cfa1c3..adfb8f1c 100644 --- a/benchmarks/ppo/ppo_continual.py +++ b/benchmarks/ppo/ppo_continual.py @@ -144,8 +144,8 @@ def main( peft_config=peft_config, ) - if i == 0: - trainer.save_model(os.path.join(training_args.output_dir, 'checkpoint-0')) + # if i == 0: + # trainer.save_model(os.path.join(training_args.output_dir, 'checkpoint-0')) # Set current task in trainer for task-based logging trainer.set_task(f'task_{i}') diff --git a/pyproject.toml b/pyproject.toml index b90c8e52..d895846f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -26,7 +26,8 @@ dependencies = [ "pydantic>=2.10.4", "pytest-asyncio>=0.25.3", "pytest-mock>=3.14.0", - "torch==2.3.0", +# "torch @ https://download.pytorch.org/whl/rocm6.0/torch-2.3.0%2Brocm6.0-cp312-cp312-linux_x86_64.whl#sha256=992c1ffb65c773a5848e4bbe22235c0386a7915690615ad68a45609228c13269", + "torch @ https://download.pytorch.org/whl/rocm6.0/torch-2.3.0%2Brocm6.0-cp310-cp310-linux_x86_64.whl#sha256=266af54cf4704aae08719305c205f0d12f40874006d3b8058f38e2f8ed08f56d", "types-pyyaml>=6.0.12.20241230", ] From fcf7671a9b72fc936b9115187eeb9d0ff4cae2af Mon Sep 17 00:00:00 2001 From: avecplezir Date: Thu, 1 May 2025 23:34:32 -0400 Subject: [PATCH 07/15] sm fixes, hf upload --- benchmarks/hf_upload_models.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/benchmarks/hf_upload_models.py b/benchmarks/hf_upload_models.py index ebca79c2..02646994 100644 --- a/benchmarks/hf_upload_models.py +++ b/benchmarks/hf_upload_models.py @@ -1,7 +1,7 @@ from huggingface_hub import HfApi, upload_folder -datasets="aifgen-long-piecewise 
aifgen-lipschitz aifgen-piecewise-preference-shift aifgen-domain-preference-shift aifgen-short-piecewise CPPO-REWARD" -dataset_indices="0 1 2 3 4 5 6 7 8 9" +datasets = 'aifgen-long-piecewise aifgen-lipschitz aifgen-piecewise-preference-shift aifgen-domain-preference-shift aifgen-short-piecewise CPPO-REWARD' +dataset_indices = '0 1 2 3 4 5 6 7 8 9' # datasets="aifgen-long-piecewise" # dataset_indices="0" @@ -9,20 +9,20 @@ for dataset_index in dataset_indices.split(): # Upload the model to the Hugging Face Hub try: - repo_id = f"LifelongAlignment/{dataset_name}-{dataset_index}-reward-model" + repo_id = f'LifelongAlignment/{dataset_name}-{dataset_index}-reward-model' api = HfApi() - api.create_repo(repo_id, repo_type="model", exist_ok=True, private=False) + api.create_repo(repo_id, repo_type='model', exist_ok=True, private=False) - path = f"/lustre/orion/bif151/scratch/ivan.anokhin/AIF-Gen/{dataset_name}/Qwen2.5-0.5B-Reward-8gpus/Qwen2.5-0.5B-Instruct_{dataset_name}_REWARD_{dataset_index}" + path = f'/lustre/orion/bif151/scratch/ivan.anokhin/AIF-Gen/{dataset_name}/Qwen2.5-0.5B-Reward-8gpus/Qwen2.5-0.5B-Instruct_{dataset_name}_REWARD_{dataset_index}' print('path', path) upload_folder( repo_id=repo_id, # path_in_repo=f"{dataset_name}-{dataset_index}/reward-model", folder_path=path, - commit_message="Upload AIFGen reward model", - repo_type="model", + commit_message='Upload AIFGen reward model', + repo_type='model', ) except: - print(f"Failed to upload {dataset_name}-{dataset_index} reward model") - continue \ No newline at end of file + print(f'Failed to upload {dataset_name}-{dataset_index} reward model') + continue From b219b419cb370c700edb4414bb9007a680c601ac Mon Sep 17 00:00:00 2001 From: avecplezir Date: Fri, 2 May 2025 01:57:34 -0400 Subject: [PATCH 08/15] remove accumulate in ppo trainer --- benchmarks/ppo/README.md | 20 +- .../accelerate_configs/deepspeed_zero2.yaml | 7 +- benchmarks/ppo/continual_ppo_trainer.py | 204 +++++++++--------- pyproject.toml | 3 +- 4 files changed, 120 insertions(+), 114 deletions(-) diff --git a/benchmarks/ppo/README.md b/benchmarks/ppo/README.md index 63fa30b3..1127788a 100644 --- a/benchmarks/ppo/README.md +++ b/benchmarks/ppo/README.md @@ -20,6 +20,7 @@ uv run benchmarks/ppo/ppo_continual.py \ --reward_model_path Shahradmz/Qwen2-0.5B-Instruct_continual_data_debug_REWARD \ --learning_rate 5.0e-6 \ --num_train_epochs 1 \ + --gradient_accumulation_steps 2 \ --gradient_accumulation_steps 8 \ --gradient_checkpointing \ --logging_steps 20 \ @@ -50,12 +51,12 @@ accelerate launch --config_file benchmarks/ppo/accelerate_configs/deepspeed_zero --learning_rate 5.0e-6 \ --num_train_epochs 1 \ --per_device_train_batch_size 2 \ - --gradient_accumulation_steps 8 \ + --gradient_accumulation_steps 1 \ --gradient_checkpointing \ - --logging_steps 2 \ + --logging_steps 10 \ --eval_strategy steps \ - --eval_steps 5 \ - --save_steps 5 \ + --eval_steps 10 \ + --save_steps 10 \ --bf16 \ --output_dir "$SCRATCH/Qwen2-0.5B-PPO-test" \ --no_remove_unused_columns \ @@ -70,7 +71,8 @@ accelerate launch --config_file benchmarks/ppo/accelerate_configs/deepspeed_zero ### Full Training (without PEFT push, for local evaluation) ```sh -uv run benchmarks/ppo/ppo_continual.py \ +accelerate launch --config_file benchmarks/ppo/accelerate_configs/deepspeed_zero2.yaml \ + benchmarks/ppo/ppo_continual.py \ --dataset_name benchmarks/continual_data_debug.json \ --mock False \ --sft_model_path Qwen/Qwen2-0.5B-Instruct \ @@ -78,14 +80,16 @@ uv run benchmarks/ppo/ppo_continual.py \ --reward_model_path 
Shahradmz/Qwen2-0.5B-Instruct_continual_data_debug_REWARD \ --learning_rate 5.0e-7 \ --num_train_epochs 1 \ - --per_device_train_batch_size 2 \ - --gradient_accumulation_steps 8 \ + --bf16 \ + --per_device_train_batch_size 1 \ + --gradient_accumulation_steps 1 \ --gradient_checkpointing \ --logging_steps 20 \ --eval_strategy steps \ --eval_steps 20 \ --output_dir "$SCRATCH/Qwen2-0.5B-PPO" \ - --no_remove_unused_columns + --no_remove_unused_columns \ + --push_to_hub False ``` ### Run a Sweep with wandb diff --git a/benchmarks/ppo/accelerate_configs/deepspeed_zero2.yaml b/benchmarks/ppo/accelerate_configs/deepspeed_zero2.yaml index 239b14ac..825c1fcc 100644 --- a/benchmarks/ppo/accelerate_configs/deepspeed_zero2.yaml +++ b/benchmarks/ppo/accelerate_configs/deepspeed_zero2.yaml @@ -2,17 +2,18 @@ compute_environment: LOCAL_MACHINE debug: false deepspeed_config: deepspeed_multinode_launcher: standard - offload_optimizer_device: none - offload_param_device: none + offload_optimizer_device: cpu + offload_param_device: cpu zero3_init_flag: false zero_stage: 2 + gradient_accumulation_steps: 8 distributed_type: DEEPSPEED downcast_bf16: 'no' machine_rank: 0 main_training_function: main mixed_precision: 'bf16' num_machines: 1 -num_processes: 8 +num_processes: 2 rdzv_backend: static same_network: true tpu_env: [] diff --git a/benchmarks/ppo/continual_ppo_trainer.py b/benchmarks/ppo/continual_ppo_trainer.py index faa5d805..7718b9c7 100644 --- a/benchmarks/ppo/continual_ppo_trainer.py +++ b/benchmarks/ppo/continual_ppo_trainer.py @@ -729,110 +729,110 @@ def repeat_generator() -> DataLoader: for micro_batch_start in range( 0, args.local_mini_batch_size, args.per_device_train_batch_size ): - with accelerator.accumulate(model): - micro_batch_end = ( - micro_batch_start + args.per_device_train_batch_size - ) - micro_batch_inds = mini_batch_inds[ - micro_batch_start:micro_batch_end - ] - mb_advantage = advantages[micro_batch_inds] - mb_responses = responses[micro_batch_inds] - mb_query_responses = query_responses[micro_batch_inds] - mb_logprobs = logprobs[micro_batch_inds] - mb_return = returns[micro_batch_inds] - mb_values = values[micro_batch_inds] - - output, vpred_temp = forward( - model, mb_query_responses, processing_class.pad_token_id - ) - logits = output.logits[:, context_length - 1 : -1] - logits /= args.temperature + 1e-7 - new_logprobs = selective_log_softmax(logits, mb_responses) - new_logprobs = torch.masked_fill( - new_logprobs, - padding_mask[micro_batch_inds], - INVALID_LOGPROB, - ) - vpred = vpred_temp[:, context_length - 1 : -1].squeeze(-1) - vpred = torch.masked_fill( - vpred, padding_mask_p1[micro_batch_inds], 0 - ) - vpredclipped = torch.clamp( - vpred, - mb_values - args.cliprange_value, - mb_values + args.cliprange_value, - ) - vf_losses1 = torch.square(vpred - mb_return) - vf_losses2 = torch.square(vpredclipped - mb_return) - vf_loss_max = torch.max(vf_losses1, vf_losses2) - vf_loss = 0.5 * masked_mean( - vf_loss_max, ~padding_mask_p1[micro_batch_inds] - ) - vf_clipfrac = masked_mean( - (vf_losses2 > vf_losses1).float(), - ~padding_mask_p1[micro_batch_inds], - ) - logprobs_diff = new_logprobs - mb_logprobs - ratio = torch.exp(logprobs_diff) - pg_losses = -mb_advantage * ratio - pg_losses2 = -mb_advantage * torch.clamp( - ratio, 1.0 - args.cliprange, 1.0 + args.cliprange + # with accelerator.accumulate(model): + micro_batch_end = ( + micro_batch_start + args.per_device_train_batch_size + ) + micro_batch_inds = mini_batch_inds[ + micro_batch_start:micro_batch_end + ] + mb_advantage = 
advantages[micro_batch_inds] + mb_responses = responses[micro_batch_inds] + mb_query_responses = query_responses[micro_batch_inds] + mb_logprobs = logprobs[micro_batch_inds] + mb_return = returns[micro_batch_inds] + mb_values = values[micro_batch_inds] + + output, vpred_temp = forward( + model, mb_query_responses, processing_class.pad_token_id + ) + logits = output.logits[:, context_length - 1 : -1] + logits /= args.temperature + 1e-7 + new_logprobs = selective_log_softmax(logits, mb_responses) + new_logprobs = torch.masked_fill( + new_logprobs, + padding_mask[micro_batch_inds], + INVALID_LOGPROB, + ) + vpred = vpred_temp[:, context_length - 1 : -1].squeeze(-1) + vpred = torch.masked_fill( + vpred, padding_mask_p1[micro_batch_inds], 0 + ) + vpredclipped = torch.clamp( + vpred, + mb_values - args.cliprange_value, + mb_values + args.cliprange_value, + ) + vf_losses1 = torch.square(vpred - mb_return) + vf_losses2 = torch.square(vpredclipped - mb_return) + vf_loss_max = torch.max(vf_losses1, vf_losses2) + vf_loss = 0.5 * masked_mean( + vf_loss_max, ~padding_mask_p1[micro_batch_inds] + ) + vf_clipfrac = masked_mean( + (vf_losses2 > vf_losses1).float(), + ~padding_mask_p1[micro_batch_inds], + ) + logprobs_diff = new_logprobs - mb_logprobs + ratio = torch.exp(logprobs_diff) + pg_losses = -mb_advantage * ratio + pg_losses2 = -mb_advantage * torch.clamp( + ratio, 1.0 - args.cliprange, 1.0 + args.cliprange + ) + pg_loss_max = torch.max(pg_losses, pg_losses2) + pg_loss = masked_mean( + pg_loss_max, ~padding_mask[micro_batch_inds] + ) + loss = pg_loss + args.vf_coef * vf_loss + accelerator.backward(loss) + optimizer.step() + optimizer.zero_grad() + with torch.no_grad(): + pg_clipfrac = masked_mean( + (pg_losses2 > pg_losses).float(), + ~padding_mask[micro_batch_inds], ) - pg_loss_max = torch.max(pg_losses, pg_losses2) - pg_loss = masked_mean( - pg_loss_max, ~padding_mask[micro_batch_inds] + prob_dist = torch.nn.functional.softmax(logits, dim=-1) + entropy = torch.logsumexp(logits, dim=-1) - torch.sum( + prob_dist * logits, dim=-1 ) - loss = pg_loss + args.vf_coef * vf_loss - accelerator.backward(loss) - optimizer.step() - optimizer.zero_grad() - with torch.no_grad(): - pg_clipfrac = masked_mean( - (pg_losses2 > pg_losses).float(), - ~padding_mask[micro_batch_inds], - ) - prob_dist = torch.nn.functional.softmax(logits, dim=-1) - entropy = torch.logsumexp(logits, dim=-1) - torch.sum( - prob_dist * logits, dim=-1 - ) - approxkl = 0.5 * (logprobs_diff**2).mean() - approxkl_stats[ - ppo_epoch_idx, - minibatch_idx, - gradient_accumulation_idx, - ] = approxkl - pg_clipfrac_stats[ - ppo_epoch_idx, - minibatch_idx, - gradient_accumulation_idx, - ] = pg_clipfrac - pg_loss_stats[ - ppo_epoch_idx, - minibatch_idx, - gradient_accumulation_idx, - ] = pg_loss - vf_loss_stats[ - ppo_epoch_idx, - minibatch_idx, - gradient_accumulation_idx, - ] = vf_loss - vf_clipfrac_stats[ - ppo_epoch_idx, - minibatch_idx, - gradient_accumulation_idx, - ] = vf_clipfrac - entropy_stats[ - ppo_epoch_idx, - minibatch_idx, - gradient_accumulation_idx, - ] = entropy.mean() - ratio_stats[ - ppo_epoch_idx, - minibatch_idx, - gradient_accumulation_idx, - ] = ratio.mean() - gradient_accumulation_idx += 1 + approxkl = 0.5 * (logprobs_diff**2).mean() + approxkl_stats[ + ppo_epoch_idx, + minibatch_idx, + gradient_accumulation_idx, + ] = approxkl + pg_clipfrac_stats[ + ppo_epoch_idx, + minibatch_idx, + gradient_accumulation_idx, + ] = pg_clipfrac + pg_loss_stats[ + ppo_epoch_idx, + minibatch_idx, + gradient_accumulation_idx, + ] = pg_loss + 
vf_loss_stats[ + ppo_epoch_idx, + minibatch_idx, + gradient_accumulation_idx, + ] = vf_loss + vf_clipfrac_stats[ + ppo_epoch_idx, + minibatch_idx, + gradient_accumulation_idx, + ] = vf_clipfrac + entropy_stats[ + ppo_epoch_idx, + minibatch_idx, + gradient_accumulation_idx, + ] = entropy.mean() + ratio_stats[ + ppo_epoch_idx, + minibatch_idx, + gradient_accumulation_idx, + ] = ratio.mean() + gradient_accumulation_idx += 1 minibatch_idx += 1 # del everything and empty cache # fmt: off diff --git a/pyproject.toml b/pyproject.toml index d895846f..1533d5df 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -26,8 +26,9 @@ dependencies = [ "pydantic>=2.10.4", "pytest-asyncio>=0.25.3", "pytest-mock>=3.14.0", + "torch==2.3.0", # "torch @ https://download.pytorch.org/whl/rocm6.0/torch-2.3.0%2Brocm6.0-cp312-cp312-linux_x86_64.whl#sha256=992c1ffb65c773a5848e4bbe22235c0386a7915690615ad68a45609228c13269", - "torch @ https://download.pytorch.org/whl/rocm6.0/torch-2.3.0%2Brocm6.0-cp310-cp310-linux_x86_64.whl#sha256=266af54cf4704aae08719305c205f0d12f40874006d3b8058f38e2f8ed08f56d", +# "torch @ https://download.pytorch.org/whl/rocm6.0/torch-2.3.0%2Brocm6.0-cp310-cp310-linux_x86_64.whl#sha256=266af54cf4704aae08719305c205f0d12f40874006d3b8058f38e2f8ed08f56d", "types-pyyaml>=6.0.12.20241230", ] From bea9d79c21122153f0f654745001d24fbb63ef10 Mon Sep 17 00:00:00 2001 From: avecplezir Date: Fri, 2 May 2025 14:30:04 -0400 Subject: [PATCH 09/15] fix dpo ewc --- .../accelerate_configs/deepspeed_zero2.yaml | 4 +-- .../accelerate_configs/deepspeed_zero3.yaml | 2 +- .../dpo_ewc/continual_dpo_EWC_trainer.py | 28 +++++++++++++------ .../accelerate_configs/deepspeed_zero2.yaml | 8 +++--- 4 files changed, 27 insertions(+), 15 deletions(-) diff --git a/benchmarks/dpo/accelerate_configs/deepspeed_zero2.yaml b/benchmarks/dpo/accelerate_configs/deepspeed_zero2.yaml index 877a5b8f..f369ef96 100644 --- a/benchmarks/dpo/accelerate_configs/deepspeed_zero2.yaml +++ b/benchmarks/dpo/accelerate_configs/deepspeed_zero2.yaml @@ -11,8 +11,8 @@ downcast_bf16: 'no' machine_rank: 0 main_training_function: main mixed_precision: 'bf16' -num_machines: 2 -num_processes: 1 +num_machines: 1 +num_processes: 2 rdzv_backend: static same_network: true tpu_env: [] diff --git a/benchmarks/dpo/accelerate_configs/deepspeed_zero3.yaml b/benchmarks/dpo/accelerate_configs/deepspeed_zero3.yaml index 6b68067b..29507c4c 100644 --- a/benchmarks/dpo/accelerate_configs/deepspeed_zero3.yaml +++ b/benchmarks/dpo/accelerate_configs/deepspeed_zero3.yaml @@ -11,7 +11,7 @@ machine_rank: 0 main_training_function: main mixed_precision: bf16 num_machines: 1 -num_processes: 8 # TODO change to whatever number of gpus is used +num_processes: 2 # TODO change to whatever number of gpus is used rdzv_backend: static same_network: true tpu_env: [] diff --git a/benchmarks/dpo_ewc/continual_dpo_EWC_trainer.py b/benchmarks/dpo_ewc/continual_dpo_EWC_trainer.py index 815dd537..45b2af0c 100644 --- a/benchmarks/dpo_ewc/continual_dpo_EWC_trainer.py +++ b/benchmarks/dpo_ewc/continual_dpo_EWC_trainer.py @@ -1,6 +1,7 @@ from dataclasses import dataclass, field from typing import Any, Dict, Optional, Union +import deepspeed import torch import torch.nn as nn from transformers import PreTrainedModel @@ -119,7 +120,13 @@ def compute_ewc_loss(self) -> torch.Tensor: # Calculate the EWC penalty for each parameter model = self.accelerator.unwrap_model(self.model) + for name, param in model.named_parameters(): + if name not in ContinualDPOEWCTrainer.class_fisher_information: + continue + if not 
param.requires_grad: + continue + if ( name in ContinualDPOEWCTrainer.class_fisher_information and param.requires_grad @@ -128,13 +135,15 @@ def compute_ewc_loss(self) -> torch.Tensor: fisher = ContinualDPOEWCTrainer.class_fisher_information[name].to( param.device ) - old_param = ContinualDPOEWCTrainer.class_old_params[name].to( - param.device - ) - # Calculate squared distance weighted by Fisher information - delta = param - old_param - ewc_loss += (fisher * delta.pow(2)).sum() + with deepspeed.zero.GatheredParameters([param], modifier_rank=0): + if self.accelerator.is_main_process: + old_param = ContinualDPOEWCTrainer.class_old_params[name].to( + param.device + ) + # Calculate squared distance weighted by Fisher information + delta = param - old_param + ewc_loss = ewc_loss + (fisher * delta.pow(2)).sum() # Apply the EWC lambda coefficient and return return 0.5 * self.ewc_lambda * ewc_loss @@ -237,9 +246,12 @@ def store_current_parameters(self) -> Dict[str, torch.Tensor]: """ model = self.accelerator.unwrap_model(self.model) old_params = {} + for name, param in model.named_parameters(): - if param.requires_grad: - old_params[name] = param.data.clone().detach() + with deepspeed.zero.GatheredParameters([param], modifier_rank=0): + if self.accelerator.is_main_process: + if param.requires_grad: + old_params[name] = param.data.clone().detach() return old_params def train(self) -> Any: diff --git a/benchmarks/ppo/accelerate_configs/deepspeed_zero2.yaml b/benchmarks/ppo/accelerate_configs/deepspeed_zero2.yaml index 825c1fcc..27d04d8d 100644 --- a/benchmarks/ppo/accelerate_configs/deepspeed_zero2.yaml +++ b/benchmarks/ppo/accelerate_configs/deepspeed_zero2.yaml @@ -2,18 +2,18 @@ compute_environment: LOCAL_MACHINE debug: false deepspeed_config: deepspeed_multinode_launcher: standard - offload_optimizer_device: cpu - offload_param_device: cpu + offload_optimizer_device: none + offload_param_device: none zero3_init_flag: false zero_stage: 2 - gradient_accumulation_steps: 8 + gradient_accumulation_steps: 4 distributed_type: DEEPSPEED downcast_bf16: 'no' machine_rank: 0 main_training_function: main mixed_precision: 'bf16' num_machines: 1 -num_processes: 2 +num_processes: 8 rdzv_backend: static same_network: true tpu_env: [] From 8f0660e4cc40aeafad5d48798ff90e15f42cc5d0 Mon Sep 17 00:00:00 2001 From: avecplezir Date: Wed, 7 May 2025 13:07:35 -0400 Subject: [PATCH 10/15] sm fixes --- benchmarks/continual_eval_checkpoints.py | 46 +++++++++++++------ .../accelerate_configs/deepspeed_zero3.yaml | 2 +- benchmarks/ppo/continual_ppo_trainer.py | 4 +- 3 files changed, 35 insertions(+), 17 deletions(-) diff --git a/benchmarks/continual_eval_checkpoints.py b/benchmarks/continual_eval_checkpoints.py index 016887fa..a5302966 100644 --- a/benchmarks/continual_eval_checkpoints.py +++ b/benchmarks/continual_eval_checkpoints.py @@ -1,9 +1,9 @@ -"""Evaluating checkpoints obtained from training using the dpo_continual script.""" - import glob import os +import re import torch +import wandb as wb from dataloading import init_continual_dataset from datasets import Dataset from dpo.continual_dpo_trainer import ( @@ -17,9 +17,7 @@ AutoTokenizer, ) from trl import ( - DPOConfig, ModelConfig, - ScriptArguments, TrlParser, get_kbit_device_map, get_peft_config, @@ -27,12 +25,10 @@ ) from trl.trainer.utils import SIMPLE_CHAT_TEMPLATE -import wandb as wb - def main( - script_args: ScriptArguments, - training_args: DPOConfig, + script_args: ContinualDPOArguments, + training_args: ContinualDPOConfig, model_args: ModelConfig, ) -> 
None: # Determine torch dtype and quantization configs @@ -41,6 +37,9 @@ def main( if model_args.torch_dtype in ['auto', None] else getattr(torch, model_args.torch_dtype) ) + if script_args.wandb_run_name is not None: + training_args.run_name = script_args.wandb_run_name + quantization_config = get_quantization_config(model_args) # Model & Tokenizer Setup @@ -87,14 +86,26 @@ def main( # Validate reward model paths if provided for i, _ in enumerate(continual_dataset): - reward_path = os.path.join(training_args.reward_model_path, str(i)) + reward_path = training_args.reward_model_path + '_' + str(i) if not os.path.exists(reward_path): raise FileNotFoundError( f'Reward model not found for dataset {i} at {reward_path}' ) checkpoint_paths = glob.glob(f'{script_args.checkpoint_dir}/*/*') - checkpoint_paths = sorted([ch for ch in checkpoint_paths if 'checkpoint' in ch]) + + def extract_indices(path): + match = re.search(r'dataset-(\d+)/checkpoint-(\d+)', path) + if match: + dataset_idx = int(match.group(1)) + checkpoint_idx = int(match.group(2)) + return (dataset_idx, checkpoint_idx) + else: + return (float('inf'), float('inf')) # in case of unexpected format + + checkpoint_paths = [ch for ch in checkpoint_paths if 'checkpoint' in ch] + checkpoint_paths.sort(key=extract_indices) + print('checkpoint_paths', checkpoint_paths) # Checkpoint loop for checkpoint_path in checkpoint_paths: @@ -103,14 +114,20 @@ def main( print( f'Evaluating checkpoint: {checkpoint_step} trained on dataset: {dataset_name} on all tasks' ) - adapter_name = dataset_name + checkpoint_step - model.load_adapter(checkpoint_path, adapter_name=adapter_name) + # adapter_name = dataset_name + checkpoint_step + # model.load_adapter(checkpoint_path, adapter_name=adapter_name) + model = AutoModelForCausalLM.from_pretrained( + checkpoint_path, + trust_remote_code=model_args.trust_remote_code, + **model_kwargs, + ) metrics = {} # Task Loop for i, dataset in enumerate(continual_dataset): + print('task', i) reward_model = AutoModelForSequenceClassification.from_pretrained( - training_args.reward_model_path + f'/{str(i)}', num_labels=1 + training_args.reward_model_path + f'_{str(i)}', num_labels=1 ) training_args.output_dir = f'{output_dir}/dataset-{i}' @@ -130,7 +147,8 @@ def main( ev_metrics = {f'dataset-{i}/' + k: v for k, v in ev_metrics.items()} metrics.update(ev_metrics) - wb.log(metrics) # type: ignore[attr-defined] + if training_args.local_rank in (None, -1, 0): + wb.log(metrics) # type: ignore[attr-defined] print('Evaluation completed for all tasks and checkpoints!') diff --git a/benchmarks/dpo/accelerate_configs/deepspeed_zero3.yaml b/benchmarks/dpo/accelerate_configs/deepspeed_zero3.yaml index 29507c4c..6b68067b 100644 --- a/benchmarks/dpo/accelerate_configs/deepspeed_zero3.yaml +++ b/benchmarks/dpo/accelerate_configs/deepspeed_zero3.yaml @@ -11,7 +11,7 @@ machine_rank: 0 main_training_function: main mixed_precision: bf16 num_machines: 1 -num_processes: 2 # TODO change to whatever number of gpus is used +num_processes: 8 # TODO change to whatever number of gpus is used rdzv_backend: static same_network: true tpu_env: [] diff --git a/benchmarks/ppo/continual_ppo_trainer.py b/benchmarks/ppo/continual_ppo_trainer.py index 7718b9c7..505ac3e5 100644 --- a/benchmarks/ppo/continual_ppo_trainer.py +++ b/benchmarks/ppo/continual_ppo_trainer.py @@ -311,8 +311,8 @@ def __init__( # Training scheduling args.num_total_batches = math.ceil(args.total_episodes / args.batch_size) time_tensor = torch.tensor(int(time.time()), 
device=self.accelerator.device) - time_int = broadcast(time_tensor, 0).item() - args.run_name = f'{args.exp_name}__{args.seed}__{time_int}' + broadcast(time_tensor, 0).item() + # args.run_name = f'{args.exp_name}__{args.seed}__{time_int}' self.local_seed = args.seed + self.accelerator.process_index * 100003 # Prime if args.num_sample_generations > 0: self.sample_generations_freq = max( From 0d6630b263fd320d8c28bb06cc250a5d3bd01c74 Mon Sep 17 00:00:00 2001 From: avecplezir Date: Thu, 8 May 2025 13:49:51 -0400 Subject: [PATCH 11/15] upd --- benchmarks/hf_upload_models.py | 5 +++-- benchmarks/ppo/accelerate_configs/deepspeed_zero2.yaml | 3 +-- benchmarks/ppo/ppo_continual.py | 1 + 3 files changed, 5 insertions(+), 4 deletions(-) diff --git a/benchmarks/hf_upload_models.py b/benchmarks/hf_upload_models.py index 02646994..c50c805c 100644 --- a/benchmarks/hf_upload_models.py +++ b/benchmarks/hf_upload_models.py @@ -2,6 +2,7 @@ datasets = 'aifgen-long-piecewise aifgen-lipschitz aifgen-piecewise-preference-shift aifgen-domain-preference-shift aifgen-short-piecewise CPPO-REWARD' dataset_indices = '0 1 2 3 4 5 6 7 8 9' +model = 'Qwen2-0.5B' # datasets="aifgen-long-piecewise" # dataset_indices="0" @@ -9,11 +10,11 @@ for dataset_index in dataset_indices.split(): # Upload the model to the Hugging Face Hub try: - repo_id = f'LifelongAlignment/{dataset_name}-{dataset_index}-reward-model' + repo_id = f'LifelongAlignment/{model}-Instruct_{dataset_name}_REWARD_{dataset_index}' api = HfApi() api.create_repo(repo_id, repo_type='model', exist_ok=True, private=False) - path = f'/lustre/orion/bif151/scratch/ivan.anokhin/AIF-Gen/{dataset_name}/Qwen2.5-0.5B-Reward-8gpus/Qwen2.5-0.5B-Instruct_{dataset_name}_REWARD_{dataset_index}' + path = f'/lustre/orion/bif151/scratch/ivan.anokhin/AIF-Gen/{dataset_name}/{model}-Reward-8gpus/{model}-Instruct_{dataset_name}_REWARD_{dataset_index}' print('path', path) upload_folder( diff --git a/benchmarks/ppo/accelerate_configs/deepspeed_zero2.yaml b/benchmarks/ppo/accelerate_configs/deepspeed_zero2.yaml index 27d04d8d..8046cccc 100644 --- a/benchmarks/ppo/accelerate_configs/deepspeed_zero2.yaml +++ b/benchmarks/ppo/accelerate_configs/deepspeed_zero2.yaml @@ -6,14 +6,13 @@ deepspeed_config: offload_param_device: none zero3_init_flag: false zero_stage: 2 - gradient_accumulation_steps: 4 distributed_type: DEEPSPEED downcast_bf16: 'no' machine_rank: 0 main_training_function: main mixed_precision: 'bf16' num_machines: 1 -num_processes: 8 +num_processes: 1 rdzv_backend: static same_network: true tpu_env: [] diff --git a/benchmarks/ppo/ppo_continual.py b/benchmarks/ppo/ppo_continual.py index adfb8f1c..8db6aff3 100644 --- a/benchmarks/ppo/ppo_continual.py +++ b/benchmarks/ppo/ppo_continual.py @@ -100,6 +100,7 @@ def main( if '.' 
in clean_dataset_name: clean_dataset_name = clean_dataset_name.split('.')[0] + print(f'Training PPO on {len(continual_dataset)} tasks') # check if the reward models are present either in the path or in the hub if training_args.reward_model_path is not None: for i in range(len(continual_dataset)): From e3846b5bf1cd60886c8a17103a95f6e3a1b37f18 Mon Sep 17 00:00:00 2001 From: avecplezir Date: Thu, 8 May 2025 21:29:16 -0400 Subject: [PATCH 12/15] logging --- benchmarks/dataloading.py | 10 --- benchmarks/dpo/continual_dpo_trainer.py | 115 +++++++++++++++--------- 2 files changed, 71 insertions(+), 54 deletions(-) diff --git a/benchmarks/dataloading.py b/benchmarks/dataloading.py index 65b5dd7b..c6d8c704 100644 --- a/benchmarks/dataloading.py +++ b/benchmarks/dataloading.py @@ -89,22 +89,12 @@ def init_continual_dataset( data = ContinualAlignmentDataset.from_json(dataset) except OSError: # need to try downloading from hub try: - # json_name = dataset.split('/', )[-1] # print(f'Downloading {json_name} from Hugging Face Hub...') local_path = hf_hub_download( repo_id=f'LifelongAlignment/{dataset}', filename='data.json', repo_type='dataset', ) - # local_path = hf_hub_download( - # repo_id=f"LifelongAlignment/{dataset}", filename=f'{dataset}.json', repo_type='dataset' - # ) - # local_path = hf_hub_download( - # repo_id=f"LifelongAlignment/{dataset}", filename=f'{json_name}.json', repo_type='dataset' - # ) - # local_path = hf_hub_download( - # repo_id=dataset, filename='dataset.json', repo_type='dataset' - # ) data = ContinualAlignmentDataset.from_json(local_path) except Exception as e: raise ValueError(f'Error loading dataset: {e}') diff --git a/benchmarks/dpo/continual_dpo_trainer.py b/benchmarks/dpo/continual_dpo_trainer.py index 44374351..dc93f5ff 100644 --- a/benchmarks/dpo/continual_dpo_trainer.py +++ b/benchmarks/dpo/continual_dpo_trainer.py @@ -13,6 +13,8 @@ from accelerate import Accelerator, PartialState from accelerate.utils import gather_object from datasets import Dataset +from rich.console import Console +from rich.table import Table from torch.utils.data import DataLoader from transformers import ( BaseImageProcessor, @@ -328,23 +330,31 @@ def log( eval_policy_metrics = self.evaluate_policy() logs.update(eval_policy_metrics) - # TODO: Only generation sample completions every x steps - do_generate_completions = True - if do_generate_completions: - self._generate_completions() - torch.cuda.empty_cache() + # TODO: Only generation sample completions every x steps + do_generate_completions = True + if do_generate_completions: + self._generate_completions() + torch.cuda.empty_cache() return super().log(logs, start_time) def _generate_completions(self) -> None: # Config from: https://github.com/huggingface/trl/blob/56e57662053e2d0cc6302dad404820b0c0ec6a91/trl/trainer/ppo_trainer.py#L688 + # generation_config = GenerationConfig( + # max_new_tokens=53, + # temperature=(0.01 + 1e-7), + # top_k=0.0, + # top_p=1.0, + # do_sample=True, + # ) generation_config = GenerationConfig( - max_new_tokens=53, - temperature=(0.01 + 1e-7), + max_new_tokens=self.args.response_length, + temperature=(self.args.temperature + 1e-7), top_k=0.0, top_p=1.0, do_sample=True, ) + table = defaultdict(list) with torch.no_grad(): with unwrap_model_for_generation( @@ -352,44 +362,61 @@ def _generate_completions(self) -> None: self.accelerator, gather_deepspeed3_params=None, ) as unwrapped_model: - for batch in self.eval_dataloader: - query = batch['input_ids'] - context_length = query.shape[1] - query_response, _ = 
batch_generation( - unwrapped_model, - query, - query.shape[0], - self.processing_class.pad_token_id, - generation_config, - ) - response = query_response[:, context_length:] - postprocessed_response = response - postprocessed_query_response = torch.cat( - (query, postprocessed_response), 1 - ) - _, score, _ = get_reward( - self.reward_model, - postprocessed_query_response, - self.processing_class.pad_token_id, - context_length, - ) + if self.eval_policy_dataloader is not None: + for batch in self.eval_policy_dataloader: + query = batch['input_ids'] + context_length = query.shape[1] + query_response, _ = batch_generation( + unwrapped_model, + query, + query.shape[0], + self.processing_class.pad_token_id, + generation_config, + ) + response = query_response[:, context_length:] + postprocessed_response = response + postprocessed_query_response = torch.cat( + (query, postprocessed_response), 1 + ) + _, score, _ = get_reward( + self.reward_model, + postprocessed_query_response, + self.processing_class.pad_token_id, + context_length, + ) - queries = gather_object( - self.processing_class.batch_decode( - query, skip_special_tokens=True + queries = gather_object( + self.processing_class.batch_decode( + query, skip_special_tokens=True + ) ) - ) - responses = gather_object( - self.processing_class.batch_decode(postprocessed_response) - ) - scores = ( - self.accelerator.gather_for_metrics(score).float().cpu().numpy() - ) - table['query'].extend(queries) - table['model response'].extend(responses) - table['score'].extend(scores) - break + responses = gather_object( + self.processing_class.batch_decode(postprocessed_response) + ) + scores = ( + self.accelerator.gather_for_metrics(score) + .float() + .cpu() + .numpy() + ) + table['query'].extend(queries) + table['model response'].extend(responses) + table['score'].extend(scores) + break df = pd.DataFrame(table) - if self.accelerator.is_main_process and wb.run is not None: - wb.log({'completions': wb.Table(dataframe=df)}) + + if self.accelerator.is_main_process: + print_rich_table(df.iloc[0 : 0 + 5]) + if wb.run is not None: + wb.log({'completions': wb.Table(dataframe=df)}) + + +def print_rich_table(df: pd.DataFrame) -> Table: + console = Console() + table = Table(show_lines=True) + for column in df.columns: + table.add_column(column) + for _, row in df.iterrows(): + table.add_row(*row.astype(str).tolist()) + console.print(table) From 305f08ad4e439f072ba4e35e6b8410a217cda3df Mon Sep 17 00:00:00 2001 From: avecplezir Date: Thu, 8 May 2025 21:53:33 -0400 Subject: [PATCH 13/15] logging --- benchmarks/dpo/continual_dpo_trainer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/dpo/continual_dpo_trainer.py b/benchmarks/dpo/continual_dpo_trainer.py index dc93f5ff..ce507021 100644 --- a/benchmarks/dpo/continual_dpo_trainer.py +++ b/benchmarks/dpo/continual_dpo_trainer.py @@ -406,7 +406,7 @@ def _generate_completions(self) -> None: df = pd.DataFrame(table) - if self.accelerator.is_main_process: + if self.accelerator.is_main_process or self.accelerator is None: print_rich_table(df.iloc[0 : 0 + 5]) if wb.run is not None: wb.log({'completions': wb.Table(dataframe=df)}) From fcdaaaaa2b9f612d156274898373a658ea15cfe2 Mon Sep 17 00:00:00 2001 From: avecplezir Date: Fri, 9 May 2025 18:45:17 -0400 Subject: [PATCH 14/15] upd eval scripts --- benchmarks/continual_eval_checkpoints.py | 9 ++ benchmarks/dpo/continual_dpo_trainer.py | 8 +- benchmarks/parallel_eval_checkpoints.py | 142 +++++++++++++++++++++++ 3 files changed, 158 insertions(+), 1 
deletion(-) create mode 100644 benchmarks/parallel_eval_checkpoints.py diff --git a/benchmarks/continual_eval_checkpoints.py b/benchmarks/continual_eval_checkpoints.py index a5302966..6f748b1d 100644 --- a/benchmarks/continual_eval_checkpoints.py +++ b/benchmarks/continual_eval_checkpoints.py @@ -146,6 +146,15 @@ def extract_indices(path): ev_metrics = trainer.evaluate() ev_metrics = {f'dataset-{i}/' + k: v for k, v in ev_metrics.items()} metrics.update(ev_metrics) + if training_args.local_rank in (None, -1, 0): + wb.log({f'task/{dataset_name}/{k}': v for k, v in ev_metrics.items()}) + + # If using DeepSpeed through Accelerate, tear down the engine after training. + if hasattr(trainer, 'deepspeed') and trainer.deepspeed is not None: + # Remove reference to the DeepSpeed engine to allow proper cleanup. + del trainer.deepspeed + # Free cached GPU memory. + torch.cuda.empty_cache() if training_args.local_rank in (None, -1, 0): wb.log(metrics) # type: ignore[attr-defined] diff --git a/benchmarks/dpo/continual_dpo_trainer.py b/benchmarks/dpo/continual_dpo_trainer.py index ce507021..024cbc08 100644 --- a/benchmarks/dpo/continual_dpo_trainer.py +++ b/benchmarks/dpo/continual_dpo_trainer.py @@ -286,7 +286,10 @@ def evaluate_policy(self) -> dict: with torch.no_grad(): if self.eval_policy_dataloader is not None: - for batch in self.eval_policy_dataloader: + for idx, batch in enumerate(self.eval_policy_dataloader): + print( + f'Processing batch {idx} out of {len(self.eval_policy_dataloader)}' + ) query = batch['input_ids'].to(self.accelerator.device) context_length = query.shape[1] with unwrap_model_for_generation( @@ -333,6 +336,7 @@ def log( # TODO: Only generation sample completions every x steps do_generate_completions = True if do_generate_completions: + print('Generating completions...') self._generate_completions() torch.cuda.empty_cache() @@ -355,6 +359,7 @@ def _generate_completions(self) -> None: do_sample=True, ) + self.model.eval() table = defaultdict(list) with torch.no_grad(): with unwrap_model_for_generation( @@ -404,6 +409,7 @@ def _generate_completions(self) -> None: table['score'].extend(scores) break + self.model.train() df = pd.DataFrame(table) if self.accelerator.is_main_process or self.accelerator is None: diff --git a/benchmarks/parallel_eval_checkpoints.py b/benchmarks/parallel_eval_checkpoints.py new file mode 100644 index 00000000..158deb65 --- /dev/null +++ b/benchmarks/parallel_eval_checkpoints.py @@ -0,0 +1,142 @@ +import os + +import torch +import wandb as wb +from dataloading import init_continual_dataset +from datasets import Dataset +from dpo.continual_dpo_trainer import ( + ContinualDPOArguments, + ContinualDPOConfig, + ContinualDPOTrainer, +) +from transformers import ( + AutoModelForCausalLM, + AutoModelForSequenceClassification, + AutoTokenizer, +) +from trl import ( + ModelConfig, + TrlParser, + get_kbit_device_map, + get_peft_config, + get_quantization_config, +) +from trl.trainer.utils import SIMPLE_CHAT_TEMPLATE + + +def main( + script_args: ContinualDPOArguments, + training_args: ContinualDPOConfig, + model_args: ModelConfig, +) -> None: + # Determine torch dtype and quantization configs + torch_dtype = ( + model_args.torch_dtype + if model_args.torch_dtype in ['auto', None] + else getattr(torch, model_args.torch_dtype) + ) + if script_args.wandb_run_name is not None: + training_args.run_name = script_args.wandb_run_name + + quantization_config = get_quantization_config(model_args) + + # Model & Tokenizer Setup + model_kwargs = dict( + 
revision=model_args.model_revision, + attn_implementation=model_args.attn_implementation, + torch_dtype=torch_dtype, + use_cache=False if training_args.gradient_checkpointing else True, + device_map=get_kbit_device_map() if quantization_config is not None else None, + quantization_config=quantization_config, + ) + + # Checkpoint loop + checkpoint_path = script_args.checkpoint_dir + dataset_name = checkpoint_path.split('/')[-2].replace('.', '') + checkpoint_step = checkpoint_path.split('/')[-1].replace('.', '') + print( + f'Evaluating checkpoint: {checkpoint_step} trained on dataset: {dataset_name} on all tasks' + ) + checkpoint_name = dataset_name + '_' + checkpoint_step + print('checkpoint_name', checkpoint_name) + + model = AutoModelForCausalLM.from_pretrained( + checkpoint_path, + trust_remote_code=model_args.trust_remote_code, + **model_kwargs, + ) + peft_config = get_peft_config(model_args) + + ref_model = AutoModelForCausalLM.from_pretrained( + model_args.model_name_or_path, + trust_remote_code=model_args.trust_remote_code, + **model_kwargs, + ) + + # Load tokenizer and set chat template if needed + tokenizer = AutoTokenizer.from_pretrained( + model_args.model_name_or_path, trust_remote_code=model_args.trust_remote_code + ) + if tokenizer.pad_token is None: + tokenizer.pad_token = tokenizer.eos_token + if tokenizer.chat_template is None: + tokenizer.chat_template = SIMPLE_CHAT_TEMPLATE + + # Initialize continual dataset + continual_dataset: list[dict[str, Dataset]] = init_continual_dataset( + script_args.dataset_name, + mock=training_args.mock, + tokenizer=tokenizer, + tools=getattr(training_args, 'tools', None), + ) + output_dir = training_args.output_dir + + # Validate reward model paths if provided + for i, _ in enumerate(continual_dataset): + reward_path = training_args.reward_model_path + '_' + str(i) + if not os.path.exists(reward_path): + raise FileNotFoundError( + f'Reward model not found for dataset {i} at {reward_path}' + ) + + # Task Loop + for i, dataset in enumerate(continual_dataset): + print('task', i) + reward_model = AutoModelForSequenceClassification.from_pretrained( + training_args.reward_model_path + f'_{str(i)}', num_labels=1 + ) + + training_args.output_dir = f'{output_dir}/dataset-{i}' + # using ContinualDPOTrainer for all pipelines (PPO, DPO, COPR, ..) only for evaluation + trainer = ContinualDPOTrainer( + args=training_args, + processing_class=tokenizer, + model=model, + ref_model=ref_model, + reward_model=reward_model, + train_dataset=dataset[script_args.dataset_test_split], + eval_dataset=dataset[script_args.dataset_test_split], + peft_config=peft_config, + ) + + print('evaluating...') + ev_metrics = trainer.evaluate() + # ev_metrics = {f'dataset-{i}/' + k: v for k, v in ev_metrics.items()} + if training_args.local_rank in (None, -1, 0): + print('ev_metrics', ev_metrics) + wb.log(ev_metrics) + wb.log({f'{checkpoint_name}/{k}': v for k, v in ev_metrics.items()}) + + # If using DeepSpeed through Accelerate, tear down the engine after training. + if hasattr(trainer, 'deepspeed') and trainer.deepspeed is not None: + # Remove reference to the DeepSpeed engine to allow proper cleanup. + del trainer.deepspeed + # Free cached GPU memory. 
From ef6ef390fd46c2f785463a99ccee16842b8c4cf8 Mon Sep 17 00:00:00 2001
From: avecplezir
Date: Tue, 13 May 2025 11:37:59 -0400
Subject: [PATCH 15/15] add parallel_eval_checkpoints

---
 .../dpo_ewc/continual_dpo_EWC_trainer.py      | 76 +++++++++----------
 benchmarks/parallel_eval_checkpoints.py       | 39 ++++++++--
 .../accelerate_configs/deepspeed_zero2.yaml   |  2 +-
 3 files changed, 69 insertions(+), 48 deletions(-)

diff --git a/benchmarks/dpo_ewc/continual_dpo_EWC_trainer.py b/benchmarks/dpo_ewc/continual_dpo_EWC_trainer.py
index 45b2af0c..5ee13556 100644
--- a/benchmarks/dpo_ewc/continual_dpo_EWC_trainer.py
+++ b/benchmarks/dpo_ewc/continual_dpo_EWC_trainer.py
@@ -116,37 +116,50 @@ def compute_ewc_loss(self) -> torch.Tensor:
             # No previous tasks, so no regularization needed
             return torch.tensor(0.0, device=self.accelerator.device)
 
-        ewc_loss = torch.tensor(0.0, device=self.accelerator.device)
-
         # Calculate the EWC penalty for each parameter
         model = self.accelerator.unwrap_model(self.model)
-
+        ewc_loss = torch.tensor(0.0, device=self.accelerator.device)
         for name, param in model.named_parameters():
-            if name not in ContinualDPOEWCTrainer.class_fisher_information:
-                continue
-            if not param.requires_grad:
+            if not param.requires_grad or name not in self.class_fisher_information:
                 continue
-
-            if (
-                name in ContinualDPOEWCTrainer.class_fisher_information
-                and param.requires_grad
-            ):
-                # Get the Fisher information and old parameter values
-                fisher = ContinualDPOEWCTrainer.class_fisher_information[name].to(
-                    param.device
-                )
-
+            # self.accelerator.print(name, param.shape)
             with deepspeed.zero.GatheredParameters([param], modifier_rank=0):
                 if self.accelerator.is_main_process:
+                    # Get the Fisher information and old parameter values
+                    fisher = ContinualDPOEWCTrainer.class_fisher_information[name].to(
+                        self.accelerator.device
+                    )
                     old_param = ContinualDPOEWCTrainer.class_old_params[name].to(
-                        param.device
+                        self.accelerator.device
                     )
+                    # Calculate squared distance weighted by Fisher information
                     delta = param - old_param
                     ewc_loss = ewc_loss + (fisher * delta.pow(2)).sum()
 
-        # Apply the EWC lambda coefficient and return
-        return 0.5 * self.ewc_lambda * ewc_loss
+                    # Apply the EWC lambda coefficient and return
+                    ewc_loss = 0.5 * self.ewc_lambda * ewc_loss
+                else:
+                    # Non-main processes should not compute EWC loss
+                    ewc_loss = torch.tensor(0.0, device=self.accelerator.device)
+
+        ewc_loss = self.accelerator.reduce(ewc_loss, 'mean')
+        return ewc_loss
+
+    def store_current_parameters(self) -> Dict[str, torch.Tensor]:
+        """Store the current model parameters.
+
+        Returns:
+            Dictionary mapping parameter names to their current values
+        """
+        model = self.accelerator.unwrap_model(self.model)
+        old_params = {}
+        for name, param in model.named_parameters():
+            with deepspeed.zero.GatheredParameters([param], modifier_rank=0):
+                if self.accelerator.is_main_process:
+                    if param.requires_grad:
+                        old_params[name] = param.data.clone().detach()
+        return old_params
 
     def compute_fisher_information(
         self, num_samples: int = 120
@@ -161,11 +174,6 @@ def compute_fisher_information(
         """
         # Get unwrapped model for computing Fisher
         model = self.accelerator.unwrap_model(self.model)
-        self.accelerator.device
-
-        # Make sure parameters require gradients
-        for param in model.parameters():
-            param.requires_grad_(True)
 
         # Initialize fisher information dictionary
         fisher_info = {}
@@ -206,7 +214,9 @@ def compute_fisher_information(
             model.zero_grad()
 
             try:
-                loss, _ = self.compute_loss(model, batch, return_outputs=True)
+                loss, _ = super(ContinualDPOEWCTrainer, self).compute_loss(
+                    model, batch, return_outputs=True
+                )
 
                 # Check if loss requires gradient
                 if not loss.requires_grad:
@@ -238,22 +248,6 @@ def compute_fisher_information(
         print(f'Computed Fisher information for {sample_count} examples')
         return fisher_info
 
-    def store_current_parameters(self) -> Dict[str, torch.Tensor]:
-        """Store the current model parameters.
-
-        Returns:
-            Dictionary mapping parameter names to their current values
-        """
-        model = self.accelerator.unwrap_model(self.model)
-        old_params = {}
-
-        for name, param in model.named_parameters():
-            with deepspeed.zero.GatheredParameters([param], modifier_rank=0):
-                if self.accelerator.is_main_process:
-                    if param.requires_grad:
-                        old_params[name] = param.data.clone().detach()
-        return old_params
-
     def train(self) -> Any:
         """Override train method to incorporate EWC regularization."""
         # Regular training
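The reworked compute_ewc_loss above is the standard elastic weight consolidation penalty, 0.5 * lambda * sum_i F_i * (theta_i - theta_i_star)^2, with the added complication that ZeRO-3 shards must be gathered before current and stored parameters can be compared. A minimal single-process sketch of the same penalty, without DeepSpeed and with illustrative names:

import torch


def ewc_penalty(
    model: torch.nn.Module,
    fisher: dict[str, torch.Tensor],
    old_params: dict[str, torch.Tensor],
    ewc_lambda: float,
) -> torch.Tensor:
    # Quadratic penalty pulling parameters back toward their values after the
    # previous task, weighted per-parameter by the Fisher information.
    device = next(model.parameters()).device
    loss = torch.zeros((), device=device)
    for name, param in model.named_parameters():
        if not param.requires_grad or name not in fisher:
            continue
        delta = param - old_params[name].to(param.device)
        loss = loss + (fisher[name].to(param.device) * delta.pow(2)).sum()
    return 0.5 * ewc_lambda * loss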
diff --git a/benchmarks/parallel_eval_checkpoints.py b/benchmarks/parallel_eval_checkpoints.py
index 158deb65..c4921819 100644
--- a/benchmarks/parallel_eval_checkpoints.py
+++ b/benchmarks/parallel_eval_checkpoints.py
@@ -9,6 +9,7 @@
     ContinualDPOConfig,
     ContinualDPOTrainer,
 )
+from safetensors import safe_open
 from transformers import (
     AutoModelForCausalLM,
     AutoModelForSequenceClassification,
     AutoTokenizer,
 )
@@ -30,6 +31,7 @@ def main(
     model_args: ModelConfig,
 ) -> None:
     # Determine torch dtype and quantization configs
+
     torch_dtype = (
         model_args.torch_dtype
         if model_args.torch_dtype in ['auto', None]
         else getattr(torch, model_args.torch_dtype)
     )
@@ -52,7 +54,11 @@ def main(
 
     # Checkpoint loop
     checkpoint_path = script_args.checkpoint_dir
-    dataset_name = checkpoint_path.split('/')[-2].replace('.', '')
+    if 'PPO' in checkpoint_path:
+        dataset_name = 'dataset-' + checkpoint_path.split('/')[-2].split('_')[-1]
+    else:
+        dataset_name = checkpoint_path.split('/')[-2].replace('.', '')
+
     checkpoint_step = checkpoint_path.split('/')[-1].replace('.', '')
     print(
         f'Evaluating checkpoint: {checkpoint_step} trained on dataset: {dataset_name} on all tasks'
@@ -60,11 +66,32 @@ def main(
     checkpoint_name = dataset_name + '_' + checkpoint_step
     print('checkpoint_name', checkpoint_name)
 
-    model = AutoModelForCausalLM.from_pretrained(
-        checkpoint_path,
-        trust_remote_code=model_args.trust_remote_code,
-        **model_kwargs,
-    )
+    if 'PPO' in checkpoint_path:
+        # remove the prefix 'policy.' from the keys to load the model; skip the critic and value model
+        prefix = 'policy.'
+        with safe_open(
+            checkpoint_path + '/model.safetensors', framework='pt', device='cpu'
+        ) as f:
+            clean_sd = {
+                k[len(prefix) :] if k.startswith(prefix) else k: f.get_tensor(k)
+                for k in f.keys()
+                if not (
+                    k.startswith('critic_backbone.') or k.startswith('value_model.')
+                )
+            }
+
+        model = AutoModelForCausalLM.from_pretrained(
+            checkpoint_path,
+            trust_remote_code=model_args.trust_remote_code,
+            state_dict=clean_sd,
+            **model_kwargs,
+        )
+    else:
+        model = AutoModelForCausalLM.from_pretrained(
+            checkpoint_path,
+            trust_remote_code=model_args.trust_remote_code,
+            **model_kwargs,
+        )
     peft_config = get_peft_config(model_args)
 
     ref_model = AutoModelForCausalLM.from_pretrained(
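The comprehension above renames policy weights and drops value-head tensors so that a TRL PPO checkpoint can be loaded as a plain causal LM. On a toy state dict, with illustrative key names, the transformation behaves as follows:

raw = {
    'policy.model.embed_tokens.weight': 'w0',
    'policy.lm_head.weight': 'w1',
    'value_model.score.weight': 'w2',
    'critic_backbone.layers.0.weight': 'w3',
}
prefix = 'policy.'
clean_sd = {
    k[len(prefix):] if k.startswith(prefix) else k: v
    for k, v in raw.items()
    if not (k.startswith('critic_backbone.') or k.startswith('value_model.'))
}
# clean_sd == {'model.embed_tokens.weight': 'w0', 'lm_head.weight': 'w1'}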
diff --git a/benchmarks/ppo/accelerate_configs/deepspeed_zero2.yaml b/benchmarks/ppo/accelerate_configs/deepspeed_zero2.yaml
index 8046cccc..239b14ac 100644
--- a/benchmarks/ppo/accelerate_configs/deepspeed_zero2.yaml
+++ b/benchmarks/ppo/accelerate_configs/deepspeed_zero2.yaml
@@ -12,7 +12,7 @@ machine_rank: 0
 main_training_function: main
 mixed_precision: 'bf16'
 num_machines: 1
-num_processes: 1
+num_processes: 8
 rdzv_backend: static
 same_network: true
 tpu_env: []