diff --git a/benchmarks/cppo/README.md b/benchmarks/cppo/README.md index aea2af7f..d28a9163 100644 --- a/benchmarks/cppo/README.md +++ b/benchmarks/cppo/README.md @@ -16,8 +16,8 @@ uv sync --group benchmarks uv run benchmarks/cppo/cppo.py \ --dataset_name benchmarks/continual_data_debug.json \ --sft_model_path Qwen/Qwen2-0.5B-Instruct \ - --value_model_path Shahradmz/Qwen2-0.5B-Instruct_continual_data_debug_REWARD_0 \ - --reward_model_path Shahradmz/Qwen2-0.5B-Instruct_continual_data_debug_REWARD \ + --value_model_path LifelongAlignment/Qwen2.5-0.5B-Instruct_CPPO_REWARD_0 \ + --reward_model_path LifelongAlignment/Qwen2.5-0.5B-Instruct_CPPO_REWARD \ --learning_rate 5.0e-6 \ --num_train_epochs 1 \ --gradient_accumulation_steps 8 \ @@ -31,7 +31,7 @@ uv run benchmarks/cppo/cppo.py \ --no_remove_unused_columns \ --use_peft \ --lora_r 32 \ - --lora_alpha 16 \ + --lora_alpha 16 --push_to_hub True ``` diff --git a/benchmarks/cppo/accelerate_configs/deepspeed_zero2.yaml b/benchmarks/cppo/accelerate_configs/deepspeed_zero2.yaml index f369ef96..771e5fab 100644 --- a/benchmarks/cppo/accelerate_configs/deepspeed_zero2.yaml +++ b/benchmarks/cppo/accelerate_configs/deepspeed_zero2.yaml @@ -12,7 +12,7 @@ machine_rank: 0 main_training_function: main mixed_precision: 'bf16' num_machines: 1 -num_processes: 2 +num_processes: 4 rdzv_backend: static same_network: true tpu_env: [] diff --git a/benchmarks/cppo/cppo.py b/benchmarks/cppo/cppo.py index 57df5c48..d1ce2dd0 100644 --- a/benchmarks/cppo/cppo.py +++ b/benchmarks/cppo/cppo.py @@ -17,7 +17,7 @@ get_peft_config, get_quantization_config, ) -from trl.trainer.utils import SIMPLE_CHAT_TEMPLATE +from trl import setup_chat_format import wandb as wb from benchmarks.dataloading import init_continual_dataset @@ -50,11 +50,6 @@ def main( # Load main model and (optionally) reference model model = str(training_args.sft_model_path) - policy = AutoModelForCausalLM.from_pretrained( - training_args.sft_model_path, - trust_remote_code=model_args.trust_remote_code, - **model_kwargs, - ) peft_config = get_peft_config(model_args) if peft_config is None: ref_policy = AutoModelForCausalLM.from_pretrained( @@ -65,22 +60,11 @@ def main( else: ref_policy = None - # Load value model and policy model (main model) - value_model = AutoModelForSequenceClassification.from_pretrained( - script_args.value_model_path, - trust_remote_code=model_args.trust_remote_code, - num_labels=1, - ) - # Load tokenizer and set chat template if needed tokenizer = AutoTokenizer.from_pretrained( training_args.sft_model_path, trust_remote_code=model_args.trust_remote_code, ) - if tokenizer.pad_token is None: - tokenizer.pad_token = tokenizer.eos_token - if tokenizer.chat_template is None: - tokenizer.chat_template = SIMPLE_CHAT_TEMPLATE # Initialize continual dataset continual_dataset: list[dict[str, Dataset]] = init_continual_dataset( @@ -114,6 +98,34 @@ def main( old_logprobs, old_rewards = None, None for i, dataset in enumerate(continual_dataset): + # Load main model and (optionally) reference model + if i == 0: + model_path = training_args.sft_model_path + value_model_path = script_args.value_model_path + else: + model_path = os.path.join(training_args.output_dir, 'last') + value_model_path = os.path.join(training_args.output_dir, 'last', 'value_model') + policy = AutoModelForCausalLM.from_pretrained( + pretrained_model_name_or_path=model_path, + trust_remote_code=model_args.trust_remote_code, + **model_kwargs, + ) + + # Load value model and policy model (main model) + try: + value_model = 
AutoModelForSequenceClassification.from_pretrained( + value_model_path, + trust_remote_code=model_args.trust_remote_code, + num_labels=1, + ) + except OSError: + # Maybe it was saved as safetensors? + value_model = AutoModelForSequenceClassification.from_pretrained( + value_model_path, + trust_remote_code=model_args.trust_remote_code, + num_labels=1, + from_tf=True, # or use `subfolder="safetensors"` if you saved a .safetensors file + ) # Build custom repository name for this task custom_repo_name = ( model.split('/')[-1] + '_' + clean_dataset_name + '_CPPO_' + str(i) @@ -127,6 +139,22 @@ def main( training_args.reward_model_path + '_' + str(i), num_labels=1 ) + for idx, _model in enumerate([policy, value_model, reward_model]): + # Align padding tokens between tokenizer and model + _model.config.pad_token_id = tokenizer.pad_token_id + + # Use ChatML format if the tokenizer doesn't already have a chat template + if tokenizer.chat_template is None: + updated_model, updated_tokenizer = setup_chat_format(_model, tokenizer) + # Actually store the updated model + if idx == 0: + policy = updated_model + elif idx == 1: + value_model = updated_model + else: + reward_model = updated_model + tokenizer = updated_tokenizer + ################ # Training and Evaluation ################ @@ -163,21 +191,33 @@ def main( trainer.log_metrics(f'eval/dataset/{i}', metrics) trainer.save_metrics('eval', metrics) - # Log metrics to WandB - wb.log({'eval': {'last': metrics}}) - wb.log({f'task/{custom_repo_name}/last': metrics}) + if training_args.local_rank in (None, -1, 0): + # Log metrics to WandB + wb.log({'eval': {'dataset': i, 'last': metrics}}) + wb.log({f'task/{custom_repo_name}/dataset/{i}': metrics}) - # Save model checkpoint and optionally push - if not training_args.push_to_hub: - trainer.save_model(os.path.join(training_args.output_dir, 'last')) - else: + last_dir = os.path.join(training_args.output_dir, 'last') + policy.save_pretrained(last_dir) + tokenizer.save_pretrained(last_dir) + + value_model_dir = os.path.join(last_dir, 'value_model') + os.makedirs(value_model_dir, exist_ok=True) + value_model.save_pretrained(value_model_dir, + safe_serialization=False) + + trainer.accelerator.wait_for_everyone() + + if training_args.push_to_hub: trainer.push_to_hub( model_name=custom_repo_name, - dataset_name='CPPO_' + clean_dataset_name + '_' + str(i), + dataset_name='Continual_CPPO_' + clean_dataset_name + '_' + str(i), ) ref_policy = None old_logprobs, old_rewards = trainer.old_logprobs, trainer.old_rewards + if hasattr(trainer, 'deepspeed') and trainer.deepspeed is not None: + del trainer.deepspeed + torch.cuda.empty_cache() print('Training completed for all tasks!') diff --git a/benchmarks/cppo/cppo_trainer.py b/benchmarks/cppo/cppo_trainer.py index f941c72f..0424aecc 100644 --- a/benchmarks/cppo/cppo_trainer.py +++ b/benchmarks/cppo/cppo_trainer.py @@ -118,13 +118,6 @@ class CPPOConfig(PPOConfig): class CPPOTrainer(PPOTrainer): - # Shared accelerator instance across all trainer instances - shared_accelerator: Optional[Accelerator] = None - current_task_index: Optional[int] = None - policy_value_models: Any # the policy and value model wrapper - ds_wrapped_models: Any # TODO work with this after deepspeed is initialized - accelerator: Accelerator # now non-optional after creation - ref_model: Optional[Union[PreTrainedModel, nn.Module]] = None def __init__( self, @@ -153,6 +146,14 @@ def __init__( old_logprobs: Optional[Tensor] = None, old_rewards: Optional[Tensor] = None, ): + self.shared_accelerator: 
Optional[Accelerator] = None + self.current_task_index: Optional[int] = None + self.policy_value_models: Any = None # the policy and value model wrapper + self.ds_wrapped_models: Any = None # TODO work with this after deepspeed is initialized + self.accelerator: Accelerator = None # now non-optional after creation + self.ref_model: Optional[Union[PreTrainedModel, nn.Module]] = None + + # Basic setup and validation if args is None: raise ValueError('`args` cannot be None') @@ -175,18 +176,18 @@ def __init__( # Initialize task tracking self._stored_metrics: Dict = defaultdict(lambda: defaultdict(list)) self.current_task = ( - f'task_{CPPOTrainer.current_task_index}' - if CPPOTrainer.current_task_index is not None + f'task_{self.current_task_index}' + if self.current_task_index is not None else 'task_0' ) # Set up task index tracking is_first_task = False - if CPPOTrainer.current_task_index is None: - CPPOTrainer.current_task_index = 0 + if self.current_task_index is None: + self.current_task_index = 0 is_first_task = True else: - CPPOTrainer.current_task_index += 1 + self.current_task_index += 1 self.is_final_eval = False # Store basic configuration @@ -247,7 +248,7 @@ def __init__( else: self.ref_model = create_reference_model(self.policy_model) - CPPOTrainer.class_ref_model = self.ref_model + self.class_ref_model = self.ref_model else: # For subsequent tasks, reuse the reference model @@ -284,14 +285,14 @@ def __init__( args.total_episodes = int(args.num_train_epochs * self.train_dataset_len) # Setup accelerator - shared across all tasks - if CPPOTrainer.shared_accelerator is None: + if self.shared_accelerator is None: accelerator = Accelerator( gradient_accumulation_steps=args.gradient_accumulation_steps ) self.accelerator = accelerator - CPPOTrainer.shared_accelerator = accelerator + self.shared_accelerator = accelerator else: - self.accelerator = CPPOTrainer.shared_accelerator + self.accelerator = self.shared_accelerator self.gather_function = self.accelerator.gather_for_metrics if ( 'use_gather_object' @@ -331,7 +332,7 @@ def __init__( args.num_total_batches = math.ceil(args.total_episodes / args.batch_size) time_tensor = torch.tensor(int(time.time()), device=self.accelerator.device) time_int = broadcast(time_tensor, 0).item() - args.run_name = f'{args.exp_name}__{args.seed}__{time_int}' + # args.run_name = f'{args.exp_name}__{args.seed}__{time_int}' self.local_seed = args.seed + self.accelerator.process_index * 100003 # Prime if args.num_sample_generations > 0: self.sample_generations_freq = max( @@ -353,11 +354,12 @@ def __init__( # Create policy and value model wrapper self.model = PolicyAndValueWrapper(self.policy_model, self.value_model) - CPPOTrainer.policy_value_models = self.model + self.policy_value_models = self.model self.model.config = self.policy_model.config # needed for pushing to hub else: + disable_dropout_in_model(self.reward_model) # Subsequent tasks: Reuse existing model - self.model = CPPOTrainer.policy_value_models + self.model = self.policy_value_models self.model.config = self.policy_model.config # needed for pushing to hub # Always create optimizer and scheduler for each task @@ -425,14 +427,14 @@ def __init__( self.model, self.optimizer, self.dataloader = self.accelerator.prepare( self.model, self.optimizer, self.dataloader ) - CPPOTrainer.ds_wrapped_models = self.model + self.ds_wrapped_models = self.model else: # For subsequent tasks, only prepare optimizer and dataloader self.optimizer, self.dataloader = self.accelerator.prepare( self.optimizer, 
self.dataloader ) # Reuse the model from the first task - self.model = CPPOTrainer.ds_wrapped_models + self.model = self.ds_wrapped_models torch.manual_seed(self.local_seed) # Reset local seed @@ -469,10 +471,10 @@ def __init__( args.fp16, args.bf16, ) - CPPOTrainer.class_ref_model = self.ref_model + self.class_ref_model = self.ref_model else: # Reuse prepared ref_model on subsequent tasks - self.ref_model = CPPOTrainer.class_ref_model + self.ref_model = self.class_ref_model else: # Non-DeepSpeed path if self.ref_model is None: @@ -483,10 +485,10 @@ def __init__( elif is_first_task: # Only move ref_model to device on first task self.ref_model = self.ref_model.to(self.accelerator.device) # type: ignore - CPPOTrainer.class_ref_model = self.ref_model + self.class_ref_model = self.ref_model else: # Reuse ref_model on subsequent tasks - self.ref_model = CPPOTrainer.class_ref_model + self.ref_model = self.class_ref_model # Always move reward model to device self.reward_model = self.reward_model.to(self.accelerator.device) # type: ignore @@ -1019,15 +1021,15 @@ def _get_mask(coef: Optional[Tensor]) -> Tensor: if self.ref_model is None and original_ref_model is not None: print('Reference model was cleared during training - restoring') self.ref_model = original_ref_model - CPPOTrainer.class_ref_model = original_ref_model + self.class_ref_model = original_ref_model # Ensure the class variable is updated - CPPOTrainer.class_ref_model = self.ref_model + self.class_ref_model = self.ref_model if self.is_deepspeed_enabled: - CPPOTrainer.ds_wrapped_models = self.deepspeed + self.ds_wrapped_models = self.deepspeed else: - CPPOTrainer.ds_wrapped_models = self.model - CPPOTrainer.policy_value_models = self.model + self.ds_wrapped_models = self.model + self.policy_value_models = self.model def evaluate(self) -> Dict[str, float]: """Custom evaluation method for PPO. Generates completions from the evaluation dataloader, @@ -1240,32 +1242,29 @@ def mark_final_eval(self, is_final: bool = True) -> 'CPPOTrainer': self.is_final_eval = is_final return self - def save_model( - self, output_dir: Optional[str] = None, _internal_call: bool = False - ) -> None: - """Save the model, dealing with the case where it's a PEFT model without a policy attribute.""" - # Store the original model - original_model = self.model - - # For PEFT models (which lack .policy attribute), use the model directly - if hasattr(self.model, 'base_model'): - # PEFT model case - don't try to access .policy - pass # Keep the model as is - elif hasattr(self.model, 'policy'): - # Standard PPO case - use the policy as in the original implementation - self.model = self.model.policy - elif hasattr(self.model, 'policy_model'): - # Standard PPO case - use the policy_model as in the original implementation - self.model = self.model.policy_model - - # Call the parent class's save_model - if output_dir is None: - output_dir = self.args.output_dir - - Trainer.save_model(self, output_dir, _internal_call) - - # Restore the original model - self.model = original_model + def save_model(self, output_dir: str, _internal_call=True) -> None: + """ + Manually save the model (and training state) to a specified directory. + This follows a similar procedure as _save_checkpoint. 
+ """ + + # Save the model files to output_dir (marking _internal_call True) + from transformers import Trainer # ensure Trainer is imported + Trainer.save_model(self, output_dir, _internal_call=True) + + # If not saving only the model, save optimizer, scheduler, and RNG state + if not self.args.save_only_model: + self._save_optimizer_and_scheduler(output_dir) + self._save_scaler(output_dir) + self._save_rng_state(output_dir) + + # Save the trainer state + trainer_state_path = os.path.join(output_dir, "trainer_state.json") + self.state.save_to_json(trainer_state_path) + + # Optionally push to hub if that option is enabled + if self.args.push_to_hub: + self._push_from_checkpoint(output_dir) def get_cppo_plasticity_weights( diff --git a/benchmarks/dpo/accelerate_configs/deepspeed_zero2.yaml b/benchmarks/dpo/accelerate_configs/deepspeed_zero2.yaml index f369ef96..771e5fab 100644 --- a/benchmarks/dpo/accelerate_configs/deepspeed_zero2.yaml +++ b/benchmarks/dpo/accelerate_configs/deepspeed_zero2.yaml @@ -12,7 +12,7 @@ machine_rank: 0 main_training_function: main mixed_precision: 'bf16' num_machines: 1 -num_processes: 2 +num_processes: 4 rdzv_backend: static same_network: true tpu_env: [] diff --git a/benchmarks/dpo/accelerate_configs/deepspeed_zero3.yaml b/benchmarks/dpo/accelerate_configs/deepspeed_zero3.yaml index 6b68067b..b10a978f 100644 --- a/benchmarks/dpo/accelerate_configs/deepspeed_zero3.yaml +++ b/benchmarks/dpo/accelerate_configs/deepspeed_zero3.yaml @@ -11,7 +11,7 @@ machine_rank: 0 main_training_function: main mixed_precision: bf16 num_machines: 1 -num_processes: 8 # TODO change to whatever number of gpus is used +num_processes: 4 # TODO change to whatever number of gpus is used rdzv_backend: static same_network: true tpu_env: [] diff --git a/benchmarks/dpo/continual_dpo_trainer.py b/benchmarks/dpo/continual_dpo_trainer.py index 024cbc08..f7cee2ca 100644 --- a/benchmarks/dpo/continual_dpo_trainer.py +++ b/benchmarks/dpo/continual_dpo_trainer.py @@ -419,7 +419,7 @@ def _generate_completions(self) -> None: def print_rich_table(df: pd.DataFrame) -> Table: - console = Console() + console = Console(markup=False) table = Table(show_lines=True) for column in df.columns: table.add_column(column) diff --git a/benchmarks/ppo/continual_ppo_trainer.py b/benchmarks/ppo/continual_ppo_trainer.py index cb3d747f..79ea9c62 100644 --- a/benchmarks/ppo/continual_ppo_trainer.py +++ b/benchmarks/ppo/continual_ppo_trainer.py @@ -113,13 +113,6 @@ class ContinualPPOConfig(PPOConfig): class ContinualPPOTrainer(PPOTrainer): - # Shared accelerator instance across all trainer instances - shared_accelerator: Optional[Accelerator] = None - current_task_index: Optional[int] = None - policy_value_models: Any # the policy and value model wrapper - ds_wrapped_models: Any # TODO work with this after deepspeed is initialized - accelerator: Accelerator # now non-optional after creation - def __init__( self, args: Optional[PPOConfig] = None, @@ -145,6 +138,14 @@ def __init__( callbacks: Optional[list[TrainerCallback]] = None, peft_config: Optional[dict] = None, ): + self.shared_accelerator: Optional[Accelerator] = None + self.current_task_index: Optional[int] = None + self.policy_value_models: Any = None # the policy and value model wrapper + self.ds_wrapped_models: Any = ( + None # TODO work with this after deepspeed is initialized + ) + self.accelerator: Accelerator = None # now non-optional after creation + # Basic setup and validation if args is None: raise ValueError('`args` cannot be None') @@ -167,18 +168,18 @@ 
def __init__( # Initialize task tracking self._stored_metrics: Dict = defaultdict(lambda: defaultdict(list)) self.current_task = ( - f'task_{ContinualPPOTrainer.current_task_index}' - if ContinualPPOTrainer.current_task_index is not None + f'task_{self.current_task_index}' + if self.current_task_index is not None else 'task_0' ) # Set up task index tracking is_first_task = False - if ContinualPPOTrainer.current_task_index is None: - ContinualPPOTrainer.current_task_index = 0 + if self.current_task_index is None: + self.current_task_index = 0 is_first_task = True else: - ContinualPPOTrainer.current_task_index += 1 + self.current_task_index += 1 self.is_final_eval = False # Store basic configuration @@ -239,11 +240,11 @@ def __init__( else: self.ref_model = create_reference_model(self.policy_model) - ContinualPPOTrainer.class_ref_model = self.ref_model + self.class_ref_model = self.ref_model else: # For subsequent tasks, reuse the reference model - self.ref_model = ContinualPPOTrainer.class_ref_model + self.ref_model = self.class_ref_model # Always process new datasets for each task self.reward_model = reward_model @@ -265,14 +266,15 @@ def __init__( args.total_episodes = int(args.num_train_epochs * self.train_dataset_len) # Setup accelerator - shared across all tasks - if ContinualPPOTrainer.shared_accelerator is None: + if self.shared_accelerator is None: accelerator = Accelerator( gradient_accumulation_steps=args.gradient_accumulation_steps ) self.accelerator = accelerator - ContinualPPOTrainer.shared_accelerator = accelerator - else: - self.accelerator = ContinualPPOTrainer.shared_accelerator + self.gather_function = self.accelerator.gather_for_metrics + self.shared_accelerator = accelerator + elif False: + self.accelerator = self.shared_accelerator self.gather_function = self.accelerator.gather_for_metrics if ( 'use_gather_object' @@ -334,11 +336,11 @@ def __init__( # Create policy and value model wrapper self.model = PolicyAndValueWrapper(self.policy_model, self.value_model) - ContinualPPOTrainer.policy_value_models = self.model + self.policy_value_models = self.model self.model.config = self.policy_model.config # needed for pushing to hub - else: + elif False: # Subsequent tasks: Reuse existing model - self.model = ContinualPPOTrainer.policy_value_models + self.model = self.policy_value_models self.model.config = self.policy_model.config # needed for pushing to hub # Always create optimizer and scheduler for each task @@ -406,14 +408,14 @@ def __init__( self.model, self.optimizer, self.dataloader = self.accelerator.prepare( self.model, self.optimizer, self.dataloader ) - ContinualPPOTrainer.ds_wrapped_models = self.model - else: + self.ds_wrapped_models = self.model + elif False: # For subsequent tasks, only prepare optimizer and dataloader self.optimizer, self.dataloader = self.accelerator.prepare( self.optimizer, self.dataloader ) # Reuse the model from the first task - self.model = ContinualPPOTrainer.ds_wrapped_models + self.model = self.ds_wrapped_models torch.manual_seed(self.local_seed) # Reset local seed @@ -450,10 +452,10 @@ def __init__( args.fp16, args.bf16, ) - ContinualPPOTrainer.class_ref_model = self.ref_model + self.class_ref_model = self.ref_model else: # Reuse prepared ref_model on subsequent tasks - self.ref_model = ContinualPPOTrainer.class_ref_model + self.ref_model = self.class_ref_model else: # Non-DeepSpeed path if self.ref_model is None: @@ -464,10 +466,10 @@ def __init__( elif is_first_task: # Only move ref_model to device on first task self.ref_model = 
self.ref_model.to(self.accelerator.device) # type: ignore - ContinualPPOTrainer.class_ref_model = self.ref_model + self.class_ref_model = self.ref_model else: # Reuse ref_model on subsequent tasks - self.ref_model = ContinualPPOTrainer.class_ref_model + self.ref_model = self.class_ref_model # Always move reward model to device self.reward_model = self.reward_model.to(self.accelerator.device) # type: ignore @@ -968,15 +970,16 @@ def repeat_generator() -> DataLoader: if self.ref_model is None and original_ref_model is not None: print('Reference model was cleared during training - restoring') self.ref_model = original_ref_model - ContinualPPOTrainer.class_ref_model = original_ref_model + self.class_ref_model = original_ref_model # Ensure the class variable is updated - ContinualPPOTrainer.class_ref_model = self.ref_model + # TODO: Double check this is fine to keep + self.class_ref_model = self.ref_model if self.is_deepspeed_enabled: - ContinualPPOTrainer.ds_wrapped_models = self.deepspeed + self.ds_wrapped_models = self.deepspeed else: - ContinualPPOTrainer.ds_wrapped_models = self.model - ContinualPPOTrainer.policy_value_models = self.model + self.ds_wrapped_models = self.model + self.policy_value_models = self.model def evaluate(self) -> Dict[str, float]: """Custom evaluation method for PPO. Generates completions from the evaluation dataloader, @@ -1189,29 +1192,25 @@ def mark_final_eval(self, is_final: bool = True) -> 'ContinualPPOTrainer': self.is_final_eval = is_final return self - def save_model( - self, output_dir: Optional[str] = None, _internal_call: bool = False - ) -> None: - """Save the model, dealing with the case where it's a PEFT model without a policy attribute.""" - # Store the original model - original_model = self.model - - # For PEFT models (which lack .policy attribute), use the model directly - if hasattr(self.model, 'base_model'): - # PEFT model case - don't try to access .policy - pass # Keep the model as is - elif hasattr(self.model, 'policy'): - # Standard PPO case - use the policy as in the original implementation - self.model = self.model.policy - elif hasattr(self.model, 'policy_model'): - # Standard PPO case - use the policy_model as in the original implementation - self.model = self.model.policy_model - - # Call the parent class's save_model - if output_dir is None: - output_dir = self.args.output_dir - - Trainer.save_model(self, output_dir, _internal_call) - - # Restore the original model - self.model = original_model + def save_model(self, output_dir: str, _internal_call=True) -> None: + """Manually save the model (and training state) to a specified directory. + This follows a similar procedure as _save_checkpoint. 
+ """ + # Save the model files to output_dir (marking _internal_call True) + from transformers import Trainer # ensure Trainer is imported + + Trainer.save_model(self, output_dir, _internal_call=True) + + # If not saving only the model, save optimizer, scheduler, and RNG state + if not self.args.save_only_model: + self._save_optimizer_and_scheduler(output_dir) + self._save_scaler(output_dir) + self._save_rng_state(output_dir) + + # Save the trainer state + trainer_state_path = os.path.join(output_dir, 'trainer_state.json') + self.state.save_to_json(trainer_state_path) + + # Optionally push to hub if that option is enabled + if self.args.push_to_hub: + self._push_from_checkpoint(output_dir) diff --git a/benchmarks/ppo/ppo_continual.py b/benchmarks/ppo/ppo_continual.py index 8db6aff3..5c98e833 100644 --- a/benchmarks/ppo/ppo_continual.py +++ b/benchmarks/ppo/ppo_continual.py @@ -4,11 +4,6 @@ import torch import wandb as wb -from continual_ppo_trainer import ( - ContinualPPOArguments, - ContinualPPOConfig, - ContinualPPOTrainer, -) from datasets import Dataset from transformers import ( AutoModelForCausalLM, @@ -21,10 +16,15 @@ get_kbit_device_map, get_peft_config, get_quantization_config, + setup_chat_format, ) -from trl.trainer.utils import SIMPLE_CHAT_TEMPLATE from benchmarks.dataloading import init_continual_dataset +from benchmarks.ppo.continual_ppo_trainer import ( + ContinualPPOArguments, + ContinualPPOConfig, + ContinualPPOTrainer, +) def main( @@ -52,13 +52,7 @@ def main( quantization_config=quantization_config, ) - # Load main model and (optionally) reference model model = str(training_args.sft_model_path) - policy = AutoModelForCausalLM.from_pretrained( - training_args.sft_model_path, - trust_remote_code=model_args.trust_remote_code, - **model_kwargs, - ) peft_config = get_peft_config(model_args) if peft_config is None: ref_policy = AutoModelForCausalLM.from_pretrained( @@ -69,22 +63,11 @@ def main( else: ref_policy = None - # Load value model and policy model (main model) - value_model = AutoModelForSequenceClassification.from_pretrained( - script_args.value_model_path, - trust_remote_code=model_args.trust_remote_code, - num_labels=1, - ) - # Load tokenizer and set chat template if needed tokenizer = AutoTokenizer.from_pretrained( training_args.sft_model_path, trust_remote_code=model_args.trust_remote_code, ) - if tokenizer.pad_token is None: - tokenizer.pad_token = tokenizer.eos_token - if tokenizer.chat_template is None: - tokenizer.chat_template = SIMPLE_CHAT_TEMPLATE # Initialize continual dataset continual_dataset: list[dict[str, Dataset]] = init_continual_dataset( @@ -117,6 +100,37 @@ def main( # Task Loop for i, dataset in enumerate(continual_dataset): + # Load main model and (optionally) reference model + if i == 0: + model_path = training_args.sft_model_path + value_model_path = script_args.value_model_path + else: + model_path = os.path.join(training_args.output_dir, 'last') + value_model_path = os.path.join( + training_args.output_dir, 'last', 'value_model' + ) + policy = AutoModelForCausalLM.from_pretrained( + pretrained_model_name_or_path=model_path, + trust_remote_code=model_args.trust_remote_code, + **model_kwargs, + ) + + # Load value model and policy model (main model) + try: + value_model = AutoModelForSequenceClassification.from_pretrained( + value_model_path, + trust_remote_code=model_args.trust_remote_code, + num_labels=1, + ) + except OSError: + # Maybe it was saved as safetensors? 
+ value_model = AutoModelForSequenceClassification.from_pretrained( + value_model_path, + trust_remote_code=model_args.trust_remote_code, + num_labels=1, + from_tf=True, # or use `subfolder="safetensors"` if you saved a .safetensors file + ) + # Build custom repository name for this task custom_repo_name = ( model.split('/')[-1] + '_' + clean_dataset_name + '_PPO_' + str(i) @@ -130,6 +144,22 @@ def main( training_args.reward_model_path + '_' + str(i), num_labels=1 ) + for idx, _model in enumerate([policy, value_model, reward_model]): + # Align padding tokens between tokenizer and model + _model.config.pad_token_id = tokenizer.pad_token_id + + # Use ChatML format if the tokenizer doesn't already have a chat template + if tokenizer.chat_template is None: + updated_model, updated_tokenizer = setup_chat_format(_model, tokenizer) + # Actually store the updated model + if idx == 0: + policy = updated_model + elif idx == 1: + value_model = updated_model + else: + reward_model = updated_model + tokenizer = updated_tokenizer + ################ # Training and Evaluation ################ @@ -145,9 +175,6 @@ def main( peft_config=peft_config, ) - # if i == 0: - # trainer.save_model(os.path.join(training_args.output_dir, 'checkpoint-0')) - # Set current task in trainer for task-based logging trainer.set_task(f'task_{i}') @@ -174,9 +201,17 @@ def main( wb.log({f'task/{custom_repo_name}/last': metrics}) # type: ignore[attr-defined] # Save model checkpoint and optionally push - if not training_args.push_to_hub: - trainer.save_model(os.path.join(training_args.output_dir, 'last')) - else: + last_dir = os.path.join(training_args.output_dir, 'last') + policy.save_pretrained(last_dir) + tokenizer.save_pretrained(last_dir) + + value_model_dir = os.path.join(last_dir, 'value_model') + os.makedirs(value_model_dir, exist_ok=True) + value_model.save_pretrained(value_model_dir, safe_serialization=False) + + trainer.accelerator.wait_for_everyone() + + if training_args.push_to_hub: trainer.push_to_hub( model_name=custom_repo_name, dataset_name='Continual_PPO_' + clean_dataset_name + '_' + str(i), diff --git a/benchmarks/ppo_ewc/continual_ppo_EWC_trainer.py b/benchmarks/ppo_ewc/continual_ppo_EWC_trainer.py index 22717ebc..c47cab2c 100644 --- a/benchmarks/ppo_ewc/continual_ppo_EWC_trainer.py +++ b/benchmarks/ppo_ewc/continual_ppo_EWC_trainer.py @@ -112,9 +112,7 @@ def __init__( # Store EWC-specific parameters self.ewc_lambda = args.ewc_lambda - # Track if we're on the first task - is_first_task = ContinualPPOTrainer.current_task_index == 0 - if is_first_task: + if self.current_task_index == 0: # Initialize empty dictionaries for first task ContinualPPOEWCTrainer.class_fisher_information = {} ContinualPPOEWCTrainer.class_old_params = {} @@ -775,15 +773,15 @@ def repeat_generator() -> DataLoader: if self.ref_model is None and original_ref_model is not None: print('Reference model was cleared during training - restoring') self.ref_model = original_ref_model - ContinualPPOTrainer.class_ref_model = original_ref_model + self.class_ref_model = original_ref_model # Ensure the class variable is updated - ContinualPPOTrainer.class_ref_model = self.ref_model + self.class_ref_model = self.ref_model if self.is_deepspeed_enabled: - ContinualPPOTrainer.ds_wrapped_models = self.deepspeed + self.ds_wrapped_models = self.deepspeed else: - ContinualPPOTrainer.ds_wrapped_models = self.model - ContinualPPOTrainer.policy_value_models = self.model + self.ds_wrapped_models = self.model + self.policy_value_models = self.model def 
update_fisher_and_params(self) -> None: """Explicitly update the Fisher information and parameter values. diff --git a/benchmarks/ppo_ewc/ppo_EWC_continual.py b/benchmarks/ppo_ewc/ppo_EWC_continual.py index c71e90e6..f6cb4967 100644 --- a/benchmarks/ppo_ewc/ppo_EWC_continual.py +++ b/benchmarks/ppo_ewc/ppo_EWC_continual.py @@ -16,8 +16,8 @@ get_kbit_device_map, get_peft_config, get_quantization_config, + setup_chat_format, ) -from trl.trainer.utils import SIMPLE_CHAT_TEMPLATE from benchmarks.dataloading import init_continual_dataset from benchmarks.ppo_ewc.continual_ppo_EWC_trainer import ( @@ -52,15 +52,7 @@ def main( quantization_config=quantization_config, ) - # Load main model and (optionally) reference model model = str(training_args.sft_model_path) - policy = AutoModelForCausalLM.from_pretrained( - training_args.sft_model_path, - trust_remote_code=model_args.trust_remote_code, - **model_kwargs, - ) - - # Configure PEFT if needed peft_config = get_peft_config(model_args) if peft_config is None: ref_policy = AutoModelForCausalLM.from_pretrained( @@ -71,32 +63,11 @@ def main( else: ref_policy = None - # Load value model - value_model = None - if script_args.value_model_path: - value_model = AutoModelForSequenceClassification.from_pretrained( - script_args.value_model_path, - trust_remote_code=model_args.trust_remote_code, - num_labels=1, - ) - # Load tokenizer and set chat template if needed tokenizer = AutoTokenizer.from_pretrained( training_args.sft_model_path, trust_remote_code=model_args.trust_remote_code, ) - if tokenizer.pad_token is None: - tokenizer.pad_token = tokenizer.eos_token - if tokenizer.chat_template is None: - tokenizer.chat_template = SIMPLE_CHAT_TEMPLATE - - # EWC-specific: DDPT distributed setup - if script_args.ignore_bias_buffers: - policy._ddp_params_and_buffers_to_ignore = [ - name - for name, buffer in policy.named_buffers() - if buffer.dtype == torch.bool - ] # Initialize continual dataset continual_dataset: list[dict[str, Dataset]] = init_continual_dataset( @@ -112,6 +83,7 @@ def main( if '.' in clean_dataset_name: clean_dataset_name = clean_dataset_name.split('.')[0] + print(f'Training PPO-EWC on {len(continual_dataset)} tasks') # check if the reward models are present either in the path or in the hub if training_args.reward_model_path is not None: for i in range(len(continual_dataset)): @@ -128,6 +100,44 @@ def main( # Task Loop for i, dataset in enumerate(continual_dataset): + # Load main model and (optionally) reference model + if i == 0: + model_path = training_args.sft_model_path + value_model_path = script_args.value_model_path + else: + model_path = os.path.join(training_args.output_dir, 'last') + value_model_path = os.path.join( + training_args.output_dir, 'last', 'value_model' + ) + policy = AutoModelForCausalLM.from_pretrained( + pretrained_model_name_or_path=model_path, + trust_remote_code=model_args.trust_remote_code, + **model_kwargs, + ) + # EWC-specific: DDPT distributed setup + if script_args.ignore_bias_buffers: + policy._ddp_params_and_buffers_to_ignore = [ + name + for name, buffer in policy.named_buffers() + if buffer.dtype == torch.bool + ] + + # Load value model and policy model (main model) + try: + value_model = AutoModelForSequenceClassification.from_pretrained( + value_model_path, + trust_remote_code=model_args.trust_remote_code, + num_labels=1, + ) + except OSError: + # Maybe it was saved as safetensors? 
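+            # Note: as above, `from_tf=True` loads a TensorFlow checkpoint; `.safetensors`
+            # files are handled automatically by `from_pretrained`.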
+ value_model = AutoModelForSequenceClassification.from_pretrained( + value_model_path, + trust_remote_code=model_args.trust_remote_code, + num_labels=1, + from_tf=True, # or use `subfolder="safetensors"` if you saved a .safetensors file + ) + # Build custom repository name for this task custom_repo_name = ( model.split('/')[-1] + '_' + clean_dataset_name + '_PPO_EWC_' + str(i) @@ -141,6 +151,22 @@ def main( training_args.reward_model_path + '_' + str(i), num_labels=1 ) + for idx, _model in enumerate([policy, value_model, reward_model]): + # Align padding tokens between tokenizer and model + _model.config.pad_token_id = tokenizer.pad_token_id + + # Use ChatML format if the tokenizer doesn't already have a chat template + if tokenizer.chat_template is None: + updated_model, updated_tokenizer = setup_chat_format(_model, tokenizer) + # Actually store the updated model + if idx == 0: + policy = updated_model + elif idx == 1: + value_model = updated_model + else: + reward_model = updated_model + tokenizer = updated_tokenizer + ################ # Training and Evaluation ################ @@ -181,21 +207,22 @@ def main( wb.log({f'task/{custom_repo_name}/last': metrics}) # type: ignore[attr-defined] # Save model checkpoint and optionally push - if not training_args.push_to_hub: - trainer.save_model(os.path.join(training_args.output_dir, 'last')) - else: + last_dir = os.path.join(training_args.output_dir, 'last') + policy.save_pretrained(last_dir) + tokenizer.save_pretrained(last_dir) + + value_model_dir = os.path.join(last_dir, 'value_model') + os.makedirs(value_model_dir, exist_ok=True) + value_model.save_pretrained(value_model_dir, safe_serialization=False) + + trainer.accelerator.wait_for_everyone() + + if training_args.push_to_hub: trainer.push_to_hub( model_name=custom_repo_name, dataset_name='Continual_PPO_EWC_' + clean_dataset_name + '_' + str(i), ) - # Clean up for next task - EWC specific - if hasattr(trainer, 'deepspeed') and trainer.deepspeed is not None: - # Remove reference to the DeepSpeed engine to allow proper cleanup - del trainer.deepspeed - # Free cached GPU memory - torch.cuda.empty_cache() - print('Training completed for all tasks!') diff --git a/benchmarks/reward_modeling.py b/benchmarks/reward_modeling.py index f214cfb2..7f3b1958 100644 --- a/benchmarks/reward_modeling.py +++ b/benchmarks/reward_modeling.py @@ -196,9 +196,13 @@ def train_model( if __name__ == '__main__': parser = HfArgumentParser((ExtendedScriptArguments, RewardConfig, ModelConfig)) script_args, training_args, model_args = parser.parse_args_into_dataclasses() - + tokenizer = AutoTokenizer.from_pretrained( + model_args.model_name_or_path, + trust_remote_code=model_args.trust_remote_code, + use_fast=True, + ) continual_dataset: list[Dict[str, Dataset]] = init_continual_dataset( - script_args.dataset_name, mock=script_args.mock + script_args.dataset_name, mock=script_args.mock, tokenizer=tokenizer ) if script_args.all_datasets: diff --git a/jobs/cppo/cppo_cppo_multi_gpu.sh b/jobs/cppo/cppo_cppo_multi_gpu.sh new file mode 100644 index 00000000..af30853e --- /dev/null +++ b/jobs/cppo/cppo_cppo_multi_gpu.sh @@ -0,0 +1,40 @@ +#!/bin/bash +#SBATCH --job-name=aif-gen-cppo-cppo +#SBATCH --nodes=1 # Request 2 nodes +#SBATCH --gpus-per-node=h100:4 # Request 4 H100 GPUs per node +#SBATCH --ntasks-per-node=4 # One task per GPU +#SBATCH --cpus-per-task=6 +#SBATCH --mem=64G +#SBATCH --time=24:00:00 +#SBATCH --output=out/%x.%j.out # Include job name + job ID +#SBATCH --error=out/%x.%j.err # Include job name + job ID +#SBATCH 
--mail-type=ALL +#SBATCH --account=aip-rrabba +#SBATCH --mail-user=shahrad_m@icloud.com # Update with your email + +source .env + +dataset_name='CPPO-RL' + +accelerate launch --config_file benchmarks/cppo/accelerate_configs/deepspeed_zero2.yaml \ + benchmarks/cppo/cppo.py \ + --wandb_project "$dataset_name-post-May-19" \ + --wandb_run_name "Qwen2-0.5B-CPPO-${dataset_name}-multi-gpu" \ + --dataset_name "$dataset_name" \ + --sft_model_path Qwen/Qwen2-0.5B-Instruct \ + --value_model_path LifelongAlignment/Qwen2.5-0.5B-Instruct_CPPO_REWARD_0 \ + --reward_model_path LifelongAlignment/Qwen2.5-0.5B-Instruct_CPPO_REWARD \ + --learning_rate 1.0e-6 \ + --kl_coef 0.37 \ + --cliprange 0.1 \ + --response_length 256 \ + --num_train_epochs 4 \ + --gradient_checkpointing \ + --per_device_train_batch_size 16 \ + --logging_steps 10 \ + --eval_strategy steps \ + --eval_steps 200 \ + --save_steps 300 \ + --bf16 \ + --output_dir "$HOME/Qwen2-0.5B-CPPO-${dataset_name}" \ + --no_remove_unused_columns diff --git a/jobs/cppo/cppo_domain_shift_multi_gpu.sh b/jobs/cppo/cppo_domain_shift_multi_gpu.sh new file mode 100644 index 00000000..e4f9f26e --- /dev/null +++ b/jobs/cppo/cppo_domain_shift_multi_gpu.sh @@ -0,0 +1,39 @@ +#!/bin/bash +#SBATCH --job-name=aif-gen-cppo-domain_shift +#SBATCH --nodes=1 # Request 2 nodes +#SBATCH --gpus-per-node=h100:4 # Request 4 H100 GPUs per node +#SBATCH --ntasks-per-node=4 # One task per GPU +#SBATCH --cpus-per-task=6 +#SBATCH --mem=64G +#SBATCH --time=24:00:00 +#SBATCH --output=out/%x.%j.out # Include job name + job ID +#SBATCH --error=out/%x.%j.err # Include job name + job ID +#SBATCH --mail-type=ALL +#SBATCH --account=aip-rrabba +#SBATCH --mail-user=shahrad_m@icloud.com # Update with your email +source .env + +dataset_name='aifgen-domain-preference-shift' + +accelerate launch --config_file benchmarks/cppo/accelerate_configs/deepspeed_zero2.yaml \ + benchmarks/cppo/cppo.py \ + --wandb_project "$dataset_name-post-May-19" \ + --wandb_run_name "Qwen2-0.5B-CPPO-${dataset_name}-multi-gpu" \ + --dataset_name $dataset_name \ + --sft_model_path Qwen/Qwen2-0.5B-Instruct \ + --value_model_path LifelongAlignment/Qwen2-0.5B-Instruct_${dataset_name}_REWARD_0 \ + --reward_model_path LifelongAlignment/Qwen2-0.5B-Instruct_${dataset_name}_REWARD \ + --learning_rate 1.0e-6 \ + --kl_coef 0.37 \ + --cliprange 0.1 \ + --response_length 256 \ + --num_train_epochs 4 \ + --gradient_checkpointing \ + --per_device_train_batch_size 16 \ + --logging_steps 10 \ + --eval_strategy steps \ + --eval_steps 200 \ + --save_steps 300 \ + --bf16 \ + --output_dir "$HOME/Qwen2-0.5B-CPPO-${dataset_name}" \ + --no_remove_unused_columns diff --git a/jobs/cppo/cppo_lipschitz_multi_gpu.sh b/jobs/cppo/cppo_lipschitz_multi_gpu.sh new file mode 100644 index 00000000..06a533af --- /dev/null +++ b/jobs/cppo/cppo_lipschitz_multi_gpu.sh @@ -0,0 +1,40 @@ +#!/bin/bash +#SBATCH --job-name=aif-gen-cppo-lipschitz +#SBATCH --nodes=1 # Request 2 nodes +#SBATCH --gpus-per-node=h100:4 # Request 4 H100 GPUs per node +#SBATCH --ntasks-per-node=4 # One task per GPU +#SBATCH --cpus-per-task=6 +#SBATCH --mem=64G +#SBATCH --time=24:00:00 +#SBATCH --output=out/%x.%j.out # Include job name + job ID +#SBATCH --error=out/%x.%j.err # Include job name + job ID +#SBATCH --mail-type=ALL +#SBATCH --account=aip-rrabba +#SBATCH --mail-user=shahrad_m@icloud.com # Update with your email + +source .env + +dataset_name='aifgen-lipschitz' + +accelerate launch --config_file benchmarks/cppo/accelerate_configs/deepspeed_zero2.yaml \ + benchmarks/cppo/cppo.py \ + 
--wandb_project "$dataset_name-post-May-19" \ + --wandb_run_name "Qwen2-0.5B-CPPO-${dataset_name}-multi-gpu" \ + --dataset_name $dataset_name \ + --sft_model_path Qwen/Qwen2-0.5B-Instruct \ + --value_model_path LifelongAlignment/Qwen2-0.5B-Instruct_${dataset_name}_REWARD_0 \ + --reward_model_path LifelongAlignment/Qwen2-0.5B-Instruct_${dataset_name}_REWARD \ + --learning_rate 1.0e-6 \ + --kl_coef 0.37 \ + --cliprange 0.1 \ + --response_length 256 \ + --num_train_epochs 4 \ + --gradient_checkpointing \ + --per_device_train_batch_size 8 \ + --logging_steps 10 \ + --eval_strategy steps \ + --eval_steps 200 \ + --save_steps 300 \ + --bf16 \ + --output_dir "$HOME/Qwen2-0.5B-CPPO-${dataset_name}" \ + --no_remove_unused_columns diff --git a/jobs/cppo/cppo_long_piecewise_multi_gpu.sh b/jobs/cppo/cppo_long_piecewise_multi_gpu.sh new file mode 100644 index 00000000..2614ad53 --- /dev/null +++ b/jobs/cppo/cppo_long_piecewise_multi_gpu.sh @@ -0,0 +1,39 @@ +#!/bin/bash +#SBATCH --job-name=aif-gen-cppo-long-piecewise +#SBATCH --nodes=1 # Request 2 nodes +#SBATCH --gpus-per-node=h100:4 # Request 4 H100 GPUs per node +#SBATCH --ntasks-per-node=4 # One task per GPU +#SBATCH --cpus-per-task=6 +#SBATCH --mem=64G +#SBATCH --time=24:00:00 +#SBATCH --output=out/%x.%j.out # Include job name + job ID +#SBATCH --error=out/%x.%j.err # Include job name + job ID +#SBATCH --mail-type=ALL +#SBATCH --account=aip-rrabba +#SBATCH --mail-user=shahrad_m@icloud.com # Update with your email +source .env + +dataset_name='aifgen-long-piecewise' + +accelerate launch --config_file benchmarks/cppo/accelerate_configs/deepspeed_zero2.yaml \ + benchmarks/cppo/cppo.py \ + --wandb_project "$dataset_name-post-May-19" \ + --wandb_run_name "Qwen2-0.5B-CPPO-${dataset_name}-multi-gpu" \ + --dataset_name $dataset_name \ + --sft_model_path Qwen/Qwen2-0.5B-Instruct \ + --value_model_path LifelongAlignment/Qwen2-0.5B-Instruct_${dataset_name}_REWARD_0 \ + --reward_model_path LifelongAlignment/Qwen2-0.5B-Instruct_${dataset_name}_REWARD \ + --learning_rate 1.0e-6 \ + --kl_coef 0.37 \ + --cliprange 0.1 \ + --response_length 256 \ + --num_train_epochs 4 \ + --gradient_checkpointing \ + --per_device_train_batch_size 16 \ + --logging_steps 10 \ + --eval_strategy steps \ + --eval_steps 200 \ + --save_steps 300 \ + --bf16 \ + --output_dir "$HOME/Qwen2-0.5B-CPPO-${dataset_name}" \ + --no_remove_unused_columns diff --git a/jobs/cppo/cppo_piecewise_multi_gpu.sh b/jobs/cppo/cppo_piecewise_multi_gpu.sh new file mode 100644 index 00000000..ea84fae2 --- /dev/null +++ b/jobs/cppo/cppo_piecewise_multi_gpu.sh @@ -0,0 +1,40 @@ +#!/bin/bash +#SBATCH --job-name=aif-gen-cppo-piecewise +#SBATCH --nodes=1 # Request 2 nodes +#SBATCH --gpus-per-node=h100:4 # Request 4 H100 GPUs per node +#SBATCH --ntasks-per-node=4 # One task per GPU +#SBATCH --cpus-per-task=6 +#SBATCH --mem=64G +#SBATCH --time=24:00:00 +#SBATCH --output=out/%x.%j.out # Include job name + job ID +#SBATCH --error=out/%x.%j.err # Include job name + job ID +#SBATCH --mail-type=ALL +#SBATCH --account=aip-rrabba +#SBATCH --mail-user=shahrad_m@icloud.com # Update with your email + +source .env + +dataset_name='aifgen-piecewise-preference-shift' + +accelerate launch --config_file benchmarks/cppo/accelerate_configs/deepspeed_zero2.yaml \ + benchmarks/cppo/cppo.py \ + --wandb_project "$dataset_name-post-May-19" \ + --wandb_run_name "Qwen2-0.5B-CPPO-${dataset_name}-multi-gpu" \ + --dataset_name $dataset_name \ + --sft_model_path Qwen/Qwen2-0.5B-Instruct \ + --value_model_path 
LifelongAlignment/Qwen2-0.5B-Instruct_${dataset_name}_REWARD_0 \ + --reward_model_path LifelongAlignment/Qwen2-0.5B-Instruct_${dataset_name}_REWARD \ + --learning_rate 1.0e-6 \ + --kl_coef 0.37 \ + --cliprange 0.1 \ + --response_length 256 \ + --num_train_epochs 4 \ + --gradient_checkpointing \ + --per_device_train_batch_size 8 \ + --logging_steps 10 \ + --eval_strategy steps \ + --eval_steps 200 \ + --save_steps 300 \ + --bf16 \ + --output_dir "$HOME/Qwen2-0.5B-CPPO-${dataset_name}" \ + --no_remove_unused_columns diff --git a/jobs/cppo/cppo_short_piecewise_multi_gpu.sh b/jobs/cppo/cppo_short_piecewise_multi_gpu.sh new file mode 100644 index 00000000..c983fcde --- /dev/null +++ b/jobs/cppo/cppo_short_piecewise_multi_gpu.sh @@ -0,0 +1,38 @@ +#!/bin/bash +#SBATCH --job-name=aif-gen-cppo-short-piecewise +#SBATCH --nodes=1 # Request 2 nodes +#SBATCH --gpus-per-node=h100:4 # Request 4 H100 GPUs per node +#SBATCH --ntasks-per-node=4 # One task per GPU +#SBATCH --cpus-per-task=6 +#SBATCH --mem=64G +#SBATCH --time=24:00:00 +#SBATCH --output=out/%x.%j.out # Include job name + job ID +#SBATCH --error=out/%x.%j.err # Include job name + job ID +#SBATCH --mail-type=ALL +#SBATCH --account=aip-rrabba +#SBATCH --mail-user=shahrad_m@icloud.com # Update with your email + +source .env + +dataset_name='aifgen-short-piecewise' + +accelerate launch --config_file benchmarks/cppo/accelerate_configs/deepspeed_zero2.yaml \ + benchmarks/cppo/cppo.py \ + --wandb_project $dataset_name \ + --wandb_run_name "Qwen2-0.5B-CPPO-${dataset_name}-multi-gpu" \ + --dataset_name $dataset_name \ + --sft_model_path Qwen/Qwen2-0.5B-Instruct \ + --value_model_path LifelongAlignment/Qwen2-0.5B-Instruct_${dataset_name}_REWARD_0 \ + --reward_model_path LifelongAlignment/Qwen2-0.5B-Instruct_${dataset_name}_REWARD \ + --learning_rate 5.0e-6 \ + --response_length 256 \ + --num_train_epochs 4 \ + --gradient_checkpointing \ + --per_device_train_batch_size 16 \ + --logging_steps 10 \ + --eval_strategy steps \ + --eval_steps 300 \ + --save_steps 300 \ + --bf16 \ + --output_dir "$HOME/Qwen2-0.5B-CPPO-${dataset_name}" \ + --no_remove_unused_columns diff --git a/jobs/cppo_multi_gpu.sh b/jobs/cppo_multi_gpu.sh deleted file mode 100644 index 5eb5ee71..00000000 --- a/jobs/cppo_multi_gpu.sh +++ /dev/null @@ -1,31 +0,0 @@ -#!/bin/bash -#SBATCH --job-name=cppo_debug_multi_gpu -#SBATCH --partition=main -#SBATCH --cpus-per-task=6 -#SBATCH --gres=gpu:a100l:2 -#SBATCH --mem=48G -#SBATCH --time=1:00:00 -#SBATCH --output=slurm-%j.out -#SBATCH --error=slurm-%j.err -#SBATCH --mail-type=ALL -#SBATCH --mail-user= - -source .env - -accelerate launch --config_file benchmarks/cppo/accelerate_configs/deepspeed_zero2.yaml \ - benchmarks/cppo/cppo.py \ - --dataset_name benchmarks/continual_data_debug.json \ - --sft_model_path Qwen/Qwen2-0.5B-Instruct \ - --value_model_path Shahradmz/Qwen2-0.5B-Instruct_continual_data_debug_REWARD_0 \ - --reward_model_path Shahradmz/Qwen2-0.5B-Instruct_continual_data_debug_REWARD \ - --learning_rate 5.0e-6 \ - --num_train_epochs 1 \ - --gradient_checkpointing \ - --per_device_train_batch_size 2 \ - --logging_steps 2 \ - --eval_strategy steps \ - --eval_steps 5 \ - --save_steps 5 \ - --bf16 \ - --output_dir "$SCRATCH/Qwen2-0.5B-CPPO-test" \ - --no_remove_unused_columns diff --git a/jobs/download_cppo_jobs.sh b/jobs/download_cppo_jobs.sh new file mode 100755 index 00000000..65df91b6 --- /dev/null +++ b/jobs/download_cppo_jobs.sh @@ -0,0 +1,14 @@ +source .env +export HF_HUB_OFFLINE=0 + + +# for name in dataset names create the model name 
based on : LifelongAlignment/Qwen2-0.5B-Instruct_${dataset_name}_REWARD_{i} +# then add the i to the model name from 1 to 9 and use python and from huggingface_hub import snapshot_download to download the model + +python -c "from huggingface_hub import snapshot_download; snapshot_download(repo_id='LifelongAlignment/CPPO-RL', revision='main', repo_type='dataset')" + +model_name="LifelongAlignment/Qwen2.5-0.5B-Instruct_CPPO_REWARD_0" +python -c "from huggingface_hub import snapshot_download; snapshot_download(repo_id='$model_name', revision='main')" +model_name="LifelongAlignment/Qwen2.5-0.5B-Instruct_CPPO_REWARD_1" +python -c "from huggingface_hub import snapshot_download; snapshot_download(repo_id='$model_name', revision='main')" + diff --git a/jobs/download_jobs.sh b/jobs/download_jobs.sh new file mode 100755 index 00000000..5cbf6272 --- /dev/null +++ b/jobs/download_jobs.sh @@ -0,0 +1,24 @@ +source .env +export HF_HUB_OFFLINE=0 + +dataset_names=( + "aifgen-domain-preference-shift" + "aifgen-lipschitz" + "aifgen-short-piecewise" + "aifgen-long-piecewise" + "aifgen-piecewise-preference-shift" +) + +# for name in dataset names create the model name based on : LifelongAlignment/Qwen2-0.5B-Instruct_${dataset_name}_REWARD_{i} +# then add the i to the model name from 1 to 9 and use python and from huggingface_hub import snapshot_download to download the model + +for dataset_name in "${dataset_names[@]}"; do + for i in {0..9}; do + model_name="LifelongAlignment/Qwen2-0.5B-Instruct_${dataset_name}_REWARD_${i}" + data_name="LifelongAlignment/${dataset_name}" + echo "Downloading model: $model_name" + python -c "from huggingface_hub import snapshot_download; snapshot_download(repo_id='$data_name', revision='main', repo_type='dataset')" + python -c "from huggingface_hub import snapshot_download; snapshot_download(repo_id='$model_name', revision='main')" + done +done + diff --git a/jobs/dpo/dpo_cppo_multi_gpu.sh b/jobs/dpo/dpo_cppo_multi_gpu.sh new file mode 100644 index 00000000..d427e72e --- /dev/null +++ b/jobs/dpo/dpo_cppo_multi_gpu.sh @@ -0,0 +1,37 @@ +#!/bin/bash +#SBATCH --job-name=aif-gen-dpo-cppo +#SBATCH --nodes=1 # Request 2 nodes +#SBATCH --gpus-per-node=h100:4 # Request 4 H100 GPUs per node +#SBATCH --ntasks-per-node=4 # One task per GPU +#SBATCH --cpus-per-task=6 +#SBATCH --mem=64G +#SBATCH --time=24:00:00 +#SBATCH --output=out/%x.%j.out # Include job name + job ID +#SBATCH --error=out/%x.%j.err # Include job name + job ID +#SBATCH --mail-type=ALL +#SBATCH --account=aip-rrabba +#SBATCH --mail-user=shahrad_m@icloud.com # Update with your email + +source .env + +dataset_name='CPPO-RL' + +accelerate launch --config_file benchmarks/dpo/accelerate_configs/deepspeed_zero3.yaml \ + benchmarks/dpo/dpo_continual.py \ + --dataset_name 'CPPO-RL' \ + --model_name_or_path Qwen/Qwen2-0.5B-Instruct \ + --reward_model_path LifelongAlignment/Qwen2.5-0.5B-Instruct_CPPO_REWARD \ + --learning_rate 5.0e-6 \ + --num_train_epochs 4 \ + --per_device_train_batch_size 8 \ + --gradient_checkpointing \ + --logging_steps 20 \ + --eval_strategy steps \ + --response_length 256 \ + --eval_steps 500 \ + --save_steps 500 \ + --bf16 \ + --output_dir "$SCRATCH/projects/Qwen2-0.5B-DPO-${dataset_name}" \ + --no_remove_unused_columns \ + --wandb_project $dataset_name \ + --wandb_run_name "Qwen2-0.5B-DPO-${dataset_name}-multi-gpu" diff --git a/jobs/dpo/dpo_domain_shift_multi_gpu.sh b/jobs/dpo/dpo_domain_shift_multi_gpu.sh new file mode 100644 index 00000000..c95af9e9 --- /dev/null +++ b/jobs/dpo/dpo_domain_shift_multi_gpu.sh 
@@ -0,0 +1,37 @@ +#!/bin/bash +#SBATCH --job-name=aif-gen-dpo-domain_shift +#SBATCH --nodes=1 # Request 2 nodes +#SBATCH --gpus-per-node=h100:4 # Request 4 H100 GPUs per node +#SBATCH --ntasks-per-node=4 # One task per GPU +#SBATCH --cpus-per-task=6 +#SBATCH --mem=64G +#SBATCH --time=24:00:00 +#SBATCH --output=out/%x.%j.out # Include job name + job ID +#SBATCH --error=out/%x.%j.err # Include job name + job ID +#SBATCH --mail-type=ALL +#SBATCH --account=aip-rrabba +#SBATCH --mail-user=shahrad_m@icloud.com # Update with your email + +source .env + +dataset_name='aifgen-domain-preference-shift' + +accelerate launch --config_file benchmarks/dpo/accelerate_configs/deepspeed_zero3.yaml \ + benchmarks/dpo/dpo_continual.py \ + --dataset_name $dataset_name \ + --model_name_or_path Qwen/Qwen2-0.5B-Instruct \ + --reward_model_path LifelongAlignment/Qwen2.5-0.5B-Instruct_${dataset_name}_REWARD \ + --learning_rate 5.0e-6 \ + --num_train_epochs 4 \ + --per_device_train_batch_size 8 \ + --gradient_checkpointing \ + --logging_steps 20 \ + --eval_strategy steps \ + --response_length 256 \ + --eval_steps 500 \ + --save_steps 500 \ + --bf16 \ + --output_dir "$SCRATCH/projects/Qwen2-0.5B-DPO-${dataset_name}" \ + --no_remove_unused_columns \ + --wandb_project $dataset_name \ + --wandb_run_name "Qwen2-0.5B-DPO-${dataset_name}-multi-gpu" diff --git a/jobs/dpo/dpo_lipschitz_multi_gpu.sh b/jobs/dpo/dpo_lipschitz_multi_gpu.sh new file mode 100644 index 00000000..9f07cf8f --- /dev/null +++ b/jobs/dpo/dpo_lipschitz_multi_gpu.sh @@ -0,0 +1,37 @@ +#!/bin/bash +#SBATCH --job-name=aif-gen-dpo-lipschitz +#SBATCH --nodes=1 # Request 2 nodes +#SBATCH --gpus-per-node=h100:4 # Request 4 H100 GPUs per node +#SBATCH --ntasks-per-node=4 # One task per GPU +#SBATCH --cpus-per-task=6 +#SBATCH --mem=64G +#SBATCH --time=24:00:00 +#SBATCH --output=out/%x.%j.out # Include job name + job ID +#SBATCH --error=out/%x.%j.err # Include job name + job ID +#SBATCH --mail-type=ALL +#SBATCH --account=aip-rrabba +#SBATCH --mail-user=shahrad_m@icloud.com # Update with your email + +source .env + +dataset_name='aifgen-lipschitz' + +accelerate launch --config_file benchmarks/dpo/accelerate_configs/deepspeed_zero3.yaml \ + benchmarks/dpo/dpo_continual.py \ + --dataset_name $dataset_name \ + --model_name_or_path Qwen/Qwen2-0.5B-Instruct \ + --reward_model_path LifelongAlignment/Qwen2.5-0.5B-Instruct_${dataset_name}_REWARD \ + --learning_rate 5.0e-6 \ + --num_train_epochs 4 \ + --per_device_train_batch_size 8 \ + --gradient_checkpointing \ + --logging_steps 20 \ + --eval_strategy steps \ + --response_length 256 \ + --eval_steps 500 \ + --save_steps 500 \ + --bf16 \ + --output_dir "$SCRATCH/projects/Qwen2-0.5B-DPO-${dataset_name}" \ + --no_remove_unused_columns \ + --wandb_project $dataset_name \ + --wandb_run_name "Qwen2-0.5B-DPO-${dataset_name}-multi-gpu" diff --git a/jobs/dpo/dpo_long_piecewise_multi_gpu.sh b/jobs/dpo/dpo_long_piecewise_multi_gpu.sh new file mode 100644 index 00000000..7c63fa6e --- /dev/null +++ b/jobs/dpo/dpo_long_piecewise_multi_gpu.sh @@ -0,0 +1,37 @@ +#!/bin/bash +#SBATCH --job-name=aif-gen-dpo-long_piecewise +#SBATCH --nodes=1 # Request 2 nodes +#SBATCH --gpus-per-node=h100:4 # Request 4 H100 GPUs per node +#SBATCH --ntasks-per-node=4 # One task per GPU +#SBATCH --cpus-per-task=6 +#SBATCH --mem=64G +#SBATCH --time=24:00:00 +#SBATCH --output=out/%x.%j.out # Include job name + job ID +#SBATCH --error=out/%x.%j.err # Include job name + job ID +#SBATCH --mail-type=ALL +#SBATCH --account=aip-rrabba +#SBATCH 
--mail-user=shahrad_m@icloud.com # Update with your email + +source .env + +dataset_name='aifgen-long-piecewise' + +accelerate launch --config_file benchmarks/dpo/accelerate_configs/deepspeed_zero3.yaml \ + benchmarks/dpo/dpo_continual.py \ + --dataset_name $dataset_name \ + --model_name_or_path Qwen/Qwen2-0.5B-Instruct \ + --reward_model_path LifelongAlignment/Qwen2.5-0.5B-Instruct_${dataset_name}_REWARD \ + --learning_rate 5.0e-6 \ + --num_train_epochs 4 \ + --per_device_train_batch_size 8 \ + --gradient_checkpointing \ + --logging_steps 20 \ + --eval_strategy steps \ + --response_length 256 \ + --eval_steps 500 \ + --save_steps 500 \ + --bf16 \ + --output_dir "$SCRATCH/projects/Qwen2-0.5B-DPO-${dataset_name}" \ + --no_remove_unused_columns \ + --wandb_project $dataset_name \ + --wandb_run_name "Qwen2-0.5B-DPO-${dataset_name}-multi-gpu" diff --git a/jobs/dpo/dpo_piecewise_multi_gpu.sh b/jobs/dpo/dpo_piecewise_multi_gpu.sh new file mode 100644 index 00000000..4e3f1b4e --- /dev/null +++ b/jobs/dpo/dpo_piecewise_multi_gpu.sh @@ -0,0 +1,37 @@ +#!/bin/bash +#SBATCH --job-name=aif-gen-dpo-piecewise-preference-shift +#SBATCH --nodes=1 # Request 2 nodes +#SBATCH --gpus-per-node=h100:4 # Request 4 H100 GPUs per node +#SBATCH --ntasks-per-node=4 # One task per GPU +#SBATCH --cpus-per-task=6 +#SBATCH --mem=64G +#SBATCH --time=24:00:00 +#SBATCH --output=out/%x.%j.out # Include job name + job ID +#SBATCH --error=out/%x.%j.err # Include job name + job ID +#SBATCH --mail-type=ALL +#SBATCH --account=aip-rrabba +#SBATCH --mail-user=shahrad_m@icloud.com # Update with your email + +source .env + +dataset_name='aifgen-piecewise-preference-shift' + +accelerate launch --config_file benchmarks/dpo/accelerate_configs/deepspeed_zero3.yaml \ + benchmarks/dpo/dpo_continual.py \ + --dataset_name $dataset_name \ + --model_name_or_path Qwen/Qwen2-0.5B-Instruct \ + --reward_model_path LifelongAlignment/Qwen2.5-0.5B-Instruct_${dataset_name}_REWARD \ + --learning_rate 5.0e-6 \ + --num_train_epochs 4 \ + --per_device_train_batch_size 8 \ + --gradient_checkpointing \ + --logging_steps 20 \ + --eval_strategy steps \ + --response_length 256 \ + --eval_steps 500 \ + --save_steps 500 \ + --bf16 \ + --output_dir "$SCRATCH/projects/Qwen2-0.5B-DPO-${dataset_name}" \ + --no_remove_unused_columns \ + --wandb_project $dataset_name \ + --wandb_run_name "Qwen2-0.5B-DPO-${dataset_name}-multi-gpu" diff --git a/jobs/dpo/dpo_short_piecewise_multi_gpu.sh b/jobs/dpo/dpo_short_piecewise_multi_gpu.sh new file mode 100644 index 00000000..542cb4a6 --- /dev/null +++ b/jobs/dpo/dpo_short_piecewise_multi_gpu.sh @@ -0,0 +1,37 @@ +#!/bin/bash +#SBATCH --job-name=aif-gen-dpo-short-piecewise +#SBATCH --nodes=1 # Request 2 nodes +#SBATCH --gpus-per-node=h100:4 # Request 4 H100 GPUs per node +#SBATCH --ntasks-per-node=4 # One task per GPU +#SBATCH --cpus-per-task=6 +#SBATCH --mem=64G +#SBATCH --time=24:00:00 +#SBATCH --output=out/%x.%j.out # Include job name + job ID +#SBATCH --error=out/%x.%j.err # Include job name + job ID +#SBATCH --mail-type=ALL +#SBATCH --account=aip-rrabba +#SBATCH --mail-user=shahrad_m@icloud.com # Update with your email + +source .env + +dataset_name='aifgen-short-piecewise' + +accelerate launch --config_file benchmarks/dpo/accelerate_configs/deepspeed_zero3.yaml \ + benchmarks/dpo/dpo_continual.py \ + --dataset_name $dataset_name \ + --model_name_or_path Qwen/Qwen2-0.5B-Instruct \ + --reward_model_path LifelongAlignment/Qwen2.5-0.5B-Instruct_${dataset_name}_REWARD \ + --learning_rate 5.0e-6 \ + --num_train_epochs 4 \ + 
+    --per_device_train_batch_size 8 \
+    --gradient_checkpointing \
+    --logging_steps 20 \
+    --eval_strategy steps \
+    --response_length 256 \
+    --eval_steps 500 \
+    --save_steps 500 \
+    --bf16 \
+    --output_dir "$SCRATCH/projects/Qwen2-0.5B-DPO-${dataset_name}" \
+    --no_remove_unused_columns \
+    --wandb_project $dataset_name \
+    --wandb_run_name "Qwen2-0.5B-DPO-${dataset_name}-multi-gpu"
diff --git a/jobs/dpo_ewc/dpo_ewc_cppo_multi_gpu.sh b/jobs/dpo_ewc/dpo_ewc_cppo_multi_gpu.sh
new file mode 100644
index 00000000..dd4a2262
--- /dev/null
+++ b/jobs/dpo_ewc/dpo_ewc_cppo_multi_gpu.sh
@@ -0,0 +1,37 @@
+#!/bin/bash
+#SBATCH --job-name=aif-gen-dpo-ewc-cppo
+#SBATCH --nodes=1                  # Request 1 node
+#SBATCH --gpus-per-node=h100:4     # Request 4 H100 GPUs per node
+#SBATCH --ntasks-per-node=4        # One task per GPU
+#SBATCH --cpus-per-task=6
+#SBATCH --mem=64G
+#SBATCH --time=24:00:00
+#SBATCH --output=out/%x.%j.out     # Include job name + job ID
+#SBATCH --error=out/%x.%j.err      # Include job name + job ID
+#SBATCH --mail-type=ALL
+#SBATCH --account=aip-rrabba
+#SBATCH --mail-user=shahrad_m@icloud.com  # Update with your email
+
+source .env
+
+dataset_name='CPPO-RL'
+
+accelerate launch --config_file benchmarks/dpo/accelerate_configs/deepspeed_zero2.yaml \
+    benchmarks/dpo_ewc/dpo_EWC_continual.py \
+    --dataset_name 'CPPO-RL' \
+    --model_name_or_path Qwen/Qwen2-0.5B-Instruct \
+    --reward_model_path LifelongAlignment/Qwen2.5-0.5B-Instruct_CPPO_REWARD \
+    --learning_rate 5.0e-6 \
+    --num_train_epochs 4 \
+    --per_device_train_batch_size 8 \
+    --gradient_checkpointing \
+    --logging_steps 20 \
+    --eval_strategy steps \
+    --response_length 256 \
+    --eval_steps 500 \
+    --save_steps 500 \
+    --bf16 \
+    --output_dir "$SCRATCH/projects/Qwen2-0.5B-DPO-EWC-${dataset_name}" \
+    --no_remove_unused_columns \
+    --wandb_project $dataset_name \
+    --wandb_run_name "Qwen2-0.5B-DPO-EWC-${dataset_name}-multi-gpu"
diff --git a/jobs/dpo_ewc/dpo_ewc_domain_shift_multi_gpu.sh b/jobs/dpo_ewc/dpo_ewc_domain_shift_multi_gpu.sh
new file mode 100644
index 00000000..2cfcb1a9
--- /dev/null
+++ b/jobs/dpo_ewc/dpo_ewc_domain_shift_multi_gpu.sh
@@ -0,0 +1,37 @@
+#!/bin/bash
+#SBATCH --job-name=aif-gen-dpo-ewc-domain_shift
+#SBATCH --nodes=1                  # Request 1 node
+#SBATCH --gpus-per-node=h100:4     # Request 4 H100 GPUs per node
+#SBATCH --ntasks-per-node=4        # One task per GPU
+#SBATCH --cpus-per-task=6
+#SBATCH --mem=64G
+#SBATCH --time=24:00:00
+#SBATCH --output=out/%x.%j.out     # Include job name + job ID
+#SBATCH --error=out/%x.%j.err      # Include job name + job ID
+#SBATCH --mail-type=ALL
+#SBATCH --account=aip-rrabba
+#SBATCH --mail-user=shahrad_m@icloud.com  # Update with your email
+
+source .env
+
+dataset_name='aifgen-domain-preference-shift'
+
+accelerate launch --config_file benchmarks/dpo/accelerate_configs/deepspeed_zero2.yaml \
+    benchmarks/dpo_ewc/dpo_EWC_continual.py \
+    --dataset_name $dataset_name \
+    --model_name_or_path Qwen/Qwen2-0.5B-Instruct \
+    --reward_model_path LifelongAlignment/Qwen2.5-0.5B-Instruct_${dataset_name}_REWARD \
+    --learning_rate 5.0e-6 \
+    --num_train_epochs 4 \
+    --per_device_train_batch_size 8 \
+    --gradient_checkpointing \
+    --logging_steps 20 \
+    --eval_strategy steps \
+    --response_length 256 \
+    --eval_steps 500 \
+    --save_steps 500 \
+    --bf16 \
+    --output_dir "$SCRATCH/projects/Qwen2-0.5B-DPO-EWC-${dataset_name}" \
+    --no_remove_unused_columns \
+    --wandb_project $dataset_name \
+    --wandb_run_name "Qwen2-0.5B-DPO-EWC-${dataset_name}-multi-gpu"
diff --git a/jobs/dpo_ewc/dpo_ewc_lipschitz_multi_gpu.sh b/jobs/dpo_ewc/dpo_ewc_lipschitz_multi_gpu.sh
new file mode 100644
index 00000000..7cda0f99
--- /dev/null
+++ b/jobs/dpo_ewc/dpo_ewc_lipschitz_multi_gpu.sh
@@ -0,0 +1,37 @@
+#!/bin/bash
+#SBATCH --job-name=aif-gen-dpo-ewc-lipschitz
+#SBATCH --nodes=1                  # Request 1 node
+#SBATCH --gpus-per-node=h100:4     # Request 4 H100 GPUs per node
+#SBATCH --ntasks-per-node=4        # One task per GPU
+#SBATCH --cpus-per-task=6
+#SBATCH --mem=64G
+#SBATCH --time=24:00:00
+#SBATCH --output=out/%x.%j.out     # Include job name + job ID
+#SBATCH --error=out/%x.%j.err      # Include job name + job ID
+#SBATCH --mail-type=ALL
+#SBATCH --account=aip-rrabba
+#SBATCH --mail-user=shahrad_m@icloud.com  # Update with your email
+
+source .env
+
+dataset_name='aifgen-lipschitz'
+
+accelerate launch --config_file benchmarks/dpo/accelerate_configs/deepspeed_zero2.yaml \
+    benchmarks/dpo_ewc/dpo_EWC_continual.py \
+    --dataset_name $dataset_name \
+    --model_name_or_path Qwen/Qwen2-0.5B-Instruct \
+    --reward_model_path LifelongAlignment/Qwen2.5-0.5B-Instruct_${dataset_name}_REWARD \
+    --learning_rate 5.0e-6 \
+    --num_train_epochs 4 \
+    --per_device_train_batch_size 8 \
+    --gradient_checkpointing \
+    --logging_steps 20 \
+    --eval_strategy steps \
+    --response_length 256 \
+    --eval_steps 500 \
+    --save_steps 500 \
+    --bf16 \
+    --output_dir "$SCRATCH/projects/Qwen2-0.5B-DPO-EWC-${dataset_name}" \
+    --no_remove_unused_columns \
+    --wandb_project $dataset_name \
+    --wandb_run_name "Qwen2-0.5B-DPO-EWC-${dataset_name}-multi-gpu"
diff --git a/jobs/dpo_ewc/dpo_ewc_long_piecewise_multi_gpu.sh b/jobs/dpo_ewc/dpo_ewc_long_piecewise_multi_gpu.sh
new file mode 100644
index 00000000..39ae8eea
--- /dev/null
+++ b/jobs/dpo_ewc/dpo_ewc_long_piecewise_multi_gpu.sh
@@ -0,0 +1,37 @@
+#!/bin/bash
+#SBATCH --job-name=aif-gen-dpo-ewc-long_piecewise
+#SBATCH --nodes=1                  # Request 1 node
+#SBATCH --gpus-per-node=h100:4     # Request 4 H100 GPUs per node
+#SBATCH --ntasks-per-node=4        # One task per GPU
+#SBATCH --cpus-per-task=6
+#SBATCH --mem=64G
+#SBATCH --time=24:00:00
+#SBATCH --output=out/%x.%j.out     # Include job name + job ID
+#SBATCH --error=out/%x.%j.err      # Include job name + job ID
+#SBATCH --mail-type=ALL
+#SBATCH --account=aip-rrabba
+#SBATCH --mail-user=shahrad_m@icloud.com  # Update with your email
+
+source .env
+
+dataset_name='aifgen-long-piecewise'
+
+accelerate launch --config_file benchmarks/dpo/accelerate_configs/deepspeed_zero2.yaml \
+    benchmarks/dpo_ewc/dpo_EWC_continual.py \
+    --dataset_name $dataset_name \
+    --model_name_or_path Qwen/Qwen2-0.5B-Instruct \
+    --reward_model_path LifelongAlignment/Qwen2.5-0.5B-Instruct_${dataset_name}_REWARD \
+    --learning_rate 5.0e-6 \
+    --num_train_epochs 4 \
+    --per_device_train_batch_size 8 \
+    --gradient_checkpointing \
+    --logging_steps 20 \
+    --eval_strategy steps \
+    --response_length 256 \
+    --eval_steps 500 \
+    --save_steps 500 \
+    --bf16 \
+    --output_dir "$SCRATCH/projects/Qwen2-0.5B-DPO-EWC-${dataset_name}" \
+    --no_remove_unused_columns \
+    --wandb_project $dataset_name \
+    --wandb_run_name "Qwen2-0.5B-DPO-EWC-${dataset_name}-multi-gpu"
diff --git a/jobs/dpo_ewc/dpo_ewc_piecewise_multi_gpu.sh b/jobs/dpo_ewc/dpo_ewc_piecewise_multi_gpu.sh
new file mode 100644
index 00000000..7f477e12
--- /dev/null
+++ b/jobs/dpo_ewc/dpo_ewc_piecewise_multi_gpu.sh
@@ -0,0 +1,37 @@
+#!/bin/bash
+#SBATCH --job-name=aif-gen-dpo-ewc-piecewise-preference-shift
+#SBATCH --nodes=1                  # Request 1 node
+#SBATCH --gpus-per-node=h100:4     # Request 4 H100 GPUs per node
+#SBATCH --ntasks-per-node=4        # One task per GPU
+#SBATCH --cpus-per-task=6
+#SBATCH --mem=64G
+#SBATCH --time=24:00:00
+#SBATCH --output=out/%x.%j.out     # Include job name + job ID
+#SBATCH --error=out/%x.%j.err      # Include job name + job ID
+#SBATCH --mail-type=ALL
+#SBATCH --account=aip-rrabba
+#SBATCH --mail-user=shahrad_m@icloud.com  # Update with your email
+
+source .env
+
+dataset_name='aifgen-piecewise-preference-shift'
+
+accelerate launch --config_file benchmarks/dpo/accelerate_configs/deepspeed_zero2.yaml \
+    benchmarks/dpo_ewc/dpo_EWC_continual.py \
+    --dataset_name $dataset_name \
+    --model_name_or_path Qwen/Qwen2-0.5B-Instruct \
+    --reward_model_path LifelongAlignment/Qwen2.5-0.5B-Instruct_${dataset_name}_REWARD \
+    --learning_rate 5.0e-6 \
+    --num_train_epochs 4 \
+    --per_device_train_batch_size 8 \
+    --gradient_checkpointing \
+    --logging_steps 20 \
+    --eval_strategy steps \
+    --response_length 256 \
+    --eval_steps 500 \
+    --save_steps 500 \
+    --bf16 \
+    --output_dir "$SCRATCH/projects/Qwen2-0.5B-DPO-EWC-${dataset_name}" \
+    --no_remove_unused_columns \
+    --wandb_project $dataset_name \
+    --wandb_run_name "Qwen2-0.5B-DPO-EWC-${dataset_name}-multi-gpu"
diff --git a/jobs/dpo_ewc/dpo_ewc_short_piecewise_multi_gpu.sh b/jobs/dpo_ewc/dpo_ewc_short_piecewise_multi_gpu.sh
new file mode 100644
index 00000000..9c88d6e3
--- /dev/null
+++ b/jobs/dpo_ewc/dpo_ewc_short_piecewise_multi_gpu.sh
@@ -0,0 +1,37 @@
+#!/bin/bash
+#SBATCH --job-name=aif-gen-dpo-ewc-short-piecewise
+#SBATCH --nodes=1                  # Request 1 node
+#SBATCH --gpus-per-node=h100:4     # Request 4 H100 GPUs per node
+#SBATCH --ntasks-per-node=4        # One task per GPU
+#SBATCH --cpus-per-task=6
+#SBATCH --mem=64G
+#SBATCH --time=24:00:00
+#SBATCH --output=out/%x.%j.out     # Include job name + job ID
+#SBATCH --error=out/%x.%j.err      # Include job name + job ID
+#SBATCH --mail-type=ALL
+#SBATCH --account=aip-rrabba
+#SBATCH --mail-user=shahrad_m@icloud.com  # Update with your email
+
+source .env
+
+dataset_name='aifgen-short-piecewise'
+
+accelerate launch --config_file benchmarks/dpo/accelerate_configs/deepspeed_zero3.yaml \
+    benchmarks/dpo_ewc/dpo_EWC_continual.py \
+    --dataset_name $dataset_name \
+    --model_name_or_path Qwen/Qwen2-0.5B-Instruct \
+    --reward_model_path LifelongAlignment/Qwen2.5-0.5B-Instruct_${dataset_name}_REWARD \
+    --learning_rate 5.0e-6 \
+    --num_train_epochs 4 \
+    --per_device_train_batch_size 8 \
+    --gradient_checkpointing \
+    --logging_steps 20 \
+    --eval_strategy steps \
+    --response_length 256 \
+    --eval_steps 500 \
+    --save_steps 500 \
+    --bf16 \
+    --output_dir "$SCRATCH/projects/Qwen2-0.5B-DPO-EWC-${dataset_name}" \
+    --no_remove_unused_columns \
+    --wandb_project $dataset_name \
+    --wandb_run_name "Qwen2-0.5B-DPO-EWC-${dataset_name}-multi-gpu"
diff --git a/jobs/ppo/ppo_cppo_multi_gpu.sh b/jobs/ppo/ppo_cppo_multi_gpu.sh
new file mode 100644
index 00000000..a654ed63
--- /dev/null
+++ b/jobs/ppo/ppo_cppo_multi_gpu.sh
@@ -0,0 +1,40 @@
+#!/bin/bash
+#SBATCH --job-name=aif-gen-ppo-cppo
+#SBATCH --nodes=1                  # Request 1 node
+#SBATCH --gpus-per-node=h100:4     # Request 4 H100 GPUs per node
+#SBATCH --ntasks-per-node=4        # One task per GPU
+#SBATCH --cpus-per-task=6
+#SBATCH --mem=64G
+#SBATCH --time=24:00:00
+#SBATCH --output=out/%x.%j.out     # Include job name + job ID
+#SBATCH --error=out/%x.%j.err      # Include job name + job ID
+#SBATCH --mail-type=ALL
+#SBATCH --account=aip-rrabba
+#SBATCH --mail-user=shahrad_m@icloud.com  # Update with your email
+
+source .env
+
+dataset_name='CPPO-RL'
+
+accelerate launch --config_file benchmarks/cppo/accelerate_configs/deepspeed_zero2.yaml \
+    benchmarks/ppo/ppo_continual.py \
+    --wandb_project "$dataset_name-post-May-19" \
+    --wandb_run_name "Qwen2-0.5B-PPO-${dataset_name}-multi-gpu" \
+    --dataset_name "$dataset_name" \
+    --sft_model_path Qwen/Qwen2-0.5B-Instruct \
+    --value_model_path LifelongAlignment/Qwen2.5-0.5B-Instruct_CPPO_REWARD_0 \
+    --reward_model_path LifelongAlignment/Qwen2.5-0.5B-Instruct_CPPO_REWARD \
+    --learning_rate 1.0e-6 \
+    --kl_coef 0.37 \
+    --cliprange 0.1 \
+    --response_length 256 \
+    --num_train_epochs 4 \
+    --gradient_checkpointing \
+    --per_device_train_batch_size 16 \
+    --logging_steps 10 \
+    --eval_strategy steps \
+    --eval_steps 200 \
+    --save_steps 300 \
+    --bf16 \
+    --output_dir "$HOME/Qwen2-0.5B-PPO-${dataset_name}" \
+    --no_remove_unused_columns
diff --git a/jobs/ppo/ppo_domain_shift_multi_gpu.sh b/jobs/ppo/ppo_domain_shift_multi_gpu.sh
new file mode 100644
index 00000000..1665f306
--- /dev/null
+++ b/jobs/ppo/ppo_domain_shift_multi_gpu.sh
@@ -0,0 +1,39 @@
+#!/bin/bash
+#SBATCH --job-name=aif-gen-ppo-domain_shift
+#SBATCH --nodes=1                  # Request 1 node
+#SBATCH --gpus-per-node=h100:4     # Request 4 H100 GPUs per node
+#SBATCH --ntasks-per-node=4        # One task per GPU
+#SBATCH --cpus-per-task=6
+#SBATCH --mem=64G
+#SBATCH --time=24:00:00
+#SBATCH --output=out/%x.%j.out     # Include job name + job ID
+#SBATCH --error=out/%x.%j.err      # Include job name + job ID
+#SBATCH --mail-type=ALL
+#SBATCH --account=aip-rrabba
+#SBATCH --mail-user=shahrad_m@icloud.com  # Update with your email
+source .env
+
+dataset_name='aifgen-domain-preference-shift'
+
+accelerate launch --config_file benchmarks/cppo/accelerate_configs/deepspeed_zero2.yaml \
+    benchmarks/ppo/ppo_continual.py \
+    --wandb_project "$dataset_name-post-May-19" \
+    --wandb_run_name "Qwen2-0.5B-PPO-${dataset_name}-multi-gpu" \
+    --dataset_name $dataset_name \
+    --sft_model_path Qwen/Qwen2-0.5B-Instruct \
+    --value_model_path LifelongAlignment/Qwen2-0.5B-Instruct_${dataset_name}_REWARD_0 \
+    --reward_model_path LifelongAlignment/Qwen2-0.5B-Instruct_${dataset_name}_REWARD \
+    --learning_rate 1.0e-6 \
+    --kl_coef 0.37 \
+    --cliprange 0.1 \
+    --response_length 256 \
+    --num_train_epochs 4 \
+    --gradient_checkpointing \
+    --per_device_train_batch_size 16 \
+    --logging_steps 10 \
+    --eval_strategy steps \
+    --eval_steps 200 \
+    --save_steps 300 \
+    --bf16 \
+    --output_dir "$SCRATCH/Qwen2-0.5B-PPO-${dataset_name}" \
+    --no_remove_unused_columns
diff --git a/jobs/ppo/ppo_lipschitz_multi_gpu.sh b/jobs/ppo/ppo_lipschitz_multi_gpu.sh
new file mode 100644
index 00000000..d65cb6d3
--- /dev/null
+++ b/jobs/ppo/ppo_lipschitz_multi_gpu.sh
@@ -0,0 +1,40 @@
+#!/bin/bash
+#SBATCH --job-name=aif-gen-ppo-lipschitz
+#SBATCH --nodes=1                  # Request 1 node
+#SBATCH --gpus-per-node=h100:4     # Request 4 H100 GPUs per node
+#SBATCH --ntasks-per-node=4        # One task per GPU
+#SBATCH --cpus-per-task=6
+#SBATCH --mem=64G
+#SBATCH --time=24:00:00
+#SBATCH --output=out/%x.%j.out     # Include job name + job ID
+#SBATCH --error=out/%x.%j.err      # Include job name + job ID
+#SBATCH --mail-type=ALL
+#SBATCH --account=aip-rrabba
+#SBATCH --mail-user=shahrad_m@icloud.com  # Update with your email
+
+source .env
+
+dataset_name='aifgen-lipschitz'
+
+accelerate launch --config_file benchmarks/cppo/accelerate_configs/deepspeed_zero2.yaml \
+    benchmarks/ppo/ppo_continual.py \
+    --wandb_project "$dataset_name-post-May-19" \
+    --wandb_run_name "Qwen2-0.5B-PPO-${dataset_name}-multi-gpu" \
+    --dataset_name $dataset_name \
+    --sft_model_path Qwen/Qwen2-0.5B-Instruct \
+    --value_model_path LifelongAlignment/Qwen2-0.5B-Instruct_${dataset_name}_REWARD_0 \
+    --reward_model_path LifelongAlignment/Qwen2-0.5B-Instruct_${dataset_name}_REWARD \
+    --learning_rate 1.0e-6 \
+    --kl_coef 0.37 \
+    --cliprange 0.1 \
+    --response_length 256 \
+    --num_train_epochs 4 \
+    --gradient_checkpointing \
+    --per_device_train_batch_size 8 \
+    --logging_steps 10 \
+    --eval_strategy steps \
+    --eval_steps 200 \
+    --save_steps 300 \
+    --bf16 \
+    --output_dir "$SCRATCH/Qwen2-0.5B-PPO-${dataset_name}" \
+    --no_remove_unused_columns
diff --git a/jobs/ppo/ppo_long_piecewise_multi_gpu.sh b/jobs/ppo/ppo_long_piecewise_multi_gpu.sh
new file mode 100644
index 00000000..848046bd
--- /dev/null
+++ b/jobs/ppo/ppo_long_piecewise_multi_gpu.sh
@@ -0,0 +1,39 @@
+#!/bin/bash
+#SBATCH --job-name=aif-gen-ppo-long-piecewise
+#SBATCH --nodes=1                  # Request 1 node
+#SBATCH --gpus-per-node=h100:4     # Request 4 H100 GPUs per node
+#SBATCH --ntasks-per-node=4        # One task per GPU
+#SBATCH --cpus-per-task=6
+#SBATCH --mem=64G
+#SBATCH --time=24:00:00
+#SBATCH --output=out/%x.%j.out     # Include job name + job ID
+#SBATCH --error=out/%x.%j.err      # Include job name + job ID
+#SBATCH --mail-type=ALL
+#SBATCH --account=aip-rrabba
+#SBATCH --mail-user=shahrad_m@icloud.com  # Update with your email
+source .env
+
+dataset_name='aifgen-long-piecewise'
+
+accelerate launch --config_file benchmarks/cppo/accelerate_configs/deepspeed_zero2.yaml \
+    benchmarks/ppo/ppo_continual.py \
+    --wandb_project "$dataset_name-post-May-19" \
+    --wandb_run_name "Qwen2-0.5B-PPO-${dataset_name}-multi-gpu" \
+    --dataset_name $dataset_name \
+    --sft_model_path Qwen/Qwen2-0.5B-Instruct \
+    --value_model_path LifelongAlignment/Qwen2-0.5B-Instruct_${dataset_name}_REWARD_0 \
+    --reward_model_path LifelongAlignment/Qwen2-0.5B-Instruct_${dataset_name}_REWARD \
+    --learning_rate 1.0e-6 \
+    --kl_coef 0.37 \
+    --cliprange 0.1 \
+    --response_length 256 \
+    --num_train_epochs 4 \
+    --gradient_checkpointing \
+    --per_device_train_batch_size 16 \
+    --logging_steps 10 \
+    --eval_strategy steps \
+    --eval_steps 200 \
+    --save_steps 300 \
+    --bf16 \
+    --output_dir "$SCRATCH/Qwen2-0.5B-PPO-${dataset_name}" \
+    --no_remove_unused_columns
diff --git a/jobs/ppo/ppo_piecewise_multi_gpu.sh b/jobs/ppo/ppo_piecewise_multi_gpu.sh
new file mode 100644
index 00000000..5078949f
--- /dev/null
+++ b/jobs/ppo/ppo_piecewise_multi_gpu.sh
@@ -0,0 +1,40 @@
+#!/bin/bash
+#SBATCH --job-name=aif-gen-ppo-piecewise
+#SBATCH --nodes=1                  # Request 1 node
+#SBATCH --gpus-per-node=h100:4     # Request 4 H100 GPUs per node
+#SBATCH --ntasks-per-node=4        # One task per GPU
+#SBATCH --cpus-per-task=6
+#SBATCH --mem=64G
+#SBATCH --time=24:00:00
+#SBATCH --output=out/%x.%j.out     # Include job name + job ID
+#SBATCH --error=out/%x.%j.err      # Include job name + job ID
+#SBATCH --mail-type=ALL
+#SBATCH --account=aip-rrabba
+#SBATCH --mail-user=shahrad_m@icloud.com  # Update with your email
+
+source .env
+
+dataset_name='aifgen-piecewise-preference-shift'
+
+accelerate launch --config_file benchmarks/cppo/accelerate_configs/deepspeed_zero2.yaml \
+    benchmarks/ppo/ppo_continual.py \
+    --wandb_project "$dataset_name-post-May-19" \
+    --wandb_run_name "Qwen2-0.5B-PPO-${dataset_name}-multi-gpu" \
+    --dataset_name $dataset_name \
+    --sft_model_path Qwen/Qwen2-0.5B-Instruct \
+    --value_model_path LifelongAlignment/Qwen2-0.5B-Instruct_${dataset_name}_REWARD_0 \
+    --reward_model_path LifelongAlignment/Qwen2-0.5B-Instruct_${dataset_name}_REWARD \
+    --learning_rate 1.0e-6 \
+    --kl_coef 0.37 \
+    --cliprange 0.1 \
+    --response_length 256 \
+    --num_train_epochs 4 \
+    --gradient_checkpointing \
+    --per_device_train_batch_size 8 \
+    --logging_steps 10 \
+    --eval_strategy steps \
+    --eval_steps 200 \
+    --save_steps 300 \
+    --bf16 \
+    --output_dir "$SCRATCH/Qwen2-0.5B-PPO-${dataset_name}" \
+    --no_remove_unused_columns
diff --git a/jobs/ppo/ppo_short_piecewise_multi_gpu.sh b/jobs/ppo/ppo_short_piecewise_multi_gpu.sh
new file mode 100644
index 00000000..53c20c6b
--- /dev/null
+++ b/jobs/ppo/ppo_short_piecewise_multi_gpu.sh
@@ -0,0 +1,38 @@
+#!/bin/bash
+#SBATCH --job-name=aif-gen-ppo-short-piecewise
+#SBATCH --nodes=1                  # Request 1 node
+#SBATCH --gpus-per-node=h100:4     # Request 4 H100 GPUs per node
+#SBATCH --ntasks-per-node=4        # One task per GPU
+#SBATCH --cpus-per-task=6
+#SBATCH --mem=64G
+#SBATCH --time=24:00:00
+#SBATCH --output=out/%x.%j.out     # Include job name + job ID
+#SBATCH --error=out/%x.%j.err      # Include job name + job ID
+#SBATCH --mail-type=ALL
+#SBATCH --account=aip-rrabba
+#SBATCH --mail-user=shahrad_m@icloud.com  # Update with your email
+
+source .env
+
+dataset_name='aifgen-short-piecewise'
+
+accelerate launch --config_file benchmarks/cppo/accelerate_configs/deepspeed_zero2.yaml \
+    benchmarks/ppo/ppo_continual.py \
+    --wandb_project $dataset_name \
+    --wandb_run_name "Qwen2-0.5B-PPO-${dataset_name}-multi-gpu" \
+    --dataset_name $dataset_name \
+    --sft_model_path Qwen/Qwen2-0.5B-Instruct \
+    --value_model_path LifelongAlignment/Qwen2-0.5B-Instruct_${dataset_name}_REWARD_0 \
+    --reward_model_path LifelongAlignment/Qwen2-0.5B-Instruct_${dataset_name}_REWARD \
+    --learning_rate 5.0e-6 \
+    --response_length 256 \
+    --num_train_epochs 4 \
+    --gradient_checkpointing \
+    --per_device_train_batch_size 16 \
+    --logging_steps 10 \
+    --eval_strategy steps \
+    --eval_steps 300 \
+    --save_steps 300 \
+    --bf16 \
+    --output_dir "$HOME/Qwen2-0.5B-PPO-${dataset_name}" \
+    --no_remove_unused_columns
diff --git a/jobs/ppo_ewc/ppo_ewc_cppo_multi_gpu.sh b/jobs/ppo_ewc/ppo_ewc_cppo_multi_gpu.sh
new file mode 100644
index 00000000..4615ccde
--- /dev/null
+++ b/jobs/ppo_ewc/ppo_ewc_cppo_multi_gpu.sh
@@ -0,0 +1,40 @@
+#!/bin/bash
+#SBATCH --job-name=aif-gen-ppo-ewc-cppo
+#SBATCH --nodes=1                  # Request 1 node
+#SBATCH --gpus-per-node=h100:4     # Request 4 H100 GPUs per node
+#SBATCH --ntasks-per-node=4        # One task per GPU
+#SBATCH --cpus-per-task=6
+#SBATCH --mem=64G
+#SBATCH --time=24:00:00
+#SBATCH --output=out/%x.%j.out     # Include job name + job ID
+#SBATCH --error=out/%x.%j.err      # Include job name + job ID
+#SBATCH --mail-type=ALL
+#SBATCH --account=aip-rrabba
+#SBATCH --mail-user=shahrad_m@icloud.com  # Update with your email
+
+source .env
+
+dataset_name='CPPO-RL'
+
+accelerate launch --config_file benchmarks/ppo/accelerate_configs/deepspeed_zero2.yaml \
+    benchmarks/ppo_ewc/ppo_EWC_continual.py \
+    --wandb_project "$dataset_name-post-May-19" \
+    --wandb_run_name "Qwen2-0.5B-PPO-EWC-${dataset_name}-multi-gpu" \
+    --dataset_name "$dataset_name" \
+    --sft_model_path Qwen/Qwen2-0.5B-Instruct \
+    --value_model_path LifelongAlignment/Qwen2-0.5B-Instruct_CPPO_REWARD_0 \
+    --reward_model_path LifelongAlignment/Qwen2-0.5B-Instruct_CPPO_REWARD \
+    --learning_rate 1.0e-6 \
+    --kl_coef 0.37 \
+    --cliprange 0.1 \
+    --response_length 256 \
+    --num_train_epochs 4 \
+    --gradient_checkpointing \
+    --per_device_train_batch_size 16 \
+    --logging_steps 10 \
+    --eval_strategy steps \
+    --eval_steps 200 \
+    --save_steps 300 \
+    --bf16 \
+    --output_dir "$SCRATCH/Qwen2-0.5B-PPO-EWC-${dataset_name}" \
+    --no_remove_unused_columns
diff --git a/jobs/ppo_ewc/ppo_ewc_domain_shift_multi_gpu.sh b/jobs/ppo_ewc/ppo_ewc_domain_shift_multi_gpu.sh
new file mode 100644
index 00000000..22a943cd
--- /dev/null
+++ b/jobs/ppo_ewc/ppo_ewc_domain_shift_multi_gpu.sh
@@ -0,0 +1,39 @@
+#!/bin/bash
+#SBATCH --job-name=aif-gen-ppo-ewc-domain_shift
+#SBATCH --nodes=1                  # Request 1 node
+#SBATCH --gpus-per-node=h100:4     # Request 4 H100 GPUs per node
+#SBATCH --ntasks-per-node=4        # One task per GPU
+#SBATCH --cpus-per-task=6
+#SBATCH --mem=64G
+#SBATCH --time=24:00:00
+#SBATCH --output=out/%x.%j.out     # Include job name + job ID
+#SBATCH --error=out/%x.%j.err      # Include job name + job ID
+#SBATCH --mail-type=ALL
+#SBATCH --account=aip-rrabba
+#SBATCH --mail-user=shahrad_m@icloud.com  # Update with your email
+source .env
+
+dataset_name='aifgen-domain-preference-shift'
+
+accelerate launch --config_file benchmarks/ppo/accelerate_configs/deepspeed_zero2.yaml \
+    benchmarks/ppo_ewc/ppo_EWC_continual.py \
+    --wandb_project "$dataset_name-post-May-19" \
+    --wandb_run_name "Qwen2-0.5B-PPO-EWC-${dataset_name}-multi-gpu" \
+    --dataset_name $dataset_name \
+    --sft_model_path Qwen/Qwen2-0.5B-Instruct \
+    --value_model_path LifelongAlignment/Qwen2-0.5B-Instruct_${dataset_name}_REWARD_0 \
+    --reward_model_path LifelongAlignment/Qwen2-0.5B-Instruct_${dataset_name}_REWARD \
+    --learning_rate 1.0e-6 \
+    --kl_coef 0.37 \
+    --cliprange 0.1 \
+    --response_length 256 \
+    --num_train_epochs 4 \
+    --gradient_checkpointing \
+    --per_device_train_batch_size 16 \
+    --logging_steps 10 \
+    --eval_strategy steps \
+    --eval_steps 200 \
+    --save_steps 300 \
+    --bf16 \
+    --output_dir "$SCRATCH/Qwen2-0.5B-PPO-EWC-${dataset_name}" \
+    --no_remove_unused_columns
diff --git a/jobs/ppo_ewc/ppo_ewc_lipschitz_multi_gpu.sh b/jobs/ppo_ewc/ppo_ewc_lipschitz_multi_gpu.sh
new file mode 100644
index 00000000..cc1406a5
--- /dev/null
+++ b/jobs/ppo_ewc/ppo_ewc_lipschitz_multi_gpu.sh
@@ -0,0 +1,40 @@
+#!/bin/bash
+#SBATCH --job-name=aif-gen-ppo-ewc-lipschitz
+#SBATCH --nodes=1                  # Request 1 node
+#SBATCH --gpus-per-node=h100:4     # Request 4 H100 GPUs per node
+#SBATCH --ntasks-per-node=4        # One task per GPU
+#SBATCH --cpus-per-task=6
+#SBATCH --mem=64G
+#SBATCH --time=24:00:00
+#SBATCH --output=out/%x.%j.out     # Include job name + job ID
+#SBATCH --error=out/%x.%j.err      # Include job name + job ID
+#SBATCH --mail-type=ALL
+#SBATCH --account=aip-rrabba
+#SBATCH --mail-user=shahrad_m@icloud.com  # Update with your email
+
+source .env
+
+dataset_name='aifgen-lipschitz'
+
+accelerate launch --config_file benchmarks/ppo/accelerate_configs/deepspeed_zero2.yaml \
+    benchmarks/ppo_ewc/ppo_EWC_continual.py \
+    --wandb_project "$dataset_name-post-May-19" \
+    --wandb_run_name "Qwen2-0.5B-PPO-EWC-${dataset_name}-multi-gpu" \
+    --dataset_name $dataset_name \
+    --sft_model_path Qwen/Qwen2-0.5B-Instruct \
+    --value_model_path LifelongAlignment/Qwen2-0.5B-Instruct_${dataset_name}_REWARD_0 \
+    --reward_model_path LifelongAlignment/Qwen2-0.5B-Instruct_${dataset_name}_REWARD \
+    --learning_rate 1.0e-6 \
+    --kl_coef 0.37 \
+    --cliprange 0.1 \
+    --response_length 256 \
+    --num_train_epochs 4 \
+    --gradient_checkpointing \
+    --per_device_train_batch_size 8 \
+    --logging_steps 10 \
+    --eval_strategy steps \
+    --eval_steps 200 \
+    --save_steps 300 \
+    --bf16 \
+    --output_dir "$SCRATCH/Qwen2-0.5B-PPO-EWC-${dataset_name}" \
+    --no_remove_unused_columns
diff --git a/jobs/ppo_ewc/ppo_ewc_long_piecewise_multi_gpu.sh b/jobs/ppo_ewc/ppo_ewc_long_piecewise_multi_gpu.sh
new file mode 100644
index 00000000..22cf7201
--- /dev/null
+++ b/jobs/ppo_ewc/ppo_ewc_long_piecewise_multi_gpu.sh
@@ -0,0 +1,39 @@
+#!/bin/bash
+#SBATCH --job-name=aif-gen-ppo-ewc-long-piecewise
+#SBATCH --nodes=1                  # Request 1 node
+#SBATCH --gpus-per-node=h100:4     # Request 4 H100 GPUs per node
+#SBATCH --ntasks-per-node=4        # One task per GPU
+#SBATCH --cpus-per-task=6
+#SBATCH --mem=64G
+#SBATCH --time=24:00:00
+#SBATCH --output=out/%x.%j.out     # Include job name + job ID
+#SBATCH --error=out/%x.%j.err      # Include job name + job ID
+#SBATCH --mail-type=ALL
+#SBATCH --account=aip-rrabba
+#SBATCH --mail-user=shahrad_m@icloud.com  # Update with your email
+source .env
+
+dataset_name='aifgen-long-piecewise'
+
+accelerate launch --config_file benchmarks/ppo/accelerate_configs/deepspeed_zero2.yaml \
+    benchmarks/ppo_ewc/ppo_EWC_continual.py \
+    --wandb_project "$dataset_name-post-May-19" \
+    --wandb_run_name "Qwen2-0.5B-PPO-EWC-${dataset_name}-multi-gpu" \
+    --dataset_name $dataset_name \
+    --sft_model_path Qwen/Qwen2-0.5B-Instruct \
+    --value_model_path LifelongAlignment/Qwen2-0.5B-Instruct_${dataset_name}_REWARD_0 \
+    --reward_model_path LifelongAlignment/Qwen2-0.5B-Instruct_${dataset_name}_REWARD \
+    --learning_rate 1.0e-6 \
+    --kl_coef 0.37 \
+    --cliprange 0.1 \
+    --response_length 256 \
+    --num_train_epochs 4 \
+    --gradient_checkpointing \
+    --per_device_train_batch_size 16 \
+    --logging_steps 10 \
+    --eval_strategy steps \
+    --eval_steps 200 \
+    --save_steps 300 \
+    --bf16 \
+    --output_dir "$SCRATCH/Qwen2-0.5B-PPO-EWC-${dataset_name}" \
+    --no_remove_unused_columns
diff --git a/jobs/ppo_ewc/ppo_ewc_piecewise_multi_gpu.sh b/jobs/ppo_ewc/ppo_ewc_piecewise_multi_gpu.sh
new file mode 100644
index 00000000..721a46c7
--- /dev/null
+++ b/jobs/ppo_ewc/ppo_ewc_piecewise_multi_gpu.sh
@@ -0,0 +1,40 @@
+#!/bin/bash
+#SBATCH --job-name=aif-gen-ppo-ewc-piecewise
+#SBATCH --nodes=1                  # Request 1 node
+#SBATCH --gpus-per-node=h100:4     # Request 4 H100 GPUs per node
+#SBATCH --ntasks-per-node=4        # One task per GPU
+#SBATCH --cpus-per-task=6
+#SBATCH --mem=64G
+#SBATCH --time=24:00:00
+#SBATCH --output=out/%x.%j.out     # Include job name + job ID
+#SBATCH --error=out/%x.%j.err      # Include job name + job ID
+#SBATCH --mail-type=ALL
+#SBATCH --account=aip-rrabba
+#SBATCH --mail-user=shahrad_m@icloud.com  # Update with your email
+
+source .env
+
+dataset_name='aifgen-piecewise-preference-shift'
+
+accelerate launch --config_file benchmarks/ppo/accelerate_configs/deepspeed_zero2.yaml \
+    benchmarks/ppo_ewc/ppo_EWC_continual.py \
+    --wandb_project "$dataset_name-post-May-19" \
+    --wandb_run_name "Qwen2-0.5B-PPO-EWC-${dataset_name}-multi-gpu" \
+    --dataset_name $dataset_name \
+    --sft_model_path Qwen/Qwen2-0.5B-Instruct \
+    --value_model_path LifelongAlignment/Qwen2-0.5B-Instruct_${dataset_name}_REWARD_0 \
+    --reward_model_path LifelongAlignment/Qwen2-0.5B-Instruct_${dataset_name}_REWARD \
+    --learning_rate 1.0e-6 \
+    --kl_coef 0.37 \
+    --cliprange 0.1 \
+    --response_length 256 \
+    --num_train_epochs 4 \
+    --gradient_checkpointing \
+    --per_device_train_batch_size 8 \
+    --logging_steps 10 \
+    --eval_strategy steps \
+    --eval_steps 200 \
+    --save_steps 300 \
+    --bf16 \
+    --output_dir "$SCRATCH/Qwen2-0.5B-PPO-EWC-${dataset_name}" \
+    --no_remove_unused_columns
diff --git a/jobs/ppo_ewc/ppo_ewc_short_piecewise_multi_gpu.sh b/jobs/ppo_ewc/ppo_ewc_short_piecewise_multi_gpu.sh
new file mode 100644
index 00000000..029eb2e2
--- /dev/null
+++ b/jobs/ppo_ewc/ppo_ewc_short_piecewise_multi_gpu.sh
@@ -0,0 +1,38 @@
+#!/bin/bash
+#SBATCH --job-name=aif-gen-ppo-ewc-short-piecewise
+#SBATCH --nodes=1                  # Request 1 node
+#SBATCH --gpus-per-node=h100:4     # Request 4 H100 GPUs per node
+#SBATCH --ntasks-per-node=4        # One task per GPU
+#SBATCH --cpus-per-task=6
+#SBATCH --mem=64G
+#SBATCH --time=24:00:00
+#SBATCH --output=out/%x.%j.out     # Include job name + job ID
+#SBATCH --error=out/%x.%j.err      # Include job name + job ID
+#SBATCH --mail-type=ALL
+#SBATCH --account=aip-rrabba
+#SBATCH --mail-user=shahrad_m@icloud.com  # Update with your email
+
+source .env
+
+dataset_name='aifgen-short-piecewise'
+
+accelerate launch --config_file benchmarks/ppo/accelerate_configs/deepspeed_zero2.yaml \
+    benchmarks/ppo_ewc/ppo_EWC_continual.py \
+    --wandb_project $dataset_name \
+    --wandb_run_name "Qwen2-0.5B-PPO-EWC-${dataset_name}-multi-gpu" \
+    --dataset_name $dataset_name \
+    --sft_model_path Qwen/Qwen2-0.5B-Instruct \
+    --value_model_path LifelongAlignment/Qwen2-0.5B-Instruct_${dataset_name}_REWARD_0 \
+    --reward_model_path LifelongAlignment/Qwen2-0.5B-Instruct_${dataset_name}_REWARD \
+    --learning_rate 5.0e-6 \
+    --response_length 256 \
+    --num_train_epochs 4 \
+    --gradient_checkpointing \
+    --per_device_train_batch_size 16 \
+    --logging_steps 10 \
+    --eval_strategy steps \
+    --eval_steps 300 \
+    --save_steps 300 \
+    --bf16 \
+    --output_dir "$SCRATCH/Qwen2-0.5B-PPO-EWC-${dataset_name}" \
+    --no_remove_unused_columns
diff --git a/jobs/reward_modeling/reward_sweep_template.sh b/jobs/reward_modeling/reward_sweep_template.sh
new file mode 100644
index 00000000..438a9482
--- /dev/null
+++ b/jobs/reward_modeling/reward_sweep_template.sh
@@ -0,0 +1,37 @@
+#!/bin/bash
+#SBATCH --job-name=aif-gen-reward-long-piecewise-8B
+#SBATCH --nodes=1                  # Request 1 node
+#SBATCH --gpus-per-node=h100:4     # Request 4 H100 GPUs per node
+#SBATCH --ntasks-per-node=4        # One task per GPU
+#SBATCH --cpus-per-task=6
+#SBATCH --mem=0G
+#SBATCH --time=1:00:00
+#SBATCH --output=out/%x.%j.out     # Include job name + job ID
+#SBATCH --error=out/%x.%j.err      # Include job name + job ID
+#SBATCH --mail-type=ALL
+#SBATCH --account=aip-rrabba
+#SBATCH --mail-user=shahrad_m@icloud.com  # Update with your email
+source .env
+# Set PyTorch to use more aggressive memory allocation
+export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
+
+dataset_name='aifgen-long-piecewise'
+
+accelerate launch --config_file benchmarks/cppo/accelerate_configs/deepspeed_zero2.yaml \
+    benchmarks/reward_modeling.py \
+    --model_name_or_path Qwen/Qwen3-8B-Base \
+    --dataset_name $dataset_name \
+    --dataset_index 0 \
+    --output_dir "$SCRATCH/Qwen3-8B-REWARD-${dataset_name}" \
+    --num_train_epochs 4 \
+    --per_device_train_batch_size 4 \
+    --gradient_accumulation_steps 4 \
+    --gradient_checkpointing True \
+    --bf16 \
+    --learning_rate 5.0e-6 \
+    --logging_steps 30 \
+    --eval_strategy steps \
+    --eval_steps 70 \
+    --max_length 2048 \
+    --wandb_project $dataset_name \
+    --wandb_run_name "Qwen3-8B-REWARD-${dataset_name}-multi-gpu"
\ No newline at end of file
diff --git a/jobs/wandb_sync.sh b/jobs/wandb_sync.sh
new file mode 100755
index 00000000..8ba91336
--- /dev/null
+++ b/jobs/wandb_sync.sh
@@ -0,0 +1,41 @@
+#!/bin/bash
+# sync_wandb_runs.sh - Syncs all offline W&B runs found under $SCRATCH/wandb
+
+echo "🔎 Searching for WandB offline runs..."
+
+# Find all directories that match the wandb offline run pattern
+OFFLINE_DIRS=$(find "$SCRATCH/wandb" -maxdepth 1 -type d -name "offline-run-*")
+
+if [ -z "$OFFLINE_DIRS" ]; then
+    echo "❌ No WandB offline runs found."
+    exit 0
+fi
+
+# Count total runs
+TOTAL_RUNS=$(echo "$OFFLINE_DIRS" | wc -l)
+echo "🔍 Found $TOTAL_RUNS WandB offline runs."
+
+# Counter for synced runs
+SYNCED=0
+FAILED=0
+
+echo "🔄 Starting sync process..."
+echo "-----------------------------"
+
+# Process each run directory (here-string keeps counter updates in this shell)
+while read -r dir; do
+    echo "⏳ Syncing run: $dir"
+    if wandb sync "$dir"; then
+        SYNCED=$((SYNCED + 1))
+        echo "✅ Successfully synced: $dir"
+    else
+        FAILED=$((FAILED + 1))
+        echo "❌ Failed to sync: $dir"
+    fi
+    echo "-----------------------------"
+    echo "Progress: $SYNCED/$TOTAL_RUNS completed"
+done <<< "$OFFLINE_DIRS"
+
+echo "🏁 Sync complete!"
+echo "✅ Successfully synced: $SYNCED runs"
+echo "❌ Failed to sync: $FAILED runs"
\ No newline at end of file