From 98cbed75969f1a406727a6ef6db5a9b2c64f7c90 Mon Sep 17 00:00:00 2001 From: edbeeching Date: Fri, 11 Apr 2025 09:26:05 +0000 Subject: [PATCH 1/5] add WIP code GRPO configs --- .../grpo/config_v03.00.yaml | 67 ++++++++++++++++++ .../grpo/config_v01.05.yaml | 58 +++++++++++++++ .../grpo/config_v01.06.yaml | 60 ++++++++++++++++ .../grpo/config_v01.07.yaml | 61 ++++++++++++++++ .../grpo/config_v01.08.yaml | 62 ++++++++++++++++ .../grpo/config_v01.09.yaml | 62 ++++++++++++++++ .../grpo/config_v05.00.yaml | 65 +++++++++++++++++ .../grpo/config_v05.02.yaml | 65 +++++++++++++++++ .../grpo/config_v05.03.yaml | 65 +++++++++++++++++ .../grpo/config_v05.04.yaml | 65 +++++++++++++++++ .../grpo/config_v05.05.yaml | 67 ++++++++++++++++++ .../grpo/config_v05.06.yaml | 65 +++++++++++++++++ .../grpo/config_v05.07.yaml | 67 ++++++++++++++++++ .../grpo/config_v05.08.yaml | 67 ++++++++++++++++++ .../grpo/config_v05.09.yaml | 67 ++++++++++++++++++ .../grpo/config_v05.10.yaml | 67 ++++++++++++++++++ .../grpo/config_v05.11.yaml | 67 ++++++++++++++++++ .../grpo/config_v05.12.yaml | 67 ++++++++++++++++++ .../grpo/config_v05.13.yaml | 66 +++++++++++++++++ .../grpo/config_v05.14.yaml | 66 +++++++++++++++++ .../grpo/config_v05.15.yaml | 65 +++++++++++++++++ .../grpo/config_v05.16.yaml | 67 ++++++++++++++++++ .../grpo/config_v05.17.yaml | 67 ++++++++++++++++++ .../grpo/config_v05.18.yaml | 67 ++++++++++++++++++ .../grpo/config_v05.20.yaml | 69 ++++++++++++++++++ .../grpo/config_v05.30.yaml | 70 +++++++++++++++++++ .../grpo/config_v02.00.yaml | 64 +++++++++++++++++ .../grpo/config_v03.00.yaml | 64 +++++++++++++++++ 28 files changed, 1829 insertions(+) create mode 100644 recipes/DeepSeek-R1-Distill-Qwen-7B/grpo/config_v03.00.yaml create mode 100644 recipes/Qwen2.5-7B-Instruct/grpo/config_v01.05.yaml create mode 100644 recipes/Qwen2.5-7B-Instruct/grpo/config_v01.06.yaml create mode 100644 recipes/Qwen2.5-7B-Instruct/grpo/config_v01.07.yaml create mode 100644 recipes/Qwen2.5-7B-Instruct/grpo/config_v01.08.yaml create mode 100644 recipes/Qwen2.5-7B-Instruct/grpo/config_v01.09.yaml create mode 100644 recipes/Qwen2.5-7B-Instruct/grpo/config_v05.00.yaml create mode 100644 recipes/Qwen2.5-7B-Instruct/grpo/config_v05.02.yaml create mode 100644 recipes/Qwen2.5-7B-Instruct/grpo/config_v05.03.yaml create mode 100644 recipes/Qwen2.5-7B-Instruct/grpo/config_v05.04.yaml create mode 100644 recipes/Qwen2.5-7B-Instruct/grpo/config_v05.05.yaml create mode 100644 recipes/Qwen2.5-7B-Instruct/grpo/config_v05.06.yaml create mode 100644 recipes/Qwen2.5-7B-Instruct/grpo/config_v05.07.yaml create mode 100644 recipes/Qwen2.5-7B-Instruct/grpo/config_v05.08.yaml create mode 100644 recipes/Qwen2.5-7B-Instruct/grpo/config_v05.09.yaml create mode 100644 recipes/Qwen2.5-7B-Instruct/grpo/config_v05.10.yaml create mode 100644 recipes/Qwen2.5-7B-Instruct/grpo/config_v05.11.yaml create mode 100644 recipes/Qwen2.5-7B-Instruct/grpo/config_v05.12.yaml create mode 100644 recipes/Qwen2.5-7B-Instruct/grpo/config_v05.13.yaml create mode 100644 recipes/Qwen2.5-7B-Instruct/grpo/config_v05.14.yaml create mode 100644 recipes/Qwen2.5-7B-Instruct/grpo/config_v05.15.yaml create mode 100644 recipes/Qwen2.5-7B-Instruct/grpo/config_v05.16.yaml create mode 100644 recipes/Qwen2.5-7B-Instruct/grpo/config_v05.17.yaml create mode 100644 recipes/Qwen2.5-7B-Instruct/grpo/config_v05.18.yaml create mode 100644 recipes/Qwen2.5-7B-Instruct/grpo/config_v05.20.yaml create mode 100644 recipes/Qwen2.5-7B-Instruct/grpo/config_v05.30.yaml create mode 100644 
recipes/Qwen2.5-Coder-7B-Instruct/grpo/config_v02.00.yaml create mode 100644 recipes/Qwen2.5-Coder-7B-Instruct/grpo/config_v03.00.yaml diff --git a/recipes/DeepSeek-R1-Distill-Qwen-7B/grpo/config_v03.00.yaml b/recipes/DeepSeek-R1-Distill-Qwen-7B/grpo/config_v03.00.yaml new file mode 100644 index 000000000..90351b619 --- /dev/null +++ b/recipes/DeepSeek-R1-Distill-Qwen-7B/grpo/config_v03.00.yaml @@ -0,0 +1,67 @@ +# Model arguments +model_name_or_path: deepseek-ai/DeepSeek-R1-Distill-Qwen-7B +model_revision: main +torch_dtype: bfloat16 +attn_implementation: flash_attention_2 +# Data training arguments +dataset_name: open-r1/verifiable-coding-problems-python_decontaminated-tested +dataset_prompt_column: problem + +system_prompt: "You are a helpful AI Assistant that provides well-reasoned and detailed responses. You first think about the reasoning process as an internal monologue and then provide the user with the answer. Respond in the following format: \n...\n\n\n...\n" + +# GRPO trainer config +callbacks: +- push_to_hub_revision +benchmarks: +- lcb_v4 +beta: 0.000 +bf16: true +do_eval: false +eval_strategy: "no" +use_vllm: true +vllm_device: auto +vllm_gpu_memory_utilization: 0.7 +do_eval: false +gradient_accumulation_steps: 64 +gradient_checkpointing: true +gradient_checkpointing_kwargs: + use_reentrant: false +hub_model_id: open-r1/DeepSeek-R1-Distill-Qwen-7B-GRPO +hub_model_revision: v03.00 +hub_strategy: every_save +learning_rate: 1.0e-06 +log_completions: true +log_level: info +logging_first_step: true +logging_steps: 1 +logging_strategy: steps +lr_scheduler_type: constant_with_warmup +max_grad_norm: 0.2 +max_prompt_length: 1024 +max_completion_length: 24000 +max_steps: -1 +num_generations: 8 +num_iterations: 4 +num_train_epochs: 1.0 +output_dir: data/DeepSeek-R1-Distill-Qwen-7B_v03.00 +overwrite_output_dir: true +per_device_train_batch_size: 1 +push_to_hub: true +report_to: +- wandb +reward_funcs: +- binary_code +- code_format +e2b_router_url: "ip-10-53-85-124:8000" +reward_weights: +- 1.0 +- 0.1 +save_strategy: "steps" +save_steps: 0.05 +save_total_limit: 1 +seed: 42 +temperature: 0.7 +wandb_entity: huggingface +wandb_project: open-r1 +warmup_ratio: 0.1 +parallel_code_exec_per_proc: 10 \ No newline at end of file diff --git a/recipes/Qwen2.5-7B-Instruct/grpo/config_v01.05.yaml b/recipes/Qwen2.5-7B-Instruct/grpo/config_v01.05.yaml new file mode 100644 index 000000000..174c816b9 --- /dev/null +++ b/recipes/Qwen2.5-7B-Instruct/grpo/config_v01.05.yaml @@ -0,0 +1,58 @@ + +# Model arguments +model_name_or_path: Qwen/Qwen2.5-7B-Instruct +model_revision: main +torch_dtype: bfloat16 +attn_implementation: flash_attention_2 + +# Data training arguments +dataset_name: open-r1/OpenR1-Math-cn_k12-86k +system_prompt: "You are a helpful AI Assistant that provides well-reasoned and detailed responses. You first think about the reasoning process as an internal monologue and then provide the user with the answer. 
Respond in the following format: \n...\n\n\n...\n" + +# GRPO trainer config +beta: 0.001 +bf16: true +do_eval: false +eval_strategy: "no" +use_vllm: true + +do_eval: false +gradient_accumulation_steps: 64 +gradient_checkpointing: true +gradient_checkpointing_kwargs: + use_reentrant: false +hub_model_id: Qwen2.5-7B-Instruct-GRPO +hub_model_revision: v01.05 +hub_strategy: every_save +learning_rate: 1.0e-06 +log_completions: true +log_level: info +logging_first_step: true +logging_steps: 1 +logging_strategy: steps +lr_scheduler_type: constant_with_warmup +max_grad_norm: 0.2 +max_prompt_length: 1024 +max_completion_length: 4096 +max_steps: -1 +num_generations: 8 +num_train_epochs: 1 +output_dir: data/Qwen2.5-7B-Instruct-GRPO_v01.05 +overwrite_output_dir: true +per_device_train_batch_size: 1 +push_to_hub: true +use_liger_kernel: true +report_to: +- wandb +reward_funcs: +- accuracy +- format +reward_weights: +- 1.0 +- 0.2 +save_strategy: "steps" +save_steps: 0.1 +save_total_limit: 1 +seed: 42 +temperature: 0.7 +warmup_ratio: 0.1 \ No newline at end of file diff --git a/recipes/Qwen2.5-7B-Instruct/grpo/config_v01.06.yaml b/recipes/Qwen2.5-7B-Instruct/grpo/config_v01.06.yaml new file mode 100644 index 000000000..8a456c9f5 --- /dev/null +++ b/recipes/Qwen2.5-7B-Instruct/grpo/config_v01.06.yaml @@ -0,0 +1,60 @@ + +# Model arguments +model_name_or_path: Qwen/Qwen2.5-7B-Instruct +model_revision: main +torch_dtype: bfloat16 +attn_implementation: flash_attention_2 + +# Data training arguments +dataset_name: open-r1/OpenR1-Math-cn_k12-86k +system_prompt: "You are a helpful AI Assistant that provides well-reasoned and detailed responses. You first think about the reasoning process as an internal monologue and then provide the user with the answer. Respond in the following format: \n...\n\n\n...\n" + +# GRPO trainer config +beta: 0.001 +bf16: true +do_eval: false +eval_strategy: "no" +use_vllm: true + +do_eval: false +gradient_accumulation_steps: 16 +gradient_checkpointing: true +gradient_checkpointing_kwargs: + use_reentrant: false +hub_model_id: Qwen2.5-7B-Instruct-GRPO +hub_model_revision: v01.06 +hub_strategy: every_save +learning_rate: 1.0e-06 +log_completions: true +log_level: info +logging_first_step: true +logging_steps: 1 +logging_strategy: steps +lr_scheduler_type: constant_with_warmup +max_grad_norm: 0.2 +max_prompt_length: 1024 +max_completion_length: 4096 +max_steps: -1 +num_generations: 16 +num_train_epochs: 1 +output_dir: data/Qwen2.5-7B-Instruct-GRPO_v01.06 +overwrite_output_dir: true +per_device_train_batch_size: 4 +push_to_hub: true +use_liger_kernel: true +report_to: +- wandb +reward_funcs: +- accuracy +- format +reward_weights: +- 1.0 +- 0.2 +save_strategy: "steps" +save_steps: 0.1 +save_total_limit: 1 +seed: 42 +temperature: 0.7 +warmup_ratio: 0.1 +wandb_entity: huggingface +wandb_project: open-r1 \ No newline at end of file diff --git a/recipes/Qwen2.5-7B-Instruct/grpo/config_v01.07.yaml b/recipes/Qwen2.5-7B-Instruct/grpo/config_v01.07.yaml new file mode 100644 index 000000000..c13458da3 --- /dev/null +++ b/recipes/Qwen2.5-7B-Instruct/grpo/config_v01.07.yaml @@ -0,0 +1,61 @@ + +# Model arguments +model_name_or_path: Qwen/Qwen2.5-7B-Instruct +model_revision: main +torch_dtype: bfloat16 +attn_implementation: flash_attention_2 + +# Data training arguments +dataset_name: open-r1/OpenR1-Math-cn_k12-86k +system_prompt: "You are a helpful AI Assistant that provides well-reasoned and detailed responses. 
You first think about the reasoning process as an internal monologue and then provide the user with the answer. Respond in the following format: \n...\n\n\n...\n" + +# GRPO trainer config +beta: 0.001 +bf16: true +do_eval: false +eval_strategy: "no" +use_vllm: true + +do_eval: false +gradient_accumulation_steps: 16 +gradient_checkpointing: true +gradient_checkpointing_kwargs: + use_reentrant: false +hub_model_id: Qwen2.5-7B-Instruct-GRPO +hub_model_revision: v01.07 +hub_strategy: every_save +learning_rate: 1.0e-06 +log_completions: true +log_level: info +logging_first_step: true +logging_steps: 1 +logging_strategy: steps +lr_scheduler_type: constant_with_warmup +max_grad_norm: 0.2 +max_prompt_length: 1024 +max_completion_length: 4096 +max_steps: -1 +num_generations: 16 +num_train_epochs: 1 +output_dir: data/Qwen2.5-7B-Instruct-GRPO_v01.07 +overwrite_output_dir: true +per_device_train_batch_size: 4 +push_to_hub: true +use_liger_kernel: true +report_to: +- wandb +reward_funcs: +- accuracy +- format +reward_weights: +- 1.0 +- 0.2 +save_strategy: "steps" +save_steps: 0.1 +save_total_limit: 1 +seed: 42 +temperature: 0.7 +warmup_ratio: 0.1 +wandb_entity: huggingface +wandb_project: open-r1 +scale_rewards: false \ No newline at end of file diff --git a/recipes/Qwen2.5-7B-Instruct/grpo/config_v01.08.yaml b/recipes/Qwen2.5-7B-Instruct/grpo/config_v01.08.yaml new file mode 100644 index 000000000..5a422d214 --- /dev/null +++ b/recipes/Qwen2.5-7B-Instruct/grpo/config_v01.08.yaml @@ -0,0 +1,62 @@ + +# Model arguments +model_name_or_path: Qwen/Qwen2.5-7B-Instruct +model_revision: main +torch_dtype: bfloat16 +attn_implementation: flash_attention_2 + +# Data training arguments +dataset_name: open-r1/OpenR1-Math-cn_k12-86k +system_prompt: "You are a helpful AI Assistant that provides well-reasoned and detailed responses. You first think about the reasoning process as an internal monologue and then provide the user with the answer. 
Respond in the following format: \n...\n\n\n...\n" + +# GRPO trainer config +beta: 0.0 +bf16: true +do_eval: false +eval_strategy: "no" +use_vllm: true + +do_eval: false +gradient_accumulation_steps: 16 +gradient_checkpointing: true +gradient_checkpointing_kwargs: + use_reentrant: false +hub_model_id: Qwen2.5-7B-Instruct-GRPO +hub_model_revision: v01.08 +hub_strategy: every_save +learning_rate: 1.0e-06 +log_completions: true +log_level: info +logging_first_step: true +logging_steps: 1 +logging_strategy: steps +lr_scheduler_type: constant_with_warmup +max_grad_norm: 0.2 +max_prompt_length: 1024 +max_completion_length: 4096 +max_steps: -1 +num_generations: 16 +num_train_epochs: 1 +output_dir: data/Qwen2.5-7B-Instruct-GRPO_v01.08 +overwrite_output_dir: true +per_device_train_batch_size: 8 +push_to_hub: true +use_liger_kernel: true +report_to: +- wandb +reward_funcs: +- accuracy +- format +reward_weights: +- 1.0 +- 0.2 +save_strategy: "steps" +save_steps: 0.1 +save_total_limit: 1 +seed: 42 +temperature: 0.7 +warmup_ratio: 0.1 +wandb_entity: huggingface +wandb_project: open-r1 +num_iterations: 4 +scale_rewards: false \ No newline at end of file diff --git a/recipes/Qwen2.5-7B-Instruct/grpo/config_v01.09.yaml b/recipes/Qwen2.5-7B-Instruct/grpo/config_v01.09.yaml new file mode 100644 index 000000000..e162a16d6 --- /dev/null +++ b/recipes/Qwen2.5-7B-Instruct/grpo/config_v01.09.yaml @@ -0,0 +1,62 @@ + +# Model arguments +model_name_or_path: Qwen/Qwen2.5-7B-Instruct +model_revision: main +torch_dtype: bfloat16 +attn_implementation: flash_attention_2 + +# Data training arguments +dataset_name: open-r1/OpenR1-Math-cn_k12-86k +system_prompt: "You are a helpful AI Assistant that provides well-reasoned and detailed responses. You first think about the reasoning process as an internal monologue and then provide the user with the answer. 
Respond in the following format: \n...\n\n\n...\n" + +# GRPO trainer config +beta: 0.0 +bf16: true +do_eval: false +eval_strategy: "no" +use_vllm: true + +do_eval: false +gradient_accumulation_steps: 16 +gradient_checkpointing: true +gradient_checkpointing_kwargs: + use_reentrant: false +hub_model_id: Qwen2.5-7B-Instruct-GRPO +hub_model_revision: v01.09 +hub_strategy: every_save +learning_rate: 1.0e-06 +log_completions: true +log_level: info +logging_first_step: true +logging_steps: 1 +logging_strategy: steps +lr_scheduler_type: constant_with_warmup +max_grad_norm: 0.2 +max_prompt_length: 1024 +max_completion_length: 4096 +max_steps: -1 +num_generations: 16 +num_train_epochs: 1 +output_dir: data/Qwen2.5-7B-Instruct-GRPO_v01.09 +overwrite_output_dir: true +per_device_train_batch_size: 8 +push_to_hub: true +use_liger_kernel: true +report_to: +- wandb +reward_funcs: +- accuracy +- format +reward_weights: +- 1.0 +- 0.2 +save_strategy: "steps" +save_steps: 0.1 +save_total_limit: 1 +seed: 42 +temperature: 0.7 +warmup_ratio: 0.1 +wandb_entity: huggingface +wandb_project: open-r1 +num_iterations: 4 +scale_rewards: true \ No newline at end of file diff --git a/recipes/Qwen2.5-7B-Instruct/grpo/config_v05.00.yaml b/recipes/Qwen2.5-7B-Instruct/grpo/config_v05.00.yaml new file mode 100644 index 000000000..5e9156c89 --- /dev/null +++ b/recipes/Qwen2.5-7B-Instruct/grpo/config_v05.00.yaml @@ -0,0 +1,65 @@ +# Model arguments +model_name_or_path: Qwen/Qwen2.5-7B-Instruct +model_revision: main +torch_dtype: bfloat16 +attn_implementation: flash_attention_2 +# Data training arguments +dataset_name: open-r1/verifiable-coding-problems-python_decontaminated-tested + +system_prompt: "You are a helpful AI Assistant that provides well-reasoned and detailed responses. You first think about the reasoning process as an internal monologue and then provide the user with the answer. 
Respond in the following format: \n...\n\n\n...\n" + +# GRPO trainer config +callbacks: +- push_to_hub_revision +benchmarks: +- math_500 +- aime24 +beta: 0.000 +bf16: true +do_eval: false +eval_strategy: "no" +use_vllm: true +vllm_device: auto +vllm_gpu_memory_utilization: 0.7 +do_eval: false +gradient_accumulation_steps: 16 +gradient_checkpointing: true +gradient_checkpointing_kwargs: + use_reentrant: false +hub_model_id: open-r1/Qwen2.5-7B-Instruct-GRPO +hub_model_revision: v05.00 +hub_strategy: every_save +learning_rate: 1.0e-06 +log_completions: true +log_level: info +logging_first_step: true +logging_steps: 1 +logging_strategy: steps +lr_scheduler_type: constant_with_warmup +max_grad_norm: 0.2 +max_prompt_length: 1024 +max_completion_length: 4096 +max_steps: -1 +num_generations: 16 +num_iterations: 4 +num_train_epochs: 1.0 +output_dir: data/Qwen2.5-7B-Instruct-GRPO_v05.00 +overwrite_output_dir: true +per_device_train_batch_size: 8 +push_to_hub: true +report_to: +- wandb +reward_funcs: +- binary_code +- code_format +reward_weights: +- 1.0 +- 0.1 +save_strategy: "steps" +save_steps: 0.1 +save_total_limit: 1 +seed: 42 +temperature: 0.7 +wandb_entity: huggingface +wandb_project: open-r1 +warmup_ratio: 0.1 \ No newline at end of file diff --git a/recipes/Qwen2.5-7B-Instruct/grpo/config_v05.02.yaml b/recipes/Qwen2.5-7B-Instruct/grpo/config_v05.02.yaml new file mode 100644 index 000000000..5d1092b85 --- /dev/null +++ b/recipes/Qwen2.5-7B-Instruct/grpo/config_v05.02.yaml @@ -0,0 +1,65 @@ +# Model arguments +model_name_or_path: Qwen/Qwen2.5-7B-Instruct +model_revision: main +torch_dtype: bfloat16 +attn_implementation: flash_attention_2 +# Data training arguments +dataset_name: open-r1/verifiable-coding-problems-python_decontaminated-tested + +system_prompt: "You are a helpful AI Assistant that provides well-reasoned and detailed responses. You first think about the reasoning process as an internal monologue and then provide the user with the answer. 
Respond in the following format: \n...\n\n\n...\n" + +# GRPO trainer config +callbacks: +- push_to_hub_revision +benchmarks: +- lcb_v4 +beta: 0.000 +bf16: true +do_eval: false +eval_strategy: "no" +use_vllm: true +vllm_device: auto +vllm_gpu_memory_utilization: 0.7 +do_eval: false +gradient_accumulation_steps: 16 +gradient_checkpointing: true +gradient_checkpointing_kwargs: + use_reentrant: false +hub_model_id: open-r1/Qwen2.5-7B-Instruct-GRPO +hub_model_revision: v05.02 +hub_strategy: every_save +learning_rate: 1.0e-06 +log_completions: true +log_level: info +logging_first_step: true +logging_steps: 1 +logging_strategy: steps +lr_scheduler_type: constant_with_warmup +max_grad_norm: 0.2 +max_prompt_length: 1024 +max_completion_length: 4096 +max_steps: -1 +num_generations: 16 +num_iterations: 4 +num_train_epochs: 1.0 +output_dir: data/Qwen2.5-7B-Instruct-GRPO_v05.02 +overwrite_output_dir: true +per_device_train_batch_size: 8 +push_to_hub: true +report_to: +- wandb +reward_funcs: +- code +- code_format +reward_weights: +- 1.0 +- 0.1 +save_strategy: "steps" +save_steps: 0.05 +save_total_limit: 1 +seed: 42 +temperature: 0.7 +wandb_entity: huggingface +wandb_project: open-r1 +warmup_ratio: 0.1 +parallel_code_exec_per_proc: 10 \ No newline at end of file diff --git a/recipes/Qwen2.5-7B-Instruct/grpo/config_v05.03.yaml b/recipes/Qwen2.5-7B-Instruct/grpo/config_v05.03.yaml new file mode 100644 index 000000000..6053143c9 --- /dev/null +++ b/recipes/Qwen2.5-7B-Instruct/grpo/config_v05.03.yaml @@ -0,0 +1,65 @@ +# Model arguments +model_name_or_path: Qwen/Qwen2.5-7B-Instruct +model_revision: main +torch_dtype: bfloat16 +attn_implementation: flash_attention_2 +# Data training arguments +dataset_name: open-r1/verifiable-coding-problems-python_decontaminated-tested + +system_prompt: "You are a helpful AI Assistant that provides well-reasoned and detailed responses. You first think about the reasoning process as an internal monologue and then provide the user with the answer. 
Respond in the following format: \n...\n\n\n...\n" + +# GRPO trainer config +callbacks: +- push_to_hub_revision +benchmarks: +- lcb_v4 +beta: 0.000 +bf16: true +do_eval: false +eval_strategy: "no" +use_vllm: true +vllm_device: auto +vllm_gpu_memory_utilization: 0.7 +do_eval: false +gradient_accumulation_steps: 16 +gradient_checkpointing: true +gradient_checkpointing_kwargs: + use_reentrant: false +hub_model_id: open-r1/Qwen2.5-7B-Instruct-GRPO +hub_model_revision: v05.03 +hub_strategy: every_save +learning_rate: 1.0e-06 +log_completions: true +log_level: info +logging_first_step: true +logging_steps: 1 +logging_strategy: steps +lr_scheduler_type: constant_with_warmup +max_grad_norm: 0.2 +max_prompt_length: 1024 +max_completion_length: 4096 +max_steps: -1 +num_generations: 16 +num_iterations: 4 +num_train_epochs: 1.0 +output_dir: data/Qwen2.5-7B-Instruct-GRPO_v05.03 +overwrite_output_dir: true +per_device_train_batch_size: 8 +push_to_hub: true +report_to: +- wandb +reward_funcs: +- binary_code +- code_format +reward_weights: +- 1.0 +- 0.1 +save_strategy: "steps" +save_steps: 0.05 +save_total_limit: 1 +seed: 42 +temperature: 0.7 +wandb_entity: huggingface +wandb_project: open-r1 +warmup_ratio: 0.1 +parallel_code_exec_per_proc: 10 \ No newline at end of file diff --git a/recipes/Qwen2.5-7B-Instruct/grpo/config_v05.04.yaml b/recipes/Qwen2.5-7B-Instruct/grpo/config_v05.04.yaml new file mode 100644 index 000000000..081ef05c8 --- /dev/null +++ b/recipes/Qwen2.5-7B-Instruct/grpo/config_v05.04.yaml @@ -0,0 +1,65 @@ +# Model arguments +model_name_or_path: Qwen/Qwen2.5-7B-Instruct +model_revision: main +torch_dtype: bfloat16 +attn_implementation: flash_attention_2 +# Data training arguments +dataset_name: open-r1/verifiable-coding-problems-python_decontaminated-tested + +system_prompt: "You are a helpful AI Assistant that provides well-reasoned and detailed responses. You first think about the reasoning process as an internal monologue and then provide the user with the answer. 
Respond in the following format: \n...\n\n\n...\n" + +# GRPO trainer config +callbacks: +- push_to_hub_revision +benchmarks: +- lcb_v4 +beta: 0.000 +bf16: true +do_eval: false +eval_strategy: "no" +use_vllm: true +vllm_device: auto +vllm_gpu_memory_utilization: 0.7 +do_eval: false +gradient_accumulation_steps: 64 +gradient_checkpointing: true +gradient_checkpointing_kwargs: + use_reentrant: false +hub_model_id: open-r1/Qwen2.5-7B-Instruct-GRPO +hub_model_revision: v05.04 +hub_strategy: every_save +learning_rate: 1.0e-06 +log_completions: true +log_level: info +logging_first_step: true +logging_steps: 1 +logging_strategy: steps +lr_scheduler_type: constant_with_warmup +max_grad_norm: 0.2 +max_prompt_length: 1024 +max_completion_length: 4096 +max_steps: -1 +num_generations: 16 +num_iterations: 4 +num_train_epochs: 1.0 +output_dir: data/Qwen2.5-7B-Instruct-GRPO_v05.04 +overwrite_output_dir: true +per_device_train_batch_size: 8 +push_to_hub: true +report_to: +- wandb +reward_funcs: +- binary_code +- code_format +reward_weights: +- 1.0 +- 0.1 +save_strategy: "steps" +save_steps: 0.05 +save_total_limit: 1 +seed: 42 +temperature: 0.7 +wandb_entity: huggingface +wandb_project: open-r1 +warmup_ratio: 0.1 +parallel_code_exec_per_proc: 10 \ No newline at end of file diff --git a/recipes/Qwen2.5-7B-Instruct/grpo/config_v05.05.yaml b/recipes/Qwen2.5-7B-Instruct/grpo/config_v05.05.yaml new file mode 100644 index 000000000..cbb5c5276 --- /dev/null +++ b/recipes/Qwen2.5-7B-Instruct/grpo/config_v05.05.yaml @@ -0,0 +1,67 @@ +# Model arguments +model_name_or_path: Qwen/Qwen2.5-7B-Instruct +model_revision: main +torch_dtype: bfloat16 +attn_implementation: flash_attention_2 +# Data training arguments +dataset_name: open-r1/verifiable-coding-problems-python_decontaminated-tested + +system_prompt: "You are a helpful AI Assistant that provides well-reasoned and detailed responses. You first think about the reasoning process as an internal monologue and then provide the user with the answer. 
Respond in the following format: \n...\n\n\n...\n" + +# GRPO trainer config +callbacks: +- push_to_hub_revision +benchmarks: +- lcb_v4 +beta: 0.000 +bf16: true +do_eval: false +eval_strategy: "no" +use_vllm: true +vllm_device: auto +vllm_gpu_memory_utilization: 0.7 +do_eval: false +gradient_accumulation_steps: 16 +gradient_checkpointing: true +gradient_checkpointing_kwargs: + use_reentrant: false +hub_model_id: open-r1/Qwen2.5-7B-Instruct-GRPO +hub_model_revision: v05.05 +hub_strategy: every_save +learning_rate: 1.0e-06 +log_completions: true +log_level: info +logging_first_step: true +logging_steps: 1 +logging_strategy: steps +lr_scheduler_type: cosine_with_min_lr +lr_scheduler_kwargs: + min_lr_rate: 0.1 +max_grad_norm: 0.2 +max_prompt_length: 1024 +max_completion_length: 4096 +max_steps: -1 +num_generations: 16 +num_iterations: 4 +num_train_epochs: 1.0 +output_dir: data/Qwen2.5-7B-Instruct-GRPO_v05.05 +overwrite_output_dir: true +per_device_train_batch_size: 8 +push_to_hub: true +report_to: +- wandb +reward_funcs: +- binary_code +- code_format +reward_weights: +- 1.0 +- 0.1 +save_strategy: "steps" +save_steps: 0.05 +save_total_limit: 1 +seed: 42 +temperature: 0.7 +wandb_entity: huggingface +wandb_project: open-r1 +warmup_ratio: 0.1 +parallel_code_exec_per_proc: 10 \ No newline at end of file diff --git a/recipes/Qwen2.5-7B-Instruct/grpo/config_v05.06.yaml b/recipes/Qwen2.5-7B-Instruct/grpo/config_v05.06.yaml new file mode 100644 index 000000000..87d47e8b4 --- /dev/null +++ b/recipes/Qwen2.5-7B-Instruct/grpo/config_v05.06.yaml @@ -0,0 +1,65 @@ +# Model arguments +model_name_or_path: Qwen/Qwen2.5-7B-Instruct +model_revision: main +torch_dtype: bfloat16 +attn_implementation: flash_attention_2 +# Data training arguments +dataset_name: open-r1/verifiable-coding-problems-python_decontaminated-tested + +system_prompt: "You are a helpful AI Assistant that provides well-reasoned and detailed responses. You first think about the reasoning process as an internal monologue and then provide the user with the answer. 
Respond in the following format: \n...\n\n\n...\n" + +# GRPO trainer config +callbacks: +- push_to_hub_revision +benchmarks: +- lcb_v4 +beta: 0.000 +bf16: true +do_eval: false +eval_strategy: "no" +use_vllm: true +vllm_device: auto +vllm_gpu_memory_utilization: 0.7 +do_eval: false +gradient_accumulation_steps: 64 +gradient_checkpointing: true +gradient_checkpointing_kwargs: + use_reentrant: false +hub_model_id: open-r1/Qwen2.5-7B-Instruct-GRPO +hub_model_revision: v05.06 +hub_strategy: every_save +learning_rate: 1.0e-06 +log_completions: true +log_level: info +logging_first_step: true +logging_steps: 1 +logging_strategy: steps +lr_scheduler_type: constant_with_warmup +max_grad_norm: 0.2 +max_prompt_length: 1024 +max_completion_length: 4096 +max_steps: -1 +num_generations: 64 +num_iterations: 4 +num_train_epochs: 1.0 +output_dir: data/Qwen2.5-7B-Instruct-GRPO_v05.06 +overwrite_output_dir: true +per_device_train_batch_size: 8 +push_to_hub: true +report_to: +- wandb +reward_funcs: +- binary_code +- code_format +reward_weights: +- 1.0 +- 0.1 +save_strategy: "steps" +save_steps: 0.05 +save_total_limit: 1 +seed: 42 +temperature: 0.7 +wandb_entity: huggingface +wandb_project: open-r1 +warmup_ratio: 0.1 +parallel_code_exec_per_proc: 10 \ No newline at end of file diff --git a/recipes/Qwen2.5-7B-Instruct/grpo/config_v05.07.yaml b/recipes/Qwen2.5-7B-Instruct/grpo/config_v05.07.yaml new file mode 100644 index 000000000..cb0e44266 --- /dev/null +++ b/recipes/Qwen2.5-7B-Instruct/grpo/config_v05.07.yaml @@ -0,0 +1,67 @@ +# Model arguments +model_name_or_path: Qwen/Qwen2.5-7B-Instruct +model_revision: main +torch_dtype: bfloat16 +attn_implementation: flash_attention_2 +# Data training arguments +dataset_name: open-r1/verifiable-coding-problems-python_decontaminated-tested + +system_prompt: "You are a helpful AI Assistant that provides well-reasoned and detailed responses. You first think about the reasoning process as an internal monologue and then provide the user with the answer. 
Respond in the following format: \n...\n\n\n...\n" + +# GRPO trainer config +callbacks: +- push_to_hub_revision +benchmarks: +- lcb_v4 +beta: 0.000 +bf16: true +do_eval: false +eval_strategy: "no" +use_vllm: true +vllm_device: auto +vllm_gpu_memory_utilization: 0.7 +do_eval: false +gradient_accumulation_steps: 64 +gradient_checkpointing: true +gradient_checkpointing_kwargs: + use_reentrant: false +hub_model_id: open-r1/Qwen2.5-7B-Instruct-GRPO +hub_model_revision: v05.07 +hub_strategy: every_save +learning_rate: 1.0e-06 +log_completions: true +log_level: info +logging_first_step: true +logging_steps: 1 +logging_strategy: steps +lr_scheduler_type: cosine_with_min_lr +lr_scheduler_kwargs: + min_lr_rate: 0.1 +max_grad_norm: 0.2 +max_prompt_length: 1024 +max_completion_length: 4096 +max_steps: -1 +num_generations: 64 +num_iterations: 4 +num_train_epochs: 1.0 +output_dir: data/Qwen2.5-7B-Instruct-GRPO_v05.07 +overwrite_output_dir: true +per_device_train_batch_size: 8 +push_to_hub: true +report_to: +- wandb +reward_funcs: +- binary_code +- code_format +reward_weights: +- 1.0 +- 0.1 +save_strategy: "steps" +save_steps: 0.05 +save_total_limit: 1 +seed: 42 +temperature: 0.7 +wandb_entity: huggingface +wandb_project: open-r1 +warmup_ratio: 0.1 +parallel_code_exec_per_proc: 10 \ No newline at end of file diff --git a/recipes/Qwen2.5-7B-Instruct/grpo/config_v05.08.yaml b/recipes/Qwen2.5-7B-Instruct/grpo/config_v05.08.yaml new file mode 100644 index 000000000..d21020068 --- /dev/null +++ b/recipes/Qwen2.5-7B-Instruct/grpo/config_v05.08.yaml @@ -0,0 +1,67 @@ +# Model arguments +model_name_or_path: Qwen/Qwen2.5-7B-Instruct +model_revision: main +torch_dtype: bfloat16 +attn_implementation: flash_attention_2 +# Data training arguments +dataset_name: open-r1/verifiable-coding-problems-python_decontaminated-tested-shuffled + +system_prompt: "You are a helpful AI Assistant that provides well-reasoned and detailed responses. You first think about the reasoning process as an internal monologue and then provide the user with the answer. 
Respond in the following format: \n...\n\n\n...\n" + +# GRPO trainer config +callbacks: +- push_to_hub_revision +benchmarks: +- lcb_v4 +beta: 0.000 +bf16: true +do_eval: false +eval_strategy: "no" +use_vllm: true +vllm_device: auto +vllm_gpu_memory_utilization: 0.7 +do_eval: false +gradient_accumulation_steps: 16 +gradient_checkpointing: true +gradient_checkpointing_kwargs: + use_reentrant: false +hub_model_id: open-r1/Qwen2.5-7B-Instruct-GRPO +hub_model_revision: v05.08 +hub_strategy: every_save +learning_rate: 1.0e-06 +log_completions: true +log_level: info +logging_first_step: true +logging_steps: 1 +logging_strategy: steps +lr_scheduler_type: cosine_with_min_lr +lr_scheduler_kwargs: + min_lr_rate: 0.1 +max_grad_norm: 0.2 +max_prompt_length: 1024 +max_completion_length: 4096 +max_steps: -1 +num_generations: 16 +num_iterations: 4 +num_train_epochs: 1.0 +output_dir: data/Qwen2.5-7B-Instruct-GRPO_v05.08 +overwrite_output_dir: true +per_device_train_batch_size: 8 +push_to_hub: true +report_to: +- wandb +reward_funcs: +- binary_code +- code_format +reward_weights: +- 1.0 +- 0.1 +save_strategy: "steps" +save_steps: 0.05 +save_total_limit: 1 +seed: 42 +temperature: 0.7 +wandb_entity: huggingface +wandb_project: open-r1 +warmup_ratio: 0.1 +parallel_code_exec_per_proc: 4 \ No newline at end of file diff --git a/recipes/Qwen2.5-7B-Instruct/grpo/config_v05.09.yaml b/recipes/Qwen2.5-7B-Instruct/grpo/config_v05.09.yaml new file mode 100644 index 000000000..b8d5eb696 --- /dev/null +++ b/recipes/Qwen2.5-7B-Instruct/grpo/config_v05.09.yaml @@ -0,0 +1,67 @@ +# Model arguments +model_name_or_path: Qwen/Qwen2.5-7B-Instruct +model_revision: main +torch_dtype: bfloat16 +attn_implementation: flash_attention_2 +# Data training arguments +dataset_name: open-r1/verifiable-coding-problems-python_decontaminated-tested-shuffled + +system_prompt: "You are a helpful AI Assistant that provides well-reasoned and detailed responses. You first think about the reasoning process as an internal monologue and then provide the user with the answer. 
Respond in the following format: \n...\n\n\n...\n" + +# GRPO trainer config +callbacks: +- push_to_hub_revision +benchmarks: +- lcb_v4 +beta: 0.000 +bf16: true +do_eval: false +eval_strategy: "no" +use_vllm: true +vllm_device: auto +vllm_gpu_memory_utilization: 0.7 +do_eval: false +gradient_accumulation_steps: 16 +gradient_checkpointing: true +gradient_checkpointing_kwargs: + use_reentrant: false +hub_model_id: open-r1/Qwen2.5-7B-Instruct-GRPO +hub_model_revision: v05.09 +hub_strategy: every_save +learning_rate: 5.0e-06 +log_completions: true +log_level: info +logging_first_step: true +logging_steps: 1 +logging_strategy: steps +lr_scheduler_type: cosine_with_min_lr +lr_scheduler_kwargs: + min_lr_rate: 0.1 +max_grad_norm: 0.2 +max_prompt_length: 1024 +max_completion_length: 4096 +max_steps: -1 +num_generations: 16 +num_iterations: 4 +num_train_epochs: 1.0 +output_dir: data/Qwen2.5-7B-Instruct-GRPO_v05.09 +overwrite_output_dir: true +per_device_train_batch_size: 8 +push_to_hub: true +report_to: +- wandb +reward_funcs: +- binary_code +- code_format +reward_weights: +- 1.0 +- 0.1 +save_strategy: "steps" +save_steps: 0.05 +save_total_limit: 1 +seed: 42 +temperature: 0.7 +wandb_entity: huggingface +wandb_project: open-r1 +warmup_ratio: 0.1 +parallel_code_exec_per_proc: 4 \ No newline at end of file diff --git a/recipes/Qwen2.5-7B-Instruct/grpo/config_v05.10.yaml b/recipes/Qwen2.5-7B-Instruct/grpo/config_v05.10.yaml new file mode 100644 index 000000000..3ea1630b3 --- /dev/null +++ b/recipes/Qwen2.5-7B-Instruct/grpo/config_v05.10.yaml @@ -0,0 +1,67 @@ +# Model arguments +model_name_or_path: Qwen/Qwen2.5-7B-Instruct +model_revision: main +torch_dtype: bfloat16 +attn_implementation: flash_attention_2 +# Data training arguments +dataset_name: open-r1/verifiable-coding-problems-python_decontaminated-tested-shuffled + +system_prompt: "You are a helpful AI Assistant that provides well-reasoned and detailed responses. You first think about the reasoning process as an internal monologue and then provide the user with the answer. 
Respond in the following format: \n...\n\n\n...\n" + +# GRPO trainer config +callbacks: +- push_to_hub_revision +benchmarks: +- lcb_v4 +beta: 0.000 +bf16: true +do_eval: false +eval_strategy: "no" +use_vllm: true +vllm_device: auto +vllm_gpu_memory_utilization: 0.7 +do_eval: false +gradient_accumulation_steps: 16 +gradient_checkpointing: true +gradient_checkpointing_kwargs: + use_reentrant: false +hub_model_id: open-r1/Qwen2.5-7B-Instruct-GRPO +hub_model_revision: v05.10 +hub_strategy: every_save +learning_rate: 1.0e-05 +log_completions: true +log_level: info +logging_first_step: true +logging_steps: 1 +logging_strategy: steps +lr_scheduler_type: cosine_with_min_lr +lr_scheduler_kwargs: + min_lr_rate: 0.1 +max_grad_norm: 0.2 +max_prompt_length: 1024 +max_completion_length: 4096 +max_steps: -1 +num_generations: 16 +num_iterations: 4 +num_train_epochs: 1.0 +output_dir: data/Qwen2.5-7B-Instruct-GRPO_v05.10 +overwrite_output_dir: true +per_device_train_batch_size: 8 +push_to_hub: true +report_to: +- wandb +reward_funcs: +- binary_code +- code_format +reward_weights: +- 1.0 +- 0.1 +save_strategy: "steps" +save_steps: 0.05 +save_total_limit: 1 +seed: 42 +temperature: 0.7 +wandb_entity: huggingface +wandb_project: open-r1 +warmup_ratio: 0.1 +parallel_code_exec_per_proc: 4 \ No newline at end of file diff --git a/recipes/Qwen2.5-7B-Instruct/grpo/config_v05.11.yaml b/recipes/Qwen2.5-7B-Instruct/grpo/config_v05.11.yaml new file mode 100644 index 000000000..50997275d --- /dev/null +++ b/recipes/Qwen2.5-7B-Instruct/grpo/config_v05.11.yaml @@ -0,0 +1,67 @@ +# Model arguments +model_name_or_path: Qwen/Qwen2.5-7B-Instruct +model_revision: main +torch_dtype: bfloat16 +attn_implementation: flash_attention_2 +# Data training arguments +dataset_name: open-r1/verifiable-coding-problems-python_decontaminated-tested-shuffled + +system_prompt: "You are a helpful AI Assistant that provides well-reasoned and detailed responses. You first think about the reasoning process as an internal monologue and then provide the user with the answer. 
Respond in the following format: \n...\n\n\n...\n" + +# GRPO trainer config +callbacks: +- push_to_hub_revision +benchmarks: +- lcb_v4 +beta: 0.000 +bf16: true +do_eval: false +eval_strategy: "no" +use_vllm: true +vllm_device: auto +vllm_gpu_memory_utilization: 0.7 +do_eval: false +gradient_accumulation_steps: 16 +gradient_checkpointing: true +gradient_checkpointing_kwargs: + use_reentrant: false +hub_model_id: open-r1/Qwen2.5-7B-Instruct-GRPO +hub_model_revision: v05.11 +hub_strategy: every_save +learning_rate: 4.0e-05 +log_completions: true +log_level: info +logging_first_step: true +logging_steps: 1 +logging_strategy: steps +lr_scheduler_type: cosine_with_min_lr +lr_scheduler_kwargs: + min_lr_rate: 0.1 +max_grad_norm: 0.2 +max_prompt_length: 1024 +max_completion_length: 4096 +max_steps: -1 +num_generations: 16 +num_iterations: 4 +num_train_epochs: 1.0 +output_dir: data/Qwen2.5-7B-Instruct-GRPO_v05.11 +overwrite_output_dir: true +per_device_train_batch_size: 8 +push_to_hub: true +report_to: +- wandb +reward_funcs: +- binary_code +- code_format +reward_weights: +- 1.0 +- 0.1 +save_strategy: "steps" +save_steps: 0.05 +save_total_limit: 1 +seed: 42 +temperature: 0.7 +wandb_entity: huggingface +wandb_project: open-r1 +warmup_ratio: 0.1 +parallel_code_exec_per_proc: 4 \ No newline at end of file diff --git a/recipes/Qwen2.5-7B-Instruct/grpo/config_v05.12.yaml b/recipes/Qwen2.5-7B-Instruct/grpo/config_v05.12.yaml new file mode 100644 index 000000000..0628f4822 --- /dev/null +++ b/recipes/Qwen2.5-7B-Instruct/grpo/config_v05.12.yaml @@ -0,0 +1,67 @@ +# Model arguments +model_name_or_path: Qwen/Qwen2.5-7B-Instruct +model_revision: main +torch_dtype: bfloat16 +attn_implementation: flash_attention_2 +# Data training arguments +dataset_name: open-r1/verifiable-coding-problems-python_decontaminated-tested-shuffled + +system_prompt: "You are a helpful AI Assistant that provides well-reasoned and detailed responses. You first think about the reasoning process as an internal monologue and then provide the user with the answer. 
Respond in the following format: \n...\n\n\n...\n" + +# GRPO trainer config +callbacks: +- push_to_hub_revision +benchmarks: +- lcb_v4 +beta: 0.000 +bf16: true +do_eval: false +eval_strategy: "no" +use_vllm: true +vllm_device: auto +vllm_gpu_memory_utilization: 0.7 +do_eval: false +gradient_accumulation_steps: 16 +gradient_checkpointing: true +gradient_checkpointing_kwargs: + use_reentrant: false +hub_model_id: open-r1/Qwen2.5-7B-Instruct-GRPO +hub_model_revision: v05.12 +hub_strategy: every_save +learning_rate: 5.0e-07 +log_completions: true +log_level: info +logging_first_step: true +logging_steps: 1 +logging_strategy: steps +lr_scheduler_type: cosine_with_min_lr +lr_scheduler_kwargs: + min_lr_rate: 0.1 +max_grad_norm: 0.2 +max_prompt_length: 1024 +max_completion_length: 4096 +max_steps: -1 +num_generations: 16 +num_iterations: 4 +num_train_epochs: 1.0 +output_dir: data/Qwen2.5-7B-Instruct-GRPO_v05.12 +overwrite_output_dir: true +per_device_train_batch_size: 8 +push_to_hub: true +report_to: +- wandb +reward_funcs: +- binary_code +- code_format +reward_weights: +- 1.0 +- 0.1 +save_strategy: "steps" +save_steps: 0.05 +save_total_limit: 1 +seed: 42 +temperature: 0.7 +wandb_entity: huggingface +wandb_project: open-r1 +warmup_ratio: 0.1 +parallel_code_exec_per_proc: 4 \ No newline at end of file diff --git a/recipes/Qwen2.5-7B-Instruct/grpo/config_v05.13.yaml b/recipes/Qwen2.5-7B-Instruct/grpo/config_v05.13.yaml new file mode 100644 index 000000000..aa4c0f763 --- /dev/null +++ b/recipes/Qwen2.5-7B-Instruct/grpo/config_v05.13.yaml @@ -0,0 +1,66 @@ +# Model arguments +model_name_or_path: Qwen/Qwen2.5-7B-Instruct +model_revision: main +torch_dtype: bfloat16 +attn_implementation: flash_attention_2 +# Data training arguments +dataset_name: open-r1/verifiable-coding-problems-python_decontaminated-tested-shuffled +dataset_prompt_column: problem + +system_prompt: "You are a helpful AI Assistant that provides well-reasoned and detailed responses. You first think about the reasoning process as an internal monologue and then provide the user with the answer. 
Respond in the following format: \n...\n\n\n...\n" + +# GRPO trainer config +callbacks: +- push_to_hub_revision +benchmarks: +- lcb_v4 +beta: 0.000 +bf16: true +do_eval: false +eval_strategy: "no" +use_vllm: true +vllm_device: auto +vllm_gpu_memory_utilization: 0.7 +do_eval: false +gradient_accumulation_steps: 16 +gradient_checkpointing: true +gradient_checkpointing_kwargs: + use_reentrant: false +hub_model_id: open-r1/Qwen2.5-7B-Instruct-GRPO +hub_model_revision: v05.13 +hub_strategy: every_save +learning_rate: 5.0e-07 +log_completions: true +log_level: info +logging_first_step: true +logging_steps: 1 +logging_strategy: steps +lr_scheduler_type: constant_with_warmup +max_grad_norm: 0.2 +max_prompt_length: 1024 +max_completion_length: 4096 +max_steps: -1 +num_generations: 16 +num_iterations: 4 +num_train_epochs: 1.0 +output_dir: data/Qwen2.5-7B-Instruct-GRPO_v05.13 +overwrite_output_dir: true +per_device_train_batch_size: 8 +push_to_hub: true +report_to: +- wandb +reward_funcs: +- binary_code +- code_format +reward_weights: +- 1.0 +- 0.1 +save_strategy: "steps" +save_steps: 0.05 +save_total_limit: 1 +seed: 42 +temperature: 0.7 +wandb_entity: huggingface +wandb_project: open-r1 +warmup_ratio: 0.1 +parallel_code_exec_per_proc: 10 \ No newline at end of file diff --git a/recipes/Qwen2.5-7B-Instruct/grpo/config_v05.14.yaml b/recipes/Qwen2.5-7B-Instruct/grpo/config_v05.14.yaml new file mode 100644 index 000000000..4891fb2c7 --- /dev/null +++ b/recipes/Qwen2.5-7B-Instruct/grpo/config_v05.14.yaml @@ -0,0 +1,66 @@ +# Model arguments +model_name_or_path: Qwen/Qwen2.5-7B-Instruct +model_revision: main +torch_dtype: bfloat16 +attn_implementation: flash_attention_2 +# Data training arguments +dataset_name: open-r1/verifiable-coding-problems-python_decontaminated-tested-shuffled +dataset_prompt_column: problem + +system_prompt: "You are a helpful AI Assistant that provides well-reasoned and detailed responses. You first think about the reasoning process as an internal monologue and then provide the user with the answer. 
Respond in the following format: \n...\n\n\n...\n" + +# GRPO trainer config +callbacks: +- push_to_hub_revision +benchmarks: +- lcb_v4 +beta: 0.000 +bf16: true +do_eval: false +eval_strategy: "no" +use_vllm: true +vllm_device: auto +vllm_gpu_memory_utilization: 0.7 +do_eval: false +gradient_accumulation_steps: 16 +gradient_checkpointing: true +gradient_checkpointing_kwargs: + use_reentrant: false +hub_model_id: open-r1/Qwen2.5-7B-Instruct-GRPO +hub_model_revision: v05.14 +hub_strategy: every_save +learning_rate: 5.0e-07 +log_completions: true +log_level: info +logging_first_step: true +logging_steps: 1 +logging_strategy: steps +lr_scheduler_type: constant_with_warmup +max_grad_norm: 0.2 +max_prompt_length: 1024 +max_completion_length: 4096 +max_steps: -1 +num_generations: 16 +num_iterations: 4 +num_train_epochs: 1.0 +output_dir: data/Qwen2.5-7B-Instruct-GRPO_v05.14 +overwrite_output_dir: true +per_device_train_batch_size: 8 +push_to_hub: true +report_to: +- wandb +reward_funcs: +- binary_code +- code_format +reward_weights: +- 1.0 +- 0.1 +save_strategy: "steps" +save_steps: 0.05 +save_total_limit: 1 +seed: 42 +temperature: 0.7 +wandb_entity: huggingface +wandb_project: open-r1 +warmup_ratio: 0.01 +parallel_code_exec_per_proc: 10 \ No newline at end of file diff --git a/recipes/Qwen2.5-7B-Instruct/grpo/config_v05.15.yaml b/recipes/Qwen2.5-7B-Instruct/grpo/config_v05.15.yaml new file mode 100644 index 000000000..97fd1c3f6 --- /dev/null +++ b/recipes/Qwen2.5-7B-Instruct/grpo/config_v05.15.yaml @@ -0,0 +1,65 @@ +# Model arguments +model_name_or_path: Qwen/Qwen2.5-7B-Instruct +model_revision: main +torch_dtype: bfloat16 +attn_implementation: flash_attention_2 +# Data training arguments +dataset_name: open-r1/verifiable-coding-problems-python_decontaminated-tested-shuffled +dataset_prompt_column: problem +system_prompt: "You are a helpful AI Assistant that provides well-reasoned and detailed responses. You first think about the reasoning process as an internal monologue and then provide the user with the answer. 
Respond in the following format: \n...\n\n\n...\n" + +# GRPO trainer config +callbacks: +- push_to_hub_revision +benchmarks: +- lcb_v4 +beta: 0.000 +bf16: true +do_eval: false +eval_strategy: "no" +use_vllm: true +vllm_device: auto +vllm_gpu_memory_utilization: 0.7 +do_eval: false +gradient_accumulation_steps: 16 +gradient_checkpointing: true +gradient_checkpointing_kwargs: + use_reentrant: false +hub_model_id: open-r1/Qwen2.5-7B-Instruct-GRPO +hub_model_revision: v05.15 +hub_strategy: every_save +learning_rate: 5.0e-07 +log_completions: true +log_level: info +logging_first_step: true +logging_steps: 1 +logging_strategy: steps +lr_scheduler_type: constant_with_warmup +max_grad_norm: 1.0 +max_prompt_length: 1024 +max_completion_length: 4096 +max_steps: -1 +num_generations: 16 +num_iterations: 4 +num_train_epochs: 1.0 +output_dir: data/Qwen2.5-7B-Instruct-GRPO_v05.15 +overwrite_output_dir: true +per_device_train_batch_size: 8 +push_to_hub: true +report_to: +- wandb +reward_funcs: +- binary_code +- code_format +reward_weights: +- 1.0 +- 0.1 +save_strategy: "steps" +save_steps: 0.05 +save_total_limit: 1 +seed: 42 +temperature: 0.7 +wandb_entity: huggingface +wandb_project: open-r1 +warmup_ratio: 0.01 +parallel_code_exec_per_proc: 10 \ No newline at end of file diff --git a/recipes/Qwen2.5-7B-Instruct/grpo/config_v05.16.yaml b/recipes/Qwen2.5-7B-Instruct/grpo/config_v05.16.yaml new file mode 100644 index 000000000..361c8d898 --- /dev/null +++ b/recipes/Qwen2.5-7B-Instruct/grpo/config_v05.16.yaml @@ -0,0 +1,67 @@ +# Model arguments +model_name_or_path: Qwen/Qwen2.5-7B-Instruct +model_revision: main +torch_dtype: bfloat16 +attn_implementation: flash_attention_2 +# Data training arguments +dataset_name: open-r1/verifiable-coding-problems-python_decontaminated-tested +dataset_prompt_column: problem + +system_prompt: "You are a helpful AI Assistant that provides well-reasoned and detailed responses. You first think about the reasoning process as an internal monologue and then provide the user with the answer. 
Respond in the following format: \n...\n\n\n...\n" + +# GRPO trainer config +callbacks: +- push_to_hub_revision +benchmarks: +- lcb_v4 +beta: 0.000 +bf16: true +do_eval: false +eval_strategy: "no" +use_vllm: true +vllm_device: auto +vllm_gpu_memory_utilization: 0.7 +do_eval: false +gradient_accumulation_steps: 16 +gradient_checkpointing: true +gradient_checkpointing_kwargs: + use_reentrant: false +hub_model_id: open-r1/Qwen2.5-7B-Instruct-GRPO +hub_model_revision: v05.16 +hub_strategy: every_save +learning_rate: 1.0e-06 +log_completions: true +log_level: info +logging_first_step: true +logging_steps: 1 +logging_strategy: steps +lr_scheduler_type: constant_with_warmup +max_grad_norm: 0.2 +max_prompt_length: 1024 +max_completion_length: 4096 +max_steps: -1 +num_generations: 16 +num_iterations: 4 +num_train_epochs: 1.0 +output_dir: data/Qwen2.5-7B-Instruct-GRPO_v05.16 +overwrite_output_dir: true +per_device_train_batch_size: 8 +push_to_hub: true +report_to: +- wandb +reward_funcs: +- binary_code +- code_format +e2b_router_url: "ip-10-53-85-124:8000" +reward_weights: +- 1.0 +- 0.1 +save_strategy: "steps" +save_steps: 0.05 +save_total_limit: 1 +seed: 42 +temperature: 0.7 +wandb_entity: huggingface +wandb_project: open-r1 +warmup_ratio: 0.1 +parallel_code_exec_per_proc: 10 \ No newline at end of file diff --git a/recipes/Qwen2.5-7B-Instruct/grpo/config_v05.17.yaml b/recipes/Qwen2.5-7B-Instruct/grpo/config_v05.17.yaml new file mode 100644 index 000000000..072b61c4b --- /dev/null +++ b/recipes/Qwen2.5-7B-Instruct/grpo/config_v05.17.yaml @@ -0,0 +1,67 @@ +# Model arguments +model_name_or_path: Qwen/Qwen2.5-7B-Instruct +model_revision: main +torch_dtype: bfloat16 +attn_implementation: flash_attention_2 +# Data training arguments +dataset_name: open-r1/verifiable-coding-problems-python_decontaminated-tested +dataset_prompt_column: problem + +system_prompt: "You are a helpful AI Assistant that provides well-reasoned and detailed responses. You first think about the reasoning process as an internal monologue and then provide the user with the answer. 
Respond in the following format: \n...\n\n\n...\n" + +# GRPO trainer config +callbacks: +- push_to_hub_revision +benchmarks: +- lcb_v4 +beta: 0.000 +bf16: true +do_eval: false +eval_strategy: "no" +use_vllm: true +vllm_device: auto +vllm_gpu_memory_utilization: 0.7 +do_eval: false +gradient_accumulation_steps: 16 +gradient_checkpointing: true +gradient_checkpointing_kwargs: + use_reentrant: false +hub_model_id: open-r1/Qwen2.5-7B-Instruct-GRPO +hub_model_revision: v05.17 +hub_strategy: every_save +learning_rate: 1.0e-06 +log_completions: true +log_level: info +logging_first_step: true +logging_steps: 1 +logging_strategy: steps +lr_scheduler_type: constant_with_warmup +max_grad_norm: 0.2 +max_prompt_length: 1024 +max_completion_length: 4096 +max_steps: -1 +num_generations: 16 +num_iterations: 4 +num_train_epochs: 1.0 +output_dir: data/Qwen2.5-7B-Instruct-GRPO_v05.17 +overwrite_output_dir: true +per_device_train_batch_size: 8 +push_to_hub: true +report_to: +- wandb +reward_funcs: +- binary_code +- code_format +e2b_router_url: "ip-10-53-85-124:8000" +reward_weights: +- 1.0 +- 0.1 +save_strategy: "steps" +save_steps: 0.05 +save_total_limit: 1 +seed: 42 +temperature: 0.4 +wandb_entity: huggingface +wandb_project: open-r1 +warmup_ratio: 0.1 +parallel_code_exec_per_proc: 10 \ No newline at end of file diff --git a/recipes/Qwen2.5-7B-Instruct/grpo/config_v05.18.yaml b/recipes/Qwen2.5-7B-Instruct/grpo/config_v05.18.yaml new file mode 100644 index 000000000..585b7155b --- /dev/null +++ b/recipes/Qwen2.5-7B-Instruct/grpo/config_v05.18.yaml @@ -0,0 +1,67 @@ +# Model arguments +model_name_or_path: Qwen/Qwen2.5-7B-Instruct +model_revision: main +torch_dtype: bfloat16 +attn_implementation: flash_attention_2 +# Data training arguments +dataset_name: open-r1/verifiable-coding-problems-python_decontaminated-tested +dataset_prompt_column: problem + +system_prompt: "You are a helpful AI Assistant that provides well-reasoned and detailed responses. You first think about the reasoning process as an internal monologue and then provide the user with the answer. 
Respond in the following format: \n...\n\n\n...\n" + +# GRPO trainer config +callbacks: +- push_to_hub_revision +benchmarks: +- lcb_v4 +beta: 0.000 +bf16: true +do_eval: false +eval_strategy: "no" +use_vllm: true +vllm_device: auto +vllm_gpu_memory_utilization: 0.7 +do_eval: false +gradient_accumulation_steps: 16 +gradient_checkpointing: true +gradient_checkpointing_kwargs: + use_reentrant: false +hub_model_id: open-r1/Qwen2.5-7B-Instruct-GRPO +hub_model_revision: v05.18 +hub_strategy: every_save +learning_rate: 1.0e-06 +log_completions: true +log_level: info +logging_first_step: true +logging_steps: 1 +logging_strategy: steps +lr_scheduler_type: constant_with_warmup +max_grad_norm: 0.2 +max_prompt_length: 1024 +max_completion_length: 4096 +max_steps: -1 +num_generations: 16 +num_iterations: 4 +num_train_epochs: 1.0 +output_dir: data/Qwen2.5-7B-Instruct-GRPO_v05.18 +overwrite_output_dir: true +per_device_train_batch_size: 8 +push_to_hub: true +report_to: +- wandb +reward_funcs: +- binary_code +- code_format +e2b_router_url: "ip-10-53-85-124:8000" +reward_weights: +- 1.0 +- 0.1 +save_strategy: "steps" +save_steps: 0.05 +save_total_limit: 1 +seed: 42 +temperature: 1.0 +wandb_entity: huggingface +wandb_project: open-r1 +warmup_ratio: 0.1 +parallel_code_exec_per_proc: 10 \ No newline at end of file diff --git a/recipes/Qwen2.5-7B-Instruct/grpo/config_v05.20.yaml b/recipes/Qwen2.5-7B-Instruct/grpo/config_v05.20.yaml new file mode 100644 index 000000000..0ae2b55ba --- /dev/null +++ b/recipes/Qwen2.5-7B-Instruct/grpo/config_v05.20.yaml @@ -0,0 +1,69 @@ +# Model arguments +model_name_or_path: Qwen/Qwen2.5-7B-Instruct +model_revision: main +torch_dtype: bfloat16 +attn_implementation: flash_attention_2 +# Data training arguments +dataset_name: open-r1/verifiable-coding-problems-python_decontaminated-tested +dataset_prompt_column: problem + +system_prompt: "You are a helpful AI Assistant that provides well-reasoned and detailed responses. You first think about the reasoning process as an internal monologue and then provide the user with the answer. 
Respond in the following format: \n...\n\n\n...\n" + +# GRPO trainer config +callbacks: +- push_to_hub_revision +benchmarks: +- lcb_v4 +beta: 0.000 +bf16: true +do_eval: false +eval_strategy: "no" +use_vllm: true +vllm_device: auto +vllm_gpu_memory_utilization: 0.7 +do_eval: false +gradient_accumulation_steps: 16 +gradient_checkpointing: true +gradient_checkpointing_kwargs: + use_reentrant: false +hub_model_id: open-r1/Qwen2.5-7B-Instruct-GRPO +hub_model_revision: v05.20 +hub_strategy: every_save +learning_rate: 1.0e-06 +log_completions: true +log_level: info +logging_first_step: true +logging_steps: 1 +logging_strategy: steps +lr_scheduler_type: constant_with_warmup +max_grad_norm: 0.2 +max_prompt_length: 1024 +max_completion_length: 4096 +max_steps: -1 +num_generations: 16 +num_iterations: 4 +num_train_epochs: 1.0 +output_dir: data/Qwen2.5-7B-Instruct-GRPO_v05.20 +overwrite_output_dir: true +per_device_train_batch_size: 8 +push_to_hub: true +report_to: +- wandb +reward_funcs: +- binary_code +- code_format +e2b_router_url: ip-10-53-95-216:8000 +reward_weights: +- 1.0 +- 0.1 +save_strategy: "steps" +save_steps: 0.1 +save_total_limit: 1 +seed: 42 +temperature: 0.7 +wandb_entity: huggingface +wandb_project: open-r1 +warmup_ratio: 0.1 +parallel_code_exec_per_proc: 10 + +mask_truncated_completions: true \ No newline at end of file diff --git a/recipes/Qwen2.5-7B-Instruct/grpo/config_v05.30.yaml b/recipes/Qwen2.5-7B-Instruct/grpo/config_v05.30.yaml new file mode 100644 index 000000000..239afa7a8 --- /dev/null +++ b/recipes/Qwen2.5-7B-Instruct/grpo/config_v05.30.yaml @@ -0,0 +1,70 @@ +# Model arguments +model_name_or_path: Qwen/Qwen2.5-7B-Instruct +model_revision: main +torch_dtype: bfloat16 +attn_implementation: flash_attention_2 +# Data training arguments +dataset_name: open-r1/verifiable-coding-problems-python_decontaminated-tested-shuffled +dataset_prompt_column: problem + +system_prompt: "You are a helpful AI Assistant that provides well-reasoned and detailed responses. You first think about the reasoning process as an internal monologue and then provide the user with the answer. 
Respond in the following format: \n...\n\n\n...\n" + +# GRPO trainer config +callbacks: +- push_to_hub_revision +benchmarks: +- lcb_v4 +beta: 0.000 +bf16: true +do_eval: false +eval_strategy: "no" +use_vllm: true +vllm_device: auto +vllm_gpu_memory_utilization: 0.7 +do_eval: false +gradient_accumulation_steps: 16 +gradient_checkpointing: true +gradient_checkpointing_kwargs: + use_reentrant: false +hub_model_id: open-r1/Qwen2.5-7B-Instruct-GRPO +hub_model_revision: v05.30 +hub_strategy: every_save +learning_rate: 1.0e-06 +log_completions: true +log_level: info +logging_first_step: true +logging_steps: 1 +logging_strategy: steps +lr_scheduler_type: constant_with_warmup +max_grad_norm: 0.2 +max_prompt_length: 1024 +max_completion_length: 4096 +max_steps: -1 +num_generations: 16 +num_iterations: 4 +num_train_epochs: 1.0 +output_dir: data/Qwen2.5-7B-Instruct-GRPO_v05.30 +overwrite_output_dir: true +per_device_train_batch_size: 8 +push_to_hub: true +report_to: +- wandb +reward_funcs: +- binary_code +- code_format +e2b_router_url: ip-10-53-95-216:8000 +reward_weights: +- 1.0 +- 0.1 +save_strategy: "steps" +save_steps: 0.1 +save_total_limit: 1 +seed: 42 +temperature: 0.7 +wandb_entity: huggingface +wandb_project: open-r1 +warmup_ratio: 0.1 +parallel_code_exec_per_proc: 10 + +mask_truncated_completions: true +loss_type: dr_grpo \ No newline at end of file diff --git a/recipes/Qwen2.5-Coder-7B-Instruct/grpo/config_v02.00.yaml b/recipes/Qwen2.5-Coder-7B-Instruct/grpo/config_v02.00.yaml new file mode 100644 index 000000000..0c4cf5fe3 --- /dev/null +++ b/recipes/Qwen2.5-Coder-7B-Instruct/grpo/config_v02.00.yaml @@ -0,0 +1,64 @@ +# Model arguments +model_name_or_path: Qwen/Qwen2.5-Coder-7B-Instruct +model_revision: main +torch_dtype: bfloat16 +attn_implementation: flash_attention_2 +# Data training arguments +dataset_name: open-r1/verifiable-coding-problems-python_decontaminated-tested + +system_prompt: "You are a helpful AI Assistant that provides well-reasoned and detailed responses. You first think about the reasoning process as an internal monologue and then provide the user with the answer. 
Respond in the following format: \n...\n\n\n...\n" + +# GRPO trainer config +callbacks: +- push_to_hub_revision +benchmarks: +- math_500 +- aime24 +beta: 0.001 +bf16: true +do_eval: false +eval_strategy: "no" +use_vllm: true +vllm_device: auto +vllm_gpu_memory_utilization: 0.7 +do_eval: false +gradient_accumulation_steps: 14 +gradient_checkpointing: true +gradient_checkpointing_kwargs: + use_reentrant: false +hub_model_id: open-r1/Qwen2.5-Coder-7B-Instruct-GRPO +hub_model_revision: v02.00 +hub_strategy: every_save +learning_rate: 1.0e-06 +log_completions: true +log_level: info +logging_first_step: true +logging_steps: 1 +logging_strategy: steps +lr_scheduler_type: constant_with_warmup +max_grad_norm: 0.2 +max_prompt_length: 1024 +max_completion_length: 4096 +max_steps: -1 +num_generations: 14 +num_train_epochs: 0.1 +output_dir: data/Qwen2.5-Coder-7B-Instruct-GRPO_v02.00 +overwrite_output_dir: true +per_device_train_batch_size: 4 +push_to_hub: true +report_to: +- wandb +reward_funcs: +- code +- code_format +reward_weights: +- 1.0 +- 0.1 +save_strategy: "steps" +save_steps: 0.1 +save_total_limit: 1 +seed: 42 +temperature: 0.7 +wandb_entity: huggingface +wandb_project: open-r1 +warmup_ratio: 0.1 \ No newline at end of file diff --git a/recipes/Qwen2.5-Coder-7B-Instruct/grpo/config_v03.00.yaml b/recipes/Qwen2.5-Coder-7B-Instruct/grpo/config_v03.00.yaml new file mode 100644 index 000000000..962e66190 --- /dev/null +++ b/recipes/Qwen2.5-Coder-7B-Instruct/grpo/config_v03.00.yaml @@ -0,0 +1,64 @@ +# Model arguments +model_name_or_path: Qwen/Qwen2.5-7B-Instruct +model_revision: main +torch_dtype: bfloat16 +attn_implementation: flash_attention_2 +# Data training arguments +dataset_name: open-r1/verifiable-coding-problems-python_decontaminated-tested + +system_prompt: "You are a helpful AI Assistant that provides well-reasoned and detailed responses. You first think about the reasoning process as an internal monologue and then provide the user with the answer. 
Respond in the following format: <think>\n...\n</think>\n<answer>\n...\n</answer>" + +# GRPO trainer config +callbacks: +- push_to_hub_revision +benchmarks: +- math_500 +- aime24 +beta: 0.001 +bf16: true +do_eval: false +eval_strategy: "no" +use_vllm: true +vllm_device: auto +vllm_gpu_memory_utilization: 0.7 +do_eval: false +gradient_accumulation_steps: 14 +gradient_checkpointing: true +gradient_checkpointing_kwargs: + use_reentrant: false +hub_model_id: open-r1/Qwen2.5-Coder-7B-Instruct-GRPO +hub_model_revision: v03.00 +hub_strategy: every_save +learning_rate: 1.0e-06 +log_completions: true +log_level: info +logging_first_step: true +logging_steps: 1 +logging_strategy: steps +lr_scheduler_type: constant_with_warmup +max_grad_norm: 0.2 +max_prompt_length: 1024 +max_completion_length: 4096 +max_steps: -1 +num_generations: 14 +num_train_epochs: 0.1 +output_dir: data/Qwen2.5-Coder-7B-Instruct-GRPO_v03.00 +overwrite_output_dir: true +per_device_train_batch_size: 4 +push_to_hub: true +report_to: +- wandb +reward_funcs: +- code +- code_format +reward_weights: +- 1.0 +- 0.1 +save_strategy: "steps" +save_steps: 0.1 +save_total_limit: 1 +seed: 42 +temperature: 0.7 +wandb_entity: huggingface +wandb_project: open-r1 +warmup_ratio: 0.1 \ No newline at end of file From 243db805c2b2c8255aaabea2934517d6ed89d8f8 Mon Sep 17 00:00:00 2001 From: edbeeching Date: Fri, 11 Apr 2025 15:38:22 +0000 Subject: [PATCH 2/5] bin reward --- src/open_r1/rewards.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/src/open_r1/rewards.py b/src/open_r1/rewards.py index b62a81a81..d780ded96 --- a/src/open_r1/rewards.py +++ b/src/open_r1/rewards.py @@ -387,7 +387,15 @@ def extract_code(completion: str, language: str = "python") -> str: def binary_code_reward(completions, num_parallel: int = 2, e2b_router_url=None, **kwargs) -> list[float]: rewards = code_reward(completions, num_parallel=num_parallel, e2b_router_url=e2b_router_url, **kwargs) BINARY_THRESHOLD = 0.99 - return [1.0 if reward > BINARY_THRESHOLD else 0.0 for reward in rewards] + + output = [] + for reward in rewards: + if reward is None: + output.append(None) + else: + output.append(1.0 if reward > BINARY_THRESHOLD else 0.0) + + return output def code_reward(completions, num_parallel: int = 2, e2b_router_url=None, **kwargs) -> list[float]: From 7ddc0282cddfed14065a5e51f061036f0b517ea3 Mon Sep 17 00:00:00 2001 From: edbeeching Date: Thu, 17 Apr 2025 12:08:12 +0000 Subject: [PATCH 3/5] save wip --- recipes/OlympicCoder-7B/sft/config_v00.00.yaml | 10 +++++----- recipes/Qwen2.5-7B-Instruct/grpo/config_v05.20.yaml | 2 +- recipes/Qwen2.5-7B-Instruct/grpo/config_v05.30.yaml | 2 +- src/open_r1/utils/evaluation.py | 2 +- 4 files changed, 8 insertions(+), 8 deletions(-) diff --git a/recipes/OlympicCoder-7B/sft/config_v00.00.yaml b/recipes/OlympicCoder-7B/sft/config_v00.00.yaml index dd0be5d96..f5b8385da 100644 --- a/recipes/OlympicCoder-7B/sft/config_v00.00.yaml +++ b/recipes/OlympicCoder-7B/sft/config_v00.00.yaml @@ -14,7 +14,7 @@ dataset_num_proc: 48 bf16: true do_eval: false eval_strategy: 'no' -gradient_accumulation_steps: 8 +gradient_accumulation_steps: 2 gradient_checkpointing: true gradient_checkpointing_kwargs: use_reentrant: false @@ -27,20 +27,20 @@ logging_strategy: steps lr_scheduler_type: cosine_with_min_lr lr_scheduler_kwargs: min_lr_rate: 0.1 -packing: false +packing: true max_grad_norm: 0.2 -max_length: 32768 +max_length: 16000 max_steps: -1 num_train_epochs: 10 output_dir: data/OlympicCoder-7B overwrite_output_dir: true per_device_eval_batch_size: 1 
-per_device_train_batch_size: 2 +per_device_train_batch_size: 1 push_to_hub: true report_to: - wandb save_strategy: epoch save_total_limit: 1 seed: 42 -use_liger_kernel: true +use_liger_kernel: false warmup_ratio: 0.03 \ No newline at end of file diff --git a/recipes/Qwen2.5-7B-Instruct/grpo/config_v05.20.yaml b/recipes/Qwen2.5-7B-Instruct/grpo/config_v05.20.yaml index 0ae2b55ba..4b8aa6b31 100644 --- a/recipes/Qwen2.5-7B-Instruct/grpo/config_v05.20.yaml +++ b/recipes/Qwen2.5-7B-Instruct/grpo/config_v05.20.yaml @@ -52,7 +52,7 @@ report_to: reward_funcs: - binary_code - code_format -e2b_router_url: ip-10-53-95-216:8000 +e2b_router_url: ip-10-53-86-47:8000 reward_weights: - 1.0 - 0.1 diff --git a/recipes/Qwen2.5-7B-Instruct/grpo/config_v05.30.yaml b/recipes/Qwen2.5-7B-Instruct/grpo/config_v05.30.yaml index 239afa7a8..08244af8a 100644 --- a/recipes/Qwen2.5-7B-Instruct/grpo/config_v05.30.yaml +++ b/recipes/Qwen2.5-7B-Instruct/grpo/config_v05.30.yaml @@ -52,7 +52,7 @@ report_to: reward_funcs: - binary_code - code_format -e2b_router_url: ip-10-53-95-216:8000 +e2b_router_url: ip-10-53-86-47:8000 reward_weights: - 1.0 - 0.1 diff --git a/src/open_r1/utils/evaluation.py b/src/open_r1/utils/evaluation.py index 5719350fb..dd8400b93 100644 --- a/src/open_r1/utils/evaluation.py +++ b/src/open_r1/utils/evaluation.py @@ -73,7 +73,7 @@ def run_lighteval_job( if get_param_count_from_repo_id(model_name) >= 30_000_000_000: tensor_parallel = True else: - num_gpus = 8 + num_gpus = 1 tensor_parallel = False cmd = VLLM_SLURM_PREFIX.copy() From 0662164248a5f113b0fba16d0d448ac64d362414 Mon Sep 17 00:00:00 2001 From: edbeeching Date: Fri, 18 Apr 2025 11:38:17 +0000 Subject: [PATCH 4/5] add new reward, configs --- .../grpo/config_v04.00.yaml | 66 +++++++++++++++++++ .../grpo/config_v05.00.yaml | 66 +++++++++++++++++++ .../grpo/config_v06.00.yaml | 66 +++++++++++++++++++ src/open_r1/rewards.py | 24 +++++++ 4 files changed, 222 insertions(+) create mode 100644 recipes/DeepSeek-R1-Distill-Qwen-7B/grpo/config_v04.00.yaml create mode 100644 recipes/DeepSeek-R1-Distill-Qwen-7B/grpo/config_v05.00.yaml create mode 100644 recipes/DeepSeek-R1-Distill-Qwen-7B/grpo/config_v06.00.yaml diff --git a/recipes/DeepSeek-R1-Distill-Qwen-7B/grpo/config_v04.00.yaml b/recipes/DeepSeek-R1-Distill-Qwen-7B/grpo/config_v04.00.yaml new file mode 100644 index 000000000..b44fabe06 --- /dev/null +++ b/recipes/DeepSeek-R1-Distill-Qwen-7B/grpo/config_v04.00.yaml @@ -0,0 +1,66 @@ +# Model arguments +model_name_or_path: deepseek-ai/DeepSeek-R1-Distill-Qwen-7B +model_revision: main +torch_dtype: bfloat16 +attn_implementation: flash_attention_2 +# Data training arguments +dataset_name: open-r1/verifiable-coding-problems-python_decontaminated-tested-shuffled +dataset_prompt_column: problem + +# system_prompt: "You are a helpful AI Assistant that provides well-reasoned and detailed responses. You first think about the reasoning process as an internal monologue and then provide the user with the answer. 
Respond in the following format: <think>\n...\n</think>\n<answer>\n...\n</answer>" + +# GRPO trainer config +callbacks: +- push_to_hub_revision +benchmarks: +- lcb_v4 +beta: 0.000 +bf16: true +do_eval: false +eval_strategy: "no" +use_vllm: true +vllm_device: auto +vllm_gpu_memory_utilization: 0.7 +do_eval: false +gradient_accumulation_steps: 128 +gradient_checkpointing: true +gradient_checkpointing_kwargs: + use_reentrant: false +hub_model_id: open-r1/DeepSeek-R1-Distill-Qwen-7B-GRPO +hub_model_revision: v04.00 +hub_strategy: every_save +learning_rate: 1.0e-06 +log_completions: true +log_level: info +logging_first_step: true +logging_steps: 1 +logging_strategy: steps +lr_scheduler_type: constant_with_warmup +max_grad_norm: 0.2 +max_prompt_length: 1024 +max_completion_length: 24000 +max_steps: -1 +num_generations: 8 +num_iterations: 1 +num_train_epochs: 1.0 +output_dir: data/DeepSeek-R1-Distill-Qwen-7B_v04.00 +overwrite_output_dir: true +per_device_train_batch_size: 1 +push_to_hub: true +report_to: +- wandb +reward_funcs: +- binary_code +e2b_router_url: ip-10-53-86-47:8000 +reward_weights: +- 1.0 +save_strategy: "steps" +save_steps: 0.05 +save_total_limit: 1 +seed: 42 +temperature: 0.7 +wandb_entity: huggingface +wandb_project: open-r1 +warmup_ratio: 0.1 +mask_truncated_completions: true +loss_type: dr_grpo \ No newline at end of file diff --git a/recipes/DeepSeek-R1-Distill-Qwen-7B/grpo/config_v05.00.yaml b/recipes/DeepSeek-R1-Distill-Qwen-7B/grpo/config_v05.00.yaml new file mode 100644 index 000000000..6b775eae3 --- /dev/null +++ b/recipes/DeepSeek-R1-Distill-Qwen-7B/grpo/config_v05.00.yaml @@ -0,0 +1,66 @@ +# Model arguments +model_name_or_path: deepseek-ai/DeepSeek-R1-Distill-Qwen-7B +model_revision: main +torch_dtype: bfloat16 +attn_implementation: flash_attention_2 +# Data training arguments +dataset_name: open-r1/verifiable-coding-problems-python_decontaminated-tested-shuffled +dataset_prompt_column: problem + +# system_prompt: "You are a helpful AI Assistant that provides well-reasoned and detailed responses. You first think about the reasoning process as an internal monologue and then provide the user with the answer. 
Respond in the following format: <think>\n...\n</think>\n<answer>\n...\n</answer>" + +# GRPO trainer config +callbacks: +- push_to_hub_revision +benchmarks: +- lcb_v4 +beta: 0.000 +bf16: true +do_eval: false +eval_strategy: "no" +use_vllm: true +vllm_device: auto +vllm_gpu_memory_utilization: 0.7 +do_eval: false +gradient_accumulation_steps: 16 +gradient_checkpointing: true +gradient_checkpointing_kwargs: + use_reentrant: false +hub_model_id: open-r1/DeepSeek-R1-Distill-Qwen-7B-GRPO +hub_model_revision: v05.00 +hub_strategy: every_save +learning_rate: 1.0e-06 +log_completions: true +log_level: info +logging_first_step: true +logging_steps: 1 +logging_strategy: steps +lr_scheduler_type: constant_with_warmup +max_grad_norm: 0.2 +max_prompt_length: 1024 +max_completion_length: 24000 +max_steps: -1 +num_generations: 16 +num_iterations: 1 +num_train_epochs: 1.0 +output_dir: data/DeepSeek-R1-Distill-Qwen-7B_v05.00 +overwrite_output_dir: true +per_device_train_batch_size: 1 +push_to_hub: true +report_to: +- wandb +reward_funcs: +- binary_code +e2b_router_url: ip-10-53-86-47:8000 +reward_weights: +- 1.0 +save_strategy: "steps" +save_steps: 0.05 +save_total_limit: 1 +seed: 42 +temperature: 0.7 +wandb_entity: huggingface +wandb_project: open-r1 +warmup_ratio: 0.1 +mask_truncated_completions: true +loss_type: dr_grpo \ No newline at end of file diff --git a/recipes/DeepSeek-R1-Distill-Qwen-7B/grpo/config_v06.00.yaml b/recipes/DeepSeek-R1-Distill-Qwen-7B/grpo/config_v06.00.yaml new file mode 100644 index 000000000..167df6138 --- /dev/null +++ b/recipes/DeepSeek-R1-Distill-Qwen-7B/grpo/config_v06.00.yaml @@ -0,0 +1,66 @@ +# Model arguments +model_name_or_path: deepseek-ai/DeepSeek-R1-Distill-Qwen-7B +model_revision: main +torch_dtype: bfloat16 +attn_implementation: flash_attention_2 +# Data training arguments +dataset_name: open-r1/verifiable-coding-problems-python_decontaminated-tested-shuffled +dataset_prompt_column: problem + +# system_prompt: "You are a helpful AI Assistant that provides well-reasoned and detailed responses. You first think about the reasoning process as an internal monologue and then provide the user with the answer. 
Respond in the following format: <think>\n...\n</think>\n<answer>\n...\n</answer>" + +# GRPO trainer config +callbacks: +- push_to_hub_revision +benchmarks: +- lcb_v4 +beta: 0.000 +bf16: true +do_eval: false +eval_strategy: "no" +use_vllm: true +vllm_device: auto +vllm_gpu_memory_utilization: 0.7 +do_eval: false +gradient_accumulation_steps: 16 +gradient_checkpointing: true +gradient_checkpointing_kwargs: + use_reentrant: false +hub_model_id: open-r1/DeepSeek-R1-Distill-Qwen-7B-GRPO +hub_model_revision: v06.00 +hub_strategy: every_save +learning_rate: 1.0e-06 +log_completions: true +log_level: info +logging_first_step: true +logging_steps: 1 +logging_strategy: steps +lr_scheduler_type: constant_with_warmup +max_grad_norm: 0.2 +max_prompt_length: 1024 +max_completion_length: 24000 +max_steps: -1 +num_generations: 16 +num_iterations: 1 +num_train_epochs: 1.0 +output_dir: data/DeepSeek-R1-Distill-Qwen-7B_v06.00 +overwrite_output_dir: true +per_device_train_batch_size: 1 +push_to_hub: true +report_to: +- wandb +reward_funcs: +- weighted_binary_code_reward +e2b_router_url: ip-10-53-86-47:8000 +reward_weights: +- 1.0 +save_strategy: "steps" +save_steps: 0.05 +save_total_limit: 1 +seed: 42 +temperature: 0.7 +wandb_entity: huggingface +wandb_project: open-r1 +warmup_ratio: 0.1 +mask_truncated_completions: true +loss_type: dr_grpo \ No newline at end of file diff --git a/src/open_r1/rewards.py b/src/open_r1/rewards.py index d780ded96..a436bedc9 --- a/src/open_r1/rewards.py +++ b/src/open_r1/rewards.py @@ -397,6 +397,22 @@ def binary_code_reward(completions, num_parallel: int = 2, e2b_router_url=None, return output +def weighted_binary_code_reward(completions, num_parallel: int = 2, e2b_router_url=None, **kwargs) -> list[float]: + # combines binary reward with a weighted reward code reward + rewards = code_reward(completions, num_parallel=num_parallel, e2b_router_url=e2b_router_url, **kwargs) + BINARY_THRESHOLD = 0.99 + NON_BINARY_WEIGHT = 0.1 + + output = [] + for reward in rewards: + if reward is None: + output.append(None) + else: + binary_reward = 1.0 if reward > BINARY_THRESHOLD else 0.0 + output.append(binary_reward + NON_BINARY_WEIGHT * reward) + + return output + def code_reward(completions, num_parallel: int = 2, e2b_router_url=None, **kwargs) -> list[float]: """Reward function that evaluates code snippets using the E2B code interpreter. 
@@ -596,6 +612,14 @@ def get_reward_funcs(script_args) -> list[Callable]: ), binary_code_reward, ), + "weighted_binary_code_reward": update_wrapper( + partial( + weighted_binary_code_reward, + num_parallel=script_args.parallel_code_exec_per_proc, + e2b_router_url=script_args.e2b_router_url, + ), + weighted_binary_code_reward, + ), "ioi_code": update_wrapper( partial(ioi_code_reward, test_batch_size=script_args.code_eval_test_batch_size), ioi_code_reward ), From 9b6c9704da138a4f5bf0bc8dc346229da373d115 Mon Sep 17 00:00:00 2001 From: edbeeching Date: Fri, 18 Apr 2025 11:38:38 +0000 Subject: [PATCH 5/5] update trl version in setup --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 74d7e0d77..4980a1138 100644 --- a/setup.py +++ b/setup.py @@ -67,7 +67,7 @@ "sentencepiece>=0.1.99", "torch==2.6.0", "transformers==4.51.2", - "trl @ git+https://github.com/huggingface/trl.git@d625c5533a6b1c84d3565c8080857f6bb81c538a", # Bump for vLLM and 2x faster throughput: https://github.com/huggingface/trl/pull/3276 + "trl @ git+https://github.com/huggingface/trl.git@294f35bf3c0043d3ee6b9b5d22385e5736f6ce9e", # Bump for vLLM and 2x faster throughput: https://github.com/huggingface/trl/pull/3276 "vllm==0.8.3", "wandb>=0.19.1", ]
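
A note on the reward changes in the patches above: binary_code_reward now propagates None instead of crashing on the `reward > BINARY_THRESHOLD` comparison (None presumably marks completions for which execution returned no score), and weighted_binary_code_reward adds a small fraction of the raw pass rate on top of the pass/fail signal. The standalone Python sketch below mirrors that arithmetic with made-up pass rates; it is an illustration only and does not call code_reward or the E2B router.

# Sketch of the thresholding/weighting logic from the rewards.py hunks above.
# The pass rates are invented for illustration; in training they come from
# code_reward, which runs the extracted code via the E2B interpreter.
BINARY_THRESHOLD = 0.99
NON_BINARY_WEIGHT = 0.1

def binary(reward):
    # None is passed through so missing scores are not silently turned into 0.0
    if reward is None:
        return None
    return 1.0 if reward > BINARY_THRESHOLD else 0.0

def weighted_binary(reward):
    if reward is None:
        return None
    return binary(reward) + NON_BINARY_WEIGHT * reward

pass_rates = [None, 0.0, 0.5, 1.0]
print([binary(r) for r in pass_rates])           # [None, 0.0, 0.0, 1.0]
print([weighted_binary(r) for r in pass_rates])  # [None, 0.0, 0.05, 1.1]

Note that config_v06.00 is identical to config_v05.00 except that binary_code is swapped for weighted_binary_code_reward (and the revision/output strings change), so the 0.1 partial-credit term is the only difference between those two runs.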
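
The get_reward_funcs hunk registers the new reward with the same partial + update_wrapper pattern used for the other code rewards: partial binds the script arguments (parallelism, router URL) once, and update_wrapper copies the wrapped function's __name__/__doc__ onto the partial, presumably so downstream logging still reports the original reward name. A minimal self-contained sketch of the pattern (my_reward, the bound values, and "host:8000" are placeholders, not repo code):

from functools import partial, update_wrapper

def my_reward(completions, num_parallel=2, e2b_router_url=None, **kwargs):
    # toy reward: score each completion by its length (illustration only)
    return [float(len(c)) for c in completions]

# Bind configuration once while keeping my_reward's metadata, so anything that
# inspects the callable's __name__ still sees "my_reward" rather than a bare partial.
bound = update_wrapper(
    partial(my_reward, num_parallel=10, e2b_router_url="host:8000"),
    my_reward,
)
print(bound.__name__)           # my_reward
print(bound(["abc", "hello"]))  # [3.0, 5.0]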
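
As a rough way to read the GRPO batch settings in these configs (taking config_v05.00 as the example): each prompt is expanded into num_generations completions, and to my reading TRL's GRPO trainer counts per_device_train_batch_size in completions rather than prompts. The GPU count is not part of the YAML, so the arithmetic below assumes 8 training processes; with a different world size the prompts-per-step figure scales accordingly.

# Back-of-the-envelope for config_v05.00; the world size of 8 is an assumption,
# not something the YAML specifies.
num_processes = 8                 # assumed number of training GPUs
per_device_train_batch_size = 1   # completions per device per micro-batch
gradient_accumulation_steps = 16
num_generations = 16              # completions sampled per prompt

completions_per_optim_step = (
    num_processes * per_device_train_batch_size * gradient_accumulation_steps
)
prompts_per_optim_step = completions_per_optim_step // num_generations
print(completions_per_optim_step, prompts_per_optim_step)  # 128 8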