From 98cbed75969f1a406727a6ef6db5a9b2c64f7c90 Mon Sep 17 00:00:00 2001 From: edbeeching Date: Fri, 11 Apr 2025 09:26:05 +0000 Subject: [PATCH 1/5] add WIP code GRPO configs --- .../grpo/config_v03.00.yaml | 67 ++++++++++++++++++ .../grpo/config_v01.05.yaml | 58 +++++++++++++++ .../grpo/config_v01.06.yaml | 60 ++++++++++++++++ .../grpo/config_v01.07.yaml | 61 ++++++++++++++++ .../grpo/config_v01.08.yaml | 62 ++++++++++++++++ .../grpo/config_v01.09.yaml | 62 ++++++++++++++++ .../grpo/config_v05.00.yaml | 65 +++++++++++++++++ .../grpo/config_v05.02.yaml | 65 +++++++++++++++++ .../grpo/config_v05.03.yaml | 65 +++++++++++++++++ .../grpo/config_v05.04.yaml | 65 +++++++++++++++++ .../grpo/config_v05.05.yaml | 67 ++++++++++++++++++ .../grpo/config_v05.06.yaml | 65 +++++++++++++++++ .../grpo/config_v05.07.yaml | 67 ++++++++++++++++++ .../grpo/config_v05.08.yaml | 67 ++++++++++++++++++ .../grpo/config_v05.09.yaml | 67 ++++++++++++++++++ .../grpo/config_v05.10.yaml | 67 ++++++++++++++++++ .../grpo/config_v05.11.yaml | 67 ++++++++++++++++++ .../grpo/config_v05.12.yaml | 67 ++++++++++++++++++ .../grpo/config_v05.13.yaml | 66 +++++++++++++++++ .../grpo/config_v05.14.yaml | 66 +++++++++++++++++ .../grpo/config_v05.15.yaml | 65 +++++++++++++++++ .../grpo/config_v05.16.yaml | 67 ++++++++++++++++++ .../grpo/config_v05.17.yaml | 67 ++++++++++++++++++ .../grpo/config_v05.18.yaml | 67 ++++++++++++++++++ .../grpo/config_v05.20.yaml | 69 ++++++++++++++++++ .../grpo/config_v05.30.yaml | 70 +++++++++++++++++++ .../grpo/config_v02.00.yaml | 64 +++++++++++++++++ .../grpo/config_v03.00.yaml | 64 +++++++++++++++++ 28 files changed, 1829 insertions(+) create mode 100644 recipes/DeepSeek-R1-Distill-Qwen-7B/grpo/config_v03.00.yaml create mode 100644 recipes/Qwen2.5-7B-Instruct/grpo/config_v01.05.yaml create mode 100644 recipes/Qwen2.5-7B-Instruct/grpo/config_v01.06.yaml create mode 100644 recipes/Qwen2.5-7B-Instruct/grpo/config_v01.07.yaml create mode 100644 recipes/Qwen2.5-7B-Instruct/grpo/config_v01.08.yaml create mode 100644 recipes/Qwen2.5-7B-Instruct/grpo/config_v01.09.yaml create mode 100644 recipes/Qwen2.5-7B-Instruct/grpo/config_v05.00.yaml create mode 100644 recipes/Qwen2.5-7B-Instruct/grpo/config_v05.02.yaml create mode 100644 recipes/Qwen2.5-7B-Instruct/grpo/config_v05.03.yaml create mode 100644 recipes/Qwen2.5-7B-Instruct/grpo/config_v05.04.yaml create mode 100644 recipes/Qwen2.5-7B-Instruct/grpo/config_v05.05.yaml create mode 100644 recipes/Qwen2.5-7B-Instruct/grpo/config_v05.06.yaml create mode 100644 recipes/Qwen2.5-7B-Instruct/grpo/config_v05.07.yaml create mode 100644 recipes/Qwen2.5-7B-Instruct/grpo/config_v05.08.yaml create mode 100644 recipes/Qwen2.5-7B-Instruct/grpo/config_v05.09.yaml create mode 100644 recipes/Qwen2.5-7B-Instruct/grpo/config_v05.10.yaml create mode 100644 recipes/Qwen2.5-7B-Instruct/grpo/config_v05.11.yaml create mode 100644 recipes/Qwen2.5-7B-Instruct/grpo/config_v05.12.yaml create mode 100644 recipes/Qwen2.5-7B-Instruct/grpo/config_v05.13.yaml create mode 100644 recipes/Qwen2.5-7B-Instruct/grpo/config_v05.14.yaml create mode 100644 recipes/Qwen2.5-7B-Instruct/grpo/config_v05.15.yaml create mode 100644 recipes/Qwen2.5-7B-Instruct/grpo/config_v05.16.yaml create mode 100644 recipes/Qwen2.5-7B-Instruct/grpo/config_v05.17.yaml create mode 100644 recipes/Qwen2.5-7B-Instruct/grpo/config_v05.18.yaml create mode 100644 recipes/Qwen2.5-7B-Instruct/grpo/config_v05.20.yaml create mode 100644 recipes/Qwen2.5-7B-Instruct/grpo/config_v05.30.yaml create mode 100644 
recipes/Qwen2.5-Coder-7B-Instruct/grpo/config_v02.00.yaml create mode 100644 recipes/Qwen2.5-Coder-7B-Instruct/grpo/config_v03.00.yaml diff --git a/recipes/DeepSeek-R1-Distill-Qwen-7B/grpo/config_v03.00.yaml b/recipes/DeepSeek-R1-Distill-Qwen-7B/grpo/config_v03.00.yaml new file mode 100644 index 000000000..90351b619 --- /dev/null +++ b/recipes/DeepSeek-R1-Distill-Qwen-7B/grpo/config_v03.00.yaml @@ -0,0 +1,67 @@ +# Model arguments +model_name_or_path: deepseek-ai/DeepSeek-R1-Distill-Qwen-7B +model_revision: main +torch_dtype: bfloat16 +attn_implementation: flash_attention_2 +# Data training arguments +dataset_name: open-r1/verifiable-coding-problems-python_decontaminated-tested +dataset_prompt_column: problem + +system_prompt: "You are a helpful AI Assistant that provides well-reasoned and detailed responses. You first think about the reasoning process as an internal monologue and then provide the user with the answer. Respond in the following format: \n...\n\n\n...\n" + +# GRPO trainer config +callbacks: +- push_to_hub_revision +benchmarks: +- lcb_v4 +beta: 0.000 +bf16: true +do_eval: false +eval_strategy: "no" +use_vllm: true +vllm_device: auto +vllm_gpu_memory_utilization: 0.7 +do_eval: false +gradient_accumulation_steps: 64 +gradient_checkpointing: true +gradient_checkpointing_kwargs: + use_reentrant: false +hub_model_id: open-r1/DeepSeek-R1-Distill-Qwen-7B-GRPO +hub_model_revision: v03.00 +hub_strategy: every_save +learning_rate: 1.0e-06 +log_completions: true +log_level: info +logging_first_step: true +logging_steps: 1 +logging_strategy: steps +lr_scheduler_type: constant_with_warmup +max_grad_norm: 0.2 +max_prompt_length: 1024 +max_completion_length: 24000 +max_steps: -1 +num_generations: 8 +num_iterations: 4 +num_train_epochs: 1.0 +output_dir: data/DeepSeek-R1-Distill-Qwen-7B_v03.00 +overwrite_output_dir: true +per_device_train_batch_size: 1 +push_to_hub: true +report_to: +- wandb +reward_funcs: +- binary_code +- code_format +e2b_router_url: "ip-10-53-85-124:8000" +reward_weights: +- 1.0 +- 0.1 +save_strategy: "steps" +save_steps: 0.05 +save_total_limit: 1 +seed: 42 +temperature: 0.7 +wandb_entity: huggingface +wandb_project: open-r1 +warmup_ratio: 0.1 +parallel_code_exec_per_proc: 10 \ No newline at end of file diff --git a/recipes/Qwen2.5-7B-Instruct/grpo/config_v01.05.yaml b/recipes/Qwen2.5-7B-Instruct/grpo/config_v01.05.yaml new file mode 100644 index 000000000..174c816b9 --- /dev/null +++ b/recipes/Qwen2.5-7B-Instruct/grpo/config_v01.05.yaml @@ -0,0 +1,58 @@ + +# Model arguments +model_name_or_path: Qwen/Qwen2.5-7B-Instruct +model_revision: main +torch_dtype: bfloat16 +attn_implementation: flash_attention_2 + +# Data training arguments +dataset_name: open-r1/OpenR1-Math-cn_k12-86k +system_prompt: "You are a helpful AI Assistant that provides well-reasoned and detailed responses. You first think about the reasoning process as an internal monologue and then provide the user with the answer. 
Respond in the following format: \n...\n\n\n...\n" + +# GRPO trainer config +beta: 0.001 +bf16: true +do_eval: false +eval_strategy: "no" +use_vllm: true + +do_eval: false +gradient_accumulation_steps: 64 +gradient_checkpointing: true +gradient_checkpointing_kwargs: + use_reentrant: false +hub_model_id: Qwen2.5-7B-Instruct-GRPO +hub_model_revision: v01.05 +hub_strategy: every_save +learning_rate: 1.0e-06 +log_completions: true +log_level: info +logging_first_step: true +logging_steps: 1 +logging_strategy: steps +lr_scheduler_type: constant_with_warmup +max_grad_norm: 0.2 +max_prompt_length: 1024 +max_completion_length: 4096 +max_steps: -1 +num_generations: 8 +num_train_epochs: 1 +output_dir: data/Qwen2.5-7B-Instruct-GRPO_v01.05 +overwrite_output_dir: true +per_device_train_batch_size: 1 +push_to_hub: true +use_liger_kernel: true +report_to: +- wandb +reward_funcs: +- accuracy +- format +reward_weights: +- 1.0 +- 0.2 +save_strategy: "steps" +save_steps: 0.1 +save_total_limit: 1 +seed: 42 +temperature: 0.7 +warmup_ratio: 0.1 \ No newline at end of file diff --git a/recipes/Qwen2.5-7B-Instruct/grpo/config_v01.06.yaml b/recipes/Qwen2.5-7B-Instruct/grpo/config_v01.06.yaml new file mode 100644 index 000000000..8a456c9f5 --- /dev/null +++ b/recipes/Qwen2.5-7B-Instruct/grpo/config_v01.06.yaml @@ -0,0 +1,60 @@ + +# Model arguments +model_name_or_path: Qwen/Qwen2.5-7B-Instruct +model_revision: main +torch_dtype: bfloat16 +attn_implementation: flash_attention_2 + +# Data training arguments +dataset_name: open-r1/OpenR1-Math-cn_k12-86k +system_prompt: "You are a helpful AI Assistant that provides well-reasoned and detailed responses. You first think about the reasoning process as an internal monologue and then provide the user with the answer. Respond in the following format: \n...\n\n\n...\n" + +# GRPO trainer config +beta: 0.001 +bf16: true +do_eval: false +eval_strategy: "no" +use_vllm: true + +do_eval: false +gradient_accumulation_steps: 16 +gradient_checkpointing: true +gradient_checkpointing_kwargs: + use_reentrant: false +hub_model_id: Qwen2.5-7B-Instruct-GRPO +hub_model_revision: v01.06 +hub_strategy: every_save +learning_rate: 1.0e-06 +log_completions: true +log_level: info +logging_first_step: true +logging_steps: 1 +logging_strategy: steps +lr_scheduler_type: constant_with_warmup +max_grad_norm: 0.2 +max_prompt_length: 1024 +max_completion_length: 4096 +max_steps: -1 +num_generations: 16 +num_train_epochs: 1 +output_dir: data/Qwen2.5-7B-Instruct-GRPO_v01.06 +overwrite_output_dir: true +per_device_train_batch_size: 4 +push_to_hub: true +use_liger_kernel: true +report_to: +- wandb +reward_funcs: +- accuracy +- format +reward_weights: +- 1.0 +- 0.2 +save_strategy: "steps" +save_steps: 0.1 +save_total_limit: 1 +seed: 42 +temperature: 0.7 +warmup_ratio: 0.1 +wandb_entity: huggingface +wandb_project: open-r1 \ No newline at end of file diff --git a/recipes/Qwen2.5-7B-Instruct/grpo/config_v01.07.yaml b/recipes/Qwen2.5-7B-Instruct/grpo/config_v01.07.yaml new file mode 100644 index 000000000..c13458da3 --- /dev/null +++ b/recipes/Qwen2.5-7B-Instruct/grpo/config_v01.07.yaml @@ -0,0 +1,61 @@ + +# Model arguments +model_name_or_path: Qwen/Qwen2.5-7B-Instruct +model_revision: main +torch_dtype: bfloat16 +attn_implementation: flash_attention_2 + +# Data training arguments +dataset_name: open-r1/OpenR1-Math-cn_k12-86k +system_prompt: "You are a helpful AI Assistant that provides well-reasoned and detailed responses. 
You first think about the reasoning process as an internal monologue and then provide the user with the answer. Respond in the following format: \n...\n\n\n...\n" + +# GRPO trainer config +beta: 0.001 +bf16: true +do_eval: false +eval_strategy: "no" +use_vllm: true + +do_eval: false +gradient_accumulation_steps: 16 +gradient_checkpointing: true +gradient_checkpointing_kwargs: + use_reentrant: false +hub_model_id: Qwen2.5-7B-Instruct-GRPO +hub_model_revision: v01.07 +hub_strategy: every_save +learning_rate: 1.0e-06 +log_completions: true +log_level: info +logging_first_step: true +logging_steps: 1 +logging_strategy: steps +lr_scheduler_type: constant_with_warmup +max_grad_norm: 0.2 +max_prompt_length: 1024 +max_completion_length: 4096 +max_steps: -1 +num_generations: 16 +num_train_epochs: 1 +output_dir: data/Qwen2.5-7B-Instruct-GRPO_v01.07 +overwrite_output_dir: true +per_device_train_batch_size: 4 +push_to_hub: true +use_liger_kernel: true +report_to: +- wandb +reward_funcs: +- accuracy +- format +reward_weights: +- 1.0 +- 0.2 +save_strategy: "steps" +save_steps: 0.1 +save_total_limit: 1 +seed: 42 +temperature: 0.7 +warmup_ratio: 0.1 +wandb_entity: huggingface +wandb_project: open-r1 +scale_rewards: false \ No newline at end of file diff --git a/recipes/Qwen2.5-7B-Instruct/grpo/config_v01.08.yaml b/recipes/Qwen2.5-7B-Instruct/grpo/config_v01.08.yaml new file mode 100644 index 000000000..5a422d214 --- /dev/null +++ b/recipes/Qwen2.5-7B-Instruct/grpo/config_v01.08.yaml @@ -0,0 +1,62 @@ + +# Model arguments +model_name_or_path: Qwen/Qwen2.5-7B-Instruct +model_revision: main +torch_dtype: bfloat16 +attn_implementation: flash_attention_2 + +# Data training arguments +dataset_name: open-r1/OpenR1-Math-cn_k12-86k +system_prompt: "You are a helpful AI Assistant that provides well-reasoned and detailed responses. You first think about the reasoning process as an internal monologue and then provide the user with the answer. 
Respond in the following format: \n...\n\n\n...\n" + +# GRPO trainer config +beta: 0.0 +bf16: true +do_eval: false +eval_strategy: "no" +use_vllm: true + +do_eval: false +gradient_accumulation_steps: 16 +gradient_checkpointing: true +gradient_checkpointing_kwargs: + use_reentrant: false +hub_model_id: Qwen2.5-7B-Instruct-GRPO +hub_model_revision: v01.08 +hub_strategy: every_save +learning_rate: 1.0e-06 +log_completions: true +log_level: info +logging_first_step: true +logging_steps: 1 +logging_strategy: steps +lr_scheduler_type: constant_with_warmup +max_grad_norm: 0.2 +max_prompt_length: 1024 +max_completion_length: 4096 +max_steps: -1 +num_generations: 16 +num_train_epochs: 1 +output_dir: data/Qwen2.5-7B-Instruct-GRPO_v01.08 +overwrite_output_dir: true +per_device_train_batch_size: 8 +push_to_hub: true +use_liger_kernel: true +report_to: +- wandb +reward_funcs: +- accuracy +- format +reward_weights: +- 1.0 +- 0.2 +save_strategy: "steps" +save_steps: 0.1 +save_total_limit: 1 +seed: 42 +temperature: 0.7 +warmup_ratio: 0.1 +wandb_entity: huggingface +wandb_project: open-r1 +num_iterations: 4 +scale_rewards: false \ No newline at end of file diff --git a/recipes/Qwen2.5-7B-Instruct/grpo/config_v01.09.yaml b/recipes/Qwen2.5-7B-Instruct/grpo/config_v01.09.yaml new file mode 100644 index 000000000..e162a16d6 --- /dev/null +++ b/recipes/Qwen2.5-7B-Instruct/grpo/config_v01.09.yaml @@ -0,0 +1,62 @@ + +# Model arguments +model_name_or_path: Qwen/Qwen2.5-7B-Instruct +model_revision: main +torch_dtype: bfloat16 +attn_implementation: flash_attention_2 + +# Data training arguments +dataset_name: open-r1/OpenR1-Math-cn_k12-86k +system_prompt: "You are a helpful AI Assistant that provides well-reasoned and detailed responses. You first think about the reasoning process as an internal monologue and then provide the user with the answer. 
Respond in the following format: \n...\n\n\n...\n" + +# GRPO trainer config +beta: 0.0 +bf16: true +do_eval: false +eval_strategy: "no" +use_vllm: true + +do_eval: false +gradient_accumulation_steps: 16 +gradient_checkpointing: true +gradient_checkpointing_kwargs: + use_reentrant: false +hub_model_id: Qwen2.5-7B-Instruct-GRPO +hub_model_revision: v01.09 +hub_strategy: every_save +learning_rate: 1.0e-06 +log_completions: true +log_level: info +logging_first_step: true +logging_steps: 1 +logging_strategy: steps +lr_scheduler_type: constant_with_warmup +max_grad_norm: 0.2 +max_prompt_length: 1024 +max_completion_length: 4096 +max_steps: -1 +num_generations: 16 +num_train_epochs: 1 +output_dir: data/Qwen2.5-7B-Instruct-GRPO_v01.09 +overwrite_output_dir: true +per_device_train_batch_size: 8 +push_to_hub: true +use_liger_kernel: true +report_to: +- wandb +reward_funcs: +- accuracy +- format +reward_weights: +- 1.0 +- 0.2 +save_strategy: "steps" +save_steps: 0.1 +save_total_limit: 1 +seed: 42 +temperature: 0.7 +warmup_ratio: 0.1 +wandb_entity: huggingface +wandb_project: open-r1 +num_iterations: 4 +scale_rewards: true \ No newline at end of file diff --git a/recipes/Qwen2.5-7B-Instruct/grpo/config_v05.00.yaml b/recipes/Qwen2.5-7B-Instruct/grpo/config_v05.00.yaml new file mode 100644 index 000000000..5e9156c89 --- /dev/null +++ b/recipes/Qwen2.5-7B-Instruct/grpo/config_v05.00.yaml @@ -0,0 +1,65 @@ +# Model arguments +model_name_or_path: Qwen/Qwen2.5-7B-Instruct +model_revision: main +torch_dtype: bfloat16 +attn_implementation: flash_attention_2 +# Data training arguments +dataset_name: open-r1/verifiable-coding-problems-python_decontaminated-tested + +system_prompt: "You are a helpful AI Assistant that provides well-reasoned and detailed responses. You first think about the reasoning process as an internal monologue and then provide the user with the answer. 
Respond in the following format: \n...\n\n\n...\n" + +# GRPO trainer config +callbacks: +- push_to_hub_revision +benchmarks: +- math_500 +- aime24 +beta: 0.000 +bf16: true +do_eval: false +eval_strategy: "no" +use_vllm: true +vllm_device: auto +vllm_gpu_memory_utilization: 0.7 +do_eval: false +gradient_accumulation_steps: 16 +gradient_checkpointing: true +gradient_checkpointing_kwargs: + use_reentrant: false +hub_model_id: open-r1/Qwen2.5-7B-Instruct-GRPO +hub_model_revision: v05.00 +hub_strategy: every_save +learning_rate: 1.0e-06 +log_completions: true +log_level: info +logging_first_step: true +logging_steps: 1 +logging_strategy: steps +lr_scheduler_type: constant_with_warmup +max_grad_norm: 0.2 +max_prompt_length: 1024 +max_completion_length: 4096 +max_steps: -1 +num_generations: 16 +num_iterations: 4 +num_train_epochs: 1.0 +output_dir: data/Qwen2.5-7B-Instruct-GRPO_v05.00 +overwrite_output_dir: true +per_device_train_batch_size: 8 +push_to_hub: true +report_to: +- wandb +reward_funcs: +- binary_code +- code_format +reward_weights: +- 1.0 +- 0.1 +save_strategy: "steps" +save_steps: 0.1 +save_total_limit: 1 +seed: 42 +temperature: 0.7 +wandb_entity: huggingface +wandb_project: open-r1 +warmup_ratio: 0.1 \ No newline at end of file diff --git a/recipes/Qwen2.5-7B-Instruct/grpo/config_v05.02.yaml b/recipes/Qwen2.5-7B-Instruct/grpo/config_v05.02.yaml new file mode 100644 index 000000000..5d1092b85 --- /dev/null +++ b/recipes/Qwen2.5-7B-Instruct/grpo/config_v05.02.yaml @@ -0,0 +1,65 @@ +# Model arguments +model_name_or_path: Qwen/Qwen2.5-7B-Instruct +model_revision: main +torch_dtype: bfloat16 +attn_implementation: flash_attention_2 +# Data training arguments +dataset_name: open-r1/verifiable-coding-problems-python_decontaminated-tested + +system_prompt: "You are a helpful AI Assistant that provides well-reasoned and detailed responses. You first think about the reasoning process as an internal monologue and then provide the user with the answer. 
Respond in the following format: \n...\n\n\n...\n" + +# GRPO trainer config +callbacks: +- push_to_hub_revision +benchmarks: +- lcb_v4 +beta: 0.000 +bf16: true +do_eval: false +eval_strategy: "no" +use_vllm: true +vllm_device: auto +vllm_gpu_memory_utilization: 0.7 +do_eval: false +gradient_accumulation_steps: 16 +gradient_checkpointing: true +gradient_checkpointing_kwargs: + use_reentrant: false +hub_model_id: open-r1/Qwen2.5-7B-Instruct-GRPO +hub_model_revision: v05.02 +hub_strategy: every_save +learning_rate: 1.0e-06 +log_completions: true +log_level: info +logging_first_step: true +logging_steps: 1 +logging_strategy: steps +lr_scheduler_type: constant_with_warmup +max_grad_norm: 0.2 +max_prompt_length: 1024 +max_completion_length: 4096 +max_steps: -1 +num_generations: 16 +num_iterations: 4 +num_train_epochs: 1.0 +output_dir: data/Qwen2.5-7B-Instruct-GRPO_v05.02 +overwrite_output_dir: true +per_device_train_batch_size: 8 +push_to_hub: true +report_to: +- wandb +reward_funcs: +- code +- code_format +reward_weights: +- 1.0 +- 0.1 +save_strategy: "steps" +save_steps: 0.05 +save_total_limit: 1 +seed: 42 +temperature: 0.7 +wandb_entity: huggingface +wandb_project: open-r1 +warmup_ratio: 0.1 +parallel_code_exec_per_proc: 10 \ No newline at end of file diff --git a/recipes/Qwen2.5-7B-Instruct/grpo/config_v05.03.yaml b/recipes/Qwen2.5-7B-Instruct/grpo/config_v05.03.yaml new file mode 100644 index 000000000..6053143c9 --- /dev/null +++ b/recipes/Qwen2.5-7B-Instruct/grpo/config_v05.03.yaml @@ -0,0 +1,65 @@ +# Model arguments +model_name_or_path: Qwen/Qwen2.5-7B-Instruct +model_revision: main +torch_dtype: bfloat16 +attn_implementation: flash_attention_2 +# Data training arguments +dataset_name: open-r1/verifiable-coding-problems-python_decontaminated-tested + +system_prompt: "You are a helpful AI Assistant that provides well-reasoned and detailed responses. You first think about the reasoning process as an internal monologue and then provide the user with the answer. 
Respond in the following format: \n...\n\n\n...\n" + +# GRPO trainer config +callbacks: +- push_to_hub_revision +benchmarks: +- lcb_v4 +beta: 0.000 +bf16: true +do_eval: false +eval_strategy: "no" +use_vllm: true +vllm_device: auto +vllm_gpu_memory_utilization: 0.7 +do_eval: false +gradient_accumulation_steps: 16 +gradient_checkpointing: true +gradient_checkpointing_kwargs: + use_reentrant: false +hub_model_id: open-r1/Qwen2.5-7B-Instruct-GRPO +hub_model_revision: v05.03 +hub_strategy: every_save +learning_rate: 1.0e-06 +log_completions: true +log_level: info +logging_first_step: true +logging_steps: 1 +logging_strategy: steps +lr_scheduler_type: constant_with_warmup +max_grad_norm: 0.2 +max_prompt_length: 1024 +max_completion_length: 4096 +max_steps: -1 +num_generations: 16 +num_iterations: 4 +num_train_epochs: 1.0 +output_dir: data/Qwen2.5-7B-Instruct-GRPO_v05.03 +overwrite_output_dir: true +per_device_train_batch_size: 8 +push_to_hub: true +report_to: +- wandb +reward_funcs: +- binary_code +- code_format +reward_weights: +- 1.0 +- 0.1 +save_strategy: "steps" +save_steps: 0.05 +save_total_limit: 1 +seed: 42 +temperature: 0.7 +wandb_entity: huggingface +wandb_project: open-r1 +warmup_ratio: 0.1 +parallel_code_exec_per_proc: 10 \ No newline at end of file diff --git a/recipes/Qwen2.5-7B-Instruct/grpo/config_v05.04.yaml b/recipes/Qwen2.5-7B-Instruct/grpo/config_v05.04.yaml new file mode 100644 index 000000000..081ef05c8 --- /dev/null +++ b/recipes/Qwen2.5-7B-Instruct/grpo/config_v05.04.yaml @@ -0,0 +1,65 @@ +# Model arguments +model_name_or_path: Qwen/Qwen2.5-7B-Instruct +model_revision: main +torch_dtype: bfloat16 +attn_implementation: flash_attention_2 +# Data training arguments +dataset_name: open-r1/verifiable-coding-problems-python_decontaminated-tested + +system_prompt: "You are a helpful AI Assistant that provides well-reasoned and detailed responses. You first think about the reasoning process as an internal monologue and then provide the user with the answer. 
Respond in the following format: \n...\n\n\n...\n" + +# GRPO trainer config +callbacks: +- push_to_hub_revision +benchmarks: +- lcb_v4 +beta: 0.000 +bf16: true +do_eval: false +eval_strategy: "no" +use_vllm: true +vllm_device: auto +vllm_gpu_memory_utilization: 0.7 +do_eval: false +gradient_accumulation_steps: 64 +gradient_checkpointing: true +gradient_checkpointing_kwargs: + use_reentrant: false +hub_model_id: open-r1/Qwen2.5-7B-Instruct-GRPO +hub_model_revision: v05.04 +hub_strategy: every_save +learning_rate: 1.0e-06 +log_completions: true +log_level: info +logging_first_step: true +logging_steps: 1 +logging_strategy: steps +lr_scheduler_type: constant_with_warmup +max_grad_norm: 0.2 +max_prompt_length: 1024 +max_completion_length: 4096 +max_steps: -1 +num_generations: 16 +num_iterations: 4 +num_train_epochs: 1.0 +output_dir: data/Qwen2.5-7B-Instruct-GRPO_v05.04 +overwrite_output_dir: true +per_device_train_batch_size: 8 +push_to_hub: true +report_to: +- wandb +reward_funcs: +- binary_code +- code_format +reward_weights: +- 1.0 +- 0.1 +save_strategy: "steps" +save_steps: 0.05 +save_total_limit: 1 +seed: 42 +temperature: 0.7 +wandb_entity: huggingface +wandb_project: open-r1 +warmup_ratio: 0.1 +parallel_code_exec_per_proc: 10 \ No newline at end of file diff --git a/recipes/Qwen2.5-7B-Instruct/grpo/config_v05.05.yaml b/recipes/Qwen2.5-7B-Instruct/grpo/config_v05.05.yaml new file mode 100644 index 000000000..cbb5c5276 --- /dev/null +++ b/recipes/Qwen2.5-7B-Instruct/grpo/config_v05.05.yaml @@ -0,0 +1,67 @@ +# Model arguments +model_name_or_path: Qwen/Qwen2.5-7B-Instruct +model_revision: main +torch_dtype: bfloat16 +attn_implementation: flash_attention_2 +# Data training arguments +dataset_name: open-r1/verifiable-coding-problems-python_decontaminated-tested + +system_prompt: "You are a helpful AI Assistant that provides well-reasoned and detailed responses. You first think about the reasoning process as an internal monologue and then provide the user with the answer. 
Respond in the following format: \n...\n\n\n...\n" + +# GRPO trainer config +callbacks: +- push_to_hub_revision +benchmarks: +- lcb_v4 +beta: 0.000 +bf16: true +do_eval: false +eval_strategy: "no" +use_vllm: true +vllm_device: auto +vllm_gpu_memory_utilization: 0.7 +do_eval: false +gradient_accumulation_steps: 16 +gradient_checkpointing: true +gradient_checkpointing_kwargs: + use_reentrant: false +hub_model_id: open-r1/Qwen2.5-7B-Instruct-GRPO +hub_model_revision: v05.05 +hub_strategy: every_save +learning_rate: 1.0e-06 +log_completions: true +log_level: info +logging_first_step: true +logging_steps: 1 +logging_strategy: steps +lr_scheduler_type: cosine_with_min_lr +lr_scheduler_kwargs: + min_lr_rate: 0.1 +max_grad_norm: 0.2 +max_prompt_length: 1024 +max_completion_length: 4096 +max_steps: -1 +num_generations: 16 +num_iterations: 4 +num_train_epochs: 1.0 +output_dir: data/Qwen2.5-7B-Instruct-GRPO_v05.05 +overwrite_output_dir: true +per_device_train_batch_size: 8 +push_to_hub: true +report_to: +- wandb +reward_funcs: +- binary_code +- code_format +reward_weights: +- 1.0 +- 0.1 +save_strategy: "steps" +save_steps: 0.05 +save_total_limit: 1 +seed: 42 +temperature: 0.7 +wandb_entity: huggingface +wandb_project: open-r1 +warmup_ratio: 0.1 +parallel_code_exec_per_proc: 10 \ No newline at end of file diff --git a/recipes/Qwen2.5-7B-Instruct/grpo/config_v05.06.yaml b/recipes/Qwen2.5-7B-Instruct/grpo/config_v05.06.yaml new file mode 100644 index 000000000..87d47e8b4 --- /dev/null +++ b/recipes/Qwen2.5-7B-Instruct/grpo/config_v05.06.yaml @@ -0,0 +1,65 @@ +# Model arguments +model_name_or_path: Qwen/Qwen2.5-7B-Instruct +model_revision: main +torch_dtype: bfloat16 +attn_implementation: flash_attention_2 +# Data training arguments +dataset_name: open-r1/verifiable-coding-problems-python_decontaminated-tested + +system_prompt: "You are a helpful AI Assistant that provides well-reasoned and detailed responses. You first think about the reasoning process as an internal monologue and then provide the user with the answer. 
Respond in the following format: \n...\n\n\n...\n" + +# GRPO trainer config +callbacks: +- push_to_hub_revision +benchmarks: +- lcb_v4 +beta: 0.000 +bf16: true +do_eval: false +eval_strategy: "no" +use_vllm: true +vllm_device: auto +vllm_gpu_memory_utilization: 0.7 +do_eval: false +gradient_accumulation_steps: 64 +gradient_checkpointing: true +gradient_checkpointing_kwargs: + use_reentrant: false +hub_model_id: open-r1/Qwen2.5-7B-Instruct-GRPO +hub_model_revision: v05.06 +hub_strategy: every_save +learning_rate: 1.0e-06 +log_completions: true +log_level: info +logging_first_step: true +logging_steps: 1 +logging_strategy: steps +lr_scheduler_type: constant_with_warmup +max_grad_norm: 0.2 +max_prompt_length: 1024 +max_completion_length: 4096 +max_steps: -1 +num_generations: 64 +num_iterations: 4 +num_train_epochs: 1.0 +output_dir: data/Qwen2.5-7B-Instruct-GRPO_v05.06 +overwrite_output_dir: true +per_device_train_batch_size: 8 +push_to_hub: true +report_to: +- wandb +reward_funcs: +- binary_code +- code_format +reward_weights: +- 1.0 +- 0.1 +save_strategy: "steps" +save_steps: 0.05 +save_total_limit: 1 +seed: 42 +temperature: 0.7 +wandb_entity: huggingface +wandb_project: open-r1 +warmup_ratio: 0.1 +parallel_code_exec_per_proc: 10 \ No newline at end of file diff --git a/recipes/Qwen2.5-7B-Instruct/grpo/config_v05.07.yaml b/recipes/Qwen2.5-7B-Instruct/grpo/config_v05.07.yaml new file mode 100644 index 000000000..cb0e44266 --- /dev/null +++ b/recipes/Qwen2.5-7B-Instruct/grpo/config_v05.07.yaml @@ -0,0 +1,67 @@ +# Model arguments +model_name_or_path: Qwen/Qwen2.5-7B-Instruct +model_revision: main +torch_dtype: bfloat16 +attn_implementation: flash_attention_2 +# Data training arguments +dataset_name: open-r1/verifiable-coding-problems-python_decontaminated-tested + +system_prompt: "You are a helpful AI Assistant that provides well-reasoned and detailed responses. You first think about the reasoning process as an internal monologue and then provide the user with the answer. 
Respond in the following format: \n...\n\n\n...\n" + +# GRPO trainer config +callbacks: +- push_to_hub_revision +benchmarks: +- lcb_v4 +beta: 0.000 +bf16: true +do_eval: false +eval_strategy: "no" +use_vllm: true +vllm_device: auto +vllm_gpu_memory_utilization: 0.7 +do_eval: false +gradient_accumulation_steps: 64 +gradient_checkpointing: true +gradient_checkpointing_kwargs: + use_reentrant: false +hub_model_id: open-r1/Qwen2.5-7B-Instruct-GRPO +hub_model_revision: v05.07 +hub_strategy: every_save +learning_rate: 1.0e-06 +log_completions: true +log_level: info +logging_first_step: true +logging_steps: 1 +logging_strategy: steps +lr_scheduler_type: cosine_with_min_lr +lr_scheduler_kwargs: + min_lr_rate: 0.1 +max_grad_norm: 0.2 +max_prompt_length: 1024 +max_completion_length: 4096 +max_steps: -1 +num_generations: 64 +num_iterations: 4 +num_train_epochs: 1.0 +output_dir: data/Qwen2.5-7B-Instruct-GRPO_v05.07 +overwrite_output_dir: true +per_device_train_batch_size: 8 +push_to_hub: true +report_to: +- wandb +reward_funcs: +- binary_code +- code_format +reward_weights: +- 1.0 +- 0.1 +save_strategy: "steps" +save_steps: 0.05 +save_total_limit: 1 +seed: 42 +temperature: 0.7 +wandb_entity: huggingface +wandb_project: open-r1 +warmup_ratio: 0.1 +parallel_code_exec_per_proc: 10 \ No newline at end of file diff --git a/recipes/Qwen2.5-7B-Instruct/grpo/config_v05.08.yaml b/recipes/Qwen2.5-7B-Instruct/grpo/config_v05.08.yaml new file mode 100644 index 000000000..d21020068 --- /dev/null +++ b/recipes/Qwen2.5-7B-Instruct/grpo/config_v05.08.yaml @@ -0,0 +1,67 @@ +# Model arguments +model_name_or_path: Qwen/Qwen2.5-7B-Instruct +model_revision: main +torch_dtype: bfloat16 +attn_implementation: flash_attention_2 +# Data training arguments +dataset_name: open-r1/verifiable-coding-problems-python_decontaminated-tested-shuffled + +system_prompt: "You are a helpful AI Assistant that provides well-reasoned and detailed responses. You first think about the reasoning process as an internal monologue and then provide the user with the answer. 
Respond in the following format: \n...\n\n\n...\n" + +# GRPO trainer config +callbacks: +- push_to_hub_revision +benchmarks: +- lcb_v4 +beta: 0.000 +bf16: true +do_eval: false +eval_strategy: "no" +use_vllm: true +vllm_device: auto +vllm_gpu_memory_utilization: 0.7 +do_eval: false +gradient_accumulation_steps: 16 +gradient_checkpointing: true +gradient_checkpointing_kwargs: + use_reentrant: false +hub_model_id: open-r1/Qwen2.5-7B-Instruct-GRPO +hub_model_revision: v05.08 +hub_strategy: every_save +learning_rate: 1.0e-06 +log_completions: true +log_level: info +logging_first_step: true +logging_steps: 1 +logging_strategy: steps +lr_scheduler_type: cosine_with_min_lr +lr_scheduler_kwargs: + min_lr_rate: 0.1 +max_grad_norm: 0.2 +max_prompt_length: 1024 +max_completion_length: 4096 +max_steps: -1 +num_generations: 16 +num_iterations: 4 +num_train_epochs: 1.0 +output_dir: data/Qwen2.5-7B-Instruct-GRPO_v05.08 +overwrite_output_dir: true +per_device_train_batch_size: 8 +push_to_hub: true +report_to: +- wandb +reward_funcs: +- binary_code +- code_format +reward_weights: +- 1.0 +- 0.1 +save_strategy: "steps" +save_steps: 0.05 +save_total_limit: 1 +seed: 42 +temperature: 0.7 +wandb_entity: huggingface +wandb_project: open-r1 +warmup_ratio: 0.1 +parallel_code_exec_per_proc: 4 \ No newline at end of file diff --git a/recipes/Qwen2.5-7B-Instruct/grpo/config_v05.09.yaml b/recipes/Qwen2.5-7B-Instruct/grpo/config_v05.09.yaml new file mode 100644 index 000000000..b8d5eb696 --- /dev/null +++ b/recipes/Qwen2.5-7B-Instruct/grpo/config_v05.09.yaml @@ -0,0 +1,67 @@ +# Model arguments +model_name_or_path: Qwen/Qwen2.5-7B-Instruct +model_revision: main +torch_dtype: bfloat16 +attn_implementation: flash_attention_2 +# Data training arguments +dataset_name: open-r1/verifiable-coding-problems-python_decontaminated-tested-shuffled + +system_prompt: "You are a helpful AI Assistant that provides well-reasoned and detailed responses. You first think about the reasoning process as an internal monologue and then provide the user with the answer. 
Respond in the following format: \n...\n\n\n...\n" + +# GRPO trainer config +callbacks: +- push_to_hub_revision +benchmarks: +- lcb_v4 +beta: 0.000 +bf16: true +do_eval: false +eval_strategy: "no" +use_vllm: true +vllm_device: auto +vllm_gpu_memory_utilization: 0.7 +do_eval: false +gradient_accumulation_steps: 16 +gradient_checkpointing: true +gradient_checkpointing_kwargs: + use_reentrant: false +hub_model_id: open-r1/Qwen2.5-7B-Instruct-GRPO +hub_model_revision: v05.09 +hub_strategy: every_save +learning_rate: 5.0e-06 +log_completions: true +log_level: info +logging_first_step: true +logging_steps: 1 +logging_strategy: steps +lr_scheduler_type: cosine_with_min_lr +lr_scheduler_kwargs: + min_lr_rate: 0.1 +max_grad_norm: 0.2 +max_prompt_length: 1024 +max_completion_length: 4096 +max_steps: -1 +num_generations: 16 +num_iterations: 4 +num_train_epochs: 1.0 +output_dir: data/Qwen2.5-7B-Instruct-GRPO_v05.09 +overwrite_output_dir: true +per_device_train_batch_size: 8 +push_to_hub: true +report_to: +- wandb +reward_funcs: +- binary_code +- code_format +reward_weights: +- 1.0 +- 0.1 +save_strategy: "steps" +save_steps: 0.05 +save_total_limit: 1 +seed: 42 +temperature: 0.7 +wandb_entity: huggingface +wandb_project: open-r1 +warmup_ratio: 0.1 +parallel_code_exec_per_proc: 4 \ No newline at end of file diff --git a/recipes/Qwen2.5-7B-Instruct/grpo/config_v05.10.yaml b/recipes/Qwen2.5-7B-Instruct/grpo/config_v05.10.yaml new file mode 100644 index 000000000..3ea1630b3 --- /dev/null +++ b/recipes/Qwen2.5-7B-Instruct/grpo/config_v05.10.yaml @@ -0,0 +1,67 @@ +# Model arguments +model_name_or_path: Qwen/Qwen2.5-7B-Instruct +model_revision: main +torch_dtype: bfloat16 +attn_implementation: flash_attention_2 +# Data training arguments +dataset_name: open-r1/verifiable-coding-problems-python_decontaminated-tested-shuffled + +system_prompt: "You are a helpful AI Assistant that provides well-reasoned and detailed responses. You first think about the reasoning process as an internal monologue and then provide the user with the answer. 
Respond in the following format: \n...\n\n\n...\n" + +# GRPO trainer config +callbacks: +- push_to_hub_revision +benchmarks: +- lcb_v4 +beta: 0.000 +bf16: true +do_eval: false +eval_strategy: "no" +use_vllm: true +vllm_device: auto +vllm_gpu_memory_utilization: 0.7 +do_eval: false +gradient_accumulation_steps: 16 +gradient_checkpointing: true +gradient_checkpointing_kwargs: + use_reentrant: false +hub_model_id: open-r1/Qwen2.5-7B-Instruct-GRPO +hub_model_revision: v05.10 +hub_strategy: every_save +learning_rate: 1.0e-05 +log_completions: true +log_level: info +logging_first_step: true +logging_steps: 1 +logging_strategy: steps +lr_scheduler_type: cosine_with_min_lr +lr_scheduler_kwargs: + min_lr_rate: 0.1 +max_grad_norm: 0.2 +max_prompt_length: 1024 +max_completion_length: 4096 +max_steps: -1 +num_generations: 16 +num_iterations: 4 +num_train_epochs: 1.0 +output_dir: data/Qwen2.5-7B-Instruct-GRPO_v05.10 +overwrite_output_dir: true +per_device_train_batch_size: 8 +push_to_hub: true +report_to: +- wandb +reward_funcs: +- binary_code +- code_format +reward_weights: +- 1.0 +- 0.1 +save_strategy: "steps" +save_steps: 0.05 +save_total_limit: 1 +seed: 42 +temperature: 0.7 +wandb_entity: huggingface +wandb_project: open-r1 +warmup_ratio: 0.1 +parallel_code_exec_per_proc: 4 \ No newline at end of file diff --git a/recipes/Qwen2.5-7B-Instruct/grpo/config_v05.11.yaml b/recipes/Qwen2.5-7B-Instruct/grpo/config_v05.11.yaml new file mode 100644 index 000000000..50997275d --- /dev/null +++ b/recipes/Qwen2.5-7B-Instruct/grpo/config_v05.11.yaml @@ -0,0 +1,67 @@ +# Model arguments +model_name_or_path: Qwen/Qwen2.5-7B-Instruct +model_revision: main +torch_dtype: bfloat16 +attn_implementation: flash_attention_2 +# Data training arguments +dataset_name: open-r1/verifiable-coding-problems-python_decontaminated-tested-shuffled + +system_prompt: "You are a helpful AI Assistant that provides well-reasoned and detailed responses. You first think about the reasoning process as an internal monologue and then provide the user with the answer. 
Respond in the following format: \n...\n\n\n...\n" + +# GRPO trainer config +callbacks: +- push_to_hub_revision +benchmarks: +- lcb_v4 +beta: 0.000 +bf16: true +do_eval: false +eval_strategy: "no" +use_vllm: true +vllm_device: auto +vllm_gpu_memory_utilization: 0.7 +do_eval: false +gradient_accumulation_steps: 16 +gradient_checkpointing: true +gradient_checkpointing_kwargs: + use_reentrant: false +hub_model_id: open-r1/Qwen2.5-7B-Instruct-GRPO +hub_model_revision: v05.11 +hub_strategy: every_save +learning_rate: 4.0e-05 +log_completions: true +log_level: info +logging_first_step: true +logging_steps: 1 +logging_strategy: steps +lr_scheduler_type: cosine_with_min_lr +lr_scheduler_kwargs: + min_lr_rate: 0.1 +max_grad_norm: 0.2 +max_prompt_length: 1024 +max_completion_length: 4096 +max_steps: -1 +num_generations: 16 +num_iterations: 4 +num_train_epochs: 1.0 +output_dir: data/Qwen2.5-7B-Instruct-GRPO_v05.11 +overwrite_output_dir: true +per_device_train_batch_size: 8 +push_to_hub: true +report_to: +- wandb +reward_funcs: +- binary_code +- code_format +reward_weights: +- 1.0 +- 0.1 +save_strategy: "steps" +save_steps: 0.05 +save_total_limit: 1 +seed: 42 +temperature: 0.7 +wandb_entity: huggingface +wandb_project: open-r1 +warmup_ratio: 0.1 +parallel_code_exec_per_proc: 4 \ No newline at end of file diff --git a/recipes/Qwen2.5-7B-Instruct/grpo/config_v05.12.yaml b/recipes/Qwen2.5-7B-Instruct/grpo/config_v05.12.yaml new file mode 100644 index 000000000..0628f4822 --- /dev/null +++ b/recipes/Qwen2.5-7B-Instruct/grpo/config_v05.12.yaml @@ -0,0 +1,67 @@ +# Model arguments +model_name_or_path: Qwen/Qwen2.5-7B-Instruct +model_revision: main +torch_dtype: bfloat16 +attn_implementation: flash_attention_2 +# Data training arguments +dataset_name: open-r1/verifiable-coding-problems-python_decontaminated-tested-shuffled + +system_prompt: "You are a helpful AI Assistant that provides well-reasoned and detailed responses. You first think about the reasoning process as an internal monologue and then provide the user with the answer. 
Respond in the following format: \n...\n\n\n...\n" + +# GRPO trainer config +callbacks: +- push_to_hub_revision +benchmarks: +- lcb_v4 +beta: 0.000 +bf16: true +do_eval: false +eval_strategy: "no" +use_vllm: true +vllm_device: auto +vllm_gpu_memory_utilization: 0.7 +do_eval: false +gradient_accumulation_steps: 16 +gradient_checkpointing: true +gradient_checkpointing_kwargs: + use_reentrant: false +hub_model_id: open-r1/Qwen2.5-7B-Instruct-GRPO +hub_model_revision: v05.12 +hub_strategy: every_save +learning_rate: 5.0e-07 +log_completions: true +log_level: info +logging_first_step: true +logging_steps: 1 +logging_strategy: steps +lr_scheduler_type: cosine_with_min_lr +lr_scheduler_kwargs: + min_lr_rate: 0.1 +max_grad_norm: 0.2 +max_prompt_length: 1024 +max_completion_length: 4096 +max_steps: -1 +num_generations: 16 +num_iterations: 4 +num_train_epochs: 1.0 +output_dir: data/Qwen2.5-7B-Instruct-GRPO_v05.12 +overwrite_output_dir: true +per_device_train_batch_size: 8 +push_to_hub: true +report_to: +- wandb +reward_funcs: +- binary_code +- code_format +reward_weights: +- 1.0 +- 0.1 +save_strategy: "steps" +save_steps: 0.05 +save_total_limit: 1 +seed: 42 +temperature: 0.7 +wandb_entity: huggingface +wandb_project: open-r1 +warmup_ratio: 0.1 +parallel_code_exec_per_proc: 4 \ No newline at end of file diff --git a/recipes/Qwen2.5-7B-Instruct/grpo/config_v05.13.yaml b/recipes/Qwen2.5-7B-Instruct/grpo/config_v05.13.yaml new file mode 100644 index 000000000..aa4c0f763 --- /dev/null +++ b/recipes/Qwen2.5-7B-Instruct/grpo/config_v05.13.yaml @@ -0,0 +1,66 @@ +# Model arguments +model_name_or_path: Qwen/Qwen2.5-7B-Instruct +model_revision: main +torch_dtype: bfloat16 +attn_implementation: flash_attention_2 +# Data training arguments +dataset_name: open-r1/verifiable-coding-problems-python_decontaminated-tested-shuffled +dataset_prompt_column: problem + +system_prompt: "You are a helpful AI Assistant that provides well-reasoned and detailed responses. You first think about the reasoning process as an internal monologue and then provide the user with the answer. 
Respond in the following format: \n...\n\n\n...\n" + +# GRPO trainer config +callbacks: +- push_to_hub_revision +benchmarks: +- lcb_v4 +beta: 0.000 +bf16: true +do_eval: false +eval_strategy: "no" +use_vllm: true +vllm_device: auto +vllm_gpu_memory_utilization: 0.7 +do_eval: false +gradient_accumulation_steps: 16 +gradient_checkpointing: true +gradient_checkpointing_kwargs: + use_reentrant: false +hub_model_id: open-r1/Qwen2.5-7B-Instruct-GRPO +hub_model_revision: v05.13 +hub_strategy: every_save +learning_rate: 5.0e-07 +log_completions: true +log_level: info +logging_first_step: true +logging_steps: 1 +logging_strategy: steps +lr_scheduler_type: constant_with_warmup +max_grad_norm: 0.2 +max_prompt_length: 1024 +max_completion_length: 4096 +max_steps: -1 +num_generations: 16 +num_iterations: 4 +num_train_epochs: 1.0 +output_dir: data/Qwen2.5-7B-Instruct-GRPO_v05.13 +overwrite_output_dir: true +per_device_train_batch_size: 8 +push_to_hub: true +report_to: +- wandb +reward_funcs: +- binary_code +- code_format +reward_weights: +- 1.0 +- 0.1 +save_strategy: "steps" +save_steps: 0.05 +save_total_limit: 1 +seed: 42 +temperature: 0.7 +wandb_entity: huggingface +wandb_project: open-r1 +warmup_ratio: 0.1 +parallel_code_exec_per_proc: 10 \ No newline at end of file diff --git a/recipes/Qwen2.5-7B-Instruct/grpo/config_v05.14.yaml b/recipes/Qwen2.5-7B-Instruct/grpo/config_v05.14.yaml new file mode 100644 index 000000000..4891fb2c7 --- /dev/null +++ b/recipes/Qwen2.5-7B-Instruct/grpo/config_v05.14.yaml @@ -0,0 +1,66 @@ +# Model arguments +model_name_or_path: Qwen/Qwen2.5-7B-Instruct +model_revision: main +torch_dtype: bfloat16 +attn_implementation: flash_attention_2 +# Data training arguments +dataset_name: open-r1/verifiable-coding-problems-python_decontaminated-tested-shuffled +dataset_prompt_column: problem + +system_prompt: "You are a helpful AI Assistant that provides well-reasoned and detailed responses. You first think about the reasoning process as an internal monologue and then provide the user with the answer. 
Respond in the following format: \n...\n\n\n...\n" + +# GRPO trainer config +callbacks: +- push_to_hub_revision +benchmarks: +- lcb_v4 +beta: 0.000 +bf16: true +do_eval: false +eval_strategy: "no" +use_vllm: true +vllm_device: auto +vllm_gpu_memory_utilization: 0.7 +do_eval: false +gradient_accumulation_steps: 16 +gradient_checkpointing: true +gradient_checkpointing_kwargs: + use_reentrant: false +hub_model_id: open-r1/Qwen2.5-7B-Instruct-GRPO +hub_model_revision: v05.14 +hub_strategy: every_save +learning_rate: 5.0e-07 +log_completions: true +log_level: info +logging_first_step: true +logging_steps: 1 +logging_strategy: steps +lr_scheduler_type: constant_with_warmup +max_grad_norm: 0.2 +max_prompt_length: 1024 +max_completion_length: 4096 +max_steps: -1 +num_generations: 16 +num_iterations: 4 +num_train_epochs: 1.0 +output_dir: data/Qwen2.5-7B-Instruct-GRPO_v05.14 +overwrite_output_dir: true +per_device_train_batch_size: 8 +push_to_hub: true +report_to: +- wandb +reward_funcs: +- binary_code +- code_format +reward_weights: +- 1.0 +- 0.1 +save_strategy: "steps" +save_steps: 0.05 +save_total_limit: 1 +seed: 42 +temperature: 0.7 +wandb_entity: huggingface +wandb_project: open-r1 +warmup_ratio: 0.01 +parallel_code_exec_per_proc: 10 \ No newline at end of file diff --git a/recipes/Qwen2.5-7B-Instruct/grpo/config_v05.15.yaml b/recipes/Qwen2.5-7B-Instruct/grpo/config_v05.15.yaml new file mode 100644 index 000000000..97fd1c3f6 --- /dev/null +++ b/recipes/Qwen2.5-7B-Instruct/grpo/config_v05.15.yaml @@ -0,0 +1,65 @@ +# Model arguments +model_name_or_path: Qwen/Qwen2.5-7B-Instruct +model_revision: main +torch_dtype: bfloat16 +attn_implementation: flash_attention_2 +# Data training arguments +dataset_name: open-r1/verifiable-coding-problems-python_decontaminated-tested-shuffled +dataset_prompt_column: problem +system_prompt: "You are a helpful AI Assistant that provides well-reasoned and detailed responses. You first think about the reasoning process as an internal monologue and then provide the user with the answer. 
Respond in the following format: \n...\n\n\n...\n" + +# GRPO trainer config +callbacks: +- push_to_hub_revision +benchmarks: +- lcb_v4 +beta: 0.000 +bf16: true +do_eval: false +eval_strategy: "no" +use_vllm: true +vllm_device: auto +vllm_gpu_memory_utilization: 0.7 +do_eval: false +gradient_accumulation_steps: 16 +gradient_checkpointing: true +gradient_checkpointing_kwargs: + use_reentrant: false +hub_model_id: open-r1/Qwen2.5-7B-Instruct-GRPO +hub_model_revision: v05.15 +hub_strategy: every_save +learning_rate: 5.0e-07 +log_completions: true +log_level: info +logging_first_step: true +logging_steps: 1 +logging_strategy: steps +lr_scheduler_type: constant_with_warmup +max_grad_norm: 1.0 +max_prompt_length: 1024 +max_completion_length: 4096 +max_steps: -1 +num_generations: 16 +num_iterations: 4 +num_train_epochs: 1.0 +output_dir: data/Qwen2.5-7B-Instruct-GRPO_v05.15 +overwrite_output_dir: true +per_device_train_batch_size: 8 +push_to_hub: true +report_to: +- wandb +reward_funcs: +- binary_code +- code_format +reward_weights: +- 1.0 +- 0.1 +save_strategy: "steps" +save_steps: 0.05 +save_total_limit: 1 +seed: 42 +temperature: 0.7 +wandb_entity: huggingface +wandb_project: open-r1 +warmup_ratio: 0.01 +parallel_code_exec_per_proc: 10 \ No newline at end of file diff --git a/recipes/Qwen2.5-7B-Instruct/grpo/config_v05.16.yaml b/recipes/Qwen2.5-7B-Instruct/grpo/config_v05.16.yaml new file mode 100644 index 000000000..361c8d898 --- /dev/null +++ b/recipes/Qwen2.5-7B-Instruct/grpo/config_v05.16.yaml @@ -0,0 +1,67 @@ +# Model arguments +model_name_or_path: Qwen/Qwen2.5-7B-Instruct +model_revision: main +torch_dtype: bfloat16 +attn_implementation: flash_attention_2 +# Data training arguments +dataset_name: open-r1/verifiable-coding-problems-python_decontaminated-tested +dataset_prompt_column: problem + +system_prompt: "You are a helpful AI Assistant that provides well-reasoned and detailed responses. You first think about the reasoning process as an internal monologue and then provide the user with the answer. 
Respond in the following format: \n...\n\n\n...\n" + +# GRPO trainer config +callbacks: +- push_to_hub_revision +benchmarks: +- lcb_v4 +beta: 0.000 +bf16: true +do_eval: false +eval_strategy: "no" +use_vllm: true +vllm_device: auto +vllm_gpu_memory_utilization: 0.7 +do_eval: false +gradient_accumulation_steps: 16 +gradient_checkpointing: true +gradient_checkpointing_kwargs: + use_reentrant: false +hub_model_id: open-r1/Qwen2.5-7B-Instruct-GRPO +hub_model_revision: v05.16 +hub_strategy: every_save +learning_rate: 1.0e-06 +log_completions: true +log_level: info +logging_first_step: true +logging_steps: 1 +logging_strategy: steps +lr_scheduler_type: constant_with_warmup +max_grad_norm: 0.2 +max_prompt_length: 1024 +max_completion_length: 4096 +max_steps: -1 +num_generations: 16 +num_iterations: 4 +num_train_epochs: 1.0 +output_dir: data/Qwen2.5-7B-Instruct-GRPO_v05.16 +overwrite_output_dir: true +per_device_train_batch_size: 8 +push_to_hub: true +report_to: +- wandb +reward_funcs: +- binary_code +- code_format +e2b_router_url: "ip-10-53-85-124:8000" +reward_weights: +- 1.0 +- 0.1 +save_strategy: "steps" +save_steps: 0.05 +save_total_limit: 1 +seed: 42 +temperature: 0.7 +wandb_entity: huggingface +wandb_project: open-r1 +warmup_ratio: 0.1 +parallel_code_exec_per_proc: 10 \ No newline at end of file diff --git a/recipes/Qwen2.5-7B-Instruct/grpo/config_v05.17.yaml b/recipes/Qwen2.5-7B-Instruct/grpo/config_v05.17.yaml new file mode 100644 index 000000000..072b61c4b --- /dev/null +++ b/recipes/Qwen2.5-7B-Instruct/grpo/config_v05.17.yaml @@ -0,0 +1,67 @@ +# Model arguments +model_name_or_path: Qwen/Qwen2.5-7B-Instruct +model_revision: main +torch_dtype: bfloat16 +attn_implementation: flash_attention_2 +# Data training arguments +dataset_name: open-r1/verifiable-coding-problems-python_decontaminated-tested +dataset_prompt_column: problem + +system_prompt: "You are a helpful AI Assistant that provides well-reasoned and detailed responses. You first think about the reasoning process as an internal monologue and then provide the user with the answer. 
Respond in the following format: \n...\n\n\n...\n" + +# GRPO trainer config +callbacks: +- push_to_hub_revision +benchmarks: +- lcb_v4 +beta: 0.000 +bf16: true +do_eval: false +eval_strategy: "no" +use_vllm: true +vllm_device: auto +vllm_gpu_memory_utilization: 0.7 +do_eval: false +gradient_accumulation_steps: 16 +gradient_checkpointing: true +gradient_checkpointing_kwargs: + use_reentrant: false +hub_model_id: open-r1/Qwen2.5-7B-Instruct-GRPO +hub_model_revision: v05.17 +hub_strategy: every_save +learning_rate: 1.0e-06 +log_completions: true +log_level: info +logging_first_step: true +logging_steps: 1 +logging_strategy: steps +lr_scheduler_type: constant_with_warmup +max_grad_norm: 0.2 +max_prompt_length: 1024 +max_completion_length: 4096 +max_steps: -1 +num_generations: 16 +num_iterations: 4 +num_train_epochs: 1.0 +output_dir: data/Qwen2.5-7B-Instruct-GRPO_v05.17 +overwrite_output_dir: true +per_device_train_batch_size: 8 +push_to_hub: true +report_to: +- wandb +reward_funcs: +- binary_code +- code_format +e2b_router_url: "ip-10-53-85-124:8000" +reward_weights: +- 1.0 +- 0.1 +save_strategy: "steps" +save_steps: 0.05 +save_total_limit: 1 +seed: 42 +temperature: 0.4 +wandb_entity: huggingface +wandb_project: open-r1 +warmup_ratio: 0.1 +parallel_code_exec_per_proc: 10 \ No newline at end of file diff --git a/recipes/Qwen2.5-7B-Instruct/grpo/config_v05.18.yaml b/recipes/Qwen2.5-7B-Instruct/grpo/config_v05.18.yaml new file mode 100644 index 000000000..585b7155b --- /dev/null +++ b/recipes/Qwen2.5-7B-Instruct/grpo/config_v05.18.yaml @@ -0,0 +1,67 @@ +# Model arguments +model_name_or_path: Qwen/Qwen2.5-7B-Instruct +model_revision: main +torch_dtype: bfloat16 +attn_implementation: flash_attention_2 +# Data training arguments +dataset_name: open-r1/verifiable-coding-problems-python_decontaminated-tested +dataset_prompt_column: problem + +system_prompt: "You are a helpful AI Assistant that provides well-reasoned and detailed responses. You first think about the reasoning process as an internal monologue and then provide the user with the answer. 
Respond in the following format: \n...\n\n\n...\n" + +# GRPO trainer config +callbacks: +- push_to_hub_revision +benchmarks: +- lcb_v4 +beta: 0.000 +bf16: true +do_eval: false +eval_strategy: "no" +use_vllm: true +vllm_device: auto +vllm_gpu_memory_utilization: 0.7 +do_eval: false +gradient_accumulation_steps: 16 +gradient_checkpointing: true +gradient_checkpointing_kwargs: + use_reentrant: false +hub_model_id: open-r1/Qwen2.5-7B-Instruct-GRPO +hub_model_revision: v05.18 +hub_strategy: every_save +learning_rate: 1.0e-06 +log_completions: true +log_level: info +logging_first_step: true +logging_steps: 1 +logging_strategy: steps +lr_scheduler_type: constant_with_warmup +max_grad_norm: 0.2 +max_prompt_length: 1024 +max_completion_length: 4096 +max_steps: -1 +num_generations: 16 +num_iterations: 4 +num_train_epochs: 1.0 +output_dir: data/Qwen2.5-7B-Instruct-GRPO_v05.18 +overwrite_output_dir: true +per_device_train_batch_size: 8 +push_to_hub: true +report_to: +- wandb +reward_funcs: +- binary_code +- code_format +e2b_router_url: "ip-10-53-85-124:8000" +reward_weights: +- 1.0 +- 0.1 +save_strategy: "steps" +save_steps: 0.05 +save_total_limit: 1 +seed: 42 +temperature: 1.0 +wandb_entity: huggingface +wandb_project: open-r1 +warmup_ratio: 0.1 +parallel_code_exec_per_proc: 10 \ No newline at end of file diff --git a/recipes/Qwen2.5-7B-Instruct/grpo/config_v05.20.yaml b/recipes/Qwen2.5-7B-Instruct/grpo/config_v05.20.yaml new file mode 100644 index 000000000..0ae2b55ba --- /dev/null +++ b/recipes/Qwen2.5-7B-Instruct/grpo/config_v05.20.yaml @@ -0,0 +1,69 @@ +# Model arguments +model_name_or_path: Qwen/Qwen2.5-7B-Instruct +model_revision: main +torch_dtype: bfloat16 +attn_implementation: flash_attention_2 +# Data training arguments +dataset_name: open-r1/verifiable-coding-problems-python_decontaminated-tested +dataset_prompt_column: problem + +system_prompt: "You are a helpful AI Assistant that provides well-reasoned and detailed responses. You first think about the reasoning process as an internal monologue and then provide the user with the answer. 
Respond in the following format: \n...\n\n\n...\n" + +# GRPO trainer config +callbacks: +- push_to_hub_revision +benchmarks: +- lcb_v4 +beta: 0.000 +bf16: true +do_eval: false +eval_strategy: "no" +use_vllm: true +vllm_device: auto +vllm_gpu_memory_utilization: 0.7 +do_eval: false +gradient_accumulation_steps: 16 +gradient_checkpointing: true +gradient_checkpointing_kwargs: + use_reentrant: false +hub_model_id: open-r1/Qwen2.5-7B-Instruct-GRPO +hub_model_revision: v05.20 +hub_strategy: every_save +learning_rate: 1.0e-06 +log_completions: true +log_level: info +logging_first_step: true +logging_steps: 1 +logging_strategy: steps +lr_scheduler_type: constant_with_warmup +max_grad_norm: 0.2 +max_prompt_length: 1024 +max_completion_length: 4096 +max_steps: -1 +num_generations: 16 +num_iterations: 4 +num_train_epochs: 1.0 +output_dir: data/Qwen2.5-7B-Instruct-GRPO_v05.20 +overwrite_output_dir: true +per_device_train_batch_size: 8 +push_to_hub: true +report_to: +- wandb +reward_funcs: +- binary_code +- code_format +e2b_router_url: ip-10-53-95-216:8000 +reward_weights: +- 1.0 +- 0.1 +save_strategy: "steps" +save_steps: 0.1 +save_total_limit: 1 +seed: 42 +temperature: 0.7 +wandb_entity: huggingface +wandb_project: open-r1 +warmup_ratio: 0.1 +parallel_code_exec_per_proc: 10 + +mask_truncated_completions: true \ No newline at end of file diff --git a/recipes/Qwen2.5-7B-Instruct/grpo/config_v05.30.yaml b/recipes/Qwen2.5-7B-Instruct/grpo/config_v05.30.yaml new file mode 100644 index 000000000..239afa7a8 --- /dev/null +++ b/recipes/Qwen2.5-7B-Instruct/grpo/config_v05.30.yaml @@ -0,0 +1,70 @@ +# Model arguments +model_name_or_path: Qwen/Qwen2.5-7B-Instruct +model_revision: main +torch_dtype: bfloat16 +attn_implementation: flash_attention_2 +# Data training arguments +dataset_name: open-r1/verifiable-coding-problems-python_decontaminated-tested-shuffled +dataset_prompt_column: problem + +system_prompt: "You are a helpful AI Assistant that provides well-reasoned and detailed responses. You first think about the reasoning process as an internal monologue and then provide the user with the answer. 
Respond in the following format: \n...\n\n\n...\n" + +# GRPO trainer config +callbacks: +- push_to_hub_revision +benchmarks: +- lcb_v4 +beta: 0.000 +bf16: true +do_eval: false +eval_strategy: "no" +use_vllm: true +vllm_device: auto +vllm_gpu_memory_utilization: 0.7 +do_eval: false +gradient_accumulation_steps: 16 +gradient_checkpointing: true +gradient_checkpointing_kwargs: + use_reentrant: false +hub_model_id: open-r1/Qwen2.5-7B-Instruct-GRPO +hub_model_revision: v05.30 +hub_strategy: every_save +learning_rate: 1.0e-06 +log_completions: true +log_level: info +logging_first_step: true +logging_steps: 1 +logging_strategy: steps +lr_scheduler_type: constant_with_warmup +max_grad_norm: 0.2 +max_prompt_length: 1024 +max_completion_length: 4096 +max_steps: -1 +num_generations: 16 +num_iterations: 4 +num_train_epochs: 1.0 +output_dir: data/Qwen2.5-7B-Instruct-GRPO_v05.30 +overwrite_output_dir: true +per_device_train_batch_size: 8 +push_to_hub: true +report_to: +- wandb +reward_funcs: +- binary_code +- code_format +e2b_router_url: ip-10-53-95-216:8000 +reward_weights: +- 1.0 +- 0.1 +save_strategy: "steps" +save_steps: 0.1 +save_total_limit: 1 +seed: 42 +temperature: 0.7 +wandb_entity: huggingface +wandb_project: open-r1 +warmup_ratio: 0.1 +parallel_code_exec_per_proc: 10 + +mask_truncated_completions: true +loss_type: dr_grpo \ No newline at end of file diff --git a/recipes/Qwen2.5-Coder-7B-Instruct/grpo/config_v02.00.yaml b/recipes/Qwen2.5-Coder-7B-Instruct/grpo/config_v02.00.yaml new file mode 100644 index 000000000..0c4cf5fe3 --- /dev/null +++ b/recipes/Qwen2.5-Coder-7B-Instruct/grpo/config_v02.00.yaml @@ -0,0 +1,64 @@ +# Model arguments +model_name_or_path: Qwen/Qwen2.5-Coder-7B-Instruct +model_revision: main +torch_dtype: bfloat16 +attn_implementation: flash_attention_2 +# Data training arguments +dataset_name: open-r1/verifiable-coding-problems-python_decontaminated-tested + +system_prompt: "You are a helpful AI Assistant that provides well-reasoned and detailed responses. You first think about the reasoning process as an internal monologue and then provide the user with the answer. 
Respond in the following format: \n...\n\n\n...\n" + +# GRPO trainer config +callbacks: +- push_to_hub_revision +benchmarks: +- math_500 +- aime24 +beta: 0.001 +bf16: true +do_eval: false +eval_strategy: "no" +use_vllm: true +vllm_device: auto +vllm_gpu_memory_utilization: 0.7 +do_eval: false +gradient_accumulation_steps: 14 +gradient_checkpointing: true +gradient_checkpointing_kwargs: + use_reentrant: false +hub_model_id: open-r1/Qwen2.5-Coder-7B-Instruct-GRPO +hub_model_revision: v02.00 +hub_strategy: every_save +learning_rate: 1.0e-06 +log_completions: true +log_level: info +logging_first_step: true +logging_steps: 1 +logging_strategy: steps +lr_scheduler_type: constant_with_warmup +max_grad_norm: 0.2 +max_prompt_length: 1024 +max_completion_length: 4096 +max_steps: -1 +num_generations: 14 +num_train_epochs: 0.1 +output_dir: data/Qwen2.5-Coder-7B-Instruct-GRPO_v02.00 +overwrite_output_dir: true +per_device_train_batch_size: 4 +push_to_hub: true +report_to: +- wandb +reward_funcs: +- code +- code_format +reward_weights: +- 1.0 +- 0.1 +save_strategy: "steps" +save_steps: 0.1 +save_total_limit: 1 +seed: 42 +temperature: 0.7 +wandb_entity: huggingface +wandb_project: open-r1 +warmup_ratio: 0.1 \ No newline at end of file diff --git a/recipes/Qwen2.5-Coder-7B-Instruct/grpo/config_v03.00.yaml b/recipes/Qwen2.5-Coder-7B-Instruct/grpo/config_v03.00.yaml new file mode 100644 index 000000000..962e66190 --- /dev/null +++ b/recipes/Qwen2.5-Coder-7B-Instruct/grpo/config_v03.00.yaml @@ -0,0 +1,64 @@ +# Model arguments +model_name_or_path: Qwen/Qwen2.5-7B-Instruct +model_revision: main +torch_dtype: bfloat16 +attn_implementation: flash_attention_2 +# Data training arguments +dataset_name: open-r1/verifiable-coding-problems-python_decontaminated-tested + +system_prompt: "You are a helpful AI Assistant that provides well-reasoned and detailed responses. You first think about the reasoning process as an internal monologue and then provide the user with the answer. 
Respond in the following format: <think>\n...\n</think>\n<answer>\n...\n</answer>" + +# GRPO trainer config +callbacks: +- push_to_hub_revision +benchmarks: +- math_500 +- aime24 +beta: 0.001 +bf16: true +do_eval: false +eval_strategy: "no" +use_vllm: true +vllm_device: auto +vllm_gpu_memory_utilization: 0.7 +do_eval: false +gradient_accumulation_steps: 14 +gradient_checkpointing: true +gradient_checkpointing_kwargs: + use_reentrant: false +hub_model_id: open-r1/Qwen2.5-Coder-7B-Instruct-GRPO +hub_model_revision: v03.00 +hub_strategy: every_save +learning_rate: 1.0e-06 +log_completions: true +log_level: info +logging_first_step: true +logging_steps: 1 +logging_strategy: steps +lr_scheduler_type: constant_with_warmup +max_grad_norm: 0.2 +max_prompt_length: 1024 +max_completion_length: 4096 +max_steps: -1 +num_generations: 14 +num_train_epochs: 0.1 +output_dir: data/Qwen2.5-Coder-7B-Instruct-GRPO_v03.00 +overwrite_output_dir: true +per_device_train_batch_size: 4 +push_to_hub: true +report_to: +- wandb +reward_funcs: +- code +- code_format +reward_weights: +- 1.0 +- 0.1 +save_strategy: "steps" +save_steps: 0.1 +save_total_limit: 1 +seed: 42 +temperature: 0.7 +wandb_entity: huggingface +wandb_project: open-r1 +warmup_ratio: 0.1 \ No newline at end of file From 243db805c2b2c8255aaabea2934517d6ed89d8f8 Mon Sep 17 00:00:00 2001 From: edbeeching Date: Fri, 11 Apr 2025 15:38:22 +0000 Subject: [PATCH 2/5] bin reward --- src/open_r1/rewards.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/src/open_r1/rewards.py b/src/open_r1/rewards.py index b62a81a81..d780ded96 --- a/src/open_r1/rewards.py +++ b/src/open_r1/rewards.py @@ -387,7 +387,15 @@ def extract_code(completion: str, language: str = "python") -> str: def binary_code_reward(completions, num_parallel: int = 2, e2b_router_url=None, **kwargs) -> list[float]: rewards = code_reward(completions, num_parallel=num_parallel, e2b_router_url=e2b_router_url, **kwargs) BINARY_THRESHOLD = 0.99 - return [1.0 if reward > BINARY_THRESHOLD else 0.0 for reward in rewards] + + output = [] + for reward in rewards: + if reward is None: + output.append(None) + else: + output.append(1.0 if reward > BINARY_THRESHOLD else 0.0) + + return output def code_reward(completions, num_parallel: int = 2, e2b_router_url=None, **kwargs) -> list[float]: From 7ddc0282cddfed14065a5e51f061036f0b517ea3 Mon Sep 17 00:00:00 2001 From: edbeeching Date: Thu, 17 Apr 2025 12:08:12 +0000 Subject: [PATCH 3/5] save wip --- recipes/OlympicCoder-7B/sft/config_v00.00.yaml | 10 +++++----- recipes/Qwen2.5-7B-Instruct/grpo/config_v05.20.yaml | 2 +- recipes/Qwen2.5-7B-Instruct/grpo/config_v05.30.yaml | 2 +- src/open_r1/utils/evaluation.py | 2 +- 4 files changed, 8 insertions(+), 8 deletions(-) diff --git a/recipes/OlympicCoder-7B/sft/config_v00.00.yaml b/recipes/OlympicCoder-7B/sft/config_v00.00.yaml index dd0be5d96..f5b8385da 100644 --- a/recipes/OlympicCoder-7B/sft/config_v00.00.yaml +++ b/recipes/OlympicCoder-7B/sft/config_v00.00.yaml @@ -14,7 +14,7 @@ dataset_num_proc: 48 bf16: true do_eval: false eval_strategy: 'no' -gradient_accumulation_steps: 8 +gradient_accumulation_steps: 2 gradient_checkpointing: true gradient_checkpointing_kwargs: use_reentrant: false @@ -27,20 +27,20 @@ logging_strategy: steps lr_scheduler_type: cosine_with_min_lr lr_scheduler_kwargs: min_lr_rate: 0.1 -packing: false +packing: true max_grad_norm: 0.2 -max_length: 32768 +max_length: 16000 max_steps: -1 num_train_epochs: 10 output_dir: data/OlympicCoder-7B overwrite_output_dir: true per_device_eval_batch_size: 1 
-per_device_train_batch_size: 2 +per_device_train_batch_size: 1 push_to_hub: true report_to: - wandb save_strategy: epoch save_total_limit: 1 seed: 42 -use_liger_kernel: true +use_liger_kernel: false warmup_ratio: 0.03 \ No newline at end of file diff --git a/recipes/Qwen2.5-7B-Instruct/grpo/config_v05.20.yaml b/recipes/Qwen2.5-7B-Instruct/grpo/config_v05.20.yaml index 0ae2b55ba..4b8aa6b31 100644 --- a/recipes/Qwen2.5-7B-Instruct/grpo/config_v05.20.yaml +++ b/recipes/Qwen2.5-7B-Instruct/grpo/config_v05.20.yaml @@ -52,7 +52,7 @@ report_to: reward_funcs: - binary_code - code_format -e2b_router_url: ip-10-53-95-216:8000 +e2b_router_url: ip-10-53-86-47:8000 reward_weights: - 1.0 - 0.1 diff --git a/recipes/Qwen2.5-7B-Instruct/grpo/config_v05.30.yaml b/recipes/Qwen2.5-7B-Instruct/grpo/config_v05.30.yaml index 239afa7a8..08244af8a 100644 --- a/recipes/Qwen2.5-7B-Instruct/grpo/config_v05.30.yaml +++ b/recipes/Qwen2.5-7B-Instruct/grpo/config_v05.30.yaml @@ -52,7 +52,7 @@ report_to: reward_funcs: - binary_code - code_format -e2b_router_url: ip-10-53-95-216:8000 +e2b_router_url: ip-10-53-86-47:8000 reward_weights: - 1.0 - 0.1 diff --git a/src/open_r1/utils/evaluation.py b/src/open_r1/utils/evaluation.py index 5719350fb..dd8400b93 100644 --- a/src/open_r1/utils/evaluation.py +++ b/src/open_r1/utils/evaluation.py @@ -73,7 +73,7 @@ def run_lighteval_job( if get_param_count_from_repo_id(model_name) >= 30_000_000_000: tensor_parallel = True else: - num_gpus = 8 + num_gpus = 1 tensor_parallel = False cmd = VLLM_SLURM_PREFIX.copy() From 0662164248a5f113b0fba16d0d448ac64d362414 Mon Sep 17 00:00:00 2001 From: edbeeching Date: Fri, 18 Apr 2025 11:38:17 +0000 Subject: [PATCH 4/5] add new reward, configs --- .../grpo/config_v04.00.yaml | 66 +++++++++++++++++++ .../grpo/config_v05.00.yaml | 66 +++++++++++++++++++ .../grpo/config_v06.00.yaml | 66 +++++++++++++++++++ src/open_r1/rewards.py | 24 +++++++ 4 files changed, 222 insertions(+) create mode 100644 recipes/DeepSeek-R1-Distill-Qwen-7B/grpo/config_v04.00.yaml create mode 100644 recipes/DeepSeek-R1-Distill-Qwen-7B/grpo/config_v05.00.yaml create mode 100644 recipes/DeepSeek-R1-Distill-Qwen-7B/grpo/config_v06.00.yaml diff --git a/recipes/DeepSeek-R1-Distill-Qwen-7B/grpo/config_v04.00.yaml b/recipes/DeepSeek-R1-Distill-Qwen-7B/grpo/config_v04.00.yaml new file mode 100644 index 000000000..b44fabe06 --- /dev/null +++ b/recipes/DeepSeek-R1-Distill-Qwen-7B/grpo/config_v04.00.yaml @@ -0,0 +1,66 @@ +# Model arguments +model_name_or_path: deepseek-ai/DeepSeek-R1-Distill-Qwen-7B +model_revision: main +torch_dtype: bfloat16 +attn_implementation: flash_attention_2 +# Data training arguments +dataset_name: open-r1/verifiable-coding-problems-python_decontaminated-tested-shuffled +dataset_prompt_column: problem + +# system_prompt: "You are a helpful AI Assistant that provides well-reasoned and detailed responses. You first think about the reasoning process as an internal monologue and then provide the user with the answer. 
Respond in the following format: <think>\n...\n</think>\n<answer>\n...\n</answer>" + +# GRPO trainer config +callbacks: +- push_to_hub_revision +benchmarks: +- lcb_v4 +beta: 0.000 +bf16: true +do_eval: false +eval_strategy: "no" +use_vllm: true +vllm_device: auto +vllm_gpu_memory_utilization: 0.7 +do_eval: false +gradient_accumulation_steps: 128 +gradient_checkpointing: true +gradient_checkpointing_kwargs: + use_reentrant: false +hub_model_id: open-r1/DeepSeek-R1-Distill-Qwen-7B-GRPO +hub_model_revision: v04.00 +hub_strategy: every_save +learning_rate: 1.0e-06 +log_completions: true +log_level: info +logging_first_step: true +logging_steps: 1 +logging_strategy: steps +lr_scheduler_type: constant_with_warmup +max_grad_norm: 0.2 +max_prompt_length: 1024 +max_completion_length: 24000 +max_steps: -1 +num_generations: 8 +num_iterations: 1 +num_train_epochs: 1.0 +output_dir: data/DeepSeek-R1-Distill-Qwen-7B_v04.00 +overwrite_output_dir: true +per_device_train_batch_size: 1 +push_to_hub: true +report_to: +- wandb +reward_funcs: +- binary_code +e2b_router_url: ip-10-53-86-47:8000 +reward_weights: +- 1.0 +save_strategy: "steps" +save_steps: 0.05 +save_total_limit: 1 +seed: 42 +temperature: 0.7 +wandb_entity: huggingface +wandb_project: open-r1 +warmup_ratio: 0.1 +mask_truncated_completions: true +loss_type: dr_grpo \ No newline at end of file diff --git a/recipes/DeepSeek-R1-Distill-Qwen-7B/grpo/config_v05.00.yaml b/recipes/DeepSeek-R1-Distill-Qwen-7B/grpo/config_v05.00.yaml new file mode 100644 index 000000000..6b775eae3 --- /dev/null +++ b/recipes/DeepSeek-R1-Distill-Qwen-7B/grpo/config_v05.00.yaml @@ -0,0 +1,66 @@ +# Model arguments +model_name_or_path: deepseek-ai/DeepSeek-R1-Distill-Qwen-7B +model_revision: main +torch_dtype: bfloat16 +attn_implementation: flash_attention_2 +# Data training arguments +dataset_name: open-r1/verifiable-coding-problems-python_decontaminated-tested-shuffled +dataset_prompt_column: problem + +# system_prompt: "You are a helpful AI Assistant that provides well-reasoned and detailed responses. You first think about the reasoning process as an internal monologue and then provide the user with the answer. 
Respond in the following format: <think>\n...\n</think>\n<answer>\n...\n</answer>" + +# GRPO trainer config +callbacks: +- push_to_hub_revision +benchmarks: +- lcb_v4 +beta: 0.000 +bf16: true +do_eval: false +eval_strategy: "no" +use_vllm: true +vllm_device: auto +vllm_gpu_memory_utilization: 0.7 +do_eval: false +gradient_accumulation_steps: 16 +gradient_checkpointing: true +gradient_checkpointing_kwargs: + use_reentrant: false +hub_model_id: open-r1/DeepSeek-R1-Distill-Qwen-7B-GRPO +hub_model_revision: v05.00 +hub_strategy: every_save +learning_rate: 1.0e-06 +log_completions: true +log_level: info +logging_first_step: true +logging_steps: 1 +logging_strategy: steps +lr_scheduler_type: constant_with_warmup +max_grad_norm: 0.2 +max_prompt_length: 1024 +max_completion_length: 24000 +max_steps: -1 +num_generations: 16 +num_iterations: 1 +num_train_epochs: 1.0 +output_dir: data/DeepSeek-R1-Distill-Qwen-7B_v05.00 +overwrite_output_dir: true +per_device_train_batch_size: 1 +push_to_hub: true +report_to: +- wandb +reward_funcs: +- binary_code +e2b_router_url: ip-10-53-86-47:8000 +reward_weights: +- 1.0 +save_strategy: "steps" +save_steps: 0.05 +save_total_limit: 1 +seed: 42 +temperature: 0.7 +wandb_entity: huggingface +wandb_project: open-r1 +warmup_ratio: 0.1 +mask_truncated_completions: true +loss_type: dr_grpo \ No newline at end of file diff --git a/recipes/DeepSeek-R1-Distill-Qwen-7B/grpo/config_v06.00.yaml b/recipes/DeepSeek-R1-Distill-Qwen-7B/grpo/config_v06.00.yaml new file mode 100644 index 000000000..167df6138 --- /dev/null +++ b/recipes/DeepSeek-R1-Distill-Qwen-7B/grpo/config_v06.00.yaml @@ -0,0 +1,66 @@ +# Model arguments +model_name_or_path: deepseek-ai/DeepSeek-R1-Distill-Qwen-7B +model_revision: main +torch_dtype: bfloat16 +attn_implementation: flash_attention_2 +# Data training arguments +dataset_name: open-r1/verifiable-coding-problems-python_decontaminated-tested-shuffled +dataset_prompt_column: problem + +# system_prompt: "You are a helpful AI Assistant that provides well-reasoned and detailed responses. You first think about the reasoning process as an internal monologue and then provide the user with the answer. 
Respond in the following format: <think>\n...\n</think>\n<answer>\n...\n</answer>" + +# GRPO trainer config +callbacks: +- push_to_hub_revision +benchmarks: +- lcb_v4 +beta: 0.000 +bf16: true +do_eval: false +eval_strategy: "no" +use_vllm: true +vllm_device: auto +vllm_gpu_memory_utilization: 0.7 +do_eval: false +gradient_accumulation_steps: 16 +gradient_checkpointing: true +gradient_checkpointing_kwargs: + use_reentrant: false +hub_model_id: open-r1/DeepSeek-R1-Distill-Qwen-7B-GRPO +hub_model_revision: v06.00 +hub_strategy: every_save +learning_rate: 1.0e-06 +log_completions: true +log_level: info +logging_first_step: true +logging_steps: 1 +logging_strategy: steps +lr_scheduler_type: constant_with_warmup +max_grad_norm: 0.2 +max_prompt_length: 1024 +max_completion_length: 24000 +max_steps: -1 +num_generations: 16 +num_iterations: 1 +num_train_epochs: 1.0 +output_dir: data/DeepSeek-R1-Distill-Qwen-7B_v06.00 +overwrite_output_dir: true +per_device_train_batch_size: 1 +push_to_hub: true +report_to: +- wandb +reward_funcs: +- weighted_binary_code_reward +e2b_router_url: ip-10-53-86-47:8000 +reward_weights: +- 1.0 +save_strategy: "steps" +save_steps: 0.05 +save_total_limit: 1 +seed: 42 +temperature: 0.7 +wandb_entity: huggingface +wandb_project: open-r1 +warmup_ratio: 0.1 +mask_truncated_completions: true +loss_type: dr_grpo \ No newline at end of file diff --git a/src/open_r1/rewards.py b/src/open_r1/rewards.py index d780ded96..a436bedc9 --- a/src/open_r1/rewards.py +++ b/src/open_r1/rewards.py @@ -397,6 +397,22 @@ def binary_code_reward(completions, num_parallel: int = 2, e2b_router_url=None, return output +def weighted_binary_code_reward(completions, num_parallel: int = 2, e2b_router_url=None, **kwargs) -> list[float]: + # combines binary reward with a weighted reward code reward + rewards = code_reward(completions, num_parallel=num_parallel, e2b_router_url=e2b_router_url, **kwargs) + BINARY_THRESHOLD = 0.99 + NON_BINARY_WEIGHT = 0.1 + + output = [] + for reward in rewards: + if reward is None: + output.append(None) + else: + binary_reward = 1.0 if reward > BINARY_THRESHOLD else 0.0 + output.append(binary_reward + NON_BINARY_WEIGHT * reward) + + return output + def code_reward(completions, num_parallel: int = 2, e2b_router_url=None, **kwargs) -> list[float]: """Reward function that evaluates code snippets using the E2B code interpreter. 
@@ -596,6 +612,14 @@ def get_reward_funcs(script_args) -> list[Callable]: ), binary_code_reward, ), + "weighted_binary_code_reward": update_wrapper( + partial( + weighted_binary_code_reward, + num_parallel=script_args.parallel_code_exec_per_proc, + e2b_router_url=script_args.e2b_router_url, + ), + weighted_binary_code_reward, + ), "ioi_code": update_wrapper( partial(ioi_code_reward, test_batch_size=script_args.code_eval_test_batch_size), ioi_code_reward ), From 9b6c9704da138a4f5bf0bc8dc346229da373d115 Mon Sep 17 00:00:00 2001 From: edbeeching Date: Fri, 18 Apr 2025 11:38:38 +0000 Subject: [PATCH 5/5] update trl version in setup --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 74d7e0d77..4980a1138 100644 --- a/setup.py +++ b/setup.py @@ -67,7 +67,7 @@ "sentencepiece>=0.1.99", "torch==2.6.0", "transformers==4.51.2", - "trl @ git+https://github.com/huggingface/trl.git@d625c5533a6b1c84d3565c8080857f6bb81c538a", # Bump for vLLM and 2x faster throughput: https://github.com/huggingface/trl/pull/3276 + "trl @ git+https://github.com/huggingface/trl.git@294f35bf3c0043d3ee6b9b5d22385e5736f6ce9e", # Bump for vLLM and 2x faster throughput: https://github.com/huggingface/trl/pull/3276 "vllm==0.8.3", "wandb>=0.19.1", ]
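
A note on the reward changes in the patches above: binary_code_reward now propagates None instead of crashing on the `reward > BINARY_THRESHOLD` comparison (None presumably marks completions for which execution returned no score), and weighted_binary_code_reward adds a small fraction of the raw pass rate on top of the pass/fail signal. The standalone Python sketch below mirrors that arithmetic with made-up pass rates; it is an illustration only and does not call code_reward or the E2B router.

# Sketch of the thresholding/weighting logic from the rewards.py hunks above.
# The pass rates are invented for illustration; in training they come from
# code_reward, which runs the extracted code via the E2B interpreter.
BINARY_THRESHOLD = 0.99
NON_BINARY_WEIGHT = 0.1

def binary(reward):
    # None is passed through so missing scores are not silently turned into 0.0
    if reward is None:
        return None
    return 1.0 if reward > BINARY_THRESHOLD else 0.0

def weighted_binary(reward):
    if reward is None:
        return None
    return binary(reward) + NON_BINARY_WEIGHT * reward

pass_rates = [None, 0.0, 0.5, 1.0]
print([binary(r) for r in pass_rates])           # [None, 0.0, 0.0, 1.0]
print([weighted_binary(r) for r in pass_rates])  # [None, 0.0, 0.05, 1.1]

Note that config_v06.00 is identical to config_v05.00 except that binary_code is swapped for weighted_binary_code_reward (and the revision/output strings change), so the 0.1 partial-credit term is the only difference between those two runs.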
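
The get_reward_funcs hunk registers the new reward with the same partial + update_wrapper pattern used for the other code rewards: partial binds the script arguments (parallelism, router URL) once, and update_wrapper copies the wrapped function's __name__/__doc__ onto the partial, presumably so downstream logging still reports the original reward name. A minimal self-contained sketch of the pattern (my_reward, the bound values, and "host:8000" are placeholders, not repo code):

from functools import partial, update_wrapper

def my_reward(completions, num_parallel=2, e2b_router_url=None, **kwargs):
    # toy reward: score each completion by its length (illustration only)
    return [float(len(c)) for c in completions]

# Bind configuration once while keeping my_reward's metadata, so anything that
# inspects the callable's __name__ still sees "my_reward" rather than a bare partial.
bound = update_wrapper(
    partial(my_reward, num_parallel=10, e2b_router_url="host:8000"),
    my_reward,
)
print(bound.__name__)           # my_reward
print(bound(["abc", "hello"]))  # [3.0, 5.0]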
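
As a rough way to read the GRPO batch settings in these configs (taking config_v05.00 as the example): each prompt is expanded into num_generations completions, and to my reading TRL's GRPO trainer counts per_device_train_batch_size in completions rather than prompts. The GPU count is not part of the YAML, so the arithmetic below assumes 8 training processes; with a different world size the prompts-per-step figure scales accordingly.

# Back-of-the-envelope for config_v05.00; the world size of 8 is an assumption,
# not something the YAML specifies.
num_processes = 8                 # assumed number of training GPUs
per_device_train_batch_size = 1   # completions per device per micro-batch
gradient_accumulation_steps = 16
num_generations = 16              # completions sampled per prompt

completions_per_optim_step = (
    num_processes * per_device_train_batch_size * gradient_accumulation_steps
)
prompts_per_optim_step = completions_per_optim_step // num_generations
print(completions_per_optim_step, prompts_per_optim_step)  # 128 8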