diff --git a/README.md b/README.md
index 74ac68b..41675b1 100644
--- a/README.md
+++ b/README.md
@@ -74,7 +74,164 @@ where `` and `` are the URL and port of the API manag
 ### Training
 
-Based on [OpenRLHF](./openrlhf/), coming soon.
+Before starting training, first follow the [OSWorld tutorial](https://github.com/OpenGVLab/ZeroGUI/tree/main/osworld#setup) to start the environment server, then follow the [vLLM tutorial](https://docs.vllm.ai/en/latest/serving/openai_compatible_server.html) to start the vLLM server, and pass the servers' IP and port information to the launch script (e.g. via `ENV_URL`, `ENV_MANAGER_PORT`, and `API_BASE_URL` below).
+
+We also provide complete training launch scripts in the `./scripts/train` directory, including the code for initializing Ray under Slurm.
+
+```bash
+# Path of training data
+DATA_PATH=./data/osworld_test_all.jsonl
+
+# Path of backbone model
+TOKENIZER_PATH=/path/to/model
+
+# env setting
+ENV_URL=${ENV_URL:-"http://10.140.52.49"}
+ENV_MANAGER_PORT=${ENV_MANAGER_PORT:-10001}
+IFS=',' read -ra URL_LIST <<< "$ENV_URL"
+NUM_URLS=${#URL_LIST[@]}
+# clean all existing remote envs
+if [[ $NODE_RANK -eq 0 ]]; then
+    for (( i=0; i<$NUM_URLS; i+=1 )); do
+        url=${URL_LIST[$i]}
+        curl -X POST $url:$ENV_MANAGER_PORT/clean
+    done
+fi
+
+# node setting
+NNODES=${NNODES:-4}
+N_ENGINES=${N_ENGINES:-8}
+ENGINE_TP=${ENGINE_TP:-4}
+
+# training setting
+EPISODE=${EPISODE:-20}
+TRAIN_STEP=${TRAIN_STEP:-1000}
+RBS=${RBS:-1}
+N_SAMPLES=${N_SAMPLES:-64}
+R_TARGET_SIZE=${R_TARGET_SIZE:-2048}
+TBS=${TBS:-2048} # one update per rollout, TBS = R_TARGET_SIZE
+MAX_GEN_BATCH=${MAX_GEN_BATCH:--1}
+N_GROUPS=${N_GROUPS:-1}
+
+KL_TYPE=${KL_TYPE:-"mse"}
+KL=${KL:-1e-1}
+LR=${LR:-2e-6}
+LR_SCHEDULE=${LR_SCHEDULE:-"constant_with_warmup"} # constant for ablation
+WARMUP=${WARMUP:-0.0}
+MAX_LENGTH=${MAX_LENGTH:-512}
+export MIN_PIXELS=3136
+export MAX_PIXELS=2116800
+REWARD_PORT=1278
+PY_ARGS=${PY_ARGS:-"--kl_threshold_type=advantage --env_reset_sleep_range=60"}
+
+# llm eval
+API_TYPE=${API_TYPE:-"qwen"}
+API_MODEL=${API_MODEL:-"Qwen2.5-VL-32B-Instruct"}
+API_BASE_URL=${API_BASE_URL:-"http://10.140.37.106:21101"}
+API_KEY=${API_KEY:-"empty"}
+EVAL_PROMPT_FILE=${EVAL_PROMPT_FILE:-"osworld_llm_eval_v1.json"}
+
+# sampling setting
+TEMP=${TEMP:-0.5}
+TOP_P=${TOP_P:-0.9}
+FREQ_PEN=${FREQ_PEN:-1}
+
+# save & log
+EXP_FLAG=${EXP_FLAG:-""}
+SAVE_MODEL_NAME=${EXP_FLAG}-kl_${KL_TYPE}_${KL}-rbs_${RBS}-sample_${N_SAMPLES}-rtarget_${R_TARGET_SIZE}-tbs_${TBS}-lr_${LR}-temp_${TEMP}
+LOG_BASE=log
+mkdir -p results/$SAVE_MODEL_NAME
+mkdir -p results/$SAVE_MODEL_NAME/trajectory
+MAX_CKPT_NUM=${MAX_CKPT_NUM:-10}
+
+# launch the master node of ray in container
+ray start --head --node-ip-address 0.0.0.0 --num-gpus 8
+
+# if you want to launch ray on more nodes, run this on each of them:
+# ray start --address {MASTER-NODE-ADDRESS}:6379 --num-gpus 8
+
+ray job submit \
+    -- python3 -m openrlhf.cli.train_ppo_ray \
+    --ref_num_nodes $NNODES \
+    --ref_num_gpus_per_node 8 \
+    --actor_num_nodes $NNODES \
+    --actor_num_gpus_per_node 8 \
+    --vllm_num_engines $N_ENGINES \
+    --vllm_tensor_parallel_size $ENGINE_TP \
+    --enforce_eager \
+    --pretrain ${TOKENIZER_PATH} \
+    --save_path results/$SAVE_MODEL_NAME \
+    --ckpt_path results/$SAVE_MODEL_NAME \
+    --micro_train_batch_size 1 \
+    --train_batch_size ${TBS} \
+    --micro_rollout_batch_size 1 \
+    --rollout_batch_size ${RBS} \
+    --advantage_estimator group_norm \
+    --use_dapo_trainer \
+    --dapo_dynamic_sampling \
+    --rollout_target_size ${R_TARGET_SIZE} \
+    --max_num_gen_batches ${MAX_GEN_BATCH} \
+    --max_samples 100000 \
+    --max_epochs 1 \
+    --num_episodes ${EPISODE} \
+    --num_train_steps ${TRAIN_STEP} \
+    --lr_warmup_ratio ${WARMUP} \
+    --n_samples_per_prompt $N_SAMPLES \
+    --prompt_max_len 20480 \
+    --generate_max_len $MAX_LENGTH \
+    --zero_stage 3 \
+    --bf16 \
+    --actor_learning_rate $LR \
+    --critic_learning_rate 9e-6 \
+    --actor_lr_schedule $LR_SCHEDULE \
+    --init_kl_coef $KL \
+    --kl_loss_coef $KL \
+    --kl_penalty_type $KL_TYPE \
+    --not_normalize_advantage \
+    --prompt_data $DATA_PATH \
+    --simple_load_dataset \
+    --packing_samples \
+    --flash_attn \
+    --gradient_checkpointing \
+    --save_steps 1 \
+    --save_hf_model \
+    --wandb_run_name $SAVE_MODEL_NAME \
+    --use_tensorboard tb_log \
+    --vllm_sync_backend nccl \
+    --max_ckpt_num $MAX_CKPT_NUM \
+    --group_method normal \
+    --use_length_reward_in_efficiency \
+    --temperature $TEMP \
+    --top_p $TOP_P \
+    --frequency_penalty $FREQ_PEN \
+    --overlap_comm \
+    --train_agent \
+    --task_group_distributed \
+    --num_distributed_groups $N_GROUPS \
+    --data_gather_redistribute \
+    --env_type osworld \
+    --env_url $ENV_URL \
+    --env_manager_port $ENV_MANAGER_PORT \
+    --action_space pyautogui \
+    --observation_type screenshot \
+    --agent_max_steps 15 \
+    --save_trajectory \
+    --agent_type uitars \
+    --num_history 5 \
+    --num_input_image 5 \
+    --use_llm_evaluator \
+    --api_type $API_TYPE \
+    --api_model $API_MODEL \
+    --api_base_url $API_BASE_URL \
+    --api_key $API_KEY \
+    --eval_prompt_file $EVAL_PROMPT_FILE \
+    --load_checkpoint \
+    --colocate_all_models \
+    --vllm_enable_sleep \
+    --vllm_gpu_memory_utilization 0.6 \
+    --deepspeed_enable_sleep \
+    ${PY_ARGS}
+```
 
 ## 📚 Citation
 
diff --git a/scripts/train/grpo.sh b/scripts/train/grpo.sh
new file mode 100644
index 0000000..21fc8fa
--- /dev/null
+++ b/scripts/train/grpo.sh
@@ -0,0 +1,158 @@
+NODE_RANK=${1:-0}
+
+# export TORCH_HOME=/opt/aps/workdir
+export NUMEXPR_MAX_THREADS=128
+export RAY_DEDUP_LOGS=0
+
+# Path of training data
+DATA_PATH=${DATA_PATH:-"./data/osworld_test_all.jsonl"}
+
+# Path of backbone model
+TOKENIZER_PATH=${TOKENIZER_PATH:-"/path/to/model"}
+
+# env setting
+ENV_URL=${ENV_URL:-"http://10.140.52.49"}
+ENV_MANAGER_PORT=${ENV_MANAGER_PORT:-10001}
+IFS=',' read -ra URL_LIST <<< "$ENV_URL"
+NUM_URLS=${#URL_LIST[@]}
+# clean all existing remote envs
+if [[ $NODE_RANK -eq 0 ]]; then
+    for (( i=0; i<$NUM_URLS; i+=1 )); do
+        url=${URL_LIST[$i]}
+        curl -X POST $url:$ENV_MANAGER_PORT/clean
+    done
+fi
+
+# node setting
+NNODES=${NNODES:-4}
+N_ENGINES=${N_ENGINES:-8}
+ENGINE_TP=${ENGINE_TP:-4}
+
+# training setting
+EPISODE=${EPISODE:-20}
+TRAIN_STEP=${TRAIN_STEP:-1000}
+RBS=${RBS:-1}
+N_SAMPLES=${N_SAMPLES:-64}
+R_TARGET_SIZE=${R_TARGET_SIZE:-2048}
+TBS=${TBS:-2048} # one update per rollout, TBS = R_TARGET_SIZE
+MAX_GEN_BATCH=${MAX_GEN_BATCH:--1}
+N_GROUPS=${N_GROUPS:-1}
+
+KL_TYPE=${KL_TYPE:-"mse"}
+KL=${KL:-1e-1}
+LR=${LR:-2e-6}
+LR_SCHEDULE=${LR_SCHEDULE:-"constant_with_warmup"} # constant for ablation
+WARMUP=${WARMUP:-0.0}
+MAX_LENGTH=${MAX_LENGTH:-512}
+export MIN_PIXELS=3136
+export MAX_PIXELS=2116800
+REWARD_PORT=1278
+PY_ARGS=${PY_ARGS:-"--kl_threshold_type=advantage --env_reset_sleep_range=60"}
+
+# llm eval
+API_TYPE=${API_TYPE:-"qwen"}
+API_MODEL=${API_MODEL:-"Qwen2.5-VL-32B-Instruct"}
+API_BASE_URL=${API_BASE_URL:-"http://10.140.37.106:21101"}
+API_KEY=${API_KEY:-"empty"}
+EVAL_PROMPT_FILE=${EVAL_PROMPT_FILE:-"osworld_llm_eval_v1.json"}
+
+# sampling setting
+TEMP=${TEMP:-0.5}
+TOP_P=${TOP_P:-0.9}
+FREQ_PEN=${FREQ_PEN:-1}
+
+# save & log
+EXP_FLAG=${EXP_FLAG:-""}
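+# The run name below concatenates the key hyperparameters (KL penalty type and
+# coefficient, rollout batch size, samples per prompt, rollout target size,
+# train batch size, learning rate, sampling temperature), so the matching
+# results/ and log/ directories are self-describing.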
+SAVE_MODEL_NAME=${EXP_FLAG}-kl_${KL_TYPE}_${KL}-rbs_${RBS}-sample_${N_SAMPLES}-rtarget_${R_TARGET_SIZE}-tbs_${TBS}-lr_${LR}-temp_${TEMP}
+LOG_BASE=log
+mkdir -p results/$SAVE_MODEL_NAME
+mkdir -p results/$SAVE_MODEL_NAME/trajectory
+MAX_CKPT_NUM=${MAX_CKPT_NUM:-10}
+
+export RAY_ADDRESS="http://127.0.0.1:$DASHBOARD_PORT"
+
+if [ "$NODE_RANK" = "0" ]; then
+PYTHONPATH=./:$PYTHONPATH \
+ray job submit \
+    -- python3 -m openrlhf.cli.train_ppo_ray \
+    --ref_num_nodes $NNODES \
+    --ref_num_gpus_per_node 8 \
+    --actor_num_nodes $NNODES \
+    --actor_num_gpus_per_node 8 \
+    --vllm_num_engines $N_ENGINES \
+    --vllm_tensor_parallel_size $ENGINE_TP \
+    --enforce_eager \
+    --pretrain ${TOKENIZER_PATH} \
+    --remote_rm_url http://localhost:${REWARD_PORT}/get_reward \
+    --save_path results/$SAVE_MODEL_NAME \
+    --ckpt_path results/$SAVE_MODEL_NAME \
+    --micro_train_batch_size 1 \
+    --train_batch_size ${TBS} \
+    --micro_rollout_batch_size 1 \
+    --rollout_batch_size ${RBS} \
+    --advantage_estimator group_norm \
+    --use_dapo_trainer \
+    --dapo_dynamic_sampling \
+    --rollout_target_size ${R_TARGET_SIZE} \
+    --max_num_gen_batches ${MAX_GEN_BATCH} \
+    --max_samples 100000 \
+    --max_epochs 1 \
+    --num_episodes ${EPISODE} \
+    --num_train_steps ${TRAIN_STEP} \
+    --lr_warmup_ratio ${WARMUP} \
+    --n_samples_per_prompt $N_SAMPLES \
+    --prompt_max_len 20480 \
+    --generate_max_len $MAX_LENGTH \
+    --zero_stage 3 \
+    --bf16 \
+    --actor_learning_rate $LR \
+    --critic_learning_rate 9e-6 \
+    --actor_lr_schedule $LR_SCHEDULE \
+    --init_kl_coef $KL \
+    --kl_loss_coef $KL \
+    --kl_penalty_type $KL_TYPE \
+    --not_normalize_advantage \
+    --prompt_data $DATA_PATH \
+    --simple_load_dataset \
+    --packing_samples \
+    --flash_attn \
+    --gradient_checkpointing \
+    --save_steps 1 \
+    --save_hf_model \
+    --wandb_run_name $SAVE_MODEL_NAME \
+    --use_tensorboard tb_log \
+    --vllm_sync_backend nccl \
+    --max_ckpt_num $MAX_CKPT_NUM \
+    --group_method normal \
+    --use_length_reward_in_efficiency \
+    --temperature $TEMP \
+    --top_p $TOP_P \
+    --frequency_penalty $FREQ_PEN \
+    --overlap_comm \
+    --train_agent \
+    --task_group_distributed \
+    --num_distributed_groups $N_GROUPS \
+    --data_gather_redistribute \
+    --env_type osworld \
+    --env_url $ENV_URL \
+    --env_manager_port $ENV_MANAGER_PORT \
+    --action_space pyautogui \
+    --observation_type screenshot \
+    --agent_max_steps 15 \
+    --save_trajectory \
+    --agent_type uitars \
+    --num_history 5 \
+    --num_input_image 5 \
+    --use_llm_evaluator \
+    --api_type $API_TYPE \
+    --api_model $API_MODEL \
+    --api_base_url $API_BASE_URL \
+    --api_key $API_KEY \
+    --eval_prompt_file $EVAL_PROMPT_FILE \
+    --load_checkpoint \
+    --colocate_all_models \
+    --vllm_enable_sleep \
+    --vllm_gpu_memory_utilization 0.6 \
+    --deepspeed_enable_sleep \
+    ${PY_ARGS}
+fi
\ No newline at end of file
diff --git a/scripts/train/srun.sh b/scripts/train/srun.sh
new file mode 100644
index 0000000..2e54b12
--- /dev/null
+++ b/scripts/train/srun.sh
@@ -0,0 +1,21 @@
+SCRIPT_DIR=$(dirname "$(readlink -f "$0")")
+
+#############################
+JOBID=$1
+NODELIST=$2
+NNODES=${NNODES:-4}
+GPUS_PER_TASK=8
+CPUS_PER_TASK=96
+SCRIPT=$SCRIPT_DIR/train.sh
+#############################
+
+export GPUS=$((GPUS_PER_TASK * NNODES))
+
+srun --jobid=${JOBID} \
+    --nodelist=${NODELIST} \
+    --ntasks ${NNODES} \
+    --ntasks-per-node 1 \
+    --gpus-per-task ${GPUS_PER_TASK} \
+    --cpus-per-task ${CPUS_PER_TASK} \
+    --kill-on-bad-exit=1 \
+    bash $SCRIPT
\ No newline at end of file
diff --git a/scripts/train/train.sh b/scripts/train/train.sh
new file mode 100644
index 0000000..69bddca
--- /dev/null
+++ b/scripts/train/train.sh
@@ -0,0 +1,109 @@
+set -ex
+
+ROOT=$PWD
+cd $(dirname $0)
+
+WORLD_SIZE=$((SLURM_NTASKS))
+RANK=$((SLURM_PROCID))
+MASTER_ADDR=$(scontrol show hostname ${SLURM_STEP_NODELIST} | head -n1)
+MASTER_ADDR=$(echo $MASTER_ADDR | cut -d '-' -f 3-6 | tr '-' '.')
+MASTER_PORT=29500
+echo $MASTER_ADDR
+echo $MASTER_PORT
+echo $WORLD_SIZE
+echo $RANK
+export DASHBOARD_PORT=10000
+
+export NCCL_P2P_LEVEL=NVL
+export PYTHONPATH=$ROOT:$PYTHONPATH
+# export HF_HOME=$(realpath $PWD/../../cache/huggingface/)
+
+num_nodes=$WORLD_SIZE
+start_time=$(date +%Y%m%d%H%M)
+
+# num_nodes has to be at least 1
+if [ $num_nodes -lt 1 ]; then
+    echo "Number of nodes must be at least 1"
+    exit 1
+fi
+
+# rank 0 acts as the Ray head node; all other ranks join as workers
+if [[ $RANK -eq 0 ]]; then
+    node_role="master"
+else
+    node_role="worker"
+fi
+head_node_ip=$MASTER_ADDR
+
+script="grpo.sh"
+
+# logging
+N_SAMPLES=${N_SAMPLES:-64}
+TBS=${TBS:-2048}
+RBS=${RBS:-1}
+R_TARGET_SIZE=${R_TARGET_SIZE:-2048}
+KL=${KL:-1e-1}
+LR=${LR:-2e-6}
+TEMP=${TEMP:-0.5}
+EXP_FLAG=${EXP_FLAG:-""}
+POLICY_TYPE=${POLICY_TYPE:-"ppo"}
+KL_TYPE=${KL_TYPE:-"low_var_kl"}
+LOG_DIR=log/${EXP_FLAG}-kl_${KL_TYPE}_${KL}-rbs_${RBS}-sample_${N_SAMPLES}-rtarget_${R_TARGET_SIZE}-tbs_${TBS}-lr_${LR}-temp_${TEMP}
+mkdir -p $(dirname $0)/$LOG_DIR
+
+wait_time=15
+if [ "$node_role" == "master" ]; then
+    echo "Starting Ray head node..."
+    # Start Ray on this node as the head node.
+    # `ray start --head` prints the address workers should join, but here the head
+    # address is taken from the Slurm node list and the port is statically assigned.
+    ray start --head --dashboard-host 0.0.0.0 --port=6379 --ray-debugger-external --dashboard-port=$DASHBOARD_PORT --resources '{"COMPUTE": 100000000000000.0, "HEAD": 100000000000000.0}'
+    sleep $wait_time
+elif [ "$node_role" == "worker" ]; then
+    sleep $wait_time
+    attempt=1
+    echo "Starting Ray worker node and attempting to connect to the head node at $head_node_ip:6379"
+    while true; do
+        # Attempt to start Ray and connect to the head node
+        ray start --address="$head_node_ip:6379" --dashboard-port=$DASHBOARD_PORT --resources '{"COMPUTE": 100000000000000.0, "virtual_cluster_default": 100000000000000.0}' && break || {
+            if [ $attempt -le 5 ]; then
+                echo "Ray worker start attempt $attempt failed. Retrying in $wait_time seconds..."
+                ((attempt++))
+                sleep $wait_time
+            else
+                echo "Failed to connect to the head node after $attempt attempts. Exiting."
+                exit 1
+            fi
+        }
+    done
+fi
+# run the training script once Ray has been started on all nodes
+sleep $wait_time
+if [ "$node_role" == "master" ]; then
+    num_active_ray_nodes=$(ray list nodes | grep ALIVE | wc -l)
+    echo "Number of active Ray nodes: $num_active_ray_nodes"
+    if [ $num_active_ray_nodes -lt $num_nodes ]; then
+        echo "Waiting for all Ray nodes to start..."
+        attempt=1
+        while true; do
+            num_active_ray_nodes=$(ray list nodes | grep ALIVE | wc -l)
+            if [ $num_active_ray_nodes -eq $num_nodes ]; then
+                break
+            elif [ $attempt -le 5 ]; then
+                echo "Only $num_active_ray_nodes/$num_nodes Ray nodes are alive (attempt $attempt). Retrying in $wait_time seconds..."
+                ((attempt++))
+                sleep $wait_time
+            else
+                echo "Not all Ray nodes became active after $attempt attempts. Exiting."
+                exit 1
+            fi
+        done
+    fi
+    echo "End starting"
+    # python examples/scripts/test_ray.py
+    bash $(dirname $0)/${script} $RANK 2>&1 | tee $LOG_DIR/grpo_ray_${num_nodes}_${node_role}_${RANK}.log
+else
+    echo "End starting"
+    bash $(dirname $0)/${script} $RANK 2>&1 | tee $LOG_DIR/grpo_ray_${num_nodes}_${node_role}_${RANK}.log
+    sleep infinity
+fi
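
Putting the scripts above together, here is a minimal launch sketch under Slurm. All concrete values are placeholders rather than values taken from this patch: it assumes an existing multi-node allocation plus already-running OSWorld environment and vLLM evaluator servers, exports the variables that `grpo.sh` reads from the environment, and hands the allocation's job ID and node list to `scripts/train/srun.sh`, which runs `train.sh` on every node to bring up Ray and finally launches `grpo.sh` from rank 0.

```bash
# Hypothetical example; adjust every value to your cluster and servers.
# Assumes a 4-node Slurm allocation with job ID <JOBID> spanning nodes <NODELIST>.
export NNODES=4
export DATA_PATH=./data/osworld_test_all.jsonl        # training tasks
export TOKENIZER_PATH=/path/to/model                   # backbone model to fine-tune
export ENV_URL="http://<env-server-ip>"                # OSWorld environment server(s), comma-separated
export ENV_MANAGER_PORT=10001                          # environment manager port
export API_BASE_URL="http://<vllm-server-ip>:<port>"   # vLLM endpoint used by the LLM evaluator

bash scripts/train/srun.sh <JOBID> <NODELIST>
```

The exported variables reach `grpo.sh` through `srun`'s default environment propagation; the other knobs in `grpo.sh` (KL, LR, TBS, and so on) can be overridden the same way.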