diff --git a/README.md b/README.md
index 74ac68b..41675b1 100644
--- a/README.md
+++ b/README.md
@@ -74,7 +74,164 @@ where `` and `` are the URL and port of the API manag
 ### Training
 
-Based on [OpenRLHF](./openrlhf/), coming soon.
+Before starting training, first follow the [OSWorld tutorial](https://github.com/OpenGVLab/ZeroGUI/tree/main/osworld#setup) to start the environment server, then follow the [vLLM tutorial](https://docs.vllm.ai/en/latest/serving/openai_compatible_server.html) to start the vLLM server, and pass the servers' IP and port information to the launch script (e.g. via `ENV_URL`, `ENV_MANAGER_PORT`, and `API_BASE_URL` below).
+
+We also provide complete training launch scripts in the `./scripts/train` directory, including the code for initializing Ray under Slurm.
+
+```bash
+# Path of training data
+DATA_PATH=./data/osworld_test_all.jsonl
+
+# Path of backbone model
+TOKENIZER_PATH=/path/to/model
+
+# env setting
+ENV_URL=${ENV_URL:-"http://10.140.52.49"}
+ENV_MANAGER_PORT=${ENV_MANAGER_PORT:-10001}
+IFS=',' read -ra URL_LIST <<< "$ENV_URL"
+NUM_URLS=${#URL_LIST[@]}
+# clean all existing remote envs
+if [[ $NODE_RANK -eq 0 ]]; then
+    for (( i=0; i<$NUM_URLS; i+=1 )); do
+        url=${URL_LIST[$i]}
+        curl -X POST $url:$ENV_MANAGER_PORT/clean
+    done
+fi
+
+# node setting
+NNODES=${NNODES:-4}
+N_ENGINES=${N_ENGINES:-8}
+ENGINE_TP=${ENGINE_TP:-4}
+
+# training setting
+EPISODE=${EPISODE:-20}
+TRAIN_STEP=${TRAIN_STEP:-1000}
+RBS=${RBS:-1}
+N_SAMPLES=${N_SAMPLES:-64}
+R_TARGET_SIZE=${R_TARGET_SIZE:-2048}
+TBS=${TBS:-2048} # one update per rollout, TBS = R_TARGET_SIZE
+MAX_GEN_BATCH=${MAX_GEN_BATCH:--1}
+N_GROUPS=${N_GROUPS:-1}
+
+KL_TYPE=${KL_TYPE:-"mse"}
+KL=${KL:-1e-1}
+LR=${LR:-2e-6}
+LR_SCHEDULE=${LR_SCHEDULE:-"constant_with_warmup"} # constant for ablation
+WARMUP=${WARMUP:-0.0}
+MAX_LENGTH=${MAX_LENGTH:-512}
+export MIN_PIXELS=3136
+export MAX_PIXELS=2116800
+REWARD_PORT=1278
+PY_ARGS=${PY_ARGS:-"--kl_threshold_type=advantage --env_reset_sleep_range=60"}
+
+# llm eval
+API_TYPE=${API_TYPE:-"qwen"}
+API_MODEL=${API_MODEL:-"Qwen2.5-VL-32B-Instruct"}
+API_BASE_URL=${API_BASE_URL:-"http://10.140.37.106:21101"}
+API_KEY=${API_KEY:-"empty"}
+EVAL_PROMPT_FILE=${EVAL_PROMPT_FILE:-"osworld_llm_eval_v1.json"}
+
+# sampling setting
+TEMP=${TEMP:-0.5}
+TOP_P=${TOP_P:-0.9}
+FREQ_PEN=${FREQ_PEN:-1}
+
+# save & log
+EXP_FLAG=${EXP_FLAG:-""}
+SAVE_MODEL_NAME=${EXP_FLAG}-kl_${KL_TYPE}_${KL}-rbs_${RBS}-sample_${N_SAMPLES}-rtarget_${R_TARGET_SIZE}-tbs_${TBS}-lr_${LR}-temp_${TEMP}
+LOG_BASE=log
+mkdir -p results/$SAVE_MODEL_NAME
+mkdir -p results/$SAVE_MODEL_NAME/trajectory
+MAX_CKPT_NUM=${MAX_CKPT_NUM:-10}
+
+# launch the master node of ray in container
+ray start --head --node-ip-address 0.0.0.0 --num-gpus 8
+
+# if you want to launch ray on more nodes, run this on each of them:
+# ray start --address {MASTER-NODE-ADDRESS}:6379 --num-gpus 8
+
+ray job submit \
+    -- python3 -m openrlhf.cli.train_ppo_ray \
+    --ref_num_nodes $NNODES \
+    --ref_num_gpus_per_node 8 \
+    --actor_num_nodes $NNODES \
+    --actor_num_gpus_per_node 8 \
+    --vllm_num_engines $N_ENGINES \
+    --vllm_tensor_parallel_size $ENGINE_TP \
+    --enforce_eager \
+    --pretrain ${TOKENIZER_PATH} \
+    --save_path results/$SAVE_MODEL_NAME \
+    --ckpt_path results/$SAVE_MODEL_NAME \
+    --micro_train_batch_size 1 \
+    --train_batch_size ${TBS} \
+    --micro_rollout_batch_size 1 \
+    --rollout_batch_size ${RBS} \
+    --advantage_estimator group_norm \
+    --use_dapo_trainer \
+    --dapo_dynamic_sampling \
+    --rollout_target_size ${R_TARGET_SIZE} \
+    --max_num_gen_batches ${MAX_GEN_BATCH} \
+    --max_samples 100000 \
+    --max_epochs 1 \
+    --num_episodes ${EPISODE} \
+    --num_train_steps ${TRAIN_STEP} \
+    --lr_warmup_ratio ${WARMUP} \
+    --n_samples_per_prompt $N_SAMPLES \
+    --prompt_max_len 20480 \
+    --generate_max_len $MAX_LENGTH \
+    --zero_stage 3 \
+    --bf16 \
+    --actor_learning_rate $LR \
+    --critic_learning_rate 9e-6 \
+    --actor_lr_schedule $LR_SCHEDULE \
+    --init_kl_coef $KL \
+    --kl_loss_coef $KL \
+    --kl_penalty_type $KL_TYPE \
+    --not_normalize_advantage \
+    --prompt_data $DATA_PATH \
+    --simple_load_dataset \
+    --packing_samples \
+    --flash_attn \
+    --gradient_checkpointing \
+    --save_steps 1 \
+    --save_hf_model \
+    --wandb_run_name $SAVE_MODEL_NAME \
+    --use_tensorboard tb_log \
+    --vllm_sync_backend nccl \
+    --max_ckpt_num $MAX_CKPT_NUM \
+    --group_method normal \
+    --use_length_reward_in_efficiency \
+    --temperature $TEMP \
+    --top_p $TOP_P \
+    --frequency_penalty $FREQ_PEN \
+    --overlap_comm \
+    --train_agent \
+    --task_group_distributed \
+    --num_distributed_groups $N_GROUPS \
+    --data_gather_redistribute \
+    --env_type osworld \
+    --env_url $ENV_URL \
+    --env_manager_port $ENV_MANAGER_PORT \
+    --action_space pyautogui \
+    --observation_type screenshot \
+    --agent_max_steps 15 \
+    --save_trajectory \
+    --agent_type uitars \
+    --num_history 5 \
+    --num_input_image 5 \
+    --use_llm_evaluator \
+    --api_type $API_TYPE \
+    --api_model $API_MODEL \
+    --api_base_url $API_BASE_URL \
+    --api_key $API_KEY \
+    --eval_prompt_file $EVAL_PROMPT_FILE \
+    --load_checkpoint \
+    --colocate_all_models \
+    --vllm_enable_sleep \
+    --vllm_gpu_memory_utilization 0.6 \
+    --deepspeed_enable_sleep \
+    ${PY_ARGS}
+```
 
 ## 📚 Citation
 
diff --git a/scripts/train/grpo.sh b/scripts/train/grpo.sh
new file mode 100644
index 0000000..21fc8fa
--- /dev/null
+++ b/scripts/train/grpo.sh
@@ -0,0 +1,158 @@
+NODE_RANK=${1:-0}
+
+# export TORCH_HOME=/opt/aps/workdir
+export NUMEXPR_MAX_THREADS=128
+export RAY_DEDUP_LOGS=0
+
+# Path of training data
+DATA_PATH=${DATA_PATH:-"./data/osworld_test_all.jsonl"}
+
+# Path of backbone model
+TOKENIZER_PATH=${TOKENIZER_PATH:-"/path/to/model"}
+
+# env setting
+ENV_URL=${ENV_URL:-"http://10.140.52.49"}
+ENV_MANAGER_PORT=${ENV_MANAGER_PORT:-10001}
+IFS=',' read -ra URL_LIST <<< "$ENV_URL"
+NUM_URLS=${#URL_LIST[@]}
+# clean all existing remote envs
+if [[ $NODE_RANK -eq 0 ]]; then
+    for (( i=0; i<$NUM_URLS; i+=1 )); do
+        url=${URL_LIST[$i]}
+        curl -X POST $url:$ENV_MANAGER_PORT/clean
+    done
+fi
+
+# node setting
+NNODES=${NNODES:-4}
+N_ENGINES=${N_ENGINES:-8}
+ENGINE_TP=${ENGINE_TP:-4}
+
+# training setting
+EPISODE=${EPISODE:-20}
+TRAIN_STEP=${TRAIN_STEP:-1000}
+RBS=${RBS:-1}
+N_SAMPLES=${N_SAMPLES:-64}
+R_TARGET_SIZE=${R_TARGET_SIZE:-2048}
+TBS=${TBS:-2048} # one update per rollout, TBS = R_TARGET_SIZE
+MAX_GEN_BATCH=${MAX_GEN_BATCH:--1}
+N_GROUPS=${N_GROUPS:-1}
+
+KL_TYPE=${KL_TYPE:-"mse"}
+KL=${KL:-1e-1}
+LR=${LR:-2e-6}
+LR_SCHEDULE=${LR_SCHEDULE:-"constant_with_warmup"} # constant for ablation
+WARMUP=${WARMUP:-0.0}
+MAX_LENGTH=${MAX_LENGTH:-512}
+export MIN_PIXELS=3136
+export MAX_PIXELS=2116800
+REWARD_PORT=1278
+PY_ARGS=${PY_ARGS:-"--kl_threshold_type=advantage --env_reset_sleep_range=60"}
+
+# llm eval
+API_TYPE=${API_TYPE:-"qwen"}
+API_MODEL=${API_MODEL:-"Qwen2.5-VL-32B-Instruct"}
+API_BASE_URL=${API_BASE_URL:-"http://10.140.37.106:21101"}
+API_KEY=${API_KEY:-"empty"}
+EVAL_PROMPT_FILE=${EVAL_PROMPT_FILE:-"osworld_llm_eval_v1.json"}
+
+# sampling setting
+TEMP=${TEMP:-0.5}
+TOP_P=${TOP_P:-0.9}
+FREQ_PEN=${FREQ_PEN:-1}
+
+# save & log
+EXP_FLAG=${EXP_FLAG:-""}
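+# The run name below concatenates the key hyperparameters (KL penalty type and
+# coefficient, rollout batch size, samples per prompt, rollout target size,
+# train batch size, learning rate, sampling temperature), so the matching
+# results/ and log/ directories are self-describing.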
+SAVE_MODEL_NAME=${EXP_FLAG}-kl_${KL_TYPE}_${KL}-rbs_${RBS}-sample_${N_SAMPLES}-rtarget_${R_TARGET_SIZE}-tbs_${TBS}-lr_${LR}-temp_${TEMP}
+LOG_BASE=log
+mkdir -p results/$SAVE_MODEL_NAME
+mkdir -p results/$SAVE_MODEL_NAME/trajectory
+MAX_CKPT_NUM=${MAX_CKPT_NUM:-10}
+
+export RAY_ADDRESS="http://127.0.0.1:$DASHBOARD_PORT"
+
+if [ "$NODE_RANK" = "0" ]; then
+PYTHONPATH=./:$PYTHONPATH \
+ray job submit \
+    -- python3 -m openrlhf.cli.train_ppo_ray \
+    --ref_num_nodes $NNODES \
+    --ref_num_gpus_per_node 8 \
+    --actor_num_nodes $NNODES \
+    --actor_num_gpus_per_node 8 \
+    --vllm_num_engines $N_ENGINES \
+    --vllm_tensor_parallel_size $ENGINE_TP \
+    --enforce_eager \
+    --pretrain ${TOKENIZER_PATH} \
+    --remote_rm_url http://localhost:${REWARD_PORT}/get_reward \
+    --save_path results/$SAVE_MODEL_NAME \
+    --ckpt_path results/$SAVE_MODEL_NAME \
+    --micro_train_batch_size 1 \
+    --train_batch_size ${TBS} \
+    --micro_rollout_batch_size 1 \
+    --rollout_batch_size ${RBS} \
+    --advantage_estimator group_norm \
+    --use_dapo_trainer \
+    --dapo_dynamic_sampling \
+    --rollout_target_size ${R_TARGET_SIZE} \
+    --max_num_gen_batches ${MAX_GEN_BATCH} \
+    --max_samples 100000 \
+    --max_epochs 1 \
+    --num_episodes ${EPISODE} \
+    --num_train_steps ${TRAIN_STEP} \
+    --lr_warmup_ratio ${WARMUP} \
+    --n_samples_per_prompt $N_SAMPLES \
+    --prompt_max_len 20480 \
+    --generate_max_len $MAX_LENGTH \
+    --zero_stage 3 \
+    --bf16 \
+    --actor_learning_rate $LR \
+    --critic_learning_rate 9e-6 \
+    --actor_lr_schedule $LR_SCHEDULE \
+    --init_kl_coef $KL \
+    --kl_loss_coef $KL \
+    --kl_penalty_type $KL_TYPE \
+    --not_normalize_advantage \
+    --prompt_data $DATA_PATH \
+    --simple_load_dataset \
+    --packing_samples \
+    --flash_attn \
+    --gradient_checkpointing \
+    --save_steps 1 \
+    --save_hf_model \
+    --wandb_run_name $SAVE_MODEL_NAME \
+    --use_tensorboard tb_log \
+    --vllm_sync_backend nccl \
+    --max_ckpt_num $MAX_CKPT_NUM \
+    --group_method normal \
+    --use_length_reward_in_efficiency \
+    --temperature $TEMP \
+    --top_p $TOP_P \
+    --frequency_penalty $FREQ_PEN \
+    --overlap_comm \
+    --train_agent \
+    --task_group_distributed \
+    --num_distributed_groups $N_GROUPS \
+    --data_gather_redistribute \
+    --env_type osworld \
+    --env_url $ENV_URL \
+    --env_manager_port $ENV_MANAGER_PORT \
+    --action_space pyautogui \
+    --observation_type screenshot \
+    --agent_max_steps 15 \
+    --save_trajectory \
+    --agent_type uitars \
+    --num_history 5 \
+    --num_input_image 5 \
+    --use_llm_evaluator \
+    --api_type $API_TYPE \
+    --api_model $API_MODEL \
+    --api_base_url $API_BASE_URL \
+    --api_key $API_KEY \
+    --eval_prompt_file $EVAL_PROMPT_FILE \
+    --load_checkpoint \
+    --colocate_all_models \
+    --vllm_enable_sleep \
+    --vllm_gpu_memory_utilization 0.6 \
+    --deepspeed_enable_sleep \
+    ${PY_ARGS}
+fi
\ No newline at end of file
diff --git a/scripts/train/srun.sh b/scripts/train/srun.sh
new file mode 100644
index 0000000..2e54b12
--- /dev/null
+++ b/scripts/train/srun.sh
@@ -0,0 +1,21 @@
+SCRIPT_DIR=$(dirname "$(readlink -f "$0")")
+
+#############################
+JOBID=$1
+NODELIST=$2
+NNODES=${NNODES:-4}
+GPUS_PER_TASK=8
+CPUS_PER_TASK=96
+SCRIPT=$SCRIPT_DIR/train.sh
+#############################
+
+export GPUS=$((GPUS_PER_TASK * NNODES))
+
+srun --jobid=${JOBID} \
+    --nodelist=${NODELIST} \
+    --ntasks ${NNODES} \
+    --ntasks-per-node 1 \
+    --gpus-per-task ${GPUS_PER_TASK} \
+    --cpus-per-task ${CPUS_PER_TASK} \
+    --kill-on-bad-exit=1 \
+    bash $SCRIPT
\ No newline at end of file
diff --git a/scripts/train/train.sh b/scripts/train/train.sh
new file mode 100644
index 0000000..69bddca
--- /dev/null
+++ b/scripts/train/train.sh
@@ -0,0 +1,109 @@
+set -ex
+
+ROOT=$PWD
+cd $(dirname $0)
+
+WORLD_SIZE=$((SLURM_NTASKS))
+RANK=$((SLURM_PROCID))
+MASTER_ADDR=$(scontrol show hostname ${SLURM_STEP_NODELIST} | head -n1)
+MASTER_ADDR=$(echo $MASTER_ADDR | cut -d '-' -f 3-6 | tr '-' '.')
+MASTER_PORT=29500
+echo $MASTER_ADDR
+echo $MASTER_PORT
+echo $WORLD_SIZE
+echo $RANK
+export DASHBOARD_PORT=10000
+
+export NCCL_P2P_LEVEL=NVL
+export PYTHONPATH=$ROOT:$PYTHONPATH
+# export HF_HOME=$(realpath $PWD/../../cache/huggingface/)
+
+num_nodes=$WORLD_SIZE
+start_time=$(date +%Y%m%d%H%M)
+
+# num_nodes has to be at least 1
+if [ $num_nodes -lt 1 ]; then
+    echo "Number of nodes must be at least 1"
+    exit 1
+fi
+
+# rank 0 acts as the Ray head node; all other ranks join as workers
+if [[ $RANK -eq 0 ]]; then
+    node_role="master"
+else
+    node_role="worker"
+fi
+head_node_ip=$MASTER_ADDR
+
+script="grpo.sh"
+
+# logging
+N_SAMPLES=${N_SAMPLES:-64}
+TBS=${TBS:-2048}
+RBS=${RBS:-1}
+R_TARGET_SIZE=${R_TARGET_SIZE:-2048}
+KL=${KL:-1e-1}
+LR=${LR:-2e-6}
+TEMP=${TEMP:-0.5}
+EXP_FLAG=${EXP_FLAG:-""}
+POLICY_TYPE=${POLICY_TYPE:-"ppo"}
+KL_TYPE=${KL_TYPE:-"low_var_kl"}
+LOG_DIR=log/${EXP_FLAG}-kl_${KL_TYPE}_${KL}-rbs_${RBS}-sample_${N_SAMPLES}-rtarget_${R_TARGET_SIZE}-tbs_${TBS}-lr_${LR}-temp_${TEMP}
+mkdir -p $(dirname $0)/$LOG_DIR
+
+wait_time=15
+if [ "$node_role" == "master" ]; then
+    echo "Starting Ray head node..."
+    # Start Ray on this node as the head node.
+    # `ray start --head` prints the address workers should join, but here the head
+    # address is taken from the Slurm node list and the port is statically assigned.
+    ray start --head --dashboard-host 0.0.0.0 --port=6379 --ray-debugger-external --dashboard-port=$DASHBOARD_PORT --resources '{"COMPUTE": 100000000000000.0, "HEAD": 100000000000000.0}'
+    sleep $wait_time
+elif [ "$node_role" == "worker" ]; then
+    sleep $wait_time
+    attempt=1
+    echo "Starting Ray worker node and attempting to connect to the head node at $head_node_ip:6379"
+    while true; do
+        # Attempt to start Ray and connect to the head node
+        ray start --address="$head_node_ip:6379" --dashboard-port=$DASHBOARD_PORT --resources '{"COMPUTE": 100000000000000.0, "virtual_cluster_default": 100000000000000.0}' && break || {
+            if [ $attempt -le 5 ]; then
+                echo "Ray worker start attempt $attempt failed. Retrying in $wait_time seconds..."
+                ((attempt++))
+                sleep $wait_time
+            else
+                echo "Failed to connect to the head node after $attempt attempts. Exiting."
+                exit 1
+            fi
+        }
+    done
+fi
+# run the training script once Ray has been started on all nodes
+sleep $wait_time
+if [ "$node_role" == "master" ]; then
+    num_active_ray_nodes=$(ray list nodes | grep ALIVE | wc -l)
+    echo "Number of active Ray nodes: $num_active_ray_nodes"
+    if [ $num_active_ray_nodes -lt $num_nodes ]; then
+        echo "Waiting for all Ray nodes to start..."
+        attempt=1
+        while true; do
+            num_active_ray_nodes=$(ray list nodes | grep ALIVE | wc -l)
+            if [ $num_active_ray_nodes -eq $num_nodes ]; then
+                break
+            elif [ $attempt -le 5 ]; then
+                echo "Only $num_active_ray_nodes/$num_nodes Ray nodes are alive (attempt $attempt). Retrying in $wait_time seconds..."
+                ((attempt++))
+                sleep $wait_time
+            else
+                echo "Not all Ray nodes became active after $attempt attempts. Exiting."
+                exit 1
+            fi
+        done
+    fi
+    echo "End starting"
+    # python examples/scripts/test_ray.py
+    bash $(dirname $0)/${script} $RANK 2>&1 | tee $LOG_DIR/grpo_ray_${num_nodes}_${node_role}_${RANK}.log
+else
+    echo "End starting"
+    bash $(dirname $0)/${script} $RANK 2>&1 | tee $LOG_DIR/grpo_ray_${num_nodes}_${node_role}_${RANK}.log
+    sleep infinity
+fi
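
Putting the scripts above together, here is a minimal launch sketch under Slurm. All concrete values are placeholders rather than values taken from this patch: it assumes an existing multi-node allocation plus already-running OSWorld environment and vLLM evaluator servers, exports the variables that `grpo.sh` reads from the environment, and hands the allocation's job ID and node list to `scripts/train/srun.sh`, which runs `train.sh` on every node to bring up Ray and finally launches `grpo.sh` from rank 0.

```bash
# Hypothetical example; adjust every value to your cluster and servers.
# Assumes a 4-node Slurm allocation with job ID <JOBID> spanning nodes <NODELIST>.
export NNODES=4
export DATA_PATH=./data/osworld_test_all.jsonl        # training tasks
export TOKENIZER_PATH=/path/to/model                   # backbone model to fine-tune
export ENV_URL="http://<env-server-ip>"                # OSWorld environment server(s), comma-separated
export ENV_MANAGER_PORT=10001                          # environment manager port
export API_BASE_URL="http://<vllm-server-ip>:<port>"   # vLLM endpoint used by the LLM evaluator

bash scripts/train/srun.sh <JOBID> <NODELIST>
```

The exported variables reach `grpo.sh` through `srun`'s default environment propagation; the other knobs in `grpo.sh` (KL, LR, TBS, and so on) can be overridden the same way.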