-
Notifications
You must be signed in to change notification settings - Fork 90
Description
作者你好,我遇到了如下2个问题,想请教一下您:
/mnt/nvme/yfyuan/wangxiao/simplevla/LIBERO/libero/libero/../datasets does not exist!
(WorkerDict pid=4482) Saved rollout MP4 at path ./rollouts/libero_goal/step=0--task=libero_goal_task_0_trial_7--success=False--ran=5847.mp4
(WorkerDict pid=4482) Saved rollout MP4 at path ./rollouts/libero_goal/step=0--task=libero_goal_task_3_trial_5--success=True--ran=3310.mp4
(WorkerDict pid=4482) Saved rollout MP4 at path ./rollouts/libero_goal/step=0--task=libero_goal_task_3_trial_24--success=False--ran=9152.mp4
(WorkerDict pid=4482) Saved rollout MP4 at path ./rollouts/libero_goal/step=0--task=libero_goal_task_8_trial_8--success=False--ran=6695.mp4
(WorkerDict pid=4482) Saved rollout MP4 at path ./rollouts/libero_goal/step=0--task=libero_goal_task_3_trial_23--success=True--ran=9779.mp4
(WorkerDict pid=4482) Saved rollout MP4 at path ./rollouts/libero_goal/step=0--task=libero_goal_task_9_trial_10--success=False--ran=6578.mp4
(WorkerDict pid=4482) Saved rollout MP4 at path ./rollouts/libero_goal/step=0--task=libero_goal_task_5_trial_21--success=True--ran=755.mp4
(WorkerDict pid=4482) Saved rollout MP4 at path ./rollouts/libero_goal/step=0--task=libero_goal_task_9_trial_39--success=True--ran=5255.mp4
(main_task pid=4227) validation generation end
(main_task pid=45136) gen: 478.6 seconds
Error executing job with overrides: ['data.task_suite_name=libero_goal', 'data.num_trials_per_task=50', 'data.n_samples=8', 'data.filter_accuracy=True', 'data.accuracy_lower_bound=0.1', 'data.accuracy_upper_bound=0.9', 'data.oversample_factor=1', 'data.train_batch_size=8', 'data.val_batch_size=16', 'data.max_prompt_length=256', 'data.max_response_length=128', 'actor_rollout_ref.model.path=/mnt/nvme/yfyuan/wangxiao/models/simplevla/Openvla-oft-SFT-libero-goal-traj1', 'actor_rollout_ref.model.vla=openvla-oft', 'actor_rollout_ref.model.action_token_len=7', 'actor_rollout_ref.model.action_chunks_len=8', 'actor_rollout_ref.actor.optim.lr=5e-6', 'actor_rollout_ref.actor.optim.warmup_style=constant', 'actor_rollout_ref.actor.ppo_mini_batch_size=128', 'actor_rollout_ref.actor.ppo_micro_batch_size=1', 'actor_rollout_ref.actor.use_dynamic_bsz=False', 'actor_rollout_ref.actor.fsdp_config.param_offload=False', 'actor_rollout_ref.actor.fsdp_config.grad_offload=False', 'actor_rollout_ref.actor.fsdp_config.optimizer_offload=False', 'actor_rollout_ref.actor.grad_clip=1', 'actor_rollout_ref.actor.clip_ratio_high=0.28', 'actor_rollout_ref.actor.clip_ratio_low=0.2', 'actor_rollout_ref.actor.num_images_in_input=1', 'actor_rollout_ref.actor.traj_mini_batch_size=16', 'actor_rollout_ref.model.enable_gradient_checkpointing=False', 'actor_rollout_ref.model.use_remove_padding=False', 'actor_rollout_ref.actor.entropy_coeff=0.', 'actor_rollout_ref.rollout.num_images_in_input=1', 'actor_rollout_ref.rollout.use_proprio=False', 'actor_rollout_ref.rollout.val_micro_batch_size=8', 'actor_rollout_ref.rollout.temperature=1.6', 'actor_rollout_ref.rollout.experiment_name=libero_goal', 'actor_rollout_ref.rollout.micro_batch_size=1', 'actor_rollout_ref.rollout.unnorm_key=libero_goal', 'actor_rollout_ref.rollout.model_family=openvla', 'actor_rollout_ref.rollout.task_suite_name=libero_goal', 'actor_rollout_ref.rollout.num_steps_wait=10', 
'actor_rollout_ref.rollout.pretrained_checkpoint=/mnt/nvme/yfyuan/wangxiao/models/simplevla/Openvla-oft-SFT-libero-goal-traj1', 'actor_rollout_ref.rollout.center_crop=True', 'actor_rollout_ref.rollout.max_prompt_length=512', 'actor_rollout_ref.rollout.log_prob_micro_batch_size=32', 'actor_rollout_ref.rollout.tensor_model_parallel_size=1', 'actor_rollout_ref.rollout.name=hf', 'actor_rollout_ref.rollout.gpu_memory_utilization=0.9', 'actor_rollout_ref.ref.log_prob_micro_batch_size=32', 'actor_rollout_ref.ref.fsdp_config.param_offload=True', 'algorithm.kl_ctrl.kl_coef=0.00', 'trainer.logger=[console]', 'trainer.project_name=SimpleVLA-RL', 'trainer.experiment_name=libero_goal', 'trainer.default_local_dir=/mnt/nvme/yfyuan/wangxiao/simplevla/checkpoints/SimpleVLA-RL/libero_goal', 'trainer.n_gpus_per_node=1', 'trainer.nnodes=1', 'trainer.save_freq=25', 'trainer.test_freq=4', 'trainer.total_epochs=100', 'trainer.val_only=False', 'algorithm.adv_estimator=grpo', 'algorithm.adv_params.verifier_gamma=1.0', 'algorithm.adv_params.reward_model_gamma=1.0', 'trainer.runtime_env=/mnt/pami203/yfyuan/wangxiao/simplevla/SimpleVLA-RL/align.json', 'trainer.wandb_mode=disabled', 'trainer.val_before_train=False']
Traceback (most recent call last):
File "/opt/conda/envs/simplevla/lib/python3.10/runpy.py", line 196, in _run_module_as_main
return _run_code(code, main_globals, None,
File "/opt/conda/envs/simplevla/lib/python3.10/runpy.py", line 86, in _run_code
exec(code, run_globals)
File "/mnt/nvme/yfyuan/wangxiao/simplevla/SimpleVLA-RL/verl/trainer/main_ppo.py", line 212, in <module>
main()
File "/opt/conda/envs/simplevla/lib/python3.10/site-packages/hydra/main.py", line 94, in decorated_main
_run_hydra(
File "/opt/conda/envs/simplevla/lib/python3.10/site-packages/hydra/_internal/utils.py", line 394, in _run_hydra
_run_app(
File "/opt/conda/envs/simplevla/lib/python3.10/site-packages/hydra/_internal/utils.py", line 457, in _run_app
run_and_report(
File "/opt/conda/envs/simplevla/lib/python3.10/site-packages/hydra/_internal/utils.py", line 223, in run_and_report
raise ex
File "/opt/conda/envs/simplevla/lib/python3.10/site-packages/hydra/_internal/utils.py", line 220, in run_and_report
return func()
File "/opt/conda/envs/simplevla/lib/python3.10/site-packages/hydra/_internal/utils.py", line 458, in <lambda>
lambda: hydra.run(
File "/opt/conda/envs/simplevla/lib/python3.10/site-packages/hydra/_internal/hydra.py", line 132, in run
_ = ret.return_value
File "/opt/conda/envs/simplevla/lib/python3.10/site-packages/hydra/core/utils.py", line 260, in return_value
raise self._return_value
File "/opt/conda/envs/simplevla/lib/python3.10/site-packages/hydra/core/utils.py", line 186, in run_job
ret.return_value = task_function(task_cfg)
File "/mnt/nvme/yfyuan/wangxiao/simplevla/SimpleVLA-RL/verl/trainer/main_ppo.py", line 116, in main
ray.get(main_task.remote(config))
File "/opt/conda/envs/simplevla/lib/python3.10/site-packages/ray/_private/auto_init_hook.py", line 22, in auto_init_wrapper
return fn(*args, **kwargs)
File "/opt/conda/envs/simplevla/lib/python3.10/site-packages/ray/_private/client_mode_hook.py", line 104, in wrapper
return func(*args, **kwargs)
File "/opt/conda/envs/simplevla/lib/python3.10/site-packages/ray/_private/worker.py", line 2972, in get
values, debugger_breakpoint = worker.get_objects(
File "/opt/conda/envs/simplevla/lib/python3.10/site-packages/ray/_private/worker.py", line 1031, in get_objects
raise value.as_instanceof_cause()
ray.exceptions.RayTaskError(ValueError): ray::main_task() (pid=45136, ip=10.244.136.198)
File "/mnt/nvme/yfyuan/wangxiao/simplevla/SimpleVLA-RL/verl/trainer/main_ppo.py", line 208, in main_task
trainer.fit()
File "/mnt/nvme/yfyuan/wangxiao/simplevla/SimpleVLA-RL/verl/trainer/ppo/ray_trainer.py", line 549, in fit
gen_batch_output = self.actor_rollout_wg.generate_sequences(prompts=gen_batch)
File "/mnt/nvme/yfyuan/wangxiao/simplevla/SimpleVLA-RL/verl/single_controller/ray/base.py", line 42, in func
output = ray.get(output)
ray.exceptions.RayTaskError(ValueError): ray::WorkerDict.actor_rollout_generate_sequences() (pid=45392, ip=10.244.136.198, actor_id=b2ad906862dc93628f67b31201000000, repr=<verl.single_controller.ray.base.WorkerDict object at 0x7f070e09a7a0>)
File "/mnt/nvme/yfyuan/wangxiao/simplevla/SimpleVLA-RL/verl/single_controller/ray/base.py", line 397, in func
return getattr(self.worker_dict[key], name)(*args, **kwargs)
File "/mnt/nvme/yfyuan/wangxiao/simplevla/SimpleVLA-RL/verl/single_controller/base/decorator.py", line 404, in inner
return func(*args, **kwargs)
File "/mnt/nvme/yfyuan/wangxiao/simplevla/SimpleVLA-RL/verl/workers/fsdp_workers.py", line 523, in generate_sequences
old_log_probs = self.actor.compute_log_prob(data=output)
File "/mnt/nvme/yfyuan/wangxiao/simplevla/SimpleVLA-RL/verl/workers/actor/dp_rob.py", line 389, in compute_log_prob
_, log_probs = self._forward_micro_batch(micro_batch, temperature=temperature)
File "/mnt/nvme/yfyuan/wangxiao/simplevla/SimpleVLA-RL/verl/workers/actor/dp_rob.py", line 143, in _forward_micro_batch
input_ids_unpad, _ = self.process_tensor(input_ids, self.pad_token_id)
File "/mnt/nvme/yfyuan/wangxiao/simplevla/SimpleVLA-RL/verl/workers/actor/dp_rob.py", line 61, in process_tensor
raise ValueError("Padding error!")
ValueError: Padding error!