From 1303a69bb235671a76fb02d5a4ea7ca57469ede4 Mon Sep 17 00:00:00 2001 From: DDVD Date: Mon, 11 Aug 2025 14:54:55 -0400 Subject: [PATCH 001/232] Adapt to Our Datasets (#1) --- .../run_qwen2_5_vl-7b_climb_no_thinking.sh | 54 ++ examples/format_prompt/README.md | 63 ++ examples/format_prompt/default.jinja | 1 + examples/format_prompt/no_thinking.jinja | 1 + .../generation/run_deepseek7b_mutli_node.sh | 0 .../generation/run_deepseek_v2_lite_math.sh | 0 examples/gmpo_trainer/run_qwen2_5-7b_math.sh | 0 examples/gmpo_trainer/test_dapo_7b_math.sh | 0 .../gmpo_trainer/test_dapo_qwen3_30b_math.sh | 0 .../run_deepseek671b_math_megatron.sh | 0 examples/grpo_trainer/run_deepseek7b_llm.sh | 0 .../grpo_trainer/run_deepseek7b_llm_math.sh | 0 .../run_deepseek7b_llm_math_megatron.sh | 0 .../run_deepseek7b_llm_seq_balance.sh | 0 examples/grpo_trainer/run_minicpmo2_6.sh | 0 .../run_moonlight16b_math_megatron.sh | 0 examples/grpo_trainer/run_qwen2-7b.sh | 0 examples/grpo_trainer/run_qwen2-7b_math.sh | 0 .../run_qwen2-7b_math_megatron.sh | 0 .../grpo_trainer/run_qwen2-7b_seq_balance.sh | 0 .../run_qwen2-7b_seq_balance_math_megatron.sh | 0 .../grpo_trainer/run_qwen2-7b_sgl_megatron.sh | 0 .../run_qwen2_5-3b_gsm8k_grpo_lora.sh | 0 .../run_qwen2_5-7b_math_megatron_diff_tp.sh | 0 .../grpo_trainer/run_qwen2_5_32b_grpo_npu.sh | 0 .../run_qwen2_5_7b_grpo_discrete_prof_npu.sh | 0 .../run_qwen2_5_7b_grpo_e2e_prof_npu.sh | 0 .../grpo_trainer/run_qwen2_5_7b_grpo_npu.sh | 0 .../run_qwen2_5_vl-7b-megatron.sh | 0 examples/grpo_trainer/run_qwen2_5_vl-7b.sh | 0 .../grpo_trainer/run_qwen2_5_vl-7b_climb.sh | 54 ++ .../grpo_trainer/run_qwen2_5_vl-7b_lora.sh | 0 .../run_qwen2_5_vl-7b_seq_balance.sh | 0 .../grpo_trainer/run_qwen2_5_vl_32b_npu.sh | 0 .../grpo_trainer/run_qwen2_5_vl_3b_npu.sh | 0 .../grpo_trainer/run_qwen2_5_vl_7b_npu.sh | 0 .../grpo_trainer/run_qwen3-236b_megatron.sh | 0 examples/grpo_trainer/run_qwen3-8b.sh | 0 .../grpo_trainer/run_qwen3moe-30b_megatron.sh | 0 examples/ppo_trainer/run_deepseek7b_llm.sh | 0 .../run_deepseek7b_llm_modelscope.sh | 0 .../ppo_trainer/run_deepseek7b_llm_pfppo.sh | 0 .../run_deepseek7b_llm_sandbox_fusion.sh | 0 .../ppo_trainer/run_deepseek7b_llm_sp2.sh | 0 .../ppo_trainer/run_deepseek_full_hh_rlhf.sh | 0 .../run_deepseek_math_gsm8k_megatron.sh | 0 .../run_deepseek_math_gsm8k_megatron_nsys.sh | 0 examples/ppo_trainer/run_gemma.sh | 0 .../run_moonlight16b_a3b_gsm8k_megatron.sh | 0 .../run_qwen1.5_moe_a2.7b-gsm8k_megatron.sh | 0 .../run_qwen2-7b_math_gsm8k_megatron.sh | 0 examples/ppo_trainer/run_qwen2-7b_rm.sh | 0 .../run_qwen2-7b_rm_seq_balance.sh | 0 ...n_qwen2-7b_rm_seq_balance_fused_kernels.sh | 0 .../run_qwen2-7b_rm_seq_balance_nsys.sh | 0 .../ppo_trainer/run_qwen2-7b_seq_balance.sh | 0 .../run_qwen2-7b_sglang_seq_balance.sh | 0 examples/ppo_trainer/run_qwen2.5-32b.sh | 0 .../run_qwen2-7b_math_rf.sh | 0 .../run_qwen2-7b_math_rf_baseline.sh | 0 .../run_qwen2.5-3b_seq_balance.sh | 0 .../run_qwen2.5-7b_seq_balance.sh | 0 examples/reward_function/dapo.py | 163 ++++++ examples/reward_function/evaluation.py | 552 ++++++++++++++++++ examples/reward_function/math.py | 49 ++ examples/reward_function/medical.py | 460 +++++++++++++++ examples/reward_function/r1v.py | 50 ++ examples/rloo_trainer/run_qwen2-7b.sh | 0 examples/sft/gsm8k/run_deepseek_6b7.sh | 0 examples/sft/gsm8k/run_gemma_2b.sh | 0 examples/sft/gsm8k/run_gemma_7b.sh | 0 .../gsm8k/run_qwen2_5_05b_sft_peft_sp2_npu.sh | 0 examples/sft/gsm8k/run_qwen_05_peft.sh | 0 examples/sft/gsm8k/run_qwen_05_sp2.sh | 0 
examples/sft/gsm8k/run_qwen_05_sp2_liger.sh | 0 examples/sft/multiturn/run_qwen_05_sp2.sh | 0 .../geo3k/run_qwen2.5-3b_geo3k_multiturn.sh | 0 .../run_qwen2.5-3b_geo3k_multiturn_4xgpu.sh | 0 ...run_qwen2.5-3b_megatron_geo3k_multiturn.sh | 0 ...n2.5-0.5b_gsm8k_multiturn_w_interaction.sh | 0 .../run_qwen2.5-3b_gsm8k_multiturn.sh | 0 .../run_qwen2.5-3b_gsm8k_multiturn_4xgpu.sh | 0 .../run_qwen2.5-3b_gsm8k_tool_agent_mlflow.sh | 0 ...run_qwen2.5-3b_megatron_gsm8k_multiturn.sh | 0 .../run_qwen2_3b_dapo_multiturn.sh | 0 ...un_qwen2.5-3b_instruct_search_multiturn.sh | 0 .../split_placement/run_deepseek7b_llm.sh | 0 .../qwen2-0.5b_grpo-lora_1_h100_fsdp_vllm.sh | 0 .../qwen2-1.5b_grpo-lora_1_h100_fsdp_vllm.sh | 0 .../qwen2-14b_grpo-lora_2_h100_fsdp_vllm.sh | 0 .../14b/qwen2_14b_grpo_4_h800_fsdp_vllm.sh | 0 .../qwen2-32b_grpo-lora_4_h100_fsdp_vllm.sh | 0 .../32b/qwen2_32B_grpo_8_h20_megatron_vllm.sh | 0 .../3b/qwen2-3b_grpo-lora_1_h100_fsdp_vllm.sh | 0 .../70b/qwen2-70b_grpo_32_h20_fsdp_vllm.sh | 0 .../70b/qwen2-70b_grpo_32_h800_fsdp_vllm.sh | 0 .../qwen2-72b_grpo-lora_8_h100_fsdp_vllm.sh | 0 .../7b/qwen2-7b_grpo-lora_1_h100_fsdp_vllm.sh | 0 .../7b/qwen2-7b_grpo_2_h800_fsdp_vllm.sh | 0 scripts/process_mosei_annotations.py | 74 +++ .../_generated_ppo_megatron_trainer.yaml | 1 + .../config/_generated_ppo_trainer.yaml | 1 + verl/trainer/config/data/legacy_data.yaml | 5 + verl/trainer/ppo/core_algos.py | 180 +++++- verl/trainer/ppo/ray_trainer.py | 110 +++- verl/utils/dataset/rl_dataset.py | 130 ++++- verl/utils/dataset/vision_utils.py | 23 +- verl/workers/actor/dp_actor.py | 4 +- 108 files changed, 1942 insertions(+), 33 deletions(-) create mode 100755 examples/drpo_trainer/run_qwen2_5_vl-7b_climb_no_thinking.sh create mode 100644 examples/format_prompt/README.md create mode 100644 examples/format_prompt/default.jinja create mode 100644 examples/format_prompt/no_thinking.jinja mode change 100644 => 100755 examples/generation/run_deepseek7b_mutli_node.sh mode change 100644 => 100755 examples/generation/run_deepseek_v2_lite_math.sh mode change 100644 => 100755 examples/gmpo_trainer/run_qwen2_5-7b_math.sh mode change 100644 => 100755 examples/gmpo_trainer/test_dapo_7b_math.sh mode change 100644 => 100755 examples/gmpo_trainer/test_dapo_qwen3_30b_math.sh mode change 100644 => 100755 examples/grpo_trainer/run_deepseek671b_math_megatron.sh mode change 100644 => 100755 examples/grpo_trainer/run_deepseek7b_llm.sh mode change 100644 => 100755 examples/grpo_trainer/run_deepseek7b_llm_math.sh mode change 100644 => 100755 examples/grpo_trainer/run_deepseek7b_llm_math_megatron.sh mode change 100644 => 100755 examples/grpo_trainer/run_deepseek7b_llm_seq_balance.sh mode change 100644 => 100755 examples/grpo_trainer/run_minicpmo2_6.sh mode change 100644 => 100755 examples/grpo_trainer/run_moonlight16b_math_megatron.sh mode change 100644 => 100755 examples/grpo_trainer/run_qwen2-7b.sh mode change 100644 => 100755 examples/grpo_trainer/run_qwen2-7b_math.sh mode change 100644 => 100755 examples/grpo_trainer/run_qwen2-7b_math_megatron.sh mode change 100644 => 100755 examples/grpo_trainer/run_qwen2-7b_seq_balance.sh mode change 100644 => 100755 examples/grpo_trainer/run_qwen2-7b_seq_balance_math_megatron.sh mode change 100644 => 100755 examples/grpo_trainer/run_qwen2-7b_sgl_megatron.sh mode change 100644 => 100755 examples/grpo_trainer/run_qwen2_5-3b_gsm8k_grpo_lora.sh mode change 100644 => 100755 examples/grpo_trainer/run_qwen2_5-7b_math_megatron_diff_tp.sh mode change 100644 => 100755 
examples/grpo_trainer/run_qwen2_5_32b_grpo_npu.sh mode change 100644 => 100755 examples/grpo_trainer/run_qwen2_5_7b_grpo_discrete_prof_npu.sh mode change 100644 => 100755 examples/grpo_trainer/run_qwen2_5_7b_grpo_e2e_prof_npu.sh mode change 100644 => 100755 examples/grpo_trainer/run_qwen2_5_7b_grpo_npu.sh mode change 100644 => 100755 examples/grpo_trainer/run_qwen2_5_vl-7b-megatron.sh mode change 100644 => 100755 examples/grpo_trainer/run_qwen2_5_vl-7b.sh create mode 100755 examples/grpo_trainer/run_qwen2_5_vl-7b_climb.sh mode change 100644 => 100755 examples/grpo_trainer/run_qwen2_5_vl-7b_lora.sh mode change 100644 => 100755 examples/grpo_trainer/run_qwen2_5_vl-7b_seq_balance.sh mode change 100644 => 100755 examples/grpo_trainer/run_qwen2_5_vl_32b_npu.sh mode change 100644 => 100755 examples/grpo_trainer/run_qwen2_5_vl_3b_npu.sh mode change 100644 => 100755 examples/grpo_trainer/run_qwen2_5_vl_7b_npu.sh mode change 100644 => 100755 examples/grpo_trainer/run_qwen3-236b_megatron.sh mode change 100644 => 100755 examples/grpo_trainer/run_qwen3-8b.sh mode change 100644 => 100755 examples/grpo_trainer/run_qwen3moe-30b_megatron.sh mode change 100644 => 100755 examples/ppo_trainer/run_deepseek7b_llm.sh mode change 100644 => 100755 examples/ppo_trainer/run_deepseek7b_llm_modelscope.sh mode change 100644 => 100755 examples/ppo_trainer/run_deepseek7b_llm_pfppo.sh mode change 100644 => 100755 examples/ppo_trainer/run_deepseek7b_llm_sandbox_fusion.sh mode change 100644 => 100755 examples/ppo_trainer/run_deepseek7b_llm_sp2.sh mode change 100644 => 100755 examples/ppo_trainer/run_deepseek_full_hh_rlhf.sh mode change 100644 => 100755 examples/ppo_trainer/run_deepseek_math_gsm8k_megatron.sh mode change 100644 => 100755 examples/ppo_trainer/run_deepseek_math_gsm8k_megatron_nsys.sh mode change 100644 => 100755 examples/ppo_trainer/run_gemma.sh mode change 100644 => 100755 examples/ppo_trainer/run_moonlight16b_a3b_gsm8k_megatron.sh mode change 100644 => 100755 examples/ppo_trainer/run_qwen1.5_moe_a2.7b-gsm8k_megatron.sh mode change 100644 => 100755 examples/ppo_trainer/run_qwen2-7b_math_gsm8k_megatron.sh mode change 100644 => 100755 examples/ppo_trainer/run_qwen2-7b_rm.sh mode change 100644 => 100755 examples/ppo_trainer/run_qwen2-7b_rm_seq_balance.sh mode change 100644 => 100755 examples/ppo_trainer/run_qwen2-7b_rm_seq_balance_fused_kernels.sh mode change 100644 => 100755 examples/ppo_trainer/run_qwen2-7b_rm_seq_balance_nsys.sh mode change 100644 => 100755 examples/ppo_trainer/run_qwen2-7b_seq_balance.sh mode change 100644 => 100755 examples/ppo_trainer/run_qwen2-7b_sglang_seq_balance.sh mode change 100644 => 100755 examples/ppo_trainer/run_qwen2.5-32b.sh mode change 100644 => 100755 examples/reinforce_plus_plus_trainer/run_qwen2-7b_math_rf.sh mode change 100644 => 100755 examples/reinforce_plus_plus_trainer/run_qwen2-7b_math_rf_baseline.sh mode change 100644 => 100755 examples/remax_trainer/run_qwen2.5-3b_seq_balance.sh mode change 100644 => 100755 examples/remax_trainer/run_qwen2.5-7b_seq_balance.sh create mode 100644 examples/reward_function/dapo.py create mode 100644 examples/reward_function/evaluation.py create mode 100644 examples/reward_function/math.py create mode 100644 examples/reward_function/medical.py create mode 100644 examples/reward_function/r1v.py mode change 100644 => 100755 examples/rloo_trainer/run_qwen2-7b.sh mode change 100644 => 100755 examples/sft/gsm8k/run_deepseek_6b7.sh mode change 100644 => 100755 examples/sft/gsm8k/run_gemma_2b.sh mode change 100644 => 100755 
examples/sft/gsm8k/run_gemma_7b.sh mode change 100644 => 100755 examples/sft/gsm8k/run_qwen2_5_05b_sft_peft_sp2_npu.sh mode change 100644 => 100755 examples/sft/gsm8k/run_qwen_05_peft.sh mode change 100644 => 100755 examples/sft/gsm8k/run_qwen_05_sp2.sh mode change 100644 => 100755 examples/sft/gsm8k/run_qwen_05_sp2_liger.sh mode change 100644 => 100755 examples/sft/multiturn/run_qwen_05_sp2.sh mode change 100644 => 100755 examples/sglang_multiturn/geo3k/run_qwen2.5-3b_geo3k_multiturn.sh mode change 100644 => 100755 examples/sglang_multiturn/geo3k/run_qwen2.5-3b_geo3k_multiturn_4xgpu.sh mode change 100644 => 100755 examples/sglang_multiturn/geo3k/run_qwen2.5-3b_megatron_geo3k_multiturn.sh mode change 100644 => 100755 examples/sglang_multiturn/run_qwen2.5-0.5b_gsm8k_multiturn_w_interaction.sh mode change 100644 => 100755 examples/sglang_multiturn/run_qwen2.5-3b_gsm8k_multiturn.sh mode change 100644 => 100755 examples/sglang_multiturn/run_qwen2.5-3b_gsm8k_multiturn_4xgpu.sh mode change 100644 => 100755 examples/sglang_multiturn/run_qwen2.5-3b_gsm8k_tool_agent_mlflow.sh mode change 100644 => 100755 examples/sglang_multiturn/run_qwen2.5-3b_megatron_gsm8k_multiturn.sh mode change 100644 => 100755 examples/sglang_multiturn/run_qwen2_3b_dapo_multiturn.sh mode change 100644 => 100755 examples/sglang_multiturn/search_r1_like/run_qwen2.5-3b_instruct_search_multiturn.sh mode change 100644 => 100755 examples/split_placement/run_deepseek7b_llm.sh mode change 100644 => 100755 examples/tuning/0.5b/qwen2-0.5b_grpo-lora_1_h100_fsdp_vllm.sh mode change 100644 => 100755 examples/tuning/1.5b/qwen2-1.5b_grpo-lora_1_h100_fsdp_vllm.sh mode change 100644 => 100755 examples/tuning/14b/qwen2-14b_grpo-lora_2_h100_fsdp_vllm.sh mode change 100644 => 100755 examples/tuning/14b/qwen2_14b_grpo_4_h800_fsdp_vllm.sh mode change 100644 => 100755 examples/tuning/32b/qwen2-32b_grpo-lora_4_h100_fsdp_vllm.sh mode change 100644 => 100755 examples/tuning/32b/qwen2_32B_grpo_8_h20_megatron_vllm.sh mode change 100644 => 100755 examples/tuning/3b/qwen2-3b_grpo-lora_1_h100_fsdp_vllm.sh mode change 100644 => 100755 examples/tuning/70b/qwen2-70b_grpo_32_h20_fsdp_vllm.sh mode change 100644 => 100755 examples/tuning/70b/qwen2-70b_grpo_32_h800_fsdp_vllm.sh mode change 100644 => 100755 examples/tuning/70b/qwen2-72b_grpo-lora_8_h100_fsdp_vllm.sh mode change 100644 => 100755 examples/tuning/7b/qwen2-7b_grpo-lora_1_h100_fsdp_vllm.sh mode change 100644 => 100755 examples/tuning/7b/qwen2-7b_grpo_2_h800_fsdp_vllm.sh create mode 100644 scripts/process_mosei_annotations.py diff --git a/examples/drpo_trainer/run_qwen2_5_vl-7b_climb_no_thinking.sh b/examples/drpo_trainer/run_qwen2_5_vl-7b_climb_no_thinking.sh new file mode 100755 index 00000000000..87c368ad1a2 --- /dev/null +++ b/examples/drpo_trainer/run_qwen2_5_vl-7b_climb_no_thinking.sh @@ -0,0 +1,54 @@ +set -x +ENGINE=${1:-vllm} + +python3 -m verl.trainer.main_ppo \ + algorithm.adv_estimator=drpo \ + data.train_files=/home/dvdai/orcd/scratch/high_modality/geom_train_upsampled_new.jsonl \ + data.val_files=/home/dvdai/orcd/scratch/high_modality/geom_valid_mini_new.jsonl \ + data.train_batch_size=512 \ + data.max_prompt_length=4096 \ + data.max_response_length=4096 \ + data.filter_overlong_prompts=False \ + data.truncation='left' \ + data.image_key=images \ + data.video_key=videos \ + data.prompt_key=problem \ + data.format_prompt=examples/format_prompt/no_thinking.jinja \ + actor_rollout_ref.model.path=Qwen/Qwen2.5-VL-7B-Instruct \ + actor_rollout_ref.actor.optim.lr=2e-6 \ + 
actor_rollout_ref.model.use_remove_padding=False \
+    actor_rollout_ref.actor.ppo_mini_batch_size=128 \
+    actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=2 \
+    actor_rollout_ref.actor.use_kl_loss=True \
+    actor_rollout_ref.actor.kl_loss_coef=1e-8 \
+    actor_rollout_ref.actor.kl_loss_type=low_var_kl \
+    actor_rollout_ref.actor.entropy_coeff=0 \
+    actor_rollout_ref.actor.ulysses_sequence_parallel_size=1 \
+    actor_rollout_ref.model.enable_gradient_checkpointing=True \
+    actor_rollout_ref.actor.fsdp_config.param_offload=False \
+    actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \
+    actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=4 \
+    actor_rollout_ref.rollout.tensor_model_parallel_size=2 \
+    actor_rollout_ref.rollout.name=$ENGINE \
+    actor_rollout_ref.rollout.engine_kwargs.vllm.disable_mm_preprocessor_cache=True \
+    actor_rollout_ref.rollout.gpu_memory_utilization=0.6 \
+    actor_rollout_ref.rollout.enable_chunked_prefill=False \
+    actor_rollout_ref.rollout.enforce_eager=False \
+    actor_rollout_ref.rollout.free_cache_engine=True \
+    actor_rollout_ref.rollout.n=5 \
+    actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=4 \
+    actor_rollout_ref.ref.fsdp_config.param_offload=True \
+    algorithm.use_kl_in_reward=False \
+    custom_reward_function.path=examples/reward_function/medical.py \
+    custom_reward_function.name=medical_compute_score_batch \
+    reward_model.reward_manager=batch \
+    trainer.critic_warmup=0 \
+    trainer.logger='["console","wandb"]' \
+    trainer.project_name='verl_climb' \
+    trainer.experiment_name='drpo_nothinking' \
+    trainer.n_gpus_per_node=4 \
+    trainer.nnodes=1 \
+    trainer.save_freq=20 \
+    trainer.val_before_train=False \
+    trainer.test_freq=5 \
+    trainer.total_epochs=15 $@
diff --git a/examples/format_prompt/README.md b/examples/format_prompt/README.md
new file mode 100644
index 00000000000..412c5a558e3
--- /dev/null
+++ b/examples/format_prompt/README.md
@@ -0,0 +1,63 @@
+# Format Prompt Templates
+
+This directory contains Jinja2 templates for formatting prompts in RLHF datasets.
+
+## Overview
+
+The format prompt feature allows you to apply custom formatting to each prompt in your dataset using Jinja2 templates. This is useful when you want to add consistent instructions or formatting to all prompts without modifying the original dataset.
+
+## Default Template
+
+The default template (`default.jinja`) appends the following instruction to each prompt:
+
+```
+{{ content }}You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}.
+```
+
+## Usage
+
+To use a format prompt template, specify the `format_prompt` parameter in your data configuration:
+
+```yaml
+data:
+  # ... other data config ...
+  format_prompt: examples/format_prompt/default.jinja  # Path to your template file
+```
+
+Or set it to `null` to disable format prompting:
+
+```yaml
+data:
+  format_prompt: null
+```
+
+## Creating Custom Templates
+
+To create a custom format prompt:
+
+1. Create a new `.jinja` file in this directory or elsewhere
+2. Use `{{ content }}` as the placeholder for the original prompt content
+3. Add your custom formatting around it
+
+Example custom template:
+
+```jinja
+{{ content }}
+
+Please solve this problem step by step:
+1. Understand the problem
+2. Plan your approach
+3. Execute the solution
+4. Verify your answer
+```
+
+## Template Variables
+
+Currently, the template receives one variable:
+- `content`: The original prompt text
+
+## Notes
+
+- The template is applied during dataset preprocessing
+- If the template file is not found, the system will use the original prompt without formatting
+- For multimodal datasets (images/videos), the formatting is applied to text segments only
\ No newline at end of file
diff --git a/examples/format_prompt/default.jinja b/examples/format_prompt/default.jinja
new file mode 100644
index 00000000000..be95b0ef441
--- /dev/null
+++ b/examples/format_prompt/default.jinja
@@ -0,0 +1 @@
+{{ content }}You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}.
\ No newline at end of file
diff --git a/examples/format_prompt/no_thinking.jinja b/examples/format_prompt/no_thinking.jinja
new file mode 100644
index 00000000000..39a137c9384
--- /dev/null
+++ b/examples/format_prompt/no_thinking.jinja
@@ -0,0 +1 @@
+{{ content }}You MUST provide the final answer directly without any extra information. Enclose the final answer in \boxed{}.
\ No newline at end of file
diff --git a/examples/generation/run_deepseek7b_mutli_node.sh b/examples/generation/run_deepseek7b_mutli_node.sh
old mode 100644
new mode 100755
diff --git a/examples/generation/run_deepseek_v2_lite_math.sh b/examples/generation/run_deepseek_v2_lite_math.sh
old mode 100644
new mode 100755
diff --git a/examples/gmpo_trainer/run_qwen2_5-7b_math.sh b/examples/gmpo_trainer/run_qwen2_5-7b_math.sh
old mode 100644
new mode 100755
diff --git a/examples/gmpo_trainer/test_dapo_7b_math.sh b/examples/gmpo_trainer/test_dapo_7b_math.sh
old mode 100644
new mode 100755
diff --git a/examples/gmpo_trainer/test_dapo_qwen3_30b_math.sh b/examples/gmpo_trainer/test_dapo_qwen3_30b_math.sh
old mode 100644
new mode 100755
diff --git a/examples/grpo_trainer/run_deepseek671b_math_megatron.sh b/examples/grpo_trainer/run_deepseek671b_math_megatron.sh
old mode 100644
new mode 100755
diff --git a/examples/grpo_trainer/run_deepseek7b_llm.sh b/examples/grpo_trainer/run_deepseek7b_llm.sh
old mode 100644
new mode 100755
diff --git a/examples/grpo_trainer/run_deepseek7b_llm_math.sh b/examples/grpo_trainer/run_deepseek7b_llm_math.sh
old mode 100644
new mode 100755
diff --git a/examples/grpo_trainer/run_deepseek7b_llm_math_megatron.sh b/examples/grpo_trainer/run_deepseek7b_llm_math_megatron.sh
old mode 100644
new mode 100755
diff --git a/examples/grpo_trainer/run_deepseek7b_llm_seq_balance.sh b/examples/grpo_trainer/run_deepseek7b_llm_seq_balance.sh
old mode 100644
new mode 100755
diff --git a/examples/grpo_trainer/run_minicpmo2_6.sh b/examples/grpo_trainer/run_minicpmo2_6.sh
old mode 100644
new mode 100755
diff --git a/examples/grpo_trainer/run_moonlight16b_math_megatron.sh b/examples/grpo_trainer/run_moonlight16b_math_megatron.sh
old mode 100644
new mode 100755
diff --git a/examples/grpo_trainer/run_qwen2-7b.sh b/examples/grpo_trainer/run_qwen2-7b.sh
old mode 100644
new mode 100755
diff --git a/examples/grpo_trainer/run_qwen2-7b_math.sh b/examples/grpo_trainer/run_qwen2-7b_math.sh
old mode 100644
new mode 100755
diff --git a/examples/grpo_trainer/run_qwen2-7b_math_megatron.sh b/examples/grpo_trainer/run_qwen2-7b_math_megatron.sh
old mode 100644
new mode 100755
diff --git a/examples/grpo_trainer/run_qwen2-7b_seq_balance.sh b/examples/grpo_trainer/run_qwen2-7b_seq_balance.sh
old mode 100644
new
mode 100755 diff --git a/examples/grpo_trainer/run_qwen2-7b_seq_balance_math_megatron.sh b/examples/grpo_trainer/run_qwen2-7b_seq_balance_math_megatron.sh old mode 100644 new mode 100755 diff --git a/examples/grpo_trainer/run_qwen2-7b_sgl_megatron.sh b/examples/grpo_trainer/run_qwen2-7b_sgl_megatron.sh old mode 100644 new mode 100755 diff --git a/examples/grpo_trainer/run_qwen2_5-3b_gsm8k_grpo_lora.sh b/examples/grpo_trainer/run_qwen2_5-3b_gsm8k_grpo_lora.sh old mode 100644 new mode 100755 diff --git a/examples/grpo_trainer/run_qwen2_5-7b_math_megatron_diff_tp.sh b/examples/grpo_trainer/run_qwen2_5-7b_math_megatron_diff_tp.sh old mode 100644 new mode 100755 diff --git a/examples/grpo_trainer/run_qwen2_5_32b_grpo_npu.sh b/examples/grpo_trainer/run_qwen2_5_32b_grpo_npu.sh old mode 100644 new mode 100755 diff --git a/examples/grpo_trainer/run_qwen2_5_7b_grpo_discrete_prof_npu.sh b/examples/grpo_trainer/run_qwen2_5_7b_grpo_discrete_prof_npu.sh old mode 100644 new mode 100755 diff --git a/examples/grpo_trainer/run_qwen2_5_7b_grpo_e2e_prof_npu.sh b/examples/grpo_trainer/run_qwen2_5_7b_grpo_e2e_prof_npu.sh old mode 100644 new mode 100755 diff --git a/examples/grpo_trainer/run_qwen2_5_7b_grpo_npu.sh b/examples/grpo_trainer/run_qwen2_5_7b_grpo_npu.sh old mode 100644 new mode 100755 diff --git a/examples/grpo_trainer/run_qwen2_5_vl-7b-megatron.sh b/examples/grpo_trainer/run_qwen2_5_vl-7b-megatron.sh old mode 100644 new mode 100755 diff --git a/examples/grpo_trainer/run_qwen2_5_vl-7b.sh b/examples/grpo_trainer/run_qwen2_5_vl-7b.sh old mode 100644 new mode 100755 diff --git a/examples/grpo_trainer/run_qwen2_5_vl-7b_climb.sh b/examples/grpo_trainer/run_qwen2_5_vl-7b_climb.sh new file mode 100755 index 00000000000..761abd09784 --- /dev/null +++ b/examples/grpo_trainer/run_qwen2_5_vl-7b_climb.sh @@ -0,0 +1,54 @@ +set -x +ENGINE=${1:-vllm} + +python3 -m verl.trainer.main_ppo \ + algorithm.adv_estimator=grpo \ + data.train_files=/home/dvdai/orcd/scratch/high_modality/geom_train_upsampled_new.jsonl \ + data.val_files=/home/dvdai/orcd/scratch/high_modality/geom_valid_mini_new.jsonl \ + data.train_batch_size=512 \ + data.max_prompt_length=4096 \ + data.max_response_length=4096 \ + data.filter_overlong_prompts=False \ + data.truncation='left' \ + data.image_key=images \ + data.video_key=videos \ + data.prompt_key=problem \ + data.format_prompt=examples/format_prompt/default.jinja \ + actor_rollout_ref.model.path=Qwen/Qwen2.5-VL-7B-Instruct \ + actor_rollout_ref.actor.optim.lr=1e-6 \ + actor_rollout_ref.model.use_remove_padding=False \ + actor_rollout_ref.actor.ppo_mini_batch_size=128 \ + actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=2 \ + actor_rollout_ref.actor.use_kl_loss=True \ + actor_rollout_ref.actor.kl_loss_coef=1e-8 \ + actor_rollout_ref.actor.kl_loss_type=low_var_kl \ + actor_rollout_ref.actor.entropy_coeff=0 \ + actor_rollout_ref.actor.ulysses_sequence_parallel_size=1 \ + actor_rollout_ref.model.enable_gradient_checkpointing=True \ + actor_rollout_ref.actor.fsdp_config.param_offload=False \ + actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \ + actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=4 \ + actor_rollout_ref.rollout.tensor_model_parallel_size=2 \ + actor_rollout_ref.rollout.name=$ENGINE \ + actor_rollout_ref.rollout.engine_kwargs.vllm.disable_mm_preprocessor_cache=True \ + actor_rollout_ref.rollout.gpu_memory_utilization=0.6 \ + actor_rollout_ref.rollout.enable_chunked_prefill=False \ + actor_rollout_ref.rollout.enforce_eager=False \ + 
actor_rollout_ref.rollout.free_cache_engine=True \ + actor_rollout_ref.rollout.n=5 \ + actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=4 \ + actor_rollout_ref.ref.fsdp_config.param_offload=True \ + algorithm.use_kl_in_reward=False \ + custom_reward_function.path=examples/reward_function/medical.py \ + custom_reward_function.name=medical_compute_score_batch \ + reward_model.reward_manager=batch \ + trainer.critic_warmup=0 \ + trainer.logger='["console","wandb"]' \ + trainer.project_name='verl_climb' \ + trainer.experiment_name='qwen2_5_vl_7b_function_rm' \ + trainer.n_gpus_per_node=4 \ + trainer.nnodes=1 \ + trainer.save_freq=20 \ + trainer.val_before_train=False \ + trainer.test_freq=5 \ + trainer.total_epochs=15 $@ diff --git a/examples/grpo_trainer/run_qwen2_5_vl-7b_lora.sh b/examples/grpo_trainer/run_qwen2_5_vl-7b_lora.sh old mode 100644 new mode 100755 diff --git a/examples/grpo_trainer/run_qwen2_5_vl-7b_seq_balance.sh b/examples/grpo_trainer/run_qwen2_5_vl-7b_seq_balance.sh old mode 100644 new mode 100755 diff --git a/examples/grpo_trainer/run_qwen2_5_vl_32b_npu.sh b/examples/grpo_trainer/run_qwen2_5_vl_32b_npu.sh old mode 100644 new mode 100755 diff --git a/examples/grpo_trainer/run_qwen2_5_vl_3b_npu.sh b/examples/grpo_trainer/run_qwen2_5_vl_3b_npu.sh old mode 100644 new mode 100755 diff --git a/examples/grpo_trainer/run_qwen2_5_vl_7b_npu.sh b/examples/grpo_trainer/run_qwen2_5_vl_7b_npu.sh old mode 100644 new mode 100755 diff --git a/examples/grpo_trainer/run_qwen3-236b_megatron.sh b/examples/grpo_trainer/run_qwen3-236b_megatron.sh old mode 100644 new mode 100755 diff --git a/examples/grpo_trainer/run_qwen3-8b.sh b/examples/grpo_trainer/run_qwen3-8b.sh old mode 100644 new mode 100755 diff --git a/examples/grpo_trainer/run_qwen3moe-30b_megatron.sh b/examples/grpo_trainer/run_qwen3moe-30b_megatron.sh old mode 100644 new mode 100755 diff --git a/examples/ppo_trainer/run_deepseek7b_llm.sh b/examples/ppo_trainer/run_deepseek7b_llm.sh old mode 100644 new mode 100755 diff --git a/examples/ppo_trainer/run_deepseek7b_llm_modelscope.sh b/examples/ppo_trainer/run_deepseek7b_llm_modelscope.sh old mode 100644 new mode 100755 diff --git a/examples/ppo_trainer/run_deepseek7b_llm_pfppo.sh b/examples/ppo_trainer/run_deepseek7b_llm_pfppo.sh old mode 100644 new mode 100755 diff --git a/examples/ppo_trainer/run_deepseek7b_llm_sandbox_fusion.sh b/examples/ppo_trainer/run_deepseek7b_llm_sandbox_fusion.sh old mode 100644 new mode 100755 diff --git a/examples/ppo_trainer/run_deepseek7b_llm_sp2.sh b/examples/ppo_trainer/run_deepseek7b_llm_sp2.sh old mode 100644 new mode 100755 diff --git a/examples/ppo_trainer/run_deepseek_full_hh_rlhf.sh b/examples/ppo_trainer/run_deepseek_full_hh_rlhf.sh old mode 100644 new mode 100755 diff --git a/examples/ppo_trainer/run_deepseek_math_gsm8k_megatron.sh b/examples/ppo_trainer/run_deepseek_math_gsm8k_megatron.sh old mode 100644 new mode 100755 diff --git a/examples/ppo_trainer/run_deepseek_math_gsm8k_megatron_nsys.sh b/examples/ppo_trainer/run_deepseek_math_gsm8k_megatron_nsys.sh old mode 100644 new mode 100755 diff --git a/examples/ppo_trainer/run_gemma.sh b/examples/ppo_trainer/run_gemma.sh old mode 100644 new mode 100755 diff --git a/examples/ppo_trainer/run_moonlight16b_a3b_gsm8k_megatron.sh b/examples/ppo_trainer/run_moonlight16b_a3b_gsm8k_megatron.sh old mode 100644 new mode 100755 diff --git a/examples/ppo_trainer/run_qwen1.5_moe_a2.7b-gsm8k_megatron.sh b/examples/ppo_trainer/run_qwen1.5_moe_a2.7b-gsm8k_megatron.sh old mode 100644 new mode 100755 diff --git 
a/examples/ppo_trainer/run_qwen2-7b_math_gsm8k_megatron.sh b/examples/ppo_trainer/run_qwen2-7b_math_gsm8k_megatron.sh old mode 100644 new mode 100755 diff --git a/examples/ppo_trainer/run_qwen2-7b_rm.sh b/examples/ppo_trainer/run_qwen2-7b_rm.sh old mode 100644 new mode 100755 diff --git a/examples/ppo_trainer/run_qwen2-7b_rm_seq_balance.sh b/examples/ppo_trainer/run_qwen2-7b_rm_seq_balance.sh old mode 100644 new mode 100755 diff --git a/examples/ppo_trainer/run_qwen2-7b_rm_seq_balance_fused_kernels.sh b/examples/ppo_trainer/run_qwen2-7b_rm_seq_balance_fused_kernels.sh old mode 100644 new mode 100755 diff --git a/examples/ppo_trainer/run_qwen2-7b_rm_seq_balance_nsys.sh b/examples/ppo_trainer/run_qwen2-7b_rm_seq_balance_nsys.sh old mode 100644 new mode 100755 diff --git a/examples/ppo_trainer/run_qwen2-7b_seq_balance.sh b/examples/ppo_trainer/run_qwen2-7b_seq_balance.sh old mode 100644 new mode 100755 diff --git a/examples/ppo_trainer/run_qwen2-7b_sglang_seq_balance.sh b/examples/ppo_trainer/run_qwen2-7b_sglang_seq_balance.sh old mode 100644 new mode 100755 diff --git a/examples/ppo_trainer/run_qwen2.5-32b.sh b/examples/ppo_trainer/run_qwen2.5-32b.sh old mode 100644 new mode 100755 diff --git a/examples/reinforce_plus_plus_trainer/run_qwen2-7b_math_rf.sh b/examples/reinforce_plus_plus_trainer/run_qwen2-7b_math_rf.sh old mode 100644 new mode 100755 diff --git a/examples/reinforce_plus_plus_trainer/run_qwen2-7b_math_rf_baseline.sh b/examples/reinforce_plus_plus_trainer/run_qwen2-7b_math_rf_baseline.sh old mode 100644 new mode 100755 diff --git a/examples/remax_trainer/run_qwen2.5-3b_seq_balance.sh b/examples/remax_trainer/run_qwen2.5-3b_seq_balance.sh old mode 100644 new mode 100755 diff --git a/examples/remax_trainer/run_qwen2.5-7b_seq_balance.sh b/examples/remax_trainer/run_qwen2.5-7b_seq_balance.sh old mode 100644 new mode 100755 diff --git a/examples/reward_function/dapo.py b/examples/reward_function/dapo.py new file mode 100644 index 00000000000..9285cd1d0fd --- /dev/null +++ b/examples/reward_function/dapo.py @@ -0,0 +1,163 @@ +# Copyright 2024 Bytedance Ltd. and/or its affiliates +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
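+#
+# A worked example of the soft overlong penalty implemented below (illustrative
+# numbers, not taken from any config in this patch): with max_response_length=4096
+# and overlong_buffer_length=512, responses up to 3584 tokens incur no penalty,
+# a 3840-token response scores (3584 - 3840) / 512 = -0.5, and any response longer
+# than 4096 tokens receives the full -1.0 penalty, which is then scaled by
+# overlong_penalty_factor when added to the accuracy reward.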
+
+import re
+from typing import Any
+
+
+# Constants for normalization
+SUBSTITUTIONS = [
+    ("an ", ""),
+    ("a ", ""),
+    (".$", "$"),
+    ("\\$", ""),
+    (r"\ ", ""),
+    (" ", ""),
+    ("mbox", "text"),
+    (",\\text{and}", ","),
+    ("\\text{and}", ","),
+    ("\\text{m}", "\\text{}"),
+]
+
+REMOVED_EXPRESSIONS = [
+    "square",
+    "ways",
+    "integers",
+    "dollars",
+    "mph",
+    "inches",
+    "hours",
+    "km",
+    "units",
+    "\\ldots",
+    "sue",
+    "points",
+    "feet",
+    "minutes",
+    "digits",
+    "cents",
+    "degrees",
+    "cm",
+    "gm",
+    "pounds",
+    "meters",
+    "meals",
+    "edges",
+    "students",
+    "childrentickets",
+    "multiples",
+    "\\text{s}",
+    "\\text{.}",
+    "\\text{\ns}",
+    "\\text{}^2",
+    "\\text{}^3",
+    "\\text{\n}",
+    "\\text{}",
+    r"\mathrm{th}",
+    r"^\circ",
+    r"^{\circ}",
+    r"\;",
+    r",\!",
+    "{,}",
+    '"',
+    "\\dots",
+]
+
+
+def normalize_final_answer(final_answer: str) -> str:
+    """Normalize a final answer to a quantitative reasoning question.
+
+    Args:
+        final_answer: The answer string to normalize
+
+    Returns:
+        Normalized answer string
+    """
+    final_answer = final_answer.split("=")[-1]
+
+    # Apply substitutions and removals
+    for before, after in SUBSTITUTIONS:
+        final_answer = final_answer.replace(before, after)
+    for expr in REMOVED_EXPRESSIONS:
+        final_answer = final_answer.replace(expr, "")
+
+    # Extract and normalize LaTeX math
+    final_answer = re.sub(r"(.*?)(\$)(.*?)(\$)(.*)", "$\\3$", final_answer)
+    final_answer = re.sub(r"(\\text\{)(.*?)(\})", "\\2", final_answer)
+    final_answer = re.sub(r"(\\textbf\{)(.*?)(\})", "\\2", final_answer)
+    final_answer = re.sub(r"(\\overline\{)(.*?)(\})", "\\2", final_answer)
+    final_answer = re.sub(r"(\\boxed\{)(.*)(\})", "\\2", final_answer)
+
+    # Normalize shorthand TeX:
+    #  \fracab -> \frac{a}{b}
+    #  \frac{abc}{bef} -> \frac{abc}{bef}
+    #  \fracabc -> \frac{a}{b}c
+    #  \sqrta -> \sqrt{a}
+    #  \sqrtab -> \sqrt{a}b
+    final_answer = re.sub(r"(frac)([^{])(.)", "frac{\\2}{\\3}", final_answer)
+    final_answer = re.sub(r"(sqrt)([^{])", "sqrt{\\2}", final_answer)
+    final_answer = final_answer.replace("$", "")
+
+    # Normalize numbers
+    if final_answer.replace(",", "").isdigit():
+        final_answer = final_answer.replace(",", "")
+
+    return final_answer.strip()
+
+
+def accuracy_reward(response: str, ground_truth: str) -> float:
+    match = re.findall(r"(?i)Answer\s*:\s*([^\n]+)", response)
+    answer = match[-1] if match else "[INVALID]"
+    if normalize_final_answer(answer) == normalize_final_answer(ground_truth):
+        return 1.0
+    else:
+        return -1.0
+
+
+def soft_overlong_punishment(response_length: int, max_response_length: int, overlong_buffer_length: int) -> float:
+    expected_len = max_response_length - overlong_buffer_length
+    if response_length <= expected_len:
+        return 0.0
+    elif response_length <= max_response_length:
+        return (expected_len - response_length) / overlong_buffer_length
+    else:
+        return -1.0
+
+
+def compute_score(
+    reward_inputs: list[dict[str, Any]],
+    max_response_length: int,
+    overlong_buffer_length: int,
+    overlong_penalty_factor: float,
+) -> list[dict[str, float]]:
+    if not isinstance(reward_inputs, list):
+        raise ValueError("Please use `reward_type=batch` for dapo reward function.")
+
+    scores = []
+    for reward_input in reward_inputs:
+        response = reward_input["response"][-300:]  # The longest answer in MATH-500 has 159 characters
+        accuracy_score = accuracy_reward(response, reward_input["ground_truth"])
+        overlong_score = soft_overlong_punishment(
+            reward_input["response_length"], max_response_length, overlong_buffer_length
+        )
+        scores.append(
+            {
+                "overall": accuracy_score + overlong_score * overlong_penalty_factor,
+                "accuracy": accuracy_score,
+                "overlong": overlong_score,
+                "accuracy_normalized": 0.5 * (accuracy_score + 1.0),
+            }
+        )
+
+    return scores
diff --git a/examples/reward_function/evaluation.py b/examples/reward_function/evaluation.py
new file mode 100644
index 00000000000..45ec549d862
--- /dev/null
+++ b/examples/reward_function/evaluation.py
@@ -0,0 +1,552 @@
+import datetime
+import json
+import os
+import re
+import statistics
+from collections import defaultdict
+from typing import Dict, List, Set
+
+
+def parse_conditions(text: str) -> Set[str]:
+    """
+    Parse medical conditions from text, handling various separators.
+
+    Args:
+        text (str): Text containing medical conditions.
+
+    Returns:
+        Set[str]: Set of individual medical conditions.
+    """
+    # Remove any boxing notation if present
+    text = text.replace("\\boxed{", "").replace("}", "")
+
+    # Split by common separators
+    for sep in [", ", " and ", " & ", ",", "&"]:
+        if sep in text:
+            return set(cond.strip() for cond in text.split(sep))
+
+    # If no separator found, treat as single condition
+    return {text.strip()}
+
+
+def extract_boxed_content(text: str) -> str:
+    """
+    Extract content within \boxed{} or similar boxing notations.
+
+    Args:
+        text (str): Text containing potentially boxed content.
+
+    Returns:
+        str: Extracted boxed content or the original text if no box found.
+    """
+    # Look for LaTeX \boxed{} notation
+    boxed_match = re.search(r"\\boxed{([^}]*)}", text)
+    if boxed_match:
+        return boxed_match.group(1)
+
+    # Look for markdown boxed notation (e.g., [boxed content])
+    markdown_match = re.search(r"\[(.*?)\]", text)
+    if markdown_match:
+        return markdown_match.group(1)
+
+    # Return the text as is if no boxed content is found
+    return text
+
+
+def compute_class_metrics(class_name: str, confusion_matrix: Dict[str, int]) -> Dict[str, float]:
+    """
+    Compute metrics for a single class based on its confusion matrix.
+
+    Args:
+        class_name (str): Name of the class.
+        confusion_matrix (Dict[str, int]): Confusion matrix with tp, fp, fn, tn.
+
+    Returns:
+        Dict[str, float]: Dictionary of metrics for this class.
+ """ + tp = confusion_matrix["tp"] + fp = confusion_matrix["fp"] + fn = confusion_matrix["fn"] + tn = confusion_matrix["tn"] + + # Calculate metrics (avoid division by zero) + precision = tp / (tp + fp) if (tp + fp) > 0 else 0 + recall = tp / (tp + fn) if (tp + fn) > 0 else 0 + sensitivity = recall # sensitivity is the same as recall + specificity = tn / (tn + fp) if (tn + fp) > 0 else 0 + f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0 + accuracy = (tp + tn) / (tp + tn + fp + fn) if (tp + tn + fp + fn) > 0 else 0 + + return { + "precision": precision, + "recall": recall, + "sensitivity": sensitivity, + "specificity": specificity, + "f1": f1, + "accuracy": accuracy, + "count": confusion_matrix["count"], + "confusion_matrix": {"tp": tp, "fp": fp, "fn": fn, "tn": tn}, + } + + +def gender(predictions: List[str], ground_truths: List[str], demographics: List[str]) -> Dict[str, float]: + groups = {"male": {"preds": [], "gts": []}, "female": {"preds": [], "gts": []}} + + for pred, gt, demo in zip(predictions, ground_truths, demographics): + if demo is not None and "female" in demo.lower(): + groups["female"]["preds"].append(pred) + groups["female"]["gts"].append(gt) + elif demo is not None and "male" in demo.lower(): + groups["male"]["preds"].append(pred) + groups["male"]["gts"].append(gt) + + results = {} + acc_values = [] + f1_values = [] + + for sex in ["male", "female"]: + preds = groups[sex]["preds"] + gts = groups[sex]["gts"] + if len(preds) == 0: + continue + metrics = compute_dataset_metrics(preds, gts)["dataset_metrics"] + acc = metrics["accuracy"] + f1 = metrics["f1"] + results[f"{sex}/accuracy"] = acc + results[f"{sex}/f1"] = f1 + acc_values.append(acc) + f1_values.append(f1) + print(f"{sex}: accuracy = {acc:.4f}, f1 = {f1:.4f}") + + if len(acc_values) >= 2: + acc_diff = abs(acc_values[0] - acc_values[1]) + results["acc_diff for sex"] = acc_diff + results["std_accuracy for sex"] = statistics.stdev(acc_values) + print(f"Accuracy max diff for sex = {acc_diff:.4f}") + print(f"std of accuracy for sex = {results['std_accuracy for sex']:.4f}") + + if len(f1_values) >= 2: + f1_diff = abs(f1_values[0] - f1_values[1]) + results["f1_diff for sex"] = f1_diff + results["std_f1 for sex"] = statistics.stdev(f1_values) + print(f"F1 max diff for sex = {f1_diff:.4f}") + print(f"std of f1 for sex = {results['std_f1 for sex']:.4f}") + + return results + + +def parent(predictions: List[str], ground_truths: List[str], demographics: List[str]) -> Dict[str, float]: + groups = {} + for pred, gt, demo in zip(predictions, ground_truths, demographics): + if demo is not None and "father" in demo.lower(): + if ( + demo.split("father:")[1].strip().split()[0] not in groups + and demo.split("father:")[1].strip().split()[0] != "NAN" + ): + groups[demo.split("father:")[1].strip().split()[0]] = {"preds": [], "gts": []} + groups[demo.split("father:")[1].strip().split()[0]]["preds"].append(pred) + groups[demo.split("father:")[1].strip().split()[0]]["gts"].append(gt) + else: + groups[demo.split("father:")[1].strip().split()[0]]["preds"].append(pred) + groups[demo.split("father:")[1].strip().split()[0]]["gts"].append(gt) + if demo is not None and "mother" in demo.lower(): + if ( + demo.split("mother:")[1].strip().split()[0] not in groups + and demo.split("mother:")[1].strip().split()[0] != "NAN" + ): + groups[demo.split("mother:")[1].strip().split()[0]] = {"preds": [], "gts": []} + groups[demo.split("mother:")[1].strip().split()[0]]["preds"].append(pred) + 
groups[demo.split("mother:")[1].strip().split()[0]]["gts"].append(gt) + else: + groups[demo.split("father:")[1].strip().split()[0]]["preds"].append(pred) + groups[demo.split("father:")[1].strip().split()[0]]["gts"].append(gt) + + results = {} + acc_values = [] + f1_values = [] + + for race in groups: + preds = groups[race]["preds"] + gts = groups[race]["gts"] + if len(preds) == 0: + continue + metrics = compute_dataset_metrics(preds, gts)["dataset_metrics"] + acc = metrics["accuracy"] + f1 = metrics["f1"] + results[f"{race}/accuracy"] = acc + results[f"{race}/f1"] = f1 + acc_values.append(acc) + f1_values.append(f1) + print(f"{race}: accuracy = {acc:.4f}, f1 = {f1:.4f}") + + if len(acc_values) >= 2: + acc_diff = max(acc_values) - min(acc_values) + results["acc_diff"] = acc_diff + print(f"Accuracy max diff for parent = {acc_diff:.4f}") + std_acc = statistics.stdev(acc_values) + results["std_accuracy"] = std_acc + print(f"std of accuracy for parent = {std_acc:.4f}") + + if len(f1_values) >= 2: + f1_diff = max(f1_values) - min(f1_values) + results["f1_diff"] = f1_diff + print(f"F1 max diff for parent = {f1_diff:.4f}") + std_f1 = statistics.stdev(f1_values) + results["std_f1"] = std_f1 + print(f"std of f1 for parent = {std_f1:.4f}") + + return results + + +def age(predictions: List[str], ground_truths: List[str], demographics: List[str]) -> Dict[str, float]: + groups = { + "a1": {"preds": [], "gts": []}, + "a2": {"preds": [], "gts": []}, + "a3": {"preds": [], "gts": []}, + "a4": {"preds": [], "gts": []}, + } + + for pred, gt, demo in zip(predictions, ground_truths, demographics): + if demo is not None and "age" in demo.lower(): + try: + age_str = demo.split("age:")[1].strip().split()[0].replace(",", "") + age_val = float(age_str) + except (IndexError, ValueError): + continue + + if age_val <= 25: + groups["a1"]["preds"].append(pred) + groups["a1"]["gts"].append(gt) + elif 25 < age_val <= 50: + groups["a2"]["preds"].append(pred) + groups["a2"]["gts"].append(gt) + elif 50 < age_val <= 75: + groups["a3"]["preds"].append(pred) + groups["a3"]["gts"].append(gt) + elif 75 < age_val: + groups["a4"]["preds"].append(pred) + groups["a4"]["gts"].append(gt) + + results = {} + acc_values = [] + f1_values = [] + + for group in ["a1", "a2", "a3", "a4"]: + preds = groups[group]["preds"] + gts = groups[group]["gts"] + if len(preds) == 0: + continue + metrics = compute_dataset_metrics(preds, gts)["dataset_metrics"] + acc = metrics["accuracy"] + f1 = metrics["f1"] + results[f"{group}/accuracy"] = acc + results[f"{group}/f1"] = f1 + acc_values.append(acc) + f1_values.append(f1) + + if len(acc_values) >= 2: + results["acc_diff"] = max(acc_values) - min(acc_values) + results["std_accuracy"] = statistics.stdev(acc_values) + + if len(f1_values) >= 2: + results["f1_diff"] = max(f1_values) - min(f1_values) + results["std_f1"] = statistics.stdev(f1_values) + + for group in ["a1", "a2", "a3", "a4"]: + acc = results.get(f"{group}/accuracy") + f1 = results.get(f"{group}/f1") + if acc is not None and f1 is not None: + print(f"{group}: accuracy = {acc:.4f}, f1 = {f1:.4f}") + + if "acc_diff" in results: + print(f"Accuracy max diff = {results['acc_diff']:.4f}") + print(f"std of accuracy for age = {results['std_accuracy']:.4f}") + if "f1_diff" in results: + print(f"F1 max diff = {results['f1_diff']:.4f}") + print(f"std of f1 for age = {results['std_f1']:.4f}") + + return results +def compute_confusion_matrices(predictions: List[str], ground_truths: List[str]) -> Dict[str, Dict[str, int]]: + """ + Compute confusion matrices for 
each class. + + Args: + predictions (List[str]): List of model predictions. + ground_truths (List[str]): List of ground truth labels. + + Returns: + Dict[str, Dict[str, int]]: Confusion matrices for each class. + """ + # Initialize counters for each condition + all_conditions = set() + condition_matrices = defaultdict(lambda: {"tp": 0, "fp": 0, "fn": 0, "tn": 0, "count": 0}) + + # First pass: identify all unique conditions + for gt in ground_truths: + gt_conditions = parse_conditions(gt) + all_conditions.update(gt_conditions) + + for pred in predictions: + pred_answer = extract_boxed_content(pred) + if pred_answer != "None": + pred_conditions = parse_conditions(pred_answer) + all_conditions.update(pred_conditions) + + # Second pass: compute confusion matrices + for pred, gt in zip(predictions, ground_truths): + pred_answer = extract_boxed_content(pred) + if pred_answer == "None": + pred_conditions = set() + else: + pred_conditions = parse_conditions(pred_answer) + + gt_conditions = parse_conditions(gt) + + # For each possible condition + for condition in all_conditions: + condition_present_in_gt = condition in gt_conditions + condition_present_in_pred = condition in pred_conditions + + if condition_present_in_gt: + condition_matrices[condition]["count"] += 1 + + if condition_present_in_gt and condition_present_in_pred: + # True positive + condition_matrices[condition]["tp"] += 1 + elif condition_present_in_gt and not condition_present_in_pred: + # False negative + condition_matrices[condition]["fn"] += 1 + elif not condition_present_in_gt and condition_present_in_pred: + # False positive + condition_matrices[condition]["fp"] += 1 + else: + # True negative + condition_matrices[condition]["tn"] += 1 + + return condition_matrices + + +def compute_dataset_metrics(predictions: List[str], ground_truths: List[str]) -> Dict[str, Dict]: + """ + Compute metrics for a single dataset, with class-wise averaging. + + Args: + predictions (List[str]): List of model predictions for this dataset. + ground_truths (List[str]): List of ground truth labels for this dataset. + + Returns: + Dict[str, Dict]: Class metrics and averaged dataset metrics. 
+ """ + # Compute confusion matrices for each class + class_matrices = compute_confusion_matrices(predictions, ground_truths) + + # Compute metrics for each class + class_metrics = {} + active_classes = 0 + + # Accumulators for dataset-level metrics + dataset_metrics = { + "precision": 0.0, + "recall": 0.0, + "sensitivity": 0.0, + "specificity": 0.0, + "f1": 0.0, + "accuracy": 0.0, + } + + # Compute metrics for each class and accumulate for dataset average + for class_name, matrix in class_matrices.items(): + # Skip classes that never appear in ground truth + if matrix["count"] == 0: + continue + + active_classes += 1 + metrics = compute_class_metrics(class_name, matrix) + class_metrics[class_name] = metrics + + # Accumulate for dataset average (equal class weighting) + for metric_name in dataset_metrics.keys(): + dataset_metrics[metric_name] += metrics[metric_name] + + # Calculate dataset average (equal class weighting) + if active_classes > 0: + for metric_name in dataset_metrics.keys(): + dataset_metrics[metric_name] /= active_classes + + # Add class metrics to the result + result = {"class_metrics": class_metrics, "dataset_metrics": dataset_metrics, "active_classes": active_classes} + + return result + + +def compute_metrics_by_data_source( + predictions: List[str], + ground_truths: List[str], + data_sources: List[str], + datasets: List[str], + demographics: List[str], +) -> Dict[str, float]: + """ + Compute hierarchical metrics: class -> dataset -> data source -> global. + + Args: + predictions (List[str]): List of model predictions. + ground_truths (List[str]): List of ground truth labels. + data_sources (List[str]): List of data sources for each example. + datasets (List[str]): List of dataset identifiers for each example. + demographics (List[str]): List of demographic information for each example. 
+ + Returns: + Dict[str, float]: Flattened dictionary of metrics at all levels with keys: + - "val/{metric}" for global metrics + - "{data_source}/{metric}" for data source metrics + - "{data_source}/{dataset}/{metric}" for dataset metrics + """ + # Save inputs to json for debugging under outputs/ + + output_dir = "outputs" + os.makedirs(output_dir, exist_ok=True) + input_data = { + "predictions": predictions, + "ground_truths": ground_truths, + "data_sources": data_sources, + "datasets": datasets, + "demographics": demographics, + } + # name is time in yyyy-mm-dd_hh-mm-ss format + with open( + os.path.join(output_dir, f"input_data_{datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}.json"), "w" + ) as f: + json.dump(input_data, f, indent=4) + + # Group examples by data source and dataset + grouped_data = defaultdict(lambda: defaultdict(lambda: {"preds": [], "gts": []})) + + for pred, gt, source, dataset in zip(predictions, ground_truths, data_sources, datasets): + grouped_data[source][dataset]["preds"].append(pred) + grouped_data[source][dataset]["gts"].append(gt) + + # Initialize the flattened result dictionary + result = {} + + # Initialize global metrics accumulators + global_metrics = { + "precision": 0.0, + "recall": 0.0, + "sensitivity": 0.0, + "specificity": 0.0, + "f1": 0.0, + "accuracy": 0.0, + } + + # Compute metrics for each dataset within each data source + total_data_sources = 0 + + for source_name, source_datasets in grouped_data.items(): + # Initialize metrics accumulators for this data source + source_metrics = { + "precision": 0.0, + "recall": 0.0, + "sensitivity": 0.0, + "specificity": 0.0, + "f1": 0.0, + "accuracy": 0.0, + } + + total_datasets_in_source = 0 + + for dataset_name, dataset_data in source_datasets.items(): + # Compute metrics for this dataset + dataset_result = compute_dataset_metrics(dataset_data["preds"], dataset_data["gts"]) + + # Store dataset-level metrics with the format "data_source/dataset/metric" + for metric_name, metric_value in dataset_result["dataset_metrics"].items(): + result[f"{source_name}/{dataset_name}/{metric_name}"] = metric_value + + # Skip empty datasets + if dataset_result["active_classes"] == 0: + continue + + total_datasets_in_source += 1 + + # Accumulate metrics for data source average (equal dataset weighting) + for metric_name in source_metrics.keys(): + source_metrics[metric_name] += dataset_result["dataset_metrics"][metric_name] + + # Calculate data source average (equal dataset weighting) + if total_datasets_in_source > 0: + for metric_name in source_metrics.keys(): + source_metrics[metric_name] /= total_datasets_in_source + + # Store data source metrics with the format "data_source/metric" + for metric_name, metric_value in source_metrics.items(): + result[f"{source_name}/{metric_name}"] = metric_value + + total_data_sources += 1 + + # Accumulate for global metrics (equal data source weighting) + for metric_name in global_metrics.keys(): + global_metrics[metric_name] += source_metrics[metric_name] + + # Calculate global average (equal data source weighting) + if total_data_sources > 0: + for metric_name in global_metrics.keys(): + global_metrics[metric_name] /= total_data_sources + + # Store global metrics with the format "val/metric" + for metric_name, metric_value in global_metrics.items(): + result[f"val/{metric_name}"] = metric_value + + gender_results = gender(predictions, ground_truths, demographics) + for k, v in gender_results.items(): + result[f"fairness/gender/{k}"] = v + + age_results = age(predictions, 
ground_truths, demographics) + for k, v in age_results.items(): + result[f"fairness/age/{k}"] = v + + parent_results = parent(predictions, ground_truths, demographics) + for k, v in parent_results.items(): + result[f"fairness/parent/{k}"] = v + + + std_acc_values = [] + std_f1_values = [] + try: + + std_acc_values.append(gender_results["std_accuracy for sex"]) + std_f1_values.append(gender_results["std_f1 for sex"]) + + + std_acc_values.append(age_results["std_accuracy"]) + std_f1_values.append(age_results["std_f1"]) + + std_acc_values.append(parent_results["std_accuracy"]) + std_f1_values.append(parent_results["std_f1"]) + + result["fairness/avg_std_accuracy"] = sum(std_acc_values) / len(std_acc_values) + result["fairness/avg_std_f1"] = sum(std_f1_values) / len(std_f1_values) + except KeyError: + print("Some fairness metrics do not have standard deviation values, skipping average calculation.") + + return result + + +if __name__ == "__main__": + outputs_dir = "../../outputs" + output_files = [f for f in os.listdir(outputs_dir) if f.startswith("input_data_") and f.endswith(".json")] + if not output_files: + print("No output files found in the outputs directory.") + else: + latest_file = max(output_files, key=lambda f: os.path.getmtime(os.path.join(outputs_dir, f))) + with open(os.path.join(outputs_dir, latest_file), "r") as f: + input_data = json.load(f) + + predictions = input_data["predictions"] + ground_truths = input_data["ground_truths"] + data_sources = input_data["data_sources"] + datasets = input_data["datasets"] + demographics = input_data["demographics"] + + metrics = compute_metrics_by_data_source(predictions, ground_truths, data_sources, datasets, demographics) + print(json.dumps(metrics, indent=4)) \ No newline at end of file diff --git a/examples/reward_function/math.py b/examples/reward_function/math.py new file mode 100644 index 00000000000..ea75e3e91b5 --- /dev/null +++ b/examples/reward_function/math.py @@ -0,0 +1,49 @@ +# Copyright 2024 Bytedance Ltd. and/or its affiliates +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
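+#
+# Example of a response that earns both rewards below (illustrative, assuming the
+# default <think> format prompt from examples/format_prompt/default.jinja):
+#   "<think>2 + 2 = 4</think> The final answer is \boxed{4}"
+# format_reward requires the <think>...</think>...\boxed{} structure to fully
+# match; accuracy_reward grades the boxed content against the ground truth via
+# mathruler's grade_answer.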
+
+import re
+from typing import Any
+
+from mathruler.grader import extract_boxed_content, grade_answer
+
+
+def format_reward(response: str) -> float:
+    pattern = re.compile(r"<think>.*</think>.*\\boxed\{.*\}.*", re.DOTALL)
+    format_match = re.fullmatch(pattern, response)
+    return 1.0 if format_match else 0.0
+
+
+def accuracy_reward(response: str, ground_truth: str) -> float:
+    answer = extract_boxed_content(response)
+    return 1.0 if grade_answer(answer, ground_truth) else 0.0
+
+
+def compute_score(reward_inputs: list[dict[str, Any]], format_weight: float = 0.1) -> list[dict[str, float]]:
+    if not isinstance(reward_inputs, list):
+        raise ValueError("Please use `reward_type=batch` for math reward function.")
+
+    scores = []
+    for reward_input in reward_inputs:
+        response = re.sub(r"\s*(<|>|/)\s*", r"\1", reward_input["response"])  # handle qwen2.5vl-32b format
+        format_score = format_reward(response)
+        accuracy_score = accuracy_reward(response, reward_input["ground_truth"])
+        scores.append(
+            {
+                "overall": (1 - format_weight) * accuracy_score + format_weight * format_score,
+                "format": format_score,
+                "accuracy": accuracy_score,
+            }
+        )
+
+    return scores
diff --git a/examples/reward_function/medical.py b/examples/reward_function/medical.py
new file mode 100644
index 00000000000..aeeac05b019
--- /dev/null
+++ b/examples/reward_function/medical.py
@@ -0,0 +1,460 @@
+import re
+import json
+from typing import Dict, List
+
+import numpy
+import torch
+import numpy as np
+from mathruler.grader import extract_boxed_content
+import wandb
+import random
+
+
+def parse_conditions(text):
+    # Remove any boxing notation if present
+    text = text.replace("\\boxed{", "").replace("}", "")
+
+    # Split by common separators
+    for sep in [", ", " and ", " & ", ",", "&"]:
+        if sep in text:
+            return set(cond.strip() for cond in text.split(sep))
+
+    # If no separator found, treat as single condition
+    return {text.strip()}
+
+
+def parse_json(json_output):
+    """
+    Parse out the markdown fencing from JSON code blocks.
+    """
+    # Look for content between ```json and ```
+    lines = json_output.splitlines()
+    for i, line in enumerate(lines):
+        if line == "```json" or line.strip() == "```":
+            json_output = "\n".join(lines[i + 1:])  # Remove everything up to and including ```json
+            if "```" in json_output:
+                json_output = json_output.split("```")[0]  # Remove everything after the closing ```
+            break  # Exit the loop once a code block marker is found
+    return json_output
+
+
+def extract_json_from_response(text):
+    """
+    Extract JSON content from markdown code blocks in the response.
+
+    Args:
+        text: The model's response text
+
+    Returns:
+        Parsed JSON object or None if no valid JSON found
+    """
+    # Find content between ```json and ```
+    json_pattern = r"```(?:json)?\s*([\s\S]*?)```"
+    matches = re.findall(json_pattern, text)
+
+    if not matches:
+        return None
+
+    # Try to parse each match as JSON
+    for match in matches:
+        try:
+            parsed_json = json.loads(match.strip())
+            return parsed_json
+        except json.JSONDecodeError:
+            continue
+
+    # If we couldn't parse any match as valid JSON, try with ast.literal_eval
+    import ast
+    for match in matches:
+        try:
+            # Clean up the match a bit
+            cleaned = match.strip().replace("'", "\"")
+            parsed_json = ast.literal_eval(cleaned)
+            return parsed_json
+        except (ValueError, SyntaxError):
+            continue
+
+    return None
+
+
+def bbox_to_mask(bbox, height, width):
+    """
+    Convert bounding box to binary mask.
+
+    Args:
+        bbox: Bounding box in format [x1, y1, x2, y2]
+        height: Height of the mask
+        width: Width of the mask
+
+    Returns:
+        Binary mask of shape (height, width)
+    """
+    mask = torch.zeros((height, width), dtype=torch.float32)
+
+    # Ensure bbox coordinates are within image boundaries
+    x1 = max(0, min(int(bbox[0]), width - 1))
+    y1 = max(0, min(int(bbox[1]), height - 1))
+    x2 = max(0, min(int(bbox[2]), width - 1))
+    y2 = max(0, min(int(bbox[3]), height - 1))
+
+    # Handle cases where x1>x2 or y1>y2
+    if x1 > x2:
+        x1, x2 = x2, x1
+    if y1 > y2:
+        y1, y2 = y2, y1
+
+    # Set the box region to 1
+    if x1 < x2 and y1 < y2:  # Ensure valid box dimensions
+        mask[y1:y2 + 1, x1:x2 + 1] = 1.0
+
+    return mask
+
+
+def calculate_bbox_iou(pred_bboxes, seg_mask=None, gt_bbox=None):
+    """
+    Calculate IoU between predicted bounding boxes and ground truth (segmentation mask or bbox).
+
+    Args:
+        pred_bboxes: List of predicted bounding boxes in format [x1, y1, x2, y2]
+        seg_mask: Ground truth segmentation mask tensor
+        gt_bbox: Ground truth bounding box in format [x1, y1, x2, y2]
+
+    Returns:
+        Mean IoU score across all bounding boxes
+    """
+    if not pred_bboxes:
+        return 0.0
+
+    # If single layer bbox, wrap it in a list
+    if not isinstance(pred_bboxes[0], list):
+        pred_bboxes = [pred_bboxes]
+
+    if seg_mask is not None and isinstance(seg_mask, numpy.ndarray):
+        seg_mask = torch.from_numpy(seg_mask)
+
+    # Not none and not all zero
+    if seg_mask is not None and torch.sum(seg_mask) > 0:
+        # Get mask dimensions
+        if len(seg_mask.shape) == 3:  # Channel dimension
+            height, width = seg_mask.shape[1], seg_mask.shape[2]
+        else:
+            height, width = seg_mask.shape[0], seg_mask.shape[1]
+
+        # Convert segmentation mask to binary (1 for any positive value)
+        binary_seg_mask = (seg_mask > 0).float()
+
+        total_iou = 0.0
+        for bbox in pred_bboxes:
+            if len(bbox) < 4:
+                continue
+            # Convert bbox to mask
+            try:
+                bbox_mask = bbox_to_mask(bbox, height, width)
+            except Exception:
+                continue
+
+            # Calculate intersection and union
+            intersection = torch.sum(bbox_mask * binary_seg_mask)
+            union = torch.sum(torch.clamp(bbox_mask + binary_seg_mask, 0, 1))
+
+            # Calculate IoU (as a Python float so the running total stays numeric)
+            iou = (intersection / union).item() if union > 0 else 0.0
+            total_iou += iou
+
+        # Return mean IoU
+        return total_iou / len(pred_bboxes)
+
+    elif gt_bbox is not None:
+        # Convert the ground-truth box to a plain list once, outside the loop
+        # (calling .tolist() inside the loop would fail on the second
+        # iteration, after gt_bbox has already become a list)
+        if hasattr(gt_bbox, "tolist"):
+            gt_bbox = gt_bbox.tolist()
+
+        # Calculate IoU directly between bounding boxes
+        total_iou = 0.0
+        for pred_bbox in pred_bboxes:
+            if len(pred_bbox) < 4:
+                continue
+            # Calculate intersection
+            x1 = max(pred_bbox[0], gt_bbox[0])
+            y1 = max(pred_bbox[1], gt_bbox[1])
+            x2 = min(pred_bbox[2], gt_bbox[2])
+            y2 = min(pred_bbox[3], gt_bbox[3])
+
+            # Check if boxes overlap
+            if x1 >= x2 or y1 >= y2:
+                iou = 0.0
+            else:
+                # Calculate areas
+                intersection = (x2 - x1) * (y2 - y1)
+                pred_area = (pred_bbox[2] - pred_bbox[0]) * (pred_bbox[3] - pred_bbox[1])
+                gt_area = (gt_bbox[2] - gt_bbox[0]) * (gt_bbox[3] - gt_bbox[1])
+                union = pred_area + gt_area - intersection
+
+                # Calculate IoU
+                iou = intersection / union if union > 0 else 0.0
+
+            total_iou += iou
+
+        # Return mean IoU
+        return total_iou / len(pred_bboxes)
+
+    else:
+        # Neither segmentation mask nor ground truth bbox provided
+        return 0.0
+
+
+def evaluate_bbox_format(predict_str):
+    """
+    Evaluate the format correctness of the bounding box JSON in the response.
+    Returns a score based on how well the response follows the expected format.
+
+    Args:
+        predict_str: The model's prediction string
+
+    Returns:
+        Format score between 0.0 and 1.0
+    """
+    format_score = 0.0
+
+    # Check if response contains a code block
+    if "```" in predict_str:
+        format_score += 0.2  # 20% for having a code block
+
+        # Check if it's specifically marked as JSON
+        if "```json" in predict_str:
+            format_score += 0.1  # Additional 10% for correct JSON marker
+
+    # Try to extract and parse JSON
+    json_str = parse_json(predict_str)
+    if not json_str:
+        return format_score  # Failed to find JSON content
+
+    try:
+        # Try to parse as JSON
+        parsed_json = None
+        try:
+            parsed_json = json.loads(json_str)
+            format_score += 0.2  # Additional 20% for valid JSON
+        except json.JSONDecodeError:
+            # Try with ast.literal_eval as fallback
+            try:
+                cleaned = json_str.replace("'", "\"")
+                parsed_json = ast.literal_eval(cleaned)
+                format_score += 0.1  # Only 10% for requiring fallback parsing
+            except (ValueError, SyntaxError):
+                return format_score  # Failed to parse
+
+        # Check if it's a list of objects
+        if not isinstance(parsed_json, list):
+            return format_score
+
+        format_score += 0.1  # Additional 10% for being a list
+
+        # Check each item for proper bbox structure
+        valid_items = 0
+        total_items = len(parsed_json)
+
+        for item in parsed_json:
+            if not isinstance(item, dict):
+                continue
+
+            # Check for required fields
+            has_bbox = "bbox_2d" in item
+            has_label = "label" in item
+
+            if has_bbox and has_label:
+                bbox = item["bbox_2d"]
+                # Check bbox format [x1, y1, x2, y2]
+                if (isinstance(bbox, list) and len(bbox) == 4 and
+                        all(isinstance(coord, (int, float)) for coord in bbox)):
+                    valid_items += 1
+
+        # Add up to 40% based on proportion of valid items
+        if total_items > 0:
+            format_score += 0.4 * (valid_items / total_items)
+
+    except Exception:
+        # Any other parsing issues
+        pass
+
+    return format_score
+
+
+def medical_compute_score(predict_str: str, ground_truth: str, segmentation_mask=None, bbox=None) -> Dict[str, float]:
+    """
+    Compute medical scoring including standard score, bounding box IoU, and format score.
+
+    Args:
+        predict_str: The model's prediction string
+        ground_truth: The ground truth string
+        segmentation_mask: Ground truth segmentation mask tensor
+        bbox: Ground truth bounding box
+
+    Returns:
+        Dict with the weighted "overall" score plus the individual standard
+        (F1), IoU, and format component scores
+    """
+    # Calculate standard score
+    answer = extract_boxed_content(predict_str)
+    if answer == "None":
+        standard_score = 0.0  # no answer
+    else:
+        # Parse both prediction and ground truth into sets of conditions
+        predicted_conditions = parse_conditions(answer)
+        ground_truth_conditions = parse_conditions(ground_truth)
+
+        # Calculate true positives, false positives, and false negatives
+        true_positives = len(predicted_conditions.intersection(ground_truth_conditions))
+        false_positives = len(predicted_conditions - ground_truth_conditions)
+        false_negatives = len(ground_truth_conditions - predicted_conditions)
+
+        # Calculate F1 score components
+        precision = true_positives / (true_positives + false_positives) if (true_positives + false_positives) > 0 else 0
+        recall = true_positives / (true_positives + false_negatives) if (true_positives + false_negatives) > 0 else 0
+
+        # Calculate F1 score (harmonic mean of precision and recall)
+        standard_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
+
+    # Calculate format score (how well the JSON follows the expected format)
+    format_score = evaluate_bbox_format(predict_str)
+
+    # Length score: 0.001 per character, flat 1.0 above 600 characters
+    # (roughly 100+ words)
+    if len(predict_str) > 600:
+        length_score = 1
+    else:
+        length_score = len(predict_str) * 0.001
+
+    # Calculate bounding box IoU score
+    iou_score = 0.0
+    # Extract predicted bounding boxes from the response
+    json_data = extract_json_from_response(predict_str)
+    if json_data:
+        # Extract bounding boxes from the JSON
+        try:
+            pred_bboxes = []
+            if isinstance(json_data, list):
+                for item in json_data:
+                    if isinstance(item, dict) and "bbox_2d" in item:
+                        pred_bboxes.append(item["bbox_2d"])
+            elif isinstance(json_data, dict) and "bbox_2d" in json_data:
+                pred_bboxes.append(json_data["bbox_2d"])
+            elif isinstance(json_data, dict) and "objects_of_interest" in json_data:
+                for item in json_data["objects_of_interest"]:
+                    if isinstance(item, dict) and "bbox_2d" in item:
+                        pred_bboxes.append(item["bbox_2d"])
+
+            if random.random() < 0.0005:  # log roughly 0.05% of samples
+                print("[Bounding Box] ", json_data)
+                print("[Formatted Bounding Box] ", pred_bboxes)
+                print("[GT Bounding Box] ", bbox)
+
+            # Calculate IoU between predicted boxes and ground truth
+            if pred_bboxes:
+                iou_score = calculate_bbox_iou(pred_bboxes, segmentation_mask, bbox)
+        except Exception:
+            pass
+
+    scores = {
+        "overall": 0.6 * standard_score + 0.2 * iou_score + 0.1 * format_score + 0.1 * length_score,
+        "standard_score": standard_score,
+        "iou_score": iou_score,
+        "format_score": format_score,
+    }
+    return scores
+
+
+def medical_compute_score_batch(
+    data_sources: List[str],
+    solution_strs: List[str],
+    ground_truths: List[str],
+    extra_infos: List[dict],
+    **kwargs,
+) -> List[Dict[str, float]]:
+    """
+    Compute medical scoring for batch inputs including standard score, bounding box IoU, and format score.
+
+    Args:
+        data_sources: List of data sources (e.g., file paths or identifiers)
+        solution_strs: List of model prediction strings
+        ground_truths: List of ground truth strings
+        extra_infos: List of per-sample extra information dicts (may carry
+            segmentation masks and bounding boxes)
+
+    Returns:
+        List of score dictionaries
+    """
+    batch_scores = []
+
+    for data_source, predict_str, ground_truth, extra_info in zip(data_sources, solution_strs, ground_truths, extra_infos):
+        # Pull the optional ground-truth localization targets out of extra_info
+        # (assumed to be stored under "segmentation_mask" / "bbox" when
+        # present); without them the IoU term falls back to 0.
+        segmentation_mask = None
+        bbox = None
+        if isinstance(extra_info, dict):
+            segmentation_mask = extra_info.get("segmentation_mask")
+            bbox = extra_info.get("bbox")
+
+        # Calculate standard score
+        answer = extract_boxed_content(predict_str)
+        if answer == "None":
+            standard_score = 0.0  # no answer
+        else:
+            # Parse both prediction and ground truth into sets of conditions
+            predicted_conditions = parse_conditions(answer)
+            ground_truth_conditions = parse_conditions(ground_truth)
+
+            # Calculate true positives, false positives, and false negatives
+            true_positives = len(predicted_conditions.intersection(ground_truth_conditions))
+            false_positives = len(predicted_conditions - ground_truth_conditions)
+            false_negatives = len(ground_truth_conditions - predicted_conditions)
+
+            # Calculate F1 score components
+            precision = (
+                true_positives / (true_positives + false_positives) if (true_positives + false_positives) > 0 else 0
+            )
+            recall = (
+                true_positives / (true_positives + false_negatives) if (true_positives + false_negatives) > 0 else 0
+            )
+
+            # Calculate F1 score (harmonic mean of precision and recall)
+            standard_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
+
+        # Calculate format score (how well the JSON follows the expected format)
+        format_score = evaluate_bbox_format(predict_str)
+
+        # Length score: 0.001 per character, flat 1.0 above 600 characters
+        # (roughly 100+ words)
+        if len(predict_str) > 600:
+            length_score = 1
+        else:
+            length_score = len(predict_str) * 0.001
+
+        # Calculate bounding box IoU score
+        iou_score = 0.0
+        # Extract predicted bounding boxes from the response
+        json_data = extract_json_from_response(predict_str)
+        if json_data:
+            # Extract bounding boxes from the JSON
+            try:
+                pred_bboxes = []
+                if isinstance(json_data, list):
+                    for item in json_data:
+                        if isinstance(item, dict) and "bbox_2d" in item:
+                            pred_bboxes.append(item["bbox_2d"])
+                elif isinstance(json_data, dict) and "bbox_2d" in json_data:
+                    pred_bboxes.append(json_data["bbox_2d"])
+                elif isinstance(json_data, dict) and "objects_of_interest" in json_data:
+                    for item in json_data["objects_of_interest"]:
+                        if isinstance(item, dict) and "bbox_2d" in item:
+                            pred_bboxes.append(item["bbox_2d"])
+
+                if random.random() < 0.005:  # log roughly 0.5% of samples
+                    print("[Bounding Box] ", json_data)
+                    print("[Formatted Bounding Box] ", pred_bboxes)
+                    print("[GT Bounding Box] ", bbox)
+
+                # Calculate IoU between predicted boxes and ground truth
+                if pred_bboxes:
+                    iou_score = calculate_bbox_iou(pred_bboxes, segmentation_mask, bbox)
+            except Exception:
+                pass
+
+        # Note: unlike the single-sample scorer, length_score is reported but
+        # not folded into the weighted "score".
+        scores = {
+            "score": 0.5 * standard_score + 0.3 * iou_score + 0.1 * format_score,
+            "standard_score": standard_score,
+            "iou_score": iou_score,
+            "format_score": format_score,
+            "length_score": length_score,
+        }
+        batch_scores.append(scores)
+
+    return batch_scores
\ No newline at end of file
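For orientation, here is a minimal usage sketch of the single-sample scorer above. The inputs are made up for illustration; it assumes the repo root is on PYTHONPATH, and segmentation_mask/bbox default to None, in which case the IoU term is 0:

    from examples.reward_function.medical import medical_compute_score

    prediction = (
        "The findings suggest \\boxed{pneumonia, edema}.\n"
        '```json\n[{"bbox_2d": [10, 20, 60, 80], "label": "opacity"}]\n```'
    )
    scores = medical_compute_score(prediction, ground_truth="pneumonia")
    # standard_score is the F1 between the boxed condition set and the ground
    # truth (2/3 here); format_score rewards the well-formed JSON block.
    print(scores["overall"], scores["standard_score"], scores["format_score"])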
diff --git a/examples/reward_function/r1v.py b/examples/reward_function/r1v.py
new file mode 100644
index 00000000000..6a28548b292
--- /dev/null
+++ b/examples/reward_function/r1v.py
@@ -0,0 +1,50 @@
+# Copyright 2024 Bytedance Ltd. and/or its affiliates
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import re
+from typing import Any
+
+from mathruler.grader import grade_answer
+
+
+def format_reward(response: str) -> float:
+    pattern = re.compile(r"<think>.*?</think>\s*<answer>.*?</answer>", re.DOTALL)
+    format_match = re.fullmatch(pattern, response)
+    return 1.0 if format_match else 0.0
+
+
+def accuracy_reward(response: str, ground_truth: str) -> float:
+    try:
+        content_match = re.search(r"<answer>(.*?)</answer>", response)
+        given_answer = content_match.group(1).strip() if content_match else response.strip()
+        if grade_answer(given_answer, ground_truth.strip()):
+            return 1.0
+
+    except Exception:
+        pass
+
+    return 0.0
+
+
+def compute_score(reward_input: dict[str, Any], format_weight: float = 0.5) -> dict[str, float]:
+    if not isinstance(reward_input, dict):
+        raise ValueError("Please use `reward_type=sequential` for r1v reward function.")
+
+    format_score = format_reward(reward_input["response"])
+    accuracy_score = accuracy_reward(reward_input["response"], reward_input["ground_truth"])
+    return {
+        "overall": (1 - format_weight) * accuracy_score + format_weight * format_score,
+        "format": format_score,
+        "accuracy": accuracy_score,
+    }
diff --git a/examples/rloo_trainer/run_qwen2-7b.sh b/examples/rloo_trainer/run_qwen2-7b.sh
old mode 100644
new mode 100755
diff --git a/examples/sft/gsm8k/run_deepseek_6b7.sh b/examples/sft/gsm8k/run_deepseek_6b7.sh
old mode 100644
new mode 100755
diff --git a/examples/sft/gsm8k/run_gemma_2b.sh b/examples/sft/gsm8k/run_gemma_2b.sh
old mode 100644
new mode 100755
diff --git a/examples/sft/gsm8k/run_gemma_7b.sh b/examples/sft/gsm8k/run_gemma_7b.sh
old mode 100644
new mode 100755
diff --git a/examples/sft/gsm8k/run_qwen2_5_05b_sft_peft_sp2_npu.sh b/examples/sft/gsm8k/run_qwen2_5_05b_sft_peft_sp2_npu.sh
old mode 100644
new mode 100755
diff --git a/examples/sft/gsm8k/run_qwen_05_peft.sh b/examples/sft/gsm8k/run_qwen_05_peft.sh
old mode 100644
new mode 100755
diff --git a/examples/sft/gsm8k/run_qwen_05_sp2.sh b/examples/sft/gsm8k/run_qwen_05_sp2.sh
old mode 100644
new mode 100755
diff --git a/examples/sft/gsm8k/run_qwen_05_sp2_liger.sh b/examples/sft/gsm8k/run_qwen_05_sp2_liger.sh
old mode 100644
new mode 100755
diff --git a/examples/sft/multiturn/run_qwen_05_sp2.sh b/examples/sft/multiturn/run_qwen_05_sp2.sh
old mode 100644
new mode 100755
diff --git a/examples/sglang_multiturn/geo3k/run_qwen2.5-3b_geo3k_multiturn.sh b/examples/sglang_multiturn/geo3k/run_qwen2.5-3b_geo3k_multiturn.sh
old mode 100644
new mode 100755
diff --git a/examples/sglang_multiturn/geo3k/run_qwen2.5-3b_geo3k_multiturn_4xgpu.sh b/examples/sglang_multiturn/geo3k/run_qwen2.5-3b_geo3k_multiturn_4xgpu.sh
old mode 100644
new mode 100755
diff --git a/examples/sglang_multiturn/geo3k/run_qwen2.5-3b_megatron_geo3k_multiturn.sh b/examples/sglang_multiturn/geo3k/run_qwen2.5-3b_megatron_geo3k_multiturn.sh
old mode 100644
new mode 100755
diff --git a/examples/sglang_multiturn/run_qwen2.5-0.5b_gsm8k_multiturn_w_interaction.sh
b/examples/sglang_multiturn/run_qwen2.5-0.5b_gsm8k_multiturn_w_interaction.sh old mode 100644 new mode 100755 diff --git a/examples/sglang_multiturn/run_qwen2.5-3b_gsm8k_multiturn.sh b/examples/sglang_multiturn/run_qwen2.5-3b_gsm8k_multiturn.sh old mode 100644 new mode 100755 diff --git a/examples/sglang_multiturn/run_qwen2.5-3b_gsm8k_multiturn_4xgpu.sh b/examples/sglang_multiturn/run_qwen2.5-3b_gsm8k_multiturn_4xgpu.sh old mode 100644 new mode 100755 diff --git a/examples/sglang_multiturn/run_qwen2.5-3b_gsm8k_tool_agent_mlflow.sh b/examples/sglang_multiturn/run_qwen2.5-3b_gsm8k_tool_agent_mlflow.sh old mode 100644 new mode 100755 diff --git a/examples/sglang_multiturn/run_qwen2.5-3b_megatron_gsm8k_multiturn.sh b/examples/sglang_multiturn/run_qwen2.5-3b_megatron_gsm8k_multiturn.sh old mode 100644 new mode 100755 diff --git a/examples/sglang_multiturn/run_qwen2_3b_dapo_multiturn.sh b/examples/sglang_multiturn/run_qwen2_3b_dapo_multiturn.sh old mode 100644 new mode 100755 diff --git a/examples/sglang_multiturn/search_r1_like/run_qwen2.5-3b_instruct_search_multiturn.sh b/examples/sglang_multiturn/search_r1_like/run_qwen2.5-3b_instruct_search_multiturn.sh old mode 100644 new mode 100755 diff --git a/examples/split_placement/run_deepseek7b_llm.sh b/examples/split_placement/run_deepseek7b_llm.sh old mode 100644 new mode 100755 diff --git a/examples/tuning/0.5b/qwen2-0.5b_grpo-lora_1_h100_fsdp_vllm.sh b/examples/tuning/0.5b/qwen2-0.5b_grpo-lora_1_h100_fsdp_vllm.sh old mode 100644 new mode 100755 diff --git a/examples/tuning/1.5b/qwen2-1.5b_grpo-lora_1_h100_fsdp_vllm.sh b/examples/tuning/1.5b/qwen2-1.5b_grpo-lora_1_h100_fsdp_vllm.sh old mode 100644 new mode 100755 diff --git a/examples/tuning/14b/qwen2-14b_grpo-lora_2_h100_fsdp_vllm.sh b/examples/tuning/14b/qwen2-14b_grpo-lora_2_h100_fsdp_vllm.sh old mode 100644 new mode 100755 diff --git a/examples/tuning/14b/qwen2_14b_grpo_4_h800_fsdp_vllm.sh b/examples/tuning/14b/qwen2_14b_grpo_4_h800_fsdp_vllm.sh old mode 100644 new mode 100755 diff --git a/examples/tuning/32b/qwen2-32b_grpo-lora_4_h100_fsdp_vllm.sh b/examples/tuning/32b/qwen2-32b_grpo-lora_4_h100_fsdp_vllm.sh old mode 100644 new mode 100755 diff --git a/examples/tuning/32b/qwen2_32B_grpo_8_h20_megatron_vllm.sh b/examples/tuning/32b/qwen2_32B_grpo_8_h20_megatron_vllm.sh old mode 100644 new mode 100755 diff --git a/examples/tuning/3b/qwen2-3b_grpo-lora_1_h100_fsdp_vllm.sh b/examples/tuning/3b/qwen2-3b_grpo-lora_1_h100_fsdp_vllm.sh old mode 100644 new mode 100755 diff --git a/examples/tuning/70b/qwen2-70b_grpo_32_h20_fsdp_vllm.sh b/examples/tuning/70b/qwen2-70b_grpo_32_h20_fsdp_vllm.sh old mode 100644 new mode 100755 diff --git a/examples/tuning/70b/qwen2-70b_grpo_32_h800_fsdp_vllm.sh b/examples/tuning/70b/qwen2-70b_grpo_32_h800_fsdp_vllm.sh old mode 100644 new mode 100755 diff --git a/examples/tuning/70b/qwen2-72b_grpo-lora_8_h100_fsdp_vllm.sh b/examples/tuning/70b/qwen2-72b_grpo-lora_8_h100_fsdp_vllm.sh old mode 100644 new mode 100755 diff --git a/examples/tuning/7b/qwen2-7b_grpo-lora_1_h100_fsdp_vllm.sh b/examples/tuning/7b/qwen2-7b_grpo-lora_1_h100_fsdp_vllm.sh old mode 100644 new mode 100755 diff --git a/examples/tuning/7b/qwen2-7b_grpo_2_h800_fsdp_vllm.sh b/examples/tuning/7b/qwen2-7b_grpo_2_h800_fsdp_vllm.sh old mode 100644 new mode 100755 diff --git a/scripts/process_mosei_annotations.py b/scripts/process_mosei_annotations.py new file mode 100644 index 00000000000..6d655e0998b --- /dev/null +++ b/scripts/process_mosei_annotations.py @@ -0,0 +1,74 @@ +import json +import tqdm + + +def 
process_mosei_annotations(annotation_path: str) -> None: + data = [] + with open(annotation_path, "r") as f: # jsonl file + for line in f: + entry = json.loads(line.strip()) + data.append(entry) + + formatted_data = [] + for sample in tqdm.tqdm(data): + image_path = sample["image"] + video_id = image_path.split("/")[1].split("_")[0] + clip_id = image_path.split("_")[-1].split(".")[0] + raw_video_path = f"Raw/{video_id}/{clip_id}.mp4" + + problem: str = sample["conversations"][0]["value"] + question_statement = problem.index("What is ") + question_str = problem[question_statement:] + answer_str = sample["conversations"][1]["value"] + + new_entry = { + "videos": [raw_video_path], + "problem": question_str, + "answer": answer_str, + } + + # avoid adding if the video and problem already exists + if not any( + entry["videos"] == new_entry["videos"] and entry["problem"] == new_entry["problem"] + for entry in formatted_data + ): + formatted_data.append(new_entry) + + formatted_data = sorted(formatted_data, key=lambda entry: entry["videos"]) + + output_path = annotation_path.replace(".jsonl", "_formatted.jsonl") + with open(output_path, "w") as f: + for entry in formatted_data: + f.write(json.dumps(entry) + "\n") + + # Add train test split of 80-20, calling it annotations_train.jsonl and annotations_test.jsonl + split_index = int(0.8 * len(formatted_data)) + train_data = formatted_data[:split_index] + test_data = formatted_data[split_index:] + folder_name = annotation_path.rsplit("/", 1)[0] if "/" in annotation_path else "." + train_output_path = f"{folder_name}/annotations_train.jsonl" + test_output_path = f"{folder_name}/annotations_test.jsonl" + + with open(train_output_path, "w") as f: + for entry in train_data: + f.write(json.dumps(entry) + "\n") + + with open(test_output_path, "w") as f: + for entry in test_data: + f.write(json.dumps(entry) + "\n") + +if __name__ == "__main__": + import argparse + + parser = argparse.ArgumentParser(description="Process MOSEI annotations") + parser.add_argument( + "--annotation_path", + type=str, + default="mosei_annotations.jsonl", + help="Path to the MOSEI annotations file (default: mosei_annotations.jsonl)" + ) + + args = parser.parse_args() + + process_mosei_annotations(args.annotation_path) + print(f"Processed annotations saved to {args.annotation_path.replace('.jsonl', '_formatted.jsonl')}") diff --git a/verl/trainer/config/_generated_ppo_megatron_trainer.yaml b/verl/trainer/config/_generated_ppo_megatron_trainer.yaml index 27233a87994..7e1b62b4cee 100644 --- a/verl/trainer/config/_generated_ppo_megatron_trainer.yaml +++ b/verl/trainer/config/_generated_ppo_megatron_trainer.yaml @@ -264,6 +264,7 @@ data: train_files: ~/data/rlhf/gsm8k/train.parquet val_files: ~/data/rlhf/gsm8k/test.parquet prompt_key: prompt + format_prompt: examples/format_prompt/default.jinja reward_fn_key: data_source max_prompt_length: 512 max_response_length: 512 diff --git a/verl/trainer/config/_generated_ppo_trainer.yaml b/verl/trainer/config/_generated_ppo_trainer.yaml index bca4e51679c..d2378a75223 100644 --- a/verl/trainer/config/_generated_ppo_trainer.yaml +++ b/verl/trainer/config/_generated_ppo_trainer.yaml @@ -237,6 +237,7 @@ data: train_files: ~/data/rlhf/gsm8k/train.parquet val_files: ~/data/rlhf/gsm8k/test.parquet prompt_key: prompt + format_prompt: examples/format_prompt/default.jinja reward_fn_key: data_source max_prompt_length: 512 max_response_length: 512 diff --git a/verl/trainer/config/data/legacy_data.yaml b/verl/trainer/config/data/legacy_data.yaml index 
9a5ce8f0dd1..ffeaa5cb19c 100644 --- a/verl/trainer/config/data/legacy_data.yaml +++ b/verl/trainer/config/data/legacy_data.yaml @@ -16,6 +16,11 @@ val_files: ~/data/rlhf/gsm8k/test.parquet # The field in the dataset where the prompt is located. Default is 'prompt'. prompt_key: prompt +# Path to the format prompt template file. If null, uses the default format prompt. +# The template should be a Jinja2 template that will be applied to each prompt. +# Example: examples/format_prompt/default.jinja +format_prompt: examples/format_prompt/default.jinja + # The field used to select the reward function (if using different ones per example). reward_fn_key: data_source diff --git a/verl/trainer/ppo/core_algos.py b/verl/trainer/ppo/core_algos.py index 7ec622036d9..2ea8bfb6305 100644 --- a/verl/trainer/ppo/core_algos.py +++ b/verl/trainer/ppo/core_algos.py @@ -20,9 +20,11 @@ __all__ = ["register_adv_est", "get_adv_estimator_fn", "AdvantageEstimator"] +import math from collections import defaultdict from enum import Enum -from typing import Any, Callable, Optional +from sklearn.cluster import KMeans +from typing import Any, Callable, Optional, Dict, List, Tuple import numpy as np import torch @@ -101,6 +103,7 @@ class AdvantageEstimator(str, Enum): OPO = "opo" GRPO_PASSK = "grpo_passk" GPG = "gpg" + DRPO = "drpo" ADV_ESTIMATOR_REGISTRY: dict[str, Any] = {} @@ -324,6 +327,181 @@ def compute_grpo_outcome_advantage( return scores, scores +EPS_DEFAULT: float = 1e-6 + +# Per‑domain question history ------------------------------------------------ # +# domain_qstats[dom] = { +# "vectors": List[np.ndarray] # shape = (Q, R) +# "q_ids": List[int], # question ids in same order as vectors +# "count": int, # #questions accumulated so far +# } +# --------------------------------------------------------------------------- # +domain_qstats: Dict[Any, Dict[str, Any]] = defaultdict(lambda: { + "vectors": [], + "q_ids": [], + "count": 0, +}) + +global_running_stats: Dict[str, int] = {"q_count": 0} + +# --------------------------------------------------------------------------- # +# Helpers # +# --------------------------------------------------------------------------- # + +def _select_k_elbow(vals: np.ndarray, k_max: int = 10, tol: float = 0.10) -> int: + """k‑means elbow pick on multi‑dimensional points.""" + unique_cnt = len(np.unique(vals, axis=0)) + k_cap = min(k_max, unique_cnt) + ks = range(1, k_cap + 1) + inertias = [KMeans(n_clusters=k, n_init="auto", random_state=0).fit(vals).inertia_ for k in ks] + if len(inertias) == 1: + return 1 + drops = np.diff(inertias) * -1.0 + for i in range(1, len(drops)): + if drops[i] < tol * drops[i - 1]: + return i + 1 + return ks[-1] + + +def _cluster_info_question(vectors: List[np.ndarray]) -> Tuple[float, np.ndarray, np.ndarray, np.ndarray]: + """K‑means on question‑level vectors. 
+ + Returns + ------- + mu_d : float – inverse‑cluster‑size weighted mean of the centroid means + assignments : (Q,) – cluster index for each question vector + counts : (k,) – cluster sizes + centroids : (k,R) – cluster centroid vectors + """ + if len(vectors) == 0: + return 0.0, np.empty(0, int), np.empty(0), np.empty((0, 0)) + + X = np.stack(vectors, axis=0) # (Q,R) – R inferred from data + k_opt = _select_k_elbow(X, k_max=20) + km = KMeans(n_clusters=k_opt, n_init="auto", random_state=0).fit(X) + + centroids = km.cluster_centers_ # (k,R) + assignments = km.labels_ # (Q,) + _, counts = np.unique(assignments, return_counts=True) + counts = counts.astype(float) + + centroid_means = centroids.mean(axis=1) # (k,) + weights = 1.0 / counts + mu_d = float((weights * centroid_means).sum() / weights.sum()) + + # Debug ------------------------------------------------------------- # + print( + f"[KMEANS‑Q] k={k_opt} | centroid_means=" + f"[{', '.join(f'{m:.3f}' for m in centroid_means)}] | counts={counts.tolist()} | μ_d={mu_d:.3f}" + ) + + return mu_d, assignments, counts, centroids + + +@register_adv_est(AdvantageEstimator.DRPO) +def compute_drpo_outcome_advantage( + token_level_rewards: torch.Tensor, # (B,L) + response_mask: torch.Tensor, # (B,L) + index: np.ndarray[str], # (B,) question ids + domain_info: np.ndarray, # (B,) domain ids + epsilon: float = EPS_DEFAULT, +): + """DRPO with question‑level clustering.""" + + B, L = token_level_rewards.shape + + # 1) raw rollout‑level rewards -------------------------------------- # + raw_scores = token_level_rewards.sum(dim=-1) # (B,) + + # 2) collect rollouts per question for this mini‑batch -------------- # + q2rollouts: Dict[str, List[float]] = defaultdict(list) + q2domain: Dict[str, Any] = {} + for i in range(B): + qid: str = index[i] + q2rollouts[qid].append(raw_scores[i].item()) + q2domain[qid] = domain_info[i] + + # ensure consistent rollout count ----------------------------------- # + rollout_lens = {len(v) for v in q2rollouts.values()} + assert len(rollout_lens) == 1, "Inconsistent rollout counts per question in batch!" 
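+    # Worked example (illustrative): with rollout count R = 4, a mini-batch of
+    # three questions might give q2rollouts = {"q0": [1, 0, 1, 1],
+    # "q1": [0, 0, 0, 1], "q2": [1, 1, 1, 1]}; each question then becomes one
+    # length-R reward vector below, and these per-question vectors are what
+    # get clustered domain-by-domain in step 5.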
+
+    # build vector per question ----------------------------------------- #
+    q_vectors = {qid: np.asarray(v, dtype=np.float32) for qid, v in q2rollouts.items()}
+
+    # 3) update per-domain question history ----------------------------- #
+    for qid, vec in q_vectors.items():
+        dom = q2domain[qid]
+        dstat = domain_qstats[dom]
+        dstat["vectors"].append(vec)
+        dstat["q_ids"].append(qid)
+        dstat["count"] += 1
+        global_running_stats["q_count"] += 1
+
+    # 4) GRPO normalisation (within-question) --------------------------- #
+    scores = raw_scores.clone()
+    id2mean = {qid: torch.mean(torch.tensor(v)) for qid, v in q2rollouts.items()}
+    id2std = {qid: torch.std(torch.tensor(v)) for qid, v in q2rollouts.items()}
+    for i in range(B):
+        qid: str = index[i]
+        scores[i] = (scores[i] - id2mean[qid]) / (id2std[qid] + epsilon)
+    before_scale_score = scores.clone()
+
+    # 5) Domain-wise question clustering -------------------------------- #
+    domain_cluster_cache: Dict[Any, Dict[str, Any]] = {}
+    for dom, dstat in domain_qstats.items():
+        if dstat["count"] == 0:
+            continue
+        mu_d, assign, counts, centroids = _cluster_info_question(dstat["vectors"])
+        domain_cluster_cache[dom] = {
+            "mu_d": mu_d,
+            "assign": assign,
+            "counts": counts,
+            "centroids": centroids,
+            "q_ids": dstat["q_ids"],
+        }
+
+    # 6) Apply scaling --------------------------------------------------- #
+    scaling_factors: List[float] = []
+    for i in range(B):
+        qid: str = index[i]
+        dom = q2domain[qid]
+        cache = domain_cluster_cache[dom]
+
+        # map qid -> cluster idx ---------------------------------------- #
+        q_idx = cache["q_ids"].index(qid)
+        cluster_idx = cache["assign"][q_idx]
+
+        N_d = float(domain_qstats[dom]["count"])
+        mu_d = cache["mu_d"]
+        T_d = max(math.sqrt(N_d) * mu_d, epsilon)
+
+        N_c = float(cache["counts"][cluster_idx])
+        mu_c = float(cache["centroids"][cluster_idx].mean())
+
+        factor = T_d * math.sqrt(N_c) * mu_c
+        # Guard against a zero or near-zero centroid mean, which would blow
+        # the normalised scores up to inf/NaN.
+        if abs(factor) < epsilon:
+            factor = epsilon
+        scaling_factors.append(factor)
+        scores[i] = scores[i] / factor
+
+    # divide scores by std of scores
+    scores_std = torch.std(scores)
+    scores = scores / (scores_std + epsilon)
+
+    # Debug report -------------------------------------------------------- #
+    print("--------------Hierarchical scaling report--------------")
+    dom2scale: Dict[Any, List[torch.Tensor]] = defaultdict(list)
+    for i in range(B):
+        dom2scale[domain_info[i]].append(scores[i] / (before_scale_score[i] + epsilon))
+    for dom, lst in dom2scale.items():
+        avg_sf = torch.mean(torch.stack(lst)).item()
+        print(f"[HDRPO] domain = {dom:<15} | mean overall scale = {avg_sf:6.3f}")
+
+    # Print global reward mean
+    print(f"[HDRPO] global reward mean = {torch.mean(scores):.3f}")
+
+    returns = scores.unsqueeze(-1) * response_mask
+    return returns, returns
+
+
 @register_adv_est(AdvantageEstimator.GRPO_PASSK)  # or simply: @register_adv_est("grpo_passk")
 def compute_grpo_passk_outcome_advantage(
     token_level_rewards: torch.Tensor,
diff --git a/verl/trainer/ppo/ray_trainer.py b/verl/trainer/ppo/ray_trainer.py
index 97b68684d5c..bb783854aaf 100644
--- a/verl/trainer/ppo/ray_trainer.py
+++ b/verl/trainer/ppo/ray_trainer.py
@@ -27,11 +27,13 @@
 from dataclasses import dataclass, field
 from enum import Enum
 from pprint import pprint
-from typing import Optional
+from typing import Optional, Dict
 
 import numpy as np
 import ray
 import torch
+import ujson
+import wandb
 from omegaconf import OmegaConf, open_dict
 from torch.utils.data import Dataset, Sampler
 from torchdata.stateful_dataloader import StatefulDataLoader
@@ -61,6 +63,7 @@
 from verl.utils.seqlen_balancing import
get_seqlen_balanced_partitions, log_seqlen_unbalance from verl.utils.torch_functional import masked_mean from verl.utils.tracking import ValidationGenerationsLogger +from examples.reward_function.evaluation import compute_metrics_by_data_source WorkerType = type[Worker] @@ -271,6 +274,18 @@ def compute_advantage( ) data.batch["advantages"] = advantages data.batch["returns"] = returns + elif adv_estimator == AdvantageEstimator.DRPO: + grpo_calculation_mask = data.batch["response_mask"] + domain_info = data.non_tensor_batch["dataset"] + + advantages, returns = core_algos.compute_drpo_outcome_advantage( + token_level_rewards=data.batch["token_level_rewards"], + response_mask=grpo_calculation_mask, + index=data.non_tensor_batch["uid"], + domain_info=domain_info + ) + data.batch["advantages"] = advantages + data.batch["returns"] = returns else: # handle all other adv estimator type other than GAE and GRPO adv_estimator_fn = core_algos.get_adv_estimator_fn(adv_estimator) @@ -573,7 +588,7 @@ def _create_dataloader(self, train_dataset, val_dataset, collate_fn, train_sampl except Exception as e: print(f"Warning: Could not set total_training_steps in config. Structure missing? Error: {e}") - def _dump_generations(self, inputs, outputs, gts, scores, reward_extra_infos_dict, dump_path): + def _dump_generations(self, inputs, outputs, gts, scores, reward_extra_infos_dict, dump_path, **kwargs): """Dump rollout/validation samples as JSONL.""" os.makedirs(dump_path, exist_ok=True) filename = os.path.join(dump_path, f"{self.global_steps}.jsonl") @@ -591,6 +606,14 @@ def _dump_generations(self, inputs, outputs, gts, scores, reward_extra_infos_dic if len(v) == n: base_data[k] = v + for k, v in kwargs.items(): + if isinstance(v, np.ndarray): + base_data[k] = v.tolist() + elif hasattr(v, 'cpu'): # Check if it's a torch tensor + base_data[k] = v.cpu().numpy().tolist() + else: + base_data[k] = v + lines = [] for i in range(n): entry = {k: v[i] for k, v in base_data.items()} @@ -636,6 +659,14 @@ def _validate(self): sample_scores = [] sample_turns = [] + # New lists for metric calculation + all_predictions = [] + all_ground_truths = [] + all_data_sources = [] + all_demographics = [] + all_datasets = [] + data_source_lst = [] + for test_data in self.val_dataloader: test_batch = DataProto.from_single_dict(test_data) @@ -658,6 +689,9 @@ def _validate(self): item.non_tensor_batch.get("reward_model", {}).get("ground_truth", None) for item in test_batch ] sample_gts.extend(ground_truths) + data_sources = test_batch.non_tensor_batch.get("data_source", ["unknown"] * len(input_texts)) + datasets = test_batch.non_tensor_batch.get("dataset", ["unknown"] * len(input_texts)) + demographics = test_batch.non_tensor_batch.get("demo", ["unknown"] * len(input_texts)) batch_keys_to_pop = ["input_ids", "attention_mask", "position_ids"] non_tensor_batch_keys_to_pop = ["raw_prompt_ids"] @@ -708,6 +742,16 @@ def _validate(self): output_texts = [self.tokenizer.decode(ids, skip_special_tokens=True) for ids in output_ids] sample_outputs.extend(output_texts) + # Collect for metrics calculation + all_predictions.extend(output_texts) + all_ground_truths.extend(ground_truths) + all_data_sources.extend(data_sources) + all_datasets.extend(datasets) + all_demographics.extend(demographics) + data_source_lst.append( + test_batch.non_tensor_batch.get("data_source", ["unknown"] * len(input_texts)) + ) + test_batch = test_batch.union(test_output_gen_batch) test_batch.meta_info["validate"] = True @@ -730,27 +774,23 @@ def _validate(self): if 
"__num_turns__" in test_batch.non_tensor_batch: sample_turns.append(test_batch.non_tensor_batch["__num_turns__"]) - data_source_lst.append(test_batch.non_tensor_batch.get("data_source", ["unknown"] * reward_tensor.shape[0])) - self._maybe_log_val_generations(inputs=sample_inputs, outputs=sample_outputs, scores=sample_scores) - # dump generations - val_data_dir = self.config.trainer.get("validation_data_dir", None) - if val_data_dir: - self._dump_generations( - inputs=sample_inputs, - outputs=sample_outputs, - gts=sample_gts, - scores=sample_scores, - reward_extra_infos_dict=reward_extra_infos_dict, - dump_path=val_data_dir, - ) + # Per data source metrics + metrics = compute_metrics_by_data_source(all_predictions, all_ground_truths, + all_data_sources, all_datasets, all_demographics) + wandb.log(metrics, step=self.global_steps) for key_info, lst in reward_extra_infos_dict.items(): assert len(lst) == 0 or len(lst) == len(sample_scores), f"{key_info}: {len(lst)=}, {len(sample_scores)=}" data_sources = np.concatenate(data_source_lst, axis=0) + # convert to list for easier processing + data_sources = data_sources.tolist() + print(f"size of sample_scores: {len(sample_scores)}, size of sample_outputs: {len(sample_outputs)}," + f" size of sample_gts: {len(sample_gts)}, size of sample_inputs: {len(sample_inputs)}" + f", size of data_sources: {len(data_sources)}, size of sample_turns: {len(sample_turns)}") data_src2var2metric2val = process_validation_metrics(data_sources, sample_inputs, reward_extra_infos_dict) metric_dict = {} for data_source, var2metric2val in data_src2var2metric2val.items(): @@ -769,6 +809,20 @@ def _validate(self): pfx = f"{metric_sec}/{data_source}/{var_name}/{metric_name}" metric_dict[pfx] = metric_val + # dump generations + val_data_dir = self.config.trainer.get("validation_data_dir", self.config.trainer.default_local_dir) + if val_data_dir: + self._dump_generations( + inputs=sample_inputs, + outputs=sample_outputs, + gts=sample_gts, + scores=sample_scores, + reward_extra_infos_dict=reward_extra_infos_dict, + dump_path=val_data_dir, + datasets=all_datasets, + data_paths=data_sources, + ) + if len(sample_turns) > 0: sample_turns = np.concatenate(sample_turns) metric_dict["val-aux/num_turns/min"] = sample_turns.min() @@ -777,6 +831,32 @@ def _validate(self): return metric_dict + def save_generations(self, sample_datapaths, sample_datasets, sample_inputs, sample_labels, sample_outputs, + sample_scores): + generation_save_folder = os.path.join(self.config.trainer.default_local_dir, + f"global_step_{self.global_steps}") + if not os.path.exists(generation_save_folder): + os.makedirs(generation_save_folder, exist_ok=True) + with open(os.path.join(generation_save_folder, "generations.jsonl"), "w") as f: + for i in range(len(sample_inputs)): + try: + short_answer = sample_outputs[i].split("boxed{")[1].split("}")[0] + except IndexError: + short_answer = '' + answer_is_correct = short_answer == sample_labels[i] + f.write( + ujson.dumps({ + "input": sample_inputs[i], + "generations": sample_outputs[i], + "short_answer": short_answer, + "answer_is_correct": answer_is_correct, + "label": sample_labels[i], + "score": sample_scores[i], + "dataset": sample_datasets[i], + "datapath": sample_datapaths[i], + }) + "\n" + ) + def init_workers(self): """Initialize distributed training workers using Ray backend. 
diff --git a/verl/utils/dataset/rl_dataset.py b/verl/utils/dataset/rl_dataset.py
index 2c19385c2b3..6024ba32ca7 100644
--- a/verl/utils/dataset/rl_dataset.py
+++ b/verl/utils/dataset/rl_dataset.py
@@ -24,6 +24,7 @@
 import datasets
 import numpy as np
 import torch
+from jinja2 import Template
 from omegaconf import DictConfig, ListConfig
 from torch.utils.data import Dataset
 from transformers import PreTrainedTokenizer, ProcessorMixin
@@ -107,6 +108,10 @@ def __init__(
         self.return_full_prompt = config.get("return_full_prompt", False)
         self.truncation = config.get("truncation", "error")
         self.filter_overlong_prompts = config.get("filter_overlong_prompts", True)
+        if isinstance(data_files, str):
+            self.base_dir = os.path.dirname(os.path.abspath(data_files))
+        else:
+            self.base_dir = os.path.dirname(os.path.abspath(data_files[0]))
 
         self.num_workers = config.get("filter_overlong_prompts_workers", max(1, os.cpu_count() // 4))
         self.num_workers = min(self.num_workers, os.cpu_count())
@@ -116,10 +121,22 @@
         self.filter_prompts = config.get("filter_prompts", True)
         self.serialize_dataset = False
         self.return_multi_modal_inputs = config.get("return_multi_modal_inputs", True)
+
+        # Load format prompt from file if specified
+        self.format_prompt_path = config.get("format_prompt", "examples/format_prompt/default.jinja")
+        self.format_prompt = self._load_format_prompt()
 
         self._download()
         self._read_files_and_tokenize()
 
+    def _load_format_prompt(self) -> Optional[Template]:
+        """Load format prompt from file if specified."""
+        if self.format_prompt_path:
+            with open(self.format_prompt_path, "r", encoding="utf-8") as f:
+                template_content = f.read()
+            return Template(template_content)
+        return None
+
     def _download(self, use_origin_parquet=False):
         from verl.utils.fs import copy_to_local
 
@@ -131,7 +148,12 @@ def _read_files_and_tokenize(self):
         dataframes = []
         for parquet_file in self.data_files:
             # read parquet files and cache
-            dataframe = datasets.load_dataset("parquet", data_files=parquet_file)["train"]
+            if parquet_file.endswith(".parquet"):
+                dataframe = datasets.load_dataset("parquet", data_files=parquet_file)["train"]
+            elif parquet_file.endswith(".json") or parquet_file.endswith(".jsonl"):
+                dataframe = datasets.load_dataset("json", data_files=parquet_file)["train"]
+            else:
+                raise ValueError(f"Unsupported file format: {parquet_file}. Only .parquet, .json, .jsonl are supported.")
             dataframes.append(dataframe)
         self.dataframe: datasets.Dataset = datasets.concatenate_datasets(dataframes)
 
@@ -188,11 +210,35 @@ def __len__(self):
         return len(self.dataframe)
 
     def _build_messages(self, example: dict):
-        messages: list = example.pop(self.prompt_key)
+        messages: list = example.get(self.prompt_key)
+        if isinstance(messages, str):
+            messages = [messages]
 
         if self.image_key in example or self.video_key in example:
+            new_messages = []
             for message in messages:
-                content = message["content"]
+                new_message = copy.deepcopy(message)
+                if isinstance(new_message, str):
+                    new_message = {"role": "user", "content": new_message}
+                content = new_message["content"]
+
+                # Apply format prompt to the entire content first if template is loaded
+                if self.format_prompt:
+                    content = self.format_prompt.render(content=content)
+
+                image_count = len(example.get(self.image_key, []))
+                video_count = len(example.get(self.video_key, []))
+                image_tag_count = content.count("<image>")
+                video_tag_count = content.count("<video>")
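To illustrate the new data.format_prompt hook, here is a minimal sketch of how the loaded Jinja template wraps a raw prompt. The template text below is hypothetical; the actual one-line examples/format_prompt/default.jinja ships with this patch:

    from jinja2 import Template

    # Hypothetical stand-in for examples/format_prompt/default.jinja
    template = Template(
        "{{ content }} Please reason step by step, and put your final answer within \\boxed{}."
    )
    print(template.render(content="What is 2 + 2?"))
    # -> What is 2 + 2? Please reason step by step, and put your final answer within \boxed{}.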