112 changes: 63 additions & 49 deletions baselines/ppo/config/ppo_base_puffer.yaml
@@ -3,105 +3,119 @@ use_rnn: false
eval_model_path: null
baseline: false
data_dir: data/processed/training
continue_training: false
model_cpt: null

environment: # Overrides default environment configs (see pygpudrive/env/config.py)
continue_training: true
model_cpt: /home/wbk/gpudrive/runs/PPO__C__S_72__01_20_15_12_22_098/model_PPO__C__S_72__01_20_15_12_22_098_027823.pt
environment: # Overrides default environment configs (see gpudrive/env/config.py)
name: "gpudrive"
num_worlds: 75 # Number of parallel environments
k_unique_scenes: 75 # Number of unique scenes to sample from
max_controlled_agents: 64 # Maximum number of agents controlled by the model. Make sure this aligns with the variable kMaxAgentCount in src/consts.hpp
num_worlds: 18 # Number of parallel environments (reduced further to lower peak GPU memory during resampling)
k_unique_scenes: 72 # Number of unique scenes to sample from (reduced to lighten the per-run load)
max_controlled_agents: 64 # Maximum number of agents controlled by the model; must match the environment mask dimensions
ego_state: true
road_map_obs: true
partner_obs: true
norm_obs: true
remove_non_vehicles: true # If false, all agents are included (vehicles, pedestrians, cyclists)
lidar_obs: false # NOTE: Setting this to true currently turns of the other observation types
remove_non_vehicles: true # If false, all agents are included (vehicles, pedestrians, cyclists)
lidar_obs: false # NOTE: Setting this to true currently turns off the other observation types
reward_type: "weighted_combination"
collision_weight: -0.75
off_road_weight: -0.75
goal_achieved_weight: 1.0
collision_weight: -3.0 # Higher collision penalty: reduce collisions while turning
off_road_weight: -3.0 # Lowered: allow moderate risk-taking
goal_achieved_weight: 1.0 # Raised substantially: make "reaching the goal" more attractive than "playing it safe"
# Shaping terms to avoid the "move a little, then stop" local optimum (only active with weighted_combination)
time_penalty: 0.005 # Raised: increases the pressure to keep making progress
idle_speed_threshold: 0.5
idle_penalty: 0.02 # Lowered: avoid over-penalizing
# Progress reward: the closer the agent is to the goal, the higher the per-step reward (dense positive signal)
progress_reward_weight: 0.1 # Lowered: avoid changing the reward scale too much
progress_reward_scale: 20.0
dynamics_model: "classic"
collision_behavior: "ignore" # Options: "remove", "stop", "ignore"
collision_behavior: "remove" # Options: "remove", "stop", "ignore"
dist_to_goal_threshold: 2.0
polyline_reduction_threshold: 0.1 # Rate at which to sample points from the polyline (0 is use all closest points, 1 maximum sparsity), needs to be balanced with kMaxAgentMapObservationsCount
sampling_seed: 42 # If given, the set of scenes to sample from will be deterministic, if None, the set of scenes will be random
obs_radius: 50.0 # Visibility radius of the agents
polyline_reduction_threshold: 0.1 # Rate at which to sample points from the polyline (0 uses all closest points, 1 is maximum sparsity); balance against kMaxAgentMapObservationsCount
sampling_seed: 42 # If set, the sampled scene set is reproducible; if None, it is random
obs_radius: 50.0 # Visibility radius of the agents
action_space_steer_disc: 13
action_space_accel_disc: 7
# Versatile Behavior Diffusion (VBD): This will slow down training
# Versatile Behavior Diffusion (VBD): enabling this will slow down training
use_vbd: false
vbd_model_path: "gpudrive/integrations/vbd/weights/epoch=18.ckpt"
init_steps: 11
vbd_trajectory_weight: 0.1 # Importance of distance to the vbd trajectories in the reward function
vbd_trajectory_weight: 0.1 # Weight of the distance-to-VBD-trajectories term in the reward function
vbd_in_obs: false

wandb:
entity: ""
project: "gpudrive"
group: "test"
mode: "online" # Options: online, offline, disabled
mode: "online" # 选项:onlineofflinedisabled
tags: ["ppo", "ff"]

train:
exp_id: PPO # Set dynamically in the script if needed
exp_id: PPO # Can be set dynamically in the script if needed
seed: 42
cpu_offload: false
device: "cuda" # Dynamically set to cuda if available, else cpu
device: "cuda" # 若可用则使用 cuda,否则使用 cpu
bptt_horizon: 1
compile: false
compile_mode: "reduce-overhead"

# # # Data sampling # # #
resample_scenes: false
resample_dataset_size: 10_000 # Number of unique scenes to sample from
resample_interval: 2_000_000
# # # Data sampling # # #
resample_scenes: false # Enable to resample scenes during training for better generalization
resample_dataset_size: 10_000
resample_interval: 10_000_000 # Roughly 5 resamples over a 50M-step run, balancing stability and generalization
sample_with_replacement: true
shuffle_dataset: false

# # # PPO # # #
torch_deterministic: false
total_timesteps: 1_000_000_000
batch_size: 131_072
minibatch_size: 8192
learning_rate: 3e-4
anneal_lr: false
total_timesteps: 500_000_000
batch_size: 18432
minibatch_size: 3072
# Moderately higher learning rate: adapt to the new reward function (higher collision/off-road penalties)
learning_rate: 2e-4 # Raised from 1e-4 to 2e-4 to adapt faster to the new reward signal
anneal_lr: true # Enable LR decay: start at 2e-4 and decrease over training; adapt quickly first, then fine-tune
gamma: 0.99
gae_lambda: 0.95
update_epochs: 4
# Tighten updates: avoid overly large policy changes
update_epochs: 3 # Fewer update epochs
norm_adv: true
clip_coef: 0.2
clip_vloss: false
clip_coef: 0.15 # Tighter clipping to limit the size of policy updates
# More stable value function
clip_vloss: true
vf_clip_coef: 0.2
ent_coef: 0.0001
# Less exploration: the policy has already learned to drive; stability matters now (reduce oscillation)
ent_coef: 0.0003 # Lowered from 0.001 to 0.0003
vf_coef: 0.3
max_grad_norm: 0.5
target_kl: null
# KL early stopping: prevents a single oversized update after resampling from causing instability
target_kl: 0.02
log_window: 1000

# # # Network # # #
# # # Network # # #
network:
input_dim: 64 # Embedding of the input features
hidden_dim: 128 # Latent dimension
input_dim: 64 # Embedding dimension of the input features
hidden_dim: 128 # Latent dimension
dropout: 0.01
class_name: "NeuralNet"
num_parameters: 0 # Total trainable parameters, to be filled at runtime
num_parameters: 0 # Total trainable parameters, filled in at runtime
# New: observation fusion network configuration
fusion_type: "attention" # Options: "simple", "attention", "adaptive"
num_attention_heads: 4 # Number of attention heads (only used when fusion_type="attention")

# # # Checkpointing # # #
checkpoint_interval: 400 # Save policy every k iterations
# # # Checkpointing # # #
checkpoint_interval: 200 # Save the policy every k iterations
checkpoint_path: "./runs"

# # # Rendering # # #
render: false # Determines whether to render the environment (note: will slow down training)
render_3d: true # Render simulator state in 3d or 2d
render_interval: 1 # Render every k iterations
render_k_scenarios: 10 # Number of scenarios to render
render_format: "mp4" # Options: gif, mp4
render_fps: 15 # Frames per second
# # # Rendering # # #
render: false # Whether to render the environment (note: will slow down training)
render_3d: true # Render simulator state in 3D or 2D
render_interval: 1 # Render every k iterations
render_k_scenarios: 0 # Keep at 0 during training to avoid extra IO and nondeterminism
render_format: "mp4" # Options: gif, mp4
render_fps: 15 # Frames per second
zoom_radius: 50

vec:
backend: "native" # Only native is currently supported
backend: "native" # 目前仅支持 native
num_workers: 1
env_batch_size: 1
zero_copy: false
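
The `fusion_type` and `num_attention_heads` keys above configure an observation-fusion head whose implementation is not part of this diff. Below is a minimal, hypothetical sketch of what an attention-based fusion over the ego/partner/road-map embeddings could look like, assuming each modality has already been embedded to `hidden_dim`; the class and method names are illustrative, not the repository's API.

```python
import torch
import torch.nn as nn

class AttentionFusion(nn.Module):
    """Hypothetical sketch of fusion_type="attention" (not the repo's NeuralNet code)."""

    def __init__(self, hidden_dim: int = 128, num_heads: int = 4):
        super().__init__()
        # num_heads corresponds to num_attention_heads in the config above.
        self.attn = nn.MultiheadAttention(hidden_dim, num_heads, batch_first=True)
        self.out = nn.Linear(hidden_dim, hidden_dim)

    def forward(self, ego, partner, road_map):
        # Treat the three per-modality embeddings as a length-3 token sequence.
        tokens = torch.stack([ego, partner, road_map], dim=1)  # (batch, 3, hidden_dim)
        fused, _ = self.attn(tokens, tokens, tokens)            # self-attention across modalities
        return self.out(fused.mean(dim=1))                      # pool and project back to hidden_dim

# Example: a batch of 8 agents, each modality already embedded to 128 dims.
ego, partner, road_map = (torch.randn(8, 128) for _ in range(3))
print(AttentionFusion()(ego, partner, road_map).shape)  # torch.Size([8, 128])
```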
2 changes: 1 addition & 1 deletion baselines/ppo/config/ppo_base_sb3.yaml
@@ -1,5 +1,5 @@
data_dir: "data/processed/examples"
num_worlds: 100
num_worlds: 30
k_unique_scenes: 4
device: "cuda" # or "cpu"

20 changes: 18 additions & 2 deletions gpudrive/env/config.py
@@ -42,9 +42,9 @@ class EnvConfig:

# Set the weights for the reward components
# R = a * collided + b * goal_achieved + c * off_road
collision_weight: float = 0.0
collision_weight: float = -0.5
goal_achieved_weight: float = 1.0
off_road_weight: float = 0.0
off_road_weight: float = -0.5

# Road observation algorithm settings
road_obs_algorithm: str = "linear" # Algorithm for road observations
@@ -101,6 +101,22 @@ class EnvConfig:
reward_type: str = "sparse_on_goal_achieved"
# Alternatively, "weighted_combination", "distance_to_logs", "distance_to_vdb_trajs", "reward_conditioned"

# --- Extra dense shaping terms for weighted_combination (to avoid the "move a little, then stop" local optimum) ---
# Per-step time cost (only applied when reward_type == "weighted_combination")
# Suggested starting range: 0.001-0.005; too large a value can make the policy reckless and increase collisions
time_penalty: float = 0.0

# Low-speed/idle penalty (only applied when reward_type == "weighted_combination")
# When speed < idle_speed_threshold and the agent is neither finished nor terminated, subtract an extra idle_penalty
idle_speed_threshold: float = 0.5
idle_penalty: float = 0.0

# Progress reward: the closer the agent is to the goal, the higher the per-step reward (dense positive signal)
# reward += progress_reward_weight * exp(-dist_to_goal / progress_reward_scale)
# Suggested progress_reward_weight: 0.1-0.3, progress_reward_scale: 15-30
progress_reward_weight: float = 0.0 # Disabled by default
progress_reward_scale: float = 20.0 # Distance decay factor

condition_mode: str = "random" # Options: "random", "fixed", "preset"

# Define upper and lower bounds for reward components if using reward_conditioned
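
To make the magnitude of the new progress term concrete, here is a small standalone check that evaluates `progress_reward_weight * exp(-dist_to_goal / progress_reward_scale)` with the values used in ppo_base_puffer.yaml (weight 0.1, scale 20.0):

```python
import math

progress_reward_weight = 0.1   # value set in ppo_base_puffer.yaml
progress_reward_scale = 20.0

for dist_to_goal in (1.0, 10.0, 20.0, 50.0):
    r = progress_reward_weight * math.exp(-dist_to_goal / progress_reward_scale)
    print(f"dist_to_goal={dist_to_goal:5.1f}  progress reward per step = {r:.4f}")
# dist_to_goal=  1.0  progress reward per step = 0.0951
# dist_to_goal= 10.0  progress reward per step = 0.0607
# dist_to_goal= 20.0  progress reward per step = 0.0368
# dist_to_goal= 50.0  progress reward per step = 0.0082
```

At these settings the per-step shaping signal stays well below the goal reward of 1.0, which is the stated reason for lowering progress_reward_weight in the config.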
5 changes: 4 additions & 1 deletion gpudrive/env/dataset.py
@@ -35,6 +35,8 @@ def __post_init__(self):
)

# Set the random seed for reproducibility
if self.seed is None:
self.seed = 42
self.random_gen = random.Random(self.seed)

# Create the dataset from valid files in the directory
@@ -84,8 +86,9 @@ def __len__(self):
def __next__(self) -> List[str]:
if self.sample_with_replacement:
# Ensure deterministic behavior
base_seed = 0 if self.seed is None else self.seed
random_gen = random.Random(
self.seed + self.current_index
base_seed + self.current_index
) # Changing the seed per batch

# Determine the batch size using the random generator to shuffle the indices
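
The intent of the two changes above is that resampling stays reproducible even when no seed is passed. A small sketch of the per-batch seeding pattern in isolation (the loader's actual sampling call may differ; this only illustrates the base_seed + current_index scheme):

```python
import random

def batch_indices(seed, current_index, dataset_len=1000, batch_size=5):
    # Mirrors the pattern in SceneDataLoader.__next__: a fresh generator seeded
    # with base_seed + batch index gives the same draw for that batch on every run.
    base_seed = 0 if seed is None else seed
    gen = random.Random(base_seed + current_index)
    return gen.sample(range(dataset_len), batch_size)

print(batch_indices(42, 0))    # identical on every run
print(batch_indices(42, 1))    # different batch, still deterministic
print(batch_indices(None, 0))  # seed=None now falls back to base_seed = 0
```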
19 changes: 13 additions & 6 deletions gpudrive/env/env_puffer.py
@@ -52,24 +52,24 @@ def __init__(
off_road_weight=-0.5,
goal_achieved_weight=1,
dist_to_goal_threshold=2.0,
polyline_reduction_threshold=0.1,
polyline_reduction_threshold=0.1,  # Polyline reduction threshold: controls the sampling density of road-graph observation points
remove_non_vehicles=True,
obs_radius=50.0,
use_vbd=False,
vbd_model_path=None,
vbd_trajectory_weight=0.1,
render=False,
render_3d=True,
render_interval=50,
render_k_scenarios=3,
render_interval=50,  # Rendering interval: render once every N iterations
render_k_scenarios=3,  # Number of scenarios to render
render_agent_obs=False,
render_format="mp4",
render_fps=15,
zoom_radius=50,
buf=None,
buf=None,  # Buffer for storing environment states and actions
**kwargs,
):
assert buf is None, "GPUDrive set up only for --vec native"
assert buf is None, "GPUDrive set up only for --vec native"  # Only the native vectorization backend is supported

if data_loader is None:
data_loader = SceneDataLoader(
@@ -78,7 +78,7 @@ def __init__(
dataset_size=loader_dataset_size,
sample_with_replacement=loader_sample_with_replacement,
shuffle=loader_shuffle,
)
)  # Data loader for loading scene data

if device is None:
device = "cuda" if torch.cuda.is_available() else "cpu"
@@ -239,6 +239,13 @@ def step(self, action):
Args:
action: A numpy array of actions for the controlled agents. Shape:
(num_worlds, max_cont_agents_per_env)
Performs one environment step:
1. Apply the actions
2. Run the physics simulation
3. Compute rewards
4. Handle termination states
5. Asynchronously reset finished environments
6. Return the new observations
"""

# Set the action for the controlled agents
45 changes: 45 additions & 0 deletions gpudrive/env/env_torch.py
@@ -491,6 +491,51 @@ def get_rewards(
+ off_road_weight * off_road
)

# Dense shaping: avoid the "stopping is optimal" local optimum
# Applied only to agents that are not done and have not reached the goal (no extra penalties after done/goal)
needs_shaping = (
self.config.time_penalty != 0.0
or self.config.idle_penalty != 0.0
or self.config.progress_reward_weight != 0.0
)
if needs_shaping:
done = (
self.sim.done_tensor()
.to_torch()
.clone()
.squeeze(dim=2)
.to(weighted_rewards.device)
.to(torch.float)
)
active = (1.0 - done) * (1.0 - goal_achieved)

if self.config.time_penalty != 0.0:
weighted_rewards = weighted_rewards - self.config.time_penalty * active

# Get the speed (used for the idle penalty)
if self.config.idle_penalty != 0.0:
speed = (
self.sim.self_observation_tensor()
.to_torch()
.clone()[:, :, 0]
.to(weighted_rewards.device)
.to(torch.float)
)
is_idle = (speed < self.config.idle_speed_threshold).to(torch.float)
weighted_rewards = weighted_rewards - self.config.idle_penalty * is_idle * active

# Progress reward: the closer the agent is to the goal, the higher the per-step positive reward (dense guidance signal)
if self.config.progress_reward_weight != 0.0:
self_obs = self.sim.self_observation_tensor().to_torch().clone()
rel_goal_x = self_obs[:, :, 4].to(weighted_rewards.device)
rel_goal_y = self_obs[:, :, 5].to(weighted_rewards.device)
dist_to_goal = torch.sqrt(rel_goal_x ** 2 + rel_goal_y ** 2 + 1e-6)
progress_reward = self.config.progress_reward_weight * torch.exp(
-dist_to_goal / self.config.progress_reward_scale
)
# Only applied to agents that are still driving
weighted_rewards = weighted_rewards + progress_reward * active

return weighted_rewards

elif self.config.reward_type == "reward_conditioned":
21 changes: 20 additions & 1 deletion gpudrive/integrations/puffer/ppo.py
@@ -239,6 +239,20 @@ def train(data):
dones_np = experience.dones_np[idxs]
values_np = experience.values_np[idxs]
rewards_np = experience.rewards_np[idxs]

# Numerical stability check on the GAE inputs
if np.isnan(dones_np).any() or np.isnan(values_np).any() or np.isnan(rewards_np).any():
print("Warning: NaN detected in GAE inputs, replacing with zeros")
dones_np = np.nan_to_num(dones_np, nan=0.0)
values_np = np.nan_to_num(values_np, nan=0.0)
rewards_np = np.nan_to_num(rewards_np, nan=0.0)

# Check for Inf values
if np.isinf(values_np).any() or np.isinf(rewards_np).any():
print("Warning: Inf detected in GAE inputs, clipping values")
values_np = np.clip(values_np, -1e6, 1e6)
rewards_np = np.clip(rewards_np, -1e6, 1e6)

advantages_np = compute_gae(
dones_np, values_np, rewards_np, config.gamma, config.gae_lambda
)
@@ -347,7 +361,12 @@ def train(data):

with profile.train_misc:
if config.anneal_lr:
frac = 1.0 - data.global_step / config.total_timesteps
# When continuing training, start annealing from the configured learning rate
lr_start_step = getattr(data, 'lr_start_step', 0)
lr_total_steps = config.total_timesteps - lr_start_step
steps_since_start = data.global_step - lr_start_step
frac = 1.0 - steps_since_start / lr_total_steps
frac = max(0.0, frac)  # Guard against a negative fraction
lrnow = float(frac) * float(config.learning_rate)
data.optimizer.param_groups[0]["lr"] = lrnow

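
For the annealing change above, a small sketch of the resulting schedule when resuming a run. The `lr_start_step` value here is a hypothetical resume point, and how the attribute is attached to `data` is not shown in this diff:

```python
def annealed_lr(global_step, learning_rate=2e-4, total_timesteps=500_000_000,
                lr_start_step=72_000_000):
    # Mirrors the anneal_lr branch in train(): decay linearly from the configured
    # learning rate over the steps remaining after the (assumed) resume point.
    lr_total_steps = total_timesteps - lr_start_step
    steps_since_start = global_step - lr_start_step
    frac = max(0.0, 1.0 - steps_since_start / lr_total_steps)
    return frac * learning_rate

for step in (72_000_000, 200_000_000, 400_000_000, 500_000_000):
    print(f"step={step:>11,}  lr={annealed_lr(step):.2e}")
# step= 72,000,000  lr=2.00e-04
# step=200,000,000  lr=1.40e-04
# step=400,000,000  lr=4.67e-05
# step=500,000,000  lr=0.00e+00
```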