112 changes: 63 additions & 49 deletions baselines/ppo/config/ppo_base_puffer.yaml
@@ -3,105 +3,119 @@ use_rnn: false
eval_model_path: null
baseline: false
data_dir: data/processed/training
continue_training: false
model_cpt: null

environment: # Overrides default environment configs (see pygpudrive/env/config.py)
continue_training: true
model_cpt: /home/wbk/gpudrive/runs/PPO__C__S_72__01_20_15_12_22_098/model_PPO__C__S_72__01_20_15_12_22_098_027823.pt
environment: # Overrides default environment configs (see gpudrive/env/config.py)
name: "gpudrive"
num_worlds: 75 # Number of parallel environments
k_unique_scenes: 75 # Number of unique scenes to sample from
max_controlled_agents: 64 # Maximum number of agents controlled by the model. Make sure this aligns with the variable kMaxAgentCount in src/consts.hpp
num_worlds: 18 # Number of parallel environments (reduced further to lower peak GPU memory during resampling)
k_unique_scenes: 72 # Number of unique scenes to sample from (reduced to lighten the per-run load)
max_controlled_agents: 64 # Maximum number of agents controlled by the model; must match the environment mask dimensions
ego_state: true
road_map_obs: true
partner_obs: true
norm_obs: true
remove_non_vehicles: true # If false, all agents are included (vehicles, pedestrians, cyclists)
lidar_obs: false # NOTE: Setting this to true currently turns of the other observation types
remove_non_vehicles: true # If false, all agents are included (vehicles, pedestrians, cyclists)
lidar_obs: false # NOTE: Setting this to true currently turns off the other observation types
reward_type: "weighted_combination"
collision_weight: -0.75
off_road_weight: -0.75
goal_achieved_weight: 1.0
collision_weight: -3.0 # Higher collision penalty: reduce collisions while turning
off_road_weight: -3.0 # Lowered: allow moderate risk-taking
goal_achieved_weight: 1.0 # Raised substantially: make "reaching the goal" more attractive than "playing it safe"
# Shaping terms to avoid the "move a little, then stop" local optimum (only active with weighted_combination)
time_penalty: 0.005 # Raised: increases the pressure to keep making progress
idle_speed_threshold: 0.5
idle_penalty: 0.02 # Lowered: avoid over-penalizing
# Progress reward: the closer the agent is to the goal, the higher the per-step reward (dense positive signal)
progress_reward_weight: 0.1 # Lowered: avoid changing the reward scale too much
progress_reward_scale: 20.0
dynamics_model: "classic"
collision_behavior: "ignore" # Options: "remove", "stop", "ignore"
collision_behavior: "remove" # Options: "remove", "stop", "ignore"
dist_to_goal_threshold: 2.0
polyline_reduction_threshold: 0.1 # Rate at which to sample points from the polyline (0 is use all closest points, 1 maximum sparsity), needs to be balanced with kMaxAgentMapObservationsCount
sampling_seed: 42 # If given, the set of scenes to sample from will be deterministic, if None, the set of scenes will be random
obs_radius: 50.0 # Visibility radius of the agents
polyline_reduction_threshold: 0.1 # Rate at which to sample points from the polyline (0 uses all closest points, 1 is maximum sparsity); balance against kMaxAgentMapObservationsCount
sampling_seed: 42 # If set, the sampled scene set is reproducible; if None, it is random
obs_radius: 50.0 # Visibility radius of the agents
action_space_steer_disc: 13
action_space_accel_disc: 7
# Versatile Behavior Diffusion (VBD): This will slow down training
# Versatile Behavior Diffusion (VBD): enabling this will slow down training
use_vbd: false
vbd_model_path: "gpudrive/integrations/vbd/weights/epoch=18.ckpt"
init_steps: 11
vbd_trajectory_weight: 0.1 # Importance of distance to the vbd trajectories in the reward function
vbd_trajectory_weight: 0.1 # Weight of the distance-to-VBD-trajectories term in the reward function
vbd_in_obs: false

wandb:
entity: ""
project: "gpudrive"
group: "test"
mode: "online" # Options: online, offline, disabled
mode: "online" # 选项:onlineofflinedisabled
tags: ["ppo", "ff"]

train:
exp_id: PPO # Set dynamically in the script if needed
exp_id: PPO # Can be set dynamically in the script if needed
seed: 42
cpu_offload: false
device: "cuda" # Dynamically set to cuda if available, else cpu
device: "cuda" # 若可用则使用 cuda,否则使用 cpu
bptt_horizon: 1
compile: false
compile_mode: "reduce-overhead"

# # # Data sampling # # #
resample_scenes: false
resample_dataset_size: 10_000 # Number of unique scenes to sample from
resample_interval: 2_000_000
# # # Data sampling # # #
resample_scenes: false # Enable to resample scenes during training for better generalization
resample_dataset_size: 10_000
resample_interval: 10_000_000 # Roughly 5 resamples over a 50M-step run, balancing stability and generalization
sample_with_replacement: true
shuffle_dataset: false

# # # PPO # # #
torch_deterministic: false
total_timesteps: 1_000_000_000
batch_size: 131_072
minibatch_size: 8192
learning_rate: 3e-4
anneal_lr: false
total_timesteps: 500_000_000
batch_size: 18432
minibatch_size: 3072
# Moderately higher learning rate: adapt to the new reward function (higher collision/off-road penalties)
learning_rate: 2e-4 # Raised from 1e-4 to 2e-4 to adapt faster to the new reward signal
anneal_lr: true # Enable LR decay: start at 2e-4 and decrease over training; adapt quickly first, then fine-tune
gamma: 0.99
gae_lambda: 0.95
update_epochs: 4
# Tighten updates: avoid overly large policy changes
update_epochs: 3 # Fewer update epochs
norm_adv: true
clip_coef: 0.2
clip_vloss: false
clip_coef: 0.15 # Tighter clipping to limit the size of policy updates
# More stable value function
clip_vloss: true
vf_clip_coef: 0.2
ent_coef: 0.0001
# Less exploration: the policy has already learned to drive; stability matters now (reduce oscillation)
ent_coef: 0.0003 # Lowered from 0.001 to 0.0003
vf_coef: 0.3
max_grad_norm: 0.5
target_kl: null
# KL early stopping: prevents a single oversized update after resampling from causing instability
target_kl: 0.02
log_window: 1000

# # # Network # # #
# # # Network # # #
network:
input_dim: 64 # Embedding of the input features
hidden_dim: 128 # Latent dimension
input_dim: 64 # Embedding dimension of the input features
hidden_dim: 128 # Latent dimension
dropout: 0.01
class_name: "NeuralNet"
num_parameters: 0 # Total trainable parameters, to be filled at runtime
num_parameters: 0 # Total trainable parameters, filled in at runtime
# New: observation fusion network configuration
fusion_type: "attention" # Options: "simple", "attention", "adaptive"
num_attention_heads: 4 # Number of attention heads (only used when fusion_type="attention")

# # # Checkpointing # # #
checkpoint_interval: 400 # Save policy every k iterations
# # # Checkpointing # # #
checkpoint_interval: 200 # Save the policy every k iterations
checkpoint_path: "./runs"

# # # Rendering # # #
render: false # Determines whether to render the environment (note: will slow down training)
render_3d: true # Render simulator state in 3d or 2d
render_interval: 1 # Render every k iterations
render_k_scenarios: 10 # Number of scenarios to render
render_format: "mp4" # Options: gif, mp4
render_fps: 15 # Frames per second
# # # Rendering # # #
render: false # Whether to render the environment (note: will slow down training)
render_3d: true # Render simulator state in 3D or 2D
render_interval: 1 # Render every k iterations
render_k_scenarios: 0 # Keep at 0 during training to avoid extra IO and nondeterminism
render_format: "mp4" # Options: gif, mp4
render_fps: 15 # Frames per second
zoom_radius: 50

vec:
backend: "native" # Only native is currently supported
backend: "native" # 目前仅支持 native
num_workers: 1
env_batch_size: 1
zero_copy: false
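
The `fusion_type` and `num_attention_heads` keys above configure an observation-fusion head whose implementation is not part of this diff. Below is a minimal, hypothetical sketch of what an attention-based fusion over the ego/partner/road-map embeddings could look like, assuming each modality has already been embedded to `hidden_dim`; the class and method names are illustrative, not the repository's API.

```python
import torch
import torch.nn as nn

class AttentionFusion(nn.Module):
    """Hypothetical sketch of fusion_type="attention" (not the repo's NeuralNet code)."""

    def __init__(self, hidden_dim: int = 128, num_heads: int = 4):
        super().__init__()
        # num_heads corresponds to num_attention_heads in the config above.
        self.attn = nn.MultiheadAttention(hidden_dim, num_heads, batch_first=True)
        self.out = nn.Linear(hidden_dim, hidden_dim)

    def forward(self, ego, partner, road_map):
        # Treat the three per-modality embeddings as a length-3 token sequence.
        tokens = torch.stack([ego, partner, road_map], dim=1)  # (batch, 3, hidden_dim)
        fused, _ = self.attn(tokens, tokens, tokens)            # self-attention across modalities
        return self.out(fused.mean(dim=1))                      # pool and project back to hidden_dim

# Example: a batch of 8 agents, each modality already embedded to 128 dims.
ego, partner, road_map = (torch.randn(8, 128) for _ in range(3))
print(AttentionFusion()(ego, partner, road_map).shape)  # torch.Size([8, 128])
```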
2 changes: 1 addition & 1 deletion baselines/ppo/config/ppo_base_sb3.yaml
@@ -1,5 +1,5 @@
data_dir: "data/processed/examples"
num_worlds: 100
num_worlds: 30
k_unique_scenes: 4
device: "cuda" # or "cpu"

20 changes: 18 additions & 2 deletions gpudrive/env/config.py
@@ -42,9 +42,9 @@ class EnvConfig:

# Set the weights for the reward components
# R = a * collided + b * goal_achieved + c * off_road
collision_weight: float = 0.0
collision_weight: float = -0.5
goal_achieved_weight: float = 1.0
off_road_weight: float = 0.0
off_road_weight: float = -0.5

# Road observation algorithm settings
road_obs_algorithm: str = "linear" # Algorithm for road observations
@@ -101,6 +101,22 @@ class EnvConfig:
reward_type: str = "sparse_on_goal_achieved"
# Alternatively, "weighted_combination", "distance_to_logs", "distance_to_vdb_trajs", "reward_conditioned"

# --- Extra dense shaping terms for weighted_combination (to avoid the "move a little, then stop" local optimum) ---
# Per-step time cost (only applied when reward_type == "weighted_combination")
# Suggested starting range: 0.001-0.005; too large a value can make the policy reckless and increase collisions
time_penalty: float = 0.0

# Low-speed/idle penalty (only applied when reward_type == "weighted_combination")
# When speed < idle_speed_threshold and the agent is neither finished nor terminated, subtract an extra idle_penalty
idle_speed_threshold: float = 0.5
idle_penalty: float = 0.0

# Progress reward: the closer the agent is to the goal, the higher the per-step reward (dense positive signal)
# reward += progress_reward_weight * exp(-dist_to_goal / progress_reward_scale)
# Suggested progress_reward_weight: 0.1-0.3, progress_reward_scale: 15-30
progress_reward_weight: float = 0.0 # Disabled by default
progress_reward_scale: float = 20.0 # Distance decay factor

condition_mode: str = "random" # Options: "random", "fixed", "preset"

# Define upper and lower bounds for reward components if using reward_conditioned
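
To make the magnitude of the new progress term concrete, here is a small standalone check that evaluates `progress_reward_weight * exp(-dist_to_goal / progress_reward_scale)` with the values used in ppo_base_puffer.yaml (weight 0.1, scale 20.0):

```python
import math

progress_reward_weight = 0.1   # value set in ppo_base_puffer.yaml
progress_reward_scale = 20.0

for dist_to_goal in (1.0, 10.0, 20.0, 50.0):
    r = progress_reward_weight * math.exp(-dist_to_goal / progress_reward_scale)
    print(f"dist_to_goal={dist_to_goal:5.1f}  progress reward per step = {r:.4f}")
# dist_to_goal=  1.0  progress reward per step = 0.0951
# dist_to_goal= 10.0  progress reward per step = 0.0607
# dist_to_goal= 20.0  progress reward per step = 0.0368
# dist_to_goal= 50.0  progress reward per step = 0.0082
```

At these settings the per-step shaping signal stays well below the goal reward of 1.0, which is the stated reason for lowering progress_reward_weight in the config.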
5 changes: 4 additions & 1 deletion gpudrive/env/dataset.py
@@ -35,6 +35,8 @@ def __post_init__(self):
)

# Set the random seed for reproducibility
if self.seed is None:
self.seed = 42
self.random_gen = random.Random(self.seed)

# Create the dataset from valid files in the directory
@@ -84,8 +86,9 @@ def __len__(self):
def __next__(self) -> List[str]:
if self.sample_with_replacement:
# Ensure deterministic behavior
base_seed = 0 if self.seed is None else self.seed
random_gen = random.Random(
self.seed + self.current_index
base_seed + self.current_index
) # Changing the seed per batch

# Determine the batch size using the random generator to shuffle the indices
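
The intent of the two changes above is that resampling stays reproducible even when no seed is passed. A small sketch of the per-batch seeding pattern in isolation (the loader's actual sampling call may differ; this only illustrates the base_seed + current_index scheme):

```python
import random

def batch_indices(seed, current_index, dataset_len=1000, batch_size=5):
    # Mirrors the pattern in SceneDataLoader.__next__: a fresh generator seeded
    # with base_seed + batch index gives the same draw for that batch on every run.
    base_seed = 0 if seed is None else seed
    gen = random.Random(base_seed + current_index)
    return gen.sample(range(dataset_len), batch_size)

print(batch_indices(42, 0))    # identical on every run
print(batch_indices(42, 1))    # different batch, still deterministic
print(batch_indices(None, 0))  # seed=None now falls back to base_seed = 0
```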
19 changes: 13 additions & 6 deletions gpudrive/env/env_puffer.py
@@ -52,24 +52,24 @@ def __init__(
off_road_weight=-0.5,
goal_achieved_weight=1,
dist_to_goal_threshold=2.0,
polyline_reduction_threshold=0.1,
polyline_reduction_threshold=0.1,  # Polyline reduction threshold: controls the sampling density of road-graph observation points
remove_non_vehicles=True,
obs_radius=50.0,
use_vbd=False,
vbd_model_path=None,
vbd_trajectory_weight=0.1,
render=False,
render_3d=True,
render_interval=50,
render_k_scenarios=3,
render_interval=50,  # Rendering interval: render once every N iterations
render_k_scenarios=3,  # Number of scenarios to render
render_agent_obs=False,
render_format="mp4",
render_fps=15,
zoom_radius=50,
buf=None,
buf=None,  # Buffer for storing environment states and actions
**kwargs,
):
assert buf is None, "GPUDrive set up only for --vec native"
assert buf is None, "GPUDrive set up only for --vec native"  # Only the native vectorization backend is supported

if data_loader is None:
data_loader = SceneDataLoader(
@@ -78,7 +78,7 @@ def __init__(
dataset_size=loader_dataset_size,
sample_with_replacement=loader_sample_with_replacement,
shuffle=loader_shuffle,
)
)  # Data loader for loading scene data

if device is None:
device = "cuda" if torch.cuda.is_available() else "cpu"
@@ -239,6 +239,13 @@ def step(self, action):
Args:
action: A numpy array of actions for the controlled agents. Shape:
(num_worlds, max_cont_agents_per_env)
Performs one environment step:
1. Apply the actions
2. Run the physics simulation
3. Compute rewards
4. Handle termination states
5. Asynchronously reset finished environments
6. Return the new observations
"""

# Set the action for the controlled agents
45 changes: 45 additions & 0 deletions gpudrive/env/env_torch.py
@@ -491,6 +491,51 @@ def get_rewards(
+ off_road_weight * off_road
)

# Dense shaping: avoid the "stopping is optimal" local optimum
# Applied only to agents that are not done and have not reached the goal (no extra penalties after done/goal)
needs_shaping = (
self.config.time_penalty != 0.0
or self.config.idle_penalty != 0.0
or self.config.progress_reward_weight != 0.0
)
if needs_shaping:
done = (
self.sim.done_tensor()
.to_torch()
.clone()
.squeeze(dim=2)
.to(weighted_rewards.device)
.to(torch.float)
)
active = (1.0 - done) * (1.0 - goal_achieved)

if self.config.time_penalty != 0.0:
weighted_rewards = weighted_rewards - self.config.time_penalty * active

# Get the speed (used for the idle penalty)
if self.config.idle_penalty != 0.0:
speed = (
self.sim.self_observation_tensor()
.to_torch()
.clone()[:, :, 0]
.to(weighted_rewards.device)
.to(torch.float)
)
is_idle = (speed < self.config.idle_speed_threshold).to(torch.float)
weighted_rewards = weighted_rewards - self.config.idle_penalty * is_idle * active

# Progress reward: the closer the agent is to the goal, the higher the per-step positive reward (dense guidance signal)
if self.config.progress_reward_weight != 0.0:
self_obs = self.sim.self_observation_tensor().to_torch().clone()
rel_goal_x = self_obs[:, :, 4].to(weighted_rewards.device)
rel_goal_y = self_obs[:, :, 5].to(weighted_rewards.device)
dist_to_goal = torch.sqrt(rel_goal_x ** 2 + rel_goal_y ** 2 + 1e-6)
progress_reward = self.config.progress_reward_weight * torch.exp(
-dist_to_goal / self.config.progress_reward_scale
)
# Only applied to agents that are still driving
weighted_rewards = weighted_rewards + progress_reward * active

return weighted_rewards

elif self.config.reward_type == "reward_conditioned":
21 changes: 20 additions & 1 deletion gpudrive/integrations/puffer/ppo.py
@@ -239,6 +239,20 @@ def train(data):
dones_np = experience.dones_np[idxs]
values_np = experience.values_np[idxs]
rewards_np = experience.rewards_np[idxs]

# Numerical stability check on the GAE inputs
if np.isnan(dones_np).any() or np.isnan(values_np).any() or np.isnan(rewards_np).any():
print("Warning: NaN detected in GAE inputs, replacing with zeros")
dones_np = np.nan_to_num(dones_np, nan=0.0)
values_np = np.nan_to_num(values_np, nan=0.0)
rewards_np = np.nan_to_num(rewards_np, nan=0.0)

# Check for Inf values
if np.isinf(values_np).any() or np.isinf(rewards_np).any():
print("Warning: Inf detected in GAE inputs, clipping values")
values_np = np.clip(values_np, -1e6, 1e6)
rewards_np = np.clip(rewards_np, -1e6, 1e6)

advantages_np = compute_gae(
dones_np, values_np, rewards_np, config.gamma, config.gae_lambda
)
@@ -347,7 +361,12 @@ def train(data):

with profile.train_misc:
if config.anneal_lr:
frac = 1.0 - data.global_step / config.total_timesteps
# When continuing training, start annealing from the configured learning rate
lr_start_step = getattr(data, 'lr_start_step', 0)
lr_total_steps = config.total_timesteps - lr_start_step
steps_since_start = data.global_step - lr_start_step
frac = 1.0 - steps_since_start / lr_total_steps
frac = max(0.0, frac)  # Guard against a negative fraction
lrnow = float(frac) * float(config.learning_rate)
data.optimizer.param_groups[0]["lr"] = lrnow

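
For the annealing change above, a small sketch of the resulting schedule when resuming a run. The `lr_start_step` value here is a hypothetical resume point, and how the attribute is attached to `data` is not shown in this diff:

```python
def annealed_lr(global_step, learning_rate=2e-4, total_timesteps=500_000_000,
                lr_start_step=72_000_000):
    # Mirrors the anneal_lr branch in train(): decay linearly from the configured
    # learning rate over the steps remaining after the (assumed) resume point.
    lr_total_steps = total_timesteps - lr_start_step
    steps_since_start = global_step - lr_start_step
    frac = max(0.0, 1.0 - steps_since_start / lr_total_steps)
    return frac * learning_rate

for step in (72_000_000, 200_000_000, 400_000_000, 500_000_000):
    print(f"step={step:>11,}  lr={annealed_lr(step):.2e}")
# step= 72,000,000  lr=2.00e-04
# step=200,000,000  lr=1.40e-04
# step=400,000,000  lr=4.67e-05
# step=500,000,000  lr=0.00e+00
```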