diff --git a/.gitignore b/.gitignore index 2bd940568..a19332a6e 100644 --- a/.gitignore +++ b/.gitignore @@ -172,6 +172,9 @@ pufferlib/resources/drive/binaries/validation/ !pufferlib/resources/drive/binaries/training/map_000.bin pufferlib/resources/drive/sanity/sanity_binaries/ +# Ignore human demonstration data +pufferlib/resources/drive/human_demonstrations/* + # Compiled drive binary in root /drive /visualize diff --git a/examples/analyze_learnable_information.py b/examples/analyze_learnable_information.py new file mode 100644 index 000000000..d221dc65d --- /dev/null +++ b/examples/analyze_learnable_information.py @@ -0,0 +1,366 @@ +import wandb +import torch +import torch.nn as nn +from torch.optim import Adam +from torch.utils.data import DataLoader, TensorDataset +from torch.distributions.categorical import Categorical +import numpy as np +import matplotlib.pyplot as plt +from sklearn.decomposition import PCA + +from pufferlib.pufferl import load_config, load_env + + +class BCPolicy(nn.Module): + def __init__(self, input_size, hidden_size, output_size): + super().__init__() + self.nn = nn.Sequential( + nn.Linear(input_size, hidden_size), + nn.ReLU(), + nn.LayerNorm(hidden_size), + nn.Linear(hidden_size, hidden_size), + nn.ReLU(), + nn.LayerNorm(hidden_size), + nn.Linear(hidden_size, hidden_size), + nn.ReLU(), + ) + self.heads = nn.ModuleList([nn.Linear(hidden_size, output_size)]) + + def dist(self, obs): + """Generate action distribution.""" + x_out = self.nn(obs.float()) + return [Categorical(logits=head(x_out)) for head in self.heads] + + def forward(self, obs, deterministic=False): + """Generate an output from tensor input.""" + action_dist = self.dist(obs) + + if deterministic: + actions_idx = action_dist[0].logits.argmax(axis=-1) + else: + actions_idx = action_dist[0].sample() + return actions_idx + + def _log_prob(self, obs, expert_actions): + pred_action_dist = self.dist(obs) + log_prob = pred_action_dist[0].log_prob(expert_actions).mean() + return log_prob + + +def prepare_human_data(env, max_expert_sequences=512): + """Step 1: Extract and process human demonstration data""" + print("Preparing human data...") + + env._prep_human_data( + bptt_horizon=1, + max_expert_sequences=max_expert_sequences, + ) + + # Access the raw expert data collected by the environment + expert_actions_discrete = torch.Tensor(env.expert_actions_discrete) # Shape: (T, N, 1) + expert_observations = torch.Tensor(env.expert_observations_full) # Shape: (T, N, obs_dim) + + # Flatten to create a batch of samples + action_labels = torch.flatten(expert_actions_discrete, 0, 1).squeeze() # [B] + observations = torch.flatten(expert_observations, 0, 1) # [B, obs_dim] + + # Filter out invalid actions (-1) + valid_mask = action_labels != -1 + action_labels = action_labels[valid_mask] + observations = observations[valid_mask] + + return observations, action_labels + + +def compute_observation_coverage(obs, max_expert_sequences, subsample_for_viz=5000): + """ + Compute observation space coverage metrics. + + Metrics: + 1. Statistical Dispersion: variance, std, range per dimension + 2. Effective Rank: PCA components needed to explain 95% variance + 3. Visualizations: PCA projections and distributions + """ + print("\nAnalyzing observation coverage...") + + obs_np = obs.numpy() if isinstance(obs, torch.Tensor) else obs + n_samples, obs_dim = obs_np.shape + + # === 1. 
Statistical Dispersion Metrics === + obs_std = np.std(obs_np, axis=0) + obs_var = np.var(obs_np, axis=0) + obs_min = np.min(obs_np, axis=0) + obs_max = np.max(obs_np, axis=0) + obs_range = obs_max - obs_min + + # Average metrics across dimensions + avg_std = np.mean(obs_std) + avg_var = np.mean(obs_var) + avg_range = np.mean(obs_range) + + print(f" Avg Std: {avg_std:.4f}") + print(f" Avg Variance: {avg_var:.4f}") + print(f" Avg Range: {avg_range:.4f}") + + # === 2. Effective Rank (PCA) === + pca = PCA() + pca.fit(obs_np) + explained_var_ratio = pca.explained_variance_ratio_ + cumulative_var = np.cumsum(explained_var_ratio) + + # Effective rank (number of dimensions to explain 95% variance) + effective_rank = np.argmax(cumulative_var >= 0.95) + 1 + + print(f" Effective Rank (95% var): {effective_rank}/{obs_dim}") + + # === 3. Visualizations === + fig, axes = plt.subplots(2, 2, figsize=(14, 12)) + + # Plot 1: Variance per dimension + axes[0, 0].bar(range(min(50, obs_dim)), obs_var[:50], color="steelblue", edgecolor="black") + axes[0, 0].set_xlabel("Dimension", fontsize=11) + axes[0, 0].set_ylabel("Variance", fontsize=11) + axes[0, 0].set_title("Variance per Dimension (first 50)", fontsize=12, fontweight="bold") + axes[0, 0].grid(True, alpha=0.3) + + # Plot 2: Range per dimension + axes[0, 1].bar(range(min(50, obs_dim)), obs_range[:50], color="coral", edgecolor="black") + axes[0, 1].set_xlabel("Dimension", fontsize=11) + axes[0, 1].set_ylabel("Range", fontsize=11) + axes[0, 1].set_title("Range per Dimension (first 50)", fontsize=12, fontweight="bold") + axes[0, 1].grid(True, alpha=0.3) + + # Plot 3: PCA explained variance + axes[1, 0].plot(cumulative_var[: min(100, len(cumulative_var))], "b-", linewidth=2, label="Cumulative Variance") + axes[1, 0].axhline(y=0.95, color="r", linestyle="--", linewidth=2, label="95% threshold") + axes[1, 0].axvline( + x=effective_rank - 1, color="g", linestyle="--", linewidth=2, label=f"Effective rank = {effective_rank}" + ) + axes[1, 0].set_xlabel("Number of Components", fontsize=11) + axes[1, 0].set_ylabel("Cumulative Explained Variance", fontsize=11) + axes[1, 0].set_title("PCA Cumulative Variance", fontsize=12, fontweight="bold") + axes[1, 0].legend(fontsize=10) + axes[1, 0].grid(True, alpha=0.3) + + # Plot 4: 2D PCA projection + if n_samples > subsample_for_viz: + viz_indices = np.random.choice(n_samples, subsample_for_viz, replace=False) + obs_viz = obs_np[viz_indices] + else: + obs_viz = obs_np + + pca_2d = PCA(n_components=2) + obs_2d = pca_2d.fit_transform(obs_viz) + + axes[1, 1].scatter(obs_2d[:, 0], obs_2d[:, 1], alpha=0.4, s=10, c="purple", edgecolors="none") + axes[1, 1].set_xlabel(f"PC1 ({pca_2d.explained_variance_ratio_[0]:.2%} var)", fontsize=11) + axes[1, 1].set_ylabel(f"PC2 ({pca_2d.explained_variance_ratio_[1]:.2%} var)", fontsize=11) + axes[1, 1].set_title(f"2D PCA Projection (N={n_samples} samples)", fontsize=12, fontweight="bold") + axes[1, 1].grid(True, alpha=0.3) + + plt.suptitle( + f"Observation Coverage Analysis (N={max_expert_sequences} sequences)", fontsize=14, fontweight="bold", y=1.00 + ) + plt.tight_layout() + + # Save and log + plt.savefig(f"coverage_analysis_N{max_expert_sequences}.png", dpi=150, bbox_inches="tight") + wandb.log({f"coverage_analysis": wandb.Image(fig)}) + plt.close() + + # Log metrics to wandb + coverage_metrics = { + "coverage/dataset_size": n_samples, + "coverage/obs_dim": obs_dim, + "coverage/avg_std": avg_std, + "coverage/avg_variance": avg_var, + "coverage/avg_range": avg_range, + "coverage/effective_rank": 
effective_rank, + "coverage/effective_rank_ratio": effective_rank / obs_dim, + "coverage/pca_var_pc1": explained_var_ratio[0], + "coverage/pca_var_pc2": explained_var_ratio[1] if len(explained_var_ratio) > 1 else 0, + } + + wandb.log(coverage_metrics) + + return coverage_metrics + + +def train_bc_policy(obs, actions, config): + """Step 2: Train behavioral cloning policy""" + print("Training BC policy...") + + # Initialize wandb + wandb.init(project="gsp_epiplexity", config=config) + + wandb.log({"dataset_size": obs.shape[0]}) + + # Setup + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + + # Convert to tensors + obs_tensor = obs.float() + actions_tensor = actions.long() + + dataset = TensorDataset(obs_tensor, actions_tensor) + dataloader = DataLoader(dataset, batch_size=config["batch_size"], shuffle=True) + data_iter = iter(dataloader) + + # Create model + policy = BCPolicy( + input_size=obs.shape[-1], hidden_size=config["hidden_size"], output_size=config["num_actions"] + ).to(device) + + optimizer = Adam(policy.parameters(), lr=config["learning_rate"]) + + # Training loop + losses = [] + global_step = 0 + + for epoch in range(config["epochs"]): + epoch_losses = [] + + for i in range(config["minibatches"]): + try: + batch_obs, batch_actions = next(data_iter) + except StopIteration: + data_iter = iter(dataloader) + batch_obs, batch_actions = next(data_iter) + + batch_obs = batch_obs.to(device) + batch_actions = batch_actions.to(device) + + # Forward pass + log_prob = policy._log_prob(batch_obs, batch_actions.float()) + loss = -log_prob + + # Backward pass + optimizer.zero_grad() + loss.backward() + optimizer.step() + + # Compute accuracy + with torch.no_grad(): + pred_action = policy(batch_obs, deterministic=True) + accuracy = (batch_actions == pred_action).sum() / batch_actions.shape[0] + + # Log + loss_val = loss.item() + epoch_losses.append(loss_val) + losses.append(loss_val) + + wandb.log({"global_step": global_step, "loss": loss_val, "accuracy": accuracy.item(), "epoch": epoch}) + + global_step += 1 + + avg_epoch_loss = np.mean(epoch_losses) + + if avg_epoch_loss < 0.001: + print(f"Early stopping at epoch {epoch + 1} with loss {avg_epoch_loss:.6f}") + break + else: + print(f"Epoch {epoch + 1}/{config['epochs']}: Loss = {avg_epoch_loss:.4f}") + + return losses, policy + + +def compute_epiplexity(losses, dataset_size): + """Step 3: Compute area under loss curve above final loss (epiplexity)""" + print("Computing epiplexity...") + + # Convert to numpy array + losses_array = np.array(losses) + + # Take final loss as the last value (asymptotic loss) + final_loss = losses_array[-1] + + # Epiplexity: area under the curve above the final loss + losses_above_final = losses_array - final_loss + epiplexity = np.trapezoid(losses_above_final) + + print(f"Final Loss: {final_loss:.4f}") + print(f"Epiplexity (AUC above final loss): {epiplexity:.4f}") + + # Log to wandb + wandb.log({"epiplexity": epiplexity, "final_loss": final_loss}) + + # Create visualization + fig, ax = plt.subplots(figsize=(10, 7)) + + # Plot loss curve + steps = np.arange(len(losses_array)) + ax.plot(steps, losses_array, linewidth=1.5, label="Training loss", zorder=3) + + # Draw horizontal line at final loss + ax.axhline(y=final_loss, color="red", linestyle="--", linewidth=2, label=f"Final Loss = {final_loss:.4f}", zorder=2) + + # Fill the area between loss curve and final loss (epiplexity area) + ax.fill_between( + steps, + losses_array, + final_loss, + where=(losses_array >= final_loss), + color="red", + 
alpha=0.3, + label=f"Epiplexity = {epiplexity:.4f}", + zorder=1, + ) + + ax.set_xlabel("Training step", fontsize=12) + ax.set_ylabel("Loss", fontsize=12) + ax.set_title(f"BC training loss curve (N={dataset_size} samples)", fontsize=14) + ax.legend(loc="upper right", fontsize=10) + ax.grid(True, alpha=0.3) + + plt.tight_layout() + + # Log plot to wandb + wandb.log({"loss_curve_with_epiplexity": wandb.Image(fig)}) + plt.close() + + return epiplexity, final_loss + + +if __name__ == "__main__": + # Load configuration + args = load_config("puffer_drive") + args["vec"]["backend"] = "Serial" + + for max_expert_sequences in [256, 512, 1024, 2048, 4096]: + args["env"]["num_agents"] = max_expert_sequences + + config = { + "batch_size": 512, + "hidden_size": 1024, + "num_actions": 91, # 7*13 for classic discrete action space + "learning_rate": 1e-4, + "epochs": 10_000, + "minibatches": 4, + "max_expert_sequences": max_expert_sequences, + } + + env = load_env("puffer_drive", args) + + # Step 1: Prepare human data (o_t, a_t) tuples + human_obs, human_actions = prepare_human_data(env.driver_env, max_expert_sequences=max_expert_sequences) + + print(f"Data shapes - Obs: {human_obs.shape}, Actions: {human_actions.shape}") + + # Step 2: Train BC policy + losses, policy = train_bc_policy(human_obs, human_actions, config) + + # Analyze observation coverage + # coverage_metrics = compute_observation_coverage(human_obs, max_expert_sequences) + + # Step 3: Compute epiplexity with visualization + epiplexity, final_loss = compute_epiplexity(losses, human_obs.shape[0]) + + print("\n" + "=" * 60) + print(f"RESULTS (N={max_expert_sequences}):") + print(f" Final Loss: {final_loss:.4f}") + print(f" Epiplexity: {epiplexity:.4f}") + print("=" * 60 + "\n") + + env.close() + + wandb.finish() diff --git a/examples/eval_pipeline.py b/examples/eval_pipeline.py new file mode 100644 index 000000000..50dcc3ef5 --- /dev/null +++ b/examples/eval_pipeline.py @@ -0,0 +1,352 @@ +import numpy as np +import pandas as pd +import torch +import matplotlib.pyplot as plt +import matplotlib as mpl +import seaborn as sns +import warnings + +from pufferlib.pufferl import load_env, load_policy, load_config +from pufferlib.ocean.benchmark.evaluator import WOSACEvaluator + +POLICY_DIR = "models" + +COLUMN_ORDER = [ + "policy", + "realism_meta_score", + "kinematic_metrics", + "interactive_metrics", + "map_based_metrics", + "min_ade", + "ade", + "likelihood_linear_speed", + "likelihood_linear_acceleration", + "likelihood_angular_speed", + "likelihood_angular_acceleration", + "likelihood_collision_indication", + "likelihood_offroad_indication", + "likelihood_time_to_collision", + "likelihood_distance_to_road_edge", + "likelihood_distance_to_nearest_object", +] + + +def plot_wosac_results(df): + """Create a 3-column visualization of WOSAC results.""" + + # Set style + sns.set("notebook", font_scale=1.05, rc={"figure.figsize": (16, 5)}) + sns.set_style("ticks", rc={"figure.facecolor": "none", "axes.facecolor": "none"}) + warnings.filterwarnings("ignore") + plt.set_loglevel("WARNING") + mpl.rcParams["lines.markersize"] = 8 + + fig, axes = plt.subplots(1, 3, figsize=(15, 5)) + palette = sns.color_palette("Set2", n_colors=len(df["policy"].unique())) + + # Left: Realism meta score + sns.barplot(data=df, x="policy", y="realism_meta_score", errorbar="sd", palette=palette, ax=axes[0], alpha=0.8) + axes[0].set_title("Aggregate realism score") + # axes[0].set_ylim(0, 1.0) + axes[0].grid(axis="y", alpha=0.3, linestyle="--") + axes[0].tick_params(axis="x", 
rotation=30) + + # Middle: Metric Categories + df_metrics = df.melt( + id_vars=["policy"], value_vars=["kinematic_metrics", "interactive_metrics", "map_based_metrics"] + ) + sns.barplot( + data=df_metrics, x="variable", y="value", hue="policy", errorbar="sd", palette=palette, ax=axes[1], alpha=0.8 + ) + axes[1].set_title("Group metric categories") + axes[1].set_xlabel("") + axes[1].set_ylabel("Score") + # axes[1].set_ylim(0, 1.0) + axes[1].legend(title="Policy", loc="upper left") + axes[1].grid(axis="y", alpha=0.3, linestyle="--") + axes[1].tick_params(axis="x", rotation=30) + + # Right: ADE and minADE + df_ade = df.melt(id_vars=["policy"], value_vars=["ade", "min_ade"]) + sns.barplot( + data=df_ade, x="policy", y="value", hue="variable", errorbar="sd", palette="muted", ax=axes[2], alpha=0.8 + ) + axes[2].set_title("Displacement error") + axes[2].set_ylabel("Distance (m)") + axes[2].legend(title="Metric") + axes[2].grid(axis="y", alpha=0.3, linestyle="--") + axes[2].tick_params(axis="x", rotation=15) + + for ax in axes: + sns.despine(ax=ax) + + plt.tight_layout() + plt.savefig("wosac_evaluation_results.png", dpi=300, bbox_inches="tight", facecolor="white") + plt.show() + + return fig + + +def plot_realism_score_distributions(df): + """Create histogram distributions of realism scores for each policy.""" + + # Set style + sns.set("notebook", font_scale=1.05) + sns.set_style("ticks", rc={"figure.facecolor": "none", "axes.facecolor": "none"}) + warnings.filterwarnings("ignore") + plt.set_loglevel("WARNING") + mpl.rcParams["lines.markersize"] = 8 + + policies = df["policy"].unique() + n_policies = len(policies) + + fig, axes = plt.subplots(1, n_policies, figsize=(5 * n_policies, 4), sharey=True, sharex=True) + if n_policies == 1: + axes = [axes] + + palette = sns.color_palette("Set2", n_colors=n_policies) + + for idx, (policy, ax) in enumerate(zip(policies, axes)): + policy_data = df[df["policy"] == policy]["realism_meta_score"] + + # Plot histogram + ax.hist(policy_data, bins=20, alpha=0.8, color=palette[idx], edgecolor="black") + + # Add mean line + mean_val = policy_data.mean() + ax.axvline(mean_val, color="red", linestyle="--", linewidth=2, label=f"Mean: {mean_val:.3f}") + + # Add std text + std_val = policy_data.std() + ax.text( + 0.05, + 0.95, + f"μ = {mean_val:.3f}\nσ = {std_val:.3f}", + transform=ax.transAxes, + verticalalignment="top", + bbox=dict(boxstyle="round", facecolor="white", alpha=0.8), + ) + + ax.set_title(f"{policy}") + ax.set_xlabel("Realism meta score") + if idx == 0: + ax.set_ylabel("Count") + ax.grid(axis="y", alpha=0.3, linestyle="--") + ax.legend() + sns.despine(ax=ax) + + plt.tight_layout() + plt.savefig("wosac_realism_score_distributions.png", dpi=300, bbox_inches="tight", facecolor="white") + plt.show() + + return fig + + +def evaluate_ground_truth(config, vecenv, evaluator): + """Compute WOSAC metrics for ground truth trajectories.""" + + gt_trajectories = evaluator.collect_ground_truth_trajectories(vecenv) + + fake_simulated_trajectories = gt_trajectories.copy() + for key in ["x", "y", "heading", "id"]: + fake_simulated_trajectories[key] = np.repeat(gt_trajectories[key], config["eval"]["wosac_num_rollouts"], axis=1) + fake_simulated_trajectories["id"] = fake_simulated_trajectories["id"][..., np.newaxis] + + # Compute metrics + agent_state = vecenv.driver_env.get_global_agent_state() + road_edge_polylines = vecenv.driver_env.get_road_edge_polylines() + results = evaluator.compute_metrics( + gt_trajectories, + fake_simulated_trajectories, + agent_state, + 
road_edge_polylines, + ) + return results + + +def evaluate_human_inferred_actions(config, vecenv, evaluator): + """Compute WOSAC metrics for human inferred actions.""" + + gt_trajectories = evaluator.collect_ground_truth_trajectories(vecenv) + + vecenv._prep_human_data() + + # Roll out inferred human actions in the simulator + simulated_trajectories = evaluator.collect_simulated_trajectories( + args=config, + puffer_env=vecenv, + policy=None, + actions=vecenv.expert_actions_discrete, + ) + + # Compute metrics + agent_state = vecenv.driver_env.get_global_agent_state() + road_edge_polylines = vecenv.driver_env.get_road_edge_polylines() + results = evaluator.compute_metrics( + gt_trajectories, + simulated_trajectories, + agent_state, + road_edge_polylines, + ) + + return results + + +def evaluate_random_policy(config, vecenv, evaluator): + gt_trajectories = evaluator.collect_ground_truth_trajectories(vecenv) + simulated_trajectories = evaluator.collect_wosac_random_baseline(vecenv) + + # Compute metrics + agent_state = vecenv.driver_env.get_global_agent_state() + road_edge_polylines = vecenv.driver_env.get_road_edge_polylines() + results = evaluator.compute_metrics( + gt_trajectories, + simulated_trajectories, + agent_state, + road_edge_polylines, + ) + + return results + + +def evaluate_bc_policy(config, vecenv, evaluator, policy_path): + config["train"]["use_rnn"] = False + evaluator.mode = "bc_policy" + + from train_bc_policy import BCPolicy + + bc_policy = BCPolicy( + input_size=vecenv.observation_space.shape[-1], + hidden_size=1024, + output_size=21 * 31, + ) + bc_policy.load_state_dict(torch.load(policy_path)) + bc_policy.eval().to(config["train"]["device"]) + + gt_trajectories = evaluator.collect_ground_truth_trajectories(vecenv) + simulated_trajectories = evaluator.collect_simulated_trajectories( + args=config, + puffer_env=vecenv, + policy=bc_policy, + ) + + # Compute metrics + agent_state = vecenv.driver_env.get_global_agent_state() + road_edge_polylines = vecenv.driver_env.get_road_edge_polylines() + results = evaluator.compute_metrics( + gt_trajectories, + simulated_trajectories, + agent_state, + road_edge_polylines, + ) + + return results + + +def evaluate_rl_policy(config, vecenv, evaluator, policy_path): + config["train"]["use_rnn"] = True + evaluator.mode = "rl_policy" + + policy = load_policy(config, vecenv, "puffer_drive") + + gt_trajectories = evaluator.collect_ground_truth_trajectories(vecenv) + simulated_trajectories = evaluator.collect_simulated_trajectories(config, vecenv, policy) + + # Compute metrics + agent_state = vecenv.driver_env.get_global_agent_state() + road_edge_polylines = vecenv.driver_env.get_road_edge_polylines() + results = evaluator.compute_metrics( + gt_trajectories, + simulated_trajectories, + agent_state, + road_edge_polylines, + ) + return results + + +def pipeline(env_name="puffer_drive"): + """Obtain WOSAC scores for various baselines and policies.""" + + config = load_config(env_name) + + config["env"]["num_maps"] = 60 + config["env"]["map_dir"] = "pufferlib/resources/drive/binaries/validation_interactive_small" + config["wosac"]["enabled"] = True + config["vec"]["backend"] = "PufferEnv" + config["vec"]["num_envs"] = 1 + config["env"]["sequential_map_sampling"] = True + config["env"]["init_mode"] = "create_all_valid" + config["env"]["control_mode"] = "control_wosac" + config["env"]["init_steps"] = 10 + config["env"]["goal_behavior"] = 2 + config["env"]["goal_radius"] = 1.0 + config["env"]["save_data_to_disk"] = False + + # Make env + vecenv = 
load_env(env_name, config) + + # Make evaluator + evaluator = WOSACEvaluator(config) + + # Baseline: Ground truth + df_results_gt = evaluate_ground_truth(config, vecenv, evaluator) + df_results_gt["policy"] = "ground_truth" + + # Baseline: Agent with inferred human actions (using classic bicycle dynamics model) + df_results_inferred_human = evaluate_human_inferred_actions(config, vecenv, evaluator) + df_results_inferred_human["policy"] = "inferred_human_actions" + + # Baseline: Imitation learning policy + # df_results_bc = evaluate_bc_policy(config, vecenv, evaluator, POLICY_DIR + "/bc_policy.pt") + # df_results_bc["policy"] = "bc_policy" + + # Baseline: Self-play RL policy + # run: https://wandb.ai/emerge_/gsp/runs/qld2z6tn?nw=nwuserdaphnecor + # df_results_self_play = evaluate_rl_policy( + # config, vecenv, evaluator, "pufferlib/resources/drive/pufferdrive_weights.pt" + # ) # POLICY_DIR + "/puffer_drive_sp_qld2z6tn.pt") + # df_results_self_play["policy"] = "self_play_rl_base" + + # TODO: Guided self-play policy (guidance in rewards) + # ... + + # TODO: Guided self-play policy (regularization) + # ... + + # Baseline: Random policy + df_results_random = evaluate_random_policy(config, vecenv, evaluator) + df_results_random["policy"] = "random" + + # Combine + df = pd.concat( + [ + df_results_gt, + df_results_inferred_human, + df_results_random, + # df_results_bc, + # df_results_self_play, + ], + ignore_index=True, + ) + + df = df[COLUMN_ORDER] + + # Visualize + plot_wosac_results(df) + plot_realism_score_distributions(df) + + print(f"total agents: {df_results_gt['num_agents_per_scene'].sum().item()}") + + print(df.groupby("policy")["realism_meta_score"].mean()) + print("---") + print(df.groupby("policy")["kinematic_metrics"].mean()) + print("---") + print(df.groupby("policy")["interactive_metrics"].mean()) + print("---") + print(df.groupby("policy")["map_based_metrics"].mean()) + + breakpoint() + + +if __name__ == "__main__": + pipeline() diff --git a/examples/gym_env.py b/examples/gym_env.py deleted file mode 100644 index 94e525d93..000000000 --- a/examples/gym_env.py +++ /dev/null @@ -1,23 +0,0 @@ -import gym -import pufferlib.emulation - - -class SampleGymEnv(gym.Env): - def __init__(self): - self.observation_space = gym.spaces.Box(low=-1, high=1, shape=(1,)) - self.action_space = gym.spaces.Discrete(2) - - def reset(self): - return self.observation_space.sample() - - def step(self, action): - return self.observation_space.sample(), 0.0, False, {} - - -if __name__ == "__main__": - gym_env = SampleGymEnv() - gymnasium_env = pufferlib.GymToGymnasium(gym_env) - puffer_env = pufferlib.emulation.GymnasiumPufferEnv(gymnasium_env) - observations, info = puffer_env.reset() - action = puffer_env.action_space.sample() - observation, reward, terminal, truncation, info = puffer_env.step(action) diff --git a/examples/gymnasium_env.py b/examples/gymnasium_env.py deleted file mode 100644 index e33ae289e..000000000 --- a/examples/gymnasium_env.py +++ /dev/null @@ -1,22 +0,0 @@ -import gymnasium -import pufferlib.emulation - - -class SampleGymnasiumEnv(gymnasium.Env): - def __init__(self): - self.observation_space = gymnasium.spaces.Box(low=-1, high=1, shape=(1,)) - self.action_space = gymnasium.spaces.Discrete(2) - - def reset(self): - return self.observation_space.sample(), {} - - def step(self, action): - return self.observation_space.sample(), 0.0, False, False, {} - - -if __name__ == "__main__": - gymnasium_env = SampleGymnasiumEnv() - puffer_env = 
pufferlib.emulation.GymnasiumPufferEnv(gymnasium_env) - observation, info = puffer_env.reset() - action = puffer_env.action_space.sample() - observation, reward, terminal, truncation, info = puffer_env.step(action) diff --git a/examples/pettingzoo_env.py b/examples/pettingzoo_env.py deleted file mode 100644 index 333069229..000000000 --- a/examples/pettingzoo_env.py +++ /dev/null @@ -1,35 +0,0 @@ -import gymnasium -import pettingzoo -import pufferlib.emulation - - -class SamplePettingzooEnv(pettingzoo.ParallelEnv): - def __init__(self): - self.possible_agents = ["agent_0", "agent_1"] - self.agents = ["agent_0", "agent_1"] - - def observation_space(self, agent): - return gymnasium.spaces.Box(low=-1, high=1, shape=(1,)) - - def action_space(self, agent): - return gymnasium.spaces.Discrete(2) - - def reset(self, seed=None, options=None): - observations = {agent: self.observation_space(agent).sample() for agent in self.agents} - return observations, {} - - def step(self, action): - observations = {agent: self.observation_space(agent).sample() for agent in self.agents} - rewards = {agent: 0.0 for agent in self.agents} - terminals = {agent: False for agent in self.agents} - truncations = {agent: False for agent in self.agents} - infos = {agent: {} for agent in self.agents} - return observations, rewards, terminals, truncations, infos - - -if __name__ == "__main__": - env = SamplePettingzooEnv() - puffer_env = pufferlib.emulation.PettingZooPufferEnv(env) - observations, infos = puffer_env.reset() - actions = {agent: puffer_env.action_space(agent).sample() for agent in puffer_env.agents} - observations, rewards, terminals, truncations, infos = puffer_env.step(actions) diff --git a/examples/puffer_env.py b/examples/puffer_env.py deleted file mode 100644 index f18bad977..000000000 --- a/examples/puffer_env.py +++ /dev/null @@ -1,31 +0,0 @@ -import gymnasium -import pufferlib.emulation - - -class SamplePufferEnv(pufferlib.PufferEnv): - def __init__(self, buf=None, seed=0): - self.single_observation_space = gymnasium.spaces.Box(low=-1, high=1, shape=(1,)) - self.single_action_space = gymnasium.spaces.Discrete(2) - self.num_agents = 2 - super().__init__(buf) - - def reset(self, seed=0): - self.observations[:] = self.observation_space.sample() - return self.observations, [] - - def step(self, action): - self.observations[:] = self.observation_space.sample() - infos = [{"infos": "is a list of dictionaries"}] - return self.observations, self.rewards, self.terminals, self.truncations, infos - - -if __name__ == "__main__": - puffer_env = SamplePufferEnv() - observations, infos = puffer_env.reset() - actions = puffer_env.action_space.sample() - observations, rewards, terminals, truncations, infos = puffer_env.step(actions) - print("Puffer envs use a vector interface and in-place array updates") - print("Observation:", observations) - print("Reward:", rewards) - print("Terminal:", terminals) - print("Truncation:", truncations) diff --git a/examples/pufferl.py b/examples/pufferl.py deleted file mode 100644 index baa51baaa..000000000 --- a/examples/pufferl.py +++ /dev/null @@ -1,57 +0,0 @@ -import torch -import pufferlib.vector -import pufferlib.ocean -from pufferlib import pufferl - - -# Equivalent to running puffer train puffer_breakout -def cli(): - pufferl.train("puffer_breakout") - - -class Policy(torch.nn.Module): - def __init__(self, env): - super().__init__() - self.net = torch.nn.Sequential( - pufferlib.pytorch.layer_init(torch.nn.Linear(env.single_observation_space.shape[0], 128)), - torch.nn.ReLU(), 
- pufferlib.pytorch.layer_init(torch.nn.Linear(128, 128)), - ) - self.action_head = torch.nn.Linear(128, env.single_action_space.n) - self.value_head = torch.nn.Linear(128, 1) - - def forward_eval(self, observations, state=None): - hidden = self.net(observations) - logits = self.action_head(hidden) - values = self.value_head(hidden) - return logits, values - - # We use this to work around a major torch perf issue - def forward(self, observations, state=None): - return self.forward_eval(observations, state) - - -# Managing your own trainer -if __name__ == "__main__": - env_name = "puffer_breakout" - env_creator = pufferlib.ocean.env_creator(env_name) - vecenv = pufferlib.vector.make( - env_creator, - num_envs=2, - num_workers=2, - batch_size=1, - backend=pufferlib.vector.Multiprocessing, - env_kwargs={"num_envs": 4096}, - ) - policy = Policy(vecenv.driver_env).cuda() - args = pufferl.load_config("default") - args["train"]["env"] = env_name - - trainer = pufferl.PuffeRL(args["train"], vecenv, policy) - - for epoch in range(10): - trainer.evaluate() - logs = trainer.train() - - trainer.print_dashboard() - trainer.close() diff --git a/examples/render.py b/examples/render.py deleted file mode 100644 index b3965859f..000000000 --- a/examples/render.py +++ /dev/null @@ -1,7 +0,0 @@ -from pufferlib.ocean.breakout import breakout - -env = breakout.Breakout() -env.reset() -while True: - env.step(env.action_space.sample()) - frame = env.render() diff --git a/examples/structured_env.py b/examples/structured_env.py deleted file mode 100644 index f7af0bee3..000000000 --- a/examples/structured_env.py +++ /dev/null @@ -1,39 +0,0 @@ -import gymnasium -import pufferlib.emulation - - -class SampleGymnasiumEnv(gymnasium.Env): - def __init__(self): - self.observation_space = gymnasium.spaces.Dict( - { - "foo": gymnasium.spaces.Box(low=-1, high=1, shape=(2,)), - "bar": gymnasium.spaces.Box(low=2, high=3, shape=(3,)), - } - ) - self.action_space = gymnasium.spaces.MultiDiscrete([2, 5]) - - def reset(self): - return self.observation_space.sample(), {} - - def step(self, action): - return self.observation_space.sample(), 0.0, False, False, {} - - -if __name__ == "__main__": - gymnasium_env = SampleGymnasiumEnv() - puffer_env = pufferlib.emulation.GymnasiumPufferEnv(gymnasium_env) - flat_observation, info = puffer_env.reset() - flat_action = puffer_env.action_space.sample() - flat_observation, reward, terminal, truncation, info = puffer_env.step(flat_action) - print(f"PufferLib flattens observations and actions:\n{flat_observation}\n{flat_action}") - - observation = flat_observation.view(puffer_env.obs_dtype) - print(f"You can unflatten observations with numpy:\n{observation}") - - import torch - import pufferlib.pytorch - - flat_torch_observation = torch.from_numpy(flat_observation) - torch_dtype = pufferlib.pytorch.nativize_dtype(puffer_env.emulated) - torch_observation = pufferlib.pytorch.nativize_tensor(flat_torch_observation, torch_dtype) - print(f"But we suggest unflattening observations with torch in your model forward pass:\n{torch_observation}") diff --git a/examples/train_bc_policy.py b/examples/train_bc_policy.py new file mode 100644 index 000000000..15c4b5b69 --- /dev/null +++ b/examples/train_bc_policy.py @@ -0,0 +1,189 @@ +import wandb +import torch +import torch.nn as nn +from torch.optim import Adam +from torch.utils.data import DataLoader, TensorDataset +from torch.distributions.categorical import Categorical +import numpy as np +import matplotlib.pyplot as plt +from pufferlib.pufferl import 
load_config, load_env + +CHECKPOINT_PATH = "models" + + +class BCPolicy(nn.Module): + def __init__(self, input_size, hidden_size, output_size): + super().__init__() + self.nn = nn.Sequential( + nn.Linear(input_size, hidden_size), + nn.ReLU(), + nn.LayerNorm(hidden_size), + nn.Linear(hidden_size, hidden_size), + nn.ReLU(), + nn.LayerNorm(hidden_size), + nn.Linear(hidden_size, hidden_size), + nn.ReLU(), + ) + self.heads = nn.ModuleList([nn.Linear(hidden_size, output_size)]) + + def dist(self, obs): + """Generate action distribution.""" + x_out = self.nn(obs.float()) + return [Categorical(logits=head(x_out)) for head in self.heads] + + def forward(self, obs, deterministic=False): + """Generate an output from tensor input.""" + action_dist = self.dist(obs) + + if deterministic: + actions_idx = action_dist[0].logits.argmax(axis=-1) + else: + actions_idx = action_dist[0].sample() + return actions_idx + + def _log_prob(self, obs, expert_actions): + pred_action_dist = self.dist(obs) + log_prob = pred_action_dist[0].log_prob(expert_actions).mean() + return log_prob + + +def prepare_human_data(env, max_expert_sequences=512): + """Step 1: Extract and process human demonstration data""" + print("Preparing human data...") + + env._prep_human_data( + bptt_horizon=1, + max_expert_sequences=max_expert_sequences, + ) + + # Access the raw expert data collected by the environment + expert_actions_discrete = torch.Tensor(env.expert_actions_discrete) # Shape: (T, N, 1) + expert_observations = torch.Tensor(env.expert_observations_full) # Shape: (T, N, obs_dim) + + # Flatten to create a batch of samples + action_labels = torch.flatten(expert_actions_discrete, 0, 1).squeeze() # [B] + observations = torch.flatten(expert_observations, 0, 1) # [B, obs_dim] + + # Filter out invalid actions (-1) + valid_mask = action_labels != -1 + action_labels = action_labels[valid_mask] + observations = observations[valid_mask] + + return observations, action_labels + + +def train_bc_policy(obs, actions, config): + """Step 2: Train behavioral cloning policy""" + print("Training BC policy...") + + # Initialize wandb + wandb.init(project="gsp_bc_policy", config=config) + + wandb.log({"dataset_size": obs.shape[0]}) + + # Setup + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + + obs_tensor = obs.float() + actions_tensor = actions.long() + + dataset = TensorDataset(obs_tensor, actions_tensor) + dataloader = DataLoader(dataset, batch_size=config["batch_size"], shuffle=True) + data_iter = iter(dataloader) + + # Create model + policy = BCPolicy( + input_size=obs.shape[-1], + hidden_size=config["hidden_size"], + output_size=config["num_actions"], + ).to(device) + + optimizer = Adam(policy.parameters(), lr=config["learning_rate"]) + + # Training loop + losses = [] + global_step = 0 + + for epoch in range(config["epochs"]): + epoch_losses = [] + + for i in range(config["minibatches"]): + try: + batch_obs, batch_actions = next(data_iter) + except StopIteration: + data_iter = iter(dataloader) + batch_obs, batch_actions = next(data_iter) + + batch_obs = batch_obs.to(device) + batch_actions = batch_actions.to(device) + + # Forward pass + log_prob = policy._log_prob(batch_obs, batch_actions.float()) + loss = -log_prob + + # Backward pass + optimizer.zero_grad() + loss.backward() + optimizer.step() + + # Compute accuracy + with torch.no_grad(): + pred_action = policy(batch_obs, deterministic=True) + accuracy = (batch_actions == pred_action).sum() / batch_actions.shape[0] + + # Log + loss_val = loss.item() + 
epoch_losses.append(loss_val) + losses.append(loss_val) + + wandb.log({"global_step": global_step, "loss": loss_val, "accuracy": accuracy.item(), "epoch": epoch}) + + global_step += 1 + + avg_epoch_loss = np.mean(epoch_losses) + + if avg_epoch_loss < 0.001: + print(f"Early stopping at epoch {epoch + 1} with loss {avg_epoch_loss:.6f}") + break + else: + print(f"Epoch {epoch + 1}/{config['epochs']}: Loss = {avg_epoch_loss:.4f}") + + return losses, policy + + +if __name__ == "__main__": + # Load configuration + args = load_config("puffer_drive") + args["vec"]["backend"] = "Serial" + args["env"]["num_maps"] = 229 + args["env"]["map_dir"] = "pufferlib/resources/drive/binaries/validation_interactive_small" + + for max_expert_sequences in [4096]: + args["env"]["num_agents"] = max_expert_sequences + + config = { + "batch_size": 512, + "hidden_size": 1024, + "num_actions": 21 * 31, # TODO: Do not hardcode + "learning_rate": 1e-4, + "epochs": 100_000, + "minibatches": 4, + "max_expert_sequences": max_expert_sequences, + } + + env = load_env("puffer_drive", args) + + # Step 1: Prepare human data (o_t, a_t) tuples + human_obs, human_actions = prepare_human_data(env.driver_env, max_expert_sequences=max_expert_sequences) + + print(f"Data shapes - Obs: {human_obs.shape}, Actions: {human_actions.shape}") + + # Step 2: Train BC policy + losses, policy = train_bc_policy(human_obs, human_actions, config) + + # Store the trained model + torch.save(policy.state_dict(), f"{CHECKPOINT_PATH}/bc_policy.pt") + + env.close() + + wandb.finish() diff --git a/examples/vectorization.py b/examples/vectorization.py deleted file mode 100644 index 62c999789..000000000 --- a/examples/vectorization.py +++ /dev/null @@ -1,82 +0,0 @@ -import gymnasium -import pufferlib.emulation -import pufferlib.vector - - -class SamplePufferEnv(pufferlib.PufferEnv): - def __init__(self, foo=0, bar=1, buf=None, seed=0): - self.single_observation_space = gymnasium.spaces.Box(low=-1, high=1, shape=(1,)) - self.single_action_space = gymnasium.spaces.Discrete(2) - self.num_agents = 2 - super().__init__(buf) - - # Sample args and kwargs - self.foo = foo - self.bar = bar - - def reset(self, seed=0): - self.observations[:] = self.observation_space.sample() - return self.observations, [] - - def step(self, action): - self.observations[:] = self.observation_space.sample() - infos = [{"infos": "is a list of dictionaries"}] - return self.observations, self.rewards, self.terminals, self.truncations, infos - - def close(self): - pass - - -if __name__ == "__main__": - serial_vecenv = pufferlib.vector.make(SamplePufferEnv, num_envs=2, backend=pufferlib.vector.Serial) - observations, infos = serial_vecenv.reset() - actions = serial_vecenv.action_space.sample() - o, r, d, t, i = serial_vecenv.step(actions) - print("Serial VecEnv:") - print("Observations:", o) - print("Rewards:", r) - print("Terminals:", t) - print("Truncations:", d) - - # Pass arguments to all environments like this - serial_vecenv = pufferlib.vector.make( - SamplePufferEnv, num_envs=2, backend=pufferlib.vector.Serial, env_args=[3], env_kwargs={"bar": 4} - ) - print("Foo: ", [env.foo for env in serial_vecenv.envs]) - print("Bar: ", [env.bar for env in serial_vecenv.envs]) - - # Or to each environment like this - serial_vecenv = pufferlib.vector.make( - [SamplePufferEnv, SamplePufferEnv], - num_envs=2, - backend=pufferlib.vector.Serial, - env_args=[[3], [4]], - env_kwargs=[{"bar": 4}, {"bar": 5}], - ) - print("Foo: ", [env.foo for env in serial_vecenv.envs]) - print("Bar: ", [env.bar for env in 
serial_vecenv.envs]) - - vecenv = pufferlib.vector.make( - SamplePufferEnv, num_envs=2, num_workers=2, batch_size=1, backend=pufferlib.vector.Multiprocessing - ) - vecenv.async_reset() # You can also use the synchronous API with Multiprocessing - o, r, d, t, i, env_ids, masks = vecenv.recv() - actions = vecenv.action_space.sample() - print("Policy computes actions for all agents in batch_size=1 of the total num_envs=2 environments") - print("Actions:", actions) - vecenv.send(actions) - - # New observations are ready while the other envs are running in the background - o, r, d, t, i, env_ids, masks = vecenv.recv() - print("Observations:", o) - - # Make sure to close the vecenv when you're done - vecenv.close() - - try: - vecenv = pufferlib.vector.make( - SamplePufferEnv, num_envs=1, num_workers=2, batch_size=3, backend=pufferlib.vector.Multiprocessing - ) - except pufferlib.APIUsageError: - # Make sure num_envs divides num_workers, and both num_envs and num_workers should divide batch_size - pass diff --git a/greene/a100_baseline_sweep.sh b/greene/a100_baseline_sweep.sh new file mode 100644 index 000000000..ca34d8205 --- /dev/null +++ b/greene/a100_baseline_sweep.sh @@ -0,0 +1,81 @@ +#!/bin/bash +#SBATCH --array=0-10 +#SBATCH --job-name=pd_sweep_a100 +#SBATCH --output=/scratch/%u/PufferDrive/logs/output_%A_%a.txt +#SBATCH --error=/scratch/%u/PufferDrive/logs/error_%A_%a.txt +#SBATCH --mem=64G +#SBATCH --time=24:00:00 +#SBATCH --nodes=1 +#SBATCH --cpus-per-task=16 +#SBATCH --gres=gpu:a100:2 +#SBATCH --account=pr_100_tandon_priority + +# Print job info +echo "Job ID: $SLURM_JOB_ID" +echo "Array Task ID: $SLURM_ARRAY_TASK_ID" +echo "User: $USER" +echo "Running on node: $(hostname)" +echo "GPU: $CUDA_VISIBLE_DEVICES" +date + +# Set project directory +PROJECT_DIR="/scratch/${USER}/PufferDrive" +echo "Project directory: $PROJECT_DIR" + +# Create logs directory if it doesn't exist +mkdir -p ${PROJECT_DIR}/logs + +# Navigate to project directory +cd ${PROJECT_DIR} + +# Activate virtual environment +source .venv/bin/activate + +# A100 dual-GPU optimized settings (scaled up for 2 GPUs and 64 CPUs) +NUM_WORKERS=32 +NUM_ENVS=32 +VEC_BATCH_SIZE=2 +NUM_AGENTS=1024 +BPTT_HORIZON=32 +HUMAN_REG_COEF=0.0 + +# Calculate the segment size (minimum train batch size) +SEGMENT_SIZE=$((NUM_AGENTS * NUM_WORKERS * BPTT_HORIZON)) + +# Target batch size (aim for 2M with 2 GPUs) +TARGET_BATCH_SIZE=2097152 # 2M = 2^21 + +# Calculate train batch size: largest multiple of SEGMENT_SIZE that doesn't exceed target +TRAIN_BATCH_SIZE=$(( (TARGET_BATCH_SIZE / SEGMENT_SIZE) * SEGMENT_SIZE )) + +# If calculated batch size is 0 or too small, use the segment size +if [ $TRAIN_BATCH_SIZE -lt $SEGMENT_SIZE ]; then + TRAIN_BATCH_SIZE=$SEGMENT_SIZE +fi + +echo "Starting sweep with settings:" +echo " NUM_WORKERS: $NUM_WORKERS" +echo " NUM_ENVS: $NUM_ENVS" +echo " VEC_BATCH_SIZE: $VEC_BATCH_SIZE" +echo " NUM_AGENTS: $NUM_AGENTS" +echo " BPTT_HORIZON: $BPTT_HORIZON" +echo " SEGMENT_SIZE (min required): $SEGMENT_SIZE" +echo " TRAIN_BATCH_SIZE (calculated): $TRAIN_BATCH_SIZE" +echo " Divisibility check: $TRAIN_BATCH_SIZE / $SEGMENT_SIZE = $(($TRAIN_BATCH_SIZE / $SEGMENT_SIZE))" + +# Launch sweep with optimized parameters +puffer sweep puffer_drive \ + --wandb \ + --wandb-project "gsp" \ + --tag "guidance_regularize" \ + --vec.num-workers $NUM_WORKERS \ + --vec.num-envs $NUM_ENVS \ + --vec.batch-size $VEC_BATCH_SIZE \ + --env.num-agents $NUM_AGENTS \ + --train.batch-size $TRAIN_BATCH_SIZE \ + --train.bptt-horizon $BPTT_HORIZON \ + 
--train.human-ll-coef $HUMAN_REG_COEF + +# Print completion info +echo "Sweep completed" +date \ No newline at end of file diff --git a/greene/h100_gsp_rew_sweep.sh b/greene/h100_gsp_rew_sweep.sh new file mode 100644 index 000000000..dde3f2d04 --- /dev/null +++ b/greene/h100_gsp_rew_sweep.sh @@ -0,0 +1,82 @@ +#!/bin/bash +#SBATCH --array=0-5 +#SBATCH --job-name=pd_sweep_h100 +#SBATCH --output=/scratch/%u/PufferDrive/logs/output_%A_%a.txt +#SBATCH --error=/scratch/%u/PufferDrive/logs/error_%A_%a.txt +#SBATCH --mem=64G +#SBATCH --time=0-24:00:0 +#SBATCH --nodes=1 +#SBATCH --cpus-per-task=16 +#SBATCH --gres=gpu:h100:1 +#SBATCH --account=pr_100_tandon_advanced + +# Print job info +echo "Job ID: $SLURM_JOB_ID" +echo "Array Task ID: $SLURM_ARRAY_TASK_ID" +echo "User: $USER" +echo "Running on node: $(hostname)" +echo "GPU: $CUDA_VISIBLE_DEVICES" +date + +# Set project directory +PROJECT_DIR="/scratch/${USER}/PufferDrive" +echo "Project directory: $PROJECT_DIR" + +# Create logs directory if it doesn't exist +mkdir -p ${PROJECT_DIR}/logs + +# Navigate to project directory +cd ${PROJECT_DIR} + +# Activate virtual environment +source .venv/bin/activate + +# H100 optimized settings +NUM_WORKERS=48 +NUM_ENVS=48 +VEC_BATCH_SIZE=6 +NUM_AGENTS=1024 +BPTT_HORIZON=32 +HUMAN_REG_COEF=0.0 +USE_GUIDANCE_REWARDS=0 + +# Calculate the segment size (minimum train batch size) +SEGMENT_SIZE=$((NUM_AGENTS * NUM_WORKERS * BPTT_HORIZON)) + +# Target batch size (aim for 1-2M on H100) +TARGET_BATCH_SIZE=2097152 # 2M = 2^21 + +# Calculate train batch size: largest multiple of SEGMENT_SIZE that doesn't exceed target +TRAIN_BATCH_SIZE=$(( (TARGET_BATCH_SIZE / SEGMENT_SIZE) * SEGMENT_SIZE )) + +# If calculated batch size is 0 or too small, use the segment size +if [ $TRAIN_BATCH_SIZE -lt $SEGMENT_SIZE ]; then + TRAIN_BATCH_SIZE=$SEGMENT_SIZE +fi + +echo "Starting sweep with settings:" +echo " NUM_WORKERS: $NUM_WORKERS" +echo " NUM_ENVS: $NUM_ENVS" +echo " VEC_BATCH_SIZE: $VEC_BATCH_SIZE" +echo " NUM_AGENTS: $NUM_AGENTS" +echo " BPTT_HORIZON: $BPTT_HORIZON" +echo " SEGMENT_SIZE (min required): $SEGMENT_SIZE" +echo " TRAIN_BATCH_SIZE (calculated): $TRAIN_BATCH_SIZE" +echo " Divisibility check: $TRAIN_BATCH_SIZE / $SEGMENT_SIZE = $(($TRAIN_BATCH_SIZE / $SEGMENT_SIZE))" + +# Launch sweep with optimized parameters +puffer sweep puffer_drive \ + --wandb \ + --wandb-project "gsp" \ + --tag "guidance_regularize" \ + --vec.num-workers $NUM_WORKERS \ + --vec.num-envs $NUM_ENVS \ + --vec.batch-size $VEC_BATCH_SIZE \ + --env.num-agents $NUM_AGENTS \ + --env.use-guided-autonomy $USE_GUIDANCE_REWARDS \ + --train.batch-size $TRAIN_BATCH_SIZE \ + --train.bptt-horizon $BPTT_HORIZON \ + +# Print completion info +echo "Sweep completed" +date \ No newline at end of file diff --git a/greene/rtx8000_sweep.sh b/greene/rtx8000_sweep.sh new file mode 100644 index 000000000..6b8ba2d42 --- /dev/null +++ b/greene/rtx8000_sweep.sh @@ -0,0 +1,94 @@ +#!/bin/bash +#SBATCH --array=0-6 +#SBATCH --job-name=pd_sweep_rtx8000 +#SBATCH --output=/scratch/%u/PufferDrive/logs/output_%A_%a.txt +#SBATCH --error=/scratch/%u/PufferDrive/logs/error_%A_%a.txt +#SBATCH --mem=48G +#SBATCH --time=0-24:0:0 +#SBATCH --nodes=1 +#SBATCH --cpus-per-task=20 +#SBATCH --gres=gpu:rtx8000:1 +#SBATCH --account=pr_100_tandon_priority + +# Print job info +echo "Job ID: $SLURM_JOB_ID" +echo "Array Task ID: $SLURM_ARRAY_TASK_ID" +echo "User: $USER" +echo "Running on node: $(hostname)" +echo "GPU: $CUDA_VISIBLE_DEVICES" +date + +# Set project directory 
+PROJECT_DIR="/scratch/${USER}/PufferDrive" +echo "Project directory: $PROJECT_DIR" + +# Create logs directory if it doesn't exist +mkdir -p ${PROJECT_DIR}/logs + +# Navigate to project directory +cd ${PROJECT_DIR} + +# Check if directory exists +if [ ! -d "$PROJECT_DIR" ]; then + echo "ERROR: Project directory does not exist: $PROJECT_DIR" + exit 1 +fi + +# Activate virtual environment +if [ -f ".venv/bin/activate" ]; then + source .venv/bin/activate + echo "Virtual environment activated" +else + echo "ERROR: Virtual environment not found at ${PROJECT_DIR}/.venv" + exit 1 +fi + +# RTX8000 optimized settings +NUM_WORKERS=32 +NUM_ENVS=32 +VEC_BATCH_SIZE=4 +NUM_AGENTS=1024 +BPTT_HORIZON=32 +HUMAN_REG_COEF=0.0 +USE_GUIDANCE_REWARDS=1 + +# Calculate the segment size (minimum train batch size) +SEGMENT_SIZE=$((NUM_AGENTS * NUM_WORKERS * BPTT_HORIZON)) + +# Target batch size (aim for 1-2M on RTX8000) +TARGET_BATCH_SIZE=2097152 # 2M = 2^21 + +# Calculate train batch size: largest multiple of SEGMENT_SIZE that doesn't exceed target +TRAIN_BATCH_SIZE=$(( (TARGET_BATCH_SIZE / SEGMENT_SIZE) * SEGMENT_SIZE )) + +# If calculated batch size is 0 or too small, use the segment size +if [ $TRAIN_BATCH_SIZE -lt $SEGMENT_SIZE ]; then + TRAIN_BATCH_SIZE=$SEGMENT_SIZE +fi + +echo "Starting sweep with settings:" +echo " NUM_WORKERS: $NUM_WORKERS" +echo " NUM_ENVS: $NUM_ENVS" +echo " VEC_BATCH_SIZE: $VEC_BATCH_SIZE" +echo " NUM_AGENTS: $NUM_AGENTS" +echo " BPTT_HORIZON: $BPTT_HORIZON" +echo " SEGMENT_SIZE (min required): $SEGMENT_SIZE" +echo " TRAIN_BATCH_SIZE (calculated): $TRAIN_BATCH_SIZE" +echo " Divisibility check: $TRAIN_BATCH_SIZE / $SEGMENT_SIZE = $(($TRAIN_BATCH_SIZE / $SEGMENT_SIZE))" + +# Launch sweep with optimized parameters +puffer sweep puffer_drive \ + --wandb \ + --wandb-project "gsp" \ + --tag "guidance_rewards" \ + --vec.num-workers $NUM_WORKERS \ + --vec.num-envs $NUM_ENVS \ + --vec.batch-size $VEC_BATCH_SIZE \ + --env.num-agents $NUM_AGENTS \ + --env.use-guided-autonomy $USE_GUIDANCE_REWARDS \ + --train.batch-size $TRAIN_BATCH_SIZE \ + --train.bptt-horizon $BPTT_HORIZON \ + +# Print completion info +echo "Sweep completed" +date \ No newline at end of file diff --git a/models/.gitkeep b/models/.gitkeep new file mode 100644 index 000000000..e69de29bb diff --git a/models/bc_policy.pt b/models/bc_policy.pt new file mode 100644 index 000000000..a4db27932 Binary files /dev/null and b/models/bc_policy.pt differ diff --git a/models/model_puffer_drive_001908.pt b/models/model_puffer_drive_001908.pt new file mode 100644 index 000000000..fcd89b9a2 Binary files /dev/null and b/models/model_puffer_drive_001908.pt differ diff --git a/models/puffer_drive_sp_qld2z6tn.pt b/models/puffer_drive_sp_qld2z6tn.pt new file mode 100644 index 000000000..ce2c79999 Binary files /dev/null and b/models/puffer_drive_sp_qld2z6tn.pt differ diff --git a/profile_net.py b/profile_net.py new file mode 100644 index 000000000..630e38849 --- /dev/null +++ b/profile_net.py @@ -0,0 +1,159 @@ +# Profiling for various implementations of linear + max encoding for road points +# This was a 4x perf bottleneck. Now faster than the expected baseline with custom kernels.
+ +import time +import torch +from torch import nn +import torch.nn.functional as F + +import torch +from torch.backends import cudnn +import torch.utils.cpp_extension + +from pufferlib import _C + +cudnn.benchmark = True +cudnn.deterministic = False +cudnn.benchmark_limit = 32 + + +class PointwiseNet(nn.Module): + def __init__(self, input_size, hidden_size): + super().__init__() + self.linear = nn.Linear(input_size, hidden_size) + + def forward(self, x): + return self.linear(x).max(dim=1)[0] + + +class OriginalLinearMax(torch.nn.Module): + """Original implementation of Linear + Max without custom CUDA kernel.""" + + def __init__(self, input_size, hidden_size): + super().__init__() + self.linear1 = nn.Linear(input_size, hidden_size) + self.ln = nn.LayerNorm(hidden_size) + self.linear2 = nn.Linear(hidden_size, hidden_size) + + def forward(self, x): + x = self.linear1(x) + x = self.ln(x) + x = self.linear2(x) + x, _ = x.max(dim=1) + return x + + +class LinearReluMaxKernel(torch.autograd.Function): + """Custom kernel: Linear -> ReLU -> Max""" + + @staticmethod + def forward(ctx, x, weight, bias): + out = torch.ops.pufferlib.linear_relu_max(x, weight, bias) + ctx.save_for_backward(x, weight, bias) + return out + + @staticmethod + def backward(ctx, grad_out): + x, weight, bias = ctx.saved_tensors + grad_x, grad_weight, grad_bias = torch.ops.pufferlib.linear_relu_max_backward( + grad_out.contiguous(), x, weight, bias + ) + return grad_x, grad_weight, grad_bias + + +class FusedLinearReluMax(nn.Module): + """Fused CUDA kernel: Linear -> ReLU -> Max""" + + def __init__(self, input_size, hidden_size): + super().__init__() + self.weight = nn.Parameter(torch.empty(hidden_size, input_size).normal_(mean=0.0, std=input_size**-0.5)) + self.bias = nn.Parameter(torch.zeros(hidden_size)) + + def forward(self, x): + return LinearReluMaxKernel.apply(x, self.weight, self.bias) + + +def check_cuda_correct(model, cuda_model, data): + cuda_model.weight.data = model.linear.weight.data.clone() + cuda_model.bias.data = model.linear.bias.data.clone() + for batch in data: + model.zero_grad() + cuda_model.zero_grad() + model_out = model(batch) + cuda_out = cuda_model(batch) + assert torch.allclose(model_out, cuda_out, atol=1e-4, rtol=1e-4) + model_out.mean().backward() + cuda_out.mean().backward() + assert torch.allclose(model.linear.weight.grad, cuda_model.weight.grad, rtol=1e-4, atol=1e-4) + assert torch.allclose(model.linear.bias.grad, cuda_model.bias.grad, rtol=1e-4, atol=1e-4) + + +def profile_forward(model, data, num_iters=100, warmup=10): + B = data.shape[0] + with torch.no_grad(): + # Warmup + for i in range(warmup): + model(data[i]) + + torch.cuda.synchronize() + start = time.time() + for i in range(num_iters): + model(data[i] + 1) # +1 to avoid in-place reuse issues + torch.cuda.synchronize() + end = time.time() + + sps = B * num_iters / (end - start) + return sps + + +def profile_backward(model, data, num_iters=100, warmup=10): + B = data.shape[0] + # Warmup + for i in range(warmup): + model.zero_grad() + out = model(data[i]) + out.sum().backward() + + torch.cuda.synchronize() + start = time.time() + for i in range(num_iters): + model.zero_grad() + out = model(data[i] + 1) + out.sum().backward() + torch.cuda.synchronize() + end = time.time() + + sps = B * num_iters / (end - start) + return sps + + +if __name__ == "__main__": + num_iters = 100 + B = 4096 # Batch size + H = 64 # Hidden size + N = 128 # Number of road points + D = 13 # Feature dimension + + device = torch.device("cuda") + data = 
torch.randn(num_iters, B, N, D).to(device) + + pointwise = PointwiseNet(D, H).to(device) + linear_max = LinearMax(D, H).to(device) + original_linear_max = OriginalLinearMax(D, H).to(device) + + linear_relu_max = FusedLinearReluMax(D, H).to(device) + + print( + f"Pointwise: Forward ({profile_forward(pointwise, data):,.2f}), backward ({profile_backward(pointwise, data):,.2f})" + ) + print( + f"Original Linear Max: Forward ({profile_forward(original_linear_max, data):,.2f}), backward ({profile_backward(original_linear_max, data):,.2f})" + ) + print( + f"Cuda Linear Max: Forward ({profile_forward(linear_max, data):,.2f}), backward ({profile_backward(linear_max, data):,.2f})" + ) + print() + + print( + f"Cuda Linear ReLU Max: Forward ({profile_forward(linear_relu_max, data):,.2f}), backward ({profile_backward(linear_relu_max, data):,.2f})" + ) diff --git a/pufferlib/config/default.ini b/pufferlib/config/default.ini index 810de828c..ba64b030b 100644 --- a/pufferlib/config/default.ini +++ b/pufferlib/config/default.ini @@ -66,13 +66,20 @@ metric = score goal = maximize downsample = 10 -#[sweep.vec.num_envs] -#distribution = uniform_pow2 -#min = 1 -#max = 16 -#mean = 8 -#scale = auto +; #[sweep.vec.num_envs] +; #distribution = uniform_pow2 +; #min = 1 +; #max = 16 +; #mean = 8 +; #scale = auto +; # TODO: Elim from base +; [sweep.train.total_timesteps] +; distribution = log_normal +; min = 5e7 +; max = 1e10 +; mean = 1e8 +; scale = time ; # TODO: Elim from base ; [sweep.train.total_timesteps] ; distribution = log_normal @@ -81,6 +88,12 @@ downsample = 10 ; mean = 1e8 ; scale = time +; [sweep.train.bptt_horizon] +; distribution = uniform_pow2 +; min = 16 +; max = 64 +; mean = 64 +; scale = auto ; [sweep.train.bptt_horizon] ; distribution = uniform_pow2 ; min = 16 @@ -95,34 +108,6 @@ downsample = 10 ; mean = 32768 ; scale = auto -; [sweep.train.learning_rate] -; distribution = log_normal -; min = 0.00001 -; mean = 0.01 -; max = 0.1 -; scale = 0.5 - -; [sweep.train.ent_coef] -; distribution = log_normal -; min = 0.00001 -; mean = 0.01 -; max = 0.2 -; scale = auto - -; [sweep.train.gamma] -; distribution = logit_normal -; min = 0.8 -; mean = 0.98 -; max = 0.9999 -; scale = auto - -; [sweep.train.gae_lambda] -; distribution = logit_normal -; min = 0.6 -; mean = 0.95 -; max = 0.995 -; scale = auto - ; [sweep.train.vtrace_rho_clip] ; distribution = uniform ; min = 0.0 diff --git a/pufferlib/config/ocean/drive.ini b/pufferlib/config/ocean/drive.ini index 7245e5f2b..146743da4 100644 --- a/pufferlib/config/ocean/drive.ini +++ b/pufferlib/config/ocean/drive.ini @@ -8,7 +8,7 @@ rnn_name = Recurrent num_workers = 16 num_envs = 16 batch_size = 4 -; backend = Serial +#backend = Serial [policy] input_size = 64 @@ -26,11 +26,16 @@ action_type = discrete dynamics_model = classic reward_vehicle_collision = -0.5 reward_offroad_collision = -0.5 +; Guided autonomy reward parameters +use_guided_autonomy = 0 +guidance_speed_weight = 0.0 +guidance_heading_weight = 0.0 +waypoint_reach_threshold = 2.0 dt = 0.1 reward_goal = 1.0 reward_goal_post_respawn = 0.25 ; Meters around goal to be considered "reached" -goal_radius = 2.0 +goal_radius = 1.0 ; Max target speed in m/s for the agent to maintain towards the goal goal_speed = 100.0 ; What to do when the goal is reached. 
Options: 0:"respawn", 1:"generate_new_goals", 2:"stop" @@ -46,18 +51,27 @@ offroad_behavior = 0 episode_length = 91 resample_frequency = 910 termination_mode = 1 # 0 - terminate at episode_length, 1 - terminate after all agents have been reset -map_dir = "resources/drive/binaries/training" -num_maps = 10000 +map_dir = "resources/drive/binaries/validation_interactive_small" #"resources/drive/binaries/training" +num_maps = 229 #10_000 ; Determines which step of the trajectory to initialize the agents at upon reset init_steps = 0 ; Options: "control_vehicles", "control_agents", "control_wosac", "control_sdc_only" -control_mode = "control_vehicles" +control_mode = "control_agents" ; Options: "created_all_valid", "create_only_controlled" init_mode = "create_all_valid" +; Note: this argument is currently duplicated in [train] section and should match +bptt_horizon = 32 +; Directory to save/load human demonstration data +human_data_dir = "resources/drive/human_demonstrations" +prep_human_data = True +; Number of human samples to create per env. Each sequence has length equal to bptt_horizon. +; So with bptt_horizon=32, each sequence has 3 seconds of human data. +max_expert_sequences = 3600 + [train] seed=42 -total_timesteps = 2_000_000_000 +total_timesteps = 3_000_000_000 ; learning_rate = 0.02 ; gamma = 0.985 anneal_lr = True @@ -70,10 +84,10 @@ adam_beta1 = 0.9 adam_beta2 = 0.999 adam_eps = 1e-8 clip_coef = 0.2 -ent_coef = 0.005 +ent_coef = 0.004670370966041502 gae_lambda = 0.95 -gamma = 0.98 -learning_rate = 0.003 +gamma = 0.973457552852506 +learning_rate = 0.0030854349125978733 max_grad_norm = 1 prio_alpha = 0.8499999999999999 prio_beta0 = 0.8499999999999999 @@ -82,10 +96,16 @@ vf_clip_coef = 0.1999999999999999 vf_coef = 2 vtrace_c_clip = 1 vtrace_rho_clip = 1 -checkpoint_interval = 1000 +; How many randomly sampled human demonstration sequences to use per update +human_sequences = 64 +save_data_to_disk = True +human_clip_coef = 10.0 +human_ll_coef = 0.0 + +checkpoint_interval = 200 ; Rendering options -render = True -render_interval = 1000 +render = False +render_interval = 200 ; If True, show exactly what the agent sees in agent observation obs_only = True ; Show grid lines @@ -93,22 +113,22 @@ show_grid = True ; Draws lines from ego agent observed ORUs and road elements to show detection range show_lasers = False ; Display human xy logs in the background -show_human_logs = False +show_human_logs = True ; If True, zoom in on a part of the map. 
Otherwise, show full map zoom_in = True ; Options: List[str to path], str to path (e.g., "resources/drive/training/binaries/map_001.bin"), None render_map = none [eval] -eval_interval = 1000 +eval_interval = 300 ; Path to dataset used for evaluation -map_dir = "resources/drive/binaries/training" +map_dir = "resources/drive/binaries/validation_interactive_small" ; Evaluation will run on the first num_maps maps in the map_dir directory -wosac_num_maps = 20 +wosac_num_maps = 60 backend = PufferEnv ; WOSAC (Waymo Open Sim Agents Challenge) evaluation settings ; If True, enables evaluation on realism metrics each time we save a checkpoint -wosac_realism_eval = False +wosac_realism_eval = True ; Number of policy rollouts per scene wosac_num_rollouts = 32 ; When to start the simulation @@ -120,15 +140,18 @@ wosac_init_mode = "create_all_valid" ; Stop when reaching the goal wosac_goal_behavior = 2 ; Can shrink goal radius for WOSAC evaluation -wosac_goal_radius = 2.0 +wosac_goal_radius = 1.0 wosac_sanity_check = False ; Only return aggregate results across all scenes wosac_aggregate_results = True ; If True, enable human replay evaluation (pair policy-controlled agent with human replays) -human_replay_eval = False +human_replay_eval = True +; Determine number of agents == number of human replays to use during evaluation +human_replay_num_agents = 32 ; Control only the self-driving car human_replay_control_mode = "control_sdc_only" +; General sweep settings: [sweep.train.learning_rate] distribution = log_normal min = 0.001 @@ -150,18 +173,23 @@ mean = 0.98 max = 0.999 scale = auto -[sweep.train.gae_lambda] -distribution = log_normal -min = 0.95 -mean = 0.98 -max = 0.999 -scale = auto +; [sweep.env.guidance_heading_weight] +; distribution = uniform +; min = 0.1 +; max = 0.5 +; mean = 0.3 +; scale = auto -[controlled_exp.train.goal_speed] -values = [10, 20, 30, 3] +; [sweep.env.guidance_speed_weight] +; distribution = uniform +; min = 0.1 +; max = 0.5 +; mean = 0.3 +; scale = auto -[controlled_exp.train.ent_coef] -values = [0.001, 0.005, 0.01] +; Settings for controlled experiments +[controlled_exp.train.human_ll_coef] +values = [0.1, 0.5, 0.9] -[controlled_exp.train.seed] -values = [42, 55, 1] +; [controlled_exp.env.max_expert_sequences] +; values = [512, 1024, 2048] diff --git a/pufferlib/extensions/cuda/pufferlib.cu b/pufferlib/extensions/cuda/pufferlib.cu index 9426a7e5e..7acfa8790 100644 --- a/pufferlib/extensions/cuda/pufferlib.cu +++ b/pufferlib/extensions/cuda/pufferlib.cu @@ -1,9 +1,328 @@ #include #include #include +#include +#include +#include namespace pufferlib { +// Linear -> ReLU -> Max kernel +template +__global__ void linear_relu_max_kernel( + const float* __restrict__ x, + const float* __restrict__ weight, + const float* __restrict__ bias, + float* __restrict__ out, + int B, int N, int H, int D +) { + int b = blockIdx.x; + int h = threadIdx.x; + + if (b >= B || h >= H) return; + + extern __shared__ float sh_x[]; + + constexpr int D_compile = D_VAL; + const int D_runtime = (D_compile > 0) ? 
D_compile : D; + const int x_size = N * D_runtime; + const float* x_batch = x + b * x_size; + + // Strided cooperative loading for better memory coalescing + for (int i = h; i < x_size; i += H) { + sh_x[i] = x_batch[i]; + } + __syncthreads(); + + const float* weight_h = weight + h * D_runtime; + float bias_val = bias[h]; + float max_val = -FLT_MAX; + + #pragma unroll 8 + for (int n = 0; n < N; n++) { + float dot = 0.0f; + const float* x_n = sh_x + n * D_runtime; + + if constexpr (D_compile > 0) { + // Compile-time known D: fully unroll + #pragma unroll + for (int d = 0; d < D_compile; d++) { + dot += weight_h[d] * x_n[d]; + } + } else { + // Runtime D: manual unroll by 4 + int d = 0; + #pragma unroll 4 + for (; d + 3 < D; d += 4) { + dot += weight_h[d] * x_n[d] + + weight_h[d+1] * x_n[d+1] + + weight_h[d+2] * x_n[d+2] + + weight_h[d+3] * x_n[d+3]; + } + for (; d < D; d++) { + dot += weight_h[d] * x_n[d]; + } + } + + // Apply ReLU before max + float activated = fmaxf(0.0f, dot + bias_val); + max_val = fmaxf(max_val, activated); + } + + out[b * H + h] = max_val; +} + +template +__global__ void linear_relu_max_backward_kernel( + const float* __restrict__ x, + const float* __restrict__ weight, + const float* __restrict__ bias, + const float* __restrict__ grad_out, + float* __restrict__ grad_x, + float* __restrict__ grad_weight_temp, + float* __restrict__ grad_bias_temp, + int B, int N, int H, int D +) { + int b = blockIdx.x; + int h = threadIdx.x; + + if (b >= B || h >= H) return; + + extern __shared__ float sh_x[]; + + constexpr int D_compile = D_VAL; + const int D_runtime = (D_compile > 0) ? D_compile : D; + const int x_size = N * D_runtime; + const float* x_batch = x + b * x_size; + + // Strided cooperative loading + for (int i = h; i < x_size; i += H) { + sh_x[i] = x_batch[i]; + } + __syncthreads(); + + const float* weight_h = weight + h * D_runtime; + const float* bias_ptr = bias + h; + float bias_val = bias_ptr[0]; + float go = grad_out[b * H + h]; + + int argmax_n = -1; + float max_activated = -FLT_MAX; + + // Forward pass to find argmax + #pragma unroll 8 + for (int n = 0; n < N; n++) { + float dot = 0.0f; + const float* x_n = sh_x + n * D_runtime; + + if constexpr (D_compile > 0) { + #pragma unroll + for (int d = 0; d < D_compile; d++) { + dot += weight_h[d] * x_n[d]; + } + } else { + int d = 0; + #pragma unroll 4 + for (; d + 3 < D; d += 4) { + dot += weight_h[d] * x_n[d] + + weight_h[d+1] * x_n[d+1] + + weight_h[d+2] * x_n[d+2] + + weight_h[d+3] * x_n[d+3]; + } + for (; d < D; d++) { + dot += weight_h[d] * x_n[d]; + } + } + + // Apply ReLU + float activated = fmaxf(0.0f, dot + bias_val); + + if (activated > max_activated) { + max_activated = activated; + argmax_n = n; + } + } + + if (argmax_n == -1) return; // Should not happen if N > 0 + + // Check if ReLU was active at the argmax position + // Recompute the pre-activation value + float dot_at_argmax = 0.0f; + const float* x_argmax = sh_x + argmax_n * D_runtime; + + if constexpr (D_compile > 0) { + #pragma unroll + for (int d = 0; d < D_compile; d++) { + dot_at_argmax += weight_h[d] * x_argmax[d]; + } + } else { + for (int d = 0; d < D; d++) { + dot_at_argmax += weight_h[d] * x_argmax[d]; + } + } + + float pre_activation = dot_at_argmax + bias_val; + + // Only backprop if ReLU was active (pre_activation > 0) + if (pre_activation <= 0.0f) { + // ReLU blocked the gradient + return; + } + + // Gradient flows through: go * 1 (from max) * 1 (from ReLU when active) + grad_bias_temp[b * H + h] = go; + + const int weight_temp_base = b * H 
* D_runtime + h * D_runtime; + const int x_base = argmax_n * D_runtime; + + if constexpr (D_compile > 0) { + #pragma unroll + for (int d = 0; d < D_compile; d++) { + grad_weight_temp[weight_temp_base + d] = go * sh_x[x_base + d]; + } + } else { + for (int d = 0; d < D; d++) { + grad_weight_temp[weight_temp_base + d] = go * sh_x[x_base + d]; + } + } + + const int grad_x_base = b * N * D_runtime + argmax_n * D_runtime; + + if constexpr (D_compile > 0) { + #pragma unroll + for (int d = 0; d < D_compile; d++) { + atomicAdd(grad_x + grad_x_base + d, go * weight_h[d]); + } + } else { + for (int d = 0; d < D; d++) { + atomicAdd(grad_x + grad_x_base + d, go * weight_h[d]); + } + } +} + +torch::Tensor linear_relu_max_cuda( + torch::Tensor x, + torch::Tensor weight, + torch::Tensor bias +) { + TORCH_CHECK(x.is_cuda() && weight.is_cuda() && bias.is_cuda(), + "All tensors must be on CUDA"); + TORCH_CHECK(x.dim() == 3, "x must be (B, N, D)"); + TORCH_CHECK(weight.dim() == 2, "weight must be (H, D)"); + TORCH_CHECK(bias.dim() == 1, "bias must be (H)"); + TORCH_CHECK(x.dtype() == torch::kFloat, "x must be float32"); + TORCH_CHECK(weight.dtype() == torch::kFloat, "weight must be float32"); + TORCH_CHECK(bias.dtype() == torch::kFloat, "bias must be float32"); + + const int B = x.size(0); + const int N = x.size(1); + const int D = x.size(2); + const int H = weight.size(0); + + TORCH_CHECK(weight.size(1) == D, "Weight D mismatch"); + TORCH_CHECK(bias.size(0) == H, "Bias H mismatch"); + + auto out = torch::empty({B, H}, x.options()); + + dim3 block(H); + dim3 grid(B); + size_t shared_size = N * D * sizeof(float); + + const float* x_ptr = x.data_ptr(); + const float* weight_ptr = weight.data_ptr(); + const float* bias_ptr = bias.data_ptr(); + float* out_ptr = out.data_ptr(); + + int device_idx = x.get_device(); + cudaStream_t stream = c10::cuda::getCurrentCUDAStream(device_idx); + + if (D == 7) { + linear_relu_max_kernel<7><<>>( + x_ptr, weight_ptr, bias_ptr, out_ptr, B, N, H, D); + } else if (D == 13) { + linear_relu_max_kernel<13><<>>( + x_ptr, weight_ptr, bias_ptr, out_ptr, B, N, H, D); + } else { + linear_relu_max_kernel<0><<>>( + x_ptr, weight_ptr, bias_ptr, out_ptr, B, N, H, D); + } + + C10_CUDA_KERNEL_LAUNCH_CHECK(); + return out; +} + +std::tuple linear_relu_max_backward_cuda( + torch::Tensor grad_out, + torch::Tensor x, + torch::Tensor weight, + torch::Tensor bias +) { + TORCH_CHECK(grad_out.is_cuda() && x.is_cuda() && weight.is_cuda() && bias.is_cuda(), + "All tensors must be on CUDA"); + TORCH_CHECK(grad_out.dim() == 2, "grad_out must be (B, H)"); + TORCH_CHECK(x.dim() == 3, "x must be (B, N, D)"); + TORCH_CHECK(weight.dim() == 2, "weight must be (H, D)"); + TORCH_CHECK(bias.dim() == 1, "bias must be (H)"); + TORCH_CHECK(grad_out.dtype() == torch::kFloat, "grad_out must be float32"); + TORCH_CHECK(x.dtype() == torch::kFloat, "x must be float32"); + TORCH_CHECK(weight.dtype() == torch::kFloat, "weight must be float32"); + TORCH_CHECK(bias.dtype() == torch::kFloat, "bias must be float32"); + + const int B = x.size(0); + const int N = x.size(1); + const int D = x.size(2); + const int H = weight.size(0); + + TORCH_CHECK(grad_out.size(0) == B && grad_out.size(1) == H, "grad_out shape mismatch"); + TORCH_CHECK(weight.size(1) == D, "Weight D mismatch"); + TORCH_CHECK(bias.size(0) == H, "Bias H mismatch"); + + auto grad_x = torch::zeros_like(x); + auto grad_weight = torch::zeros_like(weight); + auto grad_bias = torch::zeros({H}, x.options()); + + auto grad_weight_temp = torch::zeros({B, H, D}, x.options()); 
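+    // Note: grad_weight_temp / grad_bias_temp give every (batch, head) pair a private slot,
+    // so the kernel needs no atomics on the shared weight/bias gradients; the per-batch
+    // slices are summed over dim 0 after the launch. grad_x still uses atomicAdd because
+    // several heads within the same batch element can select the same argmax row.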
+ auto grad_bias_temp = torch::zeros({B, H}, x.options()); + + dim3 block(H); + dim3 grid(B); + size_t shared_size = N * D * sizeof(float); + + const float* grad_out_ptr = grad_out.data_ptr(); + const float* x_ptr = x.data_ptr(); + const float* weight_ptr = weight.data_ptr(); + const float* bias_ptr = bias.data_ptr(); + float* grad_x_ptr = grad_x.data_ptr(); + float* grad_weight_temp_ptr = grad_weight_temp.data_ptr(); + float* grad_bias_temp_ptr = grad_bias_temp.data_ptr(); + + int device_idx = x.get_device(); + cudaStream_t stream = c10::cuda::getCurrentCUDAStream(device_idx); + + if (D == 7) { + linear_relu_max_backward_kernel<7><<>>( + x_ptr, weight_ptr, bias_ptr, grad_out_ptr, grad_x_ptr, + grad_weight_temp_ptr, grad_bias_temp_ptr, B, N, H, D); + } else if (D == 13) { + linear_relu_max_backward_kernel<13><<>>( + x_ptr, weight_ptr, bias_ptr, grad_out_ptr, grad_x_ptr, + grad_weight_temp_ptr, grad_bias_temp_ptr, B, N, H, D); + } else { + linear_relu_max_backward_kernel<0><<>>( + x_ptr, weight_ptr, bias_ptr, grad_out_ptr, grad_x_ptr, + grad_weight_temp_ptr, grad_bias_temp_ptr, B, N, H, D); + } + + C10_CUDA_KERNEL_LAUNCH_CHECK(); + + grad_weight = grad_weight_temp.sum(0); + grad_bias = grad_bias_temp.sum(0); + + cudaStreamSynchronize(stream); + + return {grad_x, grad_weight, grad_bias}; +} + __host__ __device__ void puff_advantage_row_cuda(float* values, float* rewards, float* dones, float* importance, float* advantages, float gamma, float lambda, float rho_clip, float c_clip, int horizon) { diff --git a/pufferlib/extensions/pufferlib.cpp b/pufferlib/extensions/pufferlib.cpp index a20d58bc2..8dba39ded 100644 --- a/pufferlib/extensions/pufferlib.cpp +++ b/pufferlib/extensions/pufferlib.cpp @@ -85,7 +85,7 @@ void compute_puff_advantage_cpu(torch::Tensor values, torch::Tensor rewards, } TORCH_LIBRARY(pufferlib, m) { - m.def("compute_puff_advantage(Tensor(a!) values, Tensor(b!) rewards, Tensor(c!) dones, Tensor(d!) importance, Tensor(e!) advantages, float gamma, float lambda, float rho_clip, float c_clip) -> ()"); + m.def("compute_puff_advantage(Tensor(a!) values, Tensor(b!) rewards, Tensor(c!) dones, Tensor(d!) importance, Tensor(e!) advantages, float gamma, float lambda, float rho_clip, float c_clip) -> ()"); } TORCH_LIBRARY_IMPL(pufferlib, CPU, m) { diff --git a/pufferlib/ocean/benchmark/evaluate_checkpoints.py b/pufferlib/ocean/benchmark/evaluate_checkpoints.py new file mode 100644 index 000000000..5b36360be --- /dev/null +++ b/pufferlib/ocean/benchmark/evaluate_checkpoints.py @@ -0,0 +1,409 @@ +""" +Comprehensive evaluation script for Drive environment checkpoints. + +Evaluates all .pt checkpoints in a folder using: +1. WOSAC metrics (realism, ADE, likelihood metrics) +2. Collision rates (SDC-only control mode) + +Includes baselines for ground truth and random policy. 
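+
+Example invocation (paths are illustrative):
+    python pufferlib/ocean/benchmark/evaluate_checkpoints.py ./checkpoints \
+        --output evaluation_results.csv --num-collision-episodes 100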
+""" + +import argparse +import os +import glob +import sys +from pathlib import Path +import numpy as np +import pandas as pd +import torch + +from pufferlib.pufferl import load_env, load_policy, load_config +from pufferlib.ocean.benchmark.evaluator import WOSACEvaluator +import pufferlib.pytorch + + +def evaluate_wosac(config, vecenv, policy, policy_name="policy"): + """Run WOSAC evaluation for a given policy.""" + print(f"Running WOSAC evaluation for {policy_name}...") + + evaluator = WOSACEvaluator(config) + + # Collect ground truth trajectories + gt_trajectories = evaluator.collect_ground_truth_trajectories(vecenv) + + # Collect simulated trajectories + simulated_trajectories = evaluator.collect_simulated_trajectories(config, vecenv, policy) + + # Optional sanity check + if config["wosac"].get("sanity_check", False): + evaluator._quick_sanity_check(gt_trajectories, simulated_trajectories) + + # Compute metrics + results = evaluator.compute_metrics(gt_trajectories, simulated_trajectories) + + return { + "policy": policy_name, + "ade": results["ade"].mean(), + "min_ade": results["min_ade"].mean(), + "likelihood_linear_speed": results["likelihood_linear_speed"].mean(), + "likelihood_linear_acceleration": results["likelihood_linear_acceleration"].mean(), + "likelihood_angular_speed": results["likelihood_angular_speed"].mean(), + "likelihood_angular_acceleration": results["likelihood_angular_acceleration"].mean(), + "realism_metametric": results["realism_metametric"].mean(), + } + + +def evaluate_wosac_ground_truth(config, vecenv): + """Run WOSAC evaluation using ground truth as the policy (perfect replay).""" + print("Running WOSAC evaluation for ground truth...") + + evaluator = WOSACEvaluator(config) + + # Collect ground truth trajectories + gt_trajectories = evaluator.collect_ground_truth_trajectories(vecenv) + + # Use ground truth as simulated (perfect replay) + # For this, we need to broadcast GT to match rollout dimensions + simulated_trajectories = {} + num_rollouts = config["wosac"]["num_rollouts"] + + for key in gt_trajectories: + if key in ["x", "y", "z", "heading"]: + # Broadcast ground truth across rollouts + gt_shape = gt_trajectories[key].shape + simulated_trajectories[key] = np.broadcast_to( + gt_trajectories[key], (gt_shape[0], num_rollouts, gt_shape[2]) + ).copy() + else: + simulated_trajectories[key] = gt_trajectories[key].copy() + + # Compute metrics + results = evaluator.compute_metrics(gt_trajectories, simulated_trajectories) + + return { + "policy": "ground_truth", + "ade": results["ade"].mean(), + "min_ade": results["min_ade"].mean(), + "likelihood_linear_speed": results["likelihood_linear_speed"].mean(), + "likelihood_linear_acceleration": results["likelihood_linear_acceleration"].mean(), + "likelihood_angular_speed": results["likelihood_angular_speed"].mean(), + "likelihood_angular_acceleration": results["likelihood_angular_acceleration"].mean(), + "realism_metametric": results["realism_metametric"].mean(), + } + + +def evaluate_wosac_random(config, vecenv, policy): + """Run WOSAC evaluation with a random policy.""" + print("Running WOSAC evaluation for random policy...") + + class RandomPolicy: + """Random action policy wrapper.""" + + def __init__(self, action_space): + self.action_space = action_space + + def forward_eval(self, obs, state): + batch_size = obs.shape[0] + # Generate random actions + if hasattr(self.action_space, "nvec"): # MultiDiscrete + actions = torch.tensor( + [[np.random.randint(0, n) for n in self.action_space.nvec] for _ in 
range(batch_size)], + dtype=torch.long, + ) + else: # Continuous + actions = torch.tensor(self.action_space.sample()).unsqueeze(0).repeat(batch_size, 1) + + # Return dummy values for value estimate + value = torch.zeros(batch_size, 1) + return actions, value + + random_policy = RandomPolicy(vecenv.single_action_space) + return evaluate_wosac(config, vecenv, random_policy, policy_name="random") + + +def evaluate_collision_rate(config, vecenv, policy, policy_name="policy", num_episodes=100): + """Evaluate collision rate in SDC-only control mode.""" + print(f"Running collision rate evaluation for {policy_name}...") + + device = config["train"]["device"] + num_agents = vecenv.num_agents + + # Initialize LSTM state if needed + state = {} + if config["train"]["use_rnn"]: + state = dict( + lstm_h=torch.zeros(num_agents, policy.hidden_size, device=device), + lstm_c=torch.zeros(num_agents, policy.hidden_size, device=device), + ) + + # Track collision statistics + total_steps = 0 + total_collision_count = 0 + total_collisions_per_agent = 0.0 + episodes_completed = 0 + + ob, info = vecenv.reset() + + while episodes_completed < num_episodes: + with torch.no_grad(): + ob_tensor = torch.as_tensor(ob).to(device) + + if hasattr(policy, "forward_eval"): + logits, value = policy.forward_eval(ob_tensor, state) + else: + # Random policy + if hasattr(vecenv.single_action_space, "nvec"): + action = np.array([[np.random.randint(0, n) for n in vecenv.single_action_space.nvec]]) + else: + action = vecenv.single_action_space.sample() + if isinstance(action, np.ndarray): + action = action.reshape(1, -1) + ob, rewards, terminals, truncations, info = vecenv.step(action) + total_steps += 1 + + # Check for episode completion + if terminals.any() or truncations.any(): + episodes_completed += 1 + # Extract metrics from final info (contains log data) + if info: + for info_dict in info: + if isinstance(info_dict, dict): + if "collision_rate" in info_dict: + total_collision_count += int(info_dict["collision_rate"]) + if "avg_collisions_per_agent" in info_dict: + total_collisions_per_agent += float(info_dict["avg_collisions_per_agent"]) + continue + + action, logprob, _ = pufferlib.pytorch.sample_logits(logits) + action = action.cpu().numpy().reshape(vecenv.action_space.shape) + + # Clip continuous actions + if isinstance(logits, torch.distributions.Normal): + action = np.clip(action, vecenv.action_space.low, vecenv.action_space.high) + + ob, rewards, terminals, truncations, info = vecenv.step(action) + total_steps += 1 + + # Check for episode completion + if terminals.any() or truncations.any(): + episodes_completed += 1 + # Extract metrics from final info (contains log data) + if info: + for info_dict in info: + if isinstance(info_dict, dict): + if "collision_rate" in info_dict: + total_collision_count += int(info_dict["collision_rate"]) + if "avg_collisions_per_agent" in info_dict: + total_collisions_per_agent += float(info_dict["avg_collisions_per_agent"]) + + # Calculate rates + collision_rate = total_collision_count / num_episodes if num_episodes > 0 else 0 + avg_collisions_per_agent = total_collisions_per_agent / num_episodes if num_episodes > 0 else 0 + + return { + "policy": policy_name, + "num_episodes": num_episodes, + "total_steps": total_steps, + "collision_rate": collision_rate, + "avg_collisions_per_agent": avg_collisions_per_agent, + } + + +def evaluate_collision_rate_random(config, vecenv, num_episodes=100): + """Evaluate collision rate with random policy.""" + print("Running collision rate evaluation for 
random policy...") + + class RandomPolicy: + """Dummy random policy.""" + + pass + + random_policy = RandomPolicy() + return evaluate_collision_rate(config, vecenv, random_policy, policy_name="random", num_episodes=num_episodes) + + +def main(): + parser = argparse.ArgumentParser( + description="Evaluate Drive environment checkpoints", formatter_class=argparse.RawDescriptionHelpFormatter + ) + parser.add_argument("checkpoint_dir", type=str, help="Directory containing .pt checkpoint files") + parser.add_argument( + "--output", + type=str, + default="evaluation_results.csv", + help="Output CSV file path (default: evaluation_results.csv)", + ) + parser.add_argument( + "--num-collision-episodes", + type=int, + default=100, + help="Number of episodes for collision rate evaluation (default: 100)", + ) + parser.add_argument("--skip-wosac", action="store_true", help="Skip WOSAC evaluation") + parser.add_argument("--skip-collision", action="store_true", help="Skip collision rate evaluation") + args = parser.parse_args() + + # Hardcode environment name + env_name = "puffer_drive" + + # Find all checkpoint files + checkpoint_dir = Path(args.checkpoint_dir) + checkpoint_files = sorted(glob.glob(str(checkpoint_dir / "*.pt"))) + + if not checkpoint_files: + print(f"No .pt files found in {checkpoint_dir}") + return + + print(f"Found {len(checkpoint_files)} checkpoint files") + + # Temporarily modify sys.argv to prevent load_config from parsing our arguments + original_argv = sys.argv.copy() + sys.argv = [sys.argv[0]] # Keep only script name + + try: + config = load_config(env_name) + finally: + sys.argv = original_argv + + # Results storage + wosac_results = [] + collision_results = [] + + # === WOSAC Evaluation === + if not args.skip_wosac: + print("\n" + "=" * 80) + print("STARTING WOSAC EVALUATION") + print("=" * 80) + + # Configure for WOSAC + config["wosac"]["enabled"] = True + config["vec"]["backend"] = "PufferEnv" + config["vec"]["num_envs"] = 1 + config["env"]["num_agents"] = config["wosac"]["num_total_wosac_agents"] + config["env"]["init_mode"] = config["wosac"]["init_mode"] + config["env"]["control_mode"] = config["wosac"]["control_mode"] + config["env"]["init_steps"] = config["wosac"]["init_steps"] + config["env"]["goal_behavior"] = config["wosac"]["goal_behavior"] + config["env"]["goal_radius"] = config["wosac"]["goal_radius"] + + # Create environment for WOSAC + vecenv_wosac = load_env(env_name, config) + + # Evaluate ground truth + try: + gt_result = evaluate_wosac_ground_truth(config, vecenv_wosac) + wosac_results.append(gt_result) + except Exception as e: + print(f"Error evaluating ground truth: {e}") + + # Evaluate random policy + try: + policy_template = load_policy(config, vecenv_wosac, env_name) + random_result = evaluate_wosac_random(config, vecenv_wosac, policy_template) + wosac_results.append(random_result) + except Exception as e: + print(f"Error evaluating random policy: {e}") + + # Evaluate each checkpoint + for checkpoint_path in checkpoint_files: + checkpoint_name = Path(checkpoint_path).stem + print(f"\nEvaluating checkpoint: {checkpoint_name}") + + try: + # Load policy + policy = load_policy(config, vecenv_wosac, env_name) + state_dict = torch.load(checkpoint_path, map_location=config["train"]["device"]) + state_dict = {k.replace("module.", ""): v for k, v in state_dict.items()} + policy.load_state_dict(state_dict) + + # Evaluate + result = evaluate_wosac(config, vecenv_wosac, policy, policy_name=checkpoint_name) + wosac_results.append(result) + + except Exception as e: + 
print(f"Error evaluating {checkpoint_name}: {e}") + + vecenv_wosac.close() + + # === Collision Rate Evaluation === + if not args.skip_collision: + print("\n" + "=" * 80) + print("STARTING COLLISION RATE EVALUATION") + print("=" * 80) + + # Configure for SDC-only collision evaluation + config["vec"]["backend"] = "PufferEnv" + config["vec"]["num_envs"] = 1 + config["env"]["num_agents"] = 1 + config["env"]["control_mode"] = "control_sdc_only" + config["env"]["init_mode"] = "create_all_valid" + config["env"]["init_steps"] = 10 + + # Create environment for collision evaluation + vecenv_human_replay = load_env(env_name, config) + + # Evaluate random policy + try: + random_collision_result = evaluate_collision_rate_random( + config, vecenv_human_replay, num_episodes=args.num_collision_episodes + ) + collision_results.append(random_collision_result) + except Exception as e: + print(f"Error evaluating random policy collision rate: {e}") + + # Evaluate each checkpoint + for checkpoint_path in checkpoint_files: + checkpoint_name = Path(checkpoint_path).stem + print(f"\nEvaluating checkpoint collision rate: {checkpoint_name}") + + try: + # Load policy + policy = load_policy(config, vecenv_human_replay, env_name) + state_dict = torch.load(checkpoint_path, map_location=config["train"]["device"]) + state_dict = {k.replace("module.", ""): v for k, v in state_dict.items()} + policy.load_state_dict(state_dict) + + # Evaluate + result = evaluate_collision_rate( + config, + vecenv_human_replay, + policy, + policy_name=checkpoint_name, + num_episodes=args.num_collision_episodes, + ) + collision_results.append(result) + + except Exception as e: + print(f"Error evaluating {checkpoint_name} collision rate: {e}") + + vecenv_human_replay.close() + + # === Save Results === + print("\n" + "=" * 80) + print("SAVING RESULTS") + print("=" * 80) + + # Create DataFrames + if wosac_results: + df_wosac = pd.DataFrame(wosac_results) + wosac_output = args.output.replace(".csv", "_wosac.csv") + df_wosac.to_csv(wosac_output, index=False) + print(f"\nWOSAC results saved to {wosac_output}") + print("\nWOSAC Results Summary:") + print(df_wosac.to_string(index=False)) + + if collision_results: + df_collision = pd.DataFrame(collision_results) + collision_output = args.output.replace(".csv", "_collision.csv") + df_collision.to_csv(collision_output, index=False) + print(f"\nCollision results saved to {collision_output}") + print("\nCollision Results Summary:") + print(df_collision.to_string(index=False)) + + print("\n" + "=" * 80) + print("EVALUATION COMPLETE") + print("=" * 80) + + +if __name__ == "__main__": + main() diff --git a/pufferlib/ocean/benchmark/evaluator.py b/pufferlib/ocean/benchmark/evaluator.py index 003bfad27..a83869edc 100644 --- a/pufferlib/ocean/benchmark/evaluator.py +++ b/pufferlib/ocean/benchmark/evaluator.py @@ -7,6 +7,7 @@ import matplotlib.pyplot as plt import configparser import os +from tqdm import tqdm import pufferlib from pufferlib.ocean.benchmark import metrics @@ -40,6 +41,7 @@ def __init__(self, config: Dict): wosac_metrics_path = os.path.join(os.path.dirname(__file__), "wosac.ini") self.metrics_config = configparser.ConfigParser() self.metrics_config.read(wosac_metrics_path) + self.mode = "rl" def _compute_metametric(self, metrics: pd.Series) -> float: metametric = 0.0 @@ -67,8 +69,15 @@ def collect_ground_truth_trajectories(self, puffer_env): """ return puffer_env.get_ground_truth_trajectories() - def collect_simulated_trajectories(self, args, puffer_env, policy): + def 
collect_simulated_trajectories(self, args, puffer_env, policy=None, actions=None): """Roll out policy in env and collect trajectories. + Args: + args: configuration dictionary + puffer_env: PufferDrive environment + policy: policy to evaluate (if None, actions must be provided) + actions: actions to step the agent (if policy is None). Currently only works with discrete actions and the + classic dynamics model. Shape: [time, num_agents, 1] + Returns: trajectories: dict with keys 'x', 'y', 'z', 'heading' each of shape (num_agents, num_rollouts, num_steps) @@ -86,11 +95,13 @@ def collect_simulated_trajectories(self, args, puffer_env, policy): "id": np.zeros((num_agents, self.num_rollouts, self.sim_steps), dtype=np.int32), } - for rollout_idx in range(self.num_rollouts): + for rollout_idx in tqdm(range(self.num_rollouts), desc="Collecting rollouts", colour="blue"): print(f"\rCollecting rollout {rollout_idx + 1}/{self.num_rollouts}...", end="", flush=True) + obs, info = puffer_env.reset() state = {} - if args["train"]["use_rnn"]: + + if args["train"]["use_rnn"] and policy is not None: state = dict( lstm_h=torch.zeros(num_agents, policy.hidden_size, device=device), lstm_c=torch.zeros(num_agents, policy.hidden_size, device=device), @@ -106,14 +117,34 @@ def collect_simulated_trajectories(self, args, puffer_env, policy): trajectories["id"][:, rollout_idx, time_idx] = agent_state["id"] # Step policy - with torch.no_grad(): - ob_tensor = torch.as_tensor(obs).to(device) - logits, value = policy.forward_eval(ob_tensor, state) - action, logprob, _ = pufferlib.pytorch.sample_logits(logits) - action_np = action.cpu().numpy().reshape(puffer_env.action_space.shape) - - if isinstance(logits, torch.distributions.Normal): - action_np = np.clip(action_np, puffer_env.action_space.low, puffer_env.action_space.high) + if policy is None and actions is not None: + human_act_time_index = self.init_steps + time_idx + action_np = actions[human_act_time_index, :].copy() + + # Replace invalid actions (-1) with "do nothing" action + # For discrete actions: use action 45 (accel=0, steer=0) + # For continuous actions: use [0.0, 0.0] + invalid_mask = action_np == -1.0 + + if puffer_env.action_space.__class__.__name__ == "MultiDiscrete": + # Discrete action space + action_np[invalid_mask] = 45 # Do nothing action + + elif policy is not None: + if self.mode == "bc_policy": + with torch.no_grad(): + ob_tensor = torch.as_tensor(obs).to(device) + pred_action = policy(ob_tensor, deterministic=True) + action_np = pred_action.cpu().numpy().reshape(puffer_env.action_space.shape) + else: + with torch.no_grad(): + ob_tensor = torch.as_tensor(obs).to(device) + logits, value = policy.forward_eval(ob_tensor, state) + action, logprob, _ = pufferlib.pytorch.sample_logits(logits) + action_np = action.cpu().numpy().reshape(puffer_env.action_space.shape) + + if isinstance(logits, torch.distributions.Normal): + action_np = np.clip(action_np, puffer_env.action_space.low, puffer_env.action_space.high) obs, _, _, _, _ = puffer_env.step(action_np) @@ -192,6 +223,8 @@ def compute_metrics( "Agent IDs don't match between simulated and ground truth trajectories" ) + print("Computing metrics...") + eval_mask = ground_truth_trajectories["id"][:, 0] >= 0 # Extract trajectories @@ -488,52 +521,54 @@ def compute_metrics( } ) - scene_level_results = df.groupby("scenario_id")[ + # Aggregate along agent dimenision: Obtain one score per scenario + df_scene_level = df.groupby("scenario_id").mean().drop(columns=["agent_id"]).dropna() + + # Exponentiate the 
averaged log-likelihoods to get final likelihoods + likelihood_columns = [col for col in df_scene_level.columns if col.startswith("likelihood_")] + df_scene_level[likelihood_columns] = np.exp(df_scene_level[likelihood_columns]) + + df_scene_level["realism_meta_score"] = df_scene_level.apply(self._compute_metametric, axis=1) + df_scene_level["num_agents_per_scene"] = df.groupby("scenario_id").size() + df_scene_level = df_scene_level.round(3) + + # Get group summary metrics + kinematic_metrics = np.mean( + [ + df_scene_level["likelihood_linear_speed"], + df_scene_level["likelihood_linear_acceleration"], + df_scene_level["likelihood_angular_speed"], + df_scene_level["likelihood_angular_acceleration"], + ] + ) + + interactive_metrics = np.mean( [ - "ade", - "min_ade", - "num_collisions_sim", - "num_collisions_ref", - "num_offroad_sim", - "num_offroad_ref", - "likelihood_linear_speed", - "likelihood_linear_acceleration", - "likelihood_angular_speed", - "likelihood_angular_acceleration", - "likelihood_distance_to_nearest_object", - "likelihood_time_to_collision", - "likelihood_collision_indication", - "likelihood_distance_to_road_edge", - "likelihood_offroad_indication", + df_scene_level["likelihood_collision_indication"], + df_scene_level["likelihood_distance_to_nearest_object"], + df_scene_level["likelihood_time_to_collision"], ] - ].mean() + ) - # Transform log-likelihoods to positive scores: - likelihood_cols = [c for c in scene_level_results.columns if "likelihood" in c] - scene_level_results[likelihood_cols] = np.exp(scene_level_results[likelihood_cols]) + map_metrics = np.mean( + [ + df_scene_level["likelihood_distance_to_road_edge"], + df_scene_level["likelihood_offroad_indication"], + ] + ) - scene_level_results["realism_meta_score"] = scene_level_results.apply(self._compute_metametric, axis=1) - scene_level_results["num_agents"] = df.groupby("scenario_id").size() - scene_level_results = scene_level_results[ - ["num_agents"] + [col for col in scene_level_results.columns if col != "num_agents"] - ] + df_scene_level["kinematic_metrics"] = kinematic_metrics + df_scene_level["interactive_metrics"] = interactive_metrics + df_scene_level["map_based_metrics"] = map_metrics if aggregate_results: - aggregate_metrics = scene_level_results.mean().to_dict() - aggregate_metrics["total_num_agents"] = scene_level_results["num_agents"].sum() - # Convert numpy types to Python native types - return {k: v.item() if hasattr(v, "item") else v for k, v in aggregate_metrics.items()} + # Aggregate over scenarios + aggregate_metrics = df_scene_level.mean().to_dict() + aggregate_metrics["total_num_agents"] = df_scene_level["num_agents_per_scene"].sum() + aggregate_metrics["realism_score_std"] = df_scene_level["realism_meta_score"].std() + return aggregate_metrics else: - print("\n Scene-level results:\n") - print(scene_level_results) - - print(f"\n Overall realism meta score: {scene_level_results['realism_meta_score'].mean():.4f}") - print(f"\n Overall minADE: {scene_level_results['min_ade'].mean():.4f}") - print(f"\n Overall ADE: {scene_level_results['ade'].mean():.4f}") - - # print(f"\n Full agent-level results:\n") - # print(df) - return scene_level_results + return df_scene_level def _quick_sanity_check(self, gt_trajectories, simulated_trajectories, agent_idx=None, max_agents_to_plot=10): if agent_idx is None: diff --git a/pufferlib/ocean/benchmark/metrics_sanity_check.py b/pufferlib/ocean/benchmark/metrics_sanity_check.py index 74d8596f8..f957ad455 100644 --- 
a/pufferlib/ocean/benchmark/metrics_sanity_check.py +++ b/pufferlib/ocean/benchmark/metrics_sanity_check.py @@ -1,14 +1,22 @@ """ -Validation script for WOSAC log-likelihood metrics. +Comprehensive evaluation script for Drive environment checkpoints. -Idea is to check how the log-likelihood metrics change as we replace -increasing numbers of random rollouts with ground-truth data. +Evaluates all .pt checkpoints in a folder using: +1. WOSAC metrics (realism, ADE, likelihood metrics) +2. Collision rates (SDC-only control mode) + +Includes baselines for ground truth and random policy. """ import argparse +import os +import glob +from pathlib import Path import numpy as np +import pandas as pd +import torch -from pufferlib.pufferl import load_config, load_env, load_policy +from pufferlib.pufferl import load_env, load_policy from pufferlib.ocean.benchmark.evaluator import WOSACEvaluator @@ -29,6 +37,7 @@ def replace_rollouts_with_gt(simulated_traj, gt_traj, num_replacements): def run_validation_experiment(config, vecenv): evaluator = WOSACEvaluator(config) + # Collect ground truth trajectories gt_trajectories = evaluator.collect_ground_truth_trajectories(vecenv) simulated_trajectories = evaluator.collect_wosac_random_baseline(vecenv) agent_state = vecenv.driver_env.get_global_agent_state() @@ -79,9 +88,24 @@ def format_results_table(results): def main(): - parser = argparse.ArgumentParser(description="Validate WOSAC log-likelihood metrics") - parser.add_argument("--env", default="puffer_drive") - parser.add_argument("--config", default="config/ocean/drive.ini") + parser = argparse.ArgumentParser( + description="Evaluate Drive environment checkpoints", formatter_class=argparse.RawDescriptionHelpFormatter + ) + parser.add_argument("checkpoint_dir", type=str, help="Directory containing .pt checkpoint files") + parser.add_argument( + "--output", + type=str, + default="evaluation_results.csv", + help="Output CSV file path (default: evaluation_results.csv)", + ) + parser.add_argument( + "--num-collision-episodes", + type=int, + default=100, + help="Number of episodes for collision rate evaluation (default: 100)", + ) + parser.add_argument("--skip-wosac", action="store_true", help="Skip WOSAC evaluation") + parser.add_argument("--skip-collision", action="store_true", help="Skip collision rate evaluation") args = parser.parse_args() config = load_config(args.env) diff --git a/pufferlib/ocean/drive/binding.c b/pufferlib/ocean/drive/binding.c index e8abb8f2b..e0eb38b91 100644 --- a/pufferlib/ocean/drive/binding.c +++ b/pufferlib/ocean/drive/binding.c @@ -193,6 +193,11 @@ static int my_init(Env *env, PyObject *args, PyObject *kwargs) { env->reward_goal_post_respawn = conf.reward_goal_post_respawn; env->episode_length = conf.episode_length; env->termination_mode = conf.termination_mode; + env->use_guided_autonomy = conf.use_guided_autonomy; + env->guidance_speed_weight = conf.guidance_speed_weight; + env->guidance_heading_weight = conf.guidance_heading_weight; + env->waypoint_reach_threshold = conf.waypoint_reach_threshold; + env->goal_radius = conf.goal_radius; env->collision_behavior = conf.collision_behavior; env->offroad_behavior = conf.offroad_behavior; env->max_controlled_agents = unpack(kwargs, "max_controlled_agents"); diff --git a/pufferlib/ocean/drive/drive.h b/pufferlib/ocean/drive/drive.h index 4df3bf021..df0600ff5 100644 --- a/pufferlib/ocean/drive/drive.h +++ b/pufferlib/ocean/drive/drive.h @@ -110,9 +110,16 @@ static const float JERK_LONG[4] = {-15.0f, -4.0f, 0.0f, 4.0f}; static const float 
JERK_LAT[3] = {-4.0f, 0.0f, 4.0f}; // Classic action space (for CLASSIC dynamics model) -static const float ACCELERATION_VALUES[7] = {-4.0000f, -2.6670f, -1.3330f, -0.0000f, 1.3330f, 2.6670f, 4.0000f}; -static const float STEERING_VALUES[13] = {-1.000f, -0.833f, -0.667f, -0.500f, -0.333f, -0.167f, 0.000f, - 0.167f, 0.333f, 0.500f, 0.667f, 0.833f, 1.000f}; +#define NUM_ACCEL_BINS 7 // 21 +#define ACCEL_MIN -4.0f +#define ACCEL_MAX 4.0f + +#define NUM_STEER_BINS 13 // 31 +#define STEER_MIN -1.0f // radians +#define STEER_MAX 1.0f + +static float ACCELERATION_VALUES[NUM_ACCEL_BINS]; +static float STEERING_VALUES[NUM_STEER_BINS]; static const float offsets[4][2] = { {-1, 1}, // top-left @@ -179,6 +186,8 @@ struct Entity { float *traj_vz; float *traj_heading; int *traj_valid; + float *expert_accel; + float *expert_steering; float width; float length; float height; @@ -218,6 +227,12 @@ struct Entity { float jerk_lat; float steering_angle; float wheelbase; + + // Guided autonomy tracking + int *waypoints_hit; // Boolean array: 1 if waypoint reached, 0 otherwise + int waypoints_hit_count; // Total waypoints reached so far + int total_valid_waypoints; // Total valid waypoints in trajectory + float route_progress; // Percentage of route completed (0.0 to 1.0) }; void free_entity(Entity *entity) { @@ -230,6 +245,9 @@ void free_entity(Entity *entity) { free(entity->traj_vz); free(entity->traj_heading); free(entity->traj_valid); + free(entity->waypoints_hit); + free(entity->expert_accel); + free(entity->expert_steering); } // Utility functions @@ -253,6 +271,14 @@ float clip(float value, float min, float max) { return value; } +float normalize_heading(float heading) { + if (heading > M_PI) + heading -= 2 * M_PI; + if (heading < -M_PI) + heading += 2 * M_PI; + return heading; +} + typedef struct GridMapEntity GridMapEntity; struct GridMapEntity { int entity_idx; @@ -283,6 +309,7 @@ struct Drive { float *actions; float *rewards; unsigned char *terminals; + unsigned char *truncations; Log log; Log *logs; int num_agents; @@ -308,6 +335,10 @@ struct Drive { int termination_mode; float reward_vehicle_collision; float reward_offroad_collision; + int use_guided_autonomy; // Boolean: whether to calculate and add guided autonomy reward + float guidance_speed_weight; // Weight for speed deviation penalty + float guidance_heading_weight; // Weight for heading deviation penalty + float waypoint_reach_threshold; // Distance threshold for hitting waypoints (e.g., 2.0m) char *map_name; float world_mean_x; float world_mean_y; @@ -381,6 +412,148 @@ void add_log(Drive *env) { } } +void init_action_space() { + + float accel_step = (ACCEL_MAX - ACCEL_MIN) / (NUM_ACCEL_BINS - 1); + for (int i = 0; i < NUM_ACCEL_BINS; i++) { + ACCELERATION_VALUES[i] = ACCEL_MIN + i * accel_step; + } + + float steer_step = (STEER_MAX - STEER_MIN) / (NUM_STEER_BINS - 1); + for (int i = 0; i < NUM_STEER_BINS; i++) { + STEERING_VALUES[i] = STEER_MIN + i * steer_step; + } + + // printf("Acceleration values: "); + // for (int i = 0; i < NUM_ACCEL_BINS; i++) { + // printf("%.3f ", ACCELERATION_VALUES[i]); + // } + // printf("\nSteering values: "); + // for (int i = 0; i < NUM_STEER_BINS; i++) { + // printf("%.3f ", STEERING_VALUES[i]); + // } + // printf("\n"); +} + +// Guided autonomy helper functions +int is_waypoint_within_reach(Entity *agent, int traj_idx, float threshold) { + if (traj_idx >= agent->array_size || !agent->traj_valid[traj_idx]) { + return 0; + } + + float ref_x = agent->traj_x[traj_idx]; + float ref_y = agent->traj_y[traj_idx]; + 
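+    // Logged positions equal to INVALID_POSITION act as a sentinel for unusable samples;
+    // such waypoints are never counted as reached.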
+ if (ref_x == INVALID_POSITION || ref_y == INVALID_POSITION) { + return 0; + } + + float dx = agent->x - ref_x; + float dy = agent->y - ref_y; + float distance = sqrtf(dx * dx + dy * dy); + + return distance < threshold; +} + +float compute_route_guidance_reward(Drive *env, Entity *agent, int timestep) { + float reward = 0.0f; + int new_hits = 0; + + // Check all waypoints from current timestep to end of trajectory + for (int i = timestep; i < agent->array_size; i++) { + // Skip if already hit or invalid + if (agent->waypoints_hit[i] || !agent->traj_valid[i]) { + continue; + } + + // Check if within reach + if (is_waypoint_within_reach(agent, i, env->waypoint_reach_threshold)) { + agent->waypoints_hit[i] = 1; + agent->waypoints_hit_count++; + new_hits++; + } + } + + // Compute route progress + if (agent->total_valid_waypoints > 0) { + agent->route_progress = (float)agent->waypoints_hit_count / agent->total_valid_waypoints; + } + + // Scale reward by number of valid waypoints to normalize + if (agent->total_valid_waypoints > 0 && new_hits > 0) { + reward = (float)new_hits / agent->total_valid_waypoints; + reward = fminf(reward, 0.1f); // Cap at 0.1 per step + } + + return reward; +} + +float compute_speed_guidance_reward(Entity *agent, int timestep, float weight) { + if (timestep >= agent->array_size || !agent->traj_valid[timestep]) { + return 0.0f; + } + + // Get reference speed at current timestep + float ref_vx = agent->traj_vx[timestep]; + float ref_vy = agent->traj_vy[timestep]; + float ref_speed = sqrtf(ref_vx * ref_vx + ref_vy * ref_vy); + + // Get actual speed + float actual_speed = sqrtf(agent->vx * agent->vx + agent->vy * agent->vy); + + // Compute squared error + float speed_error = ref_speed - actual_speed; + float speed_error_sq = speed_error * speed_error; + + // Exponential penalty: 1.0 - exp(-error²) + float penalty = 1.0f - expf(-speed_error_sq); + + return -weight * penalty; +} + +float compute_heading_guidance_reward(Entity *agent, int timestep, float weight) { + if (timestep >= agent->array_size || !agent->traj_valid[timestep]) { + return 0.0f; + } + + // Get reference heading at current timestep + float ref_heading = agent->traj_heading[timestep]; + + // Get actual heading + float actual_heading = agent->heading; + + // Compute heading error (accounting for angle wrapping)[-π, π] + float heading_error = normalize_heading(ref_heading - actual_heading); + + float heading_error_sq = heading_error * heading_error; + + // Exponential penalty: 1.0 - exp(-error²) + float penalty = 1.0f - expf(-heading_error_sq); + + return -weight * penalty; +} + +float compute_guided_autonomy_reward(Drive *env, int agent_idx, int active_idx) { + Entity *agent = &env->entities[agent_idx]; + int timestep = env->timestep; + + float total_reward = 0.0f; + + // 1. Route guidance (waypoint hitting) + float route_reward = compute_route_guidance_reward(env, agent, timestep); + total_reward += route_reward; + + // 2. Speed guidance + float speed_reward = compute_speed_guidance_reward(agent, timestep, env->guidance_speed_weight); + total_reward += speed_reward; + + // 3. 
Heading guidance + float heading_reward = compute_heading_guidance_reward(agent, timestep, env->guidance_heading_weight); + total_reward += heading_reward; + + return total_reward; +} + Entity *load_map_binary(const char *filename, Drive *env) { FILE *file = fopen(filename, "rb"); if (!file) @@ -405,6 +578,7 @@ Entity *load_map_binary(const char *filename, Drive *env) { fread(&env->num_roads, sizeof(int), 1, file); env->num_entities = env->num_objects + env->num_roads; Entity *entities = (Entity *)malloc(env->num_entities * sizeof(Entity)); + for (int i = 0; i < env->num_entities; i++) { // Read base entity data fread(&entities[i].scenario_id, sizeof(int), 1, file); @@ -424,6 +598,13 @@ Entity *load_map_binary(const char *filename, Drive *env) { entities[i].traj_vz = (float *)malloc(size * sizeof(float)); entities[i].traj_heading = (float *)malloc(size * sizeof(float)); entities[i].traj_valid = (int *)malloc(size * sizeof(int)); + entities[i].expert_accel = (float *)malloc(size * sizeof(float)); + entities[i].expert_steering = (float *)malloc(size * sizeof(float)); + // Allocate waypoints_hit array for guided autonomy + entities[i].waypoints_hit = (int *)calloc(size, sizeof(int)); + entities[i].waypoints_hit_count = 0; + entities[i].total_valid_waypoints = 0; + entities[i].route_progress = 0.0f; } else { // Roads don't use these arrays entities[i].traj_vx = NULL; @@ -431,6 +612,12 @@ Entity *load_map_binary(const char *filename, Drive *env) { entities[i].traj_vz = NULL; entities[i].traj_heading = NULL; entities[i].traj_valid = NULL; + entities[i].expert_accel = NULL; + entities[i].expert_steering = NULL; + entities[i].waypoints_hit = NULL; + entities[i].waypoints_hit_count = 0; + entities[i].total_valid_waypoints = 0; + entities[i].route_progress = 0.0f; } // Read array data fread(entities[i].traj_x, sizeof(float), size, file); @@ -443,6 +630,14 @@ Entity *load_map_binary(const char *filename, Drive *env) { fread(entities[i].traj_vz, sizeof(float), size, file); fread(entities[i].traj_heading, sizeof(float), size, file); fread(entities[i].traj_valid, sizeof(int), size, file); + fread(entities[i].expert_accel, sizeof(float), size, file); + fread(entities[i].expert_steering, sizeof(float), size, file); + // Count total valid waypoints for guided autonomy + for (int j = 0; j < size; j++) { + if (entities[i].traj_valid[j]) { + entities[i].total_valid_waypoints++; + } + } } // Read remaining scalar fields fread(&entities[i].width, sizeof(float), 1, file); @@ -513,7 +708,15 @@ void set_start_position(Drive *env) { e->jerk_lat = 0.0f; e->steering_angle = 0.0f; e->wheelbase = 0.6f * e->length; + + // Reset guided autonomy tracking + if (e->waypoints_hit != NULL) { + memset(e->waypoints_hit, 0, e->array_size * sizeof(int)); + } + e->waypoints_hit_count = 0; + e->route_progress = 0.0f; } + // EndDrawing(); } int getGridIndex(Drive *env, float x1, float y1) { @@ -1408,6 +1611,7 @@ void init_goal_positions(Drive *env) { void init(Drive *env) { env->human_agent_idx = 0; env->timestep = 0; + init_action_space(); env->entities = load_map_binary(env->map_name, env); set_means(env); init_grid_map(env); @@ -1458,6 +1662,7 @@ void allocate(Drive *env) { env->actions = (float *)calloc(env->active_agent_count * 2, sizeof(float)); env->rewards = (float *)calloc(env->active_agent_count, sizeof(float)); env->terminals = (unsigned char *)calloc(env->active_agent_count, sizeof(unsigned char)); + env->truncations = (unsigned char *)calloc(env->active_agent_count, sizeof(unsigned char)); } void free_allocated(Drive 
*env) { @@ -1465,6 +1670,7 @@ void free_allocated(Drive *env) { free(env->actions); free(env->rewards); free(env->terminals); + free(env->truncations); c_close(env); } @@ -1477,14 +1683,6 @@ float clipSpeed(float speed) { return speed; } -float normalize_heading(float heading) { - if (heading > M_PI) - heading -= 2 * M_PI; - if (heading < -M_PI) - heading += 2 * M_PI; - return heading; -} - float normalize_value(float value, float min, float max) { return (value - min) / (max - min); } void move_dynamics(Drive *env, int action_idx, int agent_idx) { @@ -1508,12 +1706,12 @@ void move_dynamics(Drive *env, int action_idx, int agent_idx) { acceleration = action_array_f[action_idx][0]; steering = action_array_f[action_idx][1]; - acceleration *= ACCELERATION_VALUES[6]; - steering *= STEERING_VALUES[12]; + acceleration *= ACCEL_MAX; + steering *= STEER_MAX; } else { // discrete // Interpret action as a single integer: a = accel_idx * num_steer + steer_idx int *action_array = (int *)env->actions; - int num_steer = sizeof(STEERING_VALUES) / sizeof(STEERING_VALUES[0]); + int num_steer = NUM_STEER_BINS; int action_val = action_array[action_idx]; int acceleration_index = action_val / num_steer; int steering_index = action_val % num_steer; @@ -1549,7 +1747,7 @@ void move_dynamics(Drive *env, int action_idx, int agent_idx) { // Update position x = x + (new_vx * env->dt); y = y + (new_vy * env->dt); - heading = heading + yaw_rate * env->dt; + heading = normalize_heading(heading + yaw_rate * env->dt); // Apply updates to the agent's state agent->x = x; @@ -2017,24 +2215,9 @@ void respawn_agent(Drive *env, int agent_idx) { void c_step(Drive *env) { memset(env->rewards, 0, env->active_agent_count * sizeof(float)); memset(env->terminals, 0, env->active_agent_count * sizeof(unsigned char)); + memset(env->truncations, 0, env->active_agent_count * sizeof(unsigned char)); env->timestep++; - int originals_remaining = 0; - for (int i = 0; i < env->active_agent_count; i++) { - int agent_idx = env->active_agent_indices[i]; - // Keep flag true if there is at least one agent that has not been respawned yet - if (env->entities[agent_idx].respawn_count == 0) { - originals_remaining = 1; - break; - } - } - - if (env->timestep == env->episode_length || (!originals_remaining && env->termination_mode == 1)) { - add_log(env); - c_reset(env); - return; - } - // Move static experts for (int i = 0; i < env->expert_static_agent_count; i++) { int expert_idx = env->expert_static_agent_indices[i]; @@ -2053,14 +2236,14 @@ void c_step(Drive *env) { move_dynamics(env, i, agent_idx); - // Tiny jerk penalty for smoothness - if (env->dynamics_model == CLASSIC) { - float delta_vx = env->entities[agent_idx].vx - prev_vx; - float delta_vy = env->entities[agent_idx].vy - prev_vy; - float jerk_penalty = -0.0002f * sqrtf(delta_vx * delta_vx + delta_vy * delta_vy) / env->dt; - env->rewards[i] += jerk_penalty; - env->logs[i].episode_return += jerk_penalty; - } + // // Tiny jerk penalty for smoothness + // if (env->dynamics_model == CLASSIC) { + // float delta_vx = env->entities[agent_idx].vx - prev_vx; + // float delta_vy = env->entities[agent_idx].vy - prev_vy; + // float jerk_penalty = -0.0002f * sqrtf(delta_vx * delta_vx + delta_vy * delta_vy) / env->dt; + // env->rewards[i] += jerk_penalty; + // env->logs[i].episode_return += jerk_penalty; + // } } // Compute rewards @@ -2122,6 +2305,13 @@ void c_step(Drive *env) { int lane_aligned = env->entities[agent_idx].metrics_array[LANE_ALIGNED_IDX]; env->logs[i].lane_alignment_rate = lane_aligned; + + 
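+        // When enabled, guided autonomy adds three shaping terms to the base reward
+        // (helpers defined earlier in this file): a route term equal to the fraction of
+        // newly reached logged waypoints (capped at 0.1 per step), plus speed and heading
+        // terms of the form -weight * (1 - exp(-error^2)) measured against the logged
+        // trajectory at the current timestep.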
// Apply guided autonomy reward + if (env->use_guided_autonomy) { + float ga_reward = compute_guided_autonomy_reward(env, agent_idx, i); + env->rewards[i] += ga_reward; + env->logs[i].episode_return += ga_reward; + } } if (env->goal_behavior == GOAL_RESPAWN) { @@ -2129,6 +2319,7 @@ void c_step(Drive *env) { int agent_idx = env->active_agent_indices[i]; int reached_goal = env->entities[agent_idx].metrics_array[REACHED_GOAL_IDX]; if (reached_goal) { + env->terminals[i] = 1; respawn_agent(env, agent_idx); env->entities[agent_idx].respawn_count++; } @@ -2144,9 +2335,140 @@ void c_step(Drive *env) { } } + // Episode boundary after this step: treat time-limit and early-termination as truncation. + // `timestep` is incremented at step start, so truncate when `(timestep + 1) >= episode_length`. + int originals_remaining = 0; + for (int i = 0; i < env->active_agent_count; i++) { + int agent_idx = env->active_agent_indices[i]; + if (env->entities[agent_idx].respawn_count == 0) { + originals_remaining = 1; + break; + } + } + int reached_time_limit = (env->timestep + 1) >= env->episode_length; + int reached_early_termination = (!originals_remaining && env->termination_mode == 1); + if (reached_time_limit || reached_early_termination) { + for (int i = 0; i < env->active_agent_count; i++) { + env->truncations[i] = 1; + } + add_log(env); + c_reset(env); + return; + } + compute_observations(env); } +void c_collect_expert_data(Drive *env, float *expert_actions_discrete_out, float *expert_actions_continuous_out, + float *expert_obs_out) { + + int ego_dim = (env->dynamics_model == JERK) ? 10 : 7; + int max_obs = ego_dim + 7 * (MAX_AGENTS - 1) + 7 * MAX_ROAD_SEGMENT_OBSERVATIONS; + + int original_timestep = env->timestep; + + c_reset(env); + + for (int t = 0; t < TRAJECTORY_LENGTH; t++) { + // Get the current observations + int obs_offset = t * env->active_agent_count * max_obs; + memcpy(&expert_obs_out[obs_offset], env->observations, env->active_agent_count * max_obs * sizeof(float)); + + // Now set expert actions for this timestep + for (int i = 0; i < env->active_agent_count; i++) { + int agent_idx = env->active_agent_indices[i]; + Entity *agent = &env->entities[agent_idx]; + + // Check bounds and validity + bool is_valid = (t < agent->array_size && agent->expert_accel && agent->expert_steering && + agent->expert_accel[t] != -1.0f && agent->expert_steering[t] != -1.0f); + + if (is_valid) { + float continuous_accel = agent->expert_accel[t]; + float continuous_steer = agent->expert_steering[t]; + + // Store continuous actions + int continuous_offset = t * env->active_agent_count * 2 + i * 2; + expert_actions_continuous_out[continuous_offset] = continuous_accel; + expert_actions_continuous_out[continuous_offset + 1] = continuous_steer; + + // Discretize acceleration - find closest value in ACCELERATION_VALUES + int best_accel_idx = 0; + float min_accel_diff = fabsf(continuous_accel - ACCELERATION_VALUES[0]); + for (int j = 1; j < NUM_ACCEL_BINS; j++) { + float diff = fabsf(continuous_accel - ACCELERATION_VALUES[j]); + if (diff < min_accel_diff) { + min_accel_diff = diff; + best_accel_idx = j; + } + } + + // Discretize steering - find closest value in STEERING_VALUES + int best_steer_idx = 0; + float min_steer_diff = fabsf(continuous_steer - STEERING_VALUES[0]); + for (int j = 1; j < NUM_STEER_BINS; j++) { + float diff = fabsf(continuous_steer - STEERING_VALUES[j]); + if (diff < min_steer_diff) { + min_steer_diff = diff; + best_steer_idx = j; + } + } + + // Compute joint discrete action: action = accel_idx * 
num_steer + steer_idx + int joint_action = best_accel_idx * NUM_STEER_BINS + best_steer_idx; + + // Store joint discrete action + int discrete_offset = t * env->active_agent_count + i; + expert_actions_discrete_out[discrete_offset] = (float)joint_action; + + // Apply the expert actions to env->actions so that c_step will use them + if (env->action_type == 1) { // continuous + float (*action_array_f)[2] = (float (*)[2])env->actions; + action_array_f[i][0] = continuous_accel / ACCEL_MAX; + action_array_f[i][1] = continuous_steer / STEER_MAX; // Normalize + } else { + int *action_array = (int *)env->actions; + action_array[i] = joint_action; + + // printf("Timestep %d, Agent %d: VALID - Accel: %.3f, Steer: %.3f, Discrete: %d\n", t, i, + // continuous_accel, continuous_steer, joint_action); + } + } else { + // Invalid action: store -1.0 as placeholder + int continuous_offset = t * env->active_agent_count * 2 + i * 2; + expert_actions_continuous_out[continuous_offset] = -1.0f; + expert_actions_continuous_out[continuous_offset + 1] = -1.0f; + + int discrete_offset = t * env->active_agent_count + i; + expert_actions_discrete_out[discrete_offset] = -1.0f; + + // Apply "do nothing" action (zero acceleration and steering) + if (env->action_type == 1) { // continuous + float (*action_array_f)[2] = (float (*)[2])env->actions; + action_array_f[i][0] = 0.0f; // No acceleration + action_array_f[i][1] = 0.0f; // No steering + } else { // discrete + int *action_array = (int *)env->actions; + // Apply "do nothing" action - middle acceleration (index 10) and straight steering (index 15) + int do_nothing_accel_idx = NUM_ACCEL_BINS / 2; // Middle of range (should be 0 accel) + int do_nothing_steer_idx = NUM_STEER_BINS / 2; // Middle of range (should be 0 steer) + int do_nothing_action = do_nothing_accel_idx * NUM_STEER_BINS + do_nothing_steer_idx; + action_array[i] = do_nothing_action; + } + } + } + + // Step environment to get next observations + if (t < TRAJECTORY_LENGTH - 1) { + c_step(env); + } + } + + // Restore original state + env->timestep = original_timestep; + c_reset(env); +} + typedef struct Client Client; struct Client { float width; @@ -2828,9 +3150,8 @@ void c_render(Drive *env) { int action_val = action_array[env->human_agent_idx]; if (env->dynamics_model == CLASSIC) { - int num_steer = 13; - int accel_idx = action_val / num_steer; - int steer_idx = action_val % num_steer; + int accel_idx = action_val / NUM_STEER_BINS; + int steer_idx = action_val % NUM_STEER_BINS; float accel_value = ACCELERATION_VALUES[accel_idx]; float steer_value = STEERING_VALUES[steer_idx]; diff --git a/pufferlib/ocean/drive/drive.py b/pufferlib/ocean/drive/drive.py index e85eb7322..e39f37376 100644 --- a/pufferlib/ocean/drive/drive.py +++ b/pufferlib/ocean/drive/drive.py @@ -3,6 +3,8 @@ import json import struct import os +import math +import torch import pufferlib from pufferlib.ocean.drive import binding from multiprocessing import Pool, cpu_count @@ -10,6 +12,8 @@ class Drive(pufferlib.PufferEnv): + _human_data_prepped = False + def __init__( self, render_mode=None, @@ -21,6 +25,10 @@ def __init__( reward_offroad_collision=-0.1, reward_goal=1.0, reward_goal_post_respawn=0.5, + use_guided_autonomy=0, + guidance_speed_weight=0.0, + guidance_heading_weight=0.0, + waypoint_reach_threshold=2.0, goal_behavior=0, goal_target_distance=10.0, goal_radius=2.0, @@ -38,11 +46,17 @@ def __init__( max_controlled_agents=-1, buf=None, seed=1, + bptt_horizon=32, + human_data_dir="pufferlib/resources/drive/human_demonstrations", + 
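+        # Human demonstration settings: each expert sequence covers bptt_horizon consecutive steps.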
max_expert_sequences=128, + prep_human_data=True, init_steps=0, init_mode="create_all_valid", control_mode="control_vehicles", map_dir="resources/drive/binaries/training", sequential_map_sampling=False, + ini_file_path="pufferlib/config/ocean/drive.ini", + save_data_to_disk=True, ): # env self.dt = dt @@ -59,11 +73,17 @@ def __init__( self.goal_target_distance = goal_target_distance self.collision_behavior = collision_behavior self.offroad_behavior = offroad_behavior + self.use_guided_autonomy = use_guided_autonomy + self.guidance_speed_weight = guidance_speed_weight + self.guidance_heading_weight = guidance_heading_weight + self.waypoint_reach_threshold = waypoint_reach_threshold self.human_agent_idx = human_agent_idx self.episode_length = episode_length self.termination_mode = termination_mode self.resample_frequency = resample_frequency self.dynamics_model = dynamics_model + self.ini_file_path = ini_file_path + self.save_data_to_disk = save_data_to_disk # Observation space calculation self.ego_features = {"classic": binding.EGO_FEATURES_CLASSIC, "jerk": binding.EGO_FEATURES_JERK}.get( @@ -87,6 +107,8 @@ def __init__( self.init_steps = init_steps self.init_mode_str = init_mode self.control_mode_str = control_mode + self.prep_human_data = prep_human_data + self.max_expert_sequences = int(max_expert_sequences) self.map_dir = map_dir if self.control_mode_str == "control_vehicles": @@ -113,7 +135,8 @@ def __init__( if action_type == "discrete": if dynamics_model == "classic": # Joint action space (assume dependence) - self.single_action_space = gymnasium.spaces.MultiDiscrete([7 * 13]) + self.joint_action_space_size = binding.NUM_ACCEL_BINS * binding.NUM_STEER_BINS + self.single_action_space = gymnasium.spaces.MultiDiscrete([self.joint_action_space_size]) # Multi discrete (assume independence) # self.single_action_space = gymnasium.spaces.MultiDiscrete([7, 13]) elif dynamics_model == "jerk": @@ -180,6 +203,10 @@ def __init__( reward_offroad_collision=reward_offroad_collision, reward_goal=reward_goal, reward_goal_post_respawn=reward_goal_post_respawn, + use_guided_autonomy=use_guided_autonomy, + guidance_speed_weight=guidance_speed_weight, + guidance_heading_weight=guidance_heading_weight, + waypoint_reach_threshold=waypoint_reach_threshold, goal_radius=goal_radius, goal_speed=goal_speed, goal_behavior=self.goal_behavior, @@ -192,7 +219,7 @@ def __init__( max_controlled_agents=self.max_controlled_agents, map_id=map_ids[i], max_agents=nxt - cur, - ini_file="pufferlib/config/ocean/drive.ini", + ini_file=self.ini_file_path, init_steps=init_steps, init_mode=self.init_mode, control_mode=self.control_mode, @@ -202,13 +229,40 @@ def __init__( self.c_envs = binding.vectorize(*env_ids) + # Human data preparation + self.bptt_horizon = bptt_horizon + self.human_data_dir = human_data_dir + self.save_data_to_disk = save_data_to_disk + + self.expert_data_metrics = {} + + os.makedirs(self.human_data_dir, exist_ok=True) + + if self.prep_human_data and not Drive._human_data_prepped: + self.expert_data_metrics = self._prep_human_data(bptt_horizon) + Drive._human_data_prepped = True + elif self.prep_human_data: + if self.save_data_to_disk and os.path.exists( + os.path.join(self.human_data_dir, f"expert_actions_discrete_h{bptt_horizon}.pt") + ): + discrete_actions = torch.load( + os.path.join(self.human_data_dir, f"expert_actions_discrete_h{bptt_horizon}.pt"), + map_location="cpu", + weights_only=False, + ) + self._cache_size = len(discrete_actions) + else: + self._cache_size = 0 + def reset(self, seed=0): 
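+ """Reset all environments; clears the tick counter and truncation flags."""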
binding.vec_reset(self.c_envs, seed) self.tick = 0 + self.truncations[:] = 0 return self.observations, [] def step(self, actions): self.terminals[:] = 0 + self.truncations[:] = 0 self.actions[:] = actions binding.vec_step(self.c_envs) self.tick += 1 @@ -255,6 +309,10 @@ def step(self, actions): reward_offroad_collision=self.reward_offroad_collision, reward_goal=self.reward_goal, reward_goal_post_respawn=self.reward_goal_post_respawn, + use_guided_autonomy=self.use_guided_autonomy, + guidance_speed_weight=self.guidance_speed_weight, + guidance_heading_weight=self.guidance_heading_weight, + waypoint_reach_threshold=self.waypoint_reach_threshold, goal_radius=self.goal_radius, goal_behavior=self.goal_behavior, goal_target_distance=self.goal_target_distance, @@ -266,7 +324,7 @@ def step(self, actions): max_controlled_agents=self.max_controlled_agents, map_id=map_ids[i], max_agents=nxt - cur, - ini_file="pufferlib/config/ocean/drive.ini", + ini_file=self.ini_file_path, init_steps=self.init_steps, init_mode=self.init_mode, control_mode=self.control_mode, @@ -276,7 +334,18 @@ def step(self, actions): self.c_envs = binding.vectorize(*env_ids) binding.vec_reset(self.c_envs, seed) - self.terminals[:] = 1 + # Map resampling is an external reset boundary (dataset/map switch). Treat as truncation. + self.truncations[:] = 1 + + # Resample human data if needed and capture metrics + if self.prep_human_data and hasattr(self, "_needs_resampling") and self._needs_resampling: + resample_metrics = self.resample_human_data() + # Add resampling metrics to info for logging + if resample_metrics: + # Prefix with "resampled_" to distinguish from initial metrics + resample_metrics = {f"resampled_{k}": v for k, v in resample_metrics.items()} + info.append(resample_metrics) + return (self.observations, self.rewards, self.terminals, self.truncations, info) def get_global_agent_state(self): @@ -347,6 +416,185 @@ def get_ground_truth_trajectories(self): return trajectories + def resample_human_data(self): + """Resample human data when needed (called during environment resampling). + + Returns: + dict: Metrics about the expert data collection for logging + """ + if not hasattr(self, "_needs_resampling") or not self._needs_resampling: + return {} + + print( + f"Resampling human data (max_expert_sequences={self.max_expert_sequences} > " + f"available={self._total_available_sequences})..." + ) + + # Re-prepare human data with the new environment state + return self._prep_human_data(self.bptt_horizon) + + def _prep_human_data(self, bptt_horizon=32): + """Collect and save expert trajectories with bptt_horizon length sequences. + + Returns: + dict: Metrics about the expert data collection for logging + """ + trajectory_length = 91 + + if self.dynamics_model == "jerk": + raise NotImplementedError( + "Expert data collection is not yet implemented for jerk dynamics model. " + "Only classic dynamics model is currently supported." + ) + + # Collect human trajectories. 
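+ # Buffers filled in-place by binding.vec_collect_expert_data:
+ #   expert_actions_discrete:   (T=91, N, 1) joint index = accel_idx * NUM_STEER_BINS + steer_idx
+ #   expert_actions_continuous: (T=91, N, 2) raw (acceleration, steering)
+ #   expert_observations_full:  (T=91, N, num_obs)
+ # Timesteps with no valid expert action are marked -1.0 and excluded from the windows below.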
+ self.expert_actions_discrete = np.zeros((trajectory_length, self.num_agents, 1), dtype=np.float32) + self.expert_actions_continuous = np.zeros((trajectory_length, self.num_agents, 2), dtype=np.float32) + self.expert_observations_full = np.zeros((trajectory_length, self.num_agents, self.num_obs), dtype=np.float32) + + binding.vec_collect_expert_data( + self.c_envs, self.expert_actions_discrete, self.expert_actions_continuous, self.expert_observations_full + ) + + # Extract all valid sequences of length bptt_horizon from all trajectories + discrete_sequences_list = [] + continuous_sequences_list = [] + obs_sequences_list = [] + + # Track metrics + unique_agents = set() + total_sequences_extracted = 0 + + for agent_idx in range(self.num_agents): + # Extract all valid windows of length bptt_horizon for this agent + for start_t in range(trajectory_length - bptt_horizon + 1): + end_t = start_t + bptt_horizon + + # Check if this window is entirely valid (no -1 values) + window_valid = np.all(self.expert_actions_discrete[start_t:end_t, agent_idx, 0] != -1.0) + + if window_valid: + discrete_sequences_list.append(self.expert_actions_discrete[start_t:end_t, agent_idx, :].copy()) + continuous_sequences_list.append(self.expert_actions_continuous[start_t:end_t, agent_idx, :].copy()) + obs_sequences_list.append(self.expert_observations_full[start_t:end_t, agent_idx, :].copy()) + unique_agents.add(agent_idx) + total_sequences_extracted += 1 + + # Convert lists to arrays + if len(discrete_sequences_list) == 0: + raise ValueError("No valid expert sequences found!") + + all_discrete = np.stack(discrete_sequences_list, axis=0) + all_continuous = np.stack(continuous_sequences_list, axis=0) + all_obs = np.stack(obs_sequences_list, axis=0) + + # Sample sequences (with replacement if max_expert_sequences > available) + num_sequences = self.max_expert_sequences + needs_resampling = num_sequences > len(discrete_sequences_list) + + # Randomly sample indices (with replacement if necessary) + sampled_indices = np.random.choice(len(discrete_sequences_list), size=num_sequences, replace=needs_resampling) + + discrete_sequences = all_discrete[sampled_indices] + continuous_sequences = all_continuous[sampled_indices] + obs_sequences = all_obs[sampled_indices] + + self._cache_size = num_sequences + self._total_available_sequences = len(discrete_sequences_list) + self._needs_resampling = needs_resampling + + # Compute statistics + avg_sequences_per_agent = total_sequences_extracted / len(unique_agents) if len(unique_agents) > 0 else 0 + coverage_pct = 100 * len(unique_agents) / self.num_agents if self.num_agents > 0 else 0 + utilization_pct = ( + 100 * min(num_sequences, total_sequences_extracted) / total_sequences_extracted + if total_sequences_extracted > 0 + else 0 + ) + + data_metrics = { + "expert_data/total_agents": self.num_agents, + "expert_data/unique_agents_with_data": len(unique_agents), + "expert_data/agent_coverage_pct": coverage_pct, + "expert_data/total_sequences_extracted": total_sequences_extracted, + "expert_data/avg_sequences_per_agent": avg_sequences_per_agent, + "expert_data/sequences_stored": num_sequences, + "expert_data/sequence_length": bptt_horizon, + "expert_data/storage_utilization_pct": utilization_pct, + "expert_data/resampling_enabled": int(needs_resampling), + "expert_data/resampling_factor": num_sequences / len(discrete_sequences_list) if needs_resampling else 1.0, + } + + if self.save_data_to_disk: + print( + f"Saving {num_sequences} expert sequences of length {bptt_horizon} to disk at 
{self.human_data_dir}..." + ) + torch.save( + torch.from_numpy(discrete_sequences), + os.path.join(self.human_data_dir, f"expert_actions_discrete_h{bptt_horizon}.pt"), + ) + torch.save( + torch.from_numpy(continuous_sequences), + os.path.join(self.human_data_dir, f"expert_actions_continuous_h{bptt_horizon}.pt"), + ) + torch.save( + torch.from_numpy(obs_sequences), + os.path.join(self.human_data_dir, f"expert_observations_h{bptt_horizon}.pt"), + ) + + return data_metrics + + def sample_expert_data(self, n_samples=512, return_both=False): + """Sample a random batch of human (expert) sequences from disk. + + Args: + n_samples: Number of sequences to randomly sample + return_both: If True, return both continuous and discrete actions as a tuple. + If False, return only the action type matching the environment's action space. + + Returns: + If return_both=True: + (discrete_actions, continuous_actions, observations) + If return_both=False: + (actions, observations) where actions match the env's action type + + Note: + For classic discrete actions, the shape is (n_samples, bptt_horizon, 1) for joint actions. + For continuous actions, the shape is always (n_samples, bptt_horizon, 2). + Jerk dynamics model is not currently supported for expert data. + """ + if not self.save_data_to_disk: + raise ValueError("Expert data was not saved to disk. Cannot sample expert data.") + + discrete_path = os.path.join(self.human_data_dir, f"expert_actions_discrete_h{self.bptt_horizon}.pt") + continuous_path = os.path.join(self.human_data_dir, f"expert_actions_continuous_h{self.bptt_horizon}.pt") + observations_path = os.path.join(self.human_data_dir, f"expert_observations_h{self.bptt_horizon}.pt") + + observations_full = torch.load(observations_path, map_location="cpu", weights_only=False) + + # breakpoint() + + # Sample indices + samples = min(n_samples, self._cache_size) + indices = torch.randint(0, self._cache_size, (samples,)) + + # print(f'Sampling {samples} expert sequences from {self._cache_size} available sequences.') + # print(indices) + + sampled_obs = observations_full[indices] + + if return_both: + discrete_actions = torch.load(discrete_path, map_location="cpu", weights_only=False) + continuous_actions = torch.load(continuous_path, map_location="cpu", weights_only=False) + return discrete_actions[indices], continuous_actions[indices], sampled_obs + else: + # Return only the action type matching the environment + if self._action_type_flag == 1: # continuous + actions = torch.load(continuous_path, map_location="cpu", weights_only=False) + else: # discrete + actions = torch.load(discrete_path, map_location="cpu", weights_only=False) + return actions[indices], sampled_obs + def get_road_edge_polylines(self): """Get road edge polylines for all scenarios. 
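For orientation, a minimal consumer-side sketch of the expert-data cache described above; this is not part of the patch, and it assumes a puffer_drive config with prep_human_data and save_data_to_disk enabled. It uses only calls that appear in this diff (load_config/load_env and driver_env.sample_expert_data).

from pufferlib.pufferl import load_config, load_env

# Build the vectorized env; _prep_human_data runs in __init__ and writes the
# per-horizon .pt caches under human_data_dir.
args = load_config("puffer_drive")
vecenv = load_env("puffer_drive", args)

# Sample expert sequences matching the env's action type. For classic discrete
# dynamics: actions (n, bptt_horizon, 1), observations (n, bptt_horizon, obs_dim).
actions, observations = vecenv.driver_env.sample_expert_data(n_samples=256)

# Flatten time into the batch for a simple feed-forward behavioral-cloning update.
obs_flat = observations.reshape(-1, observations.shape[-1])
act_flat = actions.reshape(-1).long()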
@@ -380,6 +628,92 @@ def close(self): binding.vec_close(self.c_envs) +def infer_human_actions(obj): + """Infer expert actions (steer, accel) using inverse bicycle model.""" + trajectory_length = 91 + + # Initialize expert actions arrays + expert_acceleration = [] + expert_steering = [] + + positions = obj.get("position", []) + velocities = obj.get("velocity", []) + headings = obj.get("heading", []) + valids = obj.get("valid", []) + + if len(positions) < 2 or len(velocities) < 2 or len(headings) < 2: + return [-1.0] * trajectory_length, [-1.0] * trajectory_length + + dt = 0.1 # Discretization + vehicle_length = obj.get("length", 4.5) # Default vehicle length + + for t in range(trajectory_length): + # Check if we have enough data and both current and next timesteps are valid + if ( + t >= len(positions) + or t >= len(velocities) + or t >= len(headings) + or t >= len(valids) + or not valids[t] + or t + 1 >= len(positions) + or t + 1 >= len(velocities) + or t + 1 >= len(headings) + or t + 1 >= len(valids) + or not valids[t + 1] + ): + expert_acceleration.append(-1.0) + expert_steering.append(-1.0) + continue + + # Current state + vel_t = velocities[t] + heading_t = headings[t] + speed_t = math.sqrt(vel_t.get("x", 0.0) ** 2 + vel_t.get("y", 0.0) ** 2) + + # Next state + vel_t1 = velocities[t + 1] + heading_t1 = headings[t + 1] + speed_t1 = math.sqrt(vel_t1.get("x", 0.0) ** 2 + vel_t1.get("y", 0.0) ** 2) + + # Compute acceleration + acceleration = (speed_t1 - speed_t) / dt + + # Normalize heading difference + heading_diff = heading_t1 - heading_t + while heading_diff > math.pi: + heading_diff -= 2 * math.pi + while heading_diff < -math.pi: + heading_diff += 2 * math.pi + + # Compute yaw rate + yaw_rate = heading_diff / dt + + # Compute steering using inverse bicycle model + steering = 0.0 + if speed_t > 0.1: # Avoid division by zero + # From bicycle model: yaw_rate = (v * cos(beta) * tan(delta)) / L + # Assuming beta ≈ 0: yaw_rate ≈ (v * tan(delta)) / L + tan_steering = (yaw_rate * vehicle_length) / speed_t + # Clamp tan_steering to avoid extreme values + tan_steering = max(-10.0, min(10.0, tan_steering)) + steering = math.atan(tan_steering) + + # Clamp values to reasonable ranges + acceleration = max(-10.0, min(10.0, acceleration)) + steering = max(-2.0, min(2.0, steering)) + + expert_acceleration.append(acceleration) + expert_steering.append(steering) + + # Ensure arrays are exactly trajectory_length (should already be, but just in case) + assert len(expert_acceleration) == trajectory_length, ( + f"Expected {trajectory_length}, got {len(expert_acceleration)}" + ) + assert len(expert_steering) == trajectory_length, f"Expected {trajectory_length}, got {len(expert_steering)}" + + return expert_acceleration, expert_steering + + def calculate_area(p1, p2, p3): # Calculate the area of the triangle using the determinant method return 0.5 * abs((p1["x"] - p3["x"]) * (p2["y"] - p1["y"]) - (p1["x"] - p2["x"]) * (p3["y"] - p1["y"])) @@ -507,6 +841,20 @@ def save_map_binary(map_data, output_file, unique_map_id): ) ) + # Infer and write human actions + if obj_type in [1, 2, 3]: # For vehicles, pedestrians, cyclists + human_accel, human_steering = infer_human_actions(obj) + + # print(f"Human Acceleration: {human_accel}") + # print(f"Human Steering: {human_steering}") + + f.write(struct.pack(f"{trajectory_length}f", *human_accel)) + f.write(struct.pack(f"{trajectory_length}f", *human_steering)) + else: + # Write zeros for non-vehicles + f.write(struct.pack(f"{trajectory_length}f", *[0.0] * 
trajectory_length)) + f.write(struct.pack(f"{trajectory_length}f", *[0.0] * trajectory_length)) + # Write scalar fields f.write(struct.pack("f", float(obj.get("width", 0.0)))) f.write(struct.pack("f", float(obj.get("length", 0.0)))) @@ -669,10 +1017,71 @@ def test_performance(timeout=10, atn_cache=1024, num_agents=1024): env.close() +def test_human_demonstrations(): + import matplotlib.pyplot as plt + from pufferlib.ocean.benchmark.evaluator import WOSACEvaluator + from pufferlib.pufferl import load_config, load_env + + args = load_config("puffer_drive") + backend = args["eval"]["backend"] + + args["env"]["map_dir"] = args["eval"]["map_dir"] + args["env"]["num_maps"] = args["eval"]["wosac_num_maps"] + args["env"]["sequential_map_sampling"] = True + dataset_name = args["env"]["map_dir"].split("/")[-1] + + args["vec"] = dict(backend=backend, num_envs=1) + args["env"]["init_mode"] = args["eval"]["wosac_init_mode"] + args["env"]["control_mode"] = args["eval"]["wosac_control_mode"] + args["env"]["init_steps"] = args["eval"]["wosac_init_steps"] + args["env"]["goal_behavior"] = args["eval"]["wosac_goal_behavior"] + args["env"]["goal_radius"] = args["eval"]["wosac_goal_radius"] + # args["env"]["num_agents"] = args["eval"]["wosac_num_agents"] + + vecenv = load_env("puffer_drive", args) + + exp_actions = np.zeros((91, vecenv.num_agents, 1), dtype=np.float32) + exp_actions_cont = np.zeros((91, vecenv.num_agents, 2), dtype=np.float32) + exp_observations_full = np.zeros((91, vecenv.num_agents, vecenv.num_obs), dtype=np.float32) + binding.vec_collect_expert_data(vecenv.c_envs, exp_actions, exp_actions_cont, exp_observations_full) + + evaluator = WOSACEvaluator(args) + + gt_trajectories = evaluator.collect_ground_truth_trajectories(vecenv) + + # Step with inferred human actions + simulated_trajectories = evaluator.collect_simulated_trajectories( + args=args, + puffer_env=vecenv, + policy=None, + actions=exp_actions, + ) + + evaluator._quick_sanity_check(gt_trajectories, simulated_trajectories) + + # Analyze and compute metrics + agent_state = vecenv.get_global_agent_state() + road_edge_polylines = vecenv.get_road_edge_polylines() + results = evaluator.compute_metrics( + gt_trajectories, + simulated_trajectories, + agent_state, + road_edge_polylines, + True, + ) + + print("\n", results["realism_meta_score"]) + + from pprint import pprint + + pprint(results) + + if __name__ == "__main__": + # test_human_demonstrations() # test_performance() # Process the train dataset - process_all_maps(data_folder="data/processed/training") + process_all_maps(data_folder="data/processed/wosac/validation_interactive_small") # Process the validation/test dataset # process_all_maps(data_folder="data/processed/validation") # # Process the validation_interactive dataset diff --git a/pufferlib/ocean/drive/drivenet.h b/pufferlib/ocean/drive/drivenet.h index 3082fa585..714cc0fc3 100644 --- a/pufferlib/ocean/drive/drivenet.h +++ b/pufferlib/ocean/drive/drivenet.h @@ -62,8 +62,8 @@ DriveNet *init_drivenet(Weights *weights, int num_agents, int dynamics_model) { int action_size, logit_sizes[2]; int action_dim; if (dynamics_model == CLASSIC) { - action_size = 7 * 13; // Joint action space - logit_sizes[0] = 7 * 13; + action_size = NUM_ACCEL_BINS * NUM_STEER_BINS; // Joint action space + logit_sizes[0] = NUM_ACCEL_BINS * NUM_STEER_BINS; action_dim = 1; } else { // JERK action_size = 4 * 3; // Joint action space (4 longitudinal × 3 lateral = 12) diff --git a/pufferlib/ocean/drive/visualize.c b/pufferlib/ocean/drive/visualize.c index 
de235b4fa..3a511a262 100644 --- a/pufferlib/ocean/drive/visualize.c +++ b/pufferlib/ocean/drive/visualize.c @@ -13,8 +13,13 @@ #include "drivenet.h" #include "libgen.h" #include "../env_config.h" + #define TRAJECTORY_LENGTH_DEFAULT 91 +#define ACTIONS_FROM_POLICY 0 +#define ACTIONS_INFERRED_FROM_HUMAN 1 +#define ACTIONS_RANDOM 2 + typedef struct { int pipefd[2]; pid_t pid; @@ -67,7 +72,7 @@ void CloseVideo(VideoRecorder *recorder) { void renderTopDownView(Drive *env, Client *client, int map_height, int obs, int lasers, int trajectories, int frame_count, float *path, int show_human_logs, int show_grid, int img_width, int img_height, - int zoom_in) { + int zoom_in, int control) { BeginDrawing(); // Top-down orthographic camera @@ -133,6 +138,26 @@ void renderTopDownView(Drive *env, Client *client, int map_height, int obs, int // Draw scene draw_scene(env, client, 1, obs, lasers, show_grid); EndMode3D(); + + // Draw control overlay text + const char *control_text; + Color text_color; + if (control == ACTIONS_INFERRED_FROM_HUMAN) { + control_text = "Control: Inferred human actions"; + text_color = LIGHTGREEN; + } else if (control == ACTIONS_RANDOM) { + control_text = "Control: Random actions"; + text_color = RED; + } else { + control_text = "Control: Policy actions"; + text_color = YELLOW; + } + + int text_width = MeasureText(control_text, 30); + int text_x = img_width - text_width - 20; + int text_y = 20; + + DrawText(control_text, text_x, text_y, 30, text_color); EndDrawing(); } @@ -189,9 +214,76 @@ static int make_gif_from_frames(const char *pattern, int fps, const char *palett return 0; } +static void set_actions(Drive *env, int timestep, int control, DriveNet *net) { + // Helper function to set actions based on control type + if (control == ACTIONS_INFERRED_FROM_HUMAN) { + // Use expert actions from trajectory data + for (int j = 0; j < env->active_agent_count; j++) { + int agent_idx = env->active_agent_indices[j]; + Entity *agent = &env->entities[agent_idx]; + + // Check bounds + if (timestep < agent->array_size && agent->expert_accel && agent->expert_steering) { + + // Check if expert actions are invalid (-1 means no action available) + if (agent->expert_accel[timestep] == -1.0f && agent->expert_steering[timestep] == -1.0f) { + // Do nothing - apply zero action + continue; + } + if (env->action_type == 1) { // continuous + float (*action_array_f)[2] = (float (*)[2])env->actions; + action_array_f[j][0] = agent->expert_accel[timestep] / ACCEL_MAX; + action_array_f[j][1] = agent->expert_steering[timestep] / STEER_MAX; + } else { // discrete + // Discretize to nearest action + int best_accel_idx = 0; + float min_accel_diff = fabsf(agent->expert_accel[timestep] - ACCELERATION_VALUES[0]); + for (int k = 1; k < NUM_ACCEL_BINS; k++) { + float diff = fabsf(agent->expert_accel[timestep] - ACCELERATION_VALUES[k]); + if (diff < min_accel_diff) { + min_accel_diff = diff; + best_accel_idx = k; + } + } + + int best_steer_idx = 0; + float min_steer_diff = fabsf(agent->expert_steering[timestep] - STEERING_VALUES[0]); + for (int k = 1; k < NUM_STEER_BINS; k++) { + float diff = fabsf(agent->expert_steering[timestep] - STEERING_VALUES[k]); + if (diff < min_steer_diff) { + min_steer_diff = diff; + best_steer_idx = k; + } + } + + int *action_array = (int *)env->actions; + action_array[j] = best_accel_idx * NUM_STEER_BINS + best_steer_idx; + } + } + } + } else if (control == ACTIONS_RANDOM) { + // Use random actions + for (int j = 0; j < env->active_agent_count; j++) { + if (env->action_type == 1) { // 
continuous + float (*action_array_f)[2] = (float (*)[2])env->actions; + // Random values in range [-1, 1] + action_array_f[j][0] = ((float)rand() / (float)RAND_MAX) * 2.0f - 1.0f; + action_array_f[j][1] = ((float)rand() / (float)RAND_MAX) * 2.0f - 1.0f; + } else { // discrete + int *action_array = (int *)env->actions; + // Random action index + action_array[j] = rand() % (NUM_ACCEL_BINS * NUM_STEER_BINS); + } + } + } else { + // Use policy by default (ACTIONS_FROM_POLICY) + forward(net, env->observations, (int *)env->actions); + } +} + int eval_gif(const char *map_name, const char *policy_name, int show_grid, int obs_only, int lasers, int show_human_logs, int frame_skip, const char *view_mode, const char *output_topdown, - const char *output_agent, int num_maps, int zoom_in) { + const char *output_agent, int num_maps, int zoom_in, int control) { // Parse configuration from INI file env_init_config conf = {0}; @@ -294,9 +386,14 @@ int eval_gif(const char *map_name, const char *policy_name, int show_grid, int o client->cyclist = LoadModel("resources/drive/cyclist.glb"); client->pedestrian = LoadModel("resources/drive/pedestrian.glb"); - Weights *weights = load_weights(policy_name); - printf("Active agents in map: %d\n", env.active_agent_count); - DriveNet *net = init_drivenet(weights, env.active_agent_count, env.dynamics_model); + Weights *weights = NULL; + DriveNet *net = NULL; + + if (control == ACTIONS_FROM_POLICY) { + weights = load_weights(policy_name); + printf("Active agents in map: %d\n", env.active_agent_count); + net = init_drivenet(weights, env.active_agent_count, env.dynamics_model); + } int frame_count = env.episode_length > 0 ? env.episode_length : TRAJECTORY_LENGTH_DEFAULT; char filename_topdown[256]; @@ -355,11 +452,11 @@ int eval_gif(const char *map_name, const char *policy_name, int show_grid, int o for (int i = 0; i < frame_count; i++) { if (i % frame_skip == 0) { renderTopDownView(&env, client, map_height, 0, 0, 0, frame_count, NULL, show_human_logs, show_grid, - img_width, img_height, zoom_in); + img_width, img_height, zoom_in, control); WriteFrame(&topdown_recorder, img_width, img_height); rendered_frames++; } - forward(net, env.observations, (int *)env.actions); + set_actions(&env, i, control, net); c_step(&env); } } @@ -377,7 +474,7 @@ int eval_gif(const char *map_name, const char *policy_name, int show_grid, int o WriteFrame(&agent_recorder, img_width, img_height); rendered_frames++; } - forward(net, env.observations, (int *)env.actions); + set_actions(&env, i, control, net); c_step(&env); } } @@ -399,8 +496,14 @@ int eval_gif(const char *map_name, const char *policy_name, int show_grid, int o free(client); free_allocated(&env); - free_drivenet(net); - free(weights); + + if (net != NULL) { + free_drivenet(net); + } + if (weights != NULL) { + free(weights); + } + return 0; } @@ -409,17 +512,18 @@ int main(int argc, char *argv[]) { int show_grid = 0; int obs_only = 0; int lasers = 0; - int show_human_logs = 0; + int show_human_logs = 1; int frame_skip = 1; - int zoom_in = 0; - const char *view_mode = "both"; + int zoom_in = 1; + const char *view_mode = "topdown"; // File paths and num_maps (not in [env] section) const char *map_name = NULL; const char *policy_name = "resources/drive/puffer_drive_weights.bin"; const char *output_topdown = NULL; const char *output_agent = NULL; - int num_maps = 1; + int num_maps = 100; + int control = ACTIONS_FROM_POLICY; // ACTIONS_INFERRED_FROM_HUMAN; // Parse command line arguments for (int i = 1; i < argc; i++) { @@ -429,7 +533,7 @@ 
int main(int argc, char *argv[]) { obs_only = 1; } else if (strcmp(argv[i], "--lasers") == 0) { lasers = 1; - } else if (strcmp(argv[i], "--log-trajectories") == 0) { + } else if (strcmp(argv[i], "--show-human-logs") == 0) { show_human_logs = 1; } else if (strcmp(argv[i], "--frame-skip") == 0) { if (i + 1 < argc) { @@ -485,10 +589,28 @@ int main(int argc, char *argv[]) { num_maps = atoi(argv[i + 1]); i++; } + } else if (strcmp(argv[i], "--control") == 0) { + if (i + 1 < argc) { + const char *control_str = argv[i + 1]; + i++; + if (strcmp(control_str, "policy") == 0) { + control = ACTIONS_FROM_POLICY; + } else if (strcmp(control_str, "inferred_human") == 0) { + control = ACTIONS_INFERRED_FROM_HUMAN; + } else if (strcmp(control_str, "random") == 0) { + control = ACTIONS_RANDOM; + } else { + fprintf(stderr, "Error: --control must be 'policy' or 'inferred_human'\n"); + return 1; + } + } else { + fprintf(stderr, "Error: --control option requires a value (policy/inferred_human)\n"); + return 1; + } } } eval_gif(map_name, policy_name, show_grid, obs_only, lasers, show_human_logs, frame_skip, view_mode, output_topdown, - output_agent, num_maps, zoom_in); + output_agent, num_maps, zoom_in, control); return 0; } diff --git a/pufferlib/ocean/env_binding.h b/pufferlib/ocean/env_binding.h index c6b766776..50a7d89c5 100644 --- a/pufferlib/ocean/env_binding.h +++ b/pufferlib/ocean/env_binding.h @@ -128,7 +128,7 @@ static PyObject *env_init(PyObject *self, PyObject *args, PyObject *kwargs) { PyErr_SetString(PyExc_ValueError, "Truncations must be 1D"); return NULL; } - // env->truncations = PyArray_DATA(truncations); + env->truncations = PyArray_DATA(truncations); PyObject *seed_arg = PyTuple_GetItem(args, 5); if (!PyObject_TypeCheck(seed_arg, &PyLong_Type)) { @@ -412,7 +412,7 @@ static PyObject *vec_init(PyObject *self, PyObject *args, PyObject *kwargs) { env->actions = (void *)((char *)PyArray_DATA(actions) + i * PyArray_STRIDE(actions, 0)); env->rewards = (void *)((char *)PyArray_DATA(rewards) + i * PyArray_STRIDE(rewards, 0)); env->terminals = (void *)((char *)PyArray_DATA(terminals) + i * PyArray_STRIDE(terminals, 0)); - // env->truncations = (void*)((char*)PyArray_DATA(truncations) + i*PyArray_STRIDE(truncations, 0)); + env->truncations = (void *)((char *)PyArray_DATA(truncations) + i * PyArray_STRIDE(truncations, 0)); // Assumes each process has the same number of environments int env_seed = i + seed * vec->num_envs; @@ -951,6 +951,111 @@ static char *unpack_str(PyObject *kwargs, char *key) { return ret; } +static PyObject *vec_collect_expert_data(PyObject *self, PyObject *args) { + if (PyTuple_Size(args) != 4) { + PyErr_SetString(PyExc_TypeError, "collect_expert_data requires 4 arguments"); + return NULL; + } + + VecEnv *vec = unpack_vecenv(args); + if (!vec) { + return NULL; + } + + // Argument 1: discrete actions + PyObject *discrete_actions_arg = PyTuple_GetItem(args, 1); + if (!PyObject_TypeCheck(discrete_actions_arg, &PyArray_Type)) { + PyErr_SetString(PyExc_TypeError, "expert_actions_discrete must be a NumPy array"); + return NULL; + } + PyArrayObject *expert_actions_discrete = (PyArrayObject *)discrete_actions_arg; + + // Argument 2: continuous actions + PyObject *continuous_actions_arg = PyTuple_GetItem(args, 2); + if (!PyObject_TypeCheck(continuous_actions_arg, &PyArray_Type)) { + PyErr_SetString(PyExc_TypeError, "expert_actions_continuous must be a NumPy array"); + return NULL; + } + PyArrayObject *expert_actions_continuous = (PyArrayObject *)continuous_actions_arg; + + // Argument 3: 
observations + PyObject *obs_arg = PyTuple_GetItem(args, 3); + if (!PyObject_TypeCheck(obs_arg, &PyArray_Type)) { + PyErr_SetString(PyExc_TypeError, "expert_observations must be a NumPy array"); + return NULL; + } + PyArrayObject *expert_observations = (PyArrayObject *)obs_arg; + + // Check array shapes + if (PyArray_NDIM(expert_actions_discrete) != 3) { + PyErr_SetString(PyExc_ValueError, "expert_actions_discrete must be 3D (trajectory_length, num_agents, 2)"); + return NULL; + } + if (PyArray_NDIM(expert_actions_continuous) != 3) { + PyErr_SetString(PyExc_ValueError, "expert_actions_continuous must be 3D (trajectory_length, num_agents, 2)"); + return NULL; + } + if (PyArray_NDIM(expert_observations) != 3) { + PyErr_SetString(PyExc_ValueError, "expert_observations must be 3D (trajectory_length, num_agents, obs_dim)"); + return NULL; + } + + int trajectory_length = PyArray_DIM(expert_actions_discrete, 0); + int max_obs = 7 + 7 * (MAX_AGENTS - 1) + 7 * MAX_ROAD_SEGMENT_OBSERVATIONS; + + // Process each environment + int agent_offset = 0; + for (int i = 0; i < vec->num_envs; i++) { + Env *env = vec->envs[i]; + int num_agents = env->active_agent_count; + + // Allocate temporary buffers for this environment + float *env_actions_discrete = (float *)malloc(trajectory_length * num_agents * 1 * sizeof(float)); + float *env_actions_continuous = (float *)malloc(trajectory_length * num_agents * 2 * sizeof(float)); + float *env_obs = (float *)malloc(trajectory_length * num_agents * max_obs * sizeof(float)); + + if (!env_actions_discrete || !env_actions_continuous || !env_obs) { + PyErr_SetString(PyExc_MemoryError, "Failed to allocate temporary buffers"); + free(env_actions_discrete); + free(env_actions_continuous); + free(env_obs); + return NULL; + } + + // Call C function with both action types + c_collect_expert_data(env, env_actions_discrete, env_actions_continuous, env_obs); + + // Copy results back to Python arrays + for (int t = 0; t < trajectory_length; t++) { + for (int a = 0; a < num_agents; a++) { + // Copy discrete actions + float *discrete_action_src = &env_actions_discrete[t * num_agents + a]; + float *discrete_action_dst = (float *)PyArray_GETPTR3(expert_actions_discrete, t, agent_offset + a, 0); + discrete_action_dst[0] = discrete_action_src[0]; + + // Copy continuous actions + float *continuous_action_src = &env_actions_continuous[t * num_agents * 2 + a * 2]; + float *continuous_action_dst = + (float *)PyArray_GETPTR3(expert_actions_continuous, t, agent_offset + a, 0); + continuous_action_dst[0] = continuous_action_src[0]; + continuous_action_dst[1] = continuous_action_src[1]; + + // Copy observations + float *obs_src = &env_obs[t * num_agents * max_obs + a * max_obs]; + float *obs_dst = (float *)PyArray_GETPTR3(expert_observations, t, agent_offset + a, 0); + memcpy(obs_dst, obs_src, max_obs * sizeof(float)); + } + } + + free(env_actions_discrete); + free(env_actions_continuous); + free(env_obs); + agent_offset += num_agents; + } + + Py_RETURN_NONE; +} + // Method table static PyMethodDef methods[] = { {"env_init", (PyCFunction)env_init, METH_VARARGS | METH_KEYWORDS, @@ -969,6 +1074,7 @@ static PyMethodDef methods[] = { {"vec_render", vec_render, METH_VARARGS, "Render the vector of environments"}, {"vec_close", vec_close, METH_VARARGS, "Close the vector of environments"}, {"shared", (PyCFunction)my_shared, METH_VARARGS | METH_KEYWORDS, "Shared state"}, + {"vec_collect_expert_data", vec_collect_expert_data, METH_VARARGS, "Collect expert actions and observations"}, 
{"get_global_agent_state", get_global_agent_state, METH_VARARGS, "Get global agent state"}, {"vec_get_global_agent_state", vec_get_global_agent_state, METH_VARARGS, "Get agent state from vectorized env"}, {"get_ground_truth_trajectories", get_ground_truth_trajectories, METH_VARARGS, "Get ground truth trajectories"}, @@ -993,6 +1099,8 @@ PyMODINIT_FUNC PyInit_binding(void) { } // Make constants accessible from Python + PyModule_AddIntConstant(m, "NUM_ACCEL_BINS", NUM_ACCEL_BINS); + PyModule_AddIntConstant(m, "NUM_STEER_BINS", NUM_STEER_BINS); PyModule_AddIntConstant(m, "MAX_ROAD_SEGMENT_OBSERVATIONS", MAX_ROAD_SEGMENT_OBSERVATIONS); PyModule_AddIntConstant(m, "MAX_AGENTS", MAX_AGENTS); PyModule_AddIntConstant(m, "TRAJECTORY_LENGTH", TRAJECTORY_LENGTH); diff --git a/pufferlib/ocean/env_config.h b/pufferlib/ocean/env_config.h index 71c19debb..6b7368e70 100644 --- a/pufferlib/ocean/env_config.h +++ b/pufferlib/ocean/env_config.h @@ -15,6 +15,10 @@ typedef struct { float reward_goal; float reward_goal_post_respawn; float reward_vehicle_collision_post_respawn; + int use_guided_autonomy; // Boolean: whether to calculate and add guided autonomy reward + float guidance_speed_weight; // Weight for speed deviation penalty + float guidance_heading_weight; // Weight for heading deviation penalty + float waypoint_reach_threshold; // Distance threshold for hitting waypoints float goal_radius; float goal_speed; int collision_behavior; @@ -68,6 +72,14 @@ static int handler(void *config, const char *section, const char *name, const ch env_config->reward_goal_post_respawn = atof(value); } else if (MATCH("env", "reward_vehicle_collision_post_respawn")) { env_config->reward_vehicle_collision_post_respawn = atof(value); + } else if (MATCH("env", "use_guided_autonomy")) { + env_config->use_guided_autonomy = atoi(value); + } else if (MATCH("env", "guidance_speed_weight")) { + env_config->guidance_speed_weight = atof(value); + } else if (MATCH("env", "guidance_heading_weight")) { + env_config->guidance_heading_weight = atof(value); + } else if (MATCH("env", "waypoint_reach_threshold")) { + env_config->waypoint_reach_threshold = atof(value); } else if (MATCH("env", "goal_radius")) { env_config->goal_radius = atof(value); } else if (MATCH("env", "goal_speed")) { diff --git a/pufferlib/ocean/torch.py b/pufferlib/ocean/torch.py index c7a6e6f04..98cc11a4e 100644 --- a/pufferlib/ocean/torch.py +++ b/pufferlib/ocean/torch.py @@ -8,7 +8,6 @@ from pufferlib.models import Default as Policy # noqa: F401 from pufferlib.models import Convolutional as Conv # noqa: F401 - Recurrent = pufferlib.models.LSTMWrapper diff --git a/pufferlib/pufferl.py b/pufferlib/pufferl.py index c797f3244..412828030 100644 --- a/pufferlib/pufferl.py +++ b/pufferlib/pufferl.py @@ -220,11 +220,18 @@ def __init__(self, config, vecenv, policy, logger=None): self.stats = defaultdict(list) self.last_stats = defaultdict(list) self.losses = {} + self.realism = {} # Dashboard self.model_size = sum(p.numel() for p in policy.parameters() if p.requires_grad) self.print_dashboard(clear=True) + if hasattr(vecenv, "driver_env") and hasattr(vecenv.driver_env, "expert_data_metrics"): + expert_metrics = vecenv.driver_env.expert_data_metrics + if expert_metrics: + # Log to wandb/neptune immediately at step 0 + self.logger.log(expert_metrics, step=0) + @property def uptime(self): return time.time() - self.start_time @@ -258,7 +265,6 @@ def evaluate(self): profile("eval_misc", epoch) env_id = slice(env_id[0], env_id[-1] + 1) - done_mask = d + t # TODO: Handle truncations 
separately self.global_step += int(mask.sum()) profile("eval_copy", epoch) @@ -266,12 +272,14 @@ def evaluate(self): o_device = o.to(device) # , non_blocking=True) r = torch.as_tensor(r).to(device) # , non_blocking=True) d = torch.as_tensor(d).to(device) # , non_blocking=True) + t = torch.as_tensor(t).to(device) # , non_blocking=True) + done_mask = (d + t).clamp(max=1) profile("eval_forward", epoch) with torch.no_grad(), self.amp_context: state = dict( reward=r, - done=d, + done=done_mask, env_id=env_id, mask=mask, ) @@ -301,8 +309,16 @@ def evaluate(self): self.actions[batch_rows, l] = action self.logprobs[batch_rows, l] = logprob + # Truncation bootstrap hack for auto-reset envs. + # Ideally we add `gamma * V(s_{t+1})` on truncation steps, but Drive resets in C so + # the value at index `l` is post-reset. We use `values[..., l-1]` as a heuristic + # proxy for the pre-reset terminal value (bootstrap term is not clipped). + if l > 0: + trunc_mask = (t > 0) & (d == 0) + r = r + trunc_mask.to(r.dtype) * config["gamma"] * self.values[batch_rows, l - 1] self.rewards[batch_rows, l] = r - self.terminals[batch_rows, l] = d.float() + self.terminals[batch_rows, l] = done_mask.float() + self.truncations[batch_rows, l] = t.float() self.values[batch_rows, l] = value.flatten() # Note: We are not yet handling masks in this version @@ -351,6 +367,7 @@ def train(self): a = config["prio_alpha"] clip_coef = config["clip_coef"] vf_clip = config["vf_clip_coef"] + human_clip = config["human_clip_coef"] anneal_beta = b0 + (1 - b0) * a * self.epoch / self.total_epochs self.ratio[:] = 1 @@ -413,6 +430,32 @@ def train(self): approx_kl = ((ratio - 1) - logratio).mean() clipfrac = ((ratio - 1.0).abs() > config["clip_coef"]).float().mean() + # Compute log likelihood loss of human actions under current policy. + # 1: Sample a batch of human actions and observations from dataset + # Shape: [n_sequences, bptt_horizon, feature_dim] + discrete_human_actions, continuous_human_actions, human_observations = ( + self.vecenv.driver_env.sample_expert_data(n_samples=config["human_sequences"], return_both=True) + ) + + use_continuous = self.vecenv.driver_env._action_type_flag == 1 + human_actions = continuous_human_actions if use_continuous else discrete_human_actions + human_actions = human_actions.reshape(-1, 1).to(device) + human_observations = human_observations.to(device) + + # 2: Compute the log-likelihood of human actions under the current policy, + # given the corresponding human observations. A higher likelihood indicates + # that the policy behaves more like a human under the same observations. + human_state = dict( + lstm_h=None, + lstm_c=None, + ) + + human_logits, _ = self.policy(human_observations, human_state) + + _, human_log_prob, human_entropy = pufferlib.pytorch.sample_logits( + logits=human_logits, action=human_actions + ) + adv = advantages[idx] adv = compute_puff_advantage( mb_values, @@ -439,9 +482,16 @@ def train(self): v_loss_clipped = (v_clipped - mb_returns) ** 2 v_loss = 0.5 * torch.max(v_loss_unclipped, v_loss_clipped).mean() + human_nll = human_log_prob.mean() + entropy_loss = entropy.mean() - loss = pg_loss + config["vf_coef"] * v_loss - config["ent_coef"] * entropy_loss + loss = ( + pg_loss + + config["vf_coef"] * v_loss + - config["ent_coef"] * entropy_loss + - config["human_ll_coef"] * human_nll + ) self.amp_context.__enter__() # TODO: AMP needs some debugging # This breaks vloss clipping? 
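To make the truncation-bootstrap comment in evaluate() above concrete, here is a small standalone sketch of the same idea with illustrative tensor names; it is not trainer code. On steps that are truncated but not terminated, the reward is augmented with gamma times a value estimate, and because the environment has already auto-reset, the value stored one step earlier stands in for V(s_{t+1}).

import torch

def bootstrap_truncated_rewards(rewards, values, truncations, terminals, gamma):
    # rewards, values, truncations, terminals: (num_envs, horizon) tensors.
    out = rewards.clone()
    for t in range(1, rewards.shape[1]):
        # Truncated but not terminated at step t.
        trunc_mask = (truncations[:, t] > 0) & (terminals[:, t] == 0)
        # Heuristic proxy for the pre-reset V(s_{t+1}): the value stored one step earlier.
        out[:, t] += trunc_mask.to(rewards.dtype) * gamma * values[:, t - 1]
    return out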
@@ -456,10 +506,14 @@ def train(self): losses["approx_kl"] += approx_kl.item() / self.total_minibatches losses["clipfrac"] += clipfrac.item() / self.total_minibatches losses["importance"] += ratio.mean().item() / self.total_minibatches + losses["human_nll"] += human_nll / self.total_minibatches + self.realism["human_log_prob"] = human_log_prob.mean().item() # Learn on accumulated minibatches profile("learn", epoch) + loss.backward() + if (mb + 1) % self.accumulate_minibatches == 0: torch.nn.utils.clip_grad_norm_(self.policy.parameters(), config["max_grad_norm"]) self.optimizer.step() @@ -489,38 +543,16 @@ def train(self): self.last_log_step = self.global_step profile.clear() - if self.epoch % config["checkpoint_interval"] == 0 or done_training: + if (self.epoch - 1) % config["checkpoint_interval"] == 0 or done_training: self.save_checkpoint() self.msg = f"Checkpoint saved at update {self.epoch}" - if self.render and self.epoch % self.render_interval == 0: - model_dir = os.path.join(self.config["data_dir"], f"{self.config['env']}_{self.logger.run_id}") - model_files = glob.glob(os.path.join(model_dir, "model_*.pt")) - - if model_files: - # Take the latest checkpoint - latest_cpt = max(model_files, key=os.path.getctime) - bin_path = f"{model_dir}.bin" - - # Export to .bin for rendering with raylib - try: - export_args = {"env_name": self.config["env"], "load_model_path": latest_cpt, **self.config} - - export( - args=export_args, - env_name=self.config["env"], - vecenv=self.vecenv, - policy=self.uncompiled_policy, - path=bin_path, - silent=True, - ) - pufferlib.utils.render_videos( - self.config, self.vecenv, self.logger, self.epoch, self.global_step, bin_path - ) - - except Exception as e: - print(f"Failed to export model weights: {e}") - + if self.render and (self.epoch - 1) % self.render_interval == 0: + bin_path = self._export_to_bin() + if bin_path: + pufferlib.utils.render_videos( + self.config, self.vecenv, self.logger, self.epoch, self.global_step, bin_path + ) if self.config["eval"]["wosac_realism_eval"] and ( (self.epoch - 1) % self.config["eval"]["eval_interval"] == 0 or done_training ): @@ -531,6 +563,9 @@ def train(self): ): pufferlib.utils.run_human_replay_eval_in_subprocess(self.config, self.logger, self.global_step) + if done_training: # Export latest checkpoint to .bin + self._export_to_bin() + def mean_and_log(self): config = self.config for k in list(self.stats.keys()): @@ -538,12 +573,18 @@ def mean_and_log(self): try: v = np.mean(v) except: - del self.stats[k] + # Keep single-value metrics (like resampled expert data metrics) + if isinstance(v, (int, float)): + pass + else: + del self.stats[k] + continue self.stats[k] = v device = config["device"] agent_steps = int(dist_sum(self.global_step, device)) + logs = { "SPS": dist_sum(self.sps, device), "agent_steps": agent_steps, @@ -558,6 +599,16 @@ def mean_and_log(self): # **{f'performance/{k}': dist_sum(v['elapsed'], device) for k, v in self.profile}, } + for k, v in self.realism.items(): + # if k.endswith("_histogram"): + # if hasattr(self.logger, "wandb") and self.logger.wandb: + # import wandb + + # metric_name = k.replace("_histogram", "") + # logs[f"eval/{metric_name}"] = wandb.Histogram(v) + # else: + logs[f"eval/{k}"] = v + if torch.distributed.is_initialized(): if torch.distributed.get_rank() != 0: self.logger.log(logs, agent_steps) @@ -601,12 +652,40 @@ def save_checkpoint(self): "update": self.epoch, "model_name": model_name, "run_id": run_id, + "max_expert_sequences": self.vecenv.driver_env.max_expert_sequences, + 
"bptt_horizon": self.config["bptt_horizon"], } state_path = os.path.join(path, "trainer_state.pt") torch.save(state, state_path + ".tmp") os.rename(state_path + ".tmp", state_path) return model_path + def _export_to_bin(self): + """Export latest checkpoint to .bin for rendering.""" + model_dir = os.path.join(self.config["data_dir"], f"{self.config['env']}_{self.logger.run_id}") + model_files = glob.glob(os.path.join(model_dir, "model_*.pt")) + + if not model_files: + return None + + latest_cpt = max(model_files, key=os.path.getctime) + bin_path = f"{model_dir}.bin" + + try: + export_args = {"env_name": self.config["env"], "load_model_path": latest_cpt, **self.config} + export( + args=export_args, + env_name=self.config["env"], + vecenv=self.vecenv, + policy=self.uncompiled_policy, + path=bin_path, + silent=True, + ) + return bin_path + except Exception as e: + print(f"Failed to export model weights: {e}") + return None + def print_dashboard(self, clear=False, idx=[0], c1="[cyan]", c2="[white]", b1="[bright_cyan]", b2="[bright_white]"): config = self.config sps = dist_sum(self.sps, config["device"]) @@ -800,8 +879,8 @@ def __call__(self, name, epoch, nest=False): if epoch % self.frequency != 0: return - # if torch.cuda.is_available(): - # torch.cuda.synchronize() + if torch.cuda.is_available(): + torch.cuda.synchronize() tick = time.time() if len(self.stack) != 0 and not nest: @@ -817,8 +896,8 @@ def pop(self, end): profile["delta"] += delta def end(self): - # if torch.cuda.is_available(): - # torch.cuda.synchronize() + if torch.cuda.is_available(): + torch.cuda.synchronize() end = time.time() for i in range(len(self.stack)): @@ -933,15 +1012,16 @@ class WandbLogger: def __init__(self, args, load_id=None, resume="allow"): import wandb + run_name = args.get("wandb_run_name", None) wandb.init( id=load_id or wandb.util.generate_id(), + name=run_name, project=args["wandb_project"], group=args["wandb_group"], allow_val_change=True, save_code=False, resume=resume, config=args, - name=args.get("wandb_name"), tags=[args["tag"]] if args["tag"] is not None else [], ) self.wandb = wandb @@ -1036,6 +1116,7 @@ def eval(env_name, args=None, vecenv=None, policy=None): """Evaluate a policy.""" args = args or load_config(env_name) + args["env"]["prep_human_data"] = False wosac_enabled = args["eval"]["wosac_realism_eval"] human_replay_enabled = args["eval"]["human_replay_eval"] @@ -1085,6 +1166,7 @@ def eval(env_name, args=None, vecenv=None, policy=None): road_edge_polylines, args["eval"]["wosac_aggregate_results"], ) + results = {k: v.item() if hasattr(v, "item") else v for k, v in results.items()} if args["eval"]["wosac_aggregate_results"]: import json @@ -1103,6 +1185,7 @@ def eval(env_name, args=None, vecenv=None, policy=None): args["vec"] = dict(backend=backend, num_envs=1) args["env"]["control_mode"] = args["eval"]["human_replay_control_mode"] args["env"]["episode_length"] = 91 # WOMD scenario length + args["env"]["num_maps"] = args["eval"]["human_replay_num_agents"] vecenv = vecenv or load_env(env_name, args) policy = policy or load_policy(args, vecenv, env_name) @@ -1197,7 +1280,7 @@ def sweep(args=None, env_name=None): points_per_run = args["sweep"]["downsample"] target_key = f"environment/{args['sweep']['metric']}" for i in range(args["max_runs"]): - seed = time.time_ns() & 0xFFFFFFFF + seed = 42 random.seed(seed) np.random.seed(seed) torch.manual_seed(seed) @@ -1255,6 +1338,11 @@ def controlled_exp(env_name, args=None): section, param = key.split(".") exp_args[section][param] = value + # Create 
descriptive name + run_name_parts = [f"{key.split('.')[-1]}={value}" for key, value in zip(keys, combo)] + exp_name = "_".join(run_name_parts) + exp_args["wandb_run_name"] = f"{exp_name}" + print(f"\nExperiment {i}/{len(combinations)}: {dict(zip(keys, combo))}") # Train diff --git a/pufferlib/resources/drive/binaries/map_000.bin b/pufferlib/resources/drive/binaries/map_000.bin index 0b71744f6..deea84b71 100644 Binary files a/pufferlib/resources/drive/binaries/map_000.bin and b/pufferlib/resources/drive/binaries/map_000.bin differ diff --git a/pufferlib/utils.py b/pufferlib/utils.py index 76bf4a5a0..6d9df13e4 100644 --- a/pufferlib/utils.py +++ b/pufferlib/utils.py @@ -112,7 +112,7 @@ def run_wosac_eval_in_subprocess(config, logger, global_step): "--eval.wosac-goal-behavior", str(eval_config.get("wosac_goal_behavior", 2)), "--eval.wosac-goal-radius", - str(eval_config.get("wosac_goal_radius", 2.0)), + str(eval_config.get("wosac_goal_radius", 1.0)), "--eval.wosac-sanity-check", str(eval_config.get("wosac_sanity_check", False)), "--eval.wosac-aggregate-results", @@ -141,9 +141,18 @@ def run_wosac_eval_in_subprocess(config, logger, global_step): if hasattr(logger, "wandb") and logger.wandb: logger.wandb.log( { - "eval/wosac_realism_meta_score": wosac_metrics["realism_meta_score"], + "eval/wosac_realism_meta_score_mean": wosac_metrics["realism_meta_score"], + "eval/wosac_realism_meta_score_std": wosac_metrics["realism_score_std"], "eval/wosac_ade": wosac_metrics["ade"], "eval/wosac_min_ade": wosac_metrics["min_ade"], + "eval/wosac_kinematic_metrics": wosac_metrics["kinematic_metrics"], + "eval/wosac_interactive_metrics": wosac_metrics["interactive_metrics"], + "eval/map_based_metrics": wosac_metrics["map_based_metrics"], + "eval/wosac_likelihood_ttc": wosac_metrics["likelihood_time_to_collision"], + "eval/wosac_likelihood_collision": wosac_metrics["likelihood_collision_indication"], + "eval/wosac_likelihood_dist_to_no": wosac_metrics["likelihood_distance_to_nearest_object"], + "eval/num_collisions_sim": wosac_metrics["num_collisions_sim"], + "eval/num_collisions_ref": wosac_metrics["num_collisions_ref"], "eval/wosac_total_num_agents": wosac_metrics["total_num_agents"], }, step=global_step, diff --git a/setup.py b/setup.py index b834f94cf..0ae3a7705 100644 --- a/setup.py +++ b/setup.py @@ -312,6 +312,7 @@ def run(self): "gym==0.23", "gymnasium==0.29.1", "pettingzoo==1.24.1", + "matplotlib", ] if not NO_TRAIN: diff --git a/wosac_evaluation_results.png b/wosac_evaluation_results.png new file mode 100644 index 000000000..bafd2b2f9 Binary files /dev/null and b/wosac_evaluation_results.png differ diff --git a/wosac_realism_score_distributions.png b/wosac_realism_score_distributions.png new file mode 100644 index 000000000..6150cb766 Binary files /dev/null and b/wosac_realism_score_distributions.png differ
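Closing note on infer_human_actions in drive.py above: a tiny self-contained check of the inverse bicycle model it applies, using made-up numbers (10 m/s, a 0.05 rad heading change over dt = 0.1 s, and the default 4.5 m vehicle length).

import math

speed, heading_diff, dt, vehicle_length = 10.0, 0.05, 0.1, 4.5

yaw_rate = heading_diff / dt                        # 0.5 rad/s
# Inverse bicycle model with slip angle beta ~= 0: yaw_rate ~= v * tan(delta) / L
tan_steering = (yaw_rate * vehicle_length) / speed  # 0.225
tan_steering = max(-10.0, min(10.0, tan_steering))  # same clamp as infer_human_actions
steering = math.atan(tan_steering)
print(round(steering, 4))  # ~= 0.2213 rad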