diff --git a/.gitignore b/.gitignore index 2bd940568..a19332a6e 100644 --- a/.gitignore +++ b/.gitignore @@ -172,6 +172,9 @@ pufferlib/resources/drive/binaries/validation/ !pufferlib/resources/drive/binaries/training/map_000.bin pufferlib/resources/drive/sanity/sanity_binaries/ +# Ignore human demonstration data +pufferlib/resources/drive/human_demonstrations/* + # Compiled drive binary in root /drive /visualize diff --git a/examples/analyze_learnable_information.py b/examples/analyze_learnable_information.py new file mode 100644 index 000000000..d221dc65d --- /dev/null +++ b/examples/analyze_learnable_information.py @@ -0,0 +1,366 @@ +import wandb +import torch +import torch.nn as nn +from torch.optim import Adam +from torch.utils.data import DataLoader, TensorDataset +from torch.distributions.categorical import Categorical +import numpy as np +import matplotlib.pyplot as plt +from sklearn.decomposition import PCA + +from pufferlib.pufferl import load_config, load_env + + +class BCPolicy(nn.Module): + def __init__(self, input_size, hidden_size, output_size): + super().__init__() + self.nn = nn.Sequential( + nn.Linear(input_size, hidden_size), + nn.ReLU(), + nn.LayerNorm(hidden_size), + nn.Linear(hidden_size, hidden_size), + nn.ReLU(), + nn.LayerNorm(hidden_size), + nn.Linear(hidden_size, hidden_size), + nn.ReLU(), + ) + self.heads = nn.ModuleList([nn.Linear(hidden_size, output_size)]) + + def dist(self, obs): + """Generate action distribution.""" + x_out = self.nn(obs.float()) + return [Categorical(logits=head(x_out)) for head in self.heads] + + def forward(self, obs, deterministic=False): + """Generate an output from tensor input.""" + action_dist = self.dist(obs) + + if deterministic: + actions_idx = action_dist[0].logits.argmax(axis=-1) + else: + actions_idx = action_dist[0].sample() + return actions_idx + + def _log_prob(self, obs, expert_actions): + pred_action_dist = self.dist(obs) + log_prob = pred_action_dist[0].log_prob(expert_actions).mean() + return log_prob + + +def prepare_human_data(env, max_expert_sequences=512): + """Step 1: Extract and process human demonstration data""" + print("Preparing human data...") + + env._prep_human_data( + bptt_horizon=1, + max_expert_sequences=max_expert_sequences, + ) + + # Access the raw expert data collected by the environment + expert_actions_discrete = torch.Tensor(env.expert_actions_discrete) # Shape: (T, N, 1) + expert_observations = torch.Tensor(env.expert_observations_full) # Shape: (T, N, obs_dim) + + # Flatten to create a batch of samples + action_labels = torch.flatten(expert_actions_discrete, 0, 1).squeeze() # [B] + observations = torch.flatten(expert_observations, 0, 1) # [B, obs_dim] + + # Filter out invalid actions (-1) + valid_mask = action_labels != -1 + action_labels = action_labels[valid_mask] + observations = observations[valid_mask] + + return observations, action_labels + + +def compute_observation_coverage(obs, max_expert_sequences, subsample_for_viz=5000): + """ + Compute observation space coverage metrics. + + Metrics: + 1. Statistical Dispersion: variance, std, range per dimension + 2. Effective Rank: PCA components needed to explain 95% variance + 3. Visualizations: PCA projections and distributions + """ + print("\nAnalyzing observation coverage...") + + obs_np = obs.numpy() if isinstance(obs, torch.Tensor) else obs + n_samples, obs_dim = obs_np.shape + + # === 1. 
Statistical Dispersion Metrics === + obs_std = np.std(obs_np, axis=0) + obs_var = np.var(obs_np, axis=0) + obs_min = np.min(obs_np, axis=0) + obs_max = np.max(obs_np, axis=0) + obs_range = obs_max - obs_min + + # Average metrics across dimensions + avg_std = np.mean(obs_std) + avg_var = np.mean(obs_var) + avg_range = np.mean(obs_range) + + print(f" Avg Std: {avg_std:.4f}") + print(f" Avg Variance: {avg_var:.4f}") + print(f" Avg Range: {avg_range:.4f}") + + # === 2. Effective Rank (PCA) === + pca = PCA() + pca.fit(obs_np) + explained_var_ratio = pca.explained_variance_ratio_ + cumulative_var = np.cumsum(explained_var_ratio) + + # Effective rank (number of dimensions to explain 95% variance) + effective_rank = np.argmax(cumulative_var >= 0.95) + 1 + + print(f" Effective Rank (95% var): {effective_rank}/{obs_dim}") + + # === 3. Visualizations === + fig, axes = plt.subplots(2, 2, figsize=(14, 12)) + + # Plot 1: Variance per dimension + axes[0, 0].bar(range(min(50, obs_dim)), obs_var[:50], color="steelblue", edgecolor="black") + axes[0, 0].set_xlabel("Dimension", fontsize=11) + axes[0, 0].set_ylabel("Variance", fontsize=11) + axes[0, 0].set_title("Variance per Dimension (first 50)", fontsize=12, fontweight="bold") + axes[0, 0].grid(True, alpha=0.3) + + # Plot 2: Range per dimension + axes[0, 1].bar(range(min(50, obs_dim)), obs_range[:50], color="coral", edgecolor="black") + axes[0, 1].set_xlabel("Dimension", fontsize=11) + axes[0, 1].set_ylabel("Range", fontsize=11) + axes[0, 1].set_title("Range per Dimension (first 50)", fontsize=12, fontweight="bold") + axes[0, 1].grid(True, alpha=0.3) + + # Plot 3: PCA explained variance + axes[1, 0].plot(cumulative_var[: min(100, len(cumulative_var))], "b-", linewidth=2, label="Cumulative Variance") + axes[1, 0].axhline(y=0.95, color="r", linestyle="--", linewidth=2, label="95% threshold") + axes[1, 0].axvline( + x=effective_rank - 1, color="g", linestyle="--", linewidth=2, label=f"Effective rank = {effective_rank}" + ) + axes[1, 0].set_xlabel("Number of Components", fontsize=11) + axes[1, 0].set_ylabel("Cumulative Explained Variance", fontsize=11) + axes[1, 0].set_title("PCA Cumulative Variance", fontsize=12, fontweight="bold") + axes[1, 0].legend(fontsize=10) + axes[1, 0].grid(True, alpha=0.3) + + # Plot 4: 2D PCA projection + if n_samples > subsample_for_viz: + viz_indices = np.random.choice(n_samples, subsample_for_viz, replace=False) + obs_viz = obs_np[viz_indices] + else: + obs_viz = obs_np + + pca_2d = PCA(n_components=2) + obs_2d = pca_2d.fit_transform(obs_viz) + + axes[1, 1].scatter(obs_2d[:, 0], obs_2d[:, 1], alpha=0.4, s=10, c="purple", edgecolors="none") + axes[1, 1].set_xlabel(f"PC1 ({pca_2d.explained_variance_ratio_[0]:.2%} var)", fontsize=11) + axes[1, 1].set_ylabel(f"PC2 ({pca_2d.explained_variance_ratio_[1]:.2%} var)", fontsize=11) + axes[1, 1].set_title(f"2D PCA Projection (N={n_samples} samples)", fontsize=12, fontweight="bold") + axes[1, 1].grid(True, alpha=0.3) + + plt.suptitle( + f"Observation Coverage Analysis (N={max_expert_sequences} sequences)", fontsize=14, fontweight="bold", y=1.00 + ) + plt.tight_layout() + + # Save and log + plt.savefig(f"coverage_analysis_N{max_expert_sequences}.png", dpi=150, bbox_inches="tight") + wandb.log({f"coverage_analysis": wandb.Image(fig)}) + plt.close() + + # Log metrics to wandb + coverage_metrics = { + "coverage/dataset_size": n_samples, + "coverage/obs_dim": obs_dim, + "coverage/avg_std": avg_std, + "coverage/avg_variance": avg_var, + "coverage/avg_range": avg_range, + "coverage/effective_rank": 
effective_rank, + "coverage/effective_rank_ratio": effective_rank / obs_dim, + "coverage/pca_var_pc1": explained_var_ratio[0], + "coverage/pca_var_pc2": explained_var_ratio[1] if len(explained_var_ratio) > 1 else 0, + } + + wandb.log(coverage_metrics) + + return coverage_metrics + + +def train_bc_policy(obs, actions, config): + """Step 2: Train behavioral cloning policy""" + print("Training BC policy...") + + # Initialize wandb + wandb.init(project="gsp_epiplexity", config=config) + + wandb.log({"dataset_size": obs.shape[0]}) + + # Setup + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + + # Convert to tensors + obs_tensor = obs.float() + actions_tensor = actions.long() + + dataset = TensorDataset(obs_tensor, actions_tensor) + dataloader = DataLoader(dataset, batch_size=config["batch_size"], shuffle=True) + data_iter = iter(dataloader) + + # Create model + policy = BCPolicy( + input_size=obs.shape[-1], hidden_size=config["hidden_size"], output_size=config["num_actions"] + ).to(device) + + optimizer = Adam(policy.parameters(), lr=config["learning_rate"]) + + # Training loop + losses = [] + global_step = 0 + + for epoch in range(config["epochs"]): + epoch_losses = [] + + for i in range(config["minibatches"]): + try: + batch_obs, batch_actions = next(data_iter) + except StopIteration: + data_iter = iter(dataloader) + batch_obs, batch_actions = next(data_iter) + + batch_obs = batch_obs.to(device) + batch_actions = batch_actions.to(device) + + # Forward pass + log_prob = policy._log_prob(batch_obs, batch_actions.float()) + loss = -log_prob + + # Backward pass + optimizer.zero_grad() + loss.backward() + optimizer.step() + + # Compute accuracy + with torch.no_grad(): + pred_action = policy(batch_obs, deterministic=True) + accuracy = (batch_actions == pred_action).sum() / batch_actions.shape[0] + + # Log + loss_val = loss.item() + epoch_losses.append(loss_val) + losses.append(loss_val) + + wandb.log({"global_step": global_step, "loss": loss_val, "accuracy": accuracy.item(), "epoch": epoch}) + + global_step += 1 + + avg_epoch_loss = np.mean(epoch_losses) + + if avg_epoch_loss < 0.001: + print(f"Early stopping at epoch {epoch + 1} with loss {avg_epoch_loss:.6f}") + break + else: + print(f"Epoch {epoch + 1}/{config['epochs']}: Loss = {avg_epoch_loss:.4f}") + + return losses, policy + + +def compute_epiplexity(losses, dataset_size): + """Step 3: Compute area under loss curve above final loss (epiplexity)""" + print("Computing epiplexity...") + + # Convert to numpy array + losses_array = np.array(losses) + + # Take final loss as the last value (asymptotic loss) + final_loss = losses_array[-1] + + # Epiplexity: area under the curve above the final loss + losses_above_final = losses_array - final_loss + epiplexity = np.trapezoid(losses_above_final) + + print(f"Final Loss: {final_loss:.4f}") + print(f"Epiplexity (AUC above final loss): {epiplexity:.4f}") + + # Log to wandb + wandb.log({"epiplexity": epiplexity, "final_loss": final_loss}) + + # Create visualization + fig, ax = plt.subplots(figsize=(10, 7)) + + # Plot loss curve + steps = np.arange(len(losses_array)) + ax.plot(steps, losses_array, linewidth=1.5, label="Training loss", zorder=3) + + # Draw horizontal line at final loss + ax.axhline(y=final_loss, color="red", linestyle="--", linewidth=2, label=f"Final Loss = {final_loss:.4f}", zorder=2) + + # Fill the area between loss curve and final loss (epiplexity area) + ax.fill_between( + steps, + losses_array, + final_loss, + where=(losses_array >= final_loss), + color="red", + 
alpha=0.3, + label=f"Epiplexity = {epiplexity:.4f}", + zorder=1, + ) + + ax.set_xlabel("Training step", fontsize=12) + ax.set_ylabel("Loss", fontsize=12) + ax.set_title(f"BC training loss curve (N={dataset_size} samples)", fontsize=14) + ax.legend(loc="upper right", fontsize=10) + ax.grid(True, alpha=0.3) + + plt.tight_layout() + + # Log plot to wandb + wandb.log({"loss_curve_with_epiplexity": wandb.Image(fig)}) + plt.close() + + return epiplexity, final_loss + + +if __name__ == "__main__": + # Load configuration + args = load_config("puffer_drive") + args["vec"]["backend"] = "Serial" + + for max_expert_sequences in [256, 512, 1024, 2048, 4096]: + args["env"]["num_agents"] = max_expert_sequences + + config = { + "batch_size": 512, + "hidden_size": 1024, + "num_actions": 91, # 7*13 for classic discrete action space + "learning_rate": 1e-4, + "epochs": 10_000, + "minibatches": 4, + "max_expert_sequences": max_expert_sequences, + } + + env = load_env("puffer_drive", args) + + # Step 1: Prepare human data (o_t, a_t) tuples + human_obs, human_actions = prepare_human_data(env.driver_env, max_expert_sequences=max_expert_sequences) + + print(f"Data shapes - Obs: {human_obs.shape}, Actions: {human_actions.shape}") + + # Step 2: Train BC policy + losses, policy = train_bc_policy(human_obs, human_actions, config) + + # Analyze observation coverage + # coverage_metrics = compute_observation_coverage(human_obs, max_expert_sequences) + + # Step 3: Compute epiplexity with visualization + epiplexity, final_loss = compute_epiplexity(losses, human_obs.shape[0]) + + print("\n" + "=" * 60) + print(f"RESULTS (N={max_expert_sequences}):") + print(f" Final Loss: {final_loss:.4f}") + print(f" Epiplexity: {epiplexity:.4f}") + print("=" * 60 + "\n") + + env.close() + + wandb.finish() diff --git a/examples/eval_pipeline.py b/examples/eval_pipeline.py new file mode 100644 index 000000000..50dcc3ef5 --- /dev/null +++ b/examples/eval_pipeline.py @@ -0,0 +1,352 @@ +import numpy as np +import pandas as pd +import torch +import matplotlib.pyplot as plt +import matplotlib as mpl +import seaborn as sns +import warnings + +from pufferlib.pufferl import load_env, load_policy, load_config +from pufferlib.ocean.benchmark.evaluator import WOSACEvaluator + +POLICY_DIR = "models" + +COLUMN_ORDER = [ + "policy", + "realism_meta_score", + "kinematic_metrics", + "interactive_metrics", + "map_based_metrics", + "min_ade", + "ade", + "likelihood_linear_speed", + "likelihood_linear_acceleration", + "likelihood_angular_speed", + "likelihood_angular_acceleration", + "likelihood_collision_indication", + "likelihood_offroad_indication", + "likelihood_time_to_collision", + "likelihood_distance_to_road_edge", + "likelihood_distance_to_nearest_object", +] + + +def plot_wosac_results(df): + """Create a 3-column visualization of WOSAC results.""" + + # Set style + sns.set("notebook", font_scale=1.05, rc={"figure.figsize": (16, 5)}) + sns.set_style("ticks", rc={"figure.facecolor": "none", "axes.facecolor": "none"}) + warnings.filterwarnings("ignore") + plt.set_loglevel("WARNING") + mpl.rcParams["lines.markersize"] = 8 + + fig, axes = plt.subplots(1, 3, figsize=(15, 5)) + palette = sns.color_palette("Set2", n_colors=len(df["policy"].unique())) + + # Left: Realism meta score + sns.barplot(data=df, x="policy", y="realism_meta_score", errorbar="sd", palette=palette, ax=axes[0], alpha=0.8) + axes[0].set_title("Aggregate realism score") + # axes[0].set_ylim(0, 1.0) + axes[0].grid(axis="y", alpha=0.3, linestyle="--") + axes[0].tick_params(axis="x", 
rotation=30) + + # Middle: Metric Categories + df_metrics = df.melt( + id_vars=["policy"], value_vars=["kinematic_metrics", "interactive_metrics", "map_based_metrics"] + ) + sns.barplot( + data=df_metrics, x="variable", y="value", hue="policy", errorbar="sd", palette=palette, ax=axes[1], alpha=0.8 + ) + axes[1].set_title("Group metric categories") + axes[1].set_xlabel("") + axes[1].set_ylabel("Score") + # axes[1].set_ylim(0, 1.0) + axes[1].legend(title="Policy", loc="upper left") + axes[1].grid(axis="y", alpha=0.3, linestyle="--") + axes[1].tick_params(axis="x", rotation=30) + + # Right: ADE and minADE + df_ade = df.melt(id_vars=["policy"], value_vars=["ade", "min_ade"]) + sns.barplot( + data=df_ade, x="policy", y="value", hue="variable", errorbar="sd", palette="muted", ax=axes[2], alpha=0.8 + ) + axes[2].set_title("Displacement error") + axes[2].set_ylabel("Distance (m)") + axes[2].legend(title="Metric") + axes[2].grid(axis="y", alpha=0.3, linestyle="--") + axes[2].tick_params(axis="x", rotation=15) + + for ax in axes: + sns.despine(ax=ax) + + plt.tight_layout() + plt.savefig("wosac_evaluation_results.png", dpi=300, bbox_inches="tight", facecolor="white") + plt.show() + + return fig + + +def plot_realism_score_distributions(df): + """Create histogram distributions of realism scores for each policy.""" + + # Set style + sns.set("notebook", font_scale=1.05) + sns.set_style("ticks", rc={"figure.facecolor": "none", "axes.facecolor": "none"}) + warnings.filterwarnings("ignore") + plt.set_loglevel("WARNING") + mpl.rcParams["lines.markersize"] = 8 + + policies = df["policy"].unique() + n_policies = len(policies) + + fig, axes = plt.subplots(1, n_policies, figsize=(5 * n_policies, 4), sharey=True, sharex=True) + if n_policies == 1: + axes = [axes] + + palette = sns.color_palette("Set2", n_colors=n_policies) + + for idx, (policy, ax) in enumerate(zip(policies, axes)): + policy_data = df[df["policy"] == policy]["realism_meta_score"] + + # Plot histogram + ax.hist(policy_data, bins=20, alpha=0.8, color=palette[idx], edgecolor="black") + + # Add mean line + mean_val = policy_data.mean() + ax.axvline(mean_val, color="red", linestyle="--", linewidth=2, label=f"Mean: {mean_val:.3f}") + + # Add std text + std_val = policy_data.std() + ax.text( + 0.05, + 0.95, + f"μ = {mean_val:.3f}\nσ = {std_val:.3f}", + transform=ax.transAxes, + verticalalignment="top", + bbox=dict(boxstyle="round", facecolor="white", alpha=0.8), + ) + + ax.set_title(f"{policy}") + ax.set_xlabel("Realism meta score") + if idx == 0: + ax.set_ylabel("Count") + ax.grid(axis="y", alpha=0.3, linestyle="--") + ax.legend() + sns.despine(ax=ax) + + plt.tight_layout() + plt.savefig("wosac_realism_score_distributions.png", dpi=300, bbox_inches="tight", facecolor="white") + plt.show() + + return fig + + +def evaluate_ground_truth(config, vecenv, evaluator): + """Compute WOSAC metrics for ground truth trajectories.""" + + gt_trajectories = evaluator.collect_ground_truth_trajectories(vecenv) + + fake_simulated_trajectories = gt_trajectories.copy() + for key in ["x", "y", "heading", "id"]: + fake_simulated_trajectories[key] = np.repeat(gt_trajectories[key], config["eval"]["wosac_num_rollouts"], axis=1) + fake_simulated_trajectories["id"] = fake_simulated_trajectories["id"][..., np.newaxis] + + # Compute metrics + agent_state = vecenv.driver_env.get_global_agent_state() + road_edge_polylines = vecenv.driver_env.get_road_edge_polylines() + results = evaluator.compute_metrics( + gt_trajectories, + fake_simulated_trajectories, + agent_state, + 
road_edge_polylines, + ) + return results + + +def evaluate_human_inferred_actions(config, vecenv, evaluator): + """Compute WOSAC metrics for human inferred actions.""" + + gt_trajectories = evaluator.collect_ground_truth_trajectories(vecenv) + + vecenv._prep_human_data() + + # Roll out inferred human actions in the simulator + simulated_trajectories = evaluator.collect_simulated_trajectories( + args=config, + puffer_env=vecenv, + policy=None, + actions=vecenv.expert_actions_discrete, + ) + + # Compute metrics + agent_state = vecenv.driver_env.get_global_agent_state() + road_edge_polylines = vecenv.driver_env.get_road_edge_polylines() + results = evaluator.compute_metrics( + gt_trajectories, + simulated_trajectories, + agent_state, + road_edge_polylines, + ) + + return results + + +def evaluate_random_policy(config, vecenv, evaluator): + gt_trajectories = evaluator.collect_ground_truth_trajectories(vecenv) + simulated_trajectories = evaluator.collect_wosac_random_baseline(vecenv) + + # Compute metrics + agent_state = vecenv.driver_env.get_global_agent_state() + road_edge_polylines = vecenv.driver_env.get_road_edge_polylines() + results = evaluator.compute_metrics( + gt_trajectories, + simulated_trajectories, + agent_state, + road_edge_polylines, + ) + + return results + + +def evaluate_bc_policy(config, vecenv, evaluator, policy_path): + config["train"]["use_rnn"] = False + evaluator.mode = "bc_policy" + + from train_bc_policy import BCPolicy + + bc_policy = BCPolicy( + input_size=vecenv.observation_space.shape[-1], + hidden_size=1024, + output_size=21 * 31, + ) + bc_policy.load_state_dict(torch.load(policy_path)) + bc_policy.eval().to(config["train"]["device"]) + + gt_trajectories = evaluator.collect_ground_truth_trajectories(vecenv) + simulated_trajectories = evaluator.collect_simulated_trajectories( + args=config, + puffer_env=vecenv, + policy=bc_policy, + ) + + # Compute metrics + agent_state = vecenv.driver_env.get_global_agent_state() + road_edge_polylines = vecenv.driver_env.get_road_edge_polylines() + results = evaluator.compute_metrics( + gt_trajectories, + simulated_trajectories, + agent_state, + road_edge_polylines, + ) + + return results + + +def evaluate_rl_policy(config, vecenv, evaluator, policy_path): + config["train"]["use_rnn"] = True + evaluator.mode = "rl_policy" + + policy = load_policy(config, vecenv, "puffer_drive") + + gt_trajectories = evaluator.collect_ground_truth_trajectories(vecenv) + simulated_trajectories = evaluator.collect_simulated_trajectories(config, vecenv, policy) + + # Compute metrics + agent_state = vecenv.driver_env.get_global_agent_state() + road_edge_polylines = vecenv.driver_env.get_road_edge_polylines() + results = evaluator.compute_metrics( + gt_trajectories, + simulated_trajectories, + agent_state, + road_edge_polylines, + ) + return results + + +def pipeline(env_name="puffer_drive"): + """Obtain WOSAC scores for various baselines and policies.""" + + config = load_config(env_name) + + config["env"]["num_maps"] = 60 + config["env"]["map_dir"] = "pufferlib/resources/drive/binaries/validation_interactive_small" + config["wosac"]["enabled"] = True + config["vec"]["backend"] = "PufferEnv" + config["vec"]["num_envs"] = 1 + config["env"]["sequential_map_sampling"] = True + config["env"]["init_mode"] = "create_all_valid" + config["env"]["control_mode"] = "control_wosac" + config["env"]["init_steps"] = 10 + config["env"]["goal_behavior"] = 2 + config["env"]["goal_radius"] = 1.0 + config["env"]["save_data_to_disk"] = False + + # Make env + vecenv = 
load_env(env_name, config) + + # Make evaluator + evaluator = WOSACEvaluator(config) + + # Baseline: Ground truth + df_results_gt = evaluate_ground_truth(config, vecenv, evaluator) + df_results_gt["policy"] = "ground_truth" + + # Baseline: Agent with inferred human actions (using classic bicycle dynamics model) + df_results_inferred_human = evaluate_human_inferred_actions(config, vecenv, evaluator) + df_results_inferred_human["policy"] = "inferred_human_actions" + + # Baseline: Imitation learning policy + # df_results_bc = evaluate_bc_policy(config, vecenv, evaluator, POLICY_DIR + "/bc_policy.pt") + # df_results_bc["policy"] = "bc_policy" + + # Baseline: Self-play RL policy + # run: https://wandb.ai/emerge_/gsp/runs/qld2z6tn?nw=nwuserdaphnecor + # df_results_self_play = evaluate_rl_policy( + # config, vecenv, evaluator, "pufferlib/resources/drive/pufferdrive_weights.pt" + # ) # POLICY_DIR + "/puffer_drive_sp_qld2z6tn.pt") + # df_results_self_play["policy"] = "self_play_rl_base" + + # TODO: Guided self-play policy (guidance in rewards) + # ... + + # TODO: Guided self-play policy (regularization) + # ... + + # Baseline: Random policy + df_results_random = evaluate_random_policy(config, vecenv, evaluator) + df_results_random["policy"] = "random" + + # Combine + df = pd.concat( + [ + df_results_gt, + df_results_inferred_human, + df_results_random, + # df_results_bc, + # df_results_self_play, + ], + ignore_index=True, + ) + + df = df[COLUMN_ORDER] + + # Visualize + plot_wosac_results(df) + plot_realism_score_distributions(df) + + print(f"total agents: {df_results_gt['num_agents_per_scene'].sum().item()}") + + print(df.groupby("policy")["realism_meta_score"].mean()) + print("---") + print(df.groupby("policy")["kinematic_metrics"].mean()) + print("---") + print(df.groupby("policy")["interactive_metrics"].mean()) + print("---") + print(df.groupby("policy")["map_based_metrics"].mean()) + + breakpoint() + + +if __name__ == "__main__": + pipeline() diff --git a/examples/gym_env.py b/examples/gym_env.py deleted file mode 100644 index 94e525d93..000000000 --- a/examples/gym_env.py +++ /dev/null @@ -1,23 +0,0 @@ -import gym -import pufferlib.emulation - - -class SampleGymEnv(gym.Env): - def __init__(self): - self.observation_space = gym.spaces.Box(low=-1, high=1, shape=(1,)) - self.action_space = gym.spaces.Discrete(2) - - def reset(self): - return self.observation_space.sample() - - def step(self, action): - return self.observation_space.sample(), 0.0, False, {} - - -if __name__ == "__main__": - gym_env = SampleGymEnv() - gymnasium_env = pufferlib.GymToGymnasium(gym_env) - puffer_env = pufferlib.emulation.GymnasiumPufferEnv(gymnasium_env) - observations, info = puffer_env.reset() - action = puffer_env.action_space.sample() - observation, reward, terminal, truncation, info = puffer_env.step(action) diff --git a/examples/gymnasium_env.py b/examples/gymnasium_env.py deleted file mode 100644 index e33ae289e..000000000 --- a/examples/gymnasium_env.py +++ /dev/null @@ -1,22 +0,0 @@ -import gymnasium -import pufferlib.emulation - - -class SampleGymnasiumEnv(gymnasium.Env): - def __init__(self): - self.observation_space = gymnasium.spaces.Box(low=-1, high=1, shape=(1,)) - self.action_space = gymnasium.spaces.Discrete(2) - - def reset(self): - return self.observation_space.sample(), {} - - def step(self, action): - return self.observation_space.sample(), 0.0, False, False, {} - - -if __name__ == "__main__": - gymnasium_env = SampleGymnasiumEnv() - puffer_env = 
pufferlib.emulation.GymnasiumPufferEnv(gymnasium_env) - observation, info = puffer_env.reset() - action = puffer_env.action_space.sample() - observation, reward, terminal, truncation, info = puffer_env.step(action) diff --git a/examples/pettingzoo_env.py b/examples/pettingzoo_env.py deleted file mode 100644 index 333069229..000000000 --- a/examples/pettingzoo_env.py +++ /dev/null @@ -1,35 +0,0 @@ -import gymnasium -import pettingzoo -import pufferlib.emulation - - -class SamplePettingzooEnv(pettingzoo.ParallelEnv): - def __init__(self): - self.possible_agents = ["agent_0", "agent_1"] - self.agents = ["agent_0", "agent_1"] - - def observation_space(self, agent): - return gymnasium.spaces.Box(low=-1, high=1, shape=(1,)) - - def action_space(self, agent): - return gymnasium.spaces.Discrete(2) - - def reset(self, seed=None, options=None): - observations = {agent: self.observation_space(agent).sample() for agent in self.agents} - return observations, {} - - def step(self, action): - observations = {agent: self.observation_space(agent).sample() for agent in self.agents} - rewards = {agent: 0.0 for agent in self.agents} - terminals = {agent: False for agent in self.agents} - truncations = {agent: False for agent in self.agents} - infos = {agent: {} for agent in self.agents} - return observations, rewards, terminals, truncations, infos - - -if __name__ == "__main__": - env = SamplePettingzooEnv() - puffer_env = pufferlib.emulation.PettingZooPufferEnv(env) - observations, infos = puffer_env.reset() - actions = {agent: puffer_env.action_space(agent).sample() for agent in puffer_env.agents} - observations, rewards, terminals, truncations, infos = puffer_env.step(actions) diff --git a/examples/puffer_env.py b/examples/puffer_env.py deleted file mode 100644 index f18bad977..000000000 --- a/examples/puffer_env.py +++ /dev/null @@ -1,31 +0,0 @@ -import gymnasium -import pufferlib.emulation - - -class SamplePufferEnv(pufferlib.PufferEnv): - def __init__(self, buf=None, seed=0): - self.single_observation_space = gymnasium.spaces.Box(low=-1, high=1, shape=(1,)) - self.single_action_space = gymnasium.spaces.Discrete(2) - self.num_agents = 2 - super().__init__(buf) - - def reset(self, seed=0): - self.observations[:] = self.observation_space.sample() - return self.observations, [] - - def step(self, action): - self.observations[:] = self.observation_space.sample() - infos = [{"infos": "is a list of dictionaries"}] - return self.observations, self.rewards, self.terminals, self.truncations, infos - - -if __name__ == "__main__": - puffer_env = SamplePufferEnv() - observations, infos = puffer_env.reset() - actions = puffer_env.action_space.sample() - observations, rewards, terminals, truncations, infos = puffer_env.step(actions) - print("Puffer envs use a vector interface and in-place array updates") - print("Observation:", observations) - print("Reward:", rewards) - print("Terminal:", terminals) - print("Truncation:", truncations) diff --git a/examples/pufferl.py b/examples/pufferl.py deleted file mode 100644 index baa51baaa..000000000 --- a/examples/pufferl.py +++ /dev/null @@ -1,57 +0,0 @@ -import torch -import pufferlib.vector -import pufferlib.ocean -from pufferlib import pufferl - - -# Equivalent to running puffer train puffer_breakout -def cli(): - pufferl.train("puffer_breakout") - - -class Policy(torch.nn.Module): - def __init__(self, env): - super().__init__() - self.net = torch.nn.Sequential( - pufferlib.pytorch.layer_init(torch.nn.Linear(env.single_observation_space.shape[0], 128)), - torch.nn.ReLU(), 
- pufferlib.pytorch.layer_init(torch.nn.Linear(128, 128)), - ) - self.action_head = torch.nn.Linear(128, env.single_action_space.n) - self.value_head = torch.nn.Linear(128, 1) - - def forward_eval(self, observations, state=None): - hidden = self.net(observations) - logits = self.action_head(hidden) - values = self.value_head(hidden) - return logits, values - - # We use this to work around a major torch perf issue - def forward(self, observations, state=None): - return self.forward_eval(observations, state) - - -# Managing your own trainer -if __name__ == "__main__": - env_name = "puffer_breakout" - env_creator = pufferlib.ocean.env_creator(env_name) - vecenv = pufferlib.vector.make( - env_creator, - num_envs=2, - num_workers=2, - batch_size=1, - backend=pufferlib.vector.Multiprocessing, - env_kwargs={"num_envs": 4096}, - ) - policy = Policy(vecenv.driver_env).cuda() - args = pufferl.load_config("default") - args["train"]["env"] = env_name - - trainer = pufferl.PuffeRL(args["train"], vecenv, policy) - - for epoch in range(10): - trainer.evaluate() - logs = trainer.train() - - trainer.print_dashboard() - trainer.close() diff --git a/examples/render.py b/examples/render.py deleted file mode 100644 index b3965859f..000000000 --- a/examples/render.py +++ /dev/null @@ -1,7 +0,0 @@ -from pufferlib.ocean.breakout import breakout - -env = breakout.Breakout() -env.reset() -while True: - env.step(env.action_space.sample()) - frame = env.render() diff --git a/examples/structured_env.py b/examples/structured_env.py deleted file mode 100644 index f7af0bee3..000000000 --- a/examples/structured_env.py +++ /dev/null @@ -1,39 +0,0 @@ -import gymnasium -import pufferlib.emulation - - -class SampleGymnasiumEnv(gymnasium.Env): - def __init__(self): - self.observation_space = gymnasium.spaces.Dict( - { - "foo": gymnasium.spaces.Box(low=-1, high=1, shape=(2,)), - "bar": gymnasium.spaces.Box(low=2, high=3, shape=(3,)), - } - ) - self.action_space = gymnasium.spaces.MultiDiscrete([2, 5]) - - def reset(self): - return self.observation_space.sample(), {} - - def step(self, action): - return self.observation_space.sample(), 0.0, False, False, {} - - -if __name__ == "__main__": - gymnasium_env = SampleGymnasiumEnv() - puffer_env = pufferlib.emulation.GymnasiumPufferEnv(gymnasium_env) - flat_observation, info = puffer_env.reset() - flat_action = puffer_env.action_space.sample() - flat_observation, reward, terminal, truncation, info = puffer_env.step(flat_action) - print(f"PufferLib flattens observations and actions:\n{flat_observation}\n{flat_action}") - - observation = flat_observation.view(puffer_env.obs_dtype) - print(f"You can unflatten observations with numpy:\n{observation}") - - import torch - import pufferlib.pytorch - - flat_torch_observation = torch.from_numpy(flat_observation) - torch_dtype = pufferlib.pytorch.nativize_dtype(puffer_env.emulated) - torch_observation = pufferlib.pytorch.nativize_tensor(flat_torch_observation, torch_dtype) - print(f"But we suggest unflattening observations with torch in your model forward pass:\n{torch_observation}") diff --git a/examples/train_bc_policy.py b/examples/train_bc_policy.py new file mode 100644 index 000000000..15c4b5b69 --- /dev/null +++ b/examples/train_bc_policy.py @@ -0,0 +1,189 @@ +import wandb +import torch +import torch.nn as nn +from torch.optim import Adam +from torch.utils.data import DataLoader, TensorDataset +from torch.distributions.categorical import Categorical +import numpy as np +import matplotlib.pyplot as plt +from pufferlib.pufferl import 
load_config, load_env + +CHECKPOINT_PATH = "models" + + +class BCPolicy(nn.Module): + def __init__(self, input_size, hidden_size, output_size): + super().__init__() + self.nn = nn.Sequential( + nn.Linear(input_size, hidden_size), + nn.ReLU(), + nn.LayerNorm(hidden_size), + nn.Linear(hidden_size, hidden_size), + nn.ReLU(), + nn.LayerNorm(hidden_size), + nn.Linear(hidden_size, hidden_size), + nn.ReLU(), + ) + self.heads = nn.ModuleList([nn.Linear(hidden_size, output_size)]) + + def dist(self, obs): + """Generate action distribution.""" + x_out = self.nn(obs.float()) + return [Categorical(logits=head(x_out)) for head in self.heads] + + def forward(self, obs, deterministic=False): + """Generate an output from tensor input.""" + action_dist = self.dist(obs) + + if deterministic: + actions_idx = action_dist[0].logits.argmax(axis=-1) + else: + actions_idx = action_dist[0].sample() + return actions_idx + + def _log_prob(self, obs, expert_actions): + pred_action_dist = self.dist(obs) + log_prob = pred_action_dist[0].log_prob(expert_actions).mean() + return log_prob + + +def prepare_human_data(env, max_expert_sequences=512): + """Step 1: Extract and process human demonstration data""" + print("Preparing human data...") + + env._prep_human_data( + bptt_horizon=1, + max_expert_sequences=max_expert_sequences, + ) + + # Access the raw expert data collected by the environment + expert_actions_discrete = torch.Tensor(env.expert_actions_discrete) # Shape: (T, N, 1) + expert_observations = torch.Tensor(env.expert_observations_full) # Shape: (T, N, obs_dim) + + # Flatten to create a batch of samples + action_labels = torch.flatten(expert_actions_discrete, 0, 1).squeeze() # [B] + observations = torch.flatten(expert_observations, 0, 1) # [B, obs_dim] + + # Filter out invalid actions (-1) + valid_mask = action_labels != -1 + action_labels = action_labels[valid_mask] + observations = observations[valid_mask] + + return observations, action_labels + + +def train_bc_policy(obs, actions, config): + """Step 2: Train behavioral cloning policy""" + print("Training BC policy...") + + # Initialize wandb + wandb.init(project="gsp_bc_policy", config=config) + + wandb.log({"dataset_size": obs.shape[0]}) + + # Setup + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + + obs_tensor = obs.float() + actions_tensor = actions.long() + + dataset = TensorDataset(obs_tensor, actions_tensor) + dataloader = DataLoader(dataset, batch_size=config["batch_size"], shuffle=True) + data_iter = iter(dataloader) + + # Create model + policy = BCPolicy( + input_size=obs.shape[-1], + hidden_size=config["hidden_size"], + output_size=config["num_actions"], + ).to(device) + + optimizer = Adam(policy.parameters(), lr=config["learning_rate"]) + + # Training loop + losses = [] + global_step = 0 + + for epoch in range(config["epochs"]): + epoch_losses = [] + + for i in range(config["minibatches"]): + try: + batch_obs, batch_actions = next(data_iter) + except StopIteration: + data_iter = iter(dataloader) + batch_obs, batch_actions = next(data_iter) + + batch_obs = batch_obs.to(device) + batch_actions = batch_actions.to(device) + + # Forward pass + log_prob = policy._log_prob(batch_obs, batch_actions.float()) + loss = -log_prob + + # Backward pass + optimizer.zero_grad() + loss.backward() + optimizer.step() + + # Compute accuracy + with torch.no_grad(): + pred_action = policy(batch_obs, deterministic=True) + accuracy = (batch_actions == pred_action).sum() / batch_actions.shape[0] + + # Log + loss_val = loss.item() + 
epoch_losses.append(loss_val) + losses.append(loss_val) + + wandb.log({"global_step": global_step, "loss": loss_val, "accuracy": accuracy.item(), "epoch": epoch}) + + global_step += 1 + + avg_epoch_loss = np.mean(epoch_losses) + + if avg_epoch_loss < 0.001: + print(f"Early stopping at epoch {epoch + 1} with loss {avg_epoch_loss:.6f}") + break + else: + print(f"Epoch {epoch + 1}/{config['epochs']}: Loss = {avg_epoch_loss:.4f}") + + return losses, policy + + +if __name__ == "__main__": + # Load configuration + args = load_config("puffer_drive") + args["vec"]["backend"] = "Serial" + args["env"]["num_maps"] = 229 + args["env"]["map_dir"] = "pufferlib/resources/drive/binaries/validation_interactive_small" + + for max_expert_sequences in [4096]: + args["env"]["num_agents"] = max_expert_sequences + + config = { + "batch_size": 512, + "hidden_size": 1024, + "num_actions": 21 * 31, # TODO: Do not hardcode + "learning_rate": 1e-4, + "epochs": 100_000, + "minibatches": 4, + "max_expert_sequences": max_expert_sequences, + } + + env = load_env("puffer_drive", args) + + # Step 1: Prepare human data (o_t, a_t) tuples + human_obs, human_actions = prepare_human_data(env.driver_env, max_expert_sequences=max_expert_sequences) + + print(f"Data shapes - Obs: {human_obs.shape}, Actions: {human_actions.shape}") + + # Step 2: Train BC policy + losses, policy = train_bc_policy(human_obs, human_actions, config) + + # Store the trained model + torch.save(policy.state_dict(), f"{CHECKPOINT_PATH}/bc_policy.pt") + + env.close() + + wandb.finish() diff --git a/examples/vectorization.py b/examples/vectorization.py deleted file mode 100644 index 62c999789..000000000 --- a/examples/vectorization.py +++ /dev/null @@ -1,82 +0,0 @@ -import gymnasium -import pufferlib.emulation -import pufferlib.vector - - -class SamplePufferEnv(pufferlib.PufferEnv): - def __init__(self, foo=0, bar=1, buf=None, seed=0): - self.single_observation_space = gymnasium.spaces.Box(low=-1, high=1, shape=(1,)) - self.single_action_space = gymnasium.spaces.Discrete(2) - self.num_agents = 2 - super().__init__(buf) - - # Sample args and kwargs - self.foo = foo - self.bar = bar - - def reset(self, seed=0): - self.observations[:] = self.observation_space.sample() - return self.observations, [] - - def step(self, action): - self.observations[:] = self.observation_space.sample() - infos = [{"infos": "is a list of dictionaries"}] - return self.observations, self.rewards, self.terminals, self.truncations, infos - - def close(self): - pass - - -if __name__ == "__main__": - serial_vecenv = pufferlib.vector.make(SamplePufferEnv, num_envs=2, backend=pufferlib.vector.Serial) - observations, infos = serial_vecenv.reset() - actions = serial_vecenv.action_space.sample() - o, r, d, t, i = serial_vecenv.step(actions) - print("Serial VecEnv:") - print("Observations:", o) - print("Rewards:", r) - print("Terminals:", t) - print("Truncations:", d) - - # Pass arguments to all environments like this - serial_vecenv = pufferlib.vector.make( - SamplePufferEnv, num_envs=2, backend=pufferlib.vector.Serial, env_args=[3], env_kwargs={"bar": 4} - ) - print("Foo: ", [env.foo for env in serial_vecenv.envs]) - print("Bar: ", [env.bar for env in serial_vecenv.envs]) - - # Or to each environment like this - serial_vecenv = pufferlib.vector.make( - [SamplePufferEnv, SamplePufferEnv], - num_envs=2, - backend=pufferlib.vector.Serial, - env_args=[[3], [4]], - env_kwargs=[{"bar": 4}, {"bar": 5}], - ) - print("Foo: ", [env.foo for env in serial_vecenv.envs]) - print("Bar: ", [env.bar for env in 
serial_vecenv.envs]) - - vecenv = pufferlib.vector.make( - SamplePufferEnv, num_envs=2, num_workers=2, batch_size=1, backend=pufferlib.vector.Multiprocessing - ) - vecenv.async_reset() # You can also use the synchronous API with Multiprocessing - o, r, d, t, i, env_ids, masks = vecenv.recv() - actions = vecenv.action_space.sample() - print("Policy computes actions for all agents in batch_size=1 of the total num_envs=2 environments") - print("Actions:", actions) - vecenv.send(actions) - - # New observations are ready while the other envs are running in the background - o, r, d, t, i, env_ids, masks = vecenv.recv() - print("Observations:", o) - - # Make sure to close the vecenv when you're done - vecenv.close() - - try: - vecenv = pufferlib.vector.make( - SamplePufferEnv, num_envs=1, num_workers=2, batch_size=3, backend=pufferlib.vector.Multiprocessing - ) - except pufferlib.APIUsageError: - # Make sure num_envs divides num_workers, and both num_envs and num_workers should divide batch_size - pass diff --git a/greene/a100_baseline_sweep.sh b/greene/a100_baseline_sweep.sh new file mode 100644 index 000000000..ca34d8205 --- /dev/null +++ b/greene/a100_baseline_sweep.sh @@ -0,0 +1,81 @@ +#!/bin/bash +#SBATCH --array=0-10 +#SBATCH --job-name=pd_sweep_a100 +#SBATCH --output=/scratch/%u/PufferDrive/logs/output_%A_%a.txt +#SBATCH --error=/scratch/%u/PufferDrive/logs/error_%A_%a.txt +#SBATCH --mem=64G +#SBATCH --time=24:00:00 +#SBATCH --nodes=1 +#SBATCH --cpus-per-task=16 +#SBATCH --gres=gpu:a100:2 +#SBATCH --account=pr_100_tandon_priority + +# Print job info +echo "Job ID: $SLURM_JOB_ID" +echo "Array Task ID: $SLURM_ARRAY_TASK_ID" +echo "User: $USER" +echo "Running on node: $(hostname)" +echo "GPU: $CUDA_VISIBLE_DEVICES" +date + +# Set project directory +PROJECT_DIR="/scratch/${USER}/PufferDrive" +echo "Project directory: $PROJECT_DIR" + +# Create logs directory if it doesn't exist +mkdir -p ${PROJECT_DIR}/logs + +# Navigate to project directory +cd ${PROJECT_DIR} + +# Activate virtual environment +source .venv/bin/activate + +# A100 dual-GPU optimized settings (scaled up for 2 GPUs and 64 CPUs) +NUM_WORKERS=32 +NUM_ENVS=32 +VEC_BATCH_SIZE=2 +NUM_AGENTS=1024 +BPTT_HORIZON=32 +HUMAN_REG_COEF=0.0 + +# Calculate the segment size (minimum train batch size) +SEGMENT_SIZE=$((NUM_AGENTS * NUM_WORKERS * BPTT_HORIZON)) + +# Target batch size (aim for 2M with 2 GPUs) +TARGET_BATCH_SIZE=2097152 # 2M = 2^21 + +# Calculate train batch size: largest multiple of SEGMENT_SIZE that doesn't exceed target +TRAIN_BATCH_SIZE=$(( (TARGET_BATCH_SIZE / SEGMENT_SIZE) * SEGMENT_SIZE )) + +# If calculated batch size is 0 or too small, use the segment size +if [ $TRAIN_BATCH_SIZE -lt $SEGMENT_SIZE ]; then + TRAIN_BATCH_SIZE=$SEGMENT_SIZE +fi + +echo "Starting sweep with settings:" +echo " NUM_WORKERS: $NUM_WORKERS" +echo " NUM_ENVS: $NUM_ENVS" +echo " VEC_BATCH_SIZE: $VEC_BATCH_SIZE" +echo " NUM_AGENTS: $NUM_AGENTS" +echo " BPTT_HORIZON: $BPTT_HORIZON" +echo " SEGMENT_SIZE (min required): $SEGMENT_SIZE" +echo " TRAIN_BATCH_SIZE (calculated): $TRAIN_BATCH_SIZE" +echo " Divisibility check: $TRAIN_BATCH_SIZE / $SEGMENT_SIZE = $(($TRAIN_BATCH_SIZE / $SEGMENT_SIZE))" + +# Launch sweep with optimized parameters +puffer sweep puffer_drive \ + --wandb \ + --wandb-project "gsp" \ + --tag "guidance_regularize" \ + --vec.num-workers $NUM_WORKERS \ + --vec.num-envs $NUM_ENVS \ + --vec.batch-size $VEC_BATCH_SIZE \ + --env.num-agents $NUM_AGENTS \ + --train.batch-size $TRAIN_BATCH_SIZE \ + --train.bptt-horizon $BPTT_HORIZON \ + 
--train.human-ll-coef $HUMAN_REG_COEF + +# Print completion info +echo "Sweep completed" +date \ No newline at end of file diff --git a/greene/h100_gsp_rew_sweep.sh b/greene/h100_gsp_rew_sweep.sh new file mode 100644 index 000000000..dde3f2d04 --- /dev/null +++ b/greene/h100_gsp_rew_sweep.sh @@ -0,0 +1,82 @@ +#!/bin/bash +#SBATCH --array=0-5 +#SBATCH --job-name=pd_sweep_h100 +#SBATCH --output=/scratch/%u/PufferDrive/logs/output_%A_%a.txt +#SBATCH --error=/scratch/%u/PufferDrive/logs/error_%A_%a.txt +#SBATCH --mem=64G +#SBATCH --time=0-24:00:0 +#SBATCH --nodes=1 +#SBATCH --cpus-per-task=16 +#SBATCH --gres=gpu:h100:1 +#SBATCH --account=pr_100_tandon_advanced + +# Print job info +echo "Job ID: $SLURM_JOB_ID" +echo "Array Task ID: $SLURM_ARRAY_TASK_ID" +echo "User: $USER" +echo "Running on node: $(hostname)" +echo "GPU: $CUDA_VISIBLE_DEVICES" +date + +# Set project directory +PROJECT_DIR="/scratch/${USER}/PufferDrive" +echo "Project directory: $PROJECT_DIR" + +# Create logs directory if it doesn't exist +mkdir -p ${PROJECT_DIR}/logs + +# Navigate to project directory +cd ${PROJECT_DIR} + +# Activate virtual environment +source .venv/bin/activate + +# H100 optimized settings +NUM_WORKERS=48 +NUM_ENVS=48 +VEC_BATCH_SIZE=6 +NUM_AGENTS=1024 +BPTT_HORIZON=32 +HUMAN_REG_COEF=0.0 +USE_GUIDANCE_REWARDS=0 + +# Calculate the segment size (minimum train batch size) +SEGMENT_SIZE=$((NUM_AGENTS * NUM_WORKERS * BPTT_HORIZON)) + +# Target batch size (aim for 1-2M on H100) +TARGET_BATCH_SIZE=2097152 # 2M = 2^21 + +# Calculate train batch size: largest multiple of SEGMENT_SIZE that doesn't exceed target +TRAIN_BATCH_SIZE=$(( (TARGET_BATCH_SIZE / SEGMENT_SIZE) * SEGMENT_SIZE )) + +# If calculated batch size is 0 or too small, use the segment size +if [ $TRAIN_BATCH_SIZE -lt $SEGMENT_SIZE ]; then + TRAIN_BATCH_SIZE=$SEGMENT_SIZE +fi + +echo "Starting sweep with settings:" +echo " NUM_WORKERS: $NUM_WORKERS" +echo " NUM_ENVS: $NUM_ENVS" +echo " VEC_BATCH_SIZE: $VEC_BATCH_SIZE" +echo " NUM_AGENTS: $NUM_AGENTS" +echo " BPTT_HORIZON: $BPTT_HORIZON" +echo " SEGMENT_SIZE (min required): $SEGMENT_SIZE" +echo " TRAIN_BATCH_SIZE (calculated): $TRAIN_BATCH_SIZE" +echo " Divisibility check: $TRAIN_BATCH_SIZE / $SEGMENT_SIZE = $(($TRAIN_BATCH_SIZE / $SEGMENT_SIZE))" + +# Launch sweep with optimized parameters +puffer sweep puffer_drive \ + --wandb \ + --wandb-project "gsp" \ + --tag "guidance_regularize" \ + --vec.num-workers $NUM_WORKERS \ + --vec.num-envs $NUM_ENVS \ + --vec.batch-size $VEC_BATCH_SIZE \ + --env.num-agents $NUM_AGENTS \ + --env.use-guided-autonomy $USE_GUIDANCE_REWARDS \ + --train.batch-size $TRAIN_BATCH_SIZE \ + --train.bptt-horizon $BPTT_HORIZON \ + +# Print completion info +echo "Sweep completed" +date \ No newline at end of file diff --git a/greene/rtx8000_sweep.sh b/greene/rtx8000_sweep.sh new file mode 100644 index 000000000..6b8ba2d42 --- /dev/null +++ b/greene/rtx8000_sweep.sh @@ -0,0 +1,94 @@ +#!/bin/bash +#SBATCH --array=0-6 +#SBATCH --job-name=pd_sweep_rtx8000 +#SBATCH --output=/scratch/%u/PufferDrive/logs/output_%A_%a.txt +#SBATCH --error=/scratch/%u/PufferDrive/logs/error_%A_%a.txt +#SBATCH --mem=48G +#SBATCH --time=0-24:0:0 +#SBATCH --nodes=1 +#SBATCH --cpus-per-task=20 +#SBATCH --gres=gpu:rtx8000:1 +#SBATCH --account=pr_100_tandon_priority + +# Print job info +echo "Job ID: $SLURM_JOB_ID" +echo "Array Task ID: $SLURM_ARRAY_TASK_ID" +echo "User: $USER" +echo "Running on node: $(hostname)" +echo "GPU: $CUDA_VISIBLE_DEVICES" +date + +# Set project directory 
+PROJECT_DIR="/scratch/${USER}/PufferDrive" +echo "Project directory: $PROJECT_DIR" + +# Create logs directory if it doesn't exist +mkdir -p ${PROJECT_DIR}/logs + +# Navigate to project directory +cd ${PROJECT_DIR} + +# Check if directory exists +if [ ! -d "$PROJECT_DIR" ]; then + echo "ERROR: Project directory does not exist: $PROJECT_DIR" + exit 1 +fi + +# Activate virtual environment +if [ -f ".venv/bin/activate" ]; then + source .venv/bin/activate + echo "Virtual environment activated" +else + echo "ERROR: Virtual environment not found at ${PROJECT_DIR}/.venv" + exit 1 +fi + +# RTX8000 optimized settings +NUM_WORKERS=32 +NUM_ENVS=32 +VEC_BATCH_SIZE=4 +NUM_AGENTS=1024 +BPTT_HORIZON=32 +HUMAN_REG_COEF=0.0 +USE_GUIDANCE_REWARDS=1 + +# Calculate the segment size (minimum train batch size) +SEGMENT_SIZE=$((NUM_AGENTS * NUM_WORKERS * BPTT_HORIZON)) + +# Target batch size (aim for 1-2M on RTX8000) +TARGET_BATCH_SIZE=2097152 # 2M = 2^21 + +# Calculate train batch size: largest multiple of SEGMENT_SIZE that doesn't exceed target +TRAIN_BATCH_SIZE=$(( (TARGET_BATCH_SIZE / SEGMENT_SIZE) * SEGMENT_SIZE )) + +# If calculated batch size is 0 or too small, use the segment size +if [ $TRAIN_BATCH_SIZE -lt $SEGMENT_SIZE ]; then + TRAIN_BATCH_SIZE=$SEGMENT_SIZE +fi + +echo "Starting sweep with settings:" +echo " NUM_WORKERS: $NUM_WORKERS" +echo " NUM_ENVS: $NUM_ENVS" +echo " VEC_BATCH_SIZE: $VEC_BATCH_SIZE" +echo " NUM_AGENTS: $NUM_AGENTS" +echo " BPTT_HORIZON: $BPTT_HORIZON" +echo " SEGMENT_SIZE (min required): $SEGMENT_SIZE" +echo " TRAIN_BATCH_SIZE (calculated): $TRAIN_BATCH_SIZE" +echo " Divisibility check: $TRAIN_BATCH_SIZE / $SEGMENT_SIZE = $(($TRAIN_BATCH_SIZE / $SEGMENT_SIZE))" + +# Launch sweep with optimized parameters +puffer sweep puffer_drive \ + --wandb \ + --wandb-project "gsp" \ + --tag "guidance_rewards" \ + --vec.num-workers $NUM_WORKERS \ + --vec.num-envs $NUM_ENVS \ + --vec.batch-size $VEC_BATCH_SIZE \ + --env.num-agents $NUM_AGENTS \ + --env.use-guided-autonomy $USE_GUIDANCE_REWARDS \ + --train.batch-size $TRAIN_BATCH_SIZE \ + --train.bptt-horizon $BPTT_HORIZON \ + +# Print completion info +echo "Sweep completed" +date \ No newline at end of file diff --git a/models/.gitkeep b/models/.gitkeep new file mode 100644 index 000000000..e69de29bb diff --git a/models/bc_policy.pt b/models/bc_policy.pt new file mode 100644 index 000000000..a4db27932 Binary files /dev/null and b/models/bc_policy.pt differ diff --git a/models/model_puffer_drive_001908.pt b/models/model_puffer_drive_001908.pt new file mode 100644 index 000000000..fcd89b9a2 Binary files /dev/null and b/models/model_puffer_drive_001908.pt differ diff --git a/models/puffer_drive_sp_qld2z6tn.pt b/models/puffer_drive_sp_qld2z6tn.pt new file mode 100644 index 000000000..ce2c79999 Binary files /dev/null and b/models/puffer_drive_sp_qld2z6tn.pt differ diff --git a/profile_net.py b/profile_net.py new file mode 100644 index 000000000..630e38849 --- /dev/null +++ b/profile_net.py @@ -0,0 +1,159 @@ +# Profiling for various implementations of linear + max encoding for road points +# This was a 4x perf bottleneck. Now faster than the expected baseline with custom kernels.
+ +import time +import torch +from torch import nn +import torch.nn.functional as F + +import torch +from torch.backends import cudnn +import torch.utils.cpp_extension + +from pufferlib import _C + +cudnn.benchmark = True +cudnn.deterministic = False +cudnn.benchmark_limit = 32 + + +class PointwiseNet(nn.Module): + def __init__(self, input_size, hidden_size): + super().__init__() + self.linear = nn.Linear(input_size, hidden_size) + + def forward(self, x): + return self.linear(x).max(dim=1)[0] + + +class OriginalLinearMax(torch.nn.Module): + """Original implementation of Linear + Max without custom CUDA kernel.""" + + def __init__(self, input_size, hidden_size): + super().__init__() + self.linear1 = nn.Linear(input_size, hidden_size) + self.ln = nn.LayerNorm(hidden_size) + self.linear2 = nn.Linear(hidden_size, hidden_size) + + def forward(self, x): + x = self.linear1(x) + x = self.ln(x) + x = self.linear2(x) + x, _ = x.max(dim=1) + return x + + +class LinearReluMaxKernel(torch.autograd.Function): + """Custom kernel: Linear -> ReLU -> Max""" + + @staticmethod + def forward(ctx, x, weight, bias): + out = torch.ops.pufferlib.linear_relu_max(x, weight, bias) + ctx.save_for_backward(x, weight, bias) + return out + + @staticmethod + def backward(ctx, grad_out): + x, weight, bias = ctx.saved_tensors + grad_x, grad_weight, grad_bias = torch.ops.pufferlib.linear_relu_max_backward( + grad_out.contiguous(), x, weight, bias + ) + return grad_x, grad_weight, grad_bias + + +class FusedLinearReluMax(nn.Module): + """Fused CUDA kernel: Linear -> ReLU -> Max""" + + def __init__(self, input_size, hidden_size): + super().__init__() + self.weight = nn.Parameter(torch.empty(hidden_size, input_size).normal_(mean=0.0, std=input_size**-0.5)) + self.bias = nn.Parameter(torch.zeros(hidden_size)) + + def forward(self, x): + return LinearReluMaxKernel.apply(x, self.weight, self.bias) + + +def check_cuda_correct(model, cuda_model, data): + cuda_model.weight.data = model.linear.weight.data.clone() + cuda_model.bias.data = model.linear.bias.data.clone() + for batch in data: + model.zero_grad() + cuda_model.zero_grad() + model_out = model(batch) + cuda_out = cuda_model(batch) + assert torch.allclose(model_out, cuda_out, atol=1e-4, rtol=1e-4) + model_out.mean().backward() + cuda_out.mean().backward() + assert torch.allclose(model.linear.weight.grad, cuda_model.weight.grad, rtol=1e-4, atol=1e-4) + assert torch.allclose(model.linear.bias.grad, cuda_model.bias.grad, rtol=1e-4, atol=1e-4) + + +def profile_forward(model, data, num_iters=100, warmup=10): + B = data.shape[0] + with torch.no_grad(): + # Warmup + for i in range(warmup): + model(data[i]) + + torch.cuda.synchronize() + start = time.time() + for i in range(num_iters): + model(data[i] + 1) # +1 to avoid in-place reuse issues + torch.cuda.synchronize() + end = time.time() + + sps = B * num_iters / (end - start) + return sps + + +def profile_backward(model, data, num_iters=100, warmup=10): + B = data.shape[0] + # Warmup + for i in range(warmup): + model.zero_grad() + out = model(data[i]) + out.sum().backward() + + torch.cuda.synchronize() + start = time.time() + for i in range(num_iters): + model.zero_grad() + out = model(data[i] + 1) + out.sum().backward() + torch.cuda.synchronize() + end = time.time() + + sps = B * num_iters / (end - start) + return sps + + +if __name__ == "__main__": + num_iters = 100 + B = 4096 # Batch size + H = 64 # Hidden size + N = 128 # Number of road points + D = 13 # Feature dimension + + device = torch.device("cuda") + data = 
torch.randn(num_iters, B, N, D).to(device) + + pointwise = PointwiseNet(D, H).to(device) + linear_max = LinearMax(D, H).to(device) + original_linear_max = OriginalLinearMax(D, H).to(device) + + linear_relu_max = FusedLinearReluMax(D, H).to(device) + + print( + f"Pointwise: Forward ({profile_forward(pointwise, data):,.2f}), backward ({profile_backward(pointwise, data):,.2f})" + ) + print( + f"Original Linear Max: Forward ({profile_forward(original_linear_max, data):,.2f}), backward ({profile_backward(original_linear_max, data):,.2f})" + ) + print( + f"Cuda Linear Max: Forward ({profile_forward(linear_max, data):,.2f}), backward ({profile_backward(linear_max, data):,.2f})" + ) + print() + + print( + f"Cuda Linear ReLU Max: Forward ({profile_forward(linear_relu_max, data):,.2f}), backward ({profile_backward(linear_relu_max, data):,.2f})" + ) diff --git a/pufferlib/config/default.ini b/pufferlib/config/default.ini index 810de828c..ba64b030b 100644 --- a/pufferlib/config/default.ini +++ b/pufferlib/config/default.ini @@ -66,13 +66,20 @@ metric = score goal = maximize downsample = 10 -#[sweep.vec.num_envs] -#distribution = uniform_pow2 -#min = 1 -#max = 16 -#mean = 8 -#scale = auto +; #[sweep.vec.num_envs] +; #distribution = uniform_pow2 +; #min = 1 +; #max = 16 +; #mean = 8 +; #scale = auto +; # TODO: Elim from base +; [sweep.train.total_timesteps] +; distribution = log_normal +; min = 5e7 +; max = 1e10 +; mean = 1e8 +; scale = time ; # TODO: Elim from base ; [sweep.train.total_timesteps] ; distribution = log_normal @@ -81,6 +88,12 @@ downsample = 10 ; mean = 1e8 ; scale = time +; [sweep.train.bptt_horizon] +; distribution = uniform_pow2 +; min = 16 +; max = 64 +; mean = 64 +; scale = auto ; [sweep.train.bptt_horizon] ; distribution = uniform_pow2 ; min = 16 @@ -95,34 +108,6 @@ downsample = 10 ; mean = 32768 ; scale = auto -; [sweep.train.learning_rate] -; distribution = log_normal -; min = 0.00001 -; mean = 0.01 -; max = 0.1 -; scale = 0.5 - -; [sweep.train.ent_coef] -; distribution = log_normal -; min = 0.00001 -; mean = 0.01 -; max = 0.2 -; scale = auto - -; [sweep.train.gamma] -; distribution = logit_normal -; min = 0.8 -; mean = 0.98 -; max = 0.9999 -; scale = auto - -; [sweep.train.gae_lambda] -; distribution = logit_normal -; min = 0.6 -; mean = 0.95 -; max = 0.995 -; scale = auto - ; [sweep.train.vtrace_rho_clip] ; distribution = uniform ; min = 0.0 diff --git a/pufferlib/config/ocean/drive.ini b/pufferlib/config/ocean/drive.ini index 7245e5f2b..146743da4 100644 --- a/pufferlib/config/ocean/drive.ini +++ b/pufferlib/config/ocean/drive.ini @@ -8,7 +8,7 @@ rnn_name = Recurrent num_workers = 16 num_envs = 16 batch_size = 4 -; backend = Serial +#backend = Serial [policy] input_size = 64 @@ -26,11 +26,16 @@ action_type = discrete dynamics_model = classic reward_vehicle_collision = -0.5 reward_offroad_collision = -0.5 +; Guided autonomy reward parameters +use_guided_autonomy = 0 +guidance_speed_weight = 0.0 +guidance_heading_weight = 0.0 +waypoint_reach_threshold = 2.0 dt = 0.1 reward_goal = 1.0 reward_goal_post_respawn = 0.25 ; Meters around goal to be considered "reached" -goal_radius = 2.0 +goal_radius = 1.0 ; Max target speed in m/s for the agent to maintain towards the goal goal_speed = 100.0 ; What to do when the goal is reached. 
Options: 0:"respawn", 1:"generate_new_goals", 2:"stop" @@ -46,18 +51,27 @@ offroad_behavior = 0 episode_length = 91 resample_frequency = 910 termination_mode = 1 # 0 - terminate at episode_length, 1 - terminate after all agents have been reset -map_dir = "resources/drive/binaries/training" -num_maps = 10000 +map_dir = "resources/drive/binaries/validation_interactive_small" #"resources/drive/binaries/training" +num_maps = 229 #10_000 ; Determines which step of the trajectory to initialize the agents at upon reset init_steps = 0 ; Options: "control_vehicles", "control_agents", "control_wosac", "control_sdc_only" -control_mode = "control_vehicles" +control_mode = "control_agents" ; Options: "created_all_valid", "create_only_controlled" init_mode = "create_all_valid" +; Note: this argument is currently duplicated in [train] section and should match +bptt_horizon = 32 +; Directory to save/load human demonstration data +human_data_dir = "resources/drive/human_demonstrations" +prep_human_data = True +; Number of human samples to create per env. Each sequence has length equal to bptt_horizon. +; So with bptt_horizon=32, each sequence has 3 seconds of human data. +max_expert_sequences = 3600 + [train] seed=42 -total_timesteps = 2_000_000_000 +total_timesteps = 3_000_000_000 ; learning_rate = 0.02 ; gamma = 0.985 anneal_lr = True @@ -70,10 +84,10 @@ adam_beta1 = 0.9 adam_beta2 = 0.999 adam_eps = 1e-8 clip_coef = 0.2 -ent_coef = 0.005 +ent_coef = 0.004670370966041502 gae_lambda = 0.95 -gamma = 0.98 -learning_rate = 0.003 +gamma = 0.973457552852506 +learning_rate = 0.0030854349125978733 max_grad_norm = 1 prio_alpha = 0.8499999999999999 prio_beta0 = 0.8499999999999999 @@ -82,10 +96,16 @@ vf_clip_coef = 0.1999999999999999 vf_coef = 2 vtrace_c_clip = 1 vtrace_rho_clip = 1 -checkpoint_interval = 1000 +; How many randomly sampled human demonstration sequences to use per update +human_sequences = 64 +save_data_to_disk = True +human_clip_coef = 10.0 +human_ll_coef = 0.0 + +checkpoint_interval = 200 ; Rendering options -render = True -render_interval = 1000 +render = False +render_interval = 200 ; If True, show exactly what the agent sees in agent observation obs_only = True ; Show grid lines @@ -93,22 +113,22 @@ show_grid = True ; Draws lines from ego agent observed ORUs and road elements to show detection range show_lasers = False ; Display human xy logs in the background -show_human_logs = False +show_human_logs = True ; If True, zoom in on a part of the map. 
Otherwise, show full map zoom_in = True ; Options: List[str to path], str to path (e.g., "resources/drive/training/binaries/map_001.bin"), None render_map = none [eval] -eval_interval = 1000 +eval_interval = 300 ; Path to dataset used for evaluation -map_dir = "resources/drive/binaries/training" +map_dir = "resources/drive/binaries/validation_interactive_small" ; Evaluation will run on the first num_maps maps in the map_dir directory -wosac_num_maps = 20 +wosac_num_maps = 60 backend = PufferEnv ; WOSAC (Waymo Open Sim Agents Challenge) evaluation settings ; If True, enables evaluation on realism metrics each time we save a checkpoint -wosac_realism_eval = False +wosac_realism_eval = True ; Number of policy rollouts per scene wosac_num_rollouts = 32 ; When to start the simulation @@ -120,15 +140,18 @@ wosac_init_mode = "create_all_valid" ; Stop when reaching the goal wosac_goal_behavior = 2 ; Can shrink goal radius for WOSAC evaluation -wosac_goal_radius = 2.0 +wosac_goal_radius = 1.0 wosac_sanity_check = False ; Only return aggregate results across all scenes wosac_aggregate_results = True ; If True, enable human replay evaluation (pair policy-controlled agent with human replays) -human_replay_eval = False +human_replay_eval = True +; Determine number of agents == number of human replays to use during evaluation +human_replay_num_agents = 32 ; Control only the self-driving car human_replay_control_mode = "control_sdc_only" +; General sweep settings: [sweep.train.learning_rate] distribution = log_normal min = 0.001 @@ -150,18 +173,23 @@ mean = 0.98 max = 0.999 scale = auto -[sweep.train.gae_lambda] -distribution = log_normal -min = 0.95 -mean = 0.98 -max = 0.999 -scale = auto +; [sweep.env.guidance_heading_weight] +; distribution = uniform +; min = 0.1 +; max = 0.5 +; mean = 0.3 +; scale = auto -[controlled_exp.train.goal_speed] -values = [10, 20, 30, 3] +; [sweep.env.guidance_speed_weight] +; distribution = uniform +; min = 0.1 +; max = 0.5 +; mean = 0.3 +; scale = auto -[controlled_exp.train.ent_coef] -values = [0.001, 0.005, 0.01] +; Settings for controlled experiments +[controlled_exp.train.human_ll_coef] +values = [0.1, 0.5, 0.9] -[controlled_exp.train.seed] -values = [42, 55, 1] +; [controlled_exp.env.max_expert_sequences] +; values = [512, 1024, 2048] diff --git a/pufferlib/extensions/cuda/pufferlib.cu b/pufferlib/extensions/cuda/pufferlib.cu index 9426a7e5e..7acfa8790 100644 --- a/pufferlib/extensions/cuda/pufferlib.cu +++ b/pufferlib/extensions/cuda/pufferlib.cu @@ -1,9 +1,328 @@ #include #include #include +#include +#include +#include namespace pufferlib { +// Linear -> ReLU -> Max kernel +template +__global__ void linear_relu_max_kernel( + const float* __restrict__ x, + const float* __restrict__ weight, + const float* __restrict__ bias, + float* __restrict__ out, + int B, int N, int H, int D +) { + int b = blockIdx.x; + int h = threadIdx.x; + + if (b >= B || h >= H) return; + + extern __shared__ float sh_x[]; + + constexpr int D_compile = D_VAL; + const int D_runtime = (D_compile > 0) ? 
D_compile : D; + const int x_size = N * D_runtime; + const float* x_batch = x + b * x_size; + + // Strided cooperative loading for better memory coalescing + for (int i = h; i < x_size; i += H) { + sh_x[i] = x_batch[i]; + } + __syncthreads(); + + const float* weight_h = weight + h * D_runtime; + float bias_val = bias[h]; + float max_val = -FLT_MAX; + + #pragma unroll 8 + for (int n = 0; n < N; n++) { + float dot = 0.0f; + const float* x_n = sh_x + n * D_runtime; + + if constexpr (D_compile > 0) { + // Compile-time known D: fully unroll + #pragma unroll + for (int d = 0; d < D_compile; d++) { + dot += weight_h[d] * x_n[d]; + } + } else { + // Runtime D: manual unroll by 4 + int d = 0; + #pragma unroll 4 + for (; d + 3 < D; d += 4) { + dot += weight_h[d] * x_n[d] + + weight_h[d+1] * x_n[d+1] + + weight_h[d+2] * x_n[d+2] + + weight_h[d+3] * x_n[d+3]; + } + for (; d < D; d++) { + dot += weight_h[d] * x_n[d]; + } + } + + // Apply ReLU before max + float activated = fmaxf(0.0f, dot + bias_val); + max_val = fmaxf(max_val, activated); + } + + out[b * H + h] = max_val; +} + +template +__global__ void linear_relu_max_backward_kernel( + const float* __restrict__ x, + const float* __restrict__ weight, + const float* __restrict__ bias, + const float* __restrict__ grad_out, + float* __restrict__ grad_x, + float* __restrict__ grad_weight_temp, + float* __restrict__ grad_bias_temp, + int B, int N, int H, int D +) { + int b = blockIdx.x; + int h = threadIdx.x; + + if (b >= B || h >= H) return; + + extern __shared__ float sh_x[]; + + constexpr int D_compile = D_VAL; + const int D_runtime = (D_compile > 0) ? D_compile : D; + const int x_size = N * D_runtime; + const float* x_batch = x + b * x_size; + + // Strided cooperative loading + for (int i = h; i < x_size; i += H) { + sh_x[i] = x_batch[i]; + } + __syncthreads(); + + const float* weight_h = weight + h * D_runtime; + const float* bias_ptr = bias + h; + float bias_val = bias_ptr[0]; + float go = grad_out[b * H + h]; + + int argmax_n = -1; + float max_activated = -FLT_MAX; + + // Forward pass to find argmax + #pragma unroll 8 + for (int n = 0; n < N; n++) { + float dot = 0.0f; + const float* x_n = sh_x + n * D_runtime; + + if constexpr (D_compile > 0) { + #pragma unroll + for (int d = 0; d < D_compile; d++) { + dot += weight_h[d] * x_n[d]; + } + } else { + int d = 0; + #pragma unroll 4 + for (; d + 3 < D; d += 4) { + dot += weight_h[d] * x_n[d] + + weight_h[d+1] * x_n[d+1] + + weight_h[d+2] * x_n[d+2] + + weight_h[d+3] * x_n[d+3]; + } + for (; d < D; d++) { + dot += weight_h[d] * x_n[d]; + } + } + + // Apply ReLU + float activated = fmaxf(0.0f, dot + bias_val); + + if (activated > max_activated) { + max_activated = activated; + argmax_n = n; + } + } + + if (argmax_n == -1) return; // Should not happen if N > 0 + + // Check if ReLU was active at the argmax position + // Recompute the pre-activation value + float dot_at_argmax = 0.0f; + const float* x_argmax = sh_x + argmax_n * D_runtime; + + if constexpr (D_compile > 0) { + #pragma unroll + for (int d = 0; d < D_compile; d++) { + dot_at_argmax += weight_h[d] * x_argmax[d]; + } + } else { + for (int d = 0; d < D; d++) { + dot_at_argmax += weight_h[d] * x_argmax[d]; + } + } + + float pre_activation = dot_at_argmax + bias_val; + + // Only backprop if ReLU was active (pre_activation > 0) + if (pre_activation <= 0.0f) { + // ReLU blocked the gradient + return; + } + + // Gradient flows through: go * 1 (from max) * 1 (from ReLU when active) + grad_bias_temp[b * H + h] = go; + + const int weight_temp_base = b * H 
* D_runtime + h * D_runtime; + const int x_base = argmax_n * D_runtime; + + if constexpr (D_compile > 0) { + #pragma unroll + for (int d = 0; d < D_compile; d++) { + grad_weight_temp[weight_temp_base + d] = go * sh_x[x_base + d]; + } + } else { + for (int d = 0; d < D; d++) { + grad_weight_temp[weight_temp_base + d] = go * sh_x[x_base + d]; + } + } + + const int grad_x_base = b * N * D_runtime + argmax_n * D_runtime; + + if constexpr (D_compile > 0) { + #pragma unroll + for (int d = 0; d < D_compile; d++) { + atomicAdd(grad_x + grad_x_base + d, go * weight_h[d]); + } + } else { + for (int d = 0; d < D; d++) { + atomicAdd(grad_x + grad_x_base + d, go * weight_h[d]); + } + } +} + +torch::Tensor linear_relu_max_cuda( + torch::Tensor x, + torch::Tensor weight, + torch::Tensor bias +) { + TORCH_CHECK(x.is_cuda() && weight.is_cuda() && bias.is_cuda(), + "All tensors must be on CUDA"); + TORCH_CHECK(x.dim() == 3, "x must be (B, N, D)"); + TORCH_CHECK(weight.dim() == 2, "weight must be (H, D)"); + TORCH_CHECK(bias.dim() == 1, "bias must be (H)"); + TORCH_CHECK(x.dtype() == torch::kFloat, "x must be float32"); + TORCH_CHECK(weight.dtype() == torch::kFloat, "weight must be float32"); + TORCH_CHECK(bias.dtype() == torch::kFloat, "bias must be float32"); + + const int B = x.size(0); + const int N = x.size(1); + const int D = x.size(2); + const int H = weight.size(0); + + TORCH_CHECK(weight.size(1) == D, "Weight D mismatch"); + TORCH_CHECK(bias.size(0) == H, "Bias H mismatch"); + + auto out = torch::empty({B, H}, x.options()); + + dim3 block(H); + dim3 grid(B); + size_t shared_size = N * D * sizeof(float); + + const float* x_ptr = x.data_ptr(); + const float* weight_ptr = weight.data_ptr(); + const float* bias_ptr = bias.data_ptr(); + float* out_ptr = out.data_ptr(); + + int device_idx = x.get_device(); + cudaStream_t stream = c10::cuda::getCurrentCUDAStream(device_idx); + + if (D == 7) { + linear_relu_max_kernel<7><<>>( + x_ptr, weight_ptr, bias_ptr, out_ptr, B, N, H, D); + } else if (D == 13) { + linear_relu_max_kernel<13><<>>( + x_ptr, weight_ptr, bias_ptr, out_ptr, B, N, H, D); + } else { + linear_relu_max_kernel<0><<>>( + x_ptr, weight_ptr, bias_ptr, out_ptr, B, N, H, D); + } + + C10_CUDA_KERNEL_LAUNCH_CHECK(); + return out; +} + +std::tuple linear_relu_max_backward_cuda( + torch::Tensor grad_out, + torch::Tensor x, + torch::Tensor weight, + torch::Tensor bias +) { + TORCH_CHECK(grad_out.is_cuda() && x.is_cuda() && weight.is_cuda() && bias.is_cuda(), + "All tensors must be on CUDA"); + TORCH_CHECK(grad_out.dim() == 2, "grad_out must be (B, H)"); + TORCH_CHECK(x.dim() == 3, "x must be (B, N, D)"); + TORCH_CHECK(weight.dim() == 2, "weight must be (H, D)"); + TORCH_CHECK(bias.dim() == 1, "bias must be (H)"); + TORCH_CHECK(grad_out.dtype() == torch::kFloat, "grad_out must be float32"); + TORCH_CHECK(x.dtype() == torch::kFloat, "x must be float32"); + TORCH_CHECK(weight.dtype() == torch::kFloat, "weight must be float32"); + TORCH_CHECK(bias.dtype() == torch::kFloat, "bias must be float32"); + + const int B = x.size(0); + const int N = x.size(1); + const int D = x.size(2); + const int H = weight.size(0); + + TORCH_CHECK(grad_out.size(0) == B && grad_out.size(1) == H, "grad_out shape mismatch"); + TORCH_CHECK(weight.size(1) == D, "Weight D mismatch"); + TORCH_CHECK(bias.size(0) == H, "Bias H mismatch"); + + auto grad_x = torch::zeros_like(x); + auto grad_weight = torch::zeros_like(weight); + auto grad_bias = torch::zeros({H}, x.options()); + + auto grad_weight_temp = torch::zeros({B, H, D}, x.options()); 
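+    // Note: grad_weight_temp / grad_bias_temp give every (batch, head) pair a private slot,
+    // so the kernel needs no atomics on the shared weight/bias gradients; the per-batch
+    // slices are summed over dim 0 after the launch. grad_x still uses atomicAdd because
+    // several heads within the same batch element can select the same argmax row.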
+ auto grad_bias_temp = torch::zeros({B, H}, x.options()); + + dim3 block(H); + dim3 grid(B); + size_t shared_size = N * D * sizeof(float); + + const float* grad_out_ptr = grad_out.data_ptr(); + const float* x_ptr = x.data_ptr(); + const float* weight_ptr = weight.data_ptr(); + const float* bias_ptr = bias.data_ptr(); + float* grad_x_ptr = grad_x.data_ptr(); + float* grad_weight_temp_ptr = grad_weight_temp.data_ptr(); + float* grad_bias_temp_ptr = grad_bias_temp.data_ptr(); + + int device_idx = x.get_device(); + cudaStream_t stream = c10::cuda::getCurrentCUDAStream(device_idx); + + if (D == 7) { + linear_relu_max_backward_kernel<7><<>>( + x_ptr, weight_ptr, bias_ptr, grad_out_ptr, grad_x_ptr, + grad_weight_temp_ptr, grad_bias_temp_ptr, B, N, H, D); + } else if (D == 13) { + linear_relu_max_backward_kernel<13><<>>( + x_ptr, weight_ptr, bias_ptr, grad_out_ptr, grad_x_ptr, + grad_weight_temp_ptr, grad_bias_temp_ptr, B, N, H, D); + } else { + linear_relu_max_backward_kernel<0><<>>( + x_ptr, weight_ptr, bias_ptr, grad_out_ptr, grad_x_ptr, + grad_weight_temp_ptr, grad_bias_temp_ptr, B, N, H, D); + } + + C10_CUDA_KERNEL_LAUNCH_CHECK(); + + grad_weight = grad_weight_temp.sum(0); + grad_bias = grad_bias_temp.sum(0); + + cudaStreamSynchronize(stream); + + return {grad_x, grad_weight, grad_bias}; +} + __host__ __device__ void puff_advantage_row_cuda(float* values, float* rewards, float* dones, float* importance, float* advantages, float gamma, float lambda, float rho_clip, float c_clip, int horizon) { diff --git a/pufferlib/extensions/pufferlib.cpp b/pufferlib/extensions/pufferlib.cpp index a20d58bc2..8dba39ded 100644 --- a/pufferlib/extensions/pufferlib.cpp +++ b/pufferlib/extensions/pufferlib.cpp @@ -85,7 +85,7 @@ void compute_puff_advantage_cpu(torch::Tensor values, torch::Tensor rewards, } TORCH_LIBRARY(pufferlib, m) { - m.def("compute_puff_advantage(Tensor(a!) values, Tensor(b!) rewards, Tensor(c!) dones, Tensor(d!) importance, Tensor(e!) advantages, float gamma, float lambda, float rho_clip, float c_clip) -> ()"); + m.def("compute_puff_advantage(Tensor(a!) values, Tensor(b!) rewards, Tensor(c!) dones, Tensor(d!) importance, Tensor(e!) advantages, float gamma, float lambda, float rho_clip, float c_clip) -> ()"); } TORCH_LIBRARY_IMPL(pufferlib, CPU, m) { diff --git a/pufferlib/ocean/benchmark/evaluate_checkpoints.py b/pufferlib/ocean/benchmark/evaluate_checkpoints.py new file mode 100644 index 000000000..5b36360be --- /dev/null +++ b/pufferlib/ocean/benchmark/evaluate_checkpoints.py @@ -0,0 +1,409 @@ +""" +Comprehensive evaluation script for Drive environment checkpoints. + +Evaluates all .pt checkpoints in a folder using: +1. WOSAC metrics (realism, ADE, likelihood metrics) +2. Collision rates (SDC-only control mode) + +Includes baselines for ground truth and random policy. 
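+
+Example invocation (paths are illustrative):
+    python pufferlib/ocean/benchmark/evaluate_checkpoints.py ./checkpoints \
+        --output evaluation_results.csv --num-collision-episodes 100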
+""" + +import argparse +import os +import glob +import sys +from pathlib import Path +import numpy as np +import pandas as pd +import torch + +from pufferlib.pufferl import load_env, load_policy, load_config +from pufferlib.ocean.benchmark.evaluator import WOSACEvaluator +import pufferlib.pytorch + + +def evaluate_wosac(config, vecenv, policy, policy_name="policy"): + """Run WOSAC evaluation for a given policy.""" + print(f"Running WOSAC evaluation for {policy_name}...") + + evaluator = WOSACEvaluator(config) + + # Collect ground truth trajectories + gt_trajectories = evaluator.collect_ground_truth_trajectories(vecenv) + + # Collect simulated trajectories + simulated_trajectories = evaluator.collect_simulated_trajectories(config, vecenv, policy) + + # Optional sanity check + if config["wosac"].get("sanity_check", False): + evaluator._quick_sanity_check(gt_trajectories, simulated_trajectories) + + # Compute metrics + results = evaluator.compute_metrics(gt_trajectories, simulated_trajectories) + + return { + "policy": policy_name, + "ade": results["ade"].mean(), + "min_ade": results["min_ade"].mean(), + "likelihood_linear_speed": results["likelihood_linear_speed"].mean(), + "likelihood_linear_acceleration": results["likelihood_linear_acceleration"].mean(), + "likelihood_angular_speed": results["likelihood_angular_speed"].mean(), + "likelihood_angular_acceleration": results["likelihood_angular_acceleration"].mean(), + "realism_metametric": results["realism_metametric"].mean(), + } + + +def evaluate_wosac_ground_truth(config, vecenv): + """Run WOSAC evaluation using ground truth as the policy (perfect replay).""" + print("Running WOSAC evaluation for ground truth...") + + evaluator = WOSACEvaluator(config) + + # Collect ground truth trajectories + gt_trajectories = evaluator.collect_ground_truth_trajectories(vecenv) + + # Use ground truth as simulated (perfect replay) + # For this, we need to broadcast GT to match rollout dimensions + simulated_trajectories = {} + num_rollouts = config["wosac"]["num_rollouts"] + + for key in gt_trajectories: + if key in ["x", "y", "z", "heading"]: + # Broadcast ground truth across rollouts + gt_shape = gt_trajectories[key].shape + simulated_trajectories[key] = np.broadcast_to( + gt_trajectories[key], (gt_shape[0], num_rollouts, gt_shape[2]) + ).copy() + else: + simulated_trajectories[key] = gt_trajectories[key].copy() + + # Compute metrics + results = evaluator.compute_metrics(gt_trajectories, simulated_trajectories) + + return { + "policy": "ground_truth", + "ade": results["ade"].mean(), + "min_ade": results["min_ade"].mean(), + "likelihood_linear_speed": results["likelihood_linear_speed"].mean(), + "likelihood_linear_acceleration": results["likelihood_linear_acceleration"].mean(), + "likelihood_angular_speed": results["likelihood_angular_speed"].mean(), + "likelihood_angular_acceleration": results["likelihood_angular_acceleration"].mean(), + "realism_metametric": results["realism_metametric"].mean(), + } + + +def evaluate_wosac_random(config, vecenv, policy): + """Run WOSAC evaluation with a random policy.""" + print("Running WOSAC evaluation for random policy...") + + class RandomPolicy: + """Random action policy wrapper.""" + + def __init__(self, action_space): + self.action_space = action_space + + def forward_eval(self, obs, state): + batch_size = obs.shape[0] + # Generate random actions + if hasattr(self.action_space, "nvec"): # MultiDiscrete + actions = torch.tensor( + [[np.random.randint(0, n) for n in self.action_space.nvec] for _ in 
range(batch_size)], + dtype=torch.long, + ) + else: # Continuous + actions = torch.tensor(self.action_space.sample()).unsqueeze(0).repeat(batch_size, 1) + + # Return dummy values for value estimate + value = torch.zeros(batch_size, 1) + return actions, value + + random_policy = RandomPolicy(vecenv.single_action_space) + return evaluate_wosac(config, vecenv, random_policy, policy_name="random") + + +def evaluate_collision_rate(config, vecenv, policy, policy_name="policy", num_episodes=100): + """Evaluate collision rate in SDC-only control mode.""" + print(f"Running collision rate evaluation for {policy_name}...") + + device = config["train"]["device"] + num_agents = vecenv.num_agents + + # Initialize LSTM state if needed + state = {} + if config["train"]["use_rnn"]: + state = dict( + lstm_h=torch.zeros(num_agents, policy.hidden_size, device=device), + lstm_c=torch.zeros(num_agents, policy.hidden_size, device=device), + ) + + # Track collision statistics + total_steps = 0 + total_collision_count = 0 + total_collisions_per_agent = 0.0 + episodes_completed = 0 + + ob, info = vecenv.reset() + + while episodes_completed < num_episodes: + with torch.no_grad(): + ob_tensor = torch.as_tensor(ob).to(device) + + if hasattr(policy, "forward_eval"): + logits, value = policy.forward_eval(ob_tensor, state) + else: + # Random policy + if hasattr(vecenv.single_action_space, "nvec"): + action = np.array([[np.random.randint(0, n) for n in vecenv.single_action_space.nvec]]) + else: + action = vecenv.single_action_space.sample() + if isinstance(action, np.ndarray): + action = action.reshape(1, -1) + ob, rewards, terminals, truncations, info = vecenv.step(action) + total_steps += 1 + + # Check for episode completion + if terminals.any() or truncations.any(): + episodes_completed += 1 + # Extract metrics from final info (contains log data) + if info: + for info_dict in info: + if isinstance(info_dict, dict): + if "collision_rate" in info_dict: + total_collision_count += int(info_dict["collision_rate"]) + if "avg_collisions_per_agent" in info_dict: + total_collisions_per_agent += float(info_dict["avg_collisions_per_agent"]) + continue + + action, logprob, _ = pufferlib.pytorch.sample_logits(logits) + action = action.cpu().numpy().reshape(vecenv.action_space.shape) + + # Clip continuous actions + if isinstance(logits, torch.distributions.Normal): + action = np.clip(action, vecenv.action_space.low, vecenv.action_space.high) + + ob, rewards, terminals, truncations, info = vecenv.step(action) + total_steps += 1 + + # Check for episode completion + if terminals.any() or truncations.any(): + episodes_completed += 1 + # Extract metrics from final info (contains log data) + if info: + for info_dict in info: + if isinstance(info_dict, dict): + if "collision_rate" in info_dict: + total_collision_count += int(info_dict["collision_rate"]) + if "avg_collisions_per_agent" in info_dict: + total_collisions_per_agent += float(info_dict["avg_collisions_per_agent"]) + + # Calculate rates + collision_rate = total_collision_count / num_episodes if num_episodes > 0 else 0 + avg_collisions_per_agent = total_collisions_per_agent / num_episodes if num_episodes > 0 else 0 + + return { + "policy": policy_name, + "num_episodes": num_episodes, + "total_steps": total_steps, + "collision_rate": collision_rate, + "avg_collisions_per_agent": avg_collisions_per_agent, + } + + +def evaluate_collision_rate_random(config, vecenv, num_episodes=100): + """Evaluate collision rate with random policy.""" + print("Running collision rate evaluation for 
random policy...") + + class RandomPolicy: + """Dummy random policy.""" + + pass + + random_policy = RandomPolicy() + return evaluate_collision_rate(config, vecenv, random_policy, policy_name="random", num_episodes=num_episodes) + + +def main(): + parser = argparse.ArgumentParser( + description="Evaluate Drive environment checkpoints", formatter_class=argparse.RawDescriptionHelpFormatter + ) + parser.add_argument("checkpoint_dir", type=str, help="Directory containing .pt checkpoint files") + parser.add_argument( + "--output", + type=str, + default="evaluation_results.csv", + help="Output CSV file path (default: evaluation_results.csv)", + ) + parser.add_argument( + "--num-collision-episodes", + type=int, + default=100, + help="Number of episodes for collision rate evaluation (default: 100)", + ) + parser.add_argument("--skip-wosac", action="store_true", help="Skip WOSAC evaluation") + parser.add_argument("--skip-collision", action="store_true", help="Skip collision rate evaluation") + args = parser.parse_args() + + # Hardcode environment name + env_name = "puffer_drive" + + # Find all checkpoint files + checkpoint_dir = Path(args.checkpoint_dir) + checkpoint_files = sorted(glob.glob(str(checkpoint_dir / "*.pt"))) + + if not checkpoint_files: + print(f"No .pt files found in {checkpoint_dir}") + return + + print(f"Found {len(checkpoint_files)} checkpoint files") + + # Temporarily modify sys.argv to prevent load_config from parsing our arguments + original_argv = sys.argv.copy() + sys.argv = [sys.argv[0]] # Keep only script name + + try: + config = load_config(env_name) + finally: + sys.argv = original_argv + + # Results storage + wosac_results = [] + collision_results = [] + + # === WOSAC Evaluation === + if not args.skip_wosac: + print("\n" + "=" * 80) + print("STARTING WOSAC EVALUATION") + print("=" * 80) + + # Configure for WOSAC + config["wosac"]["enabled"] = True + config["vec"]["backend"] = "PufferEnv" + config["vec"]["num_envs"] = 1 + config["env"]["num_agents"] = config["wosac"]["num_total_wosac_agents"] + config["env"]["init_mode"] = config["wosac"]["init_mode"] + config["env"]["control_mode"] = config["wosac"]["control_mode"] + config["env"]["init_steps"] = config["wosac"]["init_steps"] + config["env"]["goal_behavior"] = config["wosac"]["goal_behavior"] + config["env"]["goal_radius"] = config["wosac"]["goal_radius"] + + # Create environment for WOSAC + vecenv_wosac = load_env(env_name, config) + + # Evaluate ground truth + try: + gt_result = evaluate_wosac_ground_truth(config, vecenv_wosac) + wosac_results.append(gt_result) + except Exception as e: + print(f"Error evaluating ground truth: {e}") + + # Evaluate random policy + try: + policy_template = load_policy(config, vecenv_wosac, env_name) + random_result = evaluate_wosac_random(config, vecenv_wosac, policy_template) + wosac_results.append(random_result) + except Exception as e: + print(f"Error evaluating random policy: {e}") + + # Evaluate each checkpoint + for checkpoint_path in checkpoint_files: + checkpoint_name = Path(checkpoint_path).stem + print(f"\nEvaluating checkpoint: {checkpoint_name}") + + try: + # Load policy + policy = load_policy(config, vecenv_wosac, env_name) + state_dict = torch.load(checkpoint_path, map_location=config["train"]["device"]) + state_dict = {k.replace("module.", ""): v for k, v in state_dict.items()} + policy.load_state_dict(state_dict) + + # Evaluate + result = evaluate_wosac(config, vecenv_wosac, policy, policy_name=checkpoint_name) + wosac_results.append(result) + + except Exception as e: + 
print(f"Error evaluating {checkpoint_name}: {e}") + + vecenv_wosac.close() + + # === Collision Rate Evaluation === + if not args.skip_collision: + print("\n" + "=" * 80) + print("STARTING COLLISION RATE EVALUATION") + print("=" * 80) + + # Configure for SDC-only collision evaluation + config["vec"]["backend"] = "PufferEnv" + config["vec"]["num_envs"] = 1 + config["env"]["num_agents"] = 1 + config["env"]["control_mode"] = "control_sdc_only" + config["env"]["init_mode"] = "create_all_valid" + config["env"]["init_steps"] = 10 + + # Create environment for collision evaluation + vecenv_human_replay = load_env(env_name, config) + + # Evaluate random policy + try: + random_collision_result = evaluate_collision_rate_random( + config, vecenv_human_replay, num_episodes=args.num_collision_episodes + ) + collision_results.append(random_collision_result) + except Exception as e: + print(f"Error evaluating random policy collision rate: {e}") + + # Evaluate each checkpoint + for checkpoint_path in checkpoint_files: + checkpoint_name = Path(checkpoint_path).stem + print(f"\nEvaluating checkpoint collision rate: {checkpoint_name}") + + try: + # Load policy + policy = load_policy(config, vecenv_human_replay, env_name) + state_dict = torch.load(checkpoint_path, map_location=config["train"]["device"]) + state_dict = {k.replace("module.", ""): v for k, v in state_dict.items()} + policy.load_state_dict(state_dict) + + # Evaluate + result = evaluate_collision_rate( + config, + vecenv_human_replay, + policy, + policy_name=checkpoint_name, + num_episodes=args.num_collision_episodes, + ) + collision_results.append(result) + + except Exception as e: + print(f"Error evaluating {checkpoint_name} collision rate: {e}") + + vecenv_human_replay.close() + + # === Save Results === + print("\n" + "=" * 80) + print("SAVING RESULTS") + print("=" * 80) + + # Create DataFrames + if wosac_results: + df_wosac = pd.DataFrame(wosac_results) + wosac_output = args.output.replace(".csv", "_wosac.csv") + df_wosac.to_csv(wosac_output, index=False) + print(f"\nWOSAC results saved to {wosac_output}") + print("\nWOSAC Results Summary:") + print(df_wosac.to_string(index=False)) + + if collision_results: + df_collision = pd.DataFrame(collision_results) + collision_output = args.output.replace(".csv", "_collision.csv") + df_collision.to_csv(collision_output, index=False) + print(f"\nCollision results saved to {collision_output}") + print("\nCollision Results Summary:") + print(df_collision.to_string(index=False)) + + print("\n" + "=" * 80) + print("EVALUATION COMPLETE") + print("=" * 80) + + +if __name__ == "__main__": + main() diff --git a/pufferlib/ocean/benchmark/evaluator.py b/pufferlib/ocean/benchmark/evaluator.py index 003bfad27..a83869edc 100644 --- a/pufferlib/ocean/benchmark/evaluator.py +++ b/pufferlib/ocean/benchmark/evaluator.py @@ -7,6 +7,7 @@ import matplotlib.pyplot as plt import configparser import os +from tqdm import tqdm import pufferlib from pufferlib.ocean.benchmark import metrics @@ -40,6 +41,7 @@ def __init__(self, config: Dict): wosac_metrics_path = os.path.join(os.path.dirname(__file__), "wosac.ini") self.metrics_config = configparser.ConfigParser() self.metrics_config.read(wosac_metrics_path) + self.mode = "rl" def _compute_metametric(self, metrics: pd.Series) -> float: metametric = 0.0 @@ -67,8 +69,15 @@ def collect_ground_truth_trajectories(self, puffer_env): """ return puffer_env.get_ground_truth_trajectories() - def collect_simulated_trajectories(self, args, puffer_env, policy): + def 
collect_simulated_trajectories(self, args, puffer_env, policy=None, actions=None): """Roll out policy in env and collect trajectories. + Args: + args: configuration dictionary + puffer_env: PufferDrive environment + policy: policy to evaluate (if None, actions must be provided) + actions: actions to step the agent (if policy is None). Currently only works with discrete actions and the + classic dynamics model. Shape: [time, num_agents, 1] + Returns: trajectories: dict with keys 'x', 'y', 'z', 'heading' each of shape (num_agents, num_rollouts, num_steps) @@ -86,11 +95,13 @@ def collect_simulated_trajectories(self, args, puffer_env, policy): "id": np.zeros((num_agents, self.num_rollouts, self.sim_steps), dtype=np.int32), } - for rollout_idx in range(self.num_rollouts): + for rollout_idx in tqdm(range(self.num_rollouts), desc="Collecting rollouts", colour="blue"): print(f"\rCollecting rollout {rollout_idx + 1}/{self.num_rollouts}...", end="", flush=True) + obs, info = puffer_env.reset() state = {} - if args["train"]["use_rnn"]: + + if args["train"]["use_rnn"] and policy is not None: state = dict( lstm_h=torch.zeros(num_agents, policy.hidden_size, device=device), lstm_c=torch.zeros(num_agents, policy.hidden_size, device=device), @@ -106,14 +117,34 @@ def collect_simulated_trajectories(self, args, puffer_env, policy): trajectories["id"][:, rollout_idx, time_idx] = agent_state["id"] # Step policy - with torch.no_grad(): - ob_tensor = torch.as_tensor(obs).to(device) - logits, value = policy.forward_eval(ob_tensor, state) - action, logprob, _ = pufferlib.pytorch.sample_logits(logits) - action_np = action.cpu().numpy().reshape(puffer_env.action_space.shape) - - if isinstance(logits, torch.distributions.Normal): - action_np = np.clip(action_np, puffer_env.action_space.low, puffer_env.action_space.high) + if policy is None and actions is not None: + human_act_time_index = self.init_steps + time_idx + action_np = actions[human_act_time_index, :].copy() + + # Replace invalid actions (-1) with "do nothing" action + # For discrete actions: use action 45 (accel=0, steer=0) + # For continuous actions: use [0.0, 0.0] + invalid_mask = action_np == -1.0 + + if puffer_env.action_space.__class__.__name__ == "MultiDiscrete": + # Discrete action space + action_np[invalid_mask] = 45 # Do nothing action + + elif policy is not None: + if self.mode == "bc_policy": + with torch.no_grad(): + ob_tensor = torch.as_tensor(obs).to(device) + pred_action = policy(ob_tensor, deterministic=True) + action_np = pred_action.cpu().numpy().reshape(puffer_env.action_space.shape) + else: + with torch.no_grad(): + ob_tensor = torch.as_tensor(obs).to(device) + logits, value = policy.forward_eval(ob_tensor, state) + action, logprob, _ = pufferlib.pytorch.sample_logits(logits) + action_np = action.cpu().numpy().reshape(puffer_env.action_space.shape) + + if isinstance(logits, torch.distributions.Normal): + action_np = np.clip(action_np, puffer_env.action_space.low, puffer_env.action_space.high) obs, _, _, _, _ = puffer_env.step(action_np) @@ -192,6 +223,8 @@ def compute_metrics( "Agent IDs don't match between simulated and ground truth trajectories" ) + print("Computing metrics...") + eval_mask = ground_truth_trajectories["id"][:, 0] >= 0 # Extract trajectories @@ -488,52 +521,54 @@ def compute_metrics( } ) - scene_level_results = df.groupby("scenario_id")[ + # Aggregate along agent dimenision: Obtain one score per scenario + df_scene_level = df.groupby("scenario_id").mean().drop(columns=["agent_id"]).dropna() + + # Exponentiate the 
averaged log-likelihoods to get final likelihoods + likelihood_columns = [col for col in df_scene_level.columns if col.startswith("likelihood_")] + df_scene_level[likelihood_columns] = np.exp(df_scene_level[likelihood_columns]) + + df_scene_level["realism_meta_score"] = df_scene_level.apply(self._compute_metametric, axis=1) + df_scene_level["num_agents_per_scene"] = df.groupby("scenario_id").size() + df_scene_level = df_scene_level.round(3) + + # Get group summary metrics + kinematic_metrics = np.mean( + [ + df_scene_level["likelihood_linear_speed"], + df_scene_level["likelihood_linear_acceleration"], + df_scene_level["likelihood_angular_speed"], + df_scene_level["likelihood_angular_acceleration"], + ] + ) + + interactive_metrics = np.mean( [ - "ade", - "min_ade", - "num_collisions_sim", - "num_collisions_ref", - "num_offroad_sim", - "num_offroad_ref", - "likelihood_linear_speed", - "likelihood_linear_acceleration", - "likelihood_angular_speed", - "likelihood_angular_acceleration", - "likelihood_distance_to_nearest_object", - "likelihood_time_to_collision", - "likelihood_collision_indication", - "likelihood_distance_to_road_edge", - "likelihood_offroad_indication", + df_scene_level["likelihood_collision_indication"], + df_scene_level["likelihood_distance_to_nearest_object"], + df_scene_level["likelihood_time_to_collision"], ] - ].mean() + ) - # Transform log-likelihoods to positive scores: - likelihood_cols = [c for c in scene_level_results.columns if "likelihood" in c] - scene_level_results[likelihood_cols] = np.exp(scene_level_results[likelihood_cols]) + map_metrics = np.mean( + [ + df_scene_level["likelihood_distance_to_road_edge"], + df_scene_level["likelihood_offroad_indication"], + ] + ) - scene_level_results["realism_meta_score"] = scene_level_results.apply(self._compute_metametric, axis=1) - scene_level_results["num_agents"] = df.groupby("scenario_id").size() - scene_level_results = scene_level_results[ - ["num_agents"] + [col for col in scene_level_results.columns if col != "num_agents"] - ] + df_scene_level["kinematic_metrics"] = kinematic_metrics + df_scene_level["interactive_metrics"] = interactive_metrics + df_scene_level["map_based_metrics"] = map_metrics if aggregate_results: - aggregate_metrics = scene_level_results.mean().to_dict() - aggregate_metrics["total_num_agents"] = scene_level_results["num_agents"].sum() - # Convert numpy types to Python native types - return {k: v.item() if hasattr(v, "item") else v for k, v in aggregate_metrics.items()} + # Aggregate over scenarios + aggregate_metrics = df_scene_level.mean().to_dict() + aggregate_metrics["total_num_agents"] = df_scene_level["num_agents_per_scene"].sum() + aggregate_metrics["realism_score_std"] = df_scene_level["realism_meta_score"].std() + return aggregate_metrics else: - print("\n Scene-level results:\n") - print(scene_level_results) - - print(f"\n Overall realism meta score: {scene_level_results['realism_meta_score'].mean():.4f}") - print(f"\n Overall minADE: {scene_level_results['min_ade'].mean():.4f}") - print(f"\n Overall ADE: {scene_level_results['ade'].mean():.4f}") - - # print(f"\n Full agent-level results:\n") - # print(df) - return scene_level_results + return df_scene_level def _quick_sanity_check(self, gt_trajectories, simulated_trajectories, agent_idx=None, max_agents_to_plot=10): if agent_idx is None: diff --git a/pufferlib/ocean/benchmark/metrics_sanity_check.py b/pufferlib/ocean/benchmark/metrics_sanity_check.py index 74d8596f8..f957ad455 100644 --- 
a/pufferlib/ocean/benchmark/metrics_sanity_check.py +++ b/pufferlib/ocean/benchmark/metrics_sanity_check.py @@ -1,14 +1,22 @@ """ -Validation script for WOSAC log-likelihood metrics. +Comprehensive evaluation script for Drive environment checkpoints. -Idea is to check how the log-likelihood metrics change as we replace -increasing numbers of random rollouts with ground-truth data. +Evaluates all .pt checkpoints in a folder using: +1. WOSAC metrics (realism, ADE, likelihood metrics) +2. Collision rates (SDC-only control mode) + +Includes baselines for ground truth and random policy. """ import argparse +import os +import glob +from pathlib import Path import numpy as np +import pandas as pd +import torch -from pufferlib.pufferl import load_config, load_env, load_policy +from pufferlib.pufferl import load_env, load_policy from pufferlib.ocean.benchmark.evaluator import WOSACEvaluator @@ -29,6 +37,7 @@ def replace_rollouts_with_gt(simulated_traj, gt_traj, num_replacements): def run_validation_experiment(config, vecenv): evaluator = WOSACEvaluator(config) + # Collect ground truth trajectories gt_trajectories = evaluator.collect_ground_truth_trajectories(vecenv) simulated_trajectories = evaluator.collect_wosac_random_baseline(vecenv) agent_state = vecenv.driver_env.get_global_agent_state() @@ -79,9 +88,24 @@ def format_results_table(results): def main(): - parser = argparse.ArgumentParser(description="Validate WOSAC log-likelihood metrics") - parser.add_argument("--env", default="puffer_drive") - parser.add_argument("--config", default="config/ocean/drive.ini") + parser = argparse.ArgumentParser( + description="Evaluate Drive environment checkpoints", formatter_class=argparse.RawDescriptionHelpFormatter + ) + parser.add_argument("checkpoint_dir", type=str, help="Directory containing .pt checkpoint files") + parser.add_argument( + "--output", + type=str, + default="evaluation_results.csv", + help="Output CSV file path (default: evaluation_results.csv)", + ) + parser.add_argument( + "--num-collision-episodes", + type=int, + default=100, + help="Number of episodes for collision rate evaluation (default: 100)", + ) + parser.add_argument("--skip-wosac", action="store_true", help="Skip WOSAC evaluation") + parser.add_argument("--skip-collision", action="store_true", help="Skip collision rate evaluation") args = parser.parse_args() config = load_config(args.env) diff --git a/pufferlib/ocean/drive/binding.c b/pufferlib/ocean/drive/binding.c index e8abb8f2b..e0eb38b91 100644 --- a/pufferlib/ocean/drive/binding.c +++ b/pufferlib/ocean/drive/binding.c @@ -193,6 +193,11 @@ static int my_init(Env *env, PyObject *args, PyObject *kwargs) { env->reward_goal_post_respawn = conf.reward_goal_post_respawn; env->episode_length = conf.episode_length; env->termination_mode = conf.termination_mode; + env->use_guided_autonomy = conf.use_guided_autonomy; + env->guidance_speed_weight = conf.guidance_speed_weight; + env->guidance_heading_weight = conf.guidance_heading_weight; + env->waypoint_reach_threshold = conf.waypoint_reach_threshold; + env->goal_radius = conf.goal_radius; env->collision_behavior = conf.collision_behavior; env->offroad_behavior = conf.offroad_behavior; env->max_controlled_agents = unpack(kwargs, "max_controlled_agents"); diff --git a/pufferlib/ocean/drive/drive.h b/pufferlib/ocean/drive/drive.h index 4df3bf021..df0600ff5 100644 --- a/pufferlib/ocean/drive/drive.h +++ b/pufferlib/ocean/drive/drive.h @@ -110,9 +110,16 @@ static const float JERK_LONG[4] = {-15.0f, -4.0f, 0.0f, 4.0f}; static const float 
JERK_LAT[3] = {-4.0f, 0.0f, 4.0f}; // Classic action space (for CLASSIC dynamics model) -static const float ACCELERATION_VALUES[7] = {-4.0000f, -2.6670f, -1.3330f, -0.0000f, 1.3330f, 2.6670f, 4.0000f}; -static const float STEERING_VALUES[13] = {-1.000f, -0.833f, -0.667f, -0.500f, -0.333f, -0.167f, 0.000f, - 0.167f, 0.333f, 0.500f, 0.667f, 0.833f, 1.000f}; +#define NUM_ACCEL_BINS 7 // 21 +#define ACCEL_MIN -4.0f +#define ACCEL_MAX 4.0f + +#define NUM_STEER_BINS 13 // 31 +#define STEER_MIN -1.0f // radians +#define STEER_MAX 1.0f + +static float ACCELERATION_VALUES[NUM_ACCEL_BINS]; +static float STEERING_VALUES[NUM_STEER_BINS]; static const float offsets[4][2] = { {-1, 1}, // top-left @@ -179,6 +186,8 @@ struct Entity { float *traj_vz; float *traj_heading; int *traj_valid; + float *expert_accel; + float *expert_steering; float width; float length; float height; @@ -218,6 +227,12 @@ struct Entity { float jerk_lat; float steering_angle; float wheelbase; + + // Guided autonomy tracking + int *waypoints_hit; // Boolean array: 1 if waypoint reached, 0 otherwise + int waypoints_hit_count; // Total waypoints reached so far + int total_valid_waypoints; // Total valid waypoints in trajectory + float route_progress; // Percentage of route completed (0.0 to 1.0) }; void free_entity(Entity *entity) { @@ -230,6 +245,9 @@ void free_entity(Entity *entity) { free(entity->traj_vz); free(entity->traj_heading); free(entity->traj_valid); + free(entity->waypoints_hit); + free(entity->expert_accel); + free(entity->expert_steering); } // Utility functions @@ -253,6 +271,14 @@ float clip(float value, float min, float max) { return value; } +float normalize_heading(float heading) { + if (heading > M_PI) + heading -= 2 * M_PI; + if (heading < -M_PI) + heading += 2 * M_PI; + return heading; +} + typedef struct GridMapEntity GridMapEntity; struct GridMapEntity { int entity_idx; @@ -283,6 +309,7 @@ struct Drive { float *actions; float *rewards; unsigned char *terminals; + unsigned char *truncations; Log log; Log *logs; int num_agents; @@ -308,6 +335,10 @@ struct Drive { int termination_mode; float reward_vehicle_collision; float reward_offroad_collision; + int use_guided_autonomy; // Boolean: whether to calculate and add guided autonomy reward + float guidance_speed_weight; // Weight for speed deviation penalty + float guidance_heading_weight; // Weight for heading deviation penalty + float waypoint_reach_threshold; // Distance threshold for hitting waypoints (e.g., 2.0m) char *map_name; float world_mean_x; float world_mean_y; @@ -381,6 +412,148 @@ void add_log(Drive *env) { } } +void init_action_space() { + + float accel_step = (ACCEL_MAX - ACCEL_MIN) / (NUM_ACCEL_BINS - 1); + for (int i = 0; i < NUM_ACCEL_BINS; i++) { + ACCELERATION_VALUES[i] = ACCEL_MIN + i * accel_step; + } + + float steer_step = (STEER_MAX - STEER_MIN) / (NUM_STEER_BINS - 1); + for (int i = 0; i < NUM_STEER_BINS; i++) { + STEERING_VALUES[i] = STEER_MIN + i * steer_step; + } + + // printf("Acceleration values: "); + // for (int i = 0; i < NUM_ACCEL_BINS; i++) { + // printf("%.3f ", ACCELERATION_VALUES[i]); + // } + // printf("\nSteering values: "); + // for (int i = 0; i < NUM_STEER_BINS; i++) { + // printf("%.3f ", STEERING_VALUES[i]); + // } + // printf("\n"); +} + +// Guided autonomy helper functions +int is_waypoint_within_reach(Entity *agent, int traj_idx, float threshold) { + if (traj_idx >= agent->array_size || !agent->traj_valid[traj_idx]) { + return 0; + } + + float ref_x = agent->traj_x[traj_idx]; + float ref_y = agent->traj_y[traj_idx]; + 
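+    // Logged positions equal to INVALID_POSITION act as a sentinel for unusable samples;
+    // such waypoints are never counted as reached.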
+ if (ref_x == INVALID_POSITION || ref_y == INVALID_POSITION) { + return 0; + } + + float dx = agent->x - ref_x; + float dy = agent->y - ref_y; + float distance = sqrtf(dx * dx + dy * dy); + + return distance < threshold; +} + +float compute_route_guidance_reward(Drive *env, Entity *agent, int timestep) { + float reward = 0.0f; + int new_hits = 0; + + // Check all waypoints from current timestep to end of trajectory + for (int i = timestep; i < agent->array_size; i++) { + // Skip if already hit or invalid + if (agent->waypoints_hit[i] || !agent->traj_valid[i]) { + continue; + } + + // Check if within reach + if (is_waypoint_within_reach(agent, i, env->waypoint_reach_threshold)) { + agent->waypoints_hit[i] = 1; + agent->waypoints_hit_count++; + new_hits++; + } + } + + // Compute route progress + if (agent->total_valid_waypoints > 0) { + agent->route_progress = (float)agent->waypoints_hit_count / agent->total_valid_waypoints; + } + + // Scale reward by number of valid waypoints to normalize + if (agent->total_valid_waypoints > 0 && new_hits > 0) { + reward = (float)new_hits / agent->total_valid_waypoints; + reward = fminf(reward, 0.1f); // Cap at 0.1 per step + } + + return reward; +} + +float compute_speed_guidance_reward(Entity *agent, int timestep, float weight) { + if (timestep >= agent->array_size || !agent->traj_valid[timestep]) { + return 0.0f; + } + + // Get reference speed at current timestep + float ref_vx = agent->traj_vx[timestep]; + float ref_vy = agent->traj_vy[timestep]; + float ref_speed = sqrtf(ref_vx * ref_vx + ref_vy * ref_vy); + + // Get actual speed + float actual_speed = sqrtf(agent->vx * agent->vx + agent->vy * agent->vy); + + // Compute squared error + float speed_error = ref_speed - actual_speed; + float speed_error_sq = speed_error * speed_error; + + // Exponential penalty: 1.0 - exp(-error²) + float penalty = 1.0f - expf(-speed_error_sq); + + return -weight * penalty; +} + +float compute_heading_guidance_reward(Entity *agent, int timestep, float weight) { + if (timestep >= agent->array_size || !agent->traj_valid[timestep]) { + return 0.0f; + } + + // Get reference heading at current timestep + float ref_heading = agent->traj_heading[timestep]; + + // Get actual heading + float actual_heading = agent->heading; + + // Compute heading error (accounting for angle wrapping)[-π, π] + float heading_error = normalize_heading(ref_heading - actual_heading); + + float heading_error_sq = heading_error * heading_error; + + // Exponential penalty: 1.0 - exp(-error²) + float penalty = 1.0f - expf(-heading_error_sq); + + return -weight * penalty; +} + +float compute_guided_autonomy_reward(Drive *env, int agent_idx, int active_idx) { + Entity *agent = &env->entities[agent_idx]; + int timestep = env->timestep; + + float total_reward = 0.0f; + + // 1. Route guidance (waypoint hitting) + float route_reward = compute_route_guidance_reward(env, agent, timestep); + total_reward += route_reward; + + // 2. Speed guidance + float speed_reward = compute_speed_guidance_reward(agent, timestep, env->guidance_speed_weight); + total_reward += speed_reward; + + // 3. 
Heading guidance + float heading_reward = compute_heading_guidance_reward(agent, timestep, env->guidance_heading_weight); + total_reward += heading_reward; + + return total_reward; +} + Entity *load_map_binary(const char *filename, Drive *env) { FILE *file = fopen(filename, "rb"); if (!file) @@ -405,6 +578,7 @@ Entity *load_map_binary(const char *filename, Drive *env) { fread(&env->num_roads, sizeof(int), 1, file); env->num_entities = env->num_objects + env->num_roads; Entity *entities = (Entity *)malloc(env->num_entities * sizeof(Entity)); + for (int i = 0; i < env->num_entities; i++) { // Read base entity data fread(&entities[i].scenario_id, sizeof(int), 1, file); @@ -424,6 +598,13 @@ Entity *load_map_binary(const char *filename, Drive *env) { entities[i].traj_vz = (float *)malloc(size * sizeof(float)); entities[i].traj_heading = (float *)malloc(size * sizeof(float)); entities[i].traj_valid = (int *)malloc(size * sizeof(int)); + entities[i].expert_accel = (float *)malloc(size * sizeof(float)); + entities[i].expert_steering = (float *)malloc(size * sizeof(float)); + // Allocate waypoints_hit array for guided autonomy + entities[i].waypoints_hit = (int *)calloc(size, sizeof(int)); + entities[i].waypoints_hit_count = 0; + entities[i].total_valid_waypoints = 0; + entities[i].route_progress = 0.0f; } else { // Roads don't use these arrays entities[i].traj_vx = NULL; @@ -431,6 +612,12 @@ Entity *load_map_binary(const char *filename, Drive *env) { entities[i].traj_vz = NULL; entities[i].traj_heading = NULL; entities[i].traj_valid = NULL; + entities[i].expert_accel = NULL; + entities[i].expert_steering = NULL; + entities[i].waypoints_hit = NULL; + entities[i].waypoints_hit_count = 0; + entities[i].total_valid_waypoints = 0; + entities[i].route_progress = 0.0f; } // Read array data fread(entities[i].traj_x, sizeof(float), size, file); @@ -443,6 +630,14 @@ Entity *load_map_binary(const char *filename, Drive *env) { fread(entities[i].traj_vz, sizeof(float), size, file); fread(entities[i].traj_heading, sizeof(float), size, file); fread(entities[i].traj_valid, sizeof(int), size, file); + fread(entities[i].expert_accel, sizeof(float), size, file); + fread(entities[i].expert_steering, sizeof(float), size, file); + // Count total valid waypoints for guided autonomy + for (int j = 0; j < size; j++) { + if (entities[i].traj_valid[j]) { + entities[i].total_valid_waypoints++; + } + } } // Read remaining scalar fields fread(&entities[i].width, sizeof(float), 1, file); @@ -513,7 +708,15 @@ void set_start_position(Drive *env) { e->jerk_lat = 0.0f; e->steering_angle = 0.0f; e->wheelbase = 0.6f * e->length; + + // Reset guided autonomy tracking + if (e->waypoints_hit != NULL) { + memset(e->waypoints_hit, 0, e->array_size * sizeof(int)); + } + e->waypoints_hit_count = 0; + e->route_progress = 0.0f; } + // EndDrawing(); } int getGridIndex(Drive *env, float x1, float y1) { @@ -1408,6 +1611,7 @@ void init_goal_positions(Drive *env) { void init(Drive *env) { env->human_agent_idx = 0; env->timestep = 0; + init_action_space(); env->entities = load_map_binary(env->map_name, env); set_means(env); init_grid_map(env); @@ -1458,6 +1662,7 @@ void allocate(Drive *env) { env->actions = (float *)calloc(env->active_agent_count * 2, sizeof(float)); env->rewards = (float *)calloc(env->active_agent_count, sizeof(float)); env->terminals = (unsigned char *)calloc(env->active_agent_count, sizeof(unsigned char)); + env->truncations = (unsigned char *)calloc(env->active_agent_count, sizeof(unsigned char)); } void free_allocated(Drive 
*env) { @@ -1465,6 +1670,7 @@ void free_allocated(Drive *env) { free(env->actions); free(env->rewards); free(env->terminals); + free(env->truncations); c_close(env); } @@ -1477,14 +1683,6 @@ float clipSpeed(float speed) { return speed; } -float normalize_heading(float heading) { - if (heading > M_PI) - heading -= 2 * M_PI; - if (heading < -M_PI) - heading += 2 * M_PI; - return heading; -} - float normalize_value(float value, float min, float max) { return (value - min) / (max - min); } void move_dynamics(Drive *env, int action_idx, int agent_idx) { @@ -1508,12 +1706,12 @@ void move_dynamics(Drive *env, int action_idx, int agent_idx) { acceleration = action_array_f[action_idx][0]; steering = action_array_f[action_idx][1]; - acceleration *= ACCELERATION_VALUES[6]; - steering *= STEERING_VALUES[12]; + acceleration *= ACCEL_MAX; + steering *= STEER_MAX; } else { // discrete // Interpret action as a single integer: a = accel_idx * num_steer + steer_idx int *action_array = (int *)env->actions; - int num_steer = sizeof(STEERING_VALUES) / sizeof(STEERING_VALUES[0]); + int num_steer = NUM_STEER_BINS; int action_val = action_array[action_idx]; int acceleration_index = action_val / num_steer; int steering_index = action_val % num_steer; @@ -1549,7 +1747,7 @@ void move_dynamics(Drive *env, int action_idx, int agent_idx) { // Update position x = x + (new_vx * env->dt); y = y + (new_vy * env->dt); - heading = heading + yaw_rate * env->dt; + heading = normalize_heading(heading + yaw_rate * env->dt); // Apply updates to the agent's state agent->x = x; @@ -2017,24 +2215,9 @@ void respawn_agent(Drive *env, int agent_idx) { void c_step(Drive *env) { memset(env->rewards, 0, env->active_agent_count * sizeof(float)); memset(env->terminals, 0, env->active_agent_count * sizeof(unsigned char)); + memset(env->truncations, 0, env->active_agent_count * sizeof(unsigned char)); env->timestep++; - int originals_remaining = 0; - for (int i = 0; i < env->active_agent_count; i++) { - int agent_idx = env->active_agent_indices[i]; - // Keep flag true if there is at least one agent that has not been respawned yet - if (env->entities[agent_idx].respawn_count == 0) { - originals_remaining = 1; - break; - } - } - - if (env->timestep == env->episode_length || (!originals_remaining && env->termination_mode == 1)) { - add_log(env); - c_reset(env); - return; - } - // Move static experts for (int i = 0; i < env->expert_static_agent_count; i++) { int expert_idx = env->expert_static_agent_indices[i]; @@ -2053,14 +2236,14 @@ void c_step(Drive *env) { move_dynamics(env, i, agent_idx); - // Tiny jerk penalty for smoothness - if (env->dynamics_model == CLASSIC) { - float delta_vx = env->entities[agent_idx].vx - prev_vx; - float delta_vy = env->entities[agent_idx].vy - prev_vy; - float jerk_penalty = -0.0002f * sqrtf(delta_vx * delta_vx + delta_vy * delta_vy) / env->dt; - env->rewards[i] += jerk_penalty; - env->logs[i].episode_return += jerk_penalty; - } + // // Tiny jerk penalty for smoothness + // if (env->dynamics_model == CLASSIC) { + // float delta_vx = env->entities[agent_idx].vx - prev_vx; + // float delta_vy = env->entities[agent_idx].vy - prev_vy; + // float jerk_penalty = -0.0002f * sqrtf(delta_vx * delta_vx + delta_vy * delta_vy) / env->dt; + // env->rewards[i] += jerk_penalty; + // env->logs[i].episode_return += jerk_penalty; + // } } // Compute rewards @@ -2122,6 +2305,13 @@ void c_step(Drive *env) { int lane_aligned = env->entities[agent_idx].metrics_array[LANE_ALIGNED_IDX]; env->logs[i].lane_alignment_rate = lane_aligned; + + 
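+        // When enabled, guided autonomy adds three shaping terms to the base reward
+        // (helpers defined earlier in this file): a route term equal to the fraction of
+        // newly reached logged waypoints (capped at 0.1 per step), plus speed and heading
+        // terms of the form -weight * (1 - exp(-error^2)) measured against the logged
+        // trajectory at the current timestep.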
// Apply guided autonomy reward + if (env->use_guided_autonomy) { + float ga_reward = compute_guided_autonomy_reward(env, agent_idx, i); + env->rewards[i] += ga_reward; + env->logs[i].episode_return += ga_reward; + } } if (env->goal_behavior == GOAL_RESPAWN) { @@ -2129,6 +2319,7 @@ void c_step(Drive *env) { int agent_idx = env->active_agent_indices[i]; int reached_goal = env->entities[agent_idx].metrics_array[REACHED_GOAL_IDX]; if (reached_goal) { + env->terminals[i] = 1; respawn_agent(env, agent_idx); env->entities[agent_idx].respawn_count++; } @@ -2144,9 +2335,140 @@ void c_step(Drive *env) { } } + // Episode boundary after this step: treat time-limit and early-termination as truncation. + // `timestep` is incremented at step start, so truncate when `(timestep + 1) >= episode_length`. + int originals_remaining = 0; + for (int i = 0; i < env->active_agent_count; i++) { + int agent_idx = env->active_agent_indices[i]; + if (env->entities[agent_idx].respawn_count == 0) { + originals_remaining = 1; + break; + } + } + int reached_time_limit = (env->timestep + 1) >= env->episode_length; + int reached_early_termination = (!originals_remaining && env->termination_mode == 1); + if (reached_time_limit || reached_early_termination) { + for (int i = 0; i < env->active_agent_count; i++) { + env->truncations[i] = 1; + } + add_log(env); + c_reset(env); + return; + } + compute_observations(env); } +void c_collect_expert_data(Drive *env, float *expert_actions_discrete_out, float *expert_actions_continuous_out, + float *expert_obs_out) { + + int ego_dim = (env->dynamics_model == JERK) ? 10 : 7; + int max_obs = ego_dim + 7 * (MAX_AGENTS - 1) + 7 * MAX_ROAD_SEGMENT_OBSERVATIONS; + + int original_timestep = env->timestep; + + c_reset(env); + + for (int t = 0; t < TRAJECTORY_LENGTH; t++) { + // Get the current observations + int obs_offset = t * env->active_agent_count * max_obs; + memcpy(&expert_obs_out[obs_offset], env->observations, env->active_agent_count * max_obs * sizeof(float)); + + // Now set expert actions for this timestep + for (int i = 0; i < env->active_agent_count; i++) { + int agent_idx = env->active_agent_indices[i]; + Entity *agent = &env->entities[agent_idx]; + + // Check bounds and validity + bool is_valid = (t < agent->array_size && agent->expert_accel && agent->expert_steering && + agent->expert_accel[t] != -1.0f && agent->expert_steering[t] != -1.0f); + + if (is_valid) { + float continuous_accel = agent->expert_accel[t]; + float continuous_steer = agent->expert_steering[t]; + + // Store continuous actions + int continuous_offset = t * env->active_agent_count * 2 + i * 2; + expert_actions_continuous_out[continuous_offset] = continuous_accel; + expert_actions_continuous_out[continuous_offset + 1] = continuous_steer; + + // Discretize acceleration - find closest value in ACCELERATION_VALUES + int best_accel_idx = 0; + float min_accel_diff = fabsf(continuous_accel - ACCELERATION_VALUES[0]); + for (int j = 1; j < NUM_ACCEL_BINS; j++) { + float diff = fabsf(continuous_accel - ACCELERATION_VALUES[j]); + if (diff < min_accel_diff) { + min_accel_diff = diff; + best_accel_idx = j; + } + } + + // Discretize steering - find closest value in STEERING_VALUES + int best_steer_idx = 0; + float min_steer_diff = fabsf(continuous_steer - STEERING_VALUES[0]); + for (int j = 1; j < NUM_STEER_BINS; j++) { + float diff = fabsf(continuous_steer - STEERING_VALUES[j]); + if (diff < min_steer_diff) { + min_steer_diff = diff; + best_steer_idx = j; + } + } + + // Compute joint discrete action: action = accel_idx * 
num_steer + steer_idx + int joint_action = best_accel_idx * NUM_STEER_BINS + best_steer_idx; + + // Store joint discrete action + int discrete_offset = t * env->active_agent_count + i; + expert_actions_discrete_out[discrete_offset] = (float)joint_action; + + // Apply the expert actions to env->actions so that c_step will use them + if (env->action_type == 1) { // continuous + float (*action_array_f)[2] = (float (*)[2])env->actions; + action_array_f[i][0] = continuous_accel / ACCEL_MAX; + action_array_f[i][1] = continuous_steer / STEER_MAX; // Normalize + } else { + int *action_array = (int *)env->actions; + action_array[i] = joint_action; + + // printf("Timestep %d, Agent %d: VALID - Accel: %.3f, Steer: %.3f, Discrete: %d\n", t, i, + // continuous_accel, continuous_steer, joint_action); + } + } else { + // Invalid action: store -1.0 as placeholder + int continuous_offset = t * env->active_agent_count * 2 + i * 2; + expert_actions_continuous_out[continuous_offset] = -1.0f; + expert_actions_continuous_out[continuous_offset + 1] = -1.0f; + + int discrete_offset = t * env->active_agent_count + i; + expert_actions_discrete_out[discrete_offset] = -1.0f; + + // Apply "do nothing" action (zero acceleration and steering) + if (env->action_type == 1) { // continuous + float (*action_array_f)[2] = (float (*)[2])env->actions; + action_array_f[i][0] = 0.0f; // No acceleration + action_array_f[i][1] = 0.0f; // No steering + } else { // discrete + int *action_array = (int *)env->actions; + // Apply "do nothing" action - middle acceleration (index 10) and straight steering (index 15) + int do_nothing_accel_idx = NUM_ACCEL_BINS / 2; // Middle of range (should be 0 accel) + int do_nothing_steer_idx = NUM_STEER_BINS / 2; // Middle of range (should be 0 steer) + int do_nothing_action = do_nothing_accel_idx * NUM_STEER_BINS + do_nothing_steer_idx; + action_array[i] = do_nothing_action; + } + } + } + + // Step environment to get next observations + if (t < TRAJECTORY_LENGTH - 1) { + c_step(env); + } + } + + // Restore original state + env->timestep = original_timestep; + c_reset(env); +} + typedef struct Client Client; struct Client { float width; @@ -2828,9 +3150,8 @@ void c_render(Drive *env) { int action_val = action_array[env->human_agent_idx]; if (env->dynamics_model == CLASSIC) { - int num_steer = 13; - int accel_idx = action_val / num_steer; - int steer_idx = action_val % num_steer; + int accel_idx = action_val / NUM_STEER_BINS; + int steer_idx = action_val % NUM_STEER_BINS; float accel_value = ACCELERATION_VALUES[accel_idx]; float steer_value = STEERING_VALUES[steer_idx]; diff --git a/pufferlib/ocean/drive/drive.py b/pufferlib/ocean/drive/drive.py index e85eb7322..e39f37376 100644 --- a/pufferlib/ocean/drive/drive.py +++ b/pufferlib/ocean/drive/drive.py @@ -3,6 +3,8 @@ import json import struct import os +import math +import torch import pufferlib from pufferlib.ocean.drive import binding from multiprocessing import Pool, cpu_count @@ -10,6 +12,8 @@ class Drive(pufferlib.PufferEnv): + _human_data_prepped = False + def __init__( self, render_mode=None, @@ -21,6 +25,10 @@ def __init__( reward_offroad_collision=-0.1, reward_goal=1.0, reward_goal_post_respawn=0.5, + use_guided_autonomy=0, + guidance_speed_weight=0.0, + guidance_heading_weight=0.0, + waypoint_reach_threshold=2.0, goal_behavior=0, goal_target_distance=10.0, goal_radius=2.0, @@ -38,11 +46,17 @@ def __init__( max_controlled_agents=-1, buf=None, seed=1, + bptt_horizon=32, + human_data_dir="pufferlib/resources/drive/human_demonstrations", + 
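+        # Human demonstration settings: each expert sequence covers bptt_horizon consecutive steps.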
max_expert_sequences=128, + prep_human_data=True, init_steps=0, init_mode="create_all_valid", control_mode="control_vehicles", map_dir="resources/drive/binaries/training", sequential_map_sampling=False, + ini_file_path="pufferlib/config/ocean/drive.ini", + save_data_to_disk=True, ): # env self.dt = dt @@ -59,11 +73,17 @@ def __init__( self.goal_target_distance = goal_target_distance self.collision_behavior = collision_behavior self.offroad_behavior = offroad_behavior + self.use_guided_autonomy = use_guided_autonomy + self.guidance_speed_weight = guidance_speed_weight + self.guidance_heading_weight = guidance_heading_weight + self.waypoint_reach_threshold = waypoint_reach_threshold self.human_agent_idx = human_agent_idx self.episode_length = episode_length self.termination_mode = termination_mode self.resample_frequency = resample_frequency self.dynamics_model = dynamics_model + self.ini_file_path = ini_file_path + self.save_data_to_disk = save_data_to_disk # Observation space calculation self.ego_features = {"classic": binding.EGO_FEATURES_CLASSIC, "jerk": binding.EGO_FEATURES_JERK}.get( @@ -87,6 +107,8 @@ def __init__( self.init_steps = init_steps self.init_mode_str = init_mode self.control_mode_str = control_mode + self.prep_human_data = prep_human_data + self.max_expert_sequences = int(max_expert_sequences) self.map_dir = map_dir if self.control_mode_str == "control_vehicles": @@ -113,7 +135,8 @@ def __init__( if action_type == "discrete": if dynamics_model == "classic": # Joint action space (assume dependence) - self.single_action_space = gymnasium.spaces.MultiDiscrete([7 * 13]) + self.joint_action_space_size = binding.NUM_ACCEL_BINS * binding.NUM_STEER_BINS + self.single_action_space = gymnasium.spaces.MultiDiscrete([self.joint_action_space_size]) # Multi discrete (assume independence) # self.single_action_space = gymnasium.spaces.MultiDiscrete([7, 13]) elif dynamics_model == "jerk": @@ -180,6 +203,10 @@ def __init__( reward_offroad_collision=reward_offroad_collision, reward_goal=reward_goal, reward_goal_post_respawn=reward_goal_post_respawn, + use_guided_autonomy=use_guided_autonomy, + guidance_speed_weight=guidance_speed_weight, + guidance_heading_weight=guidance_heading_weight, + waypoint_reach_threshold=waypoint_reach_threshold, goal_radius=goal_radius, goal_speed=goal_speed, goal_behavior=self.goal_behavior, @@ -192,7 +219,7 @@ def __init__( max_controlled_agents=self.max_controlled_agents, map_id=map_ids[i], max_agents=nxt - cur, - ini_file="pufferlib/config/ocean/drive.ini", + ini_file=self.ini_file_path, init_steps=init_steps, init_mode=self.init_mode, control_mode=self.control_mode, @@ -202,13 +229,40 @@ def __init__( self.c_envs = binding.vectorize(*env_ids) + # Human data preparation + self.bptt_horizon = bptt_horizon + self.human_data_dir = human_data_dir + self.save_data_to_disk = save_data_to_disk + + self.expert_data_metrics = {} + + os.makedirs(self.human_data_dir, exist_ok=True) + + if self.prep_human_data and not Drive._human_data_prepped: + self.expert_data_metrics = self._prep_human_data(bptt_horizon) + Drive._human_data_prepped = True + elif self.prep_human_data: + if self.save_data_to_disk and os.path.exists( + os.path.join(self.human_data_dir, f"expert_actions_discrete_h{bptt_horizon}.pt") + ): + discrete_actions = torch.load( + os.path.join(self.human_data_dir, f"expert_actions_discrete_h{bptt_horizon}.pt"), + map_location="cpu", + weights_only=False, + ) + self._cache_size = len(discrete_actions) + else: + self._cache_size = 0 + def reset(self, seed=0): 
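+ """Reset all environments; clears the tick counter and truncation flags."""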
binding.vec_reset(self.c_envs, seed) self.tick = 0 + self.truncations[:] = 0 return self.observations, [] def step(self, actions): self.terminals[:] = 0 + self.truncations[:] = 0 self.actions[:] = actions binding.vec_step(self.c_envs) self.tick += 1 @@ -255,6 +309,10 @@ def step(self, actions): reward_offroad_collision=self.reward_offroad_collision, reward_goal=self.reward_goal, reward_goal_post_respawn=self.reward_goal_post_respawn, + use_guided_autonomy=self.use_guided_autonomy, + guidance_speed_weight=self.guidance_speed_weight, + guidance_heading_weight=self.guidance_heading_weight, + waypoint_reach_threshold=self.waypoint_reach_threshold, goal_radius=self.goal_radius, goal_behavior=self.goal_behavior, goal_target_distance=self.goal_target_distance, @@ -266,7 +324,7 @@ def step(self, actions): max_controlled_agents=self.max_controlled_agents, map_id=map_ids[i], max_agents=nxt - cur, - ini_file="pufferlib/config/ocean/drive.ini", + ini_file=self.ini_file_path, init_steps=self.init_steps, init_mode=self.init_mode, control_mode=self.control_mode, @@ -276,7 +334,18 @@ def step(self, actions): self.c_envs = binding.vectorize(*env_ids) binding.vec_reset(self.c_envs, seed) - self.terminals[:] = 1 + # Map resampling is an external reset boundary (dataset/map switch). Treat as truncation. + self.truncations[:] = 1 + + # Resample human data if needed and capture metrics + if self.prep_human_data and hasattr(self, "_needs_resampling") and self._needs_resampling: + resample_metrics = self.resample_human_data() + # Add resampling metrics to info for logging + if resample_metrics: + # Prefix with "resampled_" to distinguish from initial metrics + resample_metrics = {f"resampled_{k}": v for k, v in resample_metrics.items()} + info.append(resample_metrics) + return (self.observations, self.rewards, self.terminals, self.truncations, info) def get_global_agent_state(self): @@ -347,6 +416,185 @@ def get_ground_truth_trajectories(self): return trajectories + def resample_human_data(self): + """Resample human data when needed (called during environment resampling). + + Returns: + dict: Metrics about the expert data collection for logging + """ + if not hasattr(self, "_needs_resampling") or not self._needs_resampling: + return {} + + print( + f"Resampling human data (max_expert_sequences={self.max_expert_sequences} > " + f"available={self._total_available_sequences})..." + ) + + # Re-prepare human data with the new environment state + return self._prep_human_data(self.bptt_horizon) + + def _prep_human_data(self, bptt_horizon=32): + """Collect and save expert trajectories with bptt_horizon length sequences. + + Returns: + dict: Metrics about the expert data collection for logging + """ + trajectory_length = 91 + + if self.dynamics_model == "jerk": + raise NotImplementedError( + "Expert data collection is not yet implemented for jerk dynamics model. " + "Only classic dynamics model is currently supported." + ) + + # Collect human trajectories. 
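+ # Buffers filled in-place by binding.vec_collect_expert_data:
+ #   expert_actions_discrete:   (T=91, N, 1) joint index = accel_idx * NUM_STEER_BINS + steer_idx
+ #   expert_actions_continuous: (T=91, N, 2) raw (acceleration, steering)
+ #   expert_observations_full:  (T=91, N, num_obs)
+ # Timesteps with no valid expert action are marked -1.0 and excluded from the windows below.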
+ self.expert_actions_discrete = np.zeros((trajectory_length, self.num_agents, 1), dtype=np.float32) + self.expert_actions_continuous = np.zeros((trajectory_length, self.num_agents, 2), dtype=np.float32) + self.expert_observations_full = np.zeros((trajectory_length, self.num_agents, self.num_obs), dtype=np.float32) + + binding.vec_collect_expert_data( + self.c_envs, self.expert_actions_discrete, self.expert_actions_continuous, self.expert_observations_full + ) + + # Extract all valid sequences of length bptt_horizon from all trajectories + discrete_sequences_list = [] + continuous_sequences_list = [] + obs_sequences_list = [] + + # Track metrics + unique_agents = set() + total_sequences_extracted = 0 + + for agent_idx in range(self.num_agents): + # Extract all valid windows of length bptt_horizon for this agent + for start_t in range(trajectory_length - bptt_horizon + 1): + end_t = start_t + bptt_horizon + + # Check if this window is entirely valid (no -1 values) + window_valid = np.all(self.expert_actions_discrete[start_t:end_t, agent_idx, 0] != -1.0) + + if window_valid: + discrete_sequences_list.append(self.expert_actions_discrete[start_t:end_t, agent_idx, :].copy()) + continuous_sequences_list.append(self.expert_actions_continuous[start_t:end_t, agent_idx, :].copy()) + obs_sequences_list.append(self.expert_observations_full[start_t:end_t, agent_idx, :].copy()) + unique_agents.add(agent_idx) + total_sequences_extracted += 1 + + # Convert lists to arrays + if len(discrete_sequences_list) == 0: + raise ValueError("No valid expert sequences found!") + + all_discrete = np.stack(discrete_sequences_list, axis=0) + all_continuous = np.stack(continuous_sequences_list, axis=0) + all_obs = np.stack(obs_sequences_list, axis=0) + + # Sample sequences (with replacement if max_expert_sequences > available) + num_sequences = self.max_expert_sequences + needs_resampling = num_sequences > len(discrete_sequences_list) + + # Randomly sample indices (with replacement if necessary) + sampled_indices = np.random.choice(len(discrete_sequences_list), size=num_sequences, replace=needs_resampling) + + discrete_sequences = all_discrete[sampled_indices] + continuous_sequences = all_continuous[sampled_indices] + obs_sequences = all_obs[sampled_indices] + + self._cache_size = num_sequences + self._total_available_sequences = len(discrete_sequences_list) + self._needs_resampling = needs_resampling + + # Compute statistics + avg_sequences_per_agent = total_sequences_extracted / len(unique_agents) if len(unique_agents) > 0 else 0 + coverage_pct = 100 * len(unique_agents) / self.num_agents if self.num_agents > 0 else 0 + utilization_pct = ( + 100 * min(num_sequences, total_sequences_extracted) / total_sequences_extracted + if total_sequences_extracted > 0 + else 0 + ) + + data_metrics = { + "expert_data/total_agents": self.num_agents, + "expert_data/unique_agents_with_data": len(unique_agents), + "expert_data/agent_coverage_pct": coverage_pct, + "expert_data/total_sequences_extracted": total_sequences_extracted, + "expert_data/avg_sequences_per_agent": avg_sequences_per_agent, + "expert_data/sequences_stored": num_sequences, + "expert_data/sequence_length": bptt_horizon, + "expert_data/storage_utilization_pct": utilization_pct, + "expert_data/resampling_enabled": int(needs_resampling), + "expert_data/resampling_factor": num_sequences / len(discrete_sequences_list) if needs_resampling else 1.0, + } + + if self.save_data_to_disk: + print( + f"Saving {num_sequences} expert sequences of length {bptt_horizon} to disk at 
{self.human_data_dir}..." + ) + torch.save( + torch.from_numpy(discrete_sequences), + os.path.join(self.human_data_dir, f"expert_actions_discrete_h{bptt_horizon}.pt"), + ) + torch.save( + torch.from_numpy(continuous_sequences), + os.path.join(self.human_data_dir, f"expert_actions_continuous_h{bptt_horizon}.pt"), + ) + torch.save( + torch.from_numpy(obs_sequences), + os.path.join(self.human_data_dir, f"expert_observations_h{bptt_horizon}.pt"), + ) + + return data_metrics + + def sample_expert_data(self, n_samples=512, return_both=False): + """Sample a random batch of human (expert) sequences from disk. + + Args: + n_samples: Number of sequences to randomly sample + return_both: If True, return both continuous and discrete actions as a tuple. + If False, return only the action type matching the environment's action space. + + Returns: + If return_both=True: + (discrete_actions, continuous_actions, observations) + If return_both=False: + (actions, observations) where actions match the env's action type + + Note: + For classic discrete actions, the shape is (n_samples, bptt_horizon, 1) for joint actions. + For continuous actions, the shape is always (n_samples, bptt_horizon, 2). + Jerk dynamics model is not currently supported for expert data. + """ + if not self.save_data_to_disk: + raise ValueError("Expert data was not saved to disk. Cannot sample expert data.") + + discrete_path = os.path.join(self.human_data_dir, f"expert_actions_discrete_h{self.bptt_horizon}.pt") + continuous_path = os.path.join(self.human_data_dir, f"expert_actions_continuous_h{self.bptt_horizon}.pt") + observations_path = os.path.join(self.human_data_dir, f"expert_observations_h{self.bptt_horizon}.pt") + + observations_full = torch.load(observations_path, map_location="cpu", weights_only=False) + + # breakpoint() + + # Sample indices + samples = min(n_samples, self._cache_size) + indices = torch.randint(0, self._cache_size, (samples,)) + + # print(f'Sampling {samples} expert sequences from {self._cache_size} available sequences.') + # print(indices) + + sampled_obs = observations_full[indices] + + if return_both: + discrete_actions = torch.load(discrete_path, map_location="cpu", weights_only=False) + continuous_actions = torch.load(continuous_path, map_location="cpu", weights_only=False) + return discrete_actions[indices], continuous_actions[indices], sampled_obs + else: + # Return only the action type matching the environment + if self._action_type_flag == 1: # continuous + actions = torch.load(continuous_path, map_location="cpu", weights_only=False) + else: # discrete + actions = torch.load(discrete_path, map_location="cpu", weights_only=False) + return actions[indices], sampled_obs + def get_road_edge_polylines(self): """Get road edge polylines for all scenarios. 
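For orientation, a minimal consumer-side sketch of the expert-data cache described above; this is not part of the patch, and it assumes a puffer_drive config with prep_human_data and save_data_to_disk enabled. It uses only calls that appear in this diff (load_config/load_env and driver_env.sample_expert_data).

from pufferlib.pufferl import load_config, load_env

# Build the vectorized env; _prep_human_data runs in __init__ and writes the
# per-horizon .pt caches under human_data_dir.
args = load_config("puffer_drive")
vecenv = load_env("puffer_drive", args)

# Sample expert sequences matching the env's action type. For classic discrete
# dynamics: actions (n, bptt_horizon, 1), observations (n, bptt_horizon, obs_dim).
actions, observations = vecenv.driver_env.sample_expert_data(n_samples=256)

# Flatten time into the batch for a simple feed-forward behavioral-cloning update.
obs_flat = observations.reshape(-1, observations.shape[-1])
act_flat = actions.reshape(-1).long()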
@@ -380,6 +628,92 @@ def close(self): binding.vec_close(self.c_envs) +def infer_human_actions(obj): + """Infer expert actions (steer, accel) using inverse bicycle model.""" + trajectory_length = 91 + + # Initialize expert actions arrays + expert_acceleration = [] + expert_steering = [] + + positions = obj.get("position", []) + velocities = obj.get("velocity", []) + headings = obj.get("heading", []) + valids = obj.get("valid", []) + + if len(positions) < 2 or len(velocities) < 2 or len(headings) < 2: + return [-1.0] * trajectory_length, [-1.0] * trajectory_length + + dt = 0.1 # Discretization + vehicle_length = obj.get("length", 4.5) # Default vehicle length + + for t in range(trajectory_length): + # Check if we have enough data and both current and next timesteps are valid + if ( + t >= len(positions) + or t >= len(velocities) + or t >= len(headings) + or t >= len(valids) + or not valids[t] + or t + 1 >= len(positions) + or t + 1 >= len(velocities) + or t + 1 >= len(headings) + or t + 1 >= len(valids) + or not valids[t + 1] + ): + expert_acceleration.append(-1.0) + expert_steering.append(-1.0) + continue + + # Current state + vel_t = velocities[t] + heading_t = headings[t] + speed_t = math.sqrt(vel_t.get("x", 0.0) ** 2 + vel_t.get("y", 0.0) ** 2) + + # Next state + vel_t1 = velocities[t + 1] + heading_t1 = headings[t + 1] + speed_t1 = math.sqrt(vel_t1.get("x", 0.0) ** 2 + vel_t1.get("y", 0.0) ** 2) + + # Compute acceleration + acceleration = (speed_t1 - speed_t) / dt + + # Normalize heading difference + heading_diff = heading_t1 - heading_t + while heading_diff > math.pi: + heading_diff -= 2 * math.pi + while heading_diff < -math.pi: + heading_diff += 2 * math.pi + + # Compute yaw rate + yaw_rate = heading_diff / dt + + # Compute steering using inverse bicycle model + steering = 0.0 + if speed_t > 0.1: # Avoid division by zero + # From bicycle model: yaw_rate = (v * cos(beta) * tan(delta)) / L + # Assuming beta ≈ 0: yaw_rate ≈ (v * tan(delta)) / L + tan_steering = (yaw_rate * vehicle_length) / speed_t + # Clamp tan_steering to avoid extreme values + tan_steering = max(-10.0, min(10.0, tan_steering)) + steering = math.atan(tan_steering) + + # Clamp values to reasonable ranges + acceleration = max(-10.0, min(10.0, acceleration)) + steering = max(-2.0, min(2.0, steering)) + + expert_acceleration.append(acceleration) + expert_steering.append(steering) + + # Ensure arrays are exactly trajectory_length (should already be, but just in case) + assert len(expert_acceleration) == trajectory_length, ( + f"Expected {trajectory_length}, got {len(expert_acceleration)}" + ) + assert len(expert_steering) == trajectory_length, f"Expected {trajectory_length}, got {len(expert_steering)}" + + return expert_acceleration, expert_steering + + def calculate_area(p1, p2, p3): # Calculate the area of the triangle using the determinant method return 0.5 * abs((p1["x"] - p3["x"]) * (p2["y"] - p1["y"]) - (p1["x"] - p2["x"]) * (p3["y"] - p1["y"])) @@ -507,6 +841,20 @@ def save_map_binary(map_data, output_file, unique_map_id): ) ) + # Infer and write human actions + if obj_type in [1, 2, 3]: # For vehicles, pedestrians, cyclists + human_accel, human_steering = infer_human_actions(obj) + + # print(f"Human Acceleration: {human_accel}") + # print(f"Human Steering: {human_steering}") + + f.write(struct.pack(f"{trajectory_length}f", *human_accel)) + f.write(struct.pack(f"{trajectory_length}f", *human_steering)) + else: + # Write zeros for non-vehicles + f.write(struct.pack(f"{trajectory_length}f", *[0.0] * 
trajectory_length)) + f.write(struct.pack(f"{trajectory_length}f", *[0.0] * trajectory_length)) + # Write scalar fields f.write(struct.pack("f", float(obj.get("width", 0.0)))) f.write(struct.pack("f", float(obj.get("length", 0.0)))) @@ -669,10 +1017,71 @@ def test_performance(timeout=10, atn_cache=1024, num_agents=1024): env.close() +def test_human_demonstrations(): + import matplotlib.pyplot as plt + from pufferlib.ocean.benchmark.evaluator import WOSACEvaluator + from pufferlib.pufferl import load_config, load_env + + args = load_config("puffer_drive") + backend = args["eval"]["backend"] + + args["env"]["map_dir"] = args["eval"]["map_dir"] + args["env"]["num_maps"] = args["eval"]["wosac_num_maps"] + args["env"]["sequential_map_sampling"] = True + dataset_name = args["env"]["map_dir"].split("/")[-1] + + args["vec"] = dict(backend=backend, num_envs=1) + args["env"]["init_mode"] = args["eval"]["wosac_init_mode"] + args["env"]["control_mode"] = args["eval"]["wosac_control_mode"] + args["env"]["init_steps"] = args["eval"]["wosac_init_steps"] + args["env"]["goal_behavior"] = args["eval"]["wosac_goal_behavior"] + args["env"]["goal_radius"] = args["eval"]["wosac_goal_radius"] + # args["env"]["num_agents"] = args["eval"]["wosac_num_agents"] + + vecenv = load_env("puffer_drive", args) + + exp_actions = np.zeros((91, vecenv.num_agents, 1), dtype=np.float32) + exp_actions_cont = np.zeros((91, vecenv.num_agents, 2), dtype=np.float32) + exp_observations_full = np.zeros((91, vecenv.num_agents, vecenv.num_obs), dtype=np.float32) + binding.vec_collect_expert_data(vecenv.c_envs, exp_actions, exp_actions_cont, exp_observations_full) + + evaluator = WOSACEvaluator(args) + + gt_trajectories = evaluator.collect_ground_truth_trajectories(vecenv) + + # Step with inferred human actions + simulated_trajectories = evaluator.collect_simulated_trajectories( + args=args, + puffer_env=vecenv, + policy=None, + actions=exp_actions, + ) + + evaluator._quick_sanity_check(gt_trajectories, simulated_trajectories) + + # Analyze and compute metrics + agent_state = vecenv.get_global_agent_state() + road_edge_polylines = vecenv.get_road_edge_polylines() + results = evaluator.compute_metrics( + gt_trajectories, + simulated_trajectories, + agent_state, + road_edge_polylines, + True, + ) + + print("\n", results["realism_meta_score"]) + + from pprint import pprint + + pprint(results) + + if __name__ == "__main__": + # test_human_demonstrations() # test_performance() # Process the train dataset - process_all_maps(data_folder="data/processed/training") + process_all_maps(data_folder="data/processed/wosac/validation_interactive_small") # Process the validation/test dataset # process_all_maps(data_folder="data/processed/validation") # # Process the validation_interactive dataset diff --git a/pufferlib/ocean/drive/drivenet.h b/pufferlib/ocean/drive/drivenet.h index 3082fa585..714cc0fc3 100644 --- a/pufferlib/ocean/drive/drivenet.h +++ b/pufferlib/ocean/drive/drivenet.h @@ -62,8 +62,8 @@ DriveNet *init_drivenet(Weights *weights, int num_agents, int dynamics_model) { int action_size, logit_sizes[2]; int action_dim; if (dynamics_model == CLASSIC) { - action_size = 7 * 13; // Joint action space - logit_sizes[0] = 7 * 13; + action_size = NUM_ACCEL_BINS * NUM_STEER_BINS; // Joint action space + logit_sizes[0] = NUM_ACCEL_BINS * NUM_STEER_BINS; action_dim = 1; } else { // JERK action_size = 4 * 3; // Joint action space (4 longitudinal × 3 lateral = 12) diff --git a/pufferlib/ocean/drive/visualize.c b/pufferlib/ocean/drive/visualize.c index 
de235b4fa..3a511a262 100644 --- a/pufferlib/ocean/drive/visualize.c +++ b/pufferlib/ocean/drive/visualize.c @@ -13,8 +13,13 @@ #include "drivenet.h" #include "libgen.h" #include "../env_config.h" + #define TRAJECTORY_LENGTH_DEFAULT 91 +#define ACTIONS_FROM_POLICY 0 +#define ACTIONS_INFERRED_FROM_HUMAN 1 +#define ACTIONS_RANDOM 2 + typedef struct { int pipefd[2]; pid_t pid; @@ -67,7 +72,7 @@ void CloseVideo(VideoRecorder *recorder) { void renderTopDownView(Drive *env, Client *client, int map_height, int obs, int lasers, int trajectories, int frame_count, float *path, int show_human_logs, int show_grid, int img_width, int img_height, - int zoom_in) { + int zoom_in, int control) { BeginDrawing(); // Top-down orthographic camera @@ -133,6 +138,26 @@ void renderTopDownView(Drive *env, Client *client, int map_height, int obs, int // Draw scene draw_scene(env, client, 1, obs, lasers, show_grid); EndMode3D(); + + // Draw control overlay text + const char *control_text; + Color text_color; + if (control == ACTIONS_INFERRED_FROM_HUMAN) { + control_text = "Control: Inferred human actions"; + text_color = LIGHTGREEN; + } else if (control == ACTIONS_RANDOM) { + control_text = "Control: Random actions"; + text_color = RED; + } else { + control_text = "Control: Policy actions"; + text_color = YELLOW; + } + + int text_width = MeasureText(control_text, 30); + int text_x = img_width - text_width - 20; + int text_y = 20; + + DrawText(control_text, text_x, text_y, 30, text_color); EndDrawing(); } @@ -189,9 +214,76 @@ static int make_gif_from_frames(const char *pattern, int fps, const char *palett return 0; } +static void set_actions(Drive *env, int timestep, int control, DriveNet *net) { + // Helper function to set actions based on control type + if (control == ACTIONS_INFERRED_FROM_HUMAN) { + // Use expert actions from trajectory data + for (int j = 0; j < env->active_agent_count; j++) { + int agent_idx = env->active_agent_indices[j]; + Entity *agent = &env->entities[agent_idx]; + + // Check bounds + if (timestep < agent->array_size && agent->expert_accel && agent->expert_steering) { + + // Check if expert actions are invalid (-1 means no action available) + if (agent->expert_accel[timestep] == -1.0f && agent->expert_steering[timestep] == -1.0f) { + // Do nothing - apply zero action + continue; + } + if (env->action_type == 1) { // continuous + float (*action_array_f)[2] = (float (*)[2])env->actions; + action_array_f[j][0] = agent->expert_accel[timestep] / ACCEL_MAX; + action_array_f[j][1] = agent->expert_steering[timestep] / STEER_MAX; + } else { // discrete + // Discretize to nearest action + int best_accel_idx = 0; + float min_accel_diff = fabsf(agent->expert_accel[timestep] - ACCELERATION_VALUES[0]); + for (int k = 1; k < NUM_ACCEL_BINS; k++) { + float diff = fabsf(agent->expert_accel[timestep] - ACCELERATION_VALUES[k]); + if (diff < min_accel_diff) { + min_accel_diff = diff; + best_accel_idx = k; + } + } + + int best_steer_idx = 0; + float min_steer_diff = fabsf(agent->expert_steering[timestep] - STEERING_VALUES[0]); + for (int k = 1; k < NUM_STEER_BINS; k++) { + float diff = fabsf(agent->expert_steering[timestep] - STEERING_VALUES[k]); + if (diff < min_steer_diff) { + min_steer_diff = diff; + best_steer_idx = k; + } + } + + int *action_array = (int *)env->actions; + action_array[j] = best_accel_idx * NUM_STEER_BINS + best_steer_idx; + } + } + } + } else if (control == ACTIONS_RANDOM) { + // Use random actions + for (int j = 0; j < env->active_agent_count; j++) { + if (env->action_type == 1) { // 
continuous + float (*action_array_f)[2] = (float (*)[2])env->actions; + // Random values in range [-1, 1] + action_array_f[j][0] = ((float)rand() / (float)RAND_MAX) * 2.0f - 1.0f; + action_array_f[j][1] = ((float)rand() / (float)RAND_MAX) * 2.0f - 1.0f; + } else { // discrete + int *action_array = (int *)env->actions; + // Random action index + action_array[j] = rand() % (NUM_ACCEL_BINS * NUM_STEER_BINS); + } + } + } else { + // Use policy by default (ACTIONS_FROM_POLICY) + forward(net, env->observations, (int *)env->actions); + } +} + int eval_gif(const char *map_name, const char *policy_name, int show_grid, int obs_only, int lasers, int show_human_logs, int frame_skip, const char *view_mode, const char *output_topdown, - const char *output_agent, int num_maps, int zoom_in) { + const char *output_agent, int num_maps, int zoom_in, int control) { // Parse configuration from INI file env_init_config conf = {0}; @@ -294,9 +386,14 @@ int eval_gif(const char *map_name, const char *policy_name, int show_grid, int o client->cyclist = LoadModel("resources/drive/cyclist.glb"); client->pedestrian = LoadModel("resources/drive/pedestrian.glb"); - Weights *weights = load_weights(policy_name); - printf("Active agents in map: %d\n", env.active_agent_count); - DriveNet *net = init_drivenet(weights, env.active_agent_count, env.dynamics_model); + Weights *weights = NULL; + DriveNet *net = NULL; + + if (control == ACTIONS_FROM_POLICY) { + weights = load_weights(policy_name); + printf("Active agents in map: %d\n", env.active_agent_count); + net = init_drivenet(weights, env.active_agent_count, env.dynamics_model); + } int frame_count = env.episode_length > 0 ? env.episode_length : TRAJECTORY_LENGTH_DEFAULT; char filename_topdown[256]; @@ -355,11 +452,11 @@ int eval_gif(const char *map_name, const char *policy_name, int show_grid, int o for (int i = 0; i < frame_count; i++) { if (i % frame_skip == 0) { renderTopDownView(&env, client, map_height, 0, 0, 0, frame_count, NULL, show_human_logs, show_grid, - img_width, img_height, zoom_in); + img_width, img_height, zoom_in, control); WriteFrame(&topdown_recorder, img_width, img_height); rendered_frames++; } - forward(net, env.observations, (int *)env.actions); + set_actions(&env, i, control, net); c_step(&env); } } @@ -377,7 +474,7 @@ int eval_gif(const char *map_name, const char *policy_name, int show_grid, int o WriteFrame(&agent_recorder, img_width, img_height); rendered_frames++; } - forward(net, env.observations, (int *)env.actions); + set_actions(&env, i, control, net); c_step(&env); } } @@ -399,8 +496,14 @@ int eval_gif(const char *map_name, const char *policy_name, int show_grid, int o free(client); free_allocated(&env); - free_drivenet(net); - free(weights); + + if (net != NULL) { + free_drivenet(net); + } + if (weights != NULL) { + free(weights); + } + return 0; } @@ -409,17 +512,18 @@ int main(int argc, char *argv[]) { int show_grid = 0; int obs_only = 0; int lasers = 0; - int show_human_logs = 0; + int show_human_logs = 1; int frame_skip = 1; - int zoom_in = 0; - const char *view_mode = "both"; + int zoom_in = 1; + const char *view_mode = "topdown"; // File paths and num_maps (not in [env] section) const char *map_name = NULL; const char *policy_name = "resources/drive/puffer_drive_weights.bin"; const char *output_topdown = NULL; const char *output_agent = NULL; - int num_maps = 1; + int num_maps = 100; + int control = ACTIONS_FROM_POLICY; // ACTIONS_INFERRED_FROM_HUMAN; // Parse command line arguments for (int i = 1; i < argc; i++) { @@ -429,7 +533,7 @@ 
int main(int argc, char *argv[]) { obs_only = 1; } else if (strcmp(argv[i], "--lasers") == 0) { lasers = 1; - } else if (strcmp(argv[i], "--log-trajectories") == 0) { + } else if (strcmp(argv[i], "--show-human-logs") == 0) { show_human_logs = 1; } else if (strcmp(argv[i], "--frame-skip") == 0) { if (i + 1 < argc) { @@ -485,10 +589,28 @@ int main(int argc, char *argv[]) { num_maps = atoi(argv[i + 1]); i++; } + } else if (strcmp(argv[i], "--control") == 0) { + if (i + 1 < argc) { + const char *control_str = argv[i + 1]; + i++; + if (strcmp(control_str, "policy") == 0) { + control = ACTIONS_FROM_POLICY; + } else if (strcmp(control_str, "inferred_human") == 0) { + control = ACTIONS_INFERRED_FROM_HUMAN; + } else if (strcmp(control_str, "random") == 0) { + control = ACTIONS_RANDOM; + } else { + fprintf(stderr, "Error: --control must be 'policy' or 'inferred_human'\n"); + return 1; + } + } else { + fprintf(stderr, "Error: --control option requires a value (policy/inferred_human)\n"); + return 1; + } } } eval_gif(map_name, policy_name, show_grid, obs_only, lasers, show_human_logs, frame_skip, view_mode, output_topdown, - output_agent, num_maps, zoom_in); + output_agent, num_maps, zoom_in, control); return 0; } diff --git a/pufferlib/ocean/env_binding.h b/pufferlib/ocean/env_binding.h index c6b766776..50a7d89c5 100644 --- a/pufferlib/ocean/env_binding.h +++ b/pufferlib/ocean/env_binding.h @@ -128,7 +128,7 @@ static PyObject *env_init(PyObject *self, PyObject *args, PyObject *kwargs) { PyErr_SetString(PyExc_ValueError, "Truncations must be 1D"); return NULL; } - // env->truncations = PyArray_DATA(truncations); + env->truncations = PyArray_DATA(truncations); PyObject *seed_arg = PyTuple_GetItem(args, 5); if (!PyObject_TypeCheck(seed_arg, &PyLong_Type)) { @@ -412,7 +412,7 @@ static PyObject *vec_init(PyObject *self, PyObject *args, PyObject *kwargs) { env->actions = (void *)((char *)PyArray_DATA(actions) + i * PyArray_STRIDE(actions, 0)); env->rewards = (void *)((char *)PyArray_DATA(rewards) + i * PyArray_STRIDE(rewards, 0)); env->terminals = (void *)((char *)PyArray_DATA(terminals) + i * PyArray_STRIDE(terminals, 0)); - // env->truncations = (void*)((char*)PyArray_DATA(truncations) + i*PyArray_STRIDE(truncations, 0)); + env->truncations = (void *)((char *)PyArray_DATA(truncations) + i * PyArray_STRIDE(truncations, 0)); // Assumes each process has the same number of environments int env_seed = i + seed * vec->num_envs; @@ -951,6 +951,111 @@ static char *unpack_str(PyObject *kwargs, char *key) { return ret; } +static PyObject *vec_collect_expert_data(PyObject *self, PyObject *args) { + if (PyTuple_Size(args) != 4) { + PyErr_SetString(PyExc_TypeError, "collect_expert_data requires 4 arguments"); + return NULL; + } + + VecEnv *vec = unpack_vecenv(args); + if (!vec) { + return NULL; + } + + // Argument 1: discrete actions + PyObject *discrete_actions_arg = PyTuple_GetItem(args, 1); + if (!PyObject_TypeCheck(discrete_actions_arg, &PyArray_Type)) { + PyErr_SetString(PyExc_TypeError, "expert_actions_discrete must be a NumPy array"); + return NULL; + } + PyArrayObject *expert_actions_discrete = (PyArrayObject *)discrete_actions_arg; + + // Argument 2: continuous actions + PyObject *continuous_actions_arg = PyTuple_GetItem(args, 2); + if (!PyObject_TypeCheck(continuous_actions_arg, &PyArray_Type)) { + PyErr_SetString(PyExc_TypeError, "expert_actions_continuous must be a NumPy array"); + return NULL; + } + PyArrayObject *expert_actions_continuous = (PyArrayObject *)continuous_actions_arg; + + // Argument 3: 
observations + PyObject *obs_arg = PyTuple_GetItem(args, 3); + if (!PyObject_TypeCheck(obs_arg, &PyArray_Type)) { + PyErr_SetString(PyExc_TypeError, "expert_observations must be a NumPy array"); + return NULL; + } + PyArrayObject *expert_observations = (PyArrayObject *)obs_arg; + + // Check array shapes + if (PyArray_NDIM(expert_actions_discrete) != 3) { + PyErr_SetString(PyExc_ValueError, "expert_actions_discrete must be 3D (trajectory_length, num_agents, 2)"); + return NULL; + } + if (PyArray_NDIM(expert_actions_continuous) != 3) { + PyErr_SetString(PyExc_ValueError, "expert_actions_continuous must be 3D (trajectory_length, num_agents, 2)"); + return NULL; + } + if (PyArray_NDIM(expert_observations) != 3) { + PyErr_SetString(PyExc_ValueError, "expert_observations must be 3D (trajectory_length, num_agents, obs_dim)"); + return NULL; + } + + int trajectory_length = PyArray_DIM(expert_actions_discrete, 0); + int max_obs = 7 + 7 * (MAX_AGENTS - 1) + 7 * MAX_ROAD_SEGMENT_OBSERVATIONS; + + // Process each environment + int agent_offset = 0; + for (int i = 0; i < vec->num_envs; i++) { + Env *env = vec->envs[i]; + int num_agents = env->active_agent_count; + + // Allocate temporary buffers for this environment + float *env_actions_discrete = (float *)malloc(trajectory_length * num_agents * 1 * sizeof(float)); + float *env_actions_continuous = (float *)malloc(trajectory_length * num_agents * 2 * sizeof(float)); + float *env_obs = (float *)malloc(trajectory_length * num_agents * max_obs * sizeof(float)); + + if (!env_actions_discrete || !env_actions_continuous || !env_obs) { + PyErr_SetString(PyExc_MemoryError, "Failed to allocate temporary buffers"); + free(env_actions_discrete); + free(env_actions_continuous); + free(env_obs); + return NULL; + } + + // Call C function with both action types + c_collect_expert_data(env, env_actions_discrete, env_actions_continuous, env_obs); + + // Copy results back to Python arrays + for (int t = 0; t < trajectory_length; t++) { + for (int a = 0; a < num_agents; a++) { + // Copy discrete actions + float *discrete_action_src = &env_actions_discrete[t * num_agents + a]; + float *discrete_action_dst = (float *)PyArray_GETPTR3(expert_actions_discrete, t, agent_offset + a, 0); + discrete_action_dst[0] = discrete_action_src[0]; + + // Copy continuous actions + float *continuous_action_src = &env_actions_continuous[t * num_agents * 2 + a * 2]; + float *continuous_action_dst = + (float *)PyArray_GETPTR3(expert_actions_continuous, t, agent_offset + a, 0); + continuous_action_dst[0] = continuous_action_src[0]; + continuous_action_dst[1] = continuous_action_src[1]; + + // Copy observations + float *obs_src = &env_obs[t * num_agents * max_obs + a * max_obs]; + float *obs_dst = (float *)PyArray_GETPTR3(expert_observations, t, agent_offset + a, 0); + memcpy(obs_dst, obs_src, max_obs * sizeof(float)); + } + } + + free(env_actions_discrete); + free(env_actions_continuous); + free(env_obs); + agent_offset += num_agents; + } + + Py_RETURN_NONE; +} + // Method table static PyMethodDef methods[] = { {"env_init", (PyCFunction)env_init, METH_VARARGS | METH_KEYWORDS, @@ -969,6 +1074,7 @@ static PyMethodDef methods[] = { {"vec_render", vec_render, METH_VARARGS, "Render the vector of environments"}, {"vec_close", vec_close, METH_VARARGS, "Close the vector of environments"}, {"shared", (PyCFunction)my_shared, METH_VARARGS | METH_KEYWORDS, "Shared state"}, + {"vec_collect_expert_data", vec_collect_expert_data, METH_VARARGS, "Collect expert actions and observations"}, 
{"get_global_agent_state", get_global_agent_state, METH_VARARGS, "Get global agent state"}, {"vec_get_global_agent_state", vec_get_global_agent_state, METH_VARARGS, "Get agent state from vectorized env"}, {"get_ground_truth_trajectories", get_ground_truth_trajectories, METH_VARARGS, "Get ground truth trajectories"}, @@ -993,6 +1099,8 @@ PyMODINIT_FUNC PyInit_binding(void) { } // Make constants accessible from Python + PyModule_AddIntConstant(m, "NUM_ACCEL_BINS", NUM_ACCEL_BINS); + PyModule_AddIntConstant(m, "NUM_STEER_BINS", NUM_STEER_BINS); PyModule_AddIntConstant(m, "MAX_ROAD_SEGMENT_OBSERVATIONS", MAX_ROAD_SEGMENT_OBSERVATIONS); PyModule_AddIntConstant(m, "MAX_AGENTS", MAX_AGENTS); PyModule_AddIntConstant(m, "TRAJECTORY_LENGTH", TRAJECTORY_LENGTH); diff --git a/pufferlib/ocean/env_config.h b/pufferlib/ocean/env_config.h index 71c19debb..6b7368e70 100644 --- a/pufferlib/ocean/env_config.h +++ b/pufferlib/ocean/env_config.h @@ -15,6 +15,10 @@ typedef struct { float reward_goal; float reward_goal_post_respawn; float reward_vehicle_collision_post_respawn; + int use_guided_autonomy; // Boolean: whether to calculate and add guided autonomy reward + float guidance_speed_weight; // Weight for speed deviation penalty + float guidance_heading_weight; // Weight for heading deviation penalty + float waypoint_reach_threshold; // Distance threshold for hitting waypoints float goal_radius; float goal_speed; int collision_behavior; @@ -68,6 +72,14 @@ static int handler(void *config, const char *section, const char *name, const ch env_config->reward_goal_post_respawn = atof(value); } else if (MATCH("env", "reward_vehicle_collision_post_respawn")) { env_config->reward_vehicle_collision_post_respawn = atof(value); + } else if (MATCH("env", "use_guided_autonomy")) { + env_config->use_guided_autonomy = atoi(value); + } else if (MATCH("env", "guidance_speed_weight")) { + env_config->guidance_speed_weight = atof(value); + } else if (MATCH("env", "guidance_heading_weight")) { + env_config->guidance_heading_weight = atof(value); + } else if (MATCH("env", "waypoint_reach_threshold")) { + env_config->waypoint_reach_threshold = atof(value); } else if (MATCH("env", "goal_radius")) { env_config->goal_radius = atof(value); } else if (MATCH("env", "goal_speed")) { diff --git a/pufferlib/ocean/torch.py b/pufferlib/ocean/torch.py index c7a6e6f04..98cc11a4e 100644 --- a/pufferlib/ocean/torch.py +++ b/pufferlib/ocean/torch.py @@ -8,7 +8,6 @@ from pufferlib.models import Default as Policy # noqa: F401 from pufferlib.models import Convolutional as Conv # noqa: F401 - Recurrent = pufferlib.models.LSTMWrapper diff --git a/pufferlib/pufferl.py b/pufferlib/pufferl.py index c797f3244..412828030 100644 --- a/pufferlib/pufferl.py +++ b/pufferlib/pufferl.py @@ -220,11 +220,18 @@ def __init__(self, config, vecenv, policy, logger=None): self.stats = defaultdict(list) self.last_stats = defaultdict(list) self.losses = {} + self.realism = {} # Dashboard self.model_size = sum(p.numel() for p in policy.parameters() if p.requires_grad) self.print_dashboard(clear=True) + if hasattr(vecenv, "driver_env") and hasattr(vecenv.driver_env, "expert_data_metrics"): + expert_metrics = vecenv.driver_env.expert_data_metrics + if expert_metrics: + # Log to wandb/neptune immediately at step 0 + self.logger.log(expert_metrics, step=0) + @property def uptime(self): return time.time() - self.start_time @@ -258,7 +265,6 @@ def evaluate(self): profile("eval_misc", epoch) env_id = slice(env_id[0], env_id[-1] + 1) - done_mask = d + t # TODO: Handle truncations 
separately self.global_step += int(mask.sum()) profile("eval_copy", epoch) @@ -266,12 +272,14 @@ def evaluate(self): o_device = o.to(device) # , non_blocking=True) r = torch.as_tensor(r).to(device) # , non_blocking=True) d = torch.as_tensor(d).to(device) # , non_blocking=True) + t = torch.as_tensor(t).to(device) # , non_blocking=True) + done_mask = (d + t).clamp(max=1) profile("eval_forward", epoch) with torch.no_grad(), self.amp_context: state = dict( reward=r, - done=d, + done=done_mask, env_id=env_id, mask=mask, ) @@ -301,8 +309,16 @@ def evaluate(self): self.actions[batch_rows, l] = action self.logprobs[batch_rows, l] = logprob + # Truncation bootstrap hack for auto-reset envs. + # Ideally we add `gamma * V(s_{t+1})` on truncation steps, but Drive resets in C so + # the value at index `l` is post-reset. We use `values[..., l-1]` as a heuristic + # proxy for the pre-reset terminal value (bootstrap term is not clipped). + if l > 0: + trunc_mask = (t > 0) & (d == 0) + r = r + trunc_mask.to(r.dtype) * config["gamma"] * self.values[batch_rows, l - 1] self.rewards[batch_rows, l] = r - self.terminals[batch_rows, l] = d.float() + self.terminals[batch_rows, l] = done_mask.float() + self.truncations[batch_rows, l] = t.float() self.values[batch_rows, l] = value.flatten() # Note: We are not yet handling masks in this version @@ -351,6 +367,7 @@ def train(self): a = config["prio_alpha"] clip_coef = config["clip_coef"] vf_clip = config["vf_clip_coef"] + human_clip = config["human_clip_coef"] anneal_beta = b0 + (1 - b0) * a * self.epoch / self.total_epochs self.ratio[:] = 1 @@ -413,6 +430,32 @@ def train(self): approx_kl = ((ratio - 1) - logratio).mean() clipfrac = ((ratio - 1.0).abs() > config["clip_coef"]).float().mean() + # Compute log likelihood loss of human actions under current policy. + # 1: Sample a batch of human actions and observations from dataset + # Shape: [n_sequences, bptt_horizon, feature_dim] + discrete_human_actions, continuous_human_actions, human_observations = ( + self.vecenv.driver_env.sample_expert_data(n_samples=config["human_sequences"], return_both=True) + ) + + use_continuous = self.vecenv.driver_env._action_type_flag == 1 + human_actions = continuous_human_actions if use_continuous else discrete_human_actions + human_actions = human_actions.reshape(-1, 1).to(device) + human_observations = human_observations.to(device) + + # 2: Compute the log-likelihood of human actions under the current policy, + # given the corresponding human observations. A higher likelihood indicates + # that the policy behaves more like a human under the same observations. + human_state = dict( + lstm_h=None, + lstm_c=None, + ) + + human_logits, _ = self.policy(human_observations, human_state) + + _, human_log_prob, human_entropy = pufferlib.pytorch.sample_logits( + logits=human_logits, action=human_actions + ) + adv = advantages[idx] adv = compute_puff_advantage( mb_values, @@ -439,9 +482,16 @@ def train(self): v_loss_clipped = (v_clipped - mb_returns) ** 2 v_loss = 0.5 * torch.max(v_loss_unclipped, v_loss_clipped).mean() + human_nll = human_log_prob.mean() + entropy_loss = entropy.mean() - loss = pg_loss + config["vf_coef"] * v_loss - config["ent_coef"] * entropy_loss + loss = ( + pg_loss + + config["vf_coef"] * v_loss + - config["ent_coef"] * entropy_loss + - config["human_ll_coef"] * human_nll + ) self.amp_context.__enter__() # TODO: AMP needs some debugging # This breaks vloss clipping? 
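To make the truncation-bootstrap comment in evaluate() above concrete, here is a small standalone sketch of the same idea with illustrative tensor names; it is not trainer code. On steps that are truncated but not terminated, the reward is augmented with gamma times a value estimate, and because the environment has already auto-reset, the value stored one step earlier stands in for V(s_{t+1}).

import torch

def bootstrap_truncated_rewards(rewards, values, truncations, terminals, gamma):
    # rewards, values, truncations, terminals: (num_envs, horizon) tensors.
    out = rewards.clone()
    for t in range(1, rewards.shape[1]):
        # Truncated but not terminated at step t.
        trunc_mask = (truncations[:, t] > 0) & (terminals[:, t] == 0)
        # Heuristic proxy for the pre-reset V(s_{t+1}): the value stored one step earlier.
        out[:, t] += trunc_mask.to(rewards.dtype) * gamma * values[:, t - 1]
    return out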
@@ -456,10 +506,14 @@ def train(self): losses["approx_kl"] += approx_kl.item() / self.total_minibatches losses["clipfrac"] += clipfrac.item() / self.total_minibatches losses["importance"] += ratio.mean().item() / self.total_minibatches + losses["human_nll"] += human_nll / self.total_minibatches + self.realism["human_log_prob"] = human_log_prob.mean().item() # Learn on accumulated minibatches profile("learn", epoch) + loss.backward() + if (mb + 1) % self.accumulate_minibatches == 0: torch.nn.utils.clip_grad_norm_(self.policy.parameters(), config["max_grad_norm"]) self.optimizer.step() @@ -489,38 +543,16 @@ def train(self): self.last_log_step = self.global_step profile.clear() - if self.epoch % config["checkpoint_interval"] == 0 or done_training: + if (self.epoch - 1) % config["checkpoint_interval"] == 0 or done_training: self.save_checkpoint() self.msg = f"Checkpoint saved at update {self.epoch}" - if self.render and self.epoch % self.render_interval == 0: - model_dir = os.path.join(self.config["data_dir"], f"{self.config['env']}_{self.logger.run_id}") - model_files = glob.glob(os.path.join(model_dir, "model_*.pt")) - - if model_files: - # Take the latest checkpoint - latest_cpt = max(model_files, key=os.path.getctime) - bin_path = f"{model_dir}.bin" - - # Export to .bin for rendering with raylib - try: - export_args = {"env_name": self.config["env"], "load_model_path": latest_cpt, **self.config} - - export( - args=export_args, - env_name=self.config["env"], - vecenv=self.vecenv, - policy=self.uncompiled_policy, - path=bin_path, - silent=True, - ) - pufferlib.utils.render_videos( - self.config, self.vecenv, self.logger, self.epoch, self.global_step, bin_path - ) - - except Exception as e: - print(f"Failed to export model weights: {e}") - + if self.render and (self.epoch - 1) % self.render_interval == 0: + bin_path = self._export_to_bin() + if bin_path: + pufferlib.utils.render_videos( + self.config, self.vecenv, self.logger, self.epoch, self.global_step, bin_path + ) if self.config["eval"]["wosac_realism_eval"] and ( (self.epoch - 1) % self.config["eval"]["eval_interval"] == 0 or done_training ): @@ -531,6 +563,9 @@ def train(self): ): pufferlib.utils.run_human_replay_eval_in_subprocess(self.config, self.logger, self.global_step) + if done_training: # Export latest checkpoint to .bin + self._export_to_bin() + def mean_and_log(self): config = self.config for k in list(self.stats.keys()): @@ -538,12 +573,18 @@ def mean_and_log(self): try: v = np.mean(v) except: - del self.stats[k] + # Keep single-value metrics (like resampled expert data metrics) + if isinstance(v, (int, float)): + pass + else: + del self.stats[k] + continue self.stats[k] = v device = config["device"] agent_steps = int(dist_sum(self.global_step, device)) + logs = { "SPS": dist_sum(self.sps, device), "agent_steps": agent_steps, @@ -558,6 +599,16 @@ def mean_and_log(self): # **{f'performance/{k}': dist_sum(v['elapsed'], device) for k, v in self.profile}, } + for k, v in self.realism.items(): + # if k.endswith("_histogram"): + # if hasattr(self.logger, "wandb") and self.logger.wandb: + # import wandb + + # metric_name = k.replace("_histogram", "") + # logs[f"eval/{metric_name}"] = wandb.Histogram(v) + # else: + logs[f"eval/{k}"] = v + if torch.distributed.is_initialized(): if torch.distributed.get_rank() != 0: self.logger.log(logs, agent_steps) @@ -601,12 +652,40 @@ def save_checkpoint(self): "update": self.epoch, "model_name": model_name, "run_id": run_id, + "max_expert_sequences": self.vecenv.driver_env.max_expert_sequences, + 
"bptt_horizon": self.config["bptt_horizon"], } state_path = os.path.join(path, "trainer_state.pt") torch.save(state, state_path + ".tmp") os.rename(state_path + ".tmp", state_path) return model_path + def _export_to_bin(self): + """Export latest checkpoint to .bin for rendering.""" + model_dir = os.path.join(self.config["data_dir"], f"{self.config['env']}_{self.logger.run_id}") + model_files = glob.glob(os.path.join(model_dir, "model_*.pt")) + + if not model_files: + return None + + latest_cpt = max(model_files, key=os.path.getctime) + bin_path = f"{model_dir}.bin" + + try: + export_args = {"env_name": self.config["env"], "load_model_path": latest_cpt, **self.config} + export( + args=export_args, + env_name=self.config["env"], + vecenv=self.vecenv, + policy=self.uncompiled_policy, + path=bin_path, + silent=True, + ) + return bin_path + except Exception as e: + print(f"Failed to export model weights: {e}") + return None + def print_dashboard(self, clear=False, idx=[0], c1="[cyan]", c2="[white]", b1="[bright_cyan]", b2="[bright_white]"): config = self.config sps = dist_sum(self.sps, config["device"]) @@ -800,8 +879,8 @@ def __call__(self, name, epoch, nest=False): if epoch % self.frequency != 0: return - # if torch.cuda.is_available(): - # torch.cuda.synchronize() + if torch.cuda.is_available(): + torch.cuda.synchronize() tick = time.time() if len(self.stack) != 0 and not nest: @@ -817,8 +896,8 @@ def pop(self, end): profile["delta"] += delta def end(self): - # if torch.cuda.is_available(): - # torch.cuda.synchronize() + if torch.cuda.is_available(): + torch.cuda.synchronize() end = time.time() for i in range(len(self.stack)): @@ -933,15 +1012,16 @@ class WandbLogger: def __init__(self, args, load_id=None, resume="allow"): import wandb + run_name = args.get("wandb_run_name", None) wandb.init( id=load_id or wandb.util.generate_id(), + name=run_name, project=args["wandb_project"], group=args["wandb_group"], allow_val_change=True, save_code=False, resume=resume, config=args, - name=args.get("wandb_name"), tags=[args["tag"]] if args["tag"] is not None else [], ) self.wandb = wandb @@ -1036,6 +1116,7 @@ def eval(env_name, args=None, vecenv=None, policy=None): """Evaluate a policy.""" args = args or load_config(env_name) + args["env"]["prep_human_data"] = False wosac_enabled = args["eval"]["wosac_realism_eval"] human_replay_enabled = args["eval"]["human_replay_eval"] @@ -1085,6 +1166,7 @@ def eval(env_name, args=None, vecenv=None, policy=None): road_edge_polylines, args["eval"]["wosac_aggregate_results"], ) + results = {k: v.item() if hasattr(v, "item") else v for k, v in results.items()} if args["eval"]["wosac_aggregate_results"]: import json @@ -1103,6 +1185,7 @@ def eval(env_name, args=None, vecenv=None, policy=None): args["vec"] = dict(backend=backend, num_envs=1) args["env"]["control_mode"] = args["eval"]["human_replay_control_mode"] args["env"]["episode_length"] = 91 # WOMD scenario length + args["env"]["num_maps"] = args["eval"]["human_replay_num_agents"] vecenv = vecenv or load_env(env_name, args) policy = policy or load_policy(args, vecenv, env_name) @@ -1197,7 +1280,7 @@ def sweep(args=None, env_name=None): points_per_run = args["sweep"]["downsample"] target_key = f"environment/{args['sweep']['metric']}" for i in range(args["max_runs"]): - seed = time.time_ns() & 0xFFFFFFFF + seed = 42 random.seed(seed) np.random.seed(seed) torch.manual_seed(seed) @@ -1255,6 +1338,11 @@ def controlled_exp(env_name, args=None): section, param = key.split(".") exp_args[section][param] = value + # Create 
descriptive name + run_name_parts = [f"{key.split('.')[-1]}={value}" for key, value in zip(keys, combo)] + exp_name = "_".join(run_name_parts) + exp_args["wandb_run_name"] = f"{exp_name}" + print(f"\nExperiment {i}/{len(combinations)}: {dict(zip(keys, combo))}") # Train diff --git a/pufferlib/resources/drive/binaries/map_000.bin b/pufferlib/resources/drive/binaries/map_000.bin index 0b71744f6..deea84b71 100644 Binary files a/pufferlib/resources/drive/binaries/map_000.bin and b/pufferlib/resources/drive/binaries/map_000.bin differ diff --git a/pufferlib/utils.py b/pufferlib/utils.py index 76bf4a5a0..6d9df13e4 100644 --- a/pufferlib/utils.py +++ b/pufferlib/utils.py @@ -112,7 +112,7 @@ def run_wosac_eval_in_subprocess(config, logger, global_step): "--eval.wosac-goal-behavior", str(eval_config.get("wosac_goal_behavior", 2)), "--eval.wosac-goal-radius", - str(eval_config.get("wosac_goal_radius", 2.0)), + str(eval_config.get("wosac_goal_radius", 1.0)), "--eval.wosac-sanity-check", str(eval_config.get("wosac_sanity_check", False)), "--eval.wosac-aggregate-results", @@ -141,9 +141,18 @@ def run_wosac_eval_in_subprocess(config, logger, global_step): if hasattr(logger, "wandb") and logger.wandb: logger.wandb.log( { - "eval/wosac_realism_meta_score": wosac_metrics["realism_meta_score"], + "eval/wosac_realism_meta_score_mean": wosac_metrics["realism_meta_score"], + "eval/wosac_realism_meta_score_std": wosac_metrics["realism_score_std"], "eval/wosac_ade": wosac_metrics["ade"], "eval/wosac_min_ade": wosac_metrics["min_ade"], + "eval/wosac_kinematic_metrics": wosac_metrics["kinematic_metrics"], + "eval/wosac_interactive_metrics": wosac_metrics["interactive_metrics"], + "eval/map_based_metrics": wosac_metrics["map_based_metrics"], + "eval/wosac_likelihood_ttc": wosac_metrics["likelihood_time_to_collision"], + "eval/wosac_likelihood_collision": wosac_metrics["likelihood_collision_indication"], + "eval/wosac_likelihood_dist_to_no": wosac_metrics["likelihood_distance_to_nearest_object"], + "eval/num_collisions_sim": wosac_metrics["num_collisions_sim"], + "eval/num_collisions_ref": wosac_metrics["num_collisions_ref"], "eval/wosac_total_num_agents": wosac_metrics["total_num_agents"], }, step=global_step, diff --git a/setup.py b/setup.py index b834f94cf..0ae3a7705 100644 --- a/setup.py +++ b/setup.py @@ -312,6 +312,7 @@ def run(self): "gym==0.23", "gymnasium==0.29.1", "pettingzoo==1.24.1", + "matplotlib", ] if not NO_TRAIN: diff --git a/wosac_evaluation_results.png b/wosac_evaluation_results.png new file mode 100644 index 000000000..bafd2b2f9 Binary files /dev/null and b/wosac_evaluation_results.png differ diff --git a/wosac_realism_score_distributions.png b/wosac_realism_score_distributions.png new file mode 100644 index 000000000..6150cb766 Binary files /dev/null and b/wosac_realism_score_distributions.png differ
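Closing note on infer_human_actions in drive.py above: a tiny self-contained check of the inverse bicycle model it applies, using made-up numbers (10 m/s, a 0.05 rad heading change over dt = 0.1 s, and the default 4.5 m vehicle length).

import math

speed, heading_diff, dt, vehicle_length = 10.0, 0.05, 0.1, 4.5

yaw_rate = heading_diff / dt                        # 0.5 rad/s
# Inverse bicycle model with slip angle beta ~= 0: yaw_rate ~= v * tan(delta) / L
tan_steering = (yaw_rate * vehicle_length) / speed  # 0.225
tan_steering = max(-10.0, min(10.0, tan_steering))  # same clamp as infer_human_actions
steering = math.atan(tan_steering)
print(round(steering, 4))  # ~= 0.2213 rad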