39 changes: 38 additions & 1 deletion pufferlib/pufferl.py
@@ -1324,12 +1324,45 @@ def ensure_drive_binary():


def autotune(args=None, env_name=None, vecenv=None, policy=None):
args = args or load_config(env_name)
package = args["package"]
module_name = "pufferlib.ocean" if package == "ocean" else f"pufferlib.environments.{package}"
env_module = importlib.import_module(module_name)
env_name = args["env_name"]
make_env = env_module.env_creator(env_name)
pufferlib.vector.autotune(make_env, batch_size=args["train"]["env_batch_size"])

# For multi-agent envs, convert train.batch_size (agent-steps) to orchestrator env count
# For single-agent envs (num_agents == 1), the division leaves the value unchanged
num_agents_per_env = args["env"].get("num_agents", 1)
train_batch_size = args["train"]["batch_size"]
orchestrator_batch_size = train_batch_size // num_agents_per_env

# max_envs must be at least as large as the batch size
max_envs = args.get("max_envs")
if max_envs is None:
# Default to 2x the batch size to allow for testing different configurations
max_envs = orchestrator_batch_size * 2
elif max_envs < orchestrator_batch_size:
raise ValueError(
f"max_envs ({max_envs}) must be >= orchestrator_batch_size ({orchestrator_batch_size}). "
f"Either increase --max-envs or reduce train.batch_size in the config."
)

print(f"Autotune configuration:")
print(f" Training batch size: {train_batch_size} agent-steps")
print(f" Agents per environment: {num_agents_per_env}")
print(f" Orchestrator batch size: {orchestrator_batch_size} environments")
print(f" Max environments to test: {max_envs}")
print()

pufferlib.vector.autotune(
lambda: make_env(**args["env"]),
batch_size=orchestrator_batch_size,
max_env_ram_gb=args.get("max_env_ram_gb"),
max_batch_vram_gb=args.get("max_batch_vram_gb"),
max_envs=max_envs,
time_per_test=args.get("autotune_time", 5),
)


def load_env(env_name, args):
@@ -1410,6 +1443,10 @@ def load_config(env_name, config_dir=None):
parser.add_argument("--neptune-project", type=str, default="ablations")
parser.add_argument("--local-rank", type=int, default=0, help="Used by torchrun for DDP")
parser.add_argument("--tag", type=str, default=None, help="Tag for experiment")
parser.add_argument("--max-env-ram-gb", type=float, default=None, help="Max RAM (GB) for autotune (overrides auto-detection)")
parser.add_argument("--max-batch-vram-gb", type=float, default=None, help="Max VRAM (GB) for autotune (overrides auto-detection)")
parser.add_argument("--max-envs", type=int, default=None, help="Max environments for autotune (default: 2x batch size)")
parser.add_argument("--autotune-time", type=int, default=5, help="Time per test (seconds) for autotune")
args = parser.parse_known_args()[0]

if config_dir is None:
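A note on the pufferl.py changes above: to make the agent-steps to environments conversion concrete, the following is a minimal standalone sketch of the arithmetic the new autotune() wrapper performs before calling pufferlib.vector.autotune. The example values (a 4096 agent-step training batch, 8 agents per environment) are hypothetical and chosen only for illustration; the real values come from train.batch_size, env.num_agents, and the new --max-envs flag.

# Hypothetical config values for illustration; real ones come from load_config().
train_batch_size = 4096        # train.batch_size, counted in agent-steps
num_agents_per_env = 8         # env.num_agents for a multi-agent environment

# Agent-steps -> number of environments the orchestrator batches together.
orchestrator_batch_size = train_batch_size // num_agents_per_env   # 512 envs

# Default cap when --max-envs is not passed: twice the orchestrator batch size.
max_envs = orchestrator_batch_size * 2                              # 1024 envs

For a single-agent environment (num_agents = 1) the division is a no-op, so the orchestrator batch size equals train.batch_size.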
41 changes: 33 additions & 8 deletions pufferlib/vector.py
@@ -795,10 +795,10 @@ def check_envs(envs, driver):
def autotune(
env_creator,
batch_size,
max_envs=194,
max_envs=None,
model_forward_s=0.0,
max_env_ram_gb=32,
max_batch_vram_gb=0.05,
max_env_ram_gb=None,
max_batch_vram_gb=None,
time_per_test=5,
):
"""Determine the optimal vectorization parameters for your system"""
@@ -807,6 +807,31 @@
if batch_size is None:
raise ValueError("batch_size must not be None")

# Auto-detect hardware limits if not specified
if max_env_ram_gb is None:
# Use 80% of total system RAM to leave headroom for the OS and other processes
total_ram_gb = psutil.virtual_memory().total / 1e9
max_env_ram_gb = total_ram_gb * 0.8
print(f"Auto-detected max RAM: {max_env_ram_gb:.2f} GB (80% of {total_ram_gb:.2f} GB total)")

if max_batch_vram_gb is None:
try:
import torch

if torch.cuda.is_available():
# Use 80% of GPU VRAM to leave room for model and gradients
total_vram_gb = torch.cuda.get_device_properties(0).total_memory / 1e9
max_batch_vram_gb = total_vram_gb * 0.8
print(f"Auto-detected max VRAM: {max_batch_vram_gb:.2f} GB (80% of {total_vram_gb:.2f} GB total)")
else:
# No GPU, use conservative default
max_batch_vram_gb = 0.05
print("No GPU detected, using default max_batch_vram_gb=0.05 GB")
except ImportError:
# torch not available, use conservative default
max_batch_vram_gb = 0.05
print("PyTorch not available, using default max_batch_vram_gb=0.05 GB")

if max_envs is None:
# No explicit cap was given; default to twice the batch size so the search can still vary num_envs
max_envs = batch_size * 2

if max_envs < batch_size:
raise ValueError(f"max_envs ({max_envs}) must be >= batch_size ({batch_size})")

@@ -829,7 +854,7 @@
while time.time() - start < time_per_test:
idle_ram = max(idle_ram, psutil.Process().memory_info().rss)
s = time.time()
if env.done:
if hasattr(env, "done") and env.done:
env.reset()
reset_times.append(time.time() - s)
else:

env.close()
sum_time = sum(step_times) + sum(reset_times)
reset_percent = 100 * sum(reset_times) / sum_time
sps = steps * num_agents / sum_time
step_variance = 100 * np.std(step_times) / np.mean(step_times)
reset_mean = np.mean(reset_times)
reset_percent = 100 * sum(reset_times) / sum_time if sum_time > 0 else 0
sps = steps * num_agents / sum_time if sum_time > 0 else 0
step_variance = 100 * np.std(step_times) / np.mean(step_times) if len(step_times) > 0 else 0
reset_mean = np.mean(reset_times) if len(reset_times) > 0 else 0
ram_usage = max(1, (idle_ram - load_ram)) / 1e9

obs_size_gb = np.prod(obs_space.shape) * np.dtype(obs_space.dtype).itemsize * num_agents / 1e9
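To round out the vector.py changes, here is a hedged sketch of a direct call to the updated pufferlib.vector.autotune that leaves the hardware limits unset so the auto-detection above takes over. make_my_env and the "pong" environment name are placeholders for illustration, not part of this PR; any zero-argument callable that builds an environment will do.

import pufferlib.ocean
import pufferlib.vector

def make_my_env():
    # Placeholder creator; pufferlib.ocean.env_creator(name) returns a factory
    # that builds the named environment when called.
    return pufferlib.ocean.env_creator("pong")()

# max_env_ram_gb and max_batch_vram_gb are omitted, so autotune picks 80% of
# system RAM and, when CUDA is available, 80% of GPU 0's VRAM. max_envs is
# also left unset and falls back to twice the batch size.
pufferlib.vector.autotune(
    make_my_env,
    batch_size=64,
    time_per_test=5,
)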