diff --git a/pufferlib/pufferl.py b/pufferlib/pufferl.py
index 149768253..27a95daf1 100644
--- a/pufferlib/pufferl.py
+++ b/pufferlib/pufferl.py
@@ -1324,12 +1324,45 @@ def ensure_drive_binary():


 def autotune(args=None, env_name=None, vecenv=None, policy=None):
+    args = args or load_config(env_name)
     package = args["package"]
     module_name = "pufferlib.ocean" if package == "ocean" else f"pufferlib.environments.{package}"
     env_module = importlib.import_module(module_name)
     env_name = args["env_name"]
     make_env = env_module.env_creator(env_name)
-    pufferlib.vector.autotune(make_env, batch_size=args["train"]["env_batch_size"])
+
+    # For multi-agent envs, convert train.batch_size (agent-steps) to orchestrator env count
+    # For single-agent envs, this division results in the same value
+    num_agents_per_env = args["env"].get("num_agents", 1)
+    train_batch_size = args["train"]["batch_size"]
+    orchestrator_batch_size = train_batch_size // num_agents_per_env
+
+    # max_envs must be at least as large as the batch size
+    max_envs = args.get("max_envs")
+    if max_envs is None:
+        # Default to 2x the batch size to allow for testing different configurations
+        max_envs = orchestrator_batch_size * 2
+    elif max_envs < orchestrator_batch_size:
+        raise ValueError(
+            f"max_envs ({max_envs}) must be >= orchestrator_batch_size ({orchestrator_batch_size}). "
+            f"Either increase --max-envs or reduce train.batch_size in the config."
+        )
+
+    print(f"Autotune configuration:")
+    print(f"  Training batch size: {train_batch_size} agent-steps")
+    print(f"  Agents per environment: {num_agents_per_env}")
+    print(f"  Orchestrator batch size: {orchestrator_batch_size} environments")
+    print(f"  Max environments to test: {max_envs}")
+    print()
+
+    pufferlib.vector.autotune(
+        lambda: make_env(**args["env"]),
+        batch_size=orchestrator_batch_size,
+        max_env_ram_gb=args.get("max_env_ram_gb"),
+        max_batch_vram_gb=args.get("max_batch_vram_gb"),
+        max_envs=max_envs,
+        time_per_test=args.get("autotune_time", 5),
+    )


 def load_env(env_name, args):
@@ -1410,6 +1443,10 @@ def load_config(env_name, config_dir=None):
     parser.add_argument("--neptune-project", type=str, default="ablations")
     parser.add_argument("--local-rank", type=int, default=0, help="Used by torchrun for DDP")
     parser.add_argument("--tag", type=str, default=None, help="Tag for experiment")
+    parser.add_argument("--max-env-ram-gb", type=float, default=None, help="Max RAM (GB) for autotune (overrides auto-detection)")
+    parser.add_argument("--max-batch-vram-gb", type=float, default=None, help="Max VRAM (GB) for autotune (overrides auto-detection)")
+    parser.add_argument("--max-envs", type=int, default=None, help="Max environments for autotune (default: 2x batch size)")
+    parser.add_argument("--autotune-time", type=int, default=5, help="Time per test (seconds) for autotune")
     args = parser.parse_known_args()[0]

     if config_dir is None:
diff --git a/pufferlib/vector.py b/pufferlib/vector.py
index bf5dc7460..c014ac965 100644
--- a/pufferlib/vector.py
+++ b/pufferlib/vector.py
@@ -795,10 +795,10 @@ def check_envs(envs, driver):
 def autotune(
     env_creator,
     batch_size,
-    max_envs=194,
+    max_envs=None,
     model_forward_s=0.0,
-    max_env_ram_gb=32,
-    max_batch_vram_gb=0.05,
+    max_env_ram_gb=None,
+    max_batch_vram_gb=None,
     time_per_test=5,
 ):
     """Determine the optimal vectorization parameters for your system"""
@@ -807,6 +807,31 @@
     if batch_size is None:
         raise ValueError("batch_size must not be None")

+    # Auto-detect hardware limits if not specified
+    if max_env_ram_gb is None:
+        # Use 80% of available system RAM to leave room for OS and other processes
+        total_ram_gb = psutil.virtual_memory().total / 1e9
+        max_env_ram_gb = total_ram_gb * 0.8
+        print(f"Auto-detected max RAM: {max_env_ram_gb:.2f} GB (80% of {total_ram_gb:.2f} GB total)")
+
+    if max_batch_vram_gb is None:
+        try:
+            import torch
+
+            if torch.cuda.is_available():
+                # Use 80% of GPU VRAM to leave room for model and gradients
+                total_vram_gb = torch.cuda.get_device_properties(0).total_memory / 1e9
+                max_batch_vram_gb = total_vram_gb * 0.8
+                print(f"Auto-detected max VRAM: {max_batch_vram_gb:.2f} GB (80% of {total_vram_gb:.2f} GB total)")
+            else:
+                # No GPU, use conservative default
+                max_batch_vram_gb = 0.05
+                print("No GPU detected, using default max_batch_vram_gb=0.05 GB")
+        except ImportError:
+            # torch not available, use conservative default
+            max_batch_vram_gb = 0.05
+            print("PyTorch not available, using default max_batch_vram_gb=0.05 GB")
+
     if max_envs < batch_size:
         raise ValueError("max_envs < min_batch_size")

@@ -829,7 +854,7 @@
         while time.time() - start < time_per_test:
             idle_ram = max(idle_ram, psutil.Process().memory_info().rss)
             s = time.time()
-            if env.done:
+            if hasattr(env, "done") and env.done:
                 env.reset()
                 reset_times.append(time.time() - s)
             else:
@@ -839,10 +864,10 @@
     env.close()

     sum_time = sum(step_times) + sum(reset_times)
-    reset_percent = 100 * sum(reset_times) / sum_time
-    sps = steps * num_agents / sum_time
-    step_variance = 100 * np.std(step_times) / np.mean(step_times)
-    reset_mean = np.mean(reset_times)
+    reset_percent = 100 * sum(reset_times) / sum_time if sum_time > 0 else 0
+    sps = steps * num_agents / sum_time if sum_time > 0 else 0
+    step_variance = 100 * np.std(step_times) / np.mean(step_times) if len(step_times) > 0 else 0
+    reset_mean = np.mean(reset_times) if len(reset_times) > 0 else 0

     ram_usage = max(1, (idle_ram - load_ram)) / 1e9
     obs_size_gb = np.prod(obs_space.shape) * np.dtype(obs_space.dtype).itemsize * num_agents / 1e9
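
Not part of the patch: a minimal usage sketch of the updated pufferlib.vector.autotune()
signature above, assuming an Ocean environment named "breakout" exists and that its
creator is callable with no arguments; batch_size=24 and max_envs=48 are illustrative
values, not taken from this diff.

    import pufferlib.ocean
    import pufferlib.vector

    # Assumed env name; any Ocean env creator callable with no arguments works here.
    make_env = pufferlib.ocean.env_creator("breakout")

    # Leaving max_env_ram_gb and max_batch_vram_gb unset exercises the new
    # auto-detection path (80% of system RAM, 80% of GPU VRAM, or the 0.05 GB
    # fallback when torch or a GPU is unavailable). max_envs is passed explicitly
    # because vector.autotune still compares it against batch_size; only the
    # patched pufferl.py caller resolves the new None default.
    pufferlib.vector.autotune(
        make_env,
        batch_size=24,   # orchestrator batch size, in environments
        max_envs=48,     # must be >= batch_size
        time_per_test=5,
    )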