diff --git a/src/ddpg.py b/src/ddpg.py index 4ed7b63..50cd892 100644 --- a/src/ddpg.py +++ b/src/ddpg.py @@ -55,11 +55,12 @@ class Args: save_model: bool = False # whether to save model into the `runs/{run_name}` folder # Algorithm specific arguments - net_arch: list[int] = (256, 256, 256) # Structure of the network, default 64,64 + actor_net_arch: list[int] = (64,64) # Structure of actor network, default 64,64 + critic_net_arch: list[int] = (64,64) # Structure of critic network, default 64,64 learning_rate: float = 1e-3 # learning rate of optimizer buffer_size: int = int(1e6) # replay memory buffer size gamma: float = 0.99 # discount factor gamma - tau: float = 0.005 # target smoothing coefficient (default: 0.005) + tau: float = 0.005 # target smoothing coefficient (default: 0.005) batch_size: int = 256 # batch size of sample from the reply memory exploration_noise: float = 0.1 # scale of exploration noise # learning_starts: int = 0 # timestep to start learning @@ -123,13 +124,12 @@ class QNetwork(nn.Module): def __init__(self, env): super().__init__() valid_networks = [(64, 64), (256,256), (256,256,256), [64,64], [256,256], [256,256,256]] - if args.net_arch in valid_networks: - arch = np.array(args.net_arch) + if args.critic_net_arch in valid_networks: + arch = np.array(args.critic_net_arch) else: # invalid architecture print("Exiting: incorrect network architecture. example: ") print("--net_arch 256 256 256") exit() - self.dims = len(arch) if self.dims == 2: # 64,64 or 256,256 self.fc1 = nn.Linear(np.array(env.single_observation_space.shape).prod() + np.prod(env.single_action_space.shape), \ @@ -163,7 +163,13 @@ def forward(self, x, a): class Actor(nn.Module): def __init__(self, env): super().__init__() - arch = np.array(args.net_arch) + valid_networks = [(64, 64), (256,256), (256,256,256), [64,64], [256,256], [256,256,256]] + if args.actor_net_arch in valid_networks: + arch = np.array(args.actor_net_arch) + else: # invalid architecture + print("Exiting: incorrect network architecture. example: ") + print("--net_arch 256 256 256") + exit() self.dims = len(arch) if self.dims == 2: # 64,64 or 256,256 self.fc1 = nn.Linear(np.array(env.single_observation_space.shape).prod(), arch[0]) @@ -188,13 +194,13 @@ def forward(self, x): x = F.relu(self.fc1(x)) x = F.relu(self.fc2(x)) x = torch.tanh(self.fc_mu(x)) - return x * self.action_scale + self.action_bias + return x * self.action_scale + self.action_bias, None elif self.dims == 3: x = F.relu(self.fc1(x)) x = F.relu(self.fc2(x)) x = F.relu(self.fc3(x)) x = torch.tanh(self.fc_mu(x)) - return x * self.action_scale + self.action_bias + return x * self.action_scale + self.action_bias, None if __name__ == "__main__": @@ -289,7 +295,7 @@ def forward(self, x): actions = np.array([envs.single_action_space.sample() for _ in range(envs.num_envs)]) else: with torch.no_grad(): - actions = actor(torch.Tensor(obs).to(device)) + actions, _ = actor(torch.Tensor(obs).to(device)) actions += torch.normal(0, actor.action_scale * args.exploration_noise) actions = actions.cpu().numpy().clip(envs.single_action_space.low, envs.single_action_space.high) diff --git a/src/evaluator.py b/src/evaluator.py index 4a10510..08099ac 100644 --- a/src/evaluator.py +++ b/src/evaluator.py @@ -71,7 +71,7 @@ def _evaluate(self): while not done: # ALGO LOGIC: put action logic here with torch.no_grad(): - actions = self.actor(torch.Tensor(obs).to(self.device)) + actions, _ = self.actor(torch.Tensor(obs).to(self.device)) actions = actions.cpu().numpy().clip(self.eval_env.action_space.low, self.eval_env.action_space.high) diff --git a/src/plotting/utils.py b/src/plotting/utils.py index 223bb2c..2a34aa9 100644 --- a/src/plotting/utils.py +++ b/src/plotting/utils.py @@ -142,7 +142,7 @@ def get_paths(results_dir, filename='evaluations.npz'): paths.append(f'{results_dir}/{subdir}/{filename}') return paths - +# TODO: Add addit. arg to choose timesteps or updates def get_data(results_dir, field_name='returns', filename='evaluations.npz'): try: @@ -164,6 +164,6 @@ def get_data(results_dir, field_name='returns', filename='evaluations.npz'): avg_vals = vals results.append(avg_vals) - timesteps = data['timesteps'] + timesteps = data['timesteps'] #### return timesteps, np.array(results) diff --git a/src/sac.py b/src/sac.py index 0c3d97d..c5aecbd 100644 --- a/src/sac.py +++ b/src/sac.py @@ -23,7 +23,8 @@ class Args: # wandb tracking exp_name: str = os.path.basename(__file__)[: -len(".py")] # name of this experiment - seed: int = 1 + seed: Optional[int] = None # seed of the experiment + # seed: int = 1 torch_deterministic: bool = True # if toggled, torch.backends.cudnn.deterministic=False cuda: bool = True track: bool = False # tracked with Weights and Biases @@ -60,33 +61,47 @@ class Args: save_rootdir: str = "results" # top-level directory where results will be saved save_subdir: Optional[str] = None # lower level directories save_dir: str = field(init=False) # the lower-level directories + save_model: bool = False # whether to save model into the `runs/{run_name}` folder + eval_freq: Optional[int] = None # num of timesteps between policy evals + n_eval_episodes: int = 80 # num of eval episodes -def __post_init__(self): - if self.save_subdir == None: - self.save_dir = f"{self.save_rootdir}/{self.env_id}/ddpg" - else: - self.save_dir = f"{self.save_rootdir}/{self.env_id}/ddpg/{self.save_subdir}" - if self.run_id is None: - self.run_id = get_latest_run_id(save_dir=self.save_dir) + 1 - self.save_dir += f"/run_{self.run_id}" - if self.seed is None: - self.seed = self.run_id - else: - self.seed = np.random.randint(2 ** 32 - 1) + def __post_init__(self): + if self.eval_freq == None: + # 20 evals per training run unless specified otherwise. + self.eval_freq = self.total_timesteps/20 - # dump training config to save dir - os.makedirs(self.save_dir, exist_ok=True) - with open(os.path.join(self.save_dir, "config.yml"), "w") as f: - yaml.dump(self, f, sort_keys=True) + if self.save_subdir == None: + self.save_dir = f"{self.save_rootdir}/{self.env_id}/sac" + else: + self.save_dir = f"{self.save_rootdir}/{self.env_id}/sac/{self.save_subdir}" + if self.run_id is None: + self.run_id = get_latest_run_id(save_dir=self.save_dir) + 1 + self.save_dir += f"/run_{self.run_id}" + if self.seed is None: + self.seed = self.run_id + else: + self.seed = np.random.randint(2 ** 32 - 1) + print("self.save_dir = "+self.save_dir) -def make_env(env_id, seed, idx, capture_video, run_name): + # dump training config to save dir + os.makedirs(self.save_dir, exist_ok=True) + with open(os.path.join(self.save_dir, "config.yml"), "w") as f: + yaml.dump(self, f, sort_keys=True) + + +## added env_kwargs from ddpg.py +def make_env(env_id, env_kwargs, seed, idx, capture_video, run_name): def thunk(): if capture_video and idx == 0: - env = gym.make(env_id, render_mode="rgb_array") + env = gym.make(env_id, render_mode="rgb_array", **env_kwargs) env = gym.wrappers.RecordVideo(env, f"videos/{run_name}") else: - env = gym.make(env_id) + env = gym.make(env_id, **env_kwargs) + + # Flatten Dict obs so we don't need to handle them a special case in DA + if isinstance(env.observation_space, gym.spaces.Dict): + env = gym.wrappers.FlattenObservation(env) env = gym.wrappers.RecordEpisodeStatistics(env) env.action_space.seed(seed) return env @@ -190,10 +205,11 @@ def get_action(self, x): torch.manual_seed(args.seed) torch.backends.cudnn.deterministic = args.torch_deterministic - device = torch.device("cuda" if torch.cuda.is_available() and args.cuda else "cpu") + # device = torch.device("cuda" if torch.cuda.is_available() and args.cuda else "cpu") + device = torch.device("cpu") # env setup - envs = gym.vector.SyncVectorEnv([make_env(args.env_id, args.seed, 0, args.capture_video, run_name)]) + envs = gym.vector.SyncVectorEnv([make_env(args.env_id, args.env_kwargs, args.seed, 0, args.capture_video, run_name)]) assert isinstance(envs.single_action_space, gym.spaces.Box), "only continuous action space is supported" max_action = float(envs.single_action_space.high[0]) @@ -226,11 +242,11 @@ def get_action(self, x): handle_timeout_termination=False, ) - # eval_env = gym.vector.SyncVectorEnv([make_env(args.env_id, 0, 0, False, run_name)]) - # evaluator = Evaluator(actor, eval_env, args.save_dir, n_eval_episodes=args.n_eval_episodes) + eval_env = gym.vector.SyncVectorEnv([make_env(args.env_id, args.env_kwargs, 0, 0, False, run_name)]) + evaluator = Evaluator(actor, eval_env, args.save_dir, n_eval_episodes=args.n_eval_episodes) - start_time = time.time() num_updates = 0 + start_time = time.time() # TRY NOT TO MODIFY: start the game obs, _ = envs.reset(seed=args.seed) @@ -239,9 +255,10 @@ def get_action(self, x): if global_step < args.learning_starts: actions = np.array([envs.single_action_space.sample() for _ in range(envs.num_envs)]) else: - actions, _, _ = actor.get_action(torch.Tensor(obs).to(device)) + actions, _, _ = actor.get_action(torch.Tensor(obs).to(device)) actions = actions.detach().cpu().numpy() + # TRY NOT TO MODIFY: execute the game and log data. next_obs, rewards, terminations, truncations, infos = envs.step(actions) @@ -329,8 +346,15 @@ def get_action(self, x): if args.autotune: writer.add_scalar("losses/alpha_loss", alpha_loss.item(), global_step) - # if global_step % args.eval_freq == 0: - # evaluator.evaluate(global_step, num_updates=num_updates) + if global_step % args.eval_freq == 0: + evaluator.evaluate(global_step, num_updates=num_updates) + + + if args.save_model: + model_path = f"{args.save_dir}/model" + torch.save((actor.state_dict(), qf1.state_dict()), model_path) + print(f"model saved to {model_path}") + envs.close() writer.close() \ No newline at end of file diff --git a/src/td3.py b/src/td3.py index 3ec0cdc..6277515 100644 --- a/src/td3.py +++ b/src/td3.py @@ -2,7 +2,7 @@ import os import random import time -from dataclasses import dataclass +from dataclasses import dataclass, field import gymnasium as gym import numpy as np @@ -14,6 +14,10 @@ from stable_baselines3.common.buffers import ReplayBuffer from torch.utils.tensorboard import SummaryWriter +from typing import Optional, Union +from src.utils import get_latest_run_id +from src.evaluator import Evaluator +import yaml @dataclass class Args: @@ -23,13 +27,26 @@ class Args: cuda: bool = True # cuda will be enabled by default track: bool = False # if toggled, experiment will be tracked with Weights and Biases wandb_project_name: str = "cleanRL" - wandb_entity: str = None # entity (team) of wandb's project + wandb_entity: Optional[str] = None # the entity (team) of wandb's project capture_video: bool = False # capture videos of the agent performances (check out `videos` folder) save_model: bool = False # whether to save model into the `runs/{run_name}` folder upload_model: bool = False # upload the saved model to huggingface hf_entity: str = "" # user or org name of the model repository from the Hugging Face Hub + run_id: Optional[int] = None + save_rootdir: str = "results" # top-level directory where results will be saved + save_subdir: Optional[str] = None # lower level directories + save_dir: str = field(init=False) # the lower-level directories + save_model: bool = False # whether to save model into the `runs/{run_name}` folder + # Algorithm specific arguments + env_kwargs: dict[str, Union[bool, float, str]] = field(default_factory=dict) + """ + usage: --env_kwargs arg1 val1 arg2 val2 arg3 val3 + + To make PointMaze tasks use a sparse reward function: + --env_kwargs continuing_task False + """ env_id: str = "Hopper-v4" # the id of the environment total_timesteps: int = 1000000 # total timesteps of the experiments learning_rate: float = 3e-4 # learning rate of the optimizer @@ -42,22 +59,52 @@ class Args: learning_starts: int = 25e3 # timestep to start learning policy_frequency: int = 2 # the frequency of training policy (delayed noise_clip: float = 0.5 # noise clip parameter of the Target Policy Smoothing Regularization + eval_freq: Optional[int] = None # num of timesteps between policy evals + n_eval_episodes: int = 80 # num of eval episodes + + + def __post_init__(self): + if self.eval_freq == None: + # 20 evals per training run unless specified otherwise. + self.eval_freq = self.total_timesteps/20 + + if self.save_subdir == None: + self.save_dir = f"{self.save_rootdir}/{self.env_id}/td3" + else: + self.save_dir = f"{self.save_rootdir}/{self.env_id}/td3/{self.save_subdir}" + if self.run_id is None: + self.run_id = get_latest_run_id(save_dir=self.save_dir) + 1 + self.save_dir += f"/run_{self.run_id}" + if self.seed is None: + self.seed = self.run_id + else: + self.seed = np.random.randint(2 ** 32 - 1) + + print("self.save_dir = "+self.save_dir) + # dump training config to save dir + os.makedirs(self.save_dir, exist_ok=True) + with open(os.path.join(self.save_dir, "config.yml"), "w") as f: + yaml.dump(self, f, sort_keys=True) -def make_env(env_id, seed, idx, capture_video, run_name): + +def make_env(env_id, env_kwargs, seed, idx, capture_video, run_name): def thunk(): if capture_video and idx == 0: - env = gym.make(env_id, render_mode="rgb_array") + env = gym.make(env_id, render_mode="rgb_array", **env_kwargs) env = gym.wrappers.RecordVideo(env, f"videos/{run_name}") else: - env = gym.make(env_id) + env = gym.make(env_id, **env_kwargs) + + # Flatten Dict obs so we don't need to handle them a special case in DA + if isinstance(env.observation_space, gym.spaces.Dict): + env = gym.wrappers.FlattenObservation(env) env = gym.wrappers.RecordEpisodeStatistics(env) env.action_space.seed(seed) return env return thunk - # ALGO LOGIC: initialize agent here: class QNetwork(nn.Module): def __init__(self, env): @@ -92,7 +139,7 @@ def forward(self, x): x = F.relu(self.fc1(x)) x = F.relu(self.fc2(x)) x = torch.tanh(self.fc_mu(x)) - return x * self.action_scale + self.action_bias + return x * self.action_scale + self.action_bias, None if __name__ == "__main__": @@ -131,10 +178,11 @@ def forward(self, x): torch.manual_seed(args.seed) torch.backends.cudnn.deterministic = args.torch_deterministic - device = torch.device("cuda" if torch.cuda.is_available() and args.cuda else "cpu") + # device = torch.device("cuda" if torch.cuda.is_available() and args.cuda else "cpu") + device = torch.device("cpu") # env setup - envs = gym.vector.SyncVectorEnv([make_env(args.env_id, args.seed, 0, args.capture_video, run_name)]) + envs = gym.vector.SyncVectorEnv([make_env(args.env_id, args.env_kwargs, args.seed, 0, args.capture_video, run_name)]) assert isinstance(envs.single_action_space, gym.spaces.Box), "only continuous action space is supported" actor = Actor(envs).to(device) @@ -157,6 +205,11 @@ def forward(self, x): device, handle_timeout_termination=False, ) + + eval_env = gym.vector.SyncVectorEnv([make_env(args.env_id, args.env_kwargs, 0, 0, False, run_name)]) + evaluator = Evaluator(actor, eval_env, args.save_dir, n_eval_episodes=args.n_eval_episodes) + + num_updates = 0 start_time = time.time() # TRY NOT TO MODIFY: start the game @@ -167,7 +220,7 @@ def forward(self, x): actions = np.array([envs.single_action_space.sample() for _ in range(envs.num_envs)]) else: with torch.no_grad(): - actions = actor(torch.Tensor(obs).to(device)) + actions, _ = actor(torch.Tensor(obs).to(device)) actions += torch.normal(0, actor.action_scale * args.exploration_noise) actions = actions.cpu().numpy().clip(envs.single_action_space.low, envs.single_action_space.high) @@ -177,7 +230,7 @@ def forward(self, x): # TRY NOT TO MODIFY: record rewards for plotting purposes if "final_info" in infos: for info in infos["final_info"]: - print(f"global_step={global_step}, episodic_return={info['episode']['r']}") + # print(f"global_step={global_step}, episodic_return={info['episode']['r']}") writer.add_scalar("charts/episodic_return", info["episode"]["r"], global_step) writer.add_scalar("charts/episodic_length", info["episode"]["l"], global_step) break @@ -240,9 +293,13 @@ def forward(self, x): writer.add_scalar("losses/qf2_loss", qf2_loss.item(), global_step) writer.add_scalar("losses/qf_loss", qf_loss.item() / 2.0, global_step) writer.add_scalar("losses/actor_loss", actor_loss.item(), global_step) - print("SPS:", int(global_step / (time.time() - start_time))) + # print("SPS:", int(global_step / (time.time() - start_time))) writer.add_scalar("charts/SPS", int(global_step / (time.time() - start_time)), global_step) + if global_step % args.eval_freq == 0: + evaluator.evaluate(global_step, num_updates=num_updates) + + if args.save_model: model_path = f"runs/{run_name}/{args.exp_name}.cleanrl_model" torch.save((actor.state_dict(), qf1.state_dict(), qf2.state_dict()), model_path) @@ -264,7 +321,6 @@ def forward(self, x): if args.upload_model: from cleanrl_utils.huggingface import push_to_hub - repo_name = f"{args.env_id}-{args.exp_name}-seed{args.seed}" repo_id = f"{args.hf_entity}/{repo_name}" if args.hf_entity else repo_name push_to_hub(args, episodic_returns, repo_id, "TD3", f"runs/{run_name}", f"videos/{run_name}-eval")