From e9f0b4108cc9576579af6e4c1484abf3de678290 Mon Sep 17 00:00:00 2001 From: Nora Tseng Date: Thu, 30 May 2024 22:15:55 -0700 Subject: [PATCH 1/5] added sac and td3 (continuous), working on using evaluator --- src/sac.py | 336 +++++++++++++++++++++++++++++++++++++++++++++++++++++ src/td3.py | 273 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 609 insertions(+) create mode 100644 src/sac.py create mode 100644 src/td3.py diff --git a/src/sac.py b/src/sac.py new file mode 100644 index 0000000..0c3d97d --- /dev/null +++ b/src/sac.py @@ -0,0 +1,336 @@ +# docs and experiment results can be found at https://docs.cleanrl.dev/rl-algorithms/sac/#sac_continuous_actionpy +import os +import random +import time +from dataclasses import dataclass, field +from typing import Optional, Union + +import gymnasium as gym +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F +import torch.optim as optim +import tyro +from stable_baselines3.common.buffers import ReplayBuffer +from torch.utils.tensorboard import SummaryWriter + +from src.utils import get_latest_run_id +from src.evaluator import Evaluator +import yaml + +@dataclass +class Args: + # wandb tracking + exp_name: str = os.path.basename(__file__)[: -len(".py")] # name of this experiment + seed: int = 1 + torch_deterministic: bool = True # if toggled, torch.backends.cudnn.deterministic=False + cuda: bool = True + track: bool = False # tracked with Weights and Biases + wandb_project_name: str = "cleanRL" # wandb's project name + wandb_entity: str = None # the entity (team) of wandb's project + capture_video: bool = False # whether to capture videos of the agent performances (check out `videos` folder) + + # Algorithm specific arguments + env_id: str = "Hopper-v4" # environment id of the task + env_kwargs: dict[str, Union[bool, float, str]] = field(default_factory=dict) + """ + usage: --env_kwargs arg1 val1 arg2 val2 arg3 val3 + + To make PointMaze tasks use a sparse reward function: + --env_kwargs continuing_task False + """ + total_timesteps: int = 1000000 # total timesteps of the experiments + buffer_size: int = int(1e6) # the replay memory buffer size + gamma: float = 0.99 # discount factor gamma + tau: float = 0.005 # target smoothing coefficient + batch_size: int = 256 # batch size of sample from the reply memory + learning_starts: int = 5e3 # timestep to start learning + policy_lr: float = 3e-4 # learning rate of the policy network optimizer + q_lr: float = 1e-3 # learning rate of the Q network network optimizer + policy_frequency: int = 2 # frequency of training policy (delayed)""" + target_network_frequency: int = 1 # Denis Yarats' implementation delays this by 2. + """the frequency of updates for the target nerworks""" + noise_clip: float = 0.5 # noise clip parameter of the Target Policy Smoothing Regularization + alpha: float = 0.2 # Entropy regularization coefficient. + autotune: bool = True # automatic tuning of the entropy coefficient + + # added args from ddpg.py + run_id: Optional[int] = None + save_rootdir: str = "results" # top-level directory where results will be saved + save_subdir: Optional[str] = None # lower level directories + save_dir: str = field(init=False) # the lower-level directories + +def __post_init__(self): + if self.save_subdir == None: + self.save_dir = f"{self.save_rootdir}/{self.env_id}/ddpg" + else: + self.save_dir = f"{self.save_rootdir}/{self.env_id}/ddpg/{self.save_subdir}" + if self.run_id is None: + self.run_id = get_latest_run_id(save_dir=self.save_dir) + 1 + self.save_dir += f"/run_{self.run_id}" + if self.seed is None: + self.seed = self.run_id + else: + self.seed = np.random.randint(2 ** 32 - 1) + + # dump training config to save dir + os.makedirs(self.save_dir, exist_ok=True) + with open(os.path.join(self.save_dir, "config.yml"), "w") as f: + yaml.dump(self, f, sort_keys=True) + + +def make_env(env_id, seed, idx, capture_video, run_name): + def thunk(): + if capture_video and idx == 0: + env = gym.make(env_id, render_mode="rgb_array") + env = gym.wrappers.RecordVideo(env, f"videos/{run_name}") + else: + env = gym.make(env_id) + env = gym.wrappers.RecordEpisodeStatistics(env) + env.action_space.seed(seed) + return env + + return thunk + + +# ALGO LOGIC: initialize agent here: +class SoftQNetwork(nn.Module): + def __init__(self, env): + super().__init__() + self.fc1 = nn.Linear(np.array(env.single_observation_space.shape).prod() + np.prod(env.single_action_space.shape), 256) + self.fc2 = nn.Linear(256, 256) + self.fc3 = nn.Linear(256, 1) + + def forward(self, x, a): + x = torch.cat([x, a], 1) + x = F.relu(self.fc1(x)) + x = F.relu(self.fc2(x)) + x = self.fc3(x) + return x + + +LOG_STD_MAX = 2 +LOG_STD_MIN = -5 + + +class Actor(nn.Module): + def __init__(self, env): + super().__init__() + self.fc1 = nn.Linear(np.array(env.single_observation_space.shape).prod(), 256) + self.fc2 = nn.Linear(256, 256) + self.fc_mean = nn.Linear(256, np.prod(env.single_action_space.shape)) + self.fc_logstd = nn.Linear(256, np.prod(env.single_action_space.shape)) + # action rescaling + self.register_buffer( + "action_scale", torch.tensor((env.action_space.high - env.action_space.low) / 2.0, dtype=torch.float32) + ) + self.register_buffer( + "action_bias", torch.tensor((env.action_space.high + env.action_space.low) / 2.0, dtype=torch.float32) + ) + + def forward(self, x): + x = F.relu(self.fc1(x)) + x = F.relu(self.fc2(x)) + mean = self.fc_mean(x) + log_std = self.fc_logstd(x) + log_std = torch.tanh(log_std) + log_std = LOG_STD_MIN + 0.5 * (LOG_STD_MAX - LOG_STD_MIN) * (log_std + 1) # From SpinUp / Denis Yarats + + return mean, log_std + + def get_action(self, x): + mean, log_std = self(x) + std = log_std.exp() + normal = torch.distributions.Normal(mean, std) + x_t = normal.rsample() # for reparameterization trick (mean + std * N(0,1)) + y_t = torch.tanh(x_t) + action = y_t * self.action_scale + self.action_bias + log_prob = normal.log_prob(x_t) + # Enforcing Action Bound + log_prob -= torch.log(self.action_scale * (1 - y_t.pow(2)) + 1e-6) + log_prob = log_prob.sum(1, keepdim=True) + mean = torch.tanh(mean) * self.action_scale + self.action_bias + return action, log_prob, mean + + +if __name__ == "__main__": + import stable_baselines3 as sb3 + + if sb3.__version__ < "2.0": + raise ValueError( + """Ongoing migration: run the following command to install the new dependencies: +poetry run pip install "stable_baselines3==2.0.0a1" +""" + ) + + args = tyro.cli(Args) + run_name = f"{args.env_id}__{args.exp_name}__{args.seed}__{int(time.time())}" + if args.track: + import wandb + + wandb.init( + project=args.wandb_project_name, + entity=args.wandb_entity, + sync_tensorboard=True, + config=vars(args), + name=run_name, + monitor_gym=True, + save_code=True, + ) + writer = SummaryWriter(f"runs/{run_name}") + writer.add_text( + "hyperparameters", + "|param|value|\n|-|-|\n%s" % ("\n".join([f"|{key}|{value}|" for key, value in vars(args).items()])), + ) + + # TRY NOT TO MODIFY: seeding + random.seed(args.seed) + np.random.seed(args.seed) + torch.manual_seed(args.seed) + torch.backends.cudnn.deterministic = args.torch_deterministic + + device = torch.device("cuda" if torch.cuda.is_available() and args.cuda else "cpu") + + # env setup + envs = gym.vector.SyncVectorEnv([make_env(args.env_id, args.seed, 0, args.capture_video, run_name)]) + assert isinstance(envs.single_action_space, gym.spaces.Box), "only continuous action space is supported" + + max_action = float(envs.single_action_space.high[0]) + + actor = Actor(envs).to(device) + qf1 = SoftQNetwork(envs).to(device) + qf2 = SoftQNetwork(envs).to(device) + qf1_target = SoftQNetwork(envs).to(device) + qf2_target = SoftQNetwork(envs).to(device) + qf1_target.load_state_dict(qf1.state_dict()) + qf2_target.load_state_dict(qf2.state_dict()) + q_optimizer = optim.Adam(list(qf1.parameters()) + list(qf2.parameters()), lr=args.q_lr) + actor_optimizer = optim.Adam(list(actor.parameters()), lr=args.policy_lr) + + # Automatic entropy tuning + if args.autotune: + target_entropy = -torch.prod(torch.Tensor(envs.single_action_space.shape).to(device)).item() + log_alpha = torch.zeros(1, requires_grad=True, device=device) + alpha = log_alpha.exp().item() + a_optimizer = optim.Adam([log_alpha], lr=args.q_lr) + else: + alpha = args.alpha + + envs.single_observation_space.dtype = np.float32 + rb = ReplayBuffer( + args.buffer_size, + envs.single_observation_space, + envs.single_action_space, + device, + handle_timeout_termination=False, + ) + + # eval_env = gym.vector.SyncVectorEnv([make_env(args.env_id, 0, 0, False, run_name)]) + # evaluator = Evaluator(actor, eval_env, args.save_dir, n_eval_episodes=args.n_eval_episodes) + + start_time = time.time() + num_updates = 0 + + # TRY NOT TO MODIFY: start the game + obs, _ = envs.reset(seed=args.seed) + for global_step in range(args.total_timesteps): + # ALGO LOGIC: put action logic here + if global_step < args.learning_starts: + actions = np.array([envs.single_action_space.sample() for _ in range(envs.num_envs)]) + else: + actions, _, _ = actor.get_action(torch.Tensor(obs).to(device)) + actions = actions.detach().cpu().numpy() + + # TRY NOT TO MODIFY: execute the game and log data. + next_obs, rewards, terminations, truncations, infos = envs.step(actions) + + # TRY NOT TO MODIFY: record rewards for plotting purposes + if "final_info" in infos: + for info in infos["final_info"]: + # print(f"global_step={global_step}, episodic_return={info['episode']['r']}") + writer.add_scalar("charts/episodic_return", info["episode"]["r"], global_step) + writer.add_scalar("charts/episodic_length", info["episode"]["l"], global_step) + break + + # TRY NOT TO MODIFY: save data to reply buffer; handle `final_observation` + real_next_obs = next_obs.copy() + for idx, trunc in enumerate(truncations): + if trunc: + real_next_obs[idx] = infos["final_observation"][idx] + rb.add(obs, real_next_obs, actions, rewards, terminations, infos) + + # TRY NOT TO MODIFY: CRUCIAL step easy to overlook + obs = next_obs + + # ALGO LOGIC: training. + if global_step > args.learning_starts: + num_updates += 1 + data = rb.sample(args.batch_size) + with torch.no_grad(): + next_state_actions, next_state_log_pi, _ = actor.get_action(data.next_observations) + qf1_next_target = qf1_target(data.next_observations, next_state_actions) + qf2_next_target = qf2_target(data.next_observations, next_state_actions) + min_qf_next_target = torch.min(qf1_next_target, qf2_next_target) - alpha * next_state_log_pi + next_q_value = data.rewards.flatten() + (1 - data.dones.flatten()) * args.gamma * (min_qf_next_target).view(-1) + + qf1_a_values = qf1(data.observations, data.actions).view(-1) + qf2_a_values = qf2(data.observations, data.actions).view(-1) + qf1_loss = F.mse_loss(qf1_a_values, next_q_value) + qf2_loss = F.mse_loss(qf2_a_values, next_q_value) + qf_loss = qf1_loss + qf2_loss + + # optimize the model + q_optimizer.zero_grad() + qf_loss.backward() + q_optimizer.step() + + if global_step % args.policy_frequency == 0: # TD 3 Delayed update support + for _ in range( + args.policy_frequency + ): # compensate for the delay by doing 'actor_update_interval' instead of 1 + pi, log_pi, _ = actor.get_action(data.observations) + qf1_pi = qf1(data.observations, pi) + qf2_pi = qf2(data.observations, pi) + min_qf_pi = torch.min(qf1_pi, qf2_pi) + actor_loss = ((alpha * log_pi) - min_qf_pi).mean() + + actor_optimizer.zero_grad() + actor_loss.backward() + actor_optimizer.step() + + if args.autotune: + with torch.no_grad(): + _, log_pi, _ = actor.get_action(data.observations) + alpha_loss = (-log_alpha.exp() * (log_pi + target_entropy)).mean() + + a_optimizer.zero_grad() + alpha_loss.backward() + a_optimizer.step() + alpha = log_alpha.exp().item() + + # update the target networks + if global_step % args.target_network_frequency == 0: + for param, target_param in zip(qf1.parameters(), qf1_target.parameters()): + target_param.data.copy_(args.tau * param.data + (1 - args.tau) * target_param.data) + for param, target_param in zip(qf2.parameters(), qf2_target.parameters()): + target_param.data.copy_(args.tau * param.data + (1 - args.tau) * target_param.data) + + if global_step % 100 == 0: + writer.add_scalar("losses/qf1_values", qf1_a_values.mean().item(), global_step) + writer.add_scalar("losses/qf2_values", qf2_a_values.mean().item(), global_step) + writer.add_scalar("losses/qf1_loss", qf1_loss.item(), global_step) + writer.add_scalar("losses/qf2_loss", qf2_loss.item(), global_step) + writer.add_scalar("losses/qf_loss", qf_loss.item() / 2.0, global_step) + writer.add_scalar("losses/actor_loss", actor_loss.item(), global_step) + writer.add_scalar("losses/alpha", alpha, global_step) + # print("SPS:", int(global_step / (time.time() - start_time))) + writer.add_scalar("charts/SPS", int(global_step / (time.time() - start_time)), global_step) + if args.autotune: + writer.add_scalar("losses/alpha_loss", alpha_loss.item(), global_step) + + # if global_step % args.eval_freq == 0: + # evaluator.evaluate(global_step, num_updates=num_updates) + + envs.close() + writer.close() \ No newline at end of file diff --git a/src/td3.py b/src/td3.py new file mode 100644 index 0000000..3ec0cdc --- /dev/null +++ b/src/td3.py @@ -0,0 +1,273 @@ +# docs and experiment results can be found at https://docs.cleanrl.dev/rl-algorithms/td3/#td3_continuous_actionpy +import os +import random +import time +from dataclasses import dataclass + +import gymnasium as gym +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F +import torch.optim as optim +import tyro +from stable_baselines3.common.buffers import ReplayBuffer +from torch.utils.tensorboard import SummaryWriter + + +@dataclass +class Args: + exp_name: str = os.path.basename(__file__)[: -len(".py")] # name of this experiment + seed: int = 1 + torch_deterministic: bool = True # if toggled, torch.backends.cudnn.deterministic=False + cuda: bool = True # cuda will be enabled by default + track: bool = False # if toggled, experiment will be tracked with Weights and Biases + wandb_project_name: str = "cleanRL" + wandb_entity: str = None # entity (team) of wandb's project + capture_video: bool = False # capture videos of the agent performances (check out `videos` folder) + save_model: bool = False # whether to save model into the `runs/{run_name}` folder + upload_model: bool = False # upload the saved model to huggingface + hf_entity: str = "" # user or org name of the model repository from the Hugging Face Hub + + # Algorithm specific arguments + env_id: str = "Hopper-v4" # the id of the environment + total_timesteps: int = 1000000 # total timesteps of the experiments + learning_rate: float = 3e-4 # learning rate of the optimizer + buffer_size: int = int(1e6) # replay memory buffer size + gamma: float = 0.99 # the discount factor gamma + tau: float = 0.005 # target smoothing coefficient (default: 0.005) + batch_size: int = 256 # the batch size of sample from the reply memory + policy_noise: float = 0.2 # the scale of policy noise + exploration_noise: float = 0.1 # the scale of exploration noise + learning_starts: int = 25e3 # timestep to start learning + policy_frequency: int = 2 # the frequency of training policy (delayed + noise_clip: float = 0.5 # noise clip parameter of the Target Policy Smoothing Regularization + + +def make_env(env_id, seed, idx, capture_video, run_name): + def thunk(): + if capture_video and idx == 0: + env = gym.make(env_id, render_mode="rgb_array") + env = gym.wrappers.RecordVideo(env, f"videos/{run_name}") + else: + env = gym.make(env_id) + env = gym.wrappers.RecordEpisodeStatistics(env) + env.action_space.seed(seed) + return env + + return thunk + + +# ALGO LOGIC: initialize agent here: +class QNetwork(nn.Module): + def __init__(self, env): + super().__init__() + self.fc1 = nn.Linear(np.array(env.single_observation_space.shape).prod() + np.prod(env.single_action_space.shape), 256) + self.fc2 = nn.Linear(256, 256) + self.fc3 = nn.Linear(256, 1) + + def forward(self, x, a): + x = torch.cat([x, a], 1) + x = F.relu(self.fc1(x)) + x = F.relu(self.fc2(x)) + x = self.fc3(x) + return x + + +class Actor(nn.Module): + def __init__(self, env): + super().__init__() + self.fc1 = nn.Linear(np.array(env.single_observation_space.shape).prod(), 256) + self.fc2 = nn.Linear(256, 256) + self.fc_mu = nn.Linear(256, np.prod(env.single_action_space.shape)) + # action rescaling + self.register_buffer( + "action_scale", torch.tensor((env.action_space.high - env.action_space.low) / 2.0, dtype=torch.float32) + ) + self.register_buffer( + "action_bias", torch.tensor((env.action_space.high + env.action_space.low) / 2.0, dtype=torch.float32) + ) + + def forward(self, x): + x = F.relu(self.fc1(x)) + x = F.relu(self.fc2(x)) + x = torch.tanh(self.fc_mu(x)) + return x * self.action_scale + self.action_bias + + +if __name__ == "__main__": + import stable_baselines3 as sb3 + + if sb3.__version__ < "2.0": + raise ValueError( + """Ongoing migration: run the following command to install the new dependencies: +poetry run pip install "stable_baselines3==2.0.0a1" +""" + ) + + args = tyro.cli(Args) + run_name = f"{args.env_id}__{args.exp_name}__{args.seed}__{int(time.time())}" + if args.track: + import wandb + + wandb.init( + project=args.wandb_project_name, + entity=args.wandb_entity, + sync_tensorboard=True, + config=vars(args), + name=run_name, + monitor_gym=True, + save_code=True, + ) + writer = SummaryWriter(f"runs/{run_name}") + writer.add_text( + "hyperparameters", + "|param|value|\n|-|-|\n%s" % ("\n".join([f"|{key}|{value}|" for key, value in vars(args).items()])), + ) + + # TRY NOT TO MODIFY: seeding + random.seed(args.seed) + np.random.seed(args.seed) + torch.manual_seed(args.seed) + torch.backends.cudnn.deterministic = args.torch_deterministic + + device = torch.device("cuda" if torch.cuda.is_available() and args.cuda else "cpu") + + # env setup + envs = gym.vector.SyncVectorEnv([make_env(args.env_id, args.seed, 0, args.capture_video, run_name)]) + assert isinstance(envs.single_action_space, gym.spaces.Box), "only continuous action space is supported" + + actor = Actor(envs).to(device) + qf1 = QNetwork(envs).to(device) + qf2 = QNetwork(envs).to(device) + qf1_target = QNetwork(envs).to(device) + qf2_target = QNetwork(envs).to(device) + target_actor = Actor(envs).to(device) + target_actor.load_state_dict(actor.state_dict()) + qf1_target.load_state_dict(qf1.state_dict()) + qf2_target.load_state_dict(qf2.state_dict()) + q_optimizer = optim.Adam(list(qf1.parameters()) + list(qf2.parameters()), lr=args.learning_rate) + actor_optimizer = optim.Adam(list(actor.parameters()), lr=args.learning_rate) + + envs.single_observation_space.dtype = np.float32 + rb = ReplayBuffer( + args.buffer_size, + envs.single_observation_space, + envs.single_action_space, + device, + handle_timeout_termination=False, + ) + start_time = time.time() + + # TRY NOT TO MODIFY: start the game + obs, _ = envs.reset(seed=args.seed) + for global_step in range(args.total_timesteps): + # ALGO LOGIC: put action logic here + if global_step < args.learning_starts: + actions = np.array([envs.single_action_space.sample() for _ in range(envs.num_envs)]) + else: + with torch.no_grad(): + actions = actor(torch.Tensor(obs).to(device)) + actions += torch.normal(0, actor.action_scale * args.exploration_noise) + actions = actions.cpu().numpy().clip(envs.single_action_space.low, envs.single_action_space.high) + + # TRY NOT TO MODIFY: execute the game and log data. + next_obs, rewards, terminations, truncations, infos = envs.step(actions) + + # TRY NOT TO MODIFY: record rewards for plotting purposes + if "final_info" in infos: + for info in infos["final_info"]: + print(f"global_step={global_step}, episodic_return={info['episode']['r']}") + writer.add_scalar("charts/episodic_return", info["episode"]["r"], global_step) + writer.add_scalar("charts/episodic_length", info["episode"]["l"], global_step) + break + + # TRY NOT TO MODIFY: save data to reply buffer; handle `final_observation` + real_next_obs = next_obs.copy() + for idx, trunc in enumerate(truncations): + if trunc: + real_next_obs[idx] = infos["final_observation"][idx] + rb.add(obs, real_next_obs, actions, rewards, terminations, infos) + + # TRY NOT TO MODIFY: CRUCIAL step easy to overlook + obs = next_obs + + # ALGO LOGIC: training. + if global_step > args.learning_starts: + data = rb.sample(args.batch_size) + with torch.no_grad(): + clipped_noise = (torch.randn_like(data.actions, device=device) * args.policy_noise).clamp( + -args.noise_clip, args.noise_clip + ) * target_actor.action_scale + + next_state_actions = (target_actor(data.next_observations) + clipped_noise).clamp( + envs.single_action_space.low[0], envs.single_action_space.high[0] + ) + qf1_next_target = qf1_target(data.next_observations, next_state_actions) + qf2_next_target = qf2_target(data.next_observations, next_state_actions) + min_qf_next_target = torch.min(qf1_next_target, qf2_next_target) + next_q_value = data.rewards.flatten() + (1 - data.dones.flatten()) * args.gamma * (min_qf_next_target).view(-1) + + qf1_a_values = qf1(data.observations, data.actions).view(-1) + qf2_a_values = qf2(data.observations, data.actions).view(-1) + qf1_loss = F.mse_loss(qf1_a_values, next_q_value) + qf2_loss = F.mse_loss(qf2_a_values, next_q_value) + qf_loss = qf1_loss + qf2_loss + + # optimize the model + q_optimizer.zero_grad() + qf_loss.backward() + q_optimizer.step() + + if global_step % args.policy_frequency == 0: + actor_loss = -qf1(data.observations, actor(data.observations)).mean() + actor_optimizer.zero_grad() + actor_loss.backward() + actor_optimizer.step() + + # update the target network + for param, target_param in zip(actor.parameters(), target_actor.parameters()): + target_param.data.copy_(args.tau * param.data + (1 - args.tau) * target_param.data) + for param, target_param in zip(qf1.parameters(), qf1_target.parameters()): + target_param.data.copy_(args.tau * param.data + (1 - args.tau) * target_param.data) + for param, target_param in zip(qf2.parameters(), qf2_target.parameters()): + target_param.data.copy_(args.tau * param.data + (1 - args.tau) * target_param.data) + + if global_step % 100 == 0: + writer.add_scalar("losses/qf1_values", qf1_a_values.mean().item(), global_step) + writer.add_scalar("losses/qf2_values", qf2_a_values.mean().item(), global_step) + writer.add_scalar("losses/qf1_loss", qf1_loss.item(), global_step) + writer.add_scalar("losses/qf2_loss", qf2_loss.item(), global_step) + writer.add_scalar("losses/qf_loss", qf_loss.item() / 2.0, global_step) + writer.add_scalar("losses/actor_loss", actor_loss.item(), global_step) + print("SPS:", int(global_step / (time.time() - start_time))) + writer.add_scalar("charts/SPS", int(global_step / (time.time() - start_time)), global_step) + + if args.save_model: + model_path = f"runs/{run_name}/{args.exp_name}.cleanrl_model" + torch.save((actor.state_dict(), qf1.state_dict(), qf2.state_dict()), model_path) + print(f"model saved to {model_path}") + from cleanrl_utils.evals.td3_eval import evaluate + + episodic_returns = evaluate( + model_path, + make_env, + args.env_id, + eval_episodes=10, + run_name=f"{run_name}-eval", + Model=(Actor, QNetwork), + device=device, + exploration_noise=args.exploration_noise, + ) + for idx, episodic_return in enumerate(episodic_returns): + writer.add_scalar("eval/episodic_return", episodic_return, idx) + + if args.upload_model: + from cleanrl_utils.huggingface import push_to_hub + + repo_name = f"{args.env_id}-{args.exp_name}-seed{args.seed}" + repo_id = f"{args.hf_entity}/{repo_name}" if args.hf_entity else repo_name + push_to_hub(args, episodic_returns, repo_id, "TD3", f"runs/{run_name}", f"videos/{run_name}-eval") + + envs.close() + writer.close() \ No newline at end of file From 9438239463360cc989151474171a40b1f6b8489d Mon Sep 17 00:00:00 2001 From: Nora Tseng Date: Wed, 12 Jun 2024 12:51:16 -0700 Subject: [PATCH 2/5] buggy sac --- src/sac.py | 83 ++++++++++++++++++++++++++++++++++++------------------ 1 file changed, 56 insertions(+), 27 deletions(-) diff --git a/src/sac.py b/src/sac.py index 0c3d97d..761264b 100644 --- a/src/sac.py +++ b/src/sac.py @@ -23,7 +23,8 @@ class Args: # wandb tracking exp_name: str = os.path.basename(__file__)[: -len(".py")] # name of this experiment - seed: int = 1 + seed: Optional[int] = None # seed of the experiment + # seed: int = 1 torch_deterministic: bool = True # if toggled, torch.backends.cudnn.deterministic=False cuda: bool = True track: bool = False # tracked with Weights and Biases @@ -60,33 +61,47 @@ class Args: save_rootdir: str = "results" # top-level directory where results will be saved save_subdir: Optional[str] = None # lower level directories save_dir: str = field(init=False) # the lower-level directories + save_model: bool = False # whether to save model into the `runs/{run_name}` folder + eval_freq: Optional[int] = None # num of timesteps between policy evals + n_eval_episodes: int = 80 # num of eval episodes -def __post_init__(self): - if self.save_subdir == None: - self.save_dir = f"{self.save_rootdir}/{self.env_id}/ddpg" - else: - self.save_dir = f"{self.save_rootdir}/{self.env_id}/ddpg/{self.save_subdir}" - if self.run_id is None: - self.run_id = get_latest_run_id(save_dir=self.save_dir) + 1 - self.save_dir += f"/run_{self.run_id}" - if self.seed is None: - self.seed = self.run_id - else: - self.seed = np.random.randint(2 ** 32 - 1) + def __post_init__(self): + if self.eval_freq == None: + # 20 evals per training run unless specified otherwise. + self.eval_freq = self.total_timesteps/20 + + if self.save_subdir == None: + self.save_dir = f"{self.save_rootdir}/{self.env_id}/sac" + else: + self.save_dir = f"{self.save_rootdir}/{self.env_id}/sac/{self.save_subdir}" + if self.run_id is None: + self.run_id = get_latest_run_id(save_dir=self.save_dir) + 1 + self.save_dir += f"/run_{self.run_id}" + if self.seed is None: + self.seed = self.run_id + else: + self.seed = np.random.randint(2 ** 32 - 1) + + print("self.save_dir = "+self.save_dir) - # dump training config to save dir - os.makedirs(self.save_dir, exist_ok=True) - with open(os.path.join(self.save_dir, "config.yml"), "w") as f: - yaml.dump(self, f, sort_keys=True) + # dump training config to save dir + os.makedirs(self.save_dir, exist_ok=True) + with open(os.path.join(self.save_dir, "config.yml"), "w") as f: + yaml.dump(self, f, sort_keys=True) -def make_env(env_id, seed, idx, capture_video, run_name): +## added env_kwargs from ddpg.py +def make_env(env_id, env_kwargs, seed, idx, capture_video, run_name): def thunk(): if capture_video and idx == 0: - env = gym.make(env_id, render_mode="rgb_array") + env = gym.make(env_id, render_mode="rgb_array", **env_kwargs) env = gym.wrappers.RecordVideo(env, f"videos/{run_name}") else: - env = gym.make(env_id) + env = gym.make(env_id, **env_kwargs) + + # Flatten Dict obs so we don't need to handle them a special case in DA + if isinstance(env.observation_space, gym.spaces.Dict): + env = gym.wrappers.FlattenObservation(env) env = gym.wrappers.RecordEpisodeStatistics(env) env.action_space.seed(seed) return env @@ -190,10 +205,11 @@ def get_action(self, x): torch.manual_seed(args.seed) torch.backends.cudnn.deterministic = args.torch_deterministic - device = torch.device("cuda" if torch.cuda.is_available() and args.cuda else "cpu") + # device = torch.device("cuda" if torch.cuda.is_available() and args.cuda else "cpu") + device = torch.device("cpu") # env setup - envs = gym.vector.SyncVectorEnv([make_env(args.env_id, args.seed, 0, args.capture_video, run_name)]) + envs = gym.vector.SyncVectorEnv([make_env(args.env_id, args.env_kwargs, args.seed, 0, args.capture_video, run_name)]) assert isinstance(envs.single_action_space, gym.spaces.Box), "only continuous action space is supported" max_action = float(envs.single_action_space.high[0]) @@ -226,8 +242,8 @@ def get_action(self, x): handle_timeout_termination=False, ) - # eval_env = gym.vector.SyncVectorEnv([make_env(args.env_id, 0, 0, False, run_name)]) - # evaluator = Evaluator(actor, eval_env, args.save_dir, n_eval_episodes=args.n_eval_episodes) + eval_env = gym.vector.SyncVectorEnv([make_env(args.env_id, args.env_kwargs, 0, 0, False, run_name)]) + evaluator = Evaluator(actor, eval_env, args.save_dir, n_eval_episodes=args.n_eval_episodes) start_time = time.time() num_updates = 0 @@ -239,8 +255,14 @@ def get_action(self, x): if global_step < args.learning_starts: actions = np.array([envs.single_action_space.sample() for _ in range(envs.num_envs)]) else: - actions, _, _ = actor.get_action(torch.Tensor(obs).to(device)) + actions, _, _ = actor.get_action(torch.Tensor(obs).to(device)) actions = actions.detach().cpu().numpy() + ###### copied from ddpg + # with torch.no_grad(): + # actions = actor(torch.Tensor(obs).to(device)) + # actions += torch.normal(0, actor.action_scale * args.exploration_noise) + # actions = actions.cpu().numpy().clip(envs.single_action_space.low, envs.single_action_space.high) + # TRY NOT TO MODIFY: execute the game and log data. next_obs, rewards, terminations, truncations, infos = envs.step(actions) @@ -329,8 +351,15 @@ def get_action(self, x): if args.autotune: writer.add_scalar("losses/alpha_loss", alpha_loss.item(), global_step) - # if global_step % args.eval_freq == 0: - # evaluator.evaluate(global_step, num_updates=num_updates) + if global_step % args.eval_freq == 0: + evaluator.evaluate(global_step, num_updates=num_updates) + + + if args.save_model: + model_path = f"{args.save_dir}/model" + torch.save((actor.state_dict(), qf1.state_dict()), model_path) + print(f"model saved to {model_path}") + envs.close() writer.close() \ No newline at end of file From 0e931546921ea0be7c483e91172fd7aef2f59b9b Mon Sep 17 00:00:00 2001 From: Nora Tseng Date: Tue, 18 Jun 2024 16:42:40 -0700 Subject: [PATCH 3/5] sac working, td3 WIP --- src/ddpg.py | 6 +++--- src/evaluator.py | 2 +- src/sac.py | 5 ----- src/td3.py | 48 ++++++++++++++++++++++++++++++++++++++++++++---- 4 files changed, 48 insertions(+), 13 deletions(-) diff --git a/src/ddpg.py b/src/ddpg.py index 4ed7b63..8f87e72 100644 --- a/src/ddpg.py +++ b/src/ddpg.py @@ -188,13 +188,13 @@ def forward(self, x): x = F.relu(self.fc1(x)) x = F.relu(self.fc2(x)) x = torch.tanh(self.fc_mu(x)) - return x * self.action_scale + self.action_bias + return x * self.action_scale + self.action_bias, None elif self.dims == 3: x = F.relu(self.fc1(x)) x = F.relu(self.fc2(x)) x = F.relu(self.fc3(x)) x = torch.tanh(self.fc_mu(x)) - return x * self.action_scale + self.action_bias + return x * self.action_scale + self.action_bias, None if __name__ == "__main__": @@ -289,7 +289,7 @@ def forward(self, x): actions = np.array([envs.single_action_space.sample() for _ in range(envs.num_envs)]) else: with torch.no_grad(): - actions = actor(torch.Tensor(obs).to(device)) + actions, _ = actor(torch.Tensor(obs).to(device)) actions += torch.normal(0, actor.action_scale * args.exploration_noise) actions = actions.cpu().numpy().clip(envs.single_action_space.low, envs.single_action_space.high) diff --git a/src/evaluator.py b/src/evaluator.py index 4a10510..08099ac 100644 --- a/src/evaluator.py +++ b/src/evaluator.py @@ -71,7 +71,7 @@ def _evaluate(self): while not done: # ALGO LOGIC: put action logic here with torch.no_grad(): - actions = self.actor(torch.Tensor(obs).to(self.device)) + actions, _ = self.actor(torch.Tensor(obs).to(self.device)) actions = actions.cpu().numpy().clip(self.eval_env.action_space.low, self.eval_env.action_space.high) diff --git a/src/sac.py b/src/sac.py index 761264b..417bcf9 100644 --- a/src/sac.py +++ b/src/sac.py @@ -257,11 +257,6 @@ def get_action(self, x): else: actions, _, _ = actor.get_action(torch.Tensor(obs).to(device)) actions = actions.detach().cpu().numpy() - ###### copied from ddpg - # with torch.no_grad(): - # actions = actor(torch.Tensor(obs).to(device)) - # actions += torch.normal(0, actor.action_scale * args.exploration_noise) - # actions = actions.cpu().numpy().clip(envs.single_action_space.low, envs.single_action_space.high) # TRY NOT TO MODIFY: execute the game and log data. diff --git a/src/td3.py b/src/td3.py index 3ec0cdc..9a1bb03 100644 --- a/src/td3.py +++ b/src/td3.py @@ -2,7 +2,7 @@ import os import random import time -from dataclasses import dataclass +from dataclasses import dataclass, field import gymnasium as gym import numpy as np @@ -14,6 +14,10 @@ from stable_baselines3.common.buffers import ReplayBuffer from torch.utils.tensorboard import SummaryWriter +from typing import Optional, Union +from src.utils import get_latest_run_id +from src.evaluator import Evaluator +import yaml @dataclass class Args: @@ -23,12 +27,18 @@ class Args: cuda: bool = True # cuda will be enabled by default track: bool = False # if toggled, experiment will be tracked with Weights and Biases wandb_project_name: str = "cleanRL" - wandb_entity: str = None # entity (team) of wandb's project + wandb_entity: Optional[str] = None # the entity (team) of wandb's project capture_video: bool = False # capture videos of the agent performances (check out `videos` folder) save_model: bool = False # whether to save model into the `runs/{run_name}` folder upload_model: bool = False # upload the saved model to huggingface hf_entity: str = "" # user or org name of the model repository from the Hugging Face Hub + run_id: Optional[int] = None + save_rootdir: str = "results" # top-level directory where results will be saved + save_subdir: Optional[str] = None # lower level directories + save_dir: str = field(init=False) # the lower-level directories + save_model: bool = False # whether to save model into the `runs/{run_name}` folder + # Algorithm specific arguments env_id: str = "Hopper-v4" # the id of the environment total_timesteps: int = 1000000 # total timesteps of the experiments @@ -44,6 +54,31 @@ class Args: noise_clip: float = 0.5 # noise clip parameter of the Target Policy Smoothing Regularization + def __post_init__(self): + # if self.eval_freq == None: + # # 20 evals per training run unless specified otherwise. + # self.eval_freq = self.total_timesteps/20 + + if self.save_subdir == None: + self.save_dir = f"{self.save_rootdir}/{self.env_id}/td3" + else: + self.save_dir = f"{self.save_rootdir}/{self.env_id}/td3/{self.save_subdir}" + if self.run_id is None: + self.run_id = get_latest_run_id(save_dir=self.save_dir) + 1 + self.save_dir += f"/run_{self.run_id}" + if self.seed is None: + self.seed = self.run_id + else: + self.seed = np.random.randint(2 ** 32 - 1) + + print("self.save_dir = "+self.save_dir) + + # dump training config to save dir + os.makedirs(self.save_dir, exist_ok=True) + with open(os.path.join(self.save_dir, "config.yml"), "w") as f: + yaml.dump(self, f, sort_keys=True) + + def make_env(env_id, seed, idx, capture_video, run_name): def thunk(): if capture_video and idx == 0: @@ -62,6 +97,9 @@ def thunk(): class QNetwork(nn.Module): def __init__(self, env): super().__init__() + print("QNET env.single_observation_space.shape") + print(env.single_observation_space.shape) + self.fc1 = nn.Linear(np.array(env.single_observation_space.shape).prod() + np.prod(env.single_action_space.shape), 256) self.fc2 = nn.Linear(256, 256) self.fc3 = nn.Linear(256, 1) @@ -77,6 +115,8 @@ def forward(self, x, a): class Actor(nn.Module): def __init__(self, env): super().__init__() + print("env.single_observation_space.shape") + print(env.single_observation_space.shape) self.fc1 = nn.Linear(np.array(env.single_observation_space.shape).prod(), 256) self.fc2 = nn.Linear(256, 256) self.fc_mu = nn.Linear(256, np.prod(env.single_action_space.shape)) @@ -92,7 +132,7 @@ def forward(self, x): x = F.relu(self.fc1(x)) x = F.relu(self.fc2(x)) x = torch.tanh(self.fc_mu(x)) - return x * self.action_scale + self.action_bias + return x * self.action_scale + self.action_bias, None if __name__ == "__main__": @@ -167,7 +207,7 @@ def forward(self, x): actions = np.array([envs.single_action_space.sample() for _ in range(envs.num_envs)]) else: with torch.no_grad(): - actions = actor(torch.Tensor(obs).to(device)) + actions, _ = actor(torch.Tensor(obs).to(device)) actions += torch.normal(0, actor.action_scale * args.exploration_noise) actions = actions.cpu().numpy().clip(envs.single_action_space.low, envs.single_action_space.high) From d04e5b6c3ad71460bd6d7a55087e35acc7cd5a01 Mon Sep 17 00:00:00 2001 From: Nora Tseng Date: Wed, 19 Jun 2024 14:40:12 -0700 Subject: [PATCH 4/5] sac + td3 working --- src/ddpg.py | 2 +- src/sac.py | 2 +- src/td3.py | 50 +++++++++++++++++++++++++++++++++----------------- 3 files changed, 35 insertions(+), 19 deletions(-) diff --git a/src/ddpg.py b/src/ddpg.py index 8f87e72..4d78a64 100644 --- a/src/ddpg.py +++ b/src/ddpg.py @@ -59,7 +59,7 @@ class Args: learning_rate: float = 1e-3 # learning rate of optimizer buffer_size: int = int(1e6) # replay memory buffer size gamma: float = 0.99 # discount factor gamma - tau: float = 0.005 # target smoothing coefficient (default: 0.005) + tau: float = 0.005 # target smoothing coefficient (default: 0.005) batch_size: int = 256 # batch size of sample from the reply memory exploration_noise: float = 0.1 # scale of exploration noise # learning_starts: int = 0 # timestep to start learning diff --git a/src/sac.py b/src/sac.py index 417bcf9..c5aecbd 100644 --- a/src/sac.py +++ b/src/sac.py @@ -245,8 +245,8 @@ def get_action(self, x): eval_env = gym.vector.SyncVectorEnv([make_env(args.env_id, args.env_kwargs, 0, 0, False, run_name)]) evaluator = Evaluator(actor, eval_env, args.save_dir, n_eval_episodes=args.n_eval_episodes) - start_time = time.time() num_updates = 0 + start_time = time.time() # TRY NOT TO MODIFY: start the game obs, _ = envs.reset(seed=args.seed) diff --git a/src/td3.py b/src/td3.py index 9a1bb03..6277515 100644 --- a/src/td3.py +++ b/src/td3.py @@ -40,6 +40,13 @@ class Args: save_model: bool = False # whether to save model into the `runs/{run_name}` folder # Algorithm specific arguments + env_kwargs: dict[str, Union[bool, float, str]] = field(default_factory=dict) + """ + usage: --env_kwargs arg1 val1 arg2 val2 arg3 val3 + + To make PointMaze tasks use a sparse reward function: + --env_kwargs continuing_task False + """ env_id: str = "Hopper-v4" # the id of the environment total_timesteps: int = 1000000 # total timesteps of the experiments learning_rate: float = 3e-4 # learning rate of the optimizer @@ -52,12 +59,14 @@ class Args: learning_starts: int = 25e3 # timestep to start learning policy_frequency: int = 2 # the frequency of training policy (delayed noise_clip: float = 0.5 # noise clip parameter of the Target Policy Smoothing Regularization + eval_freq: Optional[int] = None # num of timesteps between policy evals + n_eval_episodes: int = 80 # num of eval episodes def __post_init__(self): - # if self.eval_freq == None: - # # 20 evals per training run unless specified otherwise. - # self.eval_freq = self.total_timesteps/20 + if self.eval_freq == None: + # 20 evals per training run unless specified otherwise. + self.eval_freq = self.total_timesteps/20 if self.save_subdir == None: self.save_dir = f"{self.save_rootdir}/{self.env_id}/td3" @@ -79,27 +88,27 @@ def __post_init__(self): yaml.dump(self, f, sort_keys=True) -def make_env(env_id, seed, idx, capture_video, run_name): +def make_env(env_id, env_kwargs, seed, idx, capture_video, run_name): def thunk(): if capture_video and idx == 0: - env = gym.make(env_id, render_mode="rgb_array") + env = gym.make(env_id, render_mode="rgb_array", **env_kwargs) env = gym.wrappers.RecordVideo(env, f"videos/{run_name}") else: - env = gym.make(env_id) + env = gym.make(env_id, **env_kwargs) + + # Flatten Dict obs so we don't need to handle them a special case in DA + if isinstance(env.observation_space, gym.spaces.Dict): + env = gym.wrappers.FlattenObservation(env) env = gym.wrappers.RecordEpisodeStatistics(env) env.action_space.seed(seed) return env return thunk - # ALGO LOGIC: initialize agent here: class QNetwork(nn.Module): def __init__(self, env): super().__init__() - print("QNET env.single_observation_space.shape") - print(env.single_observation_space.shape) - self.fc1 = nn.Linear(np.array(env.single_observation_space.shape).prod() + np.prod(env.single_action_space.shape), 256) self.fc2 = nn.Linear(256, 256) self.fc3 = nn.Linear(256, 1) @@ -115,8 +124,6 @@ def forward(self, x, a): class Actor(nn.Module): def __init__(self, env): super().__init__() - print("env.single_observation_space.shape") - print(env.single_observation_space.shape) self.fc1 = nn.Linear(np.array(env.single_observation_space.shape).prod(), 256) self.fc2 = nn.Linear(256, 256) self.fc_mu = nn.Linear(256, np.prod(env.single_action_space.shape)) @@ -171,10 +178,11 @@ def forward(self, x): torch.manual_seed(args.seed) torch.backends.cudnn.deterministic = args.torch_deterministic - device = torch.device("cuda" if torch.cuda.is_available() and args.cuda else "cpu") + # device = torch.device("cuda" if torch.cuda.is_available() and args.cuda else "cpu") + device = torch.device("cpu") # env setup - envs = gym.vector.SyncVectorEnv([make_env(args.env_id, args.seed, 0, args.capture_video, run_name)]) + envs = gym.vector.SyncVectorEnv([make_env(args.env_id, args.env_kwargs, args.seed, 0, args.capture_video, run_name)]) assert isinstance(envs.single_action_space, gym.spaces.Box), "only continuous action space is supported" actor = Actor(envs).to(device) @@ -197,6 +205,11 @@ def forward(self, x): device, handle_timeout_termination=False, ) + + eval_env = gym.vector.SyncVectorEnv([make_env(args.env_id, args.env_kwargs, 0, 0, False, run_name)]) + evaluator = Evaluator(actor, eval_env, args.save_dir, n_eval_episodes=args.n_eval_episodes) + + num_updates = 0 start_time = time.time() # TRY NOT TO MODIFY: start the game @@ -217,7 +230,7 @@ def forward(self, x): # TRY NOT TO MODIFY: record rewards for plotting purposes if "final_info" in infos: for info in infos["final_info"]: - print(f"global_step={global_step}, episodic_return={info['episode']['r']}") + # print(f"global_step={global_step}, episodic_return={info['episode']['r']}") writer.add_scalar("charts/episodic_return", info["episode"]["r"], global_step) writer.add_scalar("charts/episodic_length", info["episode"]["l"], global_step) break @@ -280,9 +293,13 @@ def forward(self, x): writer.add_scalar("losses/qf2_loss", qf2_loss.item(), global_step) writer.add_scalar("losses/qf_loss", qf_loss.item() / 2.0, global_step) writer.add_scalar("losses/actor_loss", actor_loss.item(), global_step) - print("SPS:", int(global_step / (time.time() - start_time))) + # print("SPS:", int(global_step / (time.time() - start_time))) writer.add_scalar("charts/SPS", int(global_step / (time.time() - start_time)), global_step) + if global_step % args.eval_freq == 0: + evaluator.evaluate(global_step, num_updates=num_updates) + + if args.save_model: model_path = f"runs/{run_name}/{args.exp_name}.cleanrl_model" torch.save((actor.state_dict(), qf1.state_dict(), qf2.state_dict()), model_path) @@ -304,7 +321,6 @@ def forward(self, x): if args.upload_model: from cleanrl_utils.huggingface import push_to_hub - repo_name = f"{args.env_id}-{args.exp_name}-seed{args.seed}" repo_id = f"{args.hf_entity}/{repo_name}" if args.hf_entity else repo_name push_to_hub(args, episodic_returns, repo_id, "TD3", f"runs/{run_name}", f"videos/{run_name}-eval") From 99d48d3cfc035350a2e9b1fb4a3102d2ee71ec07 Mon Sep 17 00:00:00 2001 From: Nora Tseng Date: Sun, 30 Jun 2024 19:23:56 -0700 Subject: [PATCH 5/5] separate network sizes for actor/critic --- src/ddpg.py | 16 +++++++++++----- src/plotting/utils.py | 4 ++-- 2 files changed, 13 insertions(+), 7 deletions(-) diff --git a/src/ddpg.py b/src/ddpg.py index 4d78a64..50cd892 100644 --- a/src/ddpg.py +++ b/src/ddpg.py @@ -55,7 +55,8 @@ class Args: save_model: bool = False # whether to save model into the `runs/{run_name}` folder # Algorithm specific arguments - net_arch: list[int] = (256, 256, 256) # Structure of the network, default 64,64 + actor_net_arch: list[int] = (64,64) # Structure of actor network, default 64,64 + critic_net_arch: list[int] = (64,64) # Structure of critic network, default 64,64 learning_rate: float = 1e-3 # learning rate of optimizer buffer_size: int = int(1e6) # replay memory buffer size gamma: float = 0.99 # discount factor gamma @@ -123,13 +124,12 @@ class QNetwork(nn.Module): def __init__(self, env): super().__init__() valid_networks = [(64, 64), (256,256), (256,256,256), [64,64], [256,256], [256,256,256]] - if args.net_arch in valid_networks: - arch = np.array(args.net_arch) + if args.critic_net_arch in valid_networks: + arch = np.array(args.critic_net_arch) else: # invalid architecture print("Exiting: incorrect network architecture. example: ") print("--net_arch 256 256 256") exit() - self.dims = len(arch) if self.dims == 2: # 64,64 or 256,256 self.fc1 = nn.Linear(np.array(env.single_observation_space.shape).prod() + np.prod(env.single_action_space.shape), \ @@ -163,7 +163,13 @@ def forward(self, x, a): class Actor(nn.Module): def __init__(self, env): super().__init__() - arch = np.array(args.net_arch) + valid_networks = [(64, 64), (256,256), (256,256,256), [64,64], [256,256], [256,256,256]] + if args.actor_net_arch in valid_networks: + arch = np.array(args.actor_net_arch) + else: # invalid architecture + print("Exiting: incorrect network architecture. example: ") + print("--net_arch 256 256 256") + exit() self.dims = len(arch) if self.dims == 2: # 64,64 or 256,256 self.fc1 = nn.Linear(np.array(env.single_observation_space.shape).prod(), arch[0]) diff --git a/src/plotting/utils.py b/src/plotting/utils.py index 223bb2c..2a34aa9 100644 --- a/src/plotting/utils.py +++ b/src/plotting/utils.py @@ -142,7 +142,7 @@ def get_paths(results_dir, filename='evaluations.npz'): paths.append(f'{results_dir}/{subdir}/{filename}') return paths - +# TODO: Add addit. arg to choose timesteps or updates def get_data(results_dir, field_name='returns', filename='evaluations.npz'): try: @@ -164,6 +164,6 @@ def get_data(results_dir, field_name='returns', filename='evaluations.npz'): avg_vals = vals results.append(avg_vals) - timesteps = data['timesteps'] + timesteps = data['timesteps'] #### return timesteps, np.array(results)