Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 15 additions & 9 deletions src/ddpg.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,11 +55,12 @@ class Args:
save_model: bool = False # whether to save model into the `runs/{run_name}` folder

# Algorithm specific arguments
net_arch: list[int] = (256, 256, 256) # Structure of the network, default 64,64
actor_net_arch: list[int] = (64,64) # Structure of actor network, default 64,64
critic_net_arch: list[int] = (64,64) # Structure of critic network, default 64,64
learning_rate: float = 1e-3 # learning rate of optimizer
buffer_size: int = int(1e6) # replay memory buffer size
gamma: float = 0.99 # discount factor gamma
tau: float = 0.005 # target smoothing coefficient (default: 0.005)
tau: float = 0.005 # target smoothing coefficient (default: 0.005)
batch_size: int = 256 # batch size of samples drawn from the replay memory
exploration_noise: float = 0.1 # scale of exploration noise
# learning_starts: int = 0 # timestep to start learning
Expand Down Expand Up @@ -123,13 +124,12 @@ class QNetwork(nn.Module):
def __init__(self, env):
super().__init__()
valid_networks = [(64, 64), (256,256), (256,256,256), [64,64], [256,256], [256,256,256]]
if args.net_arch in valid_networks:
arch = np.array(args.net_arch)
if args.critic_net_arch in valid_networks:
arch = np.array(args.critic_net_arch)
else: # invalid architecture
print("Exiting: incorrect network architecture. example: ")
print("--net_arch 256 256 256")
exit()

self.dims = len(arch)
if self.dims == 2: # 64,64 or 256,256
self.fc1 = nn.Linear(np.array(env.single_observation_space.shape).prod() + np.prod(env.single_action_space.shape), \
Expand Down Expand Up @@ -163,7 +163,13 @@ def forward(self, x, a):
class Actor(nn.Module):
def __init__(self, env):
super().__init__()
arch = np.array(args.net_arch)
valid_networks = [(64, 64), (256,256), (256,256,256), [64,64], [256,256], [256,256,256]]
if args.actor_net_arch in valid_networks:
arch = np.array(args.actor_net_arch)
else: # invalid architecture
print("Exiting: incorrect network architecture. example: ")
print("--net_arch 256 256 256")
exit()
self.dims = len(arch)
if self.dims == 2: # 64,64 or 256,256
self.fc1 = nn.Linear(np.array(env.single_observation_space.shape).prod(), arch[0])
Expand All @@ -188,13 +194,13 @@ def forward(self, x):
x = F.relu(self.fc1(x))
x = F.relu(self.fc2(x))
x = torch.tanh(self.fc_mu(x))
return x * self.action_scale + self.action_bias
return x * self.action_scale + self.action_bias, None
elif self.dims == 3:
x = F.relu(self.fc1(x))
x = F.relu(self.fc2(x))
x = F.relu(self.fc3(x))
x = torch.tanh(self.fc_mu(x))
return x * self.action_scale + self.action_bias
return x * self.action_scale + self.action_bias, None


if __name__ == "__main__":
Expand Down Expand Up @@ -289,7 +295,7 @@ def forward(self, x):
actions = np.array([envs.single_action_space.sample() for _ in range(envs.num_envs)])
else:
with torch.no_grad():
actions = actor(torch.Tensor(obs).to(device))
actions, _ = actor(torch.Tensor(obs).to(device))
actions += torch.normal(0, actor.action_scale * args.exploration_noise)
actions = actions.cpu().numpy().clip(envs.single_action_space.low, envs.single_action_space.high)

Expand Down
2 changes: 1 addition & 1 deletion src/evaluator.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,7 +71,7 @@ def _evaluate(self):
while not done:
# ALGO LOGIC: put action logic here
with torch.no_grad():
actions = self.actor(torch.Tensor(obs).to(self.device))
actions, _ = self.actor(torch.Tensor(obs).to(self.device))
actions = actions.cpu().numpy().clip(self.eval_env.action_space.low,
self.eval_env.action_space.high)

Expand Down
4 changes: 2 additions & 2 deletions src/plotting/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -142,7 +142,7 @@ def get_paths(results_dir, filename='evaluations.npz'):
paths.append(f'{results_dir}/{subdir}/{filename}')
return paths


# TODO: Add an additional argument to choose between timesteps and updates
def get_data(results_dir, field_name='returns', filename='evaluations.npz'):

try:
Expand All @@ -164,6 +164,6 @@ def get_data(results_dir, field_name='returns', filename='evaluations.npz'):
avg_vals = vals

results.append(avg_vals)
timesteps = data['timesteps']
timesteps = data['timesteps'] ####

return timesteps, np.array(results)
80 changes: 52 additions & 28 deletions src/sac.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,8 @@
class Args:
# wandb tracking
exp_name: str = os.path.basename(__file__)[: -len(".py")] # name of this experiment
seed: int = 1
seed: Optional[int] = None # seed of the experiment
# seed: int = 1
torch_deterministic: bool = True # if toggled, torch.backends.cudnn.deterministic=False
cuda: bool = True
track: bool = False # tracked with Weights and Biases
Expand Down Expand Up @@ -60,33 +61,47 @@ class Args:
save_rootdir: str = "results" # top-level directory where results will be saved
save_subdir: Optional[str] = None # lower level directories
save_dir: str = field(init=False) # the lower-level directories
save_model: bool = False # whether to save model into the `runs/{run_name}` folder
eval_freq: Optional[int] = None # num of timesteps between policy evals
n_eval_episodes: int = 80 # num of eval episodes

def __post_init__(self):
if self.save_subdir == None:
self.save_dir = f"{self.save_rootdir}/{self.env_id}/ddpg"
else:
self.save_dir = f"{self.save_rootdir}/{self.env_id}/ddpg/{self.save_subdir}"
if self.run_id is None:
self.run_id = get_latest_run_id(save_dir=self.save_dir) + 1
self.save_dir += f"/run_{self.run_id}"
if self.seed is None:
self.seed = self.run_id
else:
self.seed = np.random.randint(2 ** 32 - 1)
def __post_init__(self):
if self.eval_freq == None:
# 20 evals per training run unless specified otherwise.
self.eval_freq = self.total_timesteps/20

# dump training config to save dir
os.makedirs(self.save_dir, exist_ok=True)
with open(os.path.join(self.save_dir, "config.yml"), "w") as f:
yaml.dump(self, f, sort_keys=True)
if self.save_subdir == None:
self.save_dir = f"{self.save_rootdir}/{self.env_id}/sac"
else:
self.save_dir = f"{self.save_rootdir}/{self.env_id}/sac/{self.save_subdir}"
if self.run_id is None:
self.run_id = get_latest_run_id(save_dir=self.save_dir) + 1
self.save_dir += f"/run_{self.run_id}"
if self.seed is None:
self.seed = self.run_id
else:
self.seed = np.random.randint(2 ** 32 - 1)

print("self.save_dir = "+self.save_dir)

def make_env(env_id, seed, idx, capture_video, run_name):
# dump training config to save dir
os.makedirs(self.save_dir, exist_ok=True)
with open(os.path.join(self.save_dir, "config.yml"), "w") as f:
yaml.dump(self, f, sort_keys=True)


## added env_kwargs from ddpg.py
def make_env(env_id, env_kwargs, seed, idx, capture_video, run_name):
def thunk():
if capture_video and idx == 0:
env = gym.make(env_id, render_mode="rgb_array")
env = gym.make(env_id, render_mode="rgb_array", **env_kwargs)
env = gym.wrappers.RecordVideo(env, f"videos/{run_name}")
else:
env = gym.make(env_id)
env = gym.make(env_id, **env_kwargs)

# Flatten Dict observations so we don't need to handle them as a special case in DA
if isinstance(env.observation_space, gym.spaces.Dict):
env = gym.wrappers.FlattenObservation(env)
env = gym.wrappers.RecordEpisodeStatistics(env)
env.action_space.seed(seed)
return env
Expand Down Expand Up @@ -190,10 +205,11 @@ def get_action(self, x):
torch.manual_seed(args.seed)
torch.backends.cudnn.deterministic = args.torch_deterministic

device = torch.device("cuda" if torch.cuda.is_available() and args.cuda else "cpu")
# device = torch.device("cuda" if torch.cuda.is_available() and args.cuda else "cpu")
device = torch.device("cpu")

# env setup
envs = gym.vector.SyncVectorEnv([make_env(args.env_id, args.seed, 0, args.capture_video, run_name)])
envs = gym.vector.SyncVectorEnv([make_env(args.env_id, args.env_kwargs, args.seed, 0, args.capture_video, run_name)])
assert isinstance(envs.single_action_space, gym.spaces.Box), "only continuous action space is supported"

max_action = float(envs.single_action_space.high[0])
Expand Down Expand Up @@ -226,11 +242,11 @@ def get_action(self, x):
handle_timeout_termination=False,
)

# eval_env = gym.vector.SyncVectorEnv([make_env(args.env_id, 0, 0, False, run_name)])
# evaluator = Evaluator(actor, eval_env, args.save_dir, n_eval_episodes=args.n_eval_episodes)
eval_env = gym.vector.SyncVectorEnv([make_env(args.env_id, args.env_kwargs, 0, 0, False, run_name)])
evaluator = Evaluator(actor, eval_env, args.save_dir, n_eval_episodes=args.n_eval_episodes)

start_time = time.time()
num_updates = 0
start_time = time.time()

# TRY NOT TO MODIFY: start the game
obs, _ = envs.reset(seed=args.seed)
Expand All @@ -239,9 +255,10 @@ def get_action(self, x):
if global_step < args.learning_starts:
actions = np.array([envs.single_action_space.sample() for _ in range(envs.num_envs)])
else:
actions, _, _ = actor.get_action(torch.Tensor(obs).to(device))
actions, _, _ = actor.get_action(torch.Tensor(obs).to(device))
actions = actions.detach().cpu().numpy()


# TRY NOT TO MODIFY: execute the game and log data.
next_obs, rewards, terminations, truncations, infos = envs.step(actions)

Expand Down Expand Up @@ -329,8 +346,15 @@ def get_action(self, x):
if args.autotune:
writer.add_scalar("losses/alpha_loss", alpha_loss.item(), global_step)

# if global_step % args.eval_freq == 0:
# evaluator.evaluate(global_step, num_updates=num_updates)
if global_step % args.eval_freq == 0:
evaluator.evaluate(global_step, num_updates=num_updates)


if args.save_model:
model_path = f"{args.save_dir}/model"
torch.save((actor.state_dict(), qf1.state_dict()), model_path)
print(f"model saved to {model_path}")


envs.close()
writer.close()
Loading