From e9f0b4108cc9576579af6e4c1484abf3de678290 Mon Sep 17 00:00:00 2001
From: Nora Tseng <nora@tseng.us>
Date: Thu, 30 May 2024 22:15:55 -0700
Subject: [PATCH 1/5] added sac and td3 (continuous), working on using
 evaluator

---
 src/sac.py | 336 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 src/td3.py | 273 +++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 609 insertions(+)
 create mode 100644 src/sac.py
 create mode 100644 src/td3.py

diff --git a/src/sac.py b/src/sac.py
new file mode 100644
index 0000000..0c3d97d
--- /dev/null
+++ b/src/sac.py
@@ -0,0 +1,336 @@
+# docs and experiment results can be found at https://docs.cleanrl.dev/rl-algorithms/sac/#sac_continuous_actionpy
+import os
+import random
+import time
+from dataclasses import dataclass, field
+from typing import Optional, Union
+
+import gymnasium as gym
+import numpy as np
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import torch.optim as optim
+import tyro
+from stable_baselines3.common.buffers import ReplayBuffer
+from torch.utils.tensorboard import SummaryWriter
+
+from src.utils import get_latest_run_id
+from src.evaluator import Evaluator
+import yaml
+
+@dataclass
+class Args:
+    # wandb tracking
+    exp_name: str = os.path.basename(__file__)[: -len(".py")] # name of this experiment
+    seed: int = 1
+    torch_deterministic: bool = True # if toggled, torch.backends.cudnn.deterministic=False
+    cuda: bool = True
+    track: bool = False # tracked with Weights and Biases
+    wandb_project_name: str = "cleanRL" # wandb's project name
+    wandb_entity: str = None # the entity (team) of wandb's project
+    capture_video: bool = False # whether to capture videos of the agent performances (check out `videos` folder)
+
+    # Algorithm specific arguments
+    env_id: str = "Hopper-v4" # environment id of the task
+    env_kwargs: dict[str, Union[bool, float, str]] = field(default_factory=dict) 
+    """
+    usage: --env_kwargs arg1 val1 arg2 val2 arg3 val3
+    
+    To make PointMaze tasks use a sparse reward function:
+        --env_kwargs continuing_task False
+    """
+    total_timesteps: int = 1000000 # total timesteps of the experiments
+    buffer_size: int = int(1e6) # the replay memory buffer size
+    gamma: float = 0.99     # discount factor gamma
+    tau: float = 0.005      # target smoothing coefficient
+    batch_size: int = 256   # batch size of sample from the replay memory
+    learning_starts: int = 5e3 # timestep to start learning
+    policy_lr: float = 3e-4 # learning rate of the policy network optimizer
+    q_lr: float = 1e-3      # learning rate of the Q network network optimizer
+    policy_frequency: int = 2   # frequency of training policy (delayed)
+    target_network_frequency: int = 1  # Denis Yarats' implementation delays this by 2.
+    """the frequency of updates for the target nerworks"""
+    noise_clip: float = 0.5 # noise clip parameter of the Target Policy Smoothing Regularization
+    alpha: float = 0.2 # Entropy regularization coefficient.
+    autotune: bool = True # automatic tuning of the entropy coefficient
+
+    # added args from ddpg.py
+    run_id: Optional[int] = None
+    save_rootdir: str = "results"           # top-level directory where results will be saved
+    save_subdir: Optional[str] = None       # lower level directories
+    save_dir: str = field(init=False)       # the lower-level directories 
+
+def __post_init__(self):
+    if self.save_subdir == None:
+        self.save_dir = f"{self.save_rootdir}/{self.env_id}/ddpg"
+    else:
+        self.save_dir = f"{self.save_rootdir}/{self.env_id}/ddpg/{self.save_subdir}"
+    if self.run_id is None:
+        self.run_id = get_latest_run_id(save_dir=self.save_dir) + 1
+    self.save_dir += f"/run_{self.run_id}"
+    if self.seed is None:
+        self.seed = self.run_id
+    else:
+        self.seed = np.random.randint(2 ** 32 - 1)
+
+    # dump training config to save dir
+    os.makedirs(self.save_dir, exist_ok=True)
+    with open(os.path.join(self.save_dir, "config.yml"), "w") as f:
+        yaml.dump(self, f, sort_keys=True)
+
+
+def make_env(env_id, seed, idx, capture_video, run_name):
+    def thunk():
+        if capture_video and idx == 0:
+            env = gym.make(env_id, render_mode="rgb_array")
+            env = gym.wrappers.RecordVideo(env, f"videos/{run_name}")
+        else:
+            env = gym.make(env_id)
+        env = gym.wrappers.RecordEpisodeStatistics(env)
+        env.action_space.seed(seed)
+        return env
+
+    return thunk
+
+
+# ALGO LOGIC: initialize agent here:
+class SoftQNetwork(nn.Module):
+    def __init__(self, env):
+        super().__init__()
+        self.fc1 = nn.Linear(np.array(env.single_observation_space.shape).prod() + np.prod(env.single_action_space.shape), 256)
+        self.fc2 = nn.Linear(256, 256)
+        self.fc3 = nn.Linear(256, 1)
+
+    def forward(self, x, a):
+        x = torch.cat([x, a], 1)
+        x = F.relu(self.fc1(x))
+        x = F.relu(self.fc2(x))
+        x = self.fc3(x)
+        return x
+
+
+LOG_STD_MAX = 2
+LOG_STD_MIN = -5
+
+
+class Actor(nn.Module):
+    def __init__(self, env):
+        super().__init__()
+        self.fc1 = nn.Linear(np.array(env.single_observation_space.shape).prod(), 256)
+        self.fc2 = nn.Linear(256, 256)
+        self.fc_mean = nn.Linear(256, np.prod(env.single_action_space.shape))
+        self.fc_logstd = nn.Linear(256, np.prod(env.single_action_space.shape))
+        # action rescaling
+        self.register_buffer(
+            "action_scale", torch.tensor((env.action_space.high - env.action_space.low) / 2.0, dtype=torch.float32)
+        )
+        self.register_buffer(
+            "action_bias", torch.tensor((env.action_space.high + env.action_space.low) / 2.0, dtype=torch.float32)
+        )
+
+    def forward(self, x):
+        x = F.relu(self.fc1(x))
+        x = F.relu(self.fc2(x))
+        mean = self.fc_mean(x)
+        log_std = self.fc_logstd(x)
+        log_std = torch.tanh(log_std)
+        log_std = LOG_STD_MIN + 0.5 * (LOG_STD_MAX - LOG_STD_MIN) * (log_std + 1)  # From SpinUp / Denis Yarats
+
+        return mean, log_std
+
+    def get_action(self, x):
+        mean, log_std = self(x)
+        std = log_std.exp()
+        normal = torch.distributions.Normal(mean, std)
+        x_t = normal.rsample()  # for reparameterization trick (mean + std * N(0,1))
+        y_t = torch.tanh(x_t)
+        action = y_t * self.action_scale + self.action_bias
+        log_prob = normal.log_prob(x_t)
+        # Enforcing Action Bound
+        log_prob -= torch.log(self.action_scale * (1 - y_t.pow(2)) + 1e-6)
+        log_prob = log_prob.sum(1, keepdim=True)
+        mean = torch.tanh(mean) * self.action_scale + self.action_bias
+        return action, log_prob, mean
+
+
+if __name__ == "__main__":
+    import stable_baselines3 as sb3
+
+    if sb3.__version__ < "2.0":
+        raise ValueError(
+            """Ongoing migration: run the following command to install the new dependencies:
+poetry run pip install "stable_baselines3==2.0.0a1"
+"""
+        )
+
+    args = tyro.cli(Args)
+    run_name = f"{args.env_id}__{args.exp_name}__{args.seed}__{int(time.time())}"
+    if args.track:
+        import wandb
+
+        wandb.init(
+            project=args.wandb_project_name,
+            entity=args.wandb_entity,
+            sync_tensorboard=True,
+            config=vars(args),
+            name=run_name,
+            monitor_gym=True,
+            save_code=True,
+        )
+    writer = SummaryWriter(f"runs/{run_name}")
+    writer.add_text(
+        "hyperparameters",
+        "|param|value|\n|-|-|\n%s" % ("\n".join([f"|{key}|{value}|" for key, value in vars(args).items()])),
+    )
+
+    # TRY NOT TO MODIFY: seeding
+    random.seed(args.seed)
+    np.random.seed(args.seed)
+    torch.manual_seed(args.seed)
+    torch.backends.cudnn.deterministic = args.torch_deterministic
+
+    device = torch.device("cuda" if torch.cuda.is_available() and args.cuda else "cpu")
+
+    # env setup
+    envs = gym.vector.SyncVectorEnv([make_env(args.env_id, args.seed, 0, args.capture_video, run_name)])
+    assert isinstance(envs.single_action_space, gym.spaces.Box), "only continuous action space is supported"
+
+    max_action = float(envs.single_action_space.high[0])
+
+    actor = Actor(envs).to(device)
+    qf1 = SoftQNetwork(envs).to(device)
+    qf2 = SoftQNetwork(envs).to(device)
+    qf1_target = SoftQNetwork(envs).to(device)
+    qf2_target = SoftQNetwork(envs).to(device)
+    qf1_target.load_state_dict(qf1.state_dict())
+    qf2_target.load_state_dict(qf2.state_dict())
+    q_optimizer = optim.Adam(list(qf1.parameters()) + list(qf2.parameters()), lr=args.q_lr)
+    actor_optimizer = optim.Adam(list(actor.parameters()), lr=args.policy_lr)
+
+    # Automatic entropy tuning
+    if args.autotune:
+        target_entropy = -torch.prod(torch.Tensor(envs.single_action_space.shape).to(device)).item()
+        log_alpha = torch.zeros(1, requires_grad=True, device=device)
+        alpha = log_alpha.exp().item()
+        a_optimizer = optim.Adam([log_alpha], lr=args.q_lr)
+    else:
+        alpha = args.alpha
+
+    envs.single_observation_space.dtype = np.float32
+    rb = ReplayBuffer(
+        args.buffer_size,
+        envs.single_observation_space,
+        envs.single_action_space,
+        device,
+        handle_timeout_termination=False,
+    )
+    
+    # eval_env = gym.vector.SyncVectorEnv([make_env(args.env_id, 0, 0, False, run_name)])
+    # evaluator = Evaluator(actor, eval_env, args.save_dir, n_eval_episodes=args.n_eval_episodes)
+
+    start_time = time.time()
+    num_updates = 0
+
+    # TRY NOT TO MODIFY: start the game
+    obs, _ = envs.reset(seed=args.seed)
+    for global_step in range(args.total_timesteps):
+        # ALGO LOGIC: put action logic here
+        if global_step < args.learning_starts:
+            actions = np.array([envs.single_action_space.sample() for _ in range(envs.num_envs)])
+        else:
+            actions, _, _ = actor.get_action(torch.Tensor(obs).to(device))
+            actions = actions.detach().cpu().numpy()
+
+        # TRY NOT TO MODIFY: execute the game and log data.
+        next_obs, rewards, terminations, truncations, infos = envs.step(actions)
+
+        # TRY NOT TO MODIFY: record rewards for plotting purposes
+        if "final_info" in infos:
+            for info in infos["final_info"]:
+                # print(f"global_step={global_step}, episodic_return={info['episode']['r']}")
+                writer.add_scalar("charts/episodic_return", info["episode"]["r"], global_step)
+                writer.add_scalar("charts/episodic_length", info["episode"]["l"], global_step)
+                break
+
+        # TRY NOT TO MODIFY: save data to replay buffer; handle `final_observation`
+        real_next_obs = next_obs.copy()
+        for idx, trunc in enumerate(truncations):
+            if trunc:
+                real_next_obs[idx] = infos["final_observation"][idx]
+        rb.add(obs, real_next_obs, actions, rewards, terminations, infos)
+
+        # TRY NOT TO MODIFY: CRUCIAL step easy to overlook
+        obs = next_obs
+
+        # ALGO LOGIC: training.
+        if global_step > args.learning_starts:
+            num_updates += 1
+            data = rb.sample(args.batch_size)
+            with torch.no_grad():
+                next_state_actions, next_state_log_pi, _ = actor.get_action(data.next_observations)
+                qf1_next_target = qf1_target(data.next_observations, next_state_actions)
+                qf2_next_target = qf2_target(data.next_observations, next_state_actions)
+                min_qf_next_target = torch.min(qf1_next_target, qf2_next_target) - alpha * next_state_log_pi
+                next_q_value = data.rewards.flatten() + (1 - data.dones.flatten()) * args.gamma * (min_qf_next_target).view(-1)
+
+            qf1_a_values = qf1(data.observations, data.actions).view(-1)
+            qf2_a_values = qf2(data.observations, data.actions).view(-1)
+            qf1_loss = F.mse_loss(qf1_a_values, next_q_value)
+            qf2_loss = F.mse_loss(qf2_a_values, next_q_value)
+            qf_loss = qf1_loss + qf2_loss
+
+            # optimize the model
+            q_optimizer.zero_grad()
+            qf_loss.backward()
+            q_optimizer.step()
+
+            if global_step % args.policy_frequency == 0:  # TD 3 Delayed update support
+                for _ in range(
+                    args.policy_frequency
+                ):  # compensate for the delay by doing 'actor_update_interval' instead of 1
+                    pi, log_pi, _ = actor.get_action(data.observations)
+                    qf1_pi = qf1(data.observations, pi)
+                    qf2_pi = qf2(data.observations, pi)
+                    min_qf_pi = torch.min(qf1_pi, qf2_pi)
+                    actor_loss = ((alpha * log_pi) - min_qf_pi).mean()
+
+                    actor_optimizer.zero_grad()
+                    actor_loss.backward()
+                    actor_optimizer.step()
+
+                    if args.autotune:
+                        with torch.no_grad():
+                            _, log_pi, _ = actor.get_action(data.observations)
+                        alpha_loss = (-log_alpha.exp() * (log_pi + target_entropy)).mean()
+
+                        a_optimizer.zero_grad()
+                        alpha_loss.backward()
+                        a_optimizer.step()
+                        alpha = log_alpha.exp().item()
+
+            # update the target networks
+            if global_step % args.target_network_frequency == 0:
+                for param, target_param in zip(qf1.parameters(), qf1_target.parameters()):
+                    target_param.data.copy_(args.tau * param.data + (1 - args.tau) * target_param.data)
+                for param, target_param in zip(qf2.parameters(), qf2_target.parameters()):
+                    target_param.data.copy_(args.tau * param.data + (1 - args.tau) * target_param.data)
+
+            if global_step % 100 == 0:
+                writer.add_scalar("losses/qf1_values", qf1_a_values.mean().item(), global_step)
+                writer.add_scalar("losses/qf2_values", qf2_a_values.mean().item(), global_step)
+                writer.add_scalar("losses/qf1_loss", qf1_loss.item(), global_step)
+                writer.add_scalar("losses/qf2_loss", qf2_loss.item(), global_step)
+                writer.add_scalar("losses/qf_loss", qf_loss.item() / 2.0, global_step)
+                writer.add_scalar("losses/actor_loss", actor_loss.item(), global_step)
+                writer.add_scalar("losses/alpha", alpha, global_step)
+                # print("SPS:", int(global_step / (time.time() - start_time)))
+                writer.add_scalar("charts/SPS", int(global_step / (time.time() - start_time)), global_step)
+                if args.autotune:
+                    writer.add_scalar("losses/alpha_loss", alpha_loss.item(), global_step)
+
+        # if global_step % args.eval_freq == 0:
+        #     evaluator.evaluate(global_step, num_updates=num_updates)
+
+    envs.close()
+    writer.close()
\ No newline at end of file
diff --git a/src/td3.py b/src/td3.py
new file mode 100644
index 0000000..3ec0cdc
--- /dev/null
+++ b/src/td3.py
@@ -0,0 +1,273 @@
+# docs and experiment results can be found at https://docs.cleanrl.dev/rl-algorithms/td3/#td3_continuous_actionpy
+import os
+import random
+import time
+from dataclasses import dataclass
+
+import gymnasium as gym
+import numpy as np
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import torch.optim as optim
+import tyro
+from stable_baselines3.common.buffers import ReplayBuffer
+from torch.utils.tensorboard import SummaryWriter
+
+
+@dataclass
+class Args:
+    exp_name: str = os.path.basename(__file__)[: -len(".py")] # name of this experiment
+    seed: int = 1
+    torch_deterministic: bool = True # if toggled, torch.backends.cudnn.deterministic=False
+    cuda: bool = True # cuda will be enabled by default
+    track: bool = False # if toggled,  experiment will be tracked with Weights and Biases
+    wandb_project_name: str = "cleanRL"
+    wandb_entity: str = None # entity (team) of wandb's project
+    capture_video: bool = False # capture videos of the agent performances (check out `videos` folder)
+    save_model: bool = False # whether to save model into the `runs/{run_name}` folder
+    upload_model: bool = False # upload the saved model to huggingface
+    hf_entity: str = "" # user or org name of the model repository from the Hugging Face Hub
+
+    # Algorithm specific arguments
+    env_id: str = "Hopper-v4" # the id of the environment
+    total_timesteps: int = 1000000 # total timesteps of the experiments
+    learning_rate: float = 3e-4 # learning rate of the optimizer
+    buffer_size: int = int(1e6) # replay memory buffer size
+    gamma: float = 0.99     # the discount factor gamma
+    tau: float = 0.005      # target smoothing coefficient (default: 0.005)
+    batch_size: int = 256   # the batch size of sample from the replay memory
+    policy_noise: float = 0.2 # the scale of policy noise
+    exploration_noise: float = 0.1 # the scale of exploration noise
+    learning_starts: int = 25e3 # timestep to start learning
+    policy_frequency: int = 2 # the frequency of training policy (delayed)
+    noise_clip: float = 0.5 # noise clip parameter of the Target Policy Smoothing Regularization
+
+
+def make_env(env_id, seed, idx, capture_video, run_name):
+    def thunk():
+        if capture_video and idx == 0:
+            env = gym.make(env_id, render_mode="rgb_array")
+            env = gym.wrappers.RecordVideo(env, f"videos/{run_name}")
+        else:
+            env = gym.make(env_id)
+        env = gym.wrappers.RecordEpisodeStatistics(env)
+        env.action_space.seed(seed)
+        return env
+
+    return thunk
+
+
+# ALGO LOGIC: initialize agent here:
+class QNetwork(nn.Module):
+    def __init__(self, env):
+        super().__init__()
+        self.fc1 = nn.Linear(np.array(env.single_observation_space.shape).prod() + np.prod(env.single_action_space.shape), 256)
+        self.fc2 = nn.Linear(256, 256)
+        self.fc3 = nn.Linear(256, 1)
+
+    def forward(self, x, a):
+        x = torch.cat([x, a], 1)
+        x = F.relu(self.fc1(x))
+        x = F.relu(self.fc2(x))
+        x = self.fc3(x)
+        return x
+
+
+class Actor(nn.Module):
+    def __init__(self, env):
+        super().__init__()
+        self.fc1 = nn.Linear(np.array(env.single_observation_space.shape).prod(), 256)
+        self.fc2 = nn.Linear(256, 256)
+        self.fc_mu = nn.Linear(256, np.prod(env.single_action_space.shape))
+        # action rescaling
+        self.register_buffer(
+            "action_scale", torch.tensor((env.action_space.high - env.action_space.low) / 2.0, dtype=torch.float32)
+        )
+        self.register_buffer(
+            "action_bias", torch.tensor((env.action_space.high + env.action_space.low) / 2.0, dtype=torch.float32)
+        )
+
+    def forward(self, x):
+        x = F.relu(self.fc1(x))
+        x = F.relu(self.fc2(x))
+        x = torch.tanh(self.fc_mu(x))
+        return x * self.action_scale + self.action_bias
+
+
+if __name__ == "__main__":
+    import stable_baselines3 as sb3
+
+    if sb3.__version__ < "2.0":
+        raise ValueError(
+            """Ongoing migration: run the following command to install the new dependencies:
+poetry run pip install "stable_baselines3==2.0.0a1"
+"""
+        )
+
+    args = tyro.cli(Args)
+    run_name = f"{args.env_id}__{args.exp_name}__{args.seed}__{int(time.time())}"
+    if args.track:
+        import wandb
+
+        wandb.init(
+            project=args.wandb_project_name,
+            entity=args.wandb_entity,
+            sync_tensorboard=True,
+            config=vars(args),
+            name=run_name,
+            monitor_gym=True,
+            save_code=True,
+        )
+    writer = SummaryWriter(f"runs/{run_name}")
+    writer.add_text(
+        "hyperparameters",
+        "|param|value|\n|-|-|\n%s" % ("\n".join([f"|{key}|{value}|" for key, value in vars(args).items()])),
+    )
+
+    # TRY NOT TO MODIFY: seeding
+    random.seed(args.seed)
+    np.random.seed(args.seed)
+    torch.manual_seed(args.seed)
+    torch.backends.cudnn.deterministic = args.torch_deterministic
+
+    device = torch.device("cuda" if torch.cuda.is_available() and args.cuda else "cpu")
+
+    # env setup
+    envs = gym.vector.SyncVectorEnv([make_env(args.env_id, args.seed, 0, args.capture_video, run_name)])
+    assert isinstance(envs.single_action_space, gym.spaces.Box), "only continuous action space is supported"
+
+    actor = Actor(envs).to(device)
+    qf1 = QNetwork(envs).to(device)
+    qf2 = QNetwork(envs).to(device)
+    qf1_target = QNetwork(envs).to(device)
+    qf2_target = QNetwork(envs).to(device)
+    target_actor = Actor(envs).to(device)
+    target_actor.load_state_dict(actor.state_dict())
+    qf1_target.load_state_dict(qf1.state_dict())
+    qf2_target.load_state_dict(qf2.state_dict())
+    q_optimizer = optim.Adam(list(qf1.parameters()) + list(qf2.parameters()), lr=args.learning_rate)
+    actor_optimizer = optim.Adam(list(actor.parameters()), lr=args.learning_rate)
+
+    envs.single_observation_space.dtype = np.float32
+    rb = ReplayBuffer(
+        args.buffer_size,
+        envs.single_observation_space,
+        envs.single_action_space,
+        device,
+        handle_timeout_termination=False,
+    )
+    start_time = time.time()
+
+    # TRY NOT TO MODIFY: start the game
+    obs, _ = envs.reset(seed=args.seed)
+    for global_step in range(args.total_timesteps):
+        # ALGO LOGIC: put action logic here
+        if global_step < args.learning_starts:
+            actions = np.array([envs.single_action_space.sample() for _ in range(envs.num_envs)])
+        else:
+            with torch.no_grad():
+                actions = actor(torch.Tensor(obs).to(device))
+                actions += torch.normal(0, actor.action_scale * args.exploration_noise)
+                actions = actions.cpu().numpy().clip(envs.single_action_space.low, envs.single_action_space.high)
+
+        # TRY NOT TO MODIFY: execute the game and log data.
+        next_obs, rewards, terminations, truncations, infos = envs.step(actions)
+
+        # TRY NOT TO MODIFY: record rewards for plotting purposes
+        if "final_info" in infos:
+            for info in infos["final_info"]:
+                print(f"global_step={global_step}, episodic_return={info['episode']['r']}")
+                writer.add_scalar("charts/episodic_return", info["episode"]["r"], global_step)
+                writer.add_scalar("charts/episodic_length", info["episode"]["l"], global_step)
+                break
+
+        # TRY NOT TO MODIFY: save data to replay buffer; handle `final_observation`
+        real_next_obs = next_obs.copy()
+        for idx, trunc in enumerate(truncations):
+            if trunc:
+                real_next_obs[idx] = infos["final_observation"][idx]
+        rb.add(obs, real_next_obs, actions, rewards, terminations, infos)
+
+        # TRY NOT TO MODIFY: CRUCIAL step easy to overlook
+        obs = next_obs
+
+        # ALGO LOGIC: training.
+        if global_step > args.learning_starts:
+            data = rb.sample(args.batch_size)
+            with torch.no_grad():
+                clipped_noise = (torch.randn_like(data.actions, device=device) * args.policy_noise).clamp(
+                    -args.noise_clip, args.noise_clip
+                ) * target_actor.action_scale
+
+                next_state_actions = (target_actor(data.next_observations) + clipped_noise).clamp(
+                    envs.single_action_space.low[0], envs.single_action_space.high[0]
+                )
+                qf1_next_target = qf1_target(data.next_observations, next_state_actions)
+                qf2_next_target = qf2_target(data.next_observations, next_state_actions)
+                min_qf_next_target = torch.min(qf1_next_target, qf2_next_target)
+                next_q_value = data.rewards.flatten() + (1 - data.dones.flatten()) * args.gamma * (min_qf_next_target).view(-1)
+
+            qf1_a_values = qf1(data.observations, data.actions).view(-1)
+            qf2_a_values = qf2(data.observations, data.actions).view(-1)
+            qf1_loss = F.mse_loss(qf1_a_values, next_q_value)
+            qf2_loss = F.mse_loss(qf2_a_values, next_q_value)
+            qf_loss = qf1_loss + qf2_loss
+
+            # optimize the model
+            q_optimizer.zero_grad()
+            qf_loss.backward()
+            q_optimizer.step()
+
+            if global_step % args.policy_frequency == 0:
+                actor_loss = -qf1(data.observations, actor(data.observations)).mean()
+                actor_optimizer.zero_grad()
+                actor_loss.backward()
+                actor_optimizer.step()
+
+                # update the target network
+                for param, target_param in zip(actor.parameters(), target_actor.parameters()):
+                    target_param.data.copy_(args.tau * param.data + (1 - args.tau) * target_param.data)
+                for param, target_param in zip(qf1.parameters(), qf1_target.parameters()):
+                    target_param.data.copy_(args.tau * param.data + (1 - args.tau) * target_param.data)
+                for param, target_param in zip(qf2.parameters(), qf2_target.parameters()):
+                    target_param.data.copy_(args.tau * param.data + (1 - args.tau) * target_param.data)
+
+            if global_step % 100 == 0:
+                writer.add_scalar("losses/qf1_values", qf1_a_values.mean().item(), global_step)
+                writer.add_scalar("losses/qf2_values", qf2_a_values.mean().item(), global_step)
+                writer.add_scalar("losses/qf1_loss", qf1_loss.item(), global_step)
+                writer.add_scalar("losses/qf2_loss", qf2_loss.item(), global_step)
+                writer.add_scalar("losses/qf_loss", qf_loss.item() / 2.0, global_step)
+                writer.add_scalar("losses/actor_loss", actor_loss.item(), global_step)
+                print("SPS:", int(global_step / (time.time() - start_time)))
+                writer.add_scalar("charts/SPS", int(global_step / (time.time() - start_time)), global_step)
+
+    if args.save_model:
+        model_path = f"runs/{run_name}/{args.exp_name}.cleanrl_model"
+        torch.save((actor.state_dict(), qf1.state_dict(), qf2.state_dict()), model_path)
+        print(f"model saved to {model_path}")
+        from cleanrl_utils.evals.td3_eval import evaluate
+
+        episodic_returns = evaluate(
+            model_path,
+            make_env,
+            args.env_id,
+            eval_episodes=10,
+            run_name=f"{run_name}-eval",
+            Model=(Actor, QNetwork),
+            device=device,
+            exploration_noise=args.exploration_noise,
+        )
+        for idx, episodic_return in enumerate(episodic_returns):
+            writer.add_scalar("eval/episodic_return", episodic_return, idx)
+
+        if args.upload_model:
+            from cleanrl_utils.huggingface import push_to_hub
+
+            repo_name = f"{args.env_id}-{args.exp_name}-seed{args.seed}"
+            repo_id = f"{args.hf_entity}/{repo_name}" if args.hf_entity else repo_name
+            push_to_hub(args, episodic_returns, repo_id, "TD3", f"runs/{run_name}", f"videos/{run_name}-eval")
+
+    envs.close()
+    writer.close()
\ No newline at end of file

From 9438239463360cc989151474171a40b1f6b8489d Mon Sep 17 00:00:00 2001
From: Nora Tseng <nora@tseng.us>
Date: Wed, 12 Jun 2024 12:51:16 -0700
Subject: [PATCH 2/5] buggy sac

---
 src/sac.py | 83 ++++++++++++++++++++++++++++++++++++------------------
 1 file changed, 56 insertions(+), 27 deletions(-)

diff --git a/src/sac.py b/src/sac.py
index 0c3d97d..761264b 100644
--- a/src/sac.py
+++ b/src/sac.py
@@ -23,7 +23,8 @@
 class Args:
     # wandb tracking
     exp_name: str = os.path.basename(__file__)[: -len(".py")] # name of this experiment
-    seed: int = 1
+    seed: Optional[int] = None       # seed of the experiment
+    # seed: int = 1
     torch_deterministic: bool = True # if toggled, torch.backends.cudnn.deterministic=False
     cuda: bool = True
     track: bool = False # tracked with Weights and Biases
@@ -60,33 +61,47 @@ class Args:
     save_rootdir: str = "results"           # top-level directory where results will be saved
     save_subdir: Optional[str] = None       # lower level directories
     save_dir: str = field(init=False)       # the lower-level directories 
+    save_model: bool = False                # whether to save model into the `runs/{run_name}` folder
+    eval_freq: Optional[int] = None         # num of timesteps between policy evals
+    n_eval_episodes: int = 80               # num of eval episodes
 
-def __post_init__(self):
-    if self.save_subdir == None:
-        self.save_dir = f"{self.save_rootdir}/{self.env_id}/ddpg"
-    else:
-        self.save_dir = f"{self.save_rootdir}/{self.env_id}/ddpg/{self.save_subdir}"
-    if self.run_id is None:
-        self.run_id = get_latest_run_id(save_dir=self.save_dir) + 1
-    self.save_dir += f"/run_{self.run_id}"
-    if self.seed is None:
-        self.seed = self.run_id
-    else:
-        self.seed = np.random.randint(2 ** 32 - 1)
+    def __post_init__(self):
+        if self.eval_freq == None: 
+            # 20 evals per training run unless specified otherwise.
+            self.eval_freq = self.total_timesteps/20
+
+        if self.save_subdir == None:
+            self.save_dir = f"{self.save_rootdir}/{self.env_id}/sac"
+        else:
+            self.save_dir = f"{self.save_rootdir}/{self.env_id}/sac/{self.save_subdir}"
+        if self.run_id is None:
+            self.run_id = get_latest_run_id(save_dir=self.save_dir) + 1
+        self.save_dir += f"/run_{self.run_id}"
+        if self.seed is None:
+            self.seed = self.run_id
+        else:
+            self.seed = np.random.randint(2 ** 32 - 1)
+
+        print("self.save_dir = "+self.save_dir)
 
-    # dump training config to save dir
-    os.makedirs(self.save_dir, exist_ok=True)
-    with open(os.path.join(self.save_dir, "config.yml"), "w") as f:
-        yaml.dump(self, f, sort_keys=True)
+        # dump training config to save dir
+        os.makedirs(self.save_dir, exist_ok=True)
+        with open(os.path.join(self.save_dir, "config.yml"), "w") as f:
+            yaml.dump(self, f, sort_keys=True)
 
 
-def make_env(env_id, seed, idx, capture_video, run_name):
+## added env_kwargs from ddpg.py
+def make_env(env_id, env_kwargs, seed, idx, capture_video, run_name):
     def thunk():
         if capture_video and idx == 0:
-            env = gym.make(env_id, render_mode="rgb_array")
+            env = gym.make(env_id, render_mode="rgb_array", **env_kwargs)
             env = gym.wrappers.RecordVideo(env, f"videos/{run_name}")
         else:
-            env = gym.make(env_id)
+            env = gym.make(env_id, **env_kwargs)
+
+        # Flatten Dict obs so we don't need to handle them as a special case in DA
+        if isinstance(env.observation_space, gym.spaces.Dict):
+            env = gym.wrappers.FlattenObservation(env)
         env = gym.wrappers.RecordEpisodeStatistics(env)
         env.action_space.seed(seed)
         return env
@@ -190,10 +205,11 @@ def get_action(self, x):
     torch.manual_seed(args.seed)
     torch.backends.cudnn.deterministic = args.torch_deterministic
 
-    device = torch.device("cuda" if torch.cuda.is_available() and args.cuda else "cpu")
+    # device = torch.device("cuda" if torch.cuda.is_available() and args.cuda else "cpu")
+    device = torch.device("cpu")
 
     # env setup
-    envs = gym.vector.SyncVectorEnv([make_env(args.env_id, args.seed, 0, args.capture_video, run_name)])
+    envs = gym.vector.SyncVectorEnv([make_env(args.env_id, args.env_kwargs, args.seed, 0, args.capture_video, run_name)])
     assert isinstance(envs.single_action_space, gym.spaces.Box), "only continuous action space is supported"
 
     max_action = float(envs.single_action_space.high[0])
@@ -226,8 +242,8 @@ def get_action(self, x):
         handle_timeout_termination=False,
     )
     
-    # eval_env = gym.vector.SyncVectorEnv([make_env(args.env_id, 0, 0, False, run_name)])
-    # evaluator = Evaluator(actor, eval_env, args.save_dir, n_eval_episodes=args.n_eval_episodes)
+    eval_env = gym.vector.SyncVectorEnv([make_env(args.env_id, args.env_kwargs, 0, 0, False, run_name)])
+    evaluator = Evaluator(actor, eval_env, args.save_dir, n_eval_episodes=args.n_eval_episodes)
 
     start_time = time.time()
     num_updates = 0
@@ -239,8 +255,14 @@ def get_action(self, x):
         if global_step < args.learning_starts:
             actions = np.array([envs.single_action_space.sample() for _ in range(envs.num_envs)])
         else:
-            actions, _, _ = actor.get_action(torch.Tensor(obs).to(device))
+            actions, _, _ = actor.get_action(torch.Tensor(obs).to(device)) 
             actions = actions.detach().cpu().numpy()
+            ###### copied from ddpg
+            # with torch.no_grad():
+            #     actions = actor(torch.Tensor(obs).to(device))
+            #     actions += torch.normal(0, actor.action_scale * args.exploration_noise)
+            #     actions = actions.cpu().numpy().clip(envs.single_action_space.low, envs.single_action_space.high)
+
 
         # TRY NOT TO MODIFY: execute the game and log data.
         next_obs, rewards, terminations, truncations, infos = envs.step(actions)
@@ -329,8 +351,15 @@ def get_action(self, x):
                 if args.autotune:
                     writer.add_scalar("losses/alpha_loss", alpha_loss.item(), global_step)
 
-        # if global_step % args.eval_freq == 0:
-        #     evaluator.evaluate(global_step, num_updates=num_updates)
+        if global_step % args.eval_freq == 0:
+            evaluator.evaluate(global_step, num_updates=num_updates)
+
+
+    if args.save_model:
+        model_path = f"{args.save_dir}/model"
+        torch.save((actor.state_dict(), qf1.state_dict()), model_path)
+        print(f"model saved to {model_path}")
+
 
     envs.close()
     writer.close()
\ No newline at end of file

From 0e931546921ea0be7c483e91172fd7aef2f59b9b Mon Sep 17 00:00:00 2001
From: Nora Tseng <nora@tseng.us>
Date: Tue, 18 Jun 2024 16:42:40 -0700
Subject: [PATCH 3/5] sac working, td3 WIP

---
 src/ddpg.py      |  6 +++---
 src/evaluator.py |  2 +-
 src/sac.py       |  5 -----
 src/td3.py       | 48 ++++++++++++++++++++++++++++++++++++++++++++----
 4 files changed, 48 insertions(+), 13 deletions(-)

diff --git a/src/ddpg.py b/src/ddpg.py
index 4ed7b63..8f87e72 100644
--- a/src/ddpg.py
+++ b/src/ddpg.py
@@ -188,13 +188,13 @@ def forward(self, x):
             x = F.relu(self.fc1(x))
             x = F.relu(self.fc2(x))
             x = torch.tanh(self.fc_mu(x))
-            return x * self.action_scale + self.action_bias
+            return x * self.action_scale + self.action_bias, None
         elif self.dims == 3: 
             x = F.relu(self.fc1(x))
             x = F.relu(self.fc2(x))
             x = F.relu(self.fc3(x))
             x = torch.tanh(self.fc_mu(x))
-            return x * self.action_scale + self.action_bias
+            return x * self.action_scale + self.action_bias, None
 
 
 if __name__ == "__main__":
@@ -289,7 +289,7 @@ def forward(self, x):
             actions = np.array([envs.single_action_space.sample() for _ in range(envs.num_envs)])
         else:
             with torch.no_grad():
-                actions = actor(torch.Tensor(obs).to(device))
+                actions, _ = actor(torch.Tensor(obs).to(device))
                 actions += torch.normal(0, actor.action_scale * args.exploration_noise)
                 actions = actions.cpu().numpy().clip(envs.single_action_space.low, envs.single_action_space.high)
 
diff --git a/src/evaluator.py b/src/evaluator.py
index 4a10510..08099ac 100644
--- a/src/evaluator.py
+++ b/src/evaluator.py
@@ -71,7 +71,7 @@ def _evaluate(self):
             while not done:
                 # ALGO LOGIC: put action logic here
                 with torch.no_grad():
-                    actions = self.actor(torch.Tensor(obs).to(self.device))
+                    actions, _ = self.actor(torch.Tensor(obs).to(self.device))
                     actions = actions.cpu().numpy().clip(self.eval_env.action_space.low,
                                                          self.eval_env.action_space.high)
 
diff --git a/src/sac.py b/src/sac.py
index 761264b..417bcf9 100644
--- a/src/sac.py
+++ b/src/sac.py
@@ -257,11 +257,6 @@ def get_action(self, x):
         else:
             actions, _, _ = actor.get_action(torch.Tensor(obs).to(device)) 
             actions = actions.detach().cpu().numpy()
-            ###### copied from ddpg
-            # with torch.no_grad():
-            #     actions = actor(torch.Tensor(obs).to(device))
-            #     actions += torch.normal(0, actor.action_scale * args.exploration_noise)
-            #     actions = actions.cpu().numpy().clip(envs.single_action_space.low, envs.single_action_space.high)
 
 
         # TRY NOT TO MODIFY: execute the game and log data.
diff --git a/src/td3.py b/src/td3.py
index 3ec0cdc..9a1bb03 100644
--- a/src/td3.py
+++ b/src/td3.py
@@ -2,7 +2,7 @@
 import os
 import random
 import time
-from dataclasses import dataclass
+from dataclasses import dataclass, field
 
 import gymnasium as gym
 import numpy as np
@@ -14,6 +14,10 @@
 from stable_baselines3.common.buffers import ReplayBuffer
 from torch.utils.tensorboard import SummaryWriter
 
+from typing import Optional, Union
+from src.utils import get_latest_run_id
+from src.evaluator import Evaluator
+import yaml
 
 @dataclass
 class Args:
@@ -23,12 +27,18 @@ class Args:
     cuda: bool = True # cuda will be enabled by default
     track: bool = False # if toggled,  experiment will be tracked with Weights and Biases
     wandb_project_name: str = "cleanRL"
-    wandb_entity: str = None # entity (team) of wandb's project
+    wandb_entity: Optional[str] = None # the entity (team) of wandb's project
     capture_video: bool = False # capture videos of the agent performances (check out `videos` folder)
     save_model: bool = False # whether to save model into the `runs/{run_name}` folder
     upload_model: bool = False # upload the saved model to huggingface
     hf_entity: str = "" # user or org name of the model repository from the Hugging Face Hub
 
+    run_id: Optional[int] = None
+    save_rootdir: str = "results"           # top-level directory where results will be saved
+    save_subdir: Optional[str] = None       # lower level directories
+    save_dir: str = field(init=False)       # full save path (root/env_id/td3[/subdir]/run_N); set in __post_init__
+    save_model: bool = False # whether to save model into the `runs/{run_name}` folder
+
     # Algorithm specific arguments
     env_id: str = "Hopper-v4" # the id of the environment
     total_timesteps: int = 1000000 # total timesteps of the experiments
@@ -44,6 +54,31 @@ class Args:
     noise_clip: float = 0.5 # noise clip parameter of the Target Policy Smoothing Regularization
 
 
+    def __post_init__(self):
+        # if self.eval_freq == None: 
+        #     # 20 evals per training run unless specified otherwise.
+        #     self.eval_freq = self.total_timesteps/20
+
+        if self.save_subdir == None:
+            self.save_dir = f"{self.save_rootdir}/{self.env_id}/td3"
+        else:
+            self.save_dir = f"{self.save_rootdir}/{self.env_id}/td3/{self.save_subdir}"
+        if self.run_id is None:
+            self.run_id = get_latest_run_id(save_dir=self.save_dir) + 1
+        self.save_dir += f"/run_{self.run_id}"
+        if self.seed is None:
+            self.seed = self.run_id
+        else:
+            self.seed = np.random.randint(2 ** 32 - 1)
+
+        print("self.save_dir = "+self.save_dir)
+
+        # dump training config to save dir
+        os.makedirs(self.save_dir, exist_ok=True)
+        with open(os.path.join(self.save_dir, "config.yml"), "w") as f:
+            yaml.dump(self, f, sort_keys=True)
+
+
 def make_env(env_id, seed, idx, capture_video, run_name):
     def thunk():
         if capture_video and idx == 0:
@@ -62,6 +97,9 @@ def thunk():
 class QNetwork(nn.Module):
     def __init__(self, env):
         super().__init__()
+        print("QNET env.single_observation_space.shape")
+        print(env.single_observation_space.shape)
+
         self.fc1 = nn.Linear(np.array(env.single_observation_space.shape).prod() + np.prod(env.single_action_space.shape), 256)
         self.fc2 = nn.Linear(256, 256)
         self.fc3 = nn.Linear(256, 1)
@@ -77,6 +115,8 @@ def forward(self, x, a):
 class Actor(nn.Module):
     def __init__(self, env):
         super().__init__()
+        print("env.single_observation_space.shape")
+        print(env.single_observation_space.shape)
         self.fc1 = nn.Linear(np.array(env.single_observation_space.shape).prod(), 256)
         self.fc2 = nn.Linear(256, 256)
         self.fc_mu = nn.Linear(256, np.prod(env.single_action_space.shape))
@@ -92,7 +132,7 @@ def forward(self, x):
         x = F.relu(self.fc1(x))
         x = F.relu(self.fc2(x))
         x = torch.tanh(self.fc_mu(x))
-        return x * self.action_scale + self.action_bias
+        return x * self.action_scale + self.action_bias, None
 
 
 if __name__ == "__main__":
@@ -167,7 +207,7 @@ def forward(self, x):
             actions = np.array([envs.single_action_space.sample() for _ in range(envs.num_envs)])
         else:
             with torch.no_grad():
-                actions = actor(torch.Tensor(obs).to(device))
+                actions, _ = actor(torch.Tensor(obs).to(device))
                 actions += torch.normal(0, actor.action_scale * args.exploration_noise)
                 actions = actions.cpu().numpy().clip(envs.single_action_space.low, envs.single_action_space.high)
 

From d04e5b6c3ad71460bd6d7a55087e35acc7cd5a01 Mon Sep 17 00:00:00 2001
From: Nora Tseng <nora@tseng.us>
Date: Wed, 19 Jun 2024 14:40:12 -0700
Subject: [PATCH 4/5] sac + td3 working

---
 src/ddpg.py |  2 +-
 src/sac.py  |  2 +-
 src/td3.py  | 50 +++++++++++++++++++++++++++++++++-----------------
 3 files changed, 35 insertions(+), 19 deletions(-)

diff --git a/src/ddpg.py b/src/ddpg.py
index 8f87e72..4d78a64 100644
--- a/src/ddpg.py
+++ b/src/ddpg.py
@@ -59,7 +59,7 @@ class Args:
     learning_rate: float = 1e-3     # learning rate of optimizer
     buffer_size: int = int(1e6)     # replay memory buffer size
     gamma: float = 0.99             # discount factor gamma
-    tau: float = 0.005               # target smoothing coefficient (default: 0.005)
+    tau: float = 0.005              # target smoothing coefficient (default: 0.005)
     batch_size: int = 256           # batch size of sample from the reply memory
     exploration_noise: float = 0.1  # scale of exploration noise
     # learning_starts: int = 0      # timestep to start learning
diff --git a/src/sac.py b/src/sac.py
index 417bcf9..c5aecbd 100644
--- a/src/sac.py
+++ b/src/sac.py
@@ -245,8 +245,8 @@ def get_action(self, x):
     eval_env = gym.vector.SyncVectorEnv([make_env(args.env_id, args.env_kwargs, 0, 0, False, run_name)])
     evaluator = Evaluator(actor, eval_env, args.save_dir, n_eval_episodes=args.n_eval_episodes)
 
-    start_time = time.time()
     num_updates = 0
+    start_time = time.time()
 
     # TRY NOT TO MODIFY: start the game
     obs, _ = envs.reset(seed=args.seed)
diff --git a/src/td3.py b/src/td3.py
index 9a1bb03..6277515 100644
--- a/src/td3.py
+++ b/src/td3.py
@@ -40,6 +40,13 @@ class Args:
     save_model: bool = False # whether to save model into the `runs/{run_name}` folder
 
     # Algorithm specific arguments
+    env_kwargs: dict[str, Union[bool, float, str]] = field(default_factory=dict) 
+    """
+    usage: --env_kwargs arg1 val1 arg2 val2 arg3 val3
+    
+    To make PointMaze tasks use a sparse reward function:
+        --env_kwargs continuing_task False
+    """
     env_id: str = "Hopper-v4" # the id of the environment
     total_timesteps: int = 1000000 # total timesteps of the experiments
     learning_rate: float = 3e-4 # learning rate of the optimizer
@@ -52,12 +59,14 @@ class Args:
     learning_starts: int = 25e3 # timestep to start learning
     policy_frequency: int = 2 # the frequency of training policy (delayed
     noise_clip: float = 0.5 # noise clip parameter of the Target Policy Smoothing Regularization
+    eval_freq: Optional[int] = None         # num of timesteps between policy evals
+    n_eval_episodes: int = 80               # num of eval episodes
 
 
     def __post_init__(self):
-        # if self.eval_freq == None: 
-        #     # 20 evals per training run unless specified otherwise.
-        #     self.eval_freq = self.total_timesteps/20
+        if self.eval_freq == None: 
+            # 20 evals per training run unless specified otherwise.
+            self.eval_freq = self.total_timesteps/20
 
         if self.save_subdir == None:
             self.save_dir = f"{self.save_rootdir}/{self.env_id}/td3"
@@ -79,27 +88,27 @@ def __post_init__(self):
             yaml.dump(self, f, sort_keys=True)
 
 
-def make_env(env_id, seed, idx, capture_video, run_name):
+def make_env(env_id, env_kwargs, seed, idx, capture_video, run_name):
     def thunk():
         if capture_video and idx == 0:
-            env = gym.make(env_id, render_mode="rgb_array")
+            env = gym.make(env_id, render_mode="rgb_array", **env_kwargs)
             env = gym.wrappers.RecordVideo(env, f"videos/{run_name}")
         else:
-            env = gym.make(env_id)
+            env = gym.make(env_id, **env_kwargs)
+
+        # Flatten Dict obs so we don't need to handle them as a special case in DA
+        if isinstance(env.observation_space, gym.spaces.Dict):
+            env = gym.wrappers.FlattenObservation(env)
         env = gym.wrappers.RecordEpisodeStatistics(env)
         env.action_space.seed(seed)
         return env
 
     return thunk
 
-
 # ALGO LOGIC: initialize agent here:
 class QNetwork(nn.Module):
     def __init__(self, env):
         super().__init__()
-        print("QNET env.single_observation_space.shape")
-        print(env.single_observation_space.shape)
-
         self.fc1 = nn.Linear(np.array(env.single_observation_space.shape).prod() + np.prod(env.single_action_space.shape), 256)
         self.fc2 = nn.Linear(256, 256)
         self.fc3 = nn.Linear(256, 1)
@@ -115,8 +124,6 @@ def forward(self, x, a):
 class Actor(nn.Module):
     def __init__(self, env):
         super().__init__()
-        print("env.single_observation_space.shape")
-        print(env.single_observation_space.shape)
         self.fc1 = nn.Linear(np.array(env.single_observation_space.shape).prod(), 256)
         self.fc2 = nn.Linear(256, 256)
         self.fc_mu = nn.Linear(256, np.prod(env.single_action_space.shape))
@@ -171,10 +178,11 @@ def forward(self, x):
     torch.manual_seed(args.seed)
     torch.backends.cudnn.deterministic = args.torch_deterministic
 
-    device = torch.device("cuda" if torch.cuda.is_available() and args.cuda else "cpu")
+    # device = torch.device("cuda" if torch.cuda.is_available() and args.cuda else "cpu")
+    device = torch.device("cpu")
 
     # env setup
-    envs = gym.vector.SyncVectorEnv([make_env(args.env_id, args.seed, 0, args.capture_video, run_name)])
+    envs = gym.vector.SyncVectorEnv([make_env(args.env_id, args.env_kwargs, args.seed, 0, args.capture_video, run_name)])
     assert isinstance(envs.single_action_space, gym.spaces.Box), "only continuous action space is supported"
 
     actor = Actor(envs).to(device)
@@ -197,6 +205,11 @@ def forward(self, x):
         device,
         handle_timeout_termination=False,
     )
+    
+    eval_env = gym.vector.SyncVectorEnv([make_env(args.env_id, args.env_kwargs, 0, 0, False, run_name)])
+    evaluator = Evaluator(actor, eval_env, args.save_dir, n_eval_episodes=args.n_eval_episodes)
+
+    num_updates = 0
     start_time = time.time()
 
     # TRY NOT TO MODIFY: start the game
@@ -217,7 +230,7 @@ def forward(self, x):
         # TRY NOT TO MODIFY: record rewards for plotting purposes
         if "final_info" in infos:
             for info in infos["final_info"]:
-                print(f"global_step={global_step}, episodic_return={info['episode']['r']}")
+                # print(f"global_step={global_step}, episodic_return={info['episode']['r']}")
                 writer.add_scalar("charts/episodic_return", info["episode"]["r"], global_step)
                 writer.add_scalar("charts/episodic_length", info["episode"]["l"], global_step)
                 break
@@ -280,9 +293,13 @@ def forward(self, x):
                 writer.add_scalar("losses/qf2_loss", qf2_loss.item(), global_step)
                 writer.add_scalar("losses/qf_loss", qf_loss.item() / 2.0, global_step)
                 writer.add_scalar("losses/actor_loss", actor_loss.item(), global_step)
-                print("SPS:", int(global_step / (time.time() - start_time)))
+                # print("SPS:", int(global_step / (time.time() - start_time)))
                 writer.add_scalar("charts/SPS", int(global_step / (time.time() - start_time)), global_step)
 
+        if global_step % args.eval_freq == 0:
+            evaluator.evaluate(global_step, num_updates=num_updates)
+
+
     if args.save_model:
         model_path = f"runs/{run_name}/{args.exp_name}.cleanrl_model"
         torch.save((actor.state_dict(), qf1.state_dict(), qf2.state_dict()), model_path)
@@ -304,7 +321,6 @@ def forward(self, x):
 
         if args.upload_model:
             from cleanrl_utils.huggingface import push_to_hub
-
             repo_name = f"{args.env_id}-{args.exp_name}-seed{args.seed}"
             repo_id = f"{args.hf_entity}/{repo_name}" if args.hf_entity else repo_name
             push_to_hub(args, episodic_returns, repo_id, "TD3", f"runs/{run_name}", f"videos/{run_name}-eval")

From 99d48d3cfc035350a2e9b1fb4a3102d2ee71ec07 Mon Sep 17 00:00:00 2001
From: Nora Tseng <nora@tseng.us>
Date: Sun, 30 Jun 2024 19:23:56 -0700
Subject: [PATCH 5/5] separate network sizes for actor/critic

---
 src/ddpg.py           | 16 +++++++++++-----
 src/plotting/utils.py |  4 ++--
 2 files changed, 13 insertions(+), 7 deletions(-)

diff --git a/src/ddpg.py b/src/ddpg.py
index 4d78a64..50cd892 100644
--- a/src/ddpg.py
+++ b/src/ddpg.py
@@ -55,7 +55,8 @@ class Args:
     save_model: bool = False # whether to save model into the `runs/{run_name}` folder
 
     # Algorithm specific arguments
-    net_arch: list[int] = (256, 256, 256)  # Structure of the network, default 64,64
+    actor_net_arch: list[int] = (64,64)  # Structure of actor network, default 64,64
+    critic_net_arch: list[int] = (64,64)  # Structure of critic network, default 64,64
     learning_rate: float = 1e-3     # learning rate of optimizer
     buffer_size: int = int(1e6)     # replay memory buffer size
     gamma: float = 0.99             # discount factor gamma
@@ -123,13 +124,12 @@ class QNetwork(nn.Module):
     def __init__(self, env):
         super().__init__()
         valid_networks = [(64, 64), (256,256), (256,256,256), [64,64], [256,256], [256,256,256]]
-        if args.net_arch in valid_networks:
-            arch = np.array(args.net_arch)
+        if args.critic_net_arch in valid_networks:
+            arch = np.array(args.critic_net_arch)
         else: # invalid architecture    
             print("Exiting: incorrect network architecture. example: ")
             print("--net_arch 256 256 256")
             exit()
-        
         self.dims = len(arch)
         if self.dims == 2: # 64,64 or 256,256
             self.fc1 = nn.Linear(np.array(env.single_observation_space.shape).prod() + np.prod(env.single_action_space.shape), \
@@ -163,7 +163,13 @@ def forward(self, x, a):
 class Actor(nn.Module):
     def __init__(self, env):
         super().__init__()
-        arch = np.array(args.net_arch)
+        valid_networks = [(64, 64), (256,256), (256,256,256), [64,64], [256,256], [256,256,256]]
+        if args.actor_net_arch in valid_networks:
+            arch = np.array(args.actor_net_arch)
+        else: # invalid architecture    
+            print("Exiting: incorrect network architecture. example: ")
+            print("--net_arch 256 256 256")
+            exit()
         self.dims = len(arch)
         if self.dims == 2: # 64,64 or 256,256
             self.fc1 = nn.Linear(np.array(env.single_observation_space.shape).prod(), arch[0])
diff --git a/src/plotting/utils.py b/src/plotting/utils.py
index 223bb2c..2a34aa9 100644
--- a/src/plotting/utils.py
+++ b/src/plotting/utils.py
@@ -142,7 +142,7 @@ def get_paths(results_dir, filename='evaluations.npz'):
             paths.append(f'{results_dir}/{subdir}/{filename}')
     return paths
 
-
+# TODO: Add an additional arg to choose between timesteps and updates
 def get_data(results_dir, field_name='returns', filename='evaluations.npz'):
 
     try:
@@ -164,6 +164,6 @@ def get_data(results_dir, field_name='returns', filename='evaluations.npz'):
                 avg_vals = vals
 
             results.append(avg_vals)
-            timesteps = data['timesteps']
+            timesteps = data['timesteps']  ####
 
     return timesteps, np.array(results)