From a977dbc6cf61a209f156ce5a021fc600b5341482 Mon Sep 17 00:00:00 2001
From: RobvanGastel
Date: Fri, 19 May 2023 14:03:50 +0200
Subject: [PATCH 1/4] Add .gitignore file

---
 .gitignore | 4 ++++
 1 file changed, 4 insertions(+)
 create mode 100644 .gitignore

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..749ccda
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,4 @@
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
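Note on the configs used by the new script in the next patch: train_RT3D.py registers an "eval" resolver so the YAML files can compute values arithmetically (the OmegaConf how-to linked in the code describes the pattern). A minimal, standalone sketch of how such an interpolation resolves; the keys and numbers here are illustrative and not taken from the repo's configs:

    from omegaconf import OmegaConf

    OmegaConf.register_new_resolver("eval", eval)
    cfg = OmegaConf.create({
        "n_episodes": 30000,
        "n_parallel_experiments": 10,
        "total_episodes": "${eval:'${n_episodes} // ${n_parallel_experiments}'}",
    })
    print(cfg.total_episodes)  # resolved lazily on access -> 3000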
From 6d1a8ad332e42b0a02d049ef9dc132e5bd414fcb Mon Sep 17 00:00:00 2001
From: RobvanGastel
Date: Fri, 19 May 2023 14:06:43 +0200
Subject: [PATCH 2/4] Combine figure 3 and 4 for RT3D

---
 .../train_RT3D.py | 229 ++++++++++++++++++
 1 file changed, 229 insertions(+)
 create mode 100644 examples/Figure_3_and_4_RT3D_chemostat/train_RT3D.py

diff --git a/examples/Figure_3_and_4_RT3D_chemostat/train_RT3D.py b/examples/Figure_3_and_4_RT3D_chemostat/train_RT3D.py
new file mode 100644
index 0000000..75c503f
--- /dev/null
+++ b/examples/Figure_3_and_4_RT3D_chemostat/train_RT3D.py
@@ -0,0 +1,229 @@
+import math
+import os
+import sys
+import argparse
+
+# TODO: Can we just run using python -m?
+IMPORT_PATH = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+sys.path.append(IMPORT_PATH)
+
+import multiprocessing
+import wandb
+import hydra
+import numpy as np
+from casadi import *
+from hydra.utils import instantiate
+from omegaconf import DictConfig, OmegaConf
+
+from RED.agents.continuous_agents.rt3d import RT3D_agent
+from RED.environments.chemostat.xdot_chemostat import xdot
+from RED.environments.OED_env import OED_env
+from RED.utils.visualization import plot_returns
+
+# https://omegaconf.readthedocs.io/en/2.3_branch/how_to_guides.html#how-to-perform-arithmetic-using-eval-as-a-resolver
+OmegaConf.register_new_resolver("eval", eval)
+
+
+@hydra.main(version_base=None, config_path="../../RED/configs", config_name="example/Figure_4_RT3D_chemostat")
+def train_RT3D(cfg : DictConfig):
+    ### config setup
+    cfg = cfg.example
+    print(
+        "--- Configuration ---",
+        OmegaConf.to_yaml(cfg, resolve=True),
+        "--- End of configuration ---",
+        sep="\n\n"
+    )
+
+    # start a new wandb run to track this script
+    wandb.init(project=cfg.wandb_project_name, entity=cfg.wandb_team, config=dict(cfg))
+
+    ### prepare save path
+    os.makedirs(cfg.save_path, exist_ok=True)
+    print("Results will be saved in: ", cfg.save_path)
+
+    ### agent setup
+    agent = instantiate(cfg.model)
+    explore_rate = cfg.initial_explore_rate
+    seq_dim = cfg.environment.n_observed_variables + 1 + cfg.environment.n_controlled_inputs
+
+    ### env setup
+    env, n_params = setup_env(cfg)
+    total_episodes = cfg.environment.n_episodes // cfg.environment.n_parallel_experiments
+    skip_first_n_episodes = cfg.environment.skip_first_n_experiments // cfg.environment.n_parallel_experiments
+
+    history = {k: [] for k in ["returns", "actions", "rewards", "us", "explore_rate"]}
+    update_count = 0
+
+    ### training loop
+    for episode in range(total_episodes):
+        # sample params from uniform distribution
+        actual_params = np.random.uniform(
+            low=cfg.environment.lb,
+            high=cfg.environment.ub,
+            size=(cfg.environment.n_parallel_experiments, n_params)
+        )
+        env.param_guesses = DM(actual_params)
+
+        ### episode buffers for agent
+        states = [env.get_initial_RL_state_parallel() for i in range(cfg.environment.n_parallel_experiments)]
+        trajectories = [[] for _ in range(cfg.environment.n_parallel_experiments)]
+        sequences = [[[0] * seq_dim] for _ in range(cfg.environment.n_parallel_experiments)]
+
+        ### episode logging buffers
+        e_returns = [0 for _ in range(cfg.environment.n_parallel_experiments)]
+        e_actions = []
+        e_rewards = [[] for _ in range(cfg.environment.n_parallel_experiments)]
+        e_us = [[] for _ in range(cfg.environment.n_parallel_experiments)]
+
+        ### reset env between episodes
+        env.reset()
+        env.param_guesses = DM(actual_params)
+        env.logdetFIMs = [[] for _ in range(cfg.environment.n_parallel_experiments)]
+        env.detFIMs = [[] for _ in range(cfg.environment.n_parallel_experiments)]
+
+        ### run an episode
+        for control_interval in range(0, cfg.environment.N_control_intervals):
+            inputs = [states, sequences]
+
+            ### get agent's actions
+            if episode < skip_first_n_episodes:
+                actions = agent.get_actions(inputs, explore_rate=1, test_episode=cfg.test_episode, recurrent=True)
+            else:
+                actions = agent.get_actions(inputs, explore_rate=explore_rate, test_episode=cfg.test_episode, recurrent=True)
+            e_actions.append(actions)
+
+            ### step env
+            outputs = env.map_parallel_step(actions.T, actual_params, continuous=True)
+            next_states = []
+            for i, obs in enumerate(outputs):
+                state, action = states[i], actions[i]
+                next_state, reward, done, _, u = obs
+
+                ### set done flag
+                if control_interval == cfg.environment.N_control_intervals - 1 \
+                    or np.all(np.abs(next_state) >= 1) \
+                    or math.isnan(np.sum(next_state)):
+                    done = True
+
+                ### memorize transition
+                transition = (state, action, reward, next_state, done)
+                trajectories[i].append(transition)
+                sequences[i].append(np.concatenate((state, action)))
+
+                ### log episode data
+                e_us[i].append(u)
+                next_states.append(next_state)
+                if reward != -1:  # don't include the unstable trajectories as they override the true return
+                    e_rewards[i].append(reward)
+                    e_returns[i] += reward
+            states = next_states
+
+        ### do not memorize the test trajectory (the last one)
+        if cfg.test_episode:
+            trajectories = trajectories[:-1]
+
+        ### append trajectories to memory
+        for trajectory in trajectories:
+            # check for instability
+            if np.all([np.all(np.abs(trajectory[i][0]) <= 1) for i in range(len(trajectory))]) \
+                and not math.isnan(np.sum(trajectory[-1][0])):
+                agent.memory.append(trajectory)
+
+        ### train agent
+        if episode > skip_first_n_episodes:
+            for _ in range(cfg.environment.n_parallel_experiments):
+                update_count += 1
+                update_policy = update_count % cfg.policy_delay == 0
+                agent.Q_update(policy=update_policy, recurrent=True)
+
+        ### update explore rate
+        explore_rate = cfg.explore_rate_mul * agent.get_rate(
+            episode=episode,
+            min_rate=0,
+            max_rate=1,
+            denominator=cfg.environment.n_episodes / (11 * cfg.environment.n_parallel_experiments)
+        )
+
+        ### log results
+        history["returns"].extend(e_returns)
+        history["actions"].extend(np.array(e_actions).transpose(1, 0, 2))
+        history["rewards"].extend(e_rewards)
+        history["us"].extend(e_us)
+        history["explore_rate"].append(explore_rate)
+
+        ### log results to Weights & Biases
+        for i in range(len(e_returns)):
+            wandb.log({"returns": e_returns[i], "actions": np.array(e_actions).transpose(1, 0, 2)[i],
+                       "us": e_us[i], "explore_rate": explore_rate})
+
+        print(
+            f"\nEPISODE: [{episode}/{total_episodes}] ({episode * cfg.environment.n_parallel_experiments} experiments)",
+            f"explore rate:\t{explore_rate:.2f}",
+            f"average return:\t{np.mean(e_returns):.5f}",
+            sep="\n",
+        )
+
+        if cfg.test_episode:
+            print(
+                f"test actions:\n{np.array(e_actions)[:, -1]}",
+                f"test rewards:\n{np.array(e_rewards)[-1, :]}",
+                f"test return:\n{np.sum(np.array(e_rewards)[-1, :])}",
+                sep="\n",
+            )
+
+        ### checkpoint
+        if cfg.ckpt_freq is not None and episode % cfg.ckpt_freq == 0:
+            ckpt_dir = os.path.join(cfg.save_path, f"ckpt_{episode}")
+            os.makedirs(ckpt_dir, exist_ok=True)
+            agent.save_network(ckpt_dir)
+            for k in history.keys():
+                np.save(os.path.join(ckpt_dir, f"{k}.npy"), np.array(history[k]))
+
+    ### save results and plot
+    agent.save_network(cfg.save_path)
+    for k in history.keys():
+        np.save(os.path.join(cfg.save_path, f"{k}.npy"), np.array(history[k]))
+    plot_returns(
+        returns=history["returns"],
+        explore_rates=history["explore_rate"],
+        show=False,
+        save_to_dir=cfg.save_path,
+        conv_window=25,
+    )
+
+    wandb.finish()
+
+
+def setup_env(cfg):
+    n_cores = multiprocessing.cpu_count()
+    actual_params = DM(cfg.environment.actual_params)
+    normaliser = np.array(cfg.environment.normaliser)
+    n_params = actual_params.size()[0]
+    param_guesses = actual_params
+    args = cfg.environment.y0, xdot, param_guesses, actual_params, cfg.environment.n_observed_variables, \
+        cfg.environment.n_controlled_inputs, cfg.environment.num_inputs, cfg.environment.input_bounds, \
+        cfg.environment.dt, cfg.environment.control_interval_time, normaliser
+    env = OED_env(*args)
+    env.mapped_trajectory_solver = env.CI_solver.map(cfg.environment.n_parallel_experiments, "thread", n_cores)
+    return env, n_params
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "-c", "--config_name",
+        type=str,
+        required=True,
+        help="Pass the config yaml file for either figure 3 or 4. For example, 'example/Figure_4_RT3D_chemostat'."
+    )
+    parser.add_argument(
+        "-r", "--repeats",
+        type=int,
+        required=True,
+        help="Number of runs to average the RT3D results across"
+    )
+    args = parser.parse_args()
+
+    # TODO: How do we average multiple runs, in weights and biases or experiment side
+    train_RT3D(config_name=args.config_name)
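The training loop above gates agent.Q_update with update_count % cfg.policy_delay, the TD3-style delayed policy update controlled by the policy_delay: 2 setting in both example configs. A tiny standalone illustration of that schedule (not repo code; Q_update presumably trains the critics on every call and only updates the actor and target networks when policy=True):

    policy_delay = 2
    for update_count in range(1, 7):
        update_policy = update_count % policy_delay == 0
        print(f"update {update_count}: critics" + (" + actor/targets" if update_policy else ""))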
From fc8b48ddb77f181ad25f4ddcd43664ef601e78c4 Mon Sep 17 00:00:00 2001
From: RobvanGastel
Date: Fri, 9 Jun 2023 15:34:43 +0200
Subject: [PATCH 3/4] Update wandb logging with averaging, edit .yaml files

---
 .../example/Figure_3_RT3D_chemostat.yaml |  2 +
 .../example/Figure_4_RT3D_chemostat.yaml |  3 ++
 .../train_RT3D.py                        | 52 +++++++++++++------
 3 files changed, 40 insertions(+), 17 deletions(-)

diff --git a/RED/configs/example/Figure_3_RT3D_chemostat.yaml b/RED/configs/example/Figure_3_RT3D_chemostat.yaml
index 3164c03..a035ae7 100644
--- a/RED/configs/example/Figure_3_RT3D_chemostat.yaml
+++ b/RED/configs/example/Figure_3_RT3D_chemostat.yaml
@@ -4,6 +4,8 @@ defaults:
   - _self_
 
 policy_delay: 2
+wandb_team: rl-oed
+wandb_project_name: figure3-example
 initial_explore_rate: 1
 explore_rate_mul: 1
 test_episode: False
diff --git a/RED/configs/example/Figure_4_RT3D_chemostat.yaml b/RED/configs/example/Figure_4_RT3D_chemostat.yaml
index 3164c03..47ac457 100644
--- a/RED/configs/example/Figure_4_RT3D_chemostat.yaml
+++ b/RED/configs/example/Figure_4_RT3D_chemostat.yaml
@@ -4,6 +4,9 @@ defaults:
   - _self_
 
 policy_delay: 2
+number_of_trials: 10
+wandb_team: rl-oed
+wandb_project_name: figure4-example
 initial_explore_rate: 1
 explore_rate_mul: 1
 test_episode: False
diff --git a/examples/Figure_3_and_4_RT3D_chemostat/train_RT3D.py b/examples/Figure_3_and_4_RT3D_chemostat/train_RT3D.py
index 75c503f..e3a9710 100644
--- a/examples/Figure_3_and_4_RT3D_chemostat/train_RT3D.py
+++ b/examples/Figure_3_and_4_RT3D_chemostat/train_RT3D.py
@@ -1,7 +1,7 @@
 import math
 import os
 import sys
-import argparse
+from datetime import datetime
 
 # TODO: Can we just run using python -m?
 IMPORT_PATH = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
@@ -35,8 +35,26 @@ def train_RT3D(cfg : DictConfig):
         sep="\n\n"
     )
 
+    group_name = None
+    number_of_trials = 1
+    if hasattr(cfg, "number_of_trials"):
+        group_name = datetime.now().strftime("%d/%m/%Y_%H:%M")
+        number_of_trials = cfg.number_of_trials
+
+    for _ in range(number_of_trials):
+        run_single_experiment(cfg, group_name=group_name)
+
+
+
+def run_single_experiment(cfg : DictConfig, group_name : str):
+
     # start a new wandb run to track this script
-    wandb.init(project=cfg.wandb_project_name, entity=cfg.wandb_team, config=dict(cfg))
+    if group_name:
+        wandb.init(reinit=True, project=cfg.wandb_project_name,
+                   entity=cfg.wandb_team, group=group_name, config=dict(cfg))
+    else:
+        wandb.init(reinit=True, project=cfg.wandb_project_name,
+                   entity=cfg.wandb_team, config=dict(cfg))
 
     ### prepare save path
     os.makedirs(cfg.save_path, exist_ok=True)
     print("Results will be saved in: ", cfg.save_path)
@@ -210,20 +228,20 @@
 
 if __name__ == '__main__':
-    parser = argparse.ArgumentParser()
-    parser.add_argument(
-        "-c", "--config_name",
-        type=str,
-        required=True,
-        help="Pass the config yaml file for either figure 3 or 4. For example, 'example/Figure_4_RT3D_chemostat'."
-    )
-    parser.add_argument(
-        "-r", "--repeats",
-        type=int,
-        required=True,
-        help="Number of runs to average the RT3D results across"
-    )
-    args = parser.parse_args()
+    # parser = argparse.ArgumentParser()
+    # parser.add_argument(
+    #     "-c", "--config_name",
+    #     type=str,
+    #     required=True,
+    #     help="Pass the config yaml file for either figure 3 or 4. For example, 'example/Figure_4_RT3D_chemostat'."
+    # )
+    # parser.add_argument(
+    #     "-r", "--repeats",
+    #     type=int,
+    #     required=True,
+    #     help="Number of runs to average the RT3D results across"
+    # )
+    # args = parser.parse_args()
 
     # TODO: How do we average multiple runs, in weights and biases or experiment side
-    train_RT3D(config_name=args.config_name)
+    train_RT3D()
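The TODO about averaging repeated runs is only partly settled by this patch: run_single_experiment tags each trial with a shared wandb group, and the averaging itself is left to Weights & Biases. A hedged sketch of pulling the grouped runs back out afterwards with the public API (the group string is illustrative, and the group filter and history call are assumptions about the wandb API rather than something this patch adds):

    import numpy as np
    import wandb

    api = wandb.Api()
    runs = api.runs("rl-oed/figure4-example", filters={"group": "09/06/2023_15:34"})
    # one mean return per trial, then averaged across the group
    mean_returns = [run.history(keys=["returns"])["returns"].mean() for run in runs]
    print("average return across trials:", float(np.mean(mean_returns)))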
From 6fa2892cad8453e920dc9beb7f09e481dd329a2f Mon Sep 17 00:00:00 2001
From: RobvanGastel
Date: Fri, 9 Jun 2023 15:39:28 +0200
Subject: [PATCH 4/4] Remove previous figure 3 and 4 experiments

---
 .../Figure_3_RT3D_chemostat/train_RT3D.py | 202 ------------------
 .../Figure_4_RT3D_chemostat/train_RT3D.py | 202 ------------------
 2 files changed, 404 deletions(-)
 delete mode 100644 examples/Figure_3_RT3D_chemostat/train_RT3D.py
 delete mode 100644 examples/Figure_4_RT3D_chemostat/train_RT3D.py

diff --git a/examples/Figure_3_RT3D_chemostat/train_RT3D.py b/examples/Figure_3_RT3D_chemostat/train_RT3D.py
deleted file mode 100644
index 6eafea7..0000000
--- a/examples/Figure_3_RT3D_chemostat/train_RT3D.py
+++ /dev/null
@@ -1,202 +0,0 @@
-
-import math
-import os
-import sys
-
-IMPORT_PATH = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
-sys.path.append(IMPORT_PATH)
-
-import multiprocessing
-
-import hydra
-import matplotlib.pyplot as plt
-import numpy as np
-from casadi import *
-from hydra.utils import instantiate
-from omegaconf import DictConfig, OmegaConf
-
-from RED.agents.continuous_agents.rt3d import RT3D_agent
-from RED.environments.chemostat.xdot_chemostat import xdot
-from RED.environments.OED_env import OED_env
-from RED.utils.visualization import plot_returns
-
-# https://omegaconf.readthedocs.io/en/2.3_branch/how_to_guides.html#how-to-perform-arithmetic-using-eval-as-a-resolver
-OmegaConf.register_new_resolver("eval", eval)
-
-
-@hydra.main(version_base=None, config_path="../../RED/configs",
config_name="example/Figure_3_RT3D_chemostat") -def train_RT3D(cfg : DictConfig): - ### config setup - cfg = cfg.example - print( - "--- Configuration ---", - OmegaConf.to_yaml(cfg, resolve=True), - "--- End of configuration ---", - sep="\n\n" - ) - - ### prepare save path - os.makedirs(cfg.save_path, exist_ok=True) - print("Results will be saved in: ", cfg.save_path) - - ### agent setup - agent = instantiate(cfg.model) - explore_rate = cfg.initial_explore_rate - seq_dim = cfg.environment.n_observed_variables + 1 + cfg.environment.n_controlled_inputs - - ### env setup - env, n_params = setup_env(cfg) - total_episodes = cfg.environment.n_episodes // cfg.environment.n_parallel_experiments - skip_first_n_episodes = cfg.environment.skip_first_n_experiments // cfg.environment.n_parallel_experiments - - history = {k: [] for k in ["returns", "actions", "rewards", "us", "explore_rate"]} - update_count = 0 - - ### training loop - for episode in range(total_episodes): - actual_params = np.random.uniform( - low=cfg.environment.actual_params, - high=cfg.environment.actual_params, - size=(cfg.environment.n_parallel_experiments, n_params) - ) - env.param_guesses = DM(actual_params) - - ### episode buffers for agent - states = [env.get_initial_RL_state_parallel() for i in range(cfg.environment.n_parallel_experiments)] - trajectories = [[] for _ in range(cfg.environment.n_parallel_experiments)] - sequences = [[[0] * seq_dim] for _ in range(cfg.environment.n_parallel_experiments)] - - ### episode logging buffers - e_returns = [0 for _ in range(cfg.environment.n_parallel_experiments)] - e_actions = [] - e_rewards = [[] for _ in range(cfg.environment.n_parallel_experiments)] - e_us = [[] for _ in range(cfg.environment.n_parallel_experiments)] - - ### reset env between episodes - env.reset() - env.param_guesses = DM(actual_params) - env.logdetFIMs = [[] for _ in range(cfg.environment.n_parallel_experiments)] - env.detFIMs = [[] for _ in range(cfg.environment.n_parallel_experiments)] - - ### run an episode - for control_interval in range(0, cfg.environment.N_control_intervals): - inputs = [states, sequences] - - ### get agent's actions - if episode < skip_first_n_episodes: - actions = agent.get_actions(inputs, explore_rate=1, test_episode=cfg.test_episode, recurrent=True) - else: - actions = agent.get_actions(inputs, explore_rate=explore_rate, test_episode=cfg.test_episode, recurrent=True) - e_actions.append(actions) - - ### step env - outputs = env.map_parallel_step(actions.T, actual_params, continuous=True) - next_states = [] - for i, obs in enumerate(outputs): - state, action = states[i], actions[i] - next_state, reward, done, _, u = obs - - ### set done flag - if control_interval == cfg.environment.N_control_intervals - 1 \ - or np.all(np.abs(next_state) >= 1) \ - or math.isnan(np.sum(next_state)): - done = True - - ### memorize transition - transition = (state, action, reward, next_state, done) - trajectories[i].append(transition) - sequences[i].append(np.concatenate((state, action))) - - ### log episode data - e_us[i].append(u) - next_states.append(next_state) - if reward != -1: # dont include the unstable trajectories as they override the true return - e_rewards[i].append(reward) - e_returns[i] += reward - states = next_states - - ### do not memorize the test trajectory (the last one) - if cfg.test_episode: - trajectories = trajectories[:-1] - - ### append trajectories to memory - for trajectory in trajectories: - # check for instability - if np.all([np.all(np.abs(trajectory[i][0]) <= 1) for i in 
range(len(trajectory))]) \ - and not math.isnan(np.sum(trajectory[-1][0])): - agent.memory.append(trajectory) - - ### train agent - if episode > skip_first_n_episodes: - for _ in range(cfg.environment.n_parallel_experiments): - update_count += 1 - update_policy = update_count % cfg.policy_delay == 0 - agent.Q_update(policy=update_policy, recurrent=True) - - ### update explore rate - explore_rate = cfg.explore_rate_mul * agent.get_rate( - episode=episode, - min_rate=0, - max_rate=1, - denominator=cfg.environment.n_episodes / (11 * cfg.environment.n_parallel_experiments) - ) - - ### log results - history["returns"].extend(e_returns) - history["actions"].extend(np.array(e_actions).transpose(1, 0, 2)) - history["rewards"].extend(e_rewards) - history["us"].extend(e_us) - history["explore_rate"].append(explore_rate) - - print( - f"\nEPISODE: [{episode}/{total_episodes}] ({episode * cfg.environment.n_parallel_experiments} experiments)", - f"explore rate:\t{explore_rate:.2f}", - f"average return:\t{np.mean(e_returns):.5f}", - sep="\n", - ) - - if cfg.test_episode: - print( - f"test actions:\n{np.array(e_actions)[:, -1]}", - f"test rewards:\n{np.array(e_rewards)[-1, :]}", - f"test return:\n{np.sum(np.array(e_rewards)[-1, :])}", - sep="\n", - ) - - ### checkpoint - if cfg.ckpt_freq is not None and episode % cfg.ckpt_freq == 0: - ckpt_dir = os.path.join(cfg.save_path, f"ckpt_{episode}") - os.makedirs(ckpt_dir, exist_ok=True) - agent.save_network(ckpt_dir) - for k in history.keys(): - np.save(os.path.join(ckpt_dir, f"{k}.npy"), np.array(history[k])) - - ### save results and plot - agent.save_network(cfg.save_path) - for k in history.keys(): - np.save(os.path.join(cfg.save_path, f"{k}.npy"), np.array(history[k])) - plot_returns( - returns=history["returns"], - explore_rates=history["explore_rate"], - show=False, - save_to_dir=cfg.save_path, - conv_window=25, - ) - - -def setup_env(cfg): - n_cores = multiprocessing.cpu_count() - actual_params = DM(cfg.environment.actual_params) - normaliser = np.array(cfg.environment.normaliser) - n_params = actual_params.size()[0] - param_guesses = actual_params - args = cfg.environment.y0, xdot, param_guesses, actual_params, cfg.environment.n_observed_variables, \ - cfg.environment.n_controlled_inputs, cfg.environment.num_inputs, cfg.environment.input_bounds, \ - cfg.environment.dt, cfg.environment.control_interval_time, normaliser - env = OED_env(*args) - env.mapped_trajectory_solver = env.CI_solver.map(cfg.environment.n_parallel_experiments, "thread", n_cores) - return env, n_params - - -if __name__ == '__main__': - train_RT3D() diff --git a/examples/Figure_4_RT3D_chemostat/train_RT3D.py b/examples/Figure_4_RT3D_chemostat/train_RT3D.py deleted file mode 100644 index 0b23c06..0000000 --- a/examples/Figure_4_RT3D_chemostat/train_RT3D.py +++ /dev/null @@ -1,202 +0,0 @@ - -import math -import os -import sys - -IMPORT_PATH = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) -sys.path.append(IMPORT_PATH) - -import multiprocessing - -import hydra -import numpy as np -from casadi import * -from hydra.utils import instantiate -from omegaconf import DictConfig, OmegaConf - -from RED.agents.continuous_agents.rt3d import RT3D_agent -from RED.environments.chemostat.xdot_chemostat import xdot -from RED.environments.OED_env import OED_env -from RED.utils.visualization import plot_returns - -# https://omegaconf.readthedocs.io/en/2.3_branch/how_to_guides.html#how-to-perform-arithmetic-using-eval-as-a-resolver -OmegaConf.register_new_resolver("eval", eval) 
- - -@hydra.main(version_base=None, config_path="../../RED/configs", config_name="example/Figure_4_RT3D_chemostat") -def train_RT3D(cfg : DictConfig): - ### config setup - cfg = cfg.example - print( - "--- Configuration ---", - OmegaConf.to_yaml(cfg, resolve=True), - "--- End of configuration ---", - sep="\n\n" - ) - - ### prepare save path - os.makedirs(cfg.save_path, exist_ok=True) - print("Results will be saved in: ", cfg.save_path) - - ### agent setup - agent = instantiate(cfg.model) - explore_rate = cfg.initial_explore_rate - seq_dim = cfg.environment.n_observed_variables + 1 + cfg.environment.n_controlled_inputs - - ### env setup - env, n_params = setup_env(cfg) - total_episodes = cfg.environment.n_episodes // cfg.environment.n_parallel_experiments - skip_first_n_episodes = cfg.environment.skip_first_n_experiments // cfg.environment.n_parallel_experiments - - history = {k: [] for k in ["returns", "actions", "rewards", "us", "explore_rate"]} - update_count = 0 - - ### training loop - for episode in range(total_episodes): - # sample params from uniform distribution - actual_params = np.random.uniform( - low=cfg.environment.lb, - high=cfg.environment.ub, - size=(cfg.environment.n_parallel_experiments, 3) - ) - env.param_guesses = DM(actual_params) - - ### episode buffers for agent - states = [env.get_initial_RL_state_parallel() for i in range(cfg.environment.n_parallel_experiments)] - trajectories = [[] for _ in range(cfg.environment.n_parallel_experiments)] - sequences = [[[0] * seq_dim] for _ in range(cfg.environment.n_parallel_experiments)] - - ### episode logging buffers - e_returns = [0 for _ in range(cfg.environment.n_parallel_experiments)] - e_actions = [] - e_rewards = [[] for _ in range(cfg.environment.n_parallel_experiments)] - e_us = [[] for _ in range(cfg.environment.n_parallel_experiments)] - - ### reset env between episodes - env.reset() - env.param_guesses = DM(actual_params) - env.logdetFIMs = [[] for _ in range(cfg.environment.n_parallel_experiments)] - env.detFIMs = [[] for _ in range(cfg.environment.n_parallel_experiments)] - - ### run an episode - for control_interval in range(0, cfg.environment.N_control_intervals): - inputs = [states, sequences] - - ### get agent's actions - if episode < skip_first_n_episodes: - actions = agent.get_actions(inputs, explore_rate=1, test_episode=cfg.test_episode, recurrent=True) - else: - actions = agent.get_actions(inputs, explore_rate=explore_rate, test_episode=cfg.test_episode, recurrent=True) - e_actions.append(actions) - - ### step env - outputs = env.map_parallel_step(actions.T, actual_params, continuous=True) - next_states = [] - for i, obs in enumerate(outputs): - state, action = states[i], actions[i] - next_state, reward, done, _, u = obs - - ### set done flag - if control_interval == cfg.environment.N_control_intervals - 1 \ - or np.all(np.abs(next_state) >= 1) \ - or math.isnan(np.sum(next_state)): - done = True - - ### memorize transition - transition = (state, action, reward, next_state, done) - trajectories[i].append(transition) - sequences[i].append(np.concatenate((state, action))) - - ### log episode data - e_us[i].append(u) - next_states.append(next_state) - if reward != -1: # dont include the unstable trajectories as they override the true return - e_rewards[i].append(reward) - e_returns[i] += reward - states = next_states - - ### do not memorize the test trajectory (the last one) - if cfg.test_episode: - trajectories = trajectories[:-1] - - ### append trajectories to memory - for trajectory in trajectories: - # 
check for instability - if np.all([np.all(np.abs(trajectory[i][0]) <= 1) for i in range(len(trajectory))]) \ - and not math.isnan(np.sum(trajectory[-1][0])): - agent.memory.append(trajectory) - - ### train agent - if episode > skip_first_n_episodes: - for _ in range(cfg.environment.n_parallel_experiments): - update_count += 1 - update_policy = update_count % cfg.policy_delay == 0 - agent.Q_update(policy=update_policy, recurrent=True) - - ### update explore rate - explore_rate = cfg.explore_rate_mul * agent.get_rate( - episode=episode, - min_rate=0, - max_rate=1, - denominator=cfg.environment.n_episodes / (11 * cfg.environment.n_parallel_experiments) - ) - - ### log results - history["returns"].extend(e_returns) - history["actions"].extend(np.array(e_actions).transpose(1, 0, 2)) - history["rewards"].extend(e_rewards) - history["us"].extend(e_us) - history["explore_rate"].append(explore_rate) - - print( - f"\nEPISODE: [{episode}/{total_episodes}] ({episode * cfg.environment.n_parallel_experiments} experiments)", - f"explore rate:\t{explore_rate:.2f}", - f"average return:\t{np.mean(e_returns):.5f}", - sep="\n", - ) - - if cfg.test_episode: - print( - f"test actions:\n{np.array(e_actions)[:, -1]}", - f"test rewards:\n{np.array(e_rewards)[-1, :]}", - f"test return:\n{np.sum(np.array(e_rewards)[-1, :])}", - sep="\n", - ) - - ### checkpoint - if cfg.ckpt_freq is not None and episode % cfg.ckpt_freq == 0: - ckpt_dir = os.path.join(cfg.save_path, f"ckpt_{episode}") - os.makedirs(ckpt_dir, exist_ok=True) - agent.save_network(ckpt_dir) - for k in history.keys(): - np.save(os.path.join(ckpt_dir, f"{k}.npy"), np.array(history[k])) - - ### save results and plot - agent.save_network(cfg.save_path) - for k in history.keys(): - np.save(os.path.join(cfg.save_path, f"{k}.npy"), np.array(history[k])) - plot_returns( - returns=history["returns"], - explore_rates=history["explore_rate"], - show=False, - save_to_dir=cfg.save_path, - conv_window=25, - ) - - -def setup_env(cfg): - n_cores = multiprocessing.cpu_count() - actual_params = DM(cfg.environment.actual_params) - normaliser = np.array(cfg.environment.normaliser) - n_params = actual_params.size()[0] - param_guesses = actual_params - args = cfg.environment.y0, xdot, param_guesses, actual_params, cfg.environment.n_observed_variables, \ - cfg.environment.n_controlled_inputs, cfg.environment.num_inputs, cfg.environment.input_bounds, \ - cfg.environment.dt, cfg.environment.control_interval_time, normaliser - env = OED_env(*args) - env.mapped_trajectory_solver = env.CI_solver.map(cfg.environment.n_parallel_experiments, "thread", n_cores) - return env, n_params - - -if __name__ == '__main__': - train_RT3D()
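With the argparse entry point commented out in the previous patch, selecting the Figure 3 or Figure 4 experiment is now done through Hydra rather than a -c flag. Assuming the script is launched from the repository root, the standard --config-name override should pick either config (the decorator defaults to the Figure 4 one):

    python examples/Figure_3_and_4_RT3D_chemostat/train_RT3D.py --config-name example/Figure_3_RT3D_chemostat
    python examples/Figure_3_and_4_RT3D_chemostat/train_RT3D.py --config-name example/Figure_4_RT3D_chemostat

One behavioural difference worth noting: the deleted Figure 3 script sampled actual_params with both low and high set to cfg.environment.actual_params (i.e. fixed true parameters), whereas the combined script draws them from [environment.lb, environment.ub], so reproducing the old Figure 3 behaviour appears to depend on the Figure 3 config pinning lb and ub to the true parameter values.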