From 0899b65e4d3821c766411f46c66f2a301f29ac83 Mon Sep 17 00:00:00 2001 From: Ivan_Marshev Date: Sat, 23 Sep 2023 14:02:33 +0300 Subject: [PATCH 01/19] modified: rl_lib/src/algoritms/a2c/actor_critic.py modified: rl_lib/src/models/model.py modified: rl_lib/tests/config.yaml renamed: rl_lib/tests/dqn_config.yaml -> rl_lib/tests/drqn_config.yaml --- rl_lib/src/algoritms/a2c/actor_critic.py | 4 +-- rl_lib/src/models/model.py | 22 +++++++--------- rl_lib/tests/config.yaml | 25 ++++++++----------- .../{dqn_config.yaml => drqn_config.yaml} | 25 +++++++++++-------- 4 files changed, 36 insertions(+), 40 deletions(-) rename rl_lib/tests/{dqn_config.yaml => drqn_config.yaml} (65%) diff --git a/rl_lib/src/algoritms/a2c/actor_critic.py b/rl_lib/src/algoritms/a2c/actor_critic.py index a24669e..11a6926 100644 --- a/rl_lib/src/algoritms/a2c/actor_critic.py +++ b/rl_lib/src/algoritms/a2c/actor_critic.py @@ -122,8 +122,8 @@ def get_weights(self, ) -> dict: 'critic': self.critic_model.get_weights() } - def input_spec(self): - return self.actor_model.input_spec() + def input_spec(self, key=None): + return self.actor_model.input_spec(key=key) def load(self, path): self.actor_model.load(path) diff --git a/rl_lib/src/models/model.py b/rl_lib/src/models/model.py index cbc96cd..0a3c8c9 100644 --- a/rl_lib/src/models/model.py +++ b/rl_lib/src/models/model.py @@ -20,19 +20,14 @@ def _initial_model(self): else: return self.create_model_with_conv(input_shape, action_space) - def check_input_shape(self, inputs): + def check_input_shape(self, inputs, key=None): if not isinstance(inputs, (tf.Tensor, np.ndarray)): - if isinstance(inputs, dict): - for key, inpt in inputs.items(): - inputs[key] = self.check_input_shape(inpt) - return inputs - elif isinstance(inputs, list): - for key, inpt in enumerate(inputs): - inputs[key] = self.check_input_shape(inpt) - return inputs - while len(inputs.shape) < len(self.input_spec()): + for key, inpt in inputs.items() if isinstance(inputs, dict) else enumerate(inputs): + inputs[key] = self.check_input_shape(inpt, key=key) + return inputs + while len(inputs.shape) < len(self.input_spec(key=key)): inputs = tf.expand_dims(inputs,0) - if len(inputs.shape) > len(self.input_spec()): assert 0 #inputs.shape не может быть больше входа модели + if len(inputs.shape) > len(self.input_spec(key=key)): assert 0 #inputs.shape не может быть больше входа модели return inputs def initial_model(self): @@ -43,8 +38,9 @@ def initial_model(self): optimizer = get_optimizer(**optimizer) self.set_new_model(model, optimizer) - def input_spec(self): - return self.model.layers[0].input_shape[0] + def input_spec(self, key=None): + if key!=None: return self.model.input[key].shape + return self.model.input.shape def load(self, path): self.model = tf.keras.models.load_model(path+self.name+'.h5') diff --git a/rl_lib/tests/config.yaml b/rl_lib/tests/config.yaml index 9ff0efb..b7d2c72 100644 --- a/rl_lib/tests/config.yaml +++ b/rl_lib/tests/config.yaml @@ -1,16 +1,15 @@ -#default DQN config +#CartPole DQN config model_config: model: None - name: "_cart_pole" + name: "_test_cart_pole" input_shape: None action_space: None - lstm_size: 64 discount_factor : 0.9 - n_step: 3 - priority: True + n_step: 1 batch_size: 32 - double_network: False + double_network: True + priority: False tau: 1.0 optimizer_config: @@ -22,23 +21,19 @@ optimizer_config: custom_optimizer: None buffer_config: - size: 100000 - priority: True - recurrent: True - trace_length: 10 - recurrent_skip: 5 + size: 10000 eps: 0.01 alpha: 0.5 beta: 0.4 beta_changing: 
0.0005 beta_changing_curve: 'linear' - max_priority: 0.01 + max_priority: 0.1 exploration_config: - strategy_name: "epsilon_greedy" + strategy_name: "soft_q" strategy_config: - eps_decay_steps: 5000 - eps_max: 1.0 + decay: 1. + tau: 0.1 eps_min: 0.01 eps_test: 0.001 action_space: None diff --git a/rl_lib/tests/dqn_config.yaml b/rl_lib/tests/drqn_config.yaml similarity index 65% rename from rl_lib/tests/dqn_config.yaml rename to rl_lib/tests/drqn_config.yaml index b7d2c72..9ff0efb 100644 --- a/rl_lib/tests/dqn_config.yaml +++ b/rl_lib/tests/drqn_config.yaml @@ -1,15 +1,16 @@ -#CartPole DQN config +#default DQN config model_config: model: None - name: "_test_cart_pole" + name: "_cart_pole" input_shape: None action_space: None + lstm_size: 64 discount_factor : 0.9 - n_step: 1 + n_step: 3 + priority: True batch_size: 32 - double_network: True - priority: False + double_network: False tau: 1.0 optimizer_config: @@ -21,19 +22,23 @@ optimizer_config: custom_optimizer: None buffer_config: - size: 10000 + size: 100000 + priority: True + recurrent: True + trace_length: 10 + recurrent_skip: 5 eps: 0.01 alpha: 0.5 beta: 0.4 beta_changing: 0.0005 beta_changing_curve: 'linear' - max_priority: 0.1 + max_priority: 0.01 exploration_config: - strategy_name: "soft_q" + strategy_name: "epsilon_greedy" strategy_config: - decay: 1. - tau: 0.1 + eps_decay_steps: 5000 + eps_max: 1.0 eps_min: 0.01 eps_test: 0.001 action_space: None From 4b485b10dd6b68e2992c6eb39c37c21da2b0e57c Mon Sep 17 00:00:00 2001 From: Ivan_Marshev Date: Sat, 23 Sep 2023 18:33:21 +0300 Subject: [PATCH 02/19] new file: rl_lib/src/replay_buffers/dict_array.py modified: rl_lib/src/replay_buffers/random_buffers.py --- rl_lib/src/replay_buffers/dict_array.py | 79 +++++++++++++++++++++ rl_lib/src/replay_buffers/random_buffers.py | 19 +++-- 2 files changed, 88 insertions(+), 10 deletions(-) create mode 100644 rl_lib/src/replay_buffers/dict_array.py diff --git a/rl_lib/src/replay_buffers/dict_array.py b/rl_lib/src/replay_buffers/dict_array.py new file mode 100644 index 0000000..d0e7ef5 --- /dev/null +++ b/rl_lib/src/replay_buffers/dict_array.py @@ -0,0 +1,79 @@ +from typing import Any +import numpy as np + +class StructArray: + """Структурированный массив""" + def __init__(self, shape, dict_keys, dtype=object) -> None: + self.data = np.zeros(shape=shape, + dtype=( + [ + (key, dtype) for key in sorted(dict_keys) + ] + ) + ) + self.dict_keys = sorted(dict_keys) + self.dtype = dtype + + def __getitem__(self, index): + data = self.data[index] + return {key: np.asarray(data[key]).astype(np.float32) + if isinstance(index, int) else StructArray.stack(data[key], axis=0).astype(np.float32) + for key in self.dict_keys} + + def __setitem__(self, index, values): + "values = (state, action, reward, next_state, done, *other_data)" + self.data[index] = tuple(values[key] for key in self.dict_keys) + + @staticmethod + def stack(array, axis=0): + return np.stack(array, axis=axis) + +class NonStructArray: + """Не структурированный массив""" + def __init__(self, shape, dtype=object) -> None: + self.data = np.zeros(shape=shape, dtype=dtype) + self.dtype = dtype + + def __getitem__(self, index): + return StructArray.stack(self.data[index]) + + def __setitem__(self, index, values): + "values = (state, action, reward, next_state, done, *other_data)" + self.data[index] = values + +class DictArray: + """ + Класс реализующий сохранение/ извлечение данных в структурированные массивы numpy + """ + def __init__(self, shape, dtype=object) -> None: + self.dtype = dtype + 
self.initialized = False + self.shape = shape + self.data = np.zeros((shape[1], ), dtype=object) #В этом массиве мы будем хранить вложенные массивы (s,a,r,s',d) + + def __getitem__(self, index): + return tuple(self.data[i][index] for i in range(self.shape[1])) + + def __setitem__(self, index, values): + "values = (state, action, reward, next_state, done, *other_data)" + if not self.initialized: self.init_array(values) + for i in range(self.shape[1]): + self.data[i][index] = values[i] + + def choose_array_type(self, data): + if isinstance(data, dict): return self.init_struct_array((self.shape[0], ), data.keys()) + else: return self.init_non_struct_array((self.shape[0], )) + + def init_array(self, data): + for i, d in zip(range(self.shape[0]), data): + self.data[i] = self.choose_array_type(d) + self.initialized=True + + def init_struct_array(self, shape, dict_keys): + return StructArray(shape, dict_keys, dtype=self.dtype) + + def init_non_struct_array(self, shape): + return NonStructArray(shape=shape, dtype=self.dtype) + + + \ No newline at end of file diff --git a/rl_lib/src/replay_buffers/random_buffers.py b/rl_lib/src/replay_buffers/random_buffers.py index 0bc5157..6781362 100644 --- a/rl_lib/src/replay_buffers/random_buffers.py +++ b/rl_lib/src/replay_buffers/random_buffers.py @@ -1,5 +1,6 @@ import numpy as np from ..data_saver.utils import save_data, load_data +from .dict_array import DictArray class _n_step_buffer: def __init__(self, **kwargs): @@ -52,8 +53,9 @@ def __init__(self, **kwargs): self.size = kwargs.get("size", 100000) discount_factor = kwargs.get("discount_factor", 0.99) num_var = kwargs.get("num_var", 5) + # буфер для хранения перехода - self.data = np.zeros((self.size, num_var), dtype=object) + self.data = DictArray((self.size, num_var), dtype=object) self.name = "Random_Buffer" # размер буфера @@ -63,13 +65,13 @@ def __init__(self, **kwargs): self.n_step_buffer = _n_step_buffer(**kwargs) if n_step > 1 else None def clear(self, ): - self.data = np.zeros(self.data.shape, dtype=object) + self.data = DictArray(self.data.shape, dtype=object) self.count = 0 self.real_size = 0 if self.n_step_buffer != None: self.n_step_buffer.clear() def add(self, samples: tuple, args=None): - """Добавляет данные в буфер""" + """Добавляет данные в буфер s,a,r,n_s,d""" if self.n_step_buffer != None: result = self.n_step_buffer.add(samples) if result != None: @@ -82,14 +84,11 @@ def sample(self, batch_size, idx=None): """Возвращает батч: dict""" if np.any(idx) == None: idx = self._get_idx( batch_size) - state = np.stack(self.data[idx, 0], axis=0).astype(np.float32) - action = np.stack(self.data[idx, 1], axis=0).astype(np.float32) - reward = self.data[idx, 2].astype(np.float32) - done = self.data[idx, 3].astype(np.float32) - next_state = np.stack(self.data[idx, 4], axis=0).astype(np.float32) + data = self.data[idx] + state, action, reward, done, next_state = data[:5] other_data = {} if 5 < self.data.shape[1] <= 7: - other_data = {key: np.stack(self.data[idx, i], axis=0).astype(np.float32) for i, key in zip(range(5,7), ('h_t', 'c_t'))} + other_data = {key: data[i] for i, key in zip(range(5,7), ('h_t', 'c_t'))} return {'state': state, 'action': action, 'reward': reward, 'done': done, 'next_state': next_state, **other_data} @@ -112,7 +111,7 @@ def load(self, path): def _add_data(self, samples): - self.data[self.count, :] = samples + self.data[self.count] = samples self.count = (self.count + 1) % self.size self.real_size = min(self.size, self.real_size + 1) return True From 
12bd0b1a8b91afc833f705c84e6935c15255fc4f Mon Sep 17 00:00:00 2001 From: Ivan_Marshev Date: Sat, 23 Sep 2023 19:09:24 +0300 Subject: [PATCH 03/19] modified: rl_lib/src/algoritms/simple_q.py modified: rl_lib/src/replay_buffers/dict_array.py --- rl_lib/src/algoritms/simple_q.py | 3 ++- rl_lib/src/replay_buffers/dict_array.py | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/rl_lib/src/algoritms/simple_q.py b/rl_lib/src/algoritms/simple_q.py index 5a0e331..be8d573 100644 --- a/rl_lib/src/algoritms/simple_q.py +++ b/rl_lib/src/algoritms/simple_q.py @@ -1,5 +1,6 @@ import tensorflow as tf import numpy as np +from copy import copy from .base_algo import Base_Algo from rl_lib.src.replay_buffers.replay_buffer import ReplayBuffer @@ -85,7 +86,7 @@ def choice_model_for_double_calculates(self, **batch): def _get_action(self, observation: tf.Tensor) -> tf.Tensor: """Возвращает ценность дейтсвий Q(s,a) всех действий на основе наблюдения""" - return self.sample_action(self.action_model.check_input_shape(observation)) + return self.sample_action(self.action_model.check_input_shape(copy(observation))) def get_action(self, observation: tf.Tensor) -> float: """Возвращает действие на основе наблюдения с учетом исследования""" diff --git a/rl_lib/src/replay_buffers/dict_array.py b/rl_lib/src/replay_buffers/dict_array.py index d0e7ef5..752da93 100644 --- a/rl_lib/src/replay_buffers/dict_array.py +++ b/rl_lib/src/replay_buffers/dict_array.py @@ -26,7 +26,7 @@ def __setitem__(self, index, values): @staticmethod def stack(array, axis=0): - return np.stack(array, axis=axis) + return np.stack(array, axis=axis).astype(np.float32) class NonStructArray: """Не структурированный массив""" From a06e3a2394a96f27ca184e57690d42c5a21d401f Mon Sep 17 00:00:00 2001 From: Ivan_Marshev Date: Sun, 24 Sep 2023 22:37:20 +0300 Subject: [PATCH 04/19] modified: rl_lib/src/explore_env/ou_noise.py --- rl_lib/src/explore_env/ou_noise.py | 38 ++++++++++++++++++++++++++---- 1 file changed, 34 insertions(+), 4 deletions(-) diff --git a/rl_lib/src/explore_env/ou_noise.py b/rl_lib/src/explore_env/ou_noise.py index f99f44d..4133c9c 100644 --- a/rl_lib/src/explore_env/ou_noise.py +++ b/rl_lib/src/explore_env/ou_noise.py @@ -4,6 +4,29 @@ from ..data_saver.utils import save_data, load_data from .base_explore import Base_Explore +class OU_Noise_generator: + def __init__(self, mean, sigma , theta=0.15, dt=1e-2, x_initial=None): + self.theta = theta + self.mean = mean + self.sigma = sigma + self.dt = dt + self.x_initial = x_initial + self.reset() + + def __call__(self): + # Formula taken from https://www.wikipedia.org/wiki/Ornstein-Uhlenbeck_process. + dx = (self.theta * (self.mean - self.x_prev) * self.dt+ self.sigma * np.sqrt(self.dt) * np.random.normal(size=self.mean.shape)) + # Store x into x_prev + # Makes next noise dependent on current one + self.x_prev += dx + return self.x_prev + + def reset(self): + if self.x_initial is not None: + self.x_prev = self.x_initial + else: + self.x_prev = np.zeros_like(self.mean) + class OU_Noise(Base_Explore): """Шум Орнштейна — Уленбека стратегия исследования, применяется к предсказанным непрерывным действиям. 
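[reviewer note] The OU_Noise_generator added in the hunk above implements the standard Ornstein-Uhlenbeck update dx = theta * (mean - x_prev) * dt + sigma * sqrt(dt) * N(0, 1). Below is a minimal self-contained sketch of the same process, useful for checking the noise behaviour outside rl_lib; the class and variable names here are illustrative and not part of the library.

    import numpy as np

    class SimpleOUProcess:
        """Minimal Ornstein-Uhlenbeck process for action-noise sanity checks."""
        def __init__(self, mean, sigma, theta=0.15, dt=1e-2):
            self.mean = np.asarray(mean, dtype=np.float64)
            self.sigma, self.theta, self.dt = sigma, theta, dt
            self.x = np.zeros_like(self.mean)

        def sample(self):
            # mean-reverting drift toward `mean` plus Gaussian diffusion scaled by sqrt(dt)
            dx = (self.theta * (self.mean - self.x) * self.dt
                  + self.sigma * np.sqrt(self.dt) * np.random.normal(size=self.mean.shape))
            self.x = self.x + dx
            return self.x

    noise = SimpleOUProcess(mean=np.zeros(3), sigma=0.2)
    samples = [noise.sample() for _ in range(3)]  # temporally correlated noise for a 3-dim action
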
@@ -13,11 +36,18 @@ class OU_Noise(Base_Explore): axis: int, ось вычислений sigma: float, Максимальный эпсилон """ - def __init__(self, action_space = None, axis=-1, alpha = 0.9, lower_bound = -1.0, sigma=1.0, upper_bound = 1.0,**kwargs): + def __init__(self, action_space = None, + axis=-1, alpha = 0.9, dt = 0.01, + lower_bound = -1.0, mean = 0.0, + sigma=1.0, theta = 0.15, + upper_bound = 1.0, + **kwargs): + self.action_space = action_space self.alpha = alpha self.axis = axis - self.eps = np.random.normal(size=self.action_space, scale = sigma) + self.ou_gen = OU_Noise_generator(mean, sigma , theta=theta, dt=dt, x_initial=None) + self.eps = self.ou_gen() self.lower_bound = lower_bound self.sigma = sigma self._name = "ou_noise" @@ -26,7 +56,7 @@ def __init__(self, action_space = None, axis=-1, alpha = 0.9, lower_bound = -1.0 def __call__(self, action): action += self.eps - self.eps = self.alpha*self.eps + np.random.normal(size=self.action_space, scale = self.sigma) + self.eps = self.alpha*self.eps + self.ou_gen() return clip_by_value(action, clip_value_min=self.lower_bound, clip_value_max=self.upper_bound) def load(self, path): @@ -37,7 +67,7 @@ def name(self): return self._name def reset(self, ): - self.eps = np.random.normal(size=self.action_space, scale = self.sigma) + self.eps = self.ou_gen() def save(self, path): save_data(path+self.name, self.__dict__) From 7cbb5eb87620288e9fdfc379f93eaf00af189a7c Mon Sep 17 00:00:00 2001 From: Ivan_Marshev Date: Sun, 24 Sep 2023 22:41:24 +0300 Subject: [PATCH 05/19] modified: rl_lib/src/explore_env/ou_noise.py --- rl_lib/src/explore_env/ou_noise.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rl_lib/src/explore_env/ou_noise.py b/rl_lib/src/explore_env/ou_noise.py index 4133c9c..8b19d05 100644 --- a/rl_lib/src/explore_env/ou_noise.py +++ b/rl_lib/src/explore_env/ou_noise.py @@ -38,7 +38,7 @@ class OU_Noise(Base_Explore): """ def __init__(self, action_space = None, axis=-1, alpha = 0.9, dt = 0.01, - lower_bound = -1.0, mean = 0.0, + lower_bound = -1.0, mean = np.zeros(action_space), sigma=1.0, theta = 0.15, upper_bound = 1.0, **kwargs): From b06028579631d0391e73b15c777d770a5c744adf Mon Sep 17 00:00:00 2001 From: Ivan_Marshev Date: Sun, 24 Sep 2023 22:45:07 +0300 Subject: [PATCH 06/19] modified: rl_lib/src/explore_env/ou_noise.py --- rl_lib/src/explore_env/ou_noise.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/rl_lib/src/explore_env/ou_noise.py b/rl_lib/src/explore_env/ou_noise.py index 8b19d05..ebc63fd 100644 --- a/rl_lib/src/explore_env/ou_noise.py +++ b/rl_lib/src/explore_env/ou_noise.py @@ -38,7 +38,7 @@ class OU_Noise(Base_Explore): """ def __init__(self, action_space = None, axis=-1, alpha = 0.9, dt = 0.01, - lower_bound = -1.0, mean = np.zeros(action_space), + lower_bound = -1.0, mean: np.ndarray = None, sigma=1.0, theta = 0.15, upper_bound = 1.0, **kwargs): @@ -46,7 +46,7 @@ def __init__(self, action_space = None, self.action_space = action_space self.alpha = alpha self.axis = axis - self.ou_gen = OU_Noise_generator(mean, sigma , theta=theta, dt=dt, x_initial=None) + self.ou_gen = OU_Noise_generator(np.zeros(action_space) if mean==None else mean, sigma , theta=theta, dt=dt, x_initial=None) self.eps = self.ou_gen() self.lower_bound = lower_bound self.sigma = sigma From 51e45da04d5657dd4a87fe01654ea5cc0b40ddc9 Mon Sep 17 00:00:00 2001 From: Ivan_Marshev Date: Wed, 27 Sep 2023 23:01:21 +0300 Subject: [PATCH 07/19] modified: examples/ddpg/car_racing/ddpg_car_racing.py modified: 
examples/dqn/cart_pole/dqn_cart_pole.py modified: examples/drqn/cart_pole/drqn_cart_pole.py modified: rl_lib/src/algoritms/a2c/actor_critic.py modified: rl_lib/src/algoritms/ddpg/config.yaml modified: rl_lib/src/algoritms/simple_q.py modified: rl_lib/src/explore_env/ou_noise.py new file: rl_lib/src/gym_wrappers/obsv_wrapper.py modified: rl_lib/src/models/base_models.py new file: rl_lib/src/normalizes.py modified: rl_lib/tests/config.yaml deleted: rl_lib/tests/ddpg_config.yaml new file: rl_lib/tests/dqn_config.yaml modified: rl_lib/tests/first_test_ddpg.py modified: rl_lib/tests/first_test_dqn.py modified: rl_lib/tests/first_test_drqn.py --- examples/ddpg/car_racing/ddpg_car_racing.py | 3 +- examples/dqn/cart_pole/dqn_cart_pole.py | 1 + examples/drqn/cart_pole/drqn_cart_pole.py | 1 + rl_lib/src/algoritms/a2c/actor_critic.py | 8 ++- rl_lib/src/algoritms/ddpg/config.yaml | 3 + rl_lib/src/algoritms/simple_q.py | 2 +- rl_lib/src/explore_env/ou_noise.py | 5 +- rl_lib/src/gym_wrappers/obsv_wrapper.py | 19 +++++++ rl_lib/src/models/base_models.py | 3 +- rl_lib/src/normalizes.py | 8 +++ rl_lib/tests/config.yaml | 62 ++++++++++++++------- rl_lib/tests/ddpg_config.yaml | 61 -------------------- rl_lib/tests/dqn_config.yaml | 44 +++++++++++++++ rl_lib/tests/first_test_ddpg.py | 60 +++++++++++--------- rl_lib/tests/first_test_dqn.py | 3 +- rl_lib/tests/first_test_drqn.py | 1 + 16 files changed, 166 insertions(+), 118 deletions(-) create mode 100644 rl_lib/src/gym_wrappers/obsv_wrapper.py create mode 100644 rl_lib/src/normalizes.py delete mode 100644 rl_lib/tests/ddpg_config.yaml create mode 100644 rl_lib/tests/dqn_config.yaml diff --git a/examples/ddpg/car_racing/ddpg_car_racing.py b/examples/ddpg/car_racing/ddpg_car_racing.py index ca5ac1a..6c16bfb 100644 --- a/examples/ddpg/car_racing/ddpg_car_racing.py +++ b/examples/ddpg/car_racing/ddpg_car_racing.py @@ -42,7 +42,7 @@ def create_critic_model(): flatten = layers.Flatten()(concat) dence_layer1 = layers.Dense(256, activation='relu')(flatten) dence_layer2 = layers.Dense(256, activation='relu')(dence_layer1) - dence_out = layers.Dense(env.action_space.shape[0], activation=None)(dence_layer2) + dence_out = layers.Dense(1, activation=None)(dence_layer2) return tf.keras.Model(inputs=[input_layer, input_action_layer], outputs=dence_out) @@ -81,6 +81,7 @@ def run(algo): observation, info = env.reset() episode_reward = 0 + episode_loss = [] for step in range(1, steps+1): action = algo.get_action(observation) new_observation, reward, done, _, info = env.step(action) diff --git a/examples/dqn/cart_pole/dqn_cart_pole.py b/examples/dqn/cart_pole/dqn_cart_pole.py index 872f1e4..5d10dba 100644 --- a/examples/dqn/cart_pole/dqn_cart_pole.py +++ b/examples/dqn/cart_pole/dqn_cart_pole.py @@ -49,6 +49,7 @@ def run(algo): observation, info = env.reset() episode_reward = 0 + episode_loss = [] for step in range(1, steps): action = algo.get_action(observation) new_observation, reward, done, _, info = env.step(action) diff --git a/examples/drqn/cart_pole/drqn_cart_pole.py b/examples/drqn/cart_pole/drqn_cart_pole.py index e3cce0c..307755f 100644 --- a/examples/drqn/cart_pole/drqn_cart_pole.py +++ b/examples/drqn/cart_pole/drqn_cart_pole.py @@ -56,6 +56,7 @@ def run(algo): observation, info = env.reset() algo.initial_state() episode_reward = 0 + episode_loss = [] for step in range(1, steps): action = algo.get_action(observation) new_observation, reward, done, _, info = env.step(action) diff --git a/rl_lib/src/algoritms/a2c/actor_critic.py 
b/rl_lib/src/algoritms/a2c/actor_critic.py index 11a6926..af46b6d 100644 --- a/rl_lib/src/algoritms/a2c/actor_critic.py +++ b/rl_lib/src/algoritms/a2c/actor_critic.py @@ -1,9 +1,10 @@ import tensorflow as tf from tensorflow.keras import layers -import abc +from tensorflow.keras.models import clone_model from rl_lib.src.algoritms.dqn.dqn import DQN_Model + class Actor_Model(DQN_Model): def __init__(self, config = {},**kwargs): config['model_config'] = config['actor_model_config']['model_config'] @@ -48,7 +49,8 @@ def calculate_gradients(self, **kwargs) -> dict: td_error = kwargs['Qtarget'] - Q loss = self.loss(kwargs['Qtarget'], Q)*kwargs.get('weights', 1.0) - gradients = tape.gradient(loss, self.model.trainable_variables) + E_loss = tf.reduce_mean(loss, axis=0) + gradients = tape.gradient(E_loss, self.model.trainable_variables) loss = tf.reduce_mean(loss, axis=-1) return {'gradients': gradients, 'loss': loss, 'td_error': td_error} @@ -103,7 +105,7 @@ def update_weights(self, **kwargs): return self.update_weights_critic(**kwargs) def update_weights_actor(self, **kwargs): - kwargs['critic_model'] = self.critic_model + kwargs['critic_model'] = self.critic_model.model loss = self.actor_model.update_weights(**kwargs) return {'loss': loss['loss'], 'td_error': loss['td_error']} diff --git a/rl_lib/src/algoritms/ddpg/config.yaml b/rl_lib/src/algoritms/ddpg/config.yaml index c7bab98..682655b 100644 --- a/rl_lib/src/algoritms/ddpg/config.yaml +++ b/rl_lib/src/algoritms/ddpg/config.yaml @@ -59,6 +59,9 @@ exploration_config: action_space: None upper_bound: None lower_bound: None + dt: 0.01 + mean: None + theta: 0.15 data_saver: path: "" diff --git a/rl_lib/src/algoritms/simple_q.py b/rl_lib/src/algoritms/simple_q.py index be8d573..00160b6 100644 --- a/rl_lib/src/algoritms/simple_q.py +++ b/rl_lib/src/algoritms/simple_q.py @@ -67,7 +67,7 @@ def calculate_new_best_action(self, **kwargs) -> tf.Tensor: experimental_autograph_options = tf.autograph.experimental.Feature.ALL) def calculate_target(self, **kwargs): Qtarget = self.calculate_new_best_action(**kwargs) - dones = tf.ones(kwargs['done'].shape, dtype=tf.dtypes.float32) + dones = tf.ones_like(kwargs['done'], dtype=tf.dtypes.float32) dones = dones - kwargs['done'] Qtarget = kwargs['reward'] + (self.discount_factor**self.n_step) * Qtarget * dones if self.recurrent: diff --git a/rl_lib/src/explore_env/ou_noise.py b/rl_lib/src/explore_env/ou_noise.py index ebc63fd..d1d24ed 100644 --- a/rl_lib/src/explore_env/ou_noise.py +++ b/rl_lib/src/explore_env/ou_noise.py @@ -15,7 +15,7 @@ def __init__(self, mean, sigma , theta=0.15, dt=1e-2, x_initial=None): def __call__(self): # Formula taken from https://www.wikipedia.org/wiki/Ornstein-Uhlenbeck_process. 
- dx = (self.theta * (self.mean - self.x_prev) * self.dt+ self.sigma * np.sqrt(self.dt) * np.random.normal(size=self.mean.shape)) + dx = (self.theta * (self.mean - self.x_prev) * self.dt + self.sigma * np.sqrt(self.dt) * np.random.normal(size=self.mean.shape, scale=self.sigma)) # Store x into x_prev # Makes next noise dependent on current one self.x_prev += dx @@ -42,11 +42,10 @@ def __init__(self, action_space = None, sigma=1.0, theta = 0.15, upper_bound = 1.0, **kwargs): - self.action_space = action_space self.alpha = alpha self.axis = axis - self.ou_gen = OU_Noise_generator(np.zeros(action_space) if mean==None else mean, sigma , theta=theta, dt=dt, x_initial=None) + self.ou_gen = OU_Noise_generator(np.zeros(action_space) if mean=="None" else mean, sigma , theta=theta, dt=dt, x_initial=None) self.eps = self.ou_gen() self.lower_bound = lower_bound self.sigma = sigma diff --git a/rl_lib/src/gym_wrappers/obsv_wrapper.py b/rl_lib/src/gym_wrappers/obsv_wrapper.py new file mode 100644 index 0000000..4bd195b --- /dev/null +++ b/rl_lib/src/gym_wrappers/obsv_wrapper.py @@ -0,0 +1,19 @@ +import gym +import numpy as np + +class ConvWrapper(gym.Wrapper): + def __init__(self, env): + super().__init__(env) + + def reset(self, seed=40, options={}): + observation, info = self.env.reset(seed=40, options={}) + return self.preprocess(observation), info + + def step(self, action): + observation, reward, done, tr , info = self.env.step(action) + return self.preprocess(observation), reward, done, tr, info + + def preprocess(self, observation): + observation = (observation- 255/2)/(255/2) + return observation.astype(np.float16) + \ No newline at end of file diff --git a/rl_lib/src/models/base_models.py b/rl_lib/src/models/base_models.py index e77bf9b..6a4c6df 100644 --- a/rl_lib/src/models/base_models.py +++ b/rl_lib/src/models/base_models.py @@ -145,7 +145,8 @@ def calculate_gradients(self, **kwargs) -> dict: td_error = kwargs['Qtarget'] - Q loss = self.loss(kwargs['Qtarget'], Q)*kwargs.get('weights', 1.0) - gradients = tape.gradient(loss, self.model.trainable_variables) + E_loss = tf.reduce_mean(loss, axis=0) + gradients = tape.gradient(E_loss, self.model.trainable_variables) return {'gradients': gradients, 'loss': loss, 'td_error': td_error} @tf.function(reduce_retracing=True, diff --git a/rl_lib/src/normalizes.py b/rl_lib/src/normalizes.py new file mode 100644 index 0000000..6506aa1 --- /dev/null +++ b/rl_lib/src/normalizes.py @@ -0,0 +1,8 @@ + +def normalize_m1_1(x): + """Нормализует RGB изображение в диапазон [-1, 1].""" + return x / 127.5 - 1 + +def normalize_01(x): + """Нормализует RGB изображение в диапазон [0, 1].""" + return x / 255.0 \ No newline at end of file diff --git a/rl_lib/tests/config.yaml b/rl_lib/tests/config.yaml index b7d2c72..afd1539 100644 --- a/rl_lib/tests/config.yaml +++ b/rl_lib/tests/config.yaml @@ -1,27 +1,46 @@ -#CartPole DQN config +#default DDPG config model_config: - model: None - name: "_test_cart_pole" + name: "_test_CarRacing" input_shape: None action_space: None - discount_factor : 0.9 + discount_factor : 0.99 n_step: 1 - batch_size: 32 - double_network: True + batch_size: 16 + double_network: False priority: False - tau: 1.0 - -optimizer_config: - optimizer_name: "adam" - optimizer_params: - learning_rate: 0.01 - epsilon: 0.001 - clipnorm: 1.0 - custom_optimizer: None + + +actor_model_config: + model_config: + model: None + tau: 0.001 + +critic_model_config: + model_config: + model: None + tau: 0.001 + +actor_optimizer_config: + optimizer_config: + optimizer_name: 
"adam" + optimizer_params: + learning_rate: 0.0001 + epsilon: 0.001 + clipnorm: 1.0 + custom_optimizer: None + +critic_optimizer_config: + optimizer_config: + optimizer_name: "adam" + optimizer_params: + learning_rate: 0.001 + epsilon: 0.001 + clipnorm: 1.0 + custom_optimizer: None buffer_config: - size: 10000 + size: 100000 eps: 0.01 alpha: 0.5 beta: 0.4 @@ -30,13 +49,14 @@ buffer_config: max_priority: 0.1 exploration_config: - strategy_name: "soft_q" + strategy_name: "ou_noise" strategy_config: - decay: 1. - tau: 0.1 - eps_min: 0.01 - eps_test: 0.001 + alpha: 0.0 + sigma: 0.2 action_space: None + dt: 0.01 + mean: None + theta: 0.15 data_saver: path: "..\\rl_lib\\rl_lib\\tests\\models/" diff --git a/rl_lib/tests/ddpg_config.yaml b/rl_lib/tests/ddpg_config.yaml deleted file mode 100644 index 408d396..0000000 --- a/rl_lib/tests/ddpg_config.yaml +++ /dev/null @@ -1,61 +0,0 @@ -#default DDPG config - -model_config: - name: "_test_Pendulum" - input_shape: None - action_space: None - discount_factor : 0.9 - n_step: 1 - batch_size: 64 - double_network: False - priority: False - - -actor_model_config: - model_config: - model: None - tau: 0.01 - -critic_model_config: - model_config: - model: None - tau: 0.01 - -actor_optimizer_config: - optimizer_config: - optimizer_name: "adam" - optimizer_params: - learning_rate: 0.001 - epsilon: 0.001 - clipnorm: 1.0 - custom_optimizer: None - -critic_optimizer_config: - optimizer_config: - optimizer_name: "adam" - optimizer_params: - learning_rate: 0.002 - epsilon: 0.001 - clipnorm: 1.0 - custom_optimizer: None - -buffer_config: - size: 50000 - eps: 0.01 - alpha: 0.5 - beta: 0.4 - beta_changing: 0.0005 - beta_changing_curve: 'linear' - max_priority: 0.1 - -exploration_config: - strategy_name: "ou_noise" - strategy_config: - alpha: 0.3 - sigma: 0.2 - action_space: None - -data_saver: - path: "..\\rl_lib\\rl_lib\\tests\\models/" - copy_path: "" - diff --git a/rl_lib/tests/dqn_config.yaml b/rl_lib/tests/dqn_config.yaml new file mode 100644 index 0000000..b8f9a82 --- /dev/null +++ b/rl_lib/tests/dqn_config.yaml @@ -0,0 +1,44 @@ +#CartPole DQN config + +model_config: + model: None + name: "_test_cart_pole" + input_shape: None + action_space: None + discount_factor : 0.9 + n_step: 1 + batch_size: 32 + double_network: False + priority: False + tau: 1.0 + +optimizer_config: + optimizer_name: "adam" + optimizer_params: + learning_rate: 0.01 + epsilon: 0.001 + clipnorm: 1.0 + custom_optimizer: None + +buffer_config: + size: 10000 + eps: 0.01 + alpha: 0.5 + beta: 0.4 + beta_changing: 0.0005 + beta_changing_curve: 'linear' + max_priority: 0.1 + +exploration_config: + strategy_name: "soft_q" + strategy_config: + decay: 1. 
+ tau: 0.1 + eps_min: 0.01 + eps_test: 0.001 + action_space: None + +data_saver: + path: "..\\rl_lib\\rl_lib\\tests\\models/" + copy_path: "" + diff --git a/rl_lib/tests/first_test_ddpg.py b/rl_lib/tests/first_test_ddpg.py index 9d21a68..043bcfd 100644 --- a/rl_lib/tests/first_test_ddpg.py +++ b/rl_lib/tests/first_test_ddpg.py @@ -9,23 +9,28 @@ from rl_lib.src.algoritms.ddpg.ddpg import DDPG from rl_lib.src.data_saver.utils import load_default_config +from rl_lib.src.normalizes import normalize_m1_1 -env = gym.make('BipedalWalker-v3') +env = gym.make('CarRacing-v2') + +initializer = tf.keras.initializers.RandomUniform(minval=-3*1e-4, maxval=3*1e-4, seed=40) def create_conv(): input_layer = layers.Input(shape=env.observation_space.shape, ) - cov_layer1 = layers.Conv2D(16, 7, activation='relu')(input_layer) - cov_layer2 = layers.Conv2D(32, 5, activation='relu')(cov_layer1) - conv_out = layers.Flatten()(cov_layer2) + lambda_layer = layers.Lambda(normalize_m1_1)(input_layer) + cov_layer1 = layers.Conv2D(32, 7, 4, activation='relu', kernel_initializer=initializer)(lambda_layer) + cov_layer2 = layers.Conv2D(32, 5, 2,activation='relu', kernel_initializer=initializer)(cov_layer1) + cov_layer3 = layers.Conv2D(32, 3, 2,activation='relu', kernel_initializer=initializer)(cov_layer2) + conv_out = layers.Flatten()(cov_layer3) return tf.keras.Model(inputs=input_layer, outputs=conv_out) def create_model(): """Создает модель tf.keras.Model, архитектура DQN""" input_layer = layers.Input(shape=env.observation_space.shape, ) - # conv_out = create_conv()(input_layer) - dence_layer1 = layers.Dense(64, activation='relu')(input_layer) - dence_layer2 = layers.Dense(64, activation='relu')(dence_layer1) - dence_out = layers.Dense(env.action_space.shape[0], activation='tanh')(dence_layer2) + conv_out = create_conv()(input_layer) + dence_layer1 = layers.Dense(256, activation='relu', kernel_initializer=initializer)(conv_out) + dence_layer2 = layers.Dense(256, activation='relu', kernel_initializer=initializer)(dence_layer1) + dence_out = layers.Dense(env.action_space.shape[0], activation='tanh', kernel_initializer=initializer)(dence_layer2) dence_out = dence_out*tf.reduce_max((tf.abs(env.action_space.low), env.action_space.high)) @@ -34,17 +39,17 @@ def create_model(): def create_critic_model(): """Создает модель tf.keras.Model, архитектура DQN, начальные слои - сверточные""" input_layer = layers.Input(shape=env.observation_space.shape, ) - obsv_layer = layers.Dense(16, activation='relu')(input_layer) - obsv_layer = layers.Dense(32, activation='relu')(obsv_layer) + obsv_layer = layers.Dense(128, activation='relu', kernel_initializer=initializer)(input_layer) + obsv_layer = layers.Dense(64, activation='relu', kernel_initializer=initializer)(obsv_layer) input_action_layer = layers.Input(shape=env.action_space.shape, ) - action_layer = layers.Dense(32, activation='relu')(input_action_layer) + action_layer = layers.Dense(32, activation='relu', kernel_initializer=initializer)(input_action_layer) - # conv_out = create_conv()(input_layer) - concat = layers.Concatenate()((input_layer, action_layer)) + conv_out = create_conv()(input_layer) + concat = layers.Concatenate()((conv_out, action_layer)) flatten = layers.Flatten()(concat) - dence_layer1 = layers.Dense(64, activation='relu')(flatten) - dence_layer2 = layers.Dense(64, activation='relu')(dence_layer1) - dence_out = layers.Dense(env.action_space.shape[0], activation=None)(dence_layer2) + dence_layer1 = layers.Dense(256, activation='relu', 
kernel_initializer=initializer)(flatten) + dence_layer2 = layers.Dense(256, activation='relu', kernel_initializer=initializer)(dence_layer1) + dence_out = layers.Dense(1, activation=None)(dence_layer2) return tf.keras.Model(inputs=[input_layer, input_action_layer], outputs=dence_out) @@ -60,15 +65,17 @@ def create_critic_model(): algo = DDPG(config) +# algo.load() pprint(algo.config) def run(algo): epidodes = 250 - steps = 500 + steps = 250 train_frequency = 1 - test_frequency = 10 - test_steps = 500 - pre_train_steps = 1000 + test_frequency = 30 + save_frequency = 10 + test_steps = 250 + pre_train_steps = 1 copy_weigths_frequency = 1 #history data @@ -83,9 +90,10 @@ def run(algo): observation, info = env.reset() episode_reward = 0 + episode_loss = [] for step in range(1, steps+1): action = algo.get_action(observation) - new_observation, reward, done, _, info = env.step(action) + new_observation, reward, done, tr, info = env.step(action) algo.add((observation, action, reward, done, new_observation)) episode_reward += reward count += 1 @@ -95,10 +103,10 @@ def run(algo): if count % copy_weigths_frequency == 0: res = algo.copy_weights() observation = new_observation - if done: + if done or tr: break - algo.save() + if episode % save_frequency == 0: algo.save() rewards.append(episode_reward) #testing algoritm perfomans if episode%test_frequency == 0: @@ -106,9 +114,9 @@ def run(algo): episode_test_reward = 0 for test_step in range(1, test_steps+1): action = algo.get_test_action(observation) - observation, test_reward, done, _, info = env.step(action) + observation, test_reward, done, tr, info = env.step(action) episode_test_reward += test_reward - if done: + if done or tr: break @@ -124,7 +132,7 @@ def run(algo): count ) ) - algo.load() + # algo.load() if __name__ == "__main__": try: diff --git a/rl_lib/tests/first_test_dqn.py b/rl_lib/tests/first_test_dqn.py index b03f6ae..e8d70be 100644 --- a/rl_lib/tests/first_test_dqn.py +++ b/rl_lib/tests/first_test_dqn.py @@ -51,6 +51,7 @@ def run(algo): observation, info = env.reset() episode_reward = 0 + episode_loss = [] for step in range(1, steps): action = algo.get_action(observation) new_observation, reward, done, _, info = env.step(action) @@ -66,7 +67,7 @@ def run(algo): if done: break - # algo.save() + algo.save() rewards.append(episode_reward) #testing algoritm perfomans if episode%test_frequency == 0: diff --git a/rl_lib/tests/first_test_drqn.py b/rl_lib/tests/first_test_drqn.py index e3afbdd..6f3a8b5 100644 --- a/rl_lib/tests/first_test_drqn.py +++ b/rl_lib/tests/first_test_drqn.py @@ -57,6 +57,7 @@ def run(algo): observation, info = env.reset() algo.initial_state() episode_reward = 0 + episode_loss = [] for step in range(1, steps): action = algo.get_action(observation) new_observation, reward, done, _, info = env.step(action) From aeaa924a8bcf49fffeeb9f5b00e615df6bcbb65e Mon Sep 17 00:00:00 2001 From: Ivan_Marshev Date: Thu, 28 Sep 2023 00:11:54 +0300 Subject: [PATCH 08/19] modified: examples/ddpg/car_racing/config.yaml modified: rl_lib/src/algoritms/ddpg/config.yaml modified: rl_lib/tests/first_test_ddpg.py --- examples/ddpg/car_racing/config.yaml | 19 ++++++++++++------- rl_lib/src/algoritms/ddpg/config.yaml | 14 +++++++------- rl_lib/tests/first_test_ddpg.py | 5 ++--- 3 files changed, 21 insertions(+), 17 deletions(-) diff --git a/examples/ddpg/car_racing/config.yaml b/examples/ddpg/car_racing/config.yaml index e7cd5c6..ce89ea7 100644 --- a/examples/ddpg/car_racing/config.yaml +++ b/examples/ddpg/car_racing/config.yaml @@ -6,7 +6,7 @@ 
model_config: action_space: None discount_factor : 0.99 n_step: 1 - batch_size: 32 + batch_size: 16 double_network: False priority: False @@ -14,18 +14,18 @@ model_config: actor_model_config: model_config: model: None - tau: 0.01 + tau: 0.001 critic_model_config: model_config: model: None - tau: 0.01 + tau: 0.001 actor_optimizer_config: optimizer_config: optimizer_name: "adam" optimizer_params: - learning_rate: 0.001 + learning_rate: 0.0001 epsilon: 0.001 clipnorm: 1.0 custom_optimizer: None @@ -34,7 +34,7 @@ critic_optimizer_config: optimizer_config: optimizer_name: "adam" optimizer_params: - learning_rate: 0.002 + learning_rate: 0.001 epsilon: 0.001 clipnorm: 1.0 custom_optimizer: None @@ -54,9 +54,14 @@ buffer_config: exploration_config: strategy_name: "ou_noise" strategy_config: - alpha: 0.5 - sigma: 1.0 + alpha: 0.0 + sigma: 0.2 action_space: None + upper_bound: None + lower_bound: None + dt: 0.01 + mean: None + theta: 0.15 data_saver: path: "" diff --git a/rl_lib/src/algoritms/ddpg/config.yaml b/rl_lib/src/algoritms/ddpg/config.yaml index 682655b..e0b26dd 100644 --- a/rl_lib/src/algoritms/ddpg/config.yaml +++ b/rl_lib/src/algoritms/ddpg/config.yaml @@ -6,7 +6,7 @@ model_config: action_space: None discount_factor : 0.99 n_step: 1 - batch_size: 32 + batch_size: 16 double_network: False priority: False @@ -14,18 +14,18 @@ model_config: actor_model_config: model_config: model: None - tau: 0.01 + tau: 0.001 critic_model_config: model_config: model: None - tau: 0.01 + tau: 0.001 actor_optimizer_config: optimizer_config: optimizer_name: "adam" optimizer_params: - learning_rate: 0.001 + learning_rate: 0.0001 epsilon: 0.001 clipnorm: 1.0 custom_optimizer: None @@ -34,7 +34,7 @@ critic_optimizer_config: optimizer_config: optimizer_name: "adam" optimizer_params: - learning_rate: 0.002 + learning_rate: 0.001 epsilon: 0.001 clipnorm: 1.0 custom_optimizer: None @@ -54,8 +54,8 @@ buffer_config: exploration_config: strategy_name: "ou_noise" strategy_config: - alpha: 0.9 - sigma: 1.0 + alpha: 0.0 + sigma: 0.2 action_space: None upper_bound: None lower_bound: None diff --git a/rl_lib/tests/first_test_ddpg.py b/rl_lib/tests/first_test_ddpg.py index 043bcfd..c8fbae8 100644 --- a/rl_lib/tests/first_test_ddpg.py +++ b/rl_lib/tests/first_test_ddpg.py @@ -9,7 +9,6 @@ from rl_lib.src.algoritms.ddpg.ddpg import DDPG from rl_lib.src.data_saver.utils import load_default_config -from rl_lib.src.normalizes import normalize_m1_1 env = gym.make('CarRacing-v2') @@ -17,8 +16,8 @@ def create_conv(): input_layer = layers.Input(shape=env.observation_space.shape, ) - lambda_layer = layers.Lambda(normalize_m1_1)(input_layer) - cov_layer1 = layers.Conv2D(32, 7, 4, activation='relu', kernel_initializer=initializer)(lambda_layer) + rescaling_layer = layers.experimental.preprocessing.Rescaling(1.0 / 127.5, offset=-1)(input_layer) + cov_layer1 = layers.Conv2D(32, 7, 4, activation='relu', kernel_initializer=initializer)(rescaling_layer) cov_layer2 = layers.Conv2D(32, 5, 2,activation='relu', kernel_initializer=initializer)(cov_layer1) cov_layer3 = layers.Conv2D(32, 3, 2,activation='relu', kernel_initializer=initializer)(cov_layer2) conv_out = layers.Flatten()(cov_layer3) From 832f35258f59db33135e67b7077e1e93bacaa352 Mon Sep 17 00:00:00 2001 From: Ivan_Marshev Date: Tue, 24 Oct 2023 17:59:46 +0300 Subject: [PATCH 09/19] modified: .gitignore deleted: rl_lib/src/algoritms/__init__.py deleted: rl_lib/src/algoritms/a2c/actor_critic.py deleted: rl_lib/src/algoritms/base_algo.py deleted: rl_lib/src/algoritms/ddpg/ddpg.py deleted: 
rl_lib/src/algoritms/dqn/dqn.py deleted: rl_lib/src/algoritms/drqn/drqn.py new file: rl_lib/src/algoritms/model_free/continuous_control/ddpg/__init__.py renamed: rl_lib/src/algoritms/ddpg/config.yaml -> rl_lib/src/algoritms/model_free/continuous_control/ddpg/config.yaml new file: rl_lib/src/algoritms/model_free/continuous_control/ddpg/ddpg.py new file: rl_lib/src/algoritms/model_free/policy_gradient/a2c/actor_critic.py new file: rl_lib/src/algoritms/model_free/policy_gradient/ppo/__init__.py new file: rl_lib/src/algoritms/model_free/policy_gradient/ppo/ppo.py new file: rl_lib/src/algoritms/model_free/value_based/__init__.py new file: rl_lib/src/algoritms/model_free/value_based/base_algo.py renamed: rl_lib/src/algoritms/dqn/config.yaml -> rl_lib/src/algoritms/model_free/value_based/dqn/config.yaml new file: rl_lib/src/algoritms/model_free/value_based/dqn/dqn.py renamed: rl_lib/src/algoritms/drqn/config.yaml -> rl_lib/src/algoritms/model_free/value_based/drqn/config.yaml new file: rl_lib/src/algoritms/model_free/value_based/drqn/drqn.py new file: rl_lib/src/algoritms/model_free/value_based/simple_q.py renamed: rl_lib/src/algoritms/tests/config.yaml -> rl_lib/src/algoritms/model_free/value_based/tests/config.yaml new file: rl_lib/src/algoritms/model_free/value_based/tests/test_simpe_q.py new file: rl_lib/src/algoritms/model_free/value_based/utils.py deleted: rl_lib/src/algoritms/simple_q.py deleted: rl_lib/src/algoritms/tests/test_simpe_q.py deleted: rl_lib/src/algoritms/utils.py modified: rl_lib/src/data_saver/saver.py modified: rl_lib/src/data_saver/tests/test_saver.py modified: rl_lib/src/data_saver/utils.py modified: rl_lib/src/explore_env/base_explore.py modified: rl_lib/src/explore_env/epsilon_greedy.py modified: rl_lib/src/explore_env/exploration_manager.py modified: rl_lib/src/explore_env/ou_noise.py modified: rl_lib/src/explore_env/soft_q.py modified: rl_lib/src/explore_env/tests/test_epsilon_greedy.py modified: rl_lib/src/explore_env/tests/test_exploration_manager.py modified: rl_lib/src/explore_env/tests/test_soft_q.py modified: rl_lib/src/gym_wrappers/obsv_wrapper.py modified: rl_lib/src/models/base_models.py modified: rl_lib/src/models/model.py renamed: rl_lib/src/normalizes.py -> rl_lib/src/normalizers.py modified: rl_lib/src/optimizers/__init__.py modified: rl_lib/src/optimizers/optimizer.py modified: rl_lib/src/replay_buffers/dict_array.py modified: rl_lib/src/replay_buffers/priority_buffers.py modified: rl_lib/src/replay_buffers/random_buffers.py modified: rl_lib/src/replay_buffers/replay_buffer.py modified: rl_lib/src/replay_buffers/tests/test_replay_buffer.py modified: rl_lib/tests/dqn_config.yaml modified: rl_lib/tests/drqn_config.yaml modified: rl_lib/tests/first_test_ddpg.py modified: rl_lib/tests/first_test_dqn.py modified: rl_lib/tests/first_test_drqn.py modified: setup.py --- .gitignore | 4 + rl_lib/src/algoritms/__init__.py | 1 - rl_lib/src/algoritms/a2c/actor_critic.py | 148 --------- rl_lib/src/algoritms/base_algo.py | 108 ------- rl_lib/src/algoritms/ddpg/ddpg.py | 63 ---- rl_lib/src/algoritms/dqn/dqn.py | 68 ---- rl_lib/src/algoritms/drqn/drqn.py | 138 -------- .../continuous_control/ddpg/__init__.py | 1 + .../continuous_control}/ddpg/config.yaml | 0 .../continuous_control/ddpg/ddpg.py | 72 +++++ .../policy_gradient/a2c/actor_critic.py | 155 +++++++++ .../policy_gradient/ppo/__init__.py | 0 .../model_free/policy_gradient/ppo/ppo.py | 0 .../model_free/value_based/__init__.py | 2 + .../model_free/value_based/base_algo.py | 136 ++++++++ 
.../value_based}/dqn/config.yaml | 0 .../model_free/value_based/dqn/dqn.py | 84 +++++ .../value_based}/drqn/config.yaml | 0 .../model_free/value_based/drqn/drqn.py | 178 ++++++++++ .../model_free/value_based/simple_q.py | 217 +++++++++++++ .../value_based}/tests/config.yaml | 0 .../value_based/tests/test_simpe_q.py | 83 +++++ .../algoritms/model_free/value_based/utils.py | 13 + rl_lib/src/algoritms/simple_q.py | 182 ----------- rl_lib/src/algoritms/tests/test_simpe_q.py | 72 ----- rl_lib/src/algoritms/utils.py | 13 - rl_lib/src/data_saver/saver.py | 129 ++++---- rl_lib/src/data_saver/tests/test_saver.py | 32 +- rl_lib/src/data_saver/utils.py | 28 +- rl_lib/src/explore_env/base_explore.py | 64 ++-- rl_lib/src/explore_env/epsilon_greedy.py | 38 ++- rl_lib/src/explore_env/exploration_manager.py | 97 +++--- rl_lib/src/explore_env/ou_noise.py | 66 ++-- rl_lib/src/explore_env/soft_q.py | 95 +++--- .../explore_env/tests/test_epsilon_greedy.py | 28 +- .../tests/test_exploration_manager.py | 30 +- rl_lib/src/explore_env/tests/test_soft_q.py | 28 +- rl_lib/src/gym_wrappers/obsv_wrapper.py | 18 +- rl_lib/src/models/base_models.py | 304 +++++++++--------- rl_lib/src/models/model.py | 133 ++++---- rl_lib/src/{normalizes.py => normalizers.py} | 9 +- rl_lib/src/optimizers/__init__.py | 19 +- rl_lib/src/optimizers/optimizer.py | 48 +-- rl_lib/src/replay_buffers/dict_array.py | 74 +++-- rl_lib/src/replay_buffers/priority_buffers.py | 137 ++++---- rl_lib/src/replay_buffers/random_buffers.py | 151 +++++---- rl_lib/src/replay_buffers/replay_buffer.py | 124 +++---- .../tests/test_replay_buffer.py | 169 +++++----- rl_lib/tests/dqn_config.yaml | 2 +- rl_lib/tests/drqn_config.yaml | 4 +- rl_lib/tests/first_test_ddpg.py | 119 ++++--- rl_lib/tests/first_test_dqn.py | 56 ++-- rl_lib/tests/first_test_drqn.py | 75 +++-- setup.py | 2 +- 54 files changed, 2102 insertions(+), 1715 deletions(-) delete mode 100644 rl_lib/src/algoritms/__init__.py delete mode 100644 rl_lib/src/algoritms/a2c/actor_critic.py delete mode 100644 rl_lib/src/algoritms/base_algo.py delete mode 100644 rl_lib/src/algoritms/ddpg/ddpg.py delete mode 100644 rl_lib/src/algoritms/dqn/dqn.py delete mode 100644 rl_lib/src/algoritms/drqn/drqn.py create mode 100644 rl_lib/src/algoritms/model_free/continuous_control/ddpg/__init__.py rename rl_lib/src/algoritms/{ => model_free/continuous_control}/ddpg/config.yaml (100%) create mode 100644 rl_lib/src/algoritms/model_free/continuous_control/ddpg/ddpg.py create mode 100644 rl_lib/src/algoritms/model_free/policy_gradient/a2c/actor_critic.py create mode 100644 rl_lib/src/algoritms/model_free/policy_gradient/ppo/__init__.py create mode 100644 rl_lib/src/algoritms/model_free/policy_gradient/ppo/ppo.py create mode 100644 rl_lib/src/algoritms/model_free/value_based/__init__.py create mode 100644 rl_lib/src/algoritms/model_free/value_based/base_algo.py rename rl_lib/src/algoritms/{ => model_free/value_based}/dqn/config.yaml (100%) create mode 100644 rl_lib/src/algoritms/model_free/value_based/dqn/dqn.py rename rl_lib/src/algoritms/{ => model_free/value_based}/drqn/config.yaml (100%) create mode 100644 rl_lib/src/algoritms/model_free/value_based/drqn/drqn.py create mode 100644 rl_lib/src/algoritms/model_free/value_based/simple_q.py rename rl_lib/src/algoritms/{ => model_free/value_based}/tests/config.yaml (100%) create mode 100644 rl_lib/src/algoritms/model_free/value_based/tests/test_simpe_q.py create mode 100644 rl_lib/src/algoritms/model_free/value_based/utils.py delete mode 100644 rl_lib/src/algoritms/simple_q.py 
delete mode 100644 rl_lib/src/algoritms/tests/test_simpe_q.py delete mode 100644 rl_lib/src/algoritms/utils.py rename rl_lib/src/{normalizes.py => normalizers.py} (59%) diff --git a/.gitignore b/.gitignore index b4b3b13..fcecf64 100644 --- a/.gitignore +++ b/.gitignore @@ -8,3 +8,7 @@ rl_lib/tests/models/ #jupiter notebooks *.ipynb + + +.vscode +requirements.txt \ No newline at end of file diff --git a/rl_lib/src/algoritms/__init__.py b/rl_lib/src/algoritms/__init__.py deleted file mode 100644 index 8b13789..0000000 --- a/rl_lib/src/algoritms/__init__.py +++ /dev/null @@ -1 +0,0 @@ - diff --git a/rl_lib/src/algoritms/a2c/actor_critic.py b/rl_lib/src/algoritms/a2c/actor_critic.py deleted file mode 100644 index af46b6d..0000000 --- a/rl_lib/src/algoritms/a2c/actor_critic.py +++ /dev/null @@ -1,148 +0,0 @@ -import tensorflow as tf -from tensorflow.keras import layers -from tensorflow.keras.models import clone_model - -from rl_lib.src.algoritms.dqn.dqn import DQN_Model - - -class Actor_Model(DQN_Model): - def __init__(self, config = {},**kwargs): - config['model_config'] = config['actor_model_config']['model_config'] - config['optimizer_config'] = config['actor_optimizer_config']['optimizer_config'] - super().__init__(config = config, **kwargs) - self.name = kwargs.get('name', 'error_name') + '_actor_' - - def _prediction_processing(self, inputs: tf.Tensor, **kwargs): - return kwargs['critic_model']([kwargs['state'], inputs]) - - def loss(self, target: tf.Tensor, predict: tf.Tensor) -> tf.Tensor: - """Вычисляет и возвращает потери в соответствии с функцией потерь""" - return tf.reduce_mean(predict, axis = 0) * (-1) - -class Critic_Model(DQN_Model): - def __init__(self, config = {},**kwargs): - config['model_config'] = config['critic_model_config']['model_config'] - config['optimizer_config'] = config['critic_optimizer_config']['optimizer_config'] - super().__init__(config = config, **kwargs) - self.name = kwargs.get('name', 'error_name') + '_critic_' - - def _prediction_processing(self, inputs: tf.Tensor, **kwargs): - return inputs - - @tf.function(reduce_retracing=True, - jit_compile=False, - experimental_autograph_options = tf.autograph.experimental.Feature.ALL) - def calculate_gradients(self, **kwargs) -> dict: - """ - Вычисляет градиенты, лосс, td-ошибку - - Kwargs: - dict содержащий батч, таргет, маску, опционально приоритетные веса - - Returns: - dict содержащий лоссы и td-ошибку - """ - with tf.GradientTape(persistent=False) as tape: - Q = self.model([kwargs['state'], kwargs['action']], training=True) - Q = self.prediction_processing(Q, **kwargs) - if len(Q.shape) != len(kwargs['Qtarget'].shape): Q = tf.expand_dims(Q, -1) - - td_error = kwargs['Qtarget'] - Q - loss = self.loss(kwargs['Qtarget'], Q)*kwargs.get('weights', 1.0) - E_loss = tf.reduce_mean(loss, axis=0) - gradients = tape.gradient(E_loss, self.model.trainable_variables) - loss = tf.reduce_mean(loss, axis=-1) - return {'gradients': gradients, 'loss': loss, 'td_error': td_error} - - @staticmethod - def create_model(input_shape: tuple, action_space: int) -> tf.keras.Model: - """Создает модель tf.keras.Model, архитектура DQN""" - input_layer = layers.Input(shape=input_shape, ) - action_layer = layers.Input(shape=action_space, ) - concat = layers.Concatenate()((input_layer, action_layer)) - flatten = layers.Flatten()(concat) - dence_layer1 = layers.Dense(256, activation='relu')(flatten) - dence_layer2 = layers.Dense(256, activation='relu')(dence_layer1) - dence_out = layers.Dense(action_space, activation=None)(dence_layer2) - - 
return tf.keras.Model(inputs=[input_layer, action_layer], outputs=dence_out) - - @staticmethod - def create_model_with_conv(input_shape: tuple, action_space: int) -> tf.keras.Model: - """Создает модель tf.keras.Model, архитектура DQN, начальные слои - сверточные""" - input_layer = layers.Input(shape=input_shape, ) - action_layer = layers.Input(shape=action_space, ) - cov_layer1 = layers.Conv2D(32, 7, activation='relu')(input_layer) - cov_layer2 = layers.Conv2D(64, 5, activation='relu')(cov_layer1) - cov_layer3 = layers.Conv2D(64, 3, activation='relu')(cov_layer2) - conv_out = layers.Flatten()(cov_layer3) - - concat = layers.Concatenate()((conv_out, action_layer)) - flatten = layers.Flatten()(concat) - dence_layer1 = layers.Dense(256, activation='relu')(flatten) - dence_layer2 = layers.Dense(256, activation='relu')(dence_layer1) - dence_out = layers.Dense(action_space, activation=None)(dence_layer2) - - return tf.keras.Model(inputs=[input_layer, action_layer], outputs=dence_out) - -class Actor_Critic_Model(DQN_Model): - def __init__(self, config = {},**kwargs): - config['actor_model_config']['model_config']['name'] = config['model_config']['name'] - config['actor_model_config']['model_config']['input_shape'] = config['model_config']['input_shape'] - config['actor_model_config']['model_config']['action_space'] = config['model_config']['action_space'] - - config['critic_model_config']['model_config']['name'] = config['model_config']['name'] - config['critic_model_config']['model_config']['input_shape'] = config['model_config']['input_shape'] - config['critic_model_config']['model_config']['action_space'] = config['model_config']['action_space'] - self.actor_model = Actor_Model(config=config, **kwargs) - self.critic_model = Critic_Model(config=config, **kwargs) - - def __call__(self, input: tf.Tensor) -> tf.Tensor: - return self.critic_model([input, self.actor_model(input)]) - - def update_weights(self, **kwargs): - _ = self.update_weights_actor(**kwargs) - return self.update_weights_critic(**kwargs) - - def update_weights_actor(self, **kwargs): - kwargs['critic_model'] = self.critic_model.model - loss = self.actor_model.update_weights(**kwargs) - return {'loss': loss['loss'], 'td_error': loss['td_error']} - - def update_weights_critic(self, **kwargs) -> dict: - loss = self.critic_model.update_weights(**kwargs) - return {'loss': loss['loss'], 'td_error': loss['td_error']} - - def calculate_gradients(self, **kwargs) -> dict: - kwargs['action'] = self.actor_model(kwargs['next_state']) - gradients = self.critic_model.calculate_gradients(**kwargs) - return gradients - - def get_weights(self, ) -> dict: - return { - 'actor': self.actor_model.get_weights(), - 'critic': self.critic_model.get_weights() - } - - def input_spec(self, key=None): - return self.actor_model.input_spec(key=key) - - def load(self, path): - self.actor_model.load(path) - self.critic_model.load(path) - - def save(self, path): - self.actor_model.save(path) - self.critic_model.save(path) - - def set_weights(self, weights: dict) -> None: - self.actor_model.set_weights(weights=weights['actor']) - self.critic_model.set_weights(weights=weights['critic']) - - @property - def summary(self): - self.actor_model.summary - self.critic_model.summary - - - \ No newline at end of file diff --git a/rl_lib/src/algoritms/base_algo.py b/rl_lib/src/algoritms/base_algo.py deleted file mode 100644 index e677388..0000000 --- a/rl_lib/src/algoritms/base_algo.py +++ /dev/null @@ -1,108 +0,0 @@ -import tensorflow as tf -import abc -from typing import 
Union -from copy import copy - -from ..data_saver.utils import load_default_config -from .utils import update_config -from rl_lib.src.data_saver.saver import Saver - -class Base_Algo(Saver, abc.ABC): - """Базовый абстрактный класс алгоритма. - Хранит все методы, необходимые для вычислений в каком либо алгоритме. - """ - def __init__(self, action_model: object, target_model: object, config: dict, default_config_path: str, *args, **kwargs): - self._config = load_default_config(default_config_path) - update_config(self._config, config) - - self.action_model = action_model(config = copy(self._config), algo_name = kwargs.get("algo_name", "unkown"), name = kwargs.get("name", "unkown_name") + "_action_" + config.get("model_config", {}).get("name", "")) - self.target_model = target_model(config = copy(self._config), algo_name = kwargs.get("algo_name", "unkown"), name = kwargs.get("name", "unkown_name") + "_target_" + config.get("model_config", {}).get("name", "")) - super().__init__(**self.config.get('data_saver', {}), **kwargs) - self.target_model.set_weights(self.action_model.get_weights()) - - - @property - def config(self): - return self._config - - @abc.abstractclassmethod - def calculate_new_best_action(self) -> tf.Tensor: - """Вычислеят новое лучшее действие для получения таргета""" - - @abc.abstractclassmethod - def calculate_target(self) -> dict: - """Вычисляет таргет для обучения""" - - @abc.abstractclassmethod - def get_action(self, observation) -> float: - """Возвращает действие на основе наблюдения с учетом исследования""" - - @abc.abstractclassmethod - def get_test_action(self, observation) -> float: - """Возвращает действие на основе наблюдения без исследования""" - - @abc.abstractclassmethod - def get_gradients(self) -> tf.Tensor: - """Вычисляет градиенты и возвращает их""" - - @abc.abstractclassmethod - def load(self, path) -> None: - """Загружает алгоритм""" - - @abc.abstractclassmethod - def reset(self) -> None: - """Сбрасывает внутренние данные модели""" - - @abc.abstractclassmethod - def _train_step(self) -> dict: - """Вспомогательная train_step""" - - @abc.abstractclassmethod - def train_step(self) -> dict: - """Вычисляет полный обучающий шаг""" - - @abc.abstractclassmethod - def save(self, path) -> None: - """Сохраняет алгоритм""" - - @abc.abstractclassmethod - def summary(self) -> None: - """Выводит архитектуру модели""" - - @tf.function(reduce_retracing=None, jit_compile=None, experimental_autograph_options=None) - def _copy_weights(self, action_model_weights: list, target_model_weights: list, tau: float) -> tf.constant: - """Копирует веса из модели действия в целевую модель""" - for a_w, t_w in zip(action_model_weights, target_model_weights): - new_weights = tf.add(tf.multiply(tau, a_w), tf.multiply((1-tau), t_w)) - t_w.assign(tf.identity(new_weights)) - return tf.constant(1) - - def copy_weights(self) -> tf.constant: - """Копирует веса из модели действия в целевую модель""" - res = self._copy_weights(self.action_model.weights, self.target_model.weights, self.tau) - return res - - @tf.function(reduce_retracing=True, - jit_compile=True, - experimental_autograph_options = tf.autograph.experimental.Feature.ALL) - def sample_action(self, state: Union[tf.Tensor, tuple]) -> Union[tf.Tensor, list]: - """Возвращает предсказания модели на основе текущих наблюдений""" - predict = self.action_model(state) - if isinstance(predict, list): - return self.squeeze_predict(predict[0]), predict[1], predict[2] - return self.squeeze_predict(predict) - - @tf.function(reduce_retracing=None, 
jit_compile=None, experimental_autograph_options=None) - def set_weights(self, target_weights: list) -> tf.constant: - """Устанавливает переданные как аргумент веса в основную сеть""" - for a_w, t_w in zip(self.action_model.weights, target_weights): - a_w.assign(tf.identity(t_w)) - return tf.constant(1) - - @staticmethod - def squeeze_predict(predict) -> tf.Tensor: - """Удаляет единичные измерения из предсказаний""" - while len(predict.shape)>=1 and predict.shape[0] == 1: - predict = tf.squeeze(predict, axis=0) - return predict - diff --git a/rl_lib/src/algoritms/ddpg/ddpg.py b/rl_lib/src/algoritms/ddpg/ddpg.py deleted file mode 100644 index b21701f..0000000 --- a/rl_lib/src/algoritms/ddpg/ddpg.py +++ /dev/null @@ -1,63 +0,0 @@ -from typing import Any -import tensorflow as tf -from tensorflow.keras import layers - -from rl_lib.src.models.model import Model -from rl_lib.src.algoritms.simple_q import SimpleQ -from rl_lib.src.algoritms.a2c.actor_critic import Actor_Critic_Model - - - -class DDPG_Model(Actor_Critic_Model): - def __init__(self, config = {},**kwargs): - super().__init__(config=config, **kwargs) - -class DDPG(SimpleQ): - def __init__(self, config): - self.actor_tau = config['actor_model_config']['model_config']['tau'] - self.critic_tau = config['critic_model_config']['model_config']['tau'] - super().__init__(DDPG_Model, DDPG_Model, config, default_config_path=__file__, algo_name = "DDPG_Model", name = "DDPG_Model_" + config.get('model_config','').get('name','')) - - def _prediction_processing(self, input_data): - pass - - def _update_next_state(self, state, action): - pass - - def initial_state(self): - pass - - def get_batch(self, ): - batch = super().get_batch() - batch['reward'] = tf.reshape(batch['reward'], (self.batch_size, 1)) - batch['done'] = tf.reshape(batch['done'], (self.batch_size, 1)) - return batch - - def get_best_action(self, Qaction, Qtarget): - return Qtarget - - def _train_step(self, **batch) -> dict: - """Вспомогательная train_step""" - batch = self.choice_model_for_double_calculates(**batch) - batch['batch_dims'] = self.batch_dims - if self.priority: batch['weights'] = tf.expand_dims(batch['weights'], -1) - if batch['p_double'] > 0.5: - self.action_model.update_weights_actor(**batch) - return self.action_model.update_weights_critic(**batch) - else: - self.target_model.update_weights_actor(**batch) - return self.target_model.update_weights_critic(**batch) - - def copy_weights(self) -> tf.constant: - """Копирует веса из модели действия в целевую модель""" - _ = self._copy_weights(self.action_model.actor_model.weights, self.target_model.actor_model.weights, self.actor_tau) - _ = self._copy_weights(self.action_model.critic_model.weights, self.target_model.critic_model.weights, self.critic_tau) - return tf.constant(1) - - @tf.function(reduce_retracing=True, - jit_compile=True, - experimental_autograph_options = tf.autograph.experimental.Feature.ALL) - def sample_action(self, state: tf.Tensor) -> tf.Tensor: - """Возвращает предсказания модели на основе текущих наблюдений""" - predict = self.action_model.actor_model(state) - return self.squeeze_predict(predict) \ No newline at end of file diff --git a/rl_lib/src/algoritms/dqn/dqn.py b/rl_lib/src/algoritms/dqn/dqn.py deleted file mode 100644 index c42f4fa..0000000 --- a/rl_lib/src/algoritms/dqn/dqn.py +++ /dev/null @@ -1,68 +0,0 @@ -import tensorflow as tf -from tensorflow.keras import layers - -from rl_lib.src.models.model import Model -from rl_lib.src.algoritms.simple_q import SimpleQ - -class DQN_Model(Model): - 
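# Sketch of the symmetric "double network" choice used in _train_step above:
# a uniform draw p_double decides whether the action network or the target
# network is trained on this batch, so both networks keep learning.  The
# update function below is a placeholder, not the library's API.
import tensorflow as tf

def update(model_name: str) -> str:
    # stand-in for update_weights_actor/update_weights_critic
    return f"updated {model_name}"

p_double = tf.random.uniform((1,), minval=0.0, maxval=1.0)
result = update("action_model") if p_double > 0.5 else update("target_model")
print(float(p_double), result)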
def __init__(self, config = {},**kwargs): - super().__init__(model_config = config.get('model_config', {}), config = config, **kwargs) - - def _prediction_processing(self, inputs: tf.Tensor, **kwargs): - mask = self.make_mask(tf.cast(kwargs['action'], dtype = tf.int32)) - if len(inputs.shape) != len(mask.shape): mask = tf.expand_dims(mask, -1) - return tf.reduce_sum(tf.multiply(inputs, mask), axis=kwargs['batch_dims']) - - def loss(self, target: tf.Tensor, predict: tf.Tensor) -> tf.Tensor: - """Вычисляет и возвращает потери в соответствии с функцией потерь""" - return tf.math.squared_difference(target, predict) - - def make_mask(self, action) -> tf.Tensor: - """Создает маску по действиям """ - return tf.one_hot(action, self.output_spec()[-1]) - - def _update_next_state(self, state, action): - pass - - def initial_state(self): - pass - - @staticmethod - def create_model(input_shape: tuple, action_space: int) -> tf.keras.Model: - """Создает модель tf.keras.Model, архитектура DQN""" - input_layer = layers.Input(shape=input_shape, ) - dence_layer1 = layers.Dense(256, activation='relu')(input_layer) - dence_layer2 = layers.Dense(256, activation='relu')(dence_layer1) - dence_out = layers.Dense(action_space, activation=None)(dence_layer2) - - return tf.keras.Model(inputs=input_layer, outputs=dence_out) - - @staticmethod - def create_model_with_conv(input_shape: tuple, action_space: int) -> tf.keras.Model: - """Создает модель tf.keras.Model, архитектура DQN, начальные слои - сверточные""" - input_layer = layers.Input(shape=input_shape, ) - cov_layer1 = layers.Conv2D(32, 7, activation='relu')(input_layer) - cov_layer2 = layers.Conv2D(64, 5, activation='relu')(cov_layer1) - cov_layer3 = layers.Conv2D(64, 3, activation='relu')(cov_layer2) - conv_out = layers.Flatten()(cov_layer3) - - dence_layer1 = layers.Dense(256, activation='relu')(conv_out) - dence_layer2 = layers.Dense(256, activation='relu')(dence_layer1) - dence_out = layers.Dense(action_space, activation=None)(dence_layer2) - - return tf.keras.Model(inputs=input_layer, outputs=dence_out) - -class DQN(SimpleQ): - def __init__(self, config): - super().__init__(DQN_Model, DQN_Model, config, default_config_path=__file__, algo_name = "DQN", name = "DQN_Model_" + config.get('model_config','').get('name','')) - - def _prediction_processing(self, input_data): - pass - - def _update_next_state(self, state, action): - pass - - def initial_state(self): - pass - - diff --git a/rl_lib/src/algoritms/drqn/drqn.py b/rl_lib/src/algoritms/drqn/drqn.py deleted file mode 100644 index 5d9294c..0000000 --- a/rl_lib/src/algoritms/drqn/drqn.py +++ /dev/null @@ -1,138 +0,0 @@ -import tensorflow as tf -from tensorflow.keras import layers -import numpy as np - -from rl_lib.src.models.model import Model -from rl_lib.src.algoritms.simple_q import SimpleQ - -class DRQN_Model(Model): - def __init__(self, config = {},**kwargs): - super().__init__(model_config = config.get('model_config', {}), config = config, default_config_path=__file__, **kwargs) - self.h_t, self.c_t, self.new_h_t, self.new_c_t = None, None, None, None - self.lstm_size = config['model_config'].get("lstm_size", 64) - - def __call__(self, inputs: tf.Tensor) -> tf.Tensor: - return super().__call__([inputs, self.h_t, self.c_t] if not isinstance(inputs, list) else inputs) - - def _initial_model(self): - input_shape = self._config['model_config']["input_shape"] - action_space = self._config['model_config']["action_space"] - if len(input_shape) == 1: - return self.create_model(input_shape, action_space, 
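# Sketch of the action masking done by make_mask/_prediction_processing above:
# a one-hot mask over the action dimension picks out Q(s, a) for the action
# that was actually taken.  Numbers are illustrative.
import tensorflow as tf

q_values = tf.constant([[1.0, 2.0, 3.0],
                        [4.0, 5.0, 6.0]])        # Q(s, .) for a batch of 2
actions = tf.constant([2, 0])                    # actions actually taken
mask = tf.one_hot(actions, depth=q_values.shape[-1])
q_taken = tf.reduce_sum(q_values * mask, axis=1)
print(q_taken.numpy())                           # [3. 4.]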
self.lstm_size) - else: - return self.create_model_with_conv(input_shape, action_space, self.lstm_size) - - def initial_state(self): - """Инициализирует внутреннее состояние рекуррентной сети""" - self.h_t = tf.zeros((1, self.lstm_size),dtype=tf.float32) - self.c_t = self.h_t - - def get_states(self) -> tuple: - """Возвращает кортеж внутренних состояний реккурентной сети""" - return tf.squeeze(self.h_t.numpy()), tf.squeeze(self.c_t.numpy()) - - def loss(self, target: tf.Tensor, predict: tf.Tensor) -> tf.Tensor: - """Вычисляет и возвращает потери в соответствии с функцией потерь""" - return tf.math.squared_difference(target, predict) - - def make_mask(self, action) -> tf.Tensor: - """Создает маску по действиям """ - return tf.one_hot(action, self.output_spec()[-1]) - - def _prediction_processing(self, inputs: tf.Tensor, **kwargs): - mask = self.make_mask(kwargs['action']) - while len(inputs.shape) < len(mask.shape): mask = tf.expand_dims(mask, -1) - return tf.reduce_sum(tf.multiply(inputs, mask), axis=kwargs['batch_dims'])[:, kwargs['recurrent_skip']:] - - - def _update_next_state(self): - """Обновляет внутреннее состояние рекуррентной сети""" - self.h_t, self.c_t = self.new_h_t, self.new_c_t - - @staticmethod - def create_model(input_shape: tuple, action_space: int, lstm_size: int) -> tf.keras.Model: - """Создает модель tf.keras.Model, архитектура DRQN""" - input_layer = layers.Input(shape=input_shape, ) - h_t_input = layers.Input(shape=(lstm_size, ), ) - c_t_input = layers.Input(shape=(lstm_size, ), ) - - lstm = layers.LSTM(lstm_size, activation='tanh', recurrent_activation='sigmoid', return_sequences = True, - return_state=True, stateful = False)(input_layer, initial_state = [h_t_input, c_t_input]) - dence_layer1 = layers.Dense(256, activation='relu')(input_layer) - dence_layer2 = layers.Dense(128, activation='relu')(dence_layer1) - dence_out = layers.Dense(action_space, activation=None)(dence_layer2) - - return tf.keras.Model(inputs=[input_layer, h_t_input, c_t_input], outputs=[dence_out, lstm[1], lstm[2]]) - - @staticmethod - def create_model_with_conv(input_shape: tuple, action_space: int, lstm_size: int) -> tf.keras.Model: - """Создает модель tf.keras.Model, архитектура DRQN, начальные слои - сверточные""" - input_layer = layers.Input(shape=input_shape, ) - h_t_input = layers.Input(shape=(lstm_size, ), ) - c_t_input = layers.Input(shape=(lstm_size, ), ) - - cov_layer1 = layers.Conv2D(32, 7, activation='relu')(input_layer) - cov_layer2 = layers.Conv2D(64, 5, activation='relu')(cov_layer1) - cov_layer3 = layers.Conv2D(64, 3, activation='relu')(cov_layer2) - conv_out = layers.Flatten()(cov_layer3) - lstm = layers.LSTM(lstm_size, activation='tanh', recurrent_activation='sigmoid', return_sequences = True, - return_state=True, stateful = False)(conv_out, initial_state = [h_t_input, c_t_input]) - dence_layer1 = layers.Dense(256, activation='relu')(lstm[0]) - dence_layer2 = layers.Dense(128, activation='relu')(dence_layer1) - dence_out = layers.Dense(action_space, activation=None)(dence_layer2) - - return tf.keras.Model(inputs=[input_layer, h_t_input, c_t_input], outputs=[dence_out, lstm[1], lstm[2]]) - -class DRQN(SimpleQ): - def __init__(self, config): - super().__init__(DRQN_Model, DRQN_Model, config, default_config_path=__file__, algo_name = "DRQN", name = "DRQN_Model_" + config.get('model_config','').get('name','')) - - self.initial_state() - self.recurrent_skip = self.config['buffer_config']['recurrent_skip'] - self.trace_length = self.config['buffer_config']['trace_length'] - 
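# Sketch of the recurrent-state plumbing used by DRQN_Model above: the hidden
# and cell states are explicit model inputs/outputs, so the caller can carry
# them between steps (initial_state() resets them to zeros).  Here the dense
# head is fed from the LSTM sequence output, which is assumed to be the
# intended wiring; sizes are illustrative.
import tensorflow as tf
from tensorflow.keras import layers

obs_dim, lstm_size, n_actions = 4, 8, 2

obs_in = layers.Input(shape=(None, obs_dim))        # (time, features)
h_in = layers.Input(shape=(lstm_size,))
c_in = layers.Input(shape=(lstm_size,))
seq, h_out, c_out = layers.LSTM(lstm_size, return_sequences=True,
                                return_state=True)(obs_in,
                                                   initial_state=[h_in, c_in])
q_out = layers.Dense(n_actions)(layers.Dense(32, activation='relu')(seq))
model = tf.keras.Model([obs_in, h_in, c_in], [q_out, h_out, c_out])

h = tf.zeros((1, lstm_size))
c = tf.zeros((1, lstm_size))
obs = tf.random.normal((1, 1, obs_dim))             # one step of one episode
q, h, c = model([obs, h, c])                        # carry (h, c) forward
print(q.shape)                                      # (1, 1, 2)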
self.recurrent = True - self.batch_dims = 2 - - def add(self, data: tuple, priority = None) -> None: - """ - Добавляет переходы в буфер - Аргументы: - data: tuple(state, action, reward, done, next_state) - priority: np.array (только для приоритетных буферов) - """ - super().add((*data, *self.action_model.get_states()), priority) - self._update_next_state() - - def initial_state(self): - """Сбравсывает внутренне состояние lstm""" - self.action_model.initial_state() - - def _get_action(self, observation: tf.Tensor) -> tf.Tensor: - """Возвращает ценность дейтсвий Q(s,a) всех действий на основе наблюдения""" - predict = super()._get_action(observation) - action, self.action_model.new_h_t, self.action_model.new_c_t = predict - return action - - def get_test_action(self, observation: tf.Tensor) -> float: - action = super().get_test_action(observation) - self._update_next_state() - return action - - def get_batch(self, ): - batch = super().get_batch() - - new_h_t, new_c_t = tf.squeeze(batch['h_t'][:, 1:],axis=1), tf.squeeze(batch['c_t'][:, 1:],axis=1) - h_t, c_t = tf.squeeze(batch['h_t'][:, :-1],axis=1), tf.squeeze(batch['c_t'][:, :-1],axis=1) - batch['state'] = [batch['state'], h_t, c_t] - batch['next_state'] = [batch['next_state'], new_h_t, new_c_t] - batch['recurrent_skip'] = self.recurrent_skip - batch['trace_length'] = self.trace_length - - if self.priority: batch['weights'] = np.repeat(np.expand_dims(batch['weights'], -1), self.trace_length-self.recurrent_skip, axis=1) - return batch - - def _update_next_state(self): - """Обновляет внутреннее состояние lstm новым состоянием lstm""" - self.action_model._update_next_state() - - diff --git a/rl_lib/src/algoritms/model_free/continuous_control/ddpg/__init__.py b/rl_lib/src/algoritms/model_free/continuous_control/ddpg/__init__.py new file mode 100644 index 0000000..20c4cf6 --- /dev/null +++ b/rl_lib/src/algoritms/model_free/continuous_control/ddpg/__init__.py @@ -0,0 +1 @@ +from .ddpg import DDPG, DDPG_Model diff --git a/rl_lib/src/algoritms/ddpg/config.yaml b/rl_lib/src/algoritms/model_free/continuous_control/ddpg/config.yaml similarity index 100% rename from rl_lib/src/algoritms/ddpg/config.yaml rename to rl_lib/src/algoritms/model_free/continuous_control/ddpg/config.yaml diff --git a/rl_lib/src/algoritms/model_free/continuous_control/ddpg/ddpg.py b/rl_lib/src/algoritms/model_free/continuous_control/ddpg/ddpg.py new file mode 100644 index 0000000..a8f303f --- /dev/null +++ b/rl_lib/src/algoritms/model_free/continuous_control/ddpg/ddpg.py @@ -0,0 +1,72 @@ +from typing import Any + +import tensorflow as tf +from tensorflow.keras import layers + +from ...policy_gradient.a2c.actor_critic import Actor_Critic_Model +from ...value_based.simple_q import SimpleQ + + +class DDPG_Model(Actor_Critic_Model): + def __init__(self, config={}, **kwargs): + super().__init__(config=config, **kwargs) + + +class DDPG(SimpleQ): + def __init__(self, config): + self.actor_tau = config['actor_model_config']['model_config']['tau'] + self.critic_tau = config['critic_model_config']['model_config']['tau'] + super().__init__(DDPG_Model, DDPG_Model, + config, default_config_path=__file__, + algo_name="DDPG_Model", + name=("DDPG_Model_" + + config.get('model_config', '').get('name', ''))) + + def _prediction_processing(self, input_data): + pass + + def _update_next_state(self, state, action): + pass + + def initial_state(self): + pass + + def get_batch(self, ): + batch = super().get_batch() + batch['reward'] = tf.reshape(batch['reward'], (self.batch_size, 1)) + batch['done'] 
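# Sketch of why DDPG.get_batch below reshapes reward/done to (batch_size, 1):
# the critic returns one value per sample, so the target
# r + gamma * Q' * (1 - done) has to broadcast element-wise against a column
# vector.  Numbers are illustrative.
import tensorflow as tf

gamma = 0.99
q_next = tf.constant([[1.0], [2.0]])               # critic output, shape (2, 1)
reward = tf.reshape(tf.constant([0.5, 1.0]), (2, 1))
done = tf.reshape(tf.constant([0.0, 1.0]), (2, 1))
target = reward + gamma * q_next * (1.0 - done)
print(target.numpy())                              # [[1.49], [1.  ]]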
= tf.reshape(batch['done'], (self.batch_size, 1)) + return batch + + def get_best_action(self, Qaction, Qtarget): + return Qtarget + + def _train_step(self, **batch) -> dict: + """Вспомогательная train_step""" + batch = self.choice_model_for_double_calculates(**batch) + batch['batch_dims'] = self.batch_dims + if self.priority: + batch['weights'] = tf.expand_dims(batch['weights'], -1) + if batch['p_double'] > 0.5: + self.action_model.update_weights_actor(**batch) + return self.action_model.update_weights_critic(**batch) + else: + self.target_model.update_weights_actor(**batch) + return self.target_model.update_weights_critic(**batch) + + def copy_weights(self) -> tf.constant: + """Копирует веса из модели действия в целевую модель""" + _ = self._copy_weights(self.action_model.actor_model.weights, + self.target_model.actor_model.weights, + self.actor_tau) + _ = self._copy_weights(self.action_model.critic_model.weights, + self.target_model.critic_model.weights, + self.critic_tau) + return tf.constant(1) + + @tf.function(reduce_retracing=True, + jit_compile=True, + experimental_autograph_options=tf.autograph.experimental.Feature.ALL) + def sample_action(self, state: tf.Tensor) -> tf.Tensor: + """Возвращает предсказания модели на основе текущих наблюдений""" + predict = self.action_model.actor_model(state) + return self.squeeze_predict(predict) diff --git a/rl_lib/src/algoritms/model_free/policy_gradient/a2c/actor_critic.py b/rl_lib/src/algoritms/model_free/policy_gradient/a2c/actor_critic.py new file mode 100644 index 0000000..f2b5eb2 --- /dev/null +++ b/rl_lib/src/algoritms/model_free/policy_gradient/a2c/actor_critic.py @@ -0,0 +1,155 @@ +import tensorflow as tf +from tensorflow.keras import layers + +from rl_lib.src.algoritms.model_free.value_based import DQN_Model + + +class Actor_Model(DQN_Model): + def __init__(self, config={}, **kwargs): + config['model_config'] = config['actor_model_config']['model_config'] + config['optimizer_config'] = config['actor_optimizer_config']['optimizer_config'] + super().__init__(config=config, **kwargs) + self.name = kwargs.get('name', 'error_name') + '_actor_' + + def _prediction_processing(self, inputs: tf.Tensor, **kwargs): + return kwargs['critic_model']([kwargs['state'], inputs]) + + def loss(self, target: tf.Tensor, predict: tf.Tensor) -> tf.Tensor: + """Вычисляет и возвращает потери в соответствии с функцией потерь""" + return tf.reduce_mean(predict, axis=0) * (-1) + + +class Critic_Model(DQN_Model): + def __init__(self, config={}, **kwargs): + config['model_config'] = config['critic_model_config']['model_config'] + config['optimizer_config'] = config['critic_optimizer_config']['optimizer_config'] + super().__init__(config=config, **kwargs) + self.name = kwargs.get('name', 'error_name') + '_critic_' + + def _prediction_processing(self, inputs: tf.Tensor, **kwargs): + return inputs + + @tf.function(reduce_retracing=True, + jit_compile=False, + experimental_autograph_options=tf.autograph.experimental.Feature.ALL) + def calculate_gradients(self, **kwargs) -> dict: + """ + Вычисляет градиенты, лосс, td-ошибку + + Kwargs: + dict содержащий батч, таргет, маску, опционально приоритетные веса + + Returns: + dict содержащий лоссы и td-ошибку + """ + with tf.GradientTape(persistent=False) as tape: + Q = self.model([kwargs['state'], kwargs['action']], training=True) + Q = self.prediction_processing(Q, **kwargs) + if len(Q.shape) != len(kwargs['Qtarget'].shape): + Q = tf.expand_dims(Q, -1) + + td_error = kwargs['Qtarget'] - Q + loss = self.loss(kwargs['Qtarget'], 
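# Sketch of the actor update implied by Actor_Model above: its "loss" is the
# negated critic value of the actor's own action, so gradient descent on it
# pushes the actor toward actions the critic rates higher.  The tiny models
# below are illustrative stand-ins, not the library's networks.
import tensorflow as tf
from tensorflow.keras import layers

actor = tf.keras.Sequential([layers.Dense(1, activation='tanh',
                                          input_shape=(3,))])
critic_in = layers.Input(shape=(4,))                 # state (3) + action (1)
critic = tf.keras.Model(critic_in, layers.Dense(1)(critic_in))

state = tf.random.normal((8, 3))
with tf.GradientTape() as tape:
    action = actor(state, training=True)
    q = critic(tf.concat([state, action], axis=-1))
    actor_loss = -tf.reduce_mean(q)                  # maximize Q(s, actor(s))
grads = tape.gradient(actor_loss, actor.trainable_variables)
print([g.shape for g in grads])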
Q)*kwargs.get('weights', 1.0) + E_loss = tf.reduce_mean(loss, axis=0) + gradients = tape.gradient(E_loss, self.model.trainable_variables) + loss = tf.reduce_mean(loss, axis=-1) + return {'gradients': gradients, 'loss': loss, 'td_error': td_error} + + @staticmethod + def create_model(input_shape: tuple, action_space: int) -> tf.keras.Model: + """Создает модель tf.keras.Model, архитектура DQN""" + input_layer = layers.Input(shape=input_shape, ) + action_layer = layers.Input(shape=action_space, ) + concat = layers.Concatenate()((input_layer, action_layer)) + flatten = layers.Flatten()(concat) + dence_layer1 = layers.Dense(256, activation='relu')(flatten) + dence_layer2 = layers.Dense(256, activation='relu')(dence_layer1) + dence_out = layers.Dense(action_space, activation=None)(dence_layer2) + + return tf.keras.Model( + inputs=[input_layer, action_layer], + outputs=dence_out + ) + + @staticmethod + def create_model_with_conv(input_shape: tuple, + action_space: int) -> tf.keras.Model: + """Создает модель tf.keras.Model, архитектура DQN, + начальные слои - сверточные""" + input_layer = layers.Input(shape=input_shape, ) + action_layer = layers.Input(shape=action_space, ) + cov_layer1 = layers.Conv2D(32, 7, activation='relu')(input_layer) + cov_layer2 = layers.Conv2D(64, 5, activation='relu')(cov_layer1) + cov_layer3 = layers.Conv2D(64, 3, activation='relu')(cov_layer2) + conv_out = layers.Flatten()(cov_layer3) + + concat = layers.Concatenate()((conv_out, action_layer)) + flatten = layers.Flatten()(concat) + dence_layer1 = layers.Dense(256, activation='relu')(flatten) + dence_layer2 = layers.Dense(256, activation='relu')(dence_layer1) + dence_out = layers.Dense(action_space, activation=None)(dence_layer2) + + return tf.keras.Model( + inputs=[input_layer, action_layer], + outputs=dence_out + ) + + +class Actor_Critic_Model(DQN_Model): + def __init__(self, config={}, **kwargs): + config['actor_model_config']['model_config']['name'] = config['model_config']['name'] + config['actor_model_config']['model_config']['input_shape'] = config['model_config']['input_shape'] + config['actor_model_config']['model_config']['action_space'] = config['model_config']['action_space'] + + config['critic_model_config']['model_config']['name'] = config['model_config']['name'] + config['critic_model_config']['model_config']['input_shape'] = config['model_config']['input_shape'] + config['critic_model_config']['model_config']['action_space'] = config['model_config']['action_space'] + self.actor_model = Actor_Model(config=config, **kwargs) + self.critic_model = Critic_Model(config=config, **kwargs) + + def __call__(self, input: tf.Tensor) -> tf.Tensor: + return self.critic_model([input, self.actor_model(input)]) + + def update_weights(self, **kwargs): + _ = self.update_weights_actor(**kwargs) + return self.update_weights_critic(**kwargs) + + def update_weights_actor(self, **kwargs): + kwargs['critic_model'] = self.critic_model.model + loss = self.actor_model.update_weights(**kwargs) + return {'loss': loss['loss'], 'td_error': loss['td_error']} + + def update_weights_critic(self, **kwargs) -> dict: + loss = self.critic_model.update_weights(**kwargs) + return {'loss': loss['loss'], 'td_error': loss['td_error']} + + def calculate_gradients(self, **kwargs) -> dict: + kwargs['action'] = self.actor_model(kwargs['next_state']) + gradients = self.critic_model.calculate_gradients(**kwargs) + return gradients + + def get_weights(self, ) -> dict: + return { + 'actor': self.actor_model.get_weights(), + 'critic': 
self.critic_model.get_weights() + } + + def input_spec(self, key=None): + return self.actor_model.input_spec(key=key) + + def load(self, path): + self.actor_model.load(path) + self.critic_model.load(path) + + def save(self, path): + self.actor_model.save(path) + self.critic_model.save(path) + + def set_weights(self, weights: dict) -> None: + self.actor_model.set_weights(weights=weights['actor']) + self.critic_model.set_weights(weights=weights['critic']) + + @property + def summary(self): + self.actor_model.summary + self.critic_model.summary diff --git a/rl_lib/src/algoritms/model_free/policy_gradient/ppo/__init__.py b/rl_lib/src/algoritms/model_free/policy_gradient/ppo/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/rl_lib/src/algoritms/model_free/policy_gradient/ppo/ppo.py b/rl_lib/src/algoritms/model_free/policy_gradient/ppo/ppo.py new file mode 100644 index 0000000..e69de29 diff --git a/rl_lib/src/algoritms/model_free/value_based/__init__.py b/rl_lib/src/algoritms/model_free/value_based/__init__.py new file mode 100644 index 0000000..1935861 --- /dev/null +++ b/rl_lib/src/algoritms/model_free/value_based/__init__.py @@ -0,0 +1,2 @@ +from .dqn.dqn import DQN, DQN_Model +from .drqn.drqn import DRQN, DRQN_Model diff --git a/rl_lib/src/algoritms/model_free/value_based/base_algo.py b/rl_lib/src/algoritms/model_free/value_based/base_algo.py new file mode 100644 index 0000000..88707f9 --- /dev/null +++ b/rl_lib/src/algoritms/model_free/value_based/base_algo.py @@ -0,0 +1,136 @@ +import abc +from copy import copy +from typing import Union + +import tensorflow as tf + +from rl_lib.src.data_saver.saver import Saver + +from rl_lib.src.data_saver.utils import load_default_config +from .utils import update_config + + +class Base_Algo(Saver, abc.ABC): + """Базовый абстрактный класс алгоритма. + Хранит все методы, необходимые для вычислений в каком либо алгоритме. 
+ """ + + def __init__(self, action_model: object, + target_model: object, + config: dict, + default_config_path: str, + *args, **kwargs): + self._config = load_default_config(default_config_path) + update_config(self._config, config) + + self.action_model = action_model( + config=copy(self._config), + algo_name=kwargs.get("algo_name", "unkown"), + name=(kwargs.get("name", "unkown_name") + + "_action_" + + config.get("model_config", {}).get("name", "")) + ) + self.target_model = target_model( + config=copy(self._config), + algo_name=kwargs.get("algo_name", "unkown"), + name=(kwargs.get("name", "unkown_name") + + "_target_" + + config.get("model_config", {}).get("name", "")) + ) + super().__init__(**self.config.get('data_saver', {}), **kwargs) + self.target_model.set_weights(self.action_model.get_weights()) + + @property + def config(self): + return self._config + + @abc.abstractclassmethod + def calculate_new_best_action(self) -> tf.Tensor: + """Вычислеят новое лучшее действие для получения таргета""" + + @abc.abstractclassmethod + def calculate_target(self) -> dict: + """Вычисляет таргет для обучения""" + + @abc.abstractclassmethod + def get_action(self, observation) -> float: + """Возвращает действие на основе наблюдения с учетом исследования""" + + @abc.abstractclassmethod + def get_test_action(self, observation) -> float: + """Возвращает действие на основе наблюдения без исследования""" + + @abc.abstractclassmethod + def get_gradients(self) -> tf.Tensor: + """Вычисляет градиенты и возвращает их""" + + @abc.abstractclassmethod + def load(self, path) -> None: + """Загружает алгоритм""" + + @abc.abstractclassmethod + def reset(self) -> None: + """Сбрасывает внутренние данные модели""" + + @abc.abstractclassmethod + def _train_step(self) -> dict: + """Вспомогательная train_step""" + + @abc.abstractclassmethod + def train_step(self) -> dict: + """Вычисляет полный обучающий шаг""" + + @abc.abstractclassmethod + def save(self, path) -> None: + """Сохраняет алгоритм""" + + @abc.abstractclassmethod + def summary(self) -> None: + """Выводит архитектуру модели""" + + @tf.function(reduce_retracing=None, + jit_compile=None, + experimental_autograph_options=None) + def _copy_weights(self, action_model_weights: list, + target_model_weights: list, + tau: float) -> tf.constant: + """Копирует веса из модели действия в целевую модель""" + for a_w, t_w in zip(action_model_weights, target_model_weights): + new_weights = tf.add(tf.multiply(tau, a_w), + tf.multiply((1-tau), t_w)) + t_w.assign(tf.identity(new_weights)) + return tf.constant(1) + + def copy_weights(self) -> tf.constant: + """Копирует веса из модели действия в целевую модель""" + res = self._copy_weights( + self.action_model.weights, self.target_model.weights, self.tau) + return res + + @tf.function(reduce_retracing=True, + jit_compile=True, + experimental_autograph_options=tf.autograph.experimental.Feature.ALL) + def sample_action(self, + state: Union[tf.Tensor, tuple] + ) -> Union[tf.Tensor, list]: + """Возвращает предсказания модели на основе текущих наблюдений""" + predict = self.action_model(state) + if isinstance(predict, list): + return self.squeeze_predict(predict[0]), predict[1], predict[2] + return self.squeeze_predict(predict) + + @tf.function(reduce_retracing=None, + jit_compile=None, + experimental_autograph_options=None) + def set_weights(self, target_weights: list) -> tf.constant: + """Устанавливает переданные как аргумент веса в основную сеть""" + for a_w, t_w in zip(self.action_model.weights, target_weights): + 
a_w.assign(tf.identity(t_w)) + return tf.constant(1) + + @staticmethod + def squeeze_predict(predict) -> tf.Tensor: + """Удаляет единичные измерения из предсказаний""" + while len(predict.shape) >= 1 and predict.shape[0] == 1: + predict = tf.squeeze(predict, axis=0) + return predict diff --git a/rl_lib/src/algoritms/dqn/config.yaml b/rl_lib/src/algoritms/model_free/value_based/dqn/config.yaml similarity index 100% rename from rl_lib/src/algoritms/dqn/config.yaml rename to rl_lib/src/algoritms/model_free/value_based/dqn/config.yaml diff --git a/rl_lib/src/algoritms/model_free/value_based/dqn/dqn.py b/rl_lib/src/algoritms/model_free/value_based/dqn/dqn.py new file mode 100644 index 0000000..79683fc --- /dev/null +++ b/rl_lib/src/algoritms/model_free/value_based/dqn/dqn.py @@ -0,0 +1,84 @@ +import tensorflow as tf +from tensorflow.keras import layers + +from rl_lib.src.models.model import Model + +from ..simple_q import SimpleQ + + +class DQN_Model(Model): + def __init__(self, config={}, **kwargs): + super().__init__( + model_config=config.get('model_config', {}), + config=config, + **kwargs) + + def _prediction_processing(self, inputs: tf.Tensor, **kwargs): + mask = self.make_mask(tf.cast(kwargs['action'], dtype=tf.int32)) + if len(inputs.shape) != len(mask.shape): + mask = tf.expand_dims(mask, -1) + return tf.reduce_sum( + tf.multiply(inputs, mask), + axis=kwargs['batch_dims'] + ) + + def loss(self, target: tf.Tensor, predict: tf.Tensor) -> tf.Tensor: + """Вычисляет и возвращает потери в соответствии с функцией потерь""" + return tf.math.squared_difference(target, predict) + + def make_mask(self, action) -> tf.Tensor: + """Создает маску по действиям """ + return tf.one_hot(action, self.output_spec()[-1]) + + def _update_next_state(self, state, action): + pass + + def initial_state(self): + pass + + @staticmethod + def create_model(input_shape: tuple, action_space: int) -> tf.keras.Model: + """Создает модель tf.keras.Model, архитектура DQN""" + input_layer = layers.Input(shape=input_shape, ) + dence_layer1 = layers.Dense(256, activation='relu')(input_layer) + dence_layer2 = layers.Dense(256, activation='relu')(dence_layer1) + dence_out = layers.Dense(action_space, activation=None)(dence_layer2) + + return tf.keras.Model(inputs=input_layer, outputs=dence_out) + + @staticmethod + def create_model_with_conv(input_shape: tuple, + action_space: int + ) -> tf.keras.Model: + """Создает модель tf.keras.Model, архитектура DQN, + начальные слои - сверточные""" + input_layer = layers.Input(shape=input_shape, ) + cov_layer1 = layers.Conv2D(32, 7, activation='relu')(input_layer) + cov_layer2 = layers.Conv2D(64, 5, activation='relu')(cov_layer1) + cov_layer3 = layers.Conv2D(64, 3, activation='relu')(cov_layer2) + conv_out = layers.Flatten()(cov_layer3) + + dence_layer1 = layers.Dense(256, activation='relu')(conv_out) + dence_layer2 = layers.Dense(256, activation='relu')(dence_layer1) + dence_out = layers.Dense(action_space, activation=None)(dence_layer2) + + return tf.keras.Model(inputs=input_layer, outputs=dence_out) + + +class DQN(SimpleQ): + def __init__(self, config): + super().__init__(DQN_Model, DQN_Model, config, + default_config_path=__file__, + algo_name="DQN", + name=("DQN_Model_" + + config.get('model_config', '').get('name', '')) + ) + + def _prediction_processing(self, input_data): + pass + + def _update_next_state(self, state, action): + pass + + def initial_state(self): + pass diff --git a/rl_lib/src/algoritms/drqn/config.yaml b/rl_lib/src/algoritms/model_free/value_based/drqn/config.yaml 
similarity index 100% rename from rl_lib/src/algoritms/drqn/config.yaml rename to rl_lib/src/algoritms/model_free/value_based/drqn/config.yaml diff --git a/rl_lib/src/algoritms/model_free/value_based/drqn/drqn.py b/rl_lib/src/algoritms/model_free/value_based/drqn/drqn.py new file mode 100644 index 0000000..6b216ee --- /dev/null +++ b/rl_lib/src/algoritms/model_free/value_based/drqn/drqn.py @@ -0,0 +1,178 @@ +import numpy as np +import tensorflow as tf +from tensorflow.keras import layers + +from ..simple_q import SimpleQ +from rl_lib.src.models.model import Model + + +class DRQN_Model(Model): + def __init__(self, config={}, **kwargs): + super().__init__(model_config=config.get('model_config', {}), + config=config, default_config_path=__file__, + **kwargs) + self.h_t, self.c_t, self.new_h_t, self.new_c_t = None, None, None, None + self.lstm_size = config['model_config'].get("lstm_size", 64) + + def __call__(self, inputs: tf.Tensor) -> tf.Tensor: + return super().__call__( + [inputs, self.h_t, self.c_t] + if not isinstance(inputs, list) else inputs + ) + + def _initial_model(self): + input_shape = self._config['model_config']["input_shape"] + action_space = self._config['model_config']["action_space"] + if len(input_shape) == 1: + return self.create_model(input_shape, action_space, self.lstm_size) + else: + return self.create_model_with_conv(input_shape, + action_space, + self.lstm_size) + + def initial_state(self): + """Инициализирует внутреннее состояние рекуррентной сети""" + self.h_t = tf.zeros((1, self.lstm_size), dtype=tf.float32) + self.c_t = self.h_t + + def get_states(self) -> tuple: + """Возвращает кортеж внутренних состояний реккурентной сети""" + return tf.squeeze(self.h_t.numpy()), tf.squeeze(self.c_t.numpy()) + + def loss(self, target: tf.Tensor, predict: tf.Tensor) -> tf.Tensor: + """Вычисляет и возвращает потери в соответствии с функцией потерь""" + return tf.math.squared_difference(target, predict) + + def make_mask(self, action) -> tf.Tensor: + """Создает маску по действиям """ + return tf.one_hot(tf.cast(action, tf.int32), self.output_spec()[-1]) + + def _prediction_processing(self, inputs: tf.Tensor, **kwargs): + mask = self.make_mask(kwargs['action']) + while len(inputs.shape) < len(mask.shape): + mask = tf.expand_dims(mask, -1) + return tf.reduce_sum( + tf.multiply(inputs, mask), + axis=kwargs['batch_dims'])[:, kwargs['recurrent_skip']:] + + def _update_next_state(self): + """Обновляет внутреннее состояние рекуррентной сети""" + self.h_t, self.c_t = self.new_h_t, self.new_c_t + + @staticmethod + def create_model(input_shape: tuple, + action_space: int, + lstm_size: int + ) -> tf.keras.Model: + """Создает модель tf.keras.Model, архитектура DRQN""" + input_layer = layers.Input(shape=input_shape, ) + h_t_input = layers.Input(shape=(lstm_size, ), ) + c_t_input = layers.Input(shape=(lstm_size, ), ) + + lstm = layers.LSTM(lstm_size, activation='tanh', + recurrent_activation='sigmoid', + return_sequences=True, + return_state=True, stateful=False)(input_layer, + initial_state=[h_t_input, c_t_input]) + dence_layer1 = layers.Dense(256, activation='relu')(input_layer) + dence_layer2 = layers.Dense(128, activation='relu')(dence_layer1) + dence_out = layers.Dense(action_space, activation=None)(dence_layer2) + + return tf.keras.Model( + inputs=[input_layer, h_t_input, c_t_input], + outputs=[dence_out, lstm[1], lstm[2]] + ) + + @staticmethod + def create_model_with_conv(input_shape: tuple, + action_space: int, + lstm_size: int) -> tf.keras.Model: + """Создает модель tf.keras.Model, 
архитектура DRQN, + начальные слои - сверточные""" + input_layer = layers.Input(shape=input_shape, ) + h_t_input = layers.Input(shape=(lstm_size, ), ) + c_t_input = layers.Input(shape=(lstm_size, ), ) + + cov_layer1 = layers.Conv2D(32, 7, activation='relu')(input_layer) + cov_layer2 = layers.Conv2D(64, 5, activation='relu')(cov_layer1) + cov_layer3 = layers.Conv2D(64, 3, activation='relu')(cov_layer2) + conv_out = layers.Flatten()(cov_layer3) + lstm = layers.LSTM(lstm_size, activation='tanh', + recurrent_activation='sigmoid', + return_sequences=True, + return_state=True, stateful=False)(conv_out, + initial_state=[h_t_input, c_t_input]) + dence_layer1 = layers.Dense(256, activation='relu')(lstm[0]) + dence_layer2 = layers.Dense(128, activation='relu')(dence_layer1) + dence_out = layers.Dense(action_space, activation=None)(dence_layer2) + + return tf.keras.Model( + inputs=[input_layer, h_t_input, c_t_input], + outputs=[dence_out, lstm[1], lstm[2]] + ) + + +class DRQN(SimpleQ): + def __init__(self, config): + super().__init__(DRQN_Model, DRQN_Model, config, + default_config_path=__file__, + algo_name="DRQN", + name=("DRQN_Model_" + + config.get('model_config', '').get('name', '')) + ) + + self.initial_state() + self.recurrent_skip = self.config['buffer_config']['recurrent_skip'] + self.trace_length = self.config['buffer_config']['trace_length'] + self.recurrent = True + self.batch_dims = 2 + + def add(self, data: tuple, priority=None) -> None: + """ + Добавляет переходы в буфер + Аргументы: + data: tuple(state, action, reward, done, next_state) + priority: np.array (только для приоритетных буферов) + """ + super().add((*data, *self.action_model.get_states()), priority) + self._update_next_state() + + def initial_state(self): + """Сбравсывает внутренне состояние lstm""" + self.action_model.initial_state() + + def _get_action(self, observation: tf.Tensor) -> tf.Tensor: + """Возвращает ценность дейтсвий Q(s,a) всех действий + на основе наблюдения""" + predict = super()._get_action(observation) + action, self.action_model.new_h_t, self.action_model.new_c_t = predict + return action + + def get_test_action(self, observation: tf.Tensor) -> float: + action = super().get_test_action(observation) + self._update_next_state() + return action + + def get_batch(self, ): + batch = super().get_batch() + + new_h_t, new_c_t = tf.squeeze(batch['h_t'][:, 1:], axis=1), tf.squeeze( + batch['c_t'][:, 1:], axis=1) + h_t, c_t = tf.squeeze( + batch['h_t'][:, :-1], axis=1), tf.squeeze(batch['c_t'][:, :-1], + axis=1) + batch['state'] = [batch['state'], h_t, c_t] + batch['next_state'] = [batch['next_state'], new_h_t, new_c_t] + batch['recurrent_skip'] = self.recurrent_skip + batch['trace_length'] = self.trace_length + + if self.priority: + batch['weights'] = np.repeat(np.expand_dims( + batch['weights'], -1), + self.trace_length-self.recurrent_skip, + axis=1) + return batch + + def _update_next_state(self): + """Обновляет внутреннее состояние lstm новым состоянием lstm""" + self.action_model._update_next_state() diff --git a/rl_lib/src/algoritms/model_free/value_based/simple_q.py b/rl_lib/src/algoritms/model_free/value_based/simple_q.py new file mode 100644 index 0000000..d8883b6 --- /dev/null +++ b/rl_lib/src/algoritms/model_free/value_based/simple_q.py @@ -0,0 +1,217 @@ +from copy import copy + +import numpy as np +import tensorflow as tf + +from rl_lib.src.explore_env.exploration_manager import ExplorationManager +from rl_lib.src.replay_buffers.replay_buffer import ReplayBuffer + +from .base_algo import Base_Algo + + 
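# Sketch of the burn-in implied by recurrent_skip in DRQN above: the first
# recurrent_skip steps of each sampled trace only warm up the LSTM state and
# are dropped from the loss, so training uses only the tail of the trace.
# Shapes are illustrative.
import tensorflow as tf

batch, trace_length, recurrent_skip = 4, 10, 5
q_per_step = tf.random.normal((batch, trace_length))   # Q(s_t, a_t) per step
trained_part = q_per_step[:, recurrent_skip:]           # loss uses these only
print(trained_part.shape)                                # (4, 5)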
+class SimpleQ(Base_Algo, ): + """Произовдит все вычисления необходимые для Q-learning + """ + + def __init__(self, action_model: object, + target_model: object, + config: dict, + **kwargs): + + Base_Algo.__init__(self, action_model, target_model, config, **kwargs) + # print(self.config) + config = link_data_inside_the_config(self.config) + self.buffer = ReplayBuffer(**config.get("buffer_config", {})) + self.exploration = ExplorationManager( + **config.get("exploration_config", {})) + + self.discount_factor = self.config['model_config']['discount_factor'] + self.n_step = self.config['model_config']['n_step'] + + self.batch_size = self.config['model_config'].get("batch_size") + self.double_network = self.config['model_config'].get("double_network") + self.priority = self.config['model_config'].get("priority") + self.tau = self.config['model_config'].get("tau") + + self.recurrent = False + self.batch_dims = 1 + self.ind_axis = -1 + + def add(self, data: tuple, priority=None) -> None: + """ + Добавляет переходы в буфер + Аргументы: + data: tuple(state, action, reward, done, next_state) + priority: np.array (только для приоритетных буферов) + """ + self.buffer.add(data, priority) + + def calculate_double_q(self, **kwargs): + Qaction = self.action_model(kwargs['next_state']) + Qtarget = self.target_model(kwargs['next_state']) + Qaction = Qaction[0] if isinstance(Qaction, list) else Qaction + Qtarget = Qtarget[0] if isinstance(Qtarget, list) else Qtarget + if kwargs["p_double"] < 0.5: + Qtarget = self.get_best_action(Qtarget, Qaction) + else: + Qtarget = self.get_best_action(Qaction, Qtarget) + return Qtarget + + def calculate_gradients(self, batch=None): + if batch is None: + batch = self.choice_model_for_double_calculates(**batch) + batch = self.choice_model_for_double_calculates(**batch) + return (self.action_model.calculate_gradients(**batch) + if batch['p_double'] > 0.5 + else self.target_model.calculate_gradients(**batch)) + + def calculate_new_best_action(self, **kwargs) -> tf.Tensor: + """Вычислеят новое лучшее действие для получения таргета""" + if self.double_network: + Qtarget = self.calculate_double_q(**kwargs) + else: + Qtarget = self.target_model(kwargs['next_state']) + Qtarget = Qtarget[0] if isinstance(Qtarget, list) else Qtarget + Qtarget = self.get_best_action(Qtarget, Qtarget) + return Qtarget + + @tf.function(reduce_retracing=True, + jit_compile=False, + experimental_autograph_options=tf.autograph.experimental.Feature.ALL) + def calculate_target(self, **kwargs): + Qtarget = self.calculate_new_best_action(**kwargs) + dones = tf.ones_like(kwargs['done'], dtype=tf.dtypes.float32) + dones = dones - kwargs['done'] + Qtarget = kwargs['reward'] + \ + (self.discount_factor**self.n_step) * Qtarget * dones + if self.recurrent: + Qtarget = Qtarget[:, kwargs.get('recurrent_skip', 10):] + return Qtarget + + def check_fullness_buffer(self): + """Проверяет наполненость буфера, + возвращает true если в буфере элементов больше батча + """ + if self.buffer.real_size > self.batch_size: + return True + else: + return False + + def choice_model_for_double_calculates(self, **batch): + batch['p_double'] = tf.random.uniform( + (1,), minval=0.0, maxval=1.0) if self.double_network else 1. 
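# Sketch of calculate_target below: an n-step bootstrapped target
# r + gamma**n * Q(s', a*), masked by done so terminal transitions do not
# bootstrap.  Numbers are illustrative.
import tensorflow as tf

gamma, n_step = 0.9, 3
reward = tf.constant([1.0, 2.0])      # n-step discounted returns from the buffer
q_next = tf.constant([10.0, 10.0])    # Q(s_{t+n}, best action)
done = tf.constant([0.0, 1.0])
target = reward + (gamma ** n_step) * q_next * (1.0 - done)
print(target.numpy())                  # [8.29 2.  ]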
+ batch['Qtarget'] = self.calculate_target(**batch) + return batch + + def _get_action(self, observation: tf.Tensor) -> tf.Tensor: + """Возвращает ценность дейтсвий Q(s,a) всех действий + на основе наблюдения + """ + return self.sample_action( + self.action_model.check_input_shape( + copy(observation) + ) + ) + + def get_action(self, observation: tf.Tensor) -> float: + """Возвращает действие на основе наблюдения с учетом исследования""" + action = self.exploration(self._get_action(observation)) + if isinstance(action, int): + return int(action) + else: + return action.numpy() + + def get_test_action(self, observation: tf.Tensor) -> float: + """Возвращает действие на основе наблюдения без исследования""" + action = self.exploration.test(self._get_action(observation)) + if isinstance(action, int): + return int(action) + else: + return action.numpy() + + def get_batch(self): + """Получает батч из буфера""" + return self.buffer.sample(self.batch_size) + + def get_batch_and_td_error(self): + batch = self.get_batch() + batch['batch_dims'] = self.batch_dims + batch['p_double'] = 1. + td_error = self.calculate_gradients(batch)['td_error'] + return {'td_error': td_error.numpy(), 'batch': batch} + + def get_best_action(self, Qaction, Qtarget): + ind = tf.argmax(Qaction, axis=self.ind_axis) + Qtarget = tf.gather(Qtarget, ind, batch_dims=self.batch_dims) + return Qtarget + + def get_gradients(self) -> tf.Tensor: + """Вычисляет градиенты и возвращает их""" + batch = self.get_batch() + batch['batch_dims'] = self.batch_dims + batch['p_double'] = 1. + return self.calculate_gradients(batch)['gradients'] + + def load(self, ) -> None: + """Загружает алгоритм""" + self.action_model.load(self.path) + self.target_model.load(self.path) + self.buffer.load(self.path) + self.exploration.load(self.path) + + def reset(self) -> None: + """Сбрасывает внутренние данные модели""" + self.buffer.reset() + self.exploration.reset() + self.initial_model() + + def _train_step(self, **batch) -> dict: + """Вспомогательная train_step""" + batch = self.choice_model_for_double_calculates(**batch) + batch['batch_dims'] = self.batch_dims + return (self.action_model.update_weights(**batch) + if batch['p_double'] > 0.5 + else self.target_model.update_weights(**batch)) + + def train_step(self) -> np.array: + """Вычисляет полный обучающий шаг""" + if not self.check_fullness_buffer(): + return 0 + batch = self.get_batch() + result = self._train_step(**batch) + td_error = result['td_error'].numpy() + loss = result['loss'].numpy() + assert not np.all(np.isnan(td_error)), "td_error не может быть nan" + if self.priority: + self.buffer.update_priorities( + batch['data_idxs'], loss + if not self.recurrent + else loss[:, -1]) + if self.tau != 1: + _ = self.copy_weights() + return np.mean(td_error) + + def save(self) -> None: + """Сохраняет алгоритм""" + self.action_model.save(self.path) + self.target_model.save(self.path) + self.buffer.save(self.path) + self.exploration.save(self.path) + + def summary(self) -> None: + """Выводит архитектуру модели""" + self.action_model.summary + + +def link_data_inside_the_config(config): + # print(config) + discount_factor = config['model_config']['discount_factor'] + n_step = config['model_config']['n_step'] + action_space = config['model_config']['action_space'] + priority = config['model_config']['priority'] + + config['buffer_config']['priority'] = priority + config['buffer_config']['discount_factor'] = discount_factor + config['buffer_config']['n_step'] = n_step + 
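# Sketch of get_best_action above (the Double-DQN style selection): one
# network's Q-values choose the action via argmax, the other network's
# Q-values are gathered at that index.  Numbers are illustrative.
import tensorflow as tf

q_select = tf.constant([[1.0, 5.0, 2.0],
                        [7.0, 0.0, 3.0]])     # network that picks the action
q_eval = tf.constant([[10.0, 20.0, 30.0],
                      [40.0, 50.0, 60.0]])    # network that evaluates it
best = tf.argmax(q_select, axis=-1)           # [1, 0]
q_target = tf.gather(q_eval, best, batch_dims=1)
print(q_target.numpy())                       # [20. 40.]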
config['exploration_config']['strategy_config']['action_space'] = action_space + return config diff --git a/rl_lib/src/algoritms/tests/config.yaml b/rl_lib/src/algoritms/model_free/value_based/tests/config.yaml similarity index 100% rename from rl_lib/src/algoritms/tests/config.yaml rename to rl_lib/src/algoritms/model_free/value_based/tests/config.yaml diff --git a/rl_lib/src/algoritms/model_free/value_based/tests/test_simpe_q.py b/rl_lib/src/algoritms/model_free/value_based/tests/test_simpe_q.py new file mode 100644 index 0000000..51837c9 --- /dev/null +++ b/rl_lib/src/algoritms/model_free/value_based/tests/test_simpe_q.py @@ -0,0 +1,83 @@ +import os + +from rl_lib.src.models.model import Model + +from ..simple_q import SimpleQ + + +class Simple_Model(Model): + def __init__(self, config={}, **kwargs): + super().__init__(model_config=config.get('model_config', {}), + config=config, default_config_path=__file__, + **kwargs) + + def _prediction_processing(self, input_data): + pass + + def _update_next_state(self, state, action): + pass + + def initial_state(self): + pass + + @staticmethod + def create_model(input_shape: tuple, action_space: int): + pass + + @staticmethod + def create_model_with_conv(input_shape: tuple, action_space: int): + pass + + def set_new_model(self, *args): + pass + + +class Test_Simple_Q(): + def __init__(self, config): + self.simple_q = SimpleQ(Simple_Model, Simple_Model, config, + default_config_path=__file__, + algo_name="SimpleQ", name="Simple_Model") + + def test_save(self): + self.simple_q.save() + real_structure = get_directory_structure(self.simple_q.path) + assert self.simple_q.path != self.simple_q.config['data_saver']['path'], "Пути не совпадают" + correct_structure = {self.simple_q.name: + { + self.simple_q.exploration.name + ".data": None, + self.simple_q.buffer.name + ".data": None, + self.simple_q.action_model.name + ".keras": None, + self.simple_q.target_model.name + ".keras": None, + } + } + assert compare_directory_structures( + real_structure, correct_structure), "Каталоги разные" + + +def compare_directory_structures(dir_structure1: dict, + dir_structure2: dict) -> bool: + """Проверяет одинаковые ли структуры каталогов""" + if dir_structure1.keys() != dir_structure2.keys(): + return False + + for key in dir_structure1.keys(): + if isinstance(dir_structure1[key], dict) and isinstance(dir_structure2[key], dict): + if not compare_directory_structures(dir_structure1[key], + dir_structure2[key]): + return False + elif dir_structure1[key] != dir_structure2[key]: + return False + + return True + + +def get_directory_structure(path: str) -> dict: + """Получает всю структуру переданного каталога""" + structure = {} + for dirpath, dirnames, filenames in os.walk(path): + current_level = structure + for dirname in dirpath.split(os.sep): + current_level = current_level.setdefault(dirname, {}) + for filename in filenames: + current_level[filename] = None + return structure diff --git a/rl_lib/src/algoritms/model_free/value_based/utils.py b/rl_lib/src/algoritms/model_free/value_based/utils.py new file mode 100644 index 0000000..385c493 --- /dev/null +++ b/rl_lib/src/algoritms/model_free/value_based/utils.py @@ -0,0 +1,13 @@ +def update_config(config: dict, new_data: dict) -> None: + """Обвновляет конфигурацию по умолчанию + Args: + config: dict: Конфигурация, которую надо обновить + new_data: dict: Конфигурация с новыми данными + Returns: + None + """ + for key, value in new_data.items(): + if isinstance(value, dict) and key in config and 
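# Usage sketch of the recursive merge performed by update_config here: nested
# dictionaries are updated key by key instead of being replaced wholesale.
# The helper name and the config values below are illustrative.
default = {'model_config': {'batch_size': 32, 'tau': 1.0},
           'buffer_config': {'size': 10000}}
override = {'model_config': {'batch_size': 64}}

def merge(config: dict, new_data: dict) -> None:
    for key, value in new_data.items():
        if isinstance(value, dict) and isinstance(config.get(key), dict):
            merge(config[key], value)
        else:
            config[key] = value

merge(default, override)
print(default)
# {'model_config': {'batch_size': 64, 'tau': 1.0}, 'buffer_config': {'size': 10000}}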
isinstance(config[key], dict): + update_config(config[key], value) + else: + config[key] = value diff --git a/rl_lib/src/algoritms/simple_q.py b/rl_lib/src/algoritms/simple_q.py deleted file mode 100644 index 00160b6..0000000 --- a/rl_lib/src/algoritms/simple_q.py +++ /dev/null @@ -1,182 +0,0 @@ -import tensorflow as tf -import numpy as np -from copy import copy - -from .base_algo import Base_Algo -from rl_lib.src.replay_buffers.replay_buffer import ReplayBuffer -from rl_lib.src.explore_env.exploration_manager import ExplorationManager - -class SimpleQ(Base_Algo, ): - """Произовдит все вычисления необходимые для Q-learning - """ - def __init__(self, action_model: object, target_model: object, config: dict, **kwargs): - - Base_Algo.__init__(self, action_model, target_model, config, **kwargs) - # print(self.config) - config = link_data_inside_the_config(self.config) - self.buffer = ReplayBuffer(**config.get("buffer_config", {})) - self.exploration = ExplorationManager(**config.get("exploration_config", {})) - - self.discount_factor = self.config['model_config']['discount_factor'] - self.n_step = self.config['model_config']['n_step'] - - self.batch_size = self.config['model_config'].get("batch_size") - self.double_network = self.config['model_config'].get("double_network") - self.priority = self.config['model_config'].get("priority") - self.tau = self.config['model_config'].get("tau") - - self.recurrent = False - self.batch_dims = 1 - self.ind_axis = -1 - - def add(self, data: tuple, priority = None) -> None: - """ - Добавляет переходы в буфер - Аргументы: - data: tuple(state, action, reward, done, next_state) - priority: np.array (только для приоритетных буферов) - """ - self.buffer.add(data, priority) - - def calculate_double_q(self, **kwargs): - Qaction = self.action_model(kwargs['next_state']) - Qtarget = self.target_model(kwargs['next_state']) - Qaction = Qaction[0] if isinstance(Qaction, list) else Qaction - Qtarget = Qtarget[0] if isinstance(Qtarget, list) else Qtarget - if kwargs["p_double"] < 0.5 : Qtarget = self.get_best_action(Qtarget, Qaction) - else: Qtarget = self.get_best_action(Qaction, Qtarget) - return Qtarget - - def calculate_gradients(self, batch = None): - if batch == None: batch = self.choice_model_for_double_calculates(**batch) - batch = self.choice_model_for_double_calculates(**batch) - return self.action_model.calculate_gradients(**batch) if batch['p_double'] > 0.5 else self.target_model.calculate_gradients(**batch) - - def calculate_new_best_action(self, **kwargs) -> tf.Tensor: - """Вычислеят новое лучшее действие для получения таргета""" - if self.double_network: - Qtarget = self.calculate_double_q(**kwargs) - else: - Qtarget = self.target_model(kwargs['next_state']) - Qtarget = Qtarget[0] if isinstance(Qtarget, list) else Qtarget - Qtarget = self.get_best_action(Qtarget, Qtarget) - return Qtarget - - @tf.function(reduce_retracing=True, - jit_compile=False, - experimental_autograph_options = tf.autograph.experimental.Feature.ALL) - def calculate_target(self, **kwargs): - Qtarget = self.calculate_new_best_action(**kwargs) - dones = tf.ones_like(kwargs['done'], dtype=tf.dtypes.float32) - dones = dones - kwargs['done'] - Qtarget = kwargs['reward'] + (self.discount_factor**self.n_step) * Qtarget * dones - if self.recurrent: - Qtarget = Qtarget[:, kwargs.get('recurrent_skip', 10):] - return Qtarget - - def check_fullness_buffer(self): - """Проверяет наполненость буфера, возвращает true если в буфере элементов больше батча""" - if self.buffer.real_size > self.batch_size: 
return True - else: return False - - def choice_model_for_double_calculates(self, **batch): - batch['p_double'] = tf.random.uniform((1,), minval = 0.0, maxval = 1.0) if self.double_network else 1. - batch['Qtarget'] = self.calculate_target(**batch) - return batch - - def _get_action(self, observation: tf.Tensor) -> tf.Tensor: - """Возвращает ценность дейтсвий Q(s,a) всех действий на основе наблюдения""" - return self.sample_action(self.action_model.check_input_shape(copy(observation))) - - def get_action(self, observation: tf.Tensor) -> float: - """Возвращает действие на основе наблюдения с учетом исследования""" - action = self.exploration(self._get_action(observation)) - if isinstance(action, int): return int(action) - else: return action.numpy() - - def get_test_action(self, observation: tf.Tensor) -> float: - """Возвращает действие на основе наблюдения без исследования""" - action = self.exploration.test(self._get_action(observation)) - if isinstance(action, int): return int(action) - else: return action.numpy() - - def get_batch(self): - """Получает батч из буфера""" - return self.buffer.sample(self.batch_size) - - def get_batch_and_td_error(self): - batch = self.get_batch() - batch['batch_dims'] = self.batch_dims - batch['p_double'] = 1. - td_error = self.calculate_gradients(batch)['td_error'] - return {'td_error': td_error.numpy(), 'batch': batch} - - def get_best_action(self, Qaction, Qtarget): - ind = tf.argmax(Qaction, axis=self.ind_axis) - Qtarget = tf.gather(Qtarget, ind, batch_dims=self.batch_dims) - return Qtarget - - def get_gradients(self) -> tf.Tensor: - """Вычисляет градиенты и возвращает их""" - batch = self.get_batch() - batch['batch_dims'] = self.batch_dims - batch['p_double'] = 1. - return self.calculate_gradients(batch)['gradients'] - - def load(self, ) -> None: - """Загружает алгоритм""" - self.action_model.load(self.path) - self.target_model.load(self.path) - self.buffer.load(self.path) - self.exploration.load(self.path) - - def reset(self) -> None: - """Сбрасывает внутренние данные модели""" - self.buffer.reset() - self.exploration.reset() - self.initial_model() - - def _train_step(self, **batch) -> dict: - """Вспомогательная train_step""" - batch = self.choice_model_for_double_calculates(**batch) - batch['batch_dims'] = self.batch_dims - return self.action_model.update_weights(**batch) if batch['p_double'] > 0.5 else self.target_model.update_weights(**batch) - - def train_step(self) -> np.array: - """Вычисляет полный обучающий шаг""" - if not self.check_fullness_buffer(): return 0 - batch = self.get_batch() - result = self._train_step(**batch) - td_error = result['td_error'].numpy() - loss = result['loss'].numpy() - assert not np.all(np.isnan(td_error)), "td_error не может быть nan" - if self.priority: self.buffer.update_priorities(batch['data_idxs'], loss if not self.recurrent else loss[:, -1]) - if self.tau != 1: - _ = self.copy_weights() - return np.mean(td_error) - - def save(self) -> None: - """Сохраняет алгоритм""" - self.action_model.save(self.path) - self.target_model.save(self.path) - self.buffer.save(self.path) - self.exploration.save(self.path) - - def summary(self) -> None: - """Выводит архитектуру модели""" - self.action_model.summary - - - -def link_data_inside_the_config(config): - # print(config) - discount_factor = config['model_config']['discount_factor'] - n_step = config['model_config']['n_step'] - action_space = config['model_config']['action_space'] - priority = config['model_config']['priority'] - - config['buffer_config']['priority'] = 
priority - config['buffer_config']['discount_factor'] = discount_factor - config['buffer_config']['n_step'] = n_step - config['exploration_config']['strategy_config']['action_space'] = action_space - return config diff --git a/rl_lib/src/algoritms/tests/test_simpe_q.py b/rl_lib/src/algoritms/tests/test_simpe_q.py deleted file mode 100644 index 73fd998..0000000 --- a/rl_lib/src/algoritms/tests/test_simpe_q.py +++ /dev/null @@ -1,72 +0,0 @@ -from ..simple_q import SimpleQ -from rl_lib.src.models.model import Model - -import os - -class Simple_Model(Model): - def __init__(self, config = {},**kwargs): - super().__init__(model_config = config.get('model_config', {}), config = config, default_config_path=__file__, **kwargs) - - def _prediction_processing(self, input_data): - pass - - def _update_next_state(self, state, action): - pass - - def initial_state(self): - pass - - @staticmethod - def create_model(input_shape: tuple, action_space: int): - pass - - @staticmethod - def create_model_with_conv(input_shape: tuple, action_space: int): - pass - - - def set_new_model(self, *args): - pass - -class Test_Simple_Q(): - def __init__(self, config): - self.simple_q = SimpleQ(Simple_Model, Simple_Model, config, default_config_path=__file__, algo_name = "SimpleQ", name = "Simple_Model") - - def test_save(self): - self.simple_q.save() - real_structure = get_directory_structure(self.simple_q.path) - assert self.simple_q.path != self.simple_q.config['data_saver']['path'], "Пути не совпадают" - correct_structure = {self.simple_q.name: - { - self.simple_q.exploration.name + ".data": None, - self.simple_q.buffer.name + ".data": None, - self.simple_q.action_model.name + ".h5": None, - self.simple_q.target_model.name + ".h5": None, - } - } - assert compare_directory_structures(real_structure, correct_structure), "Каталоги разные" - -def compare_directory_structures(dir_structure1: dict, dir_structure2: dict) -> bool: - """Проверяет одинаковые ли структуры каталогов""" - if dir_structure1.keys() != dir_structure2.keys(): - return False - - for key in dir_structure1.keys(): - if isinstance(dir_structure1[key], dict) and isinstance(dir_structure2[key], dict): - if not compare_directory_structures(dir_structure1[key], dir_structure2[key]): - return False - elif dir_structure1[key] != dir_structure2[key]: - return False - - return True - -def get_directory_structure(path: str) -> dict: - """Получает всю структуру переданного каталога""" - structure = {} - for dirpath, dirnames, filenames in os.walk(path): - current_level = structure - for dirname in dirpath.split(os.sep): - current_level = current_level.setdefault(dirname, {}) - for filename in filenames: - current_level[filename] = None - return structure diff --git a/rl_lib/src/algoritms/utils.py b/rl_lib/src/algoritms/utils.py deleted file mode 100644 index 033bd3c..0000000 --- a/rl_lib/src/algoritms/utils.py +++ /dev/null @@ -1,13 +0,0 @@ -def update_config(config: dict, new_data: dict) -> None: - """Обвновляет конфигурацию по умолчанию - Args: - config: dict: Конфигурация, которую надо обновить - new_data: dict: Конфигурация с новыми данными - Returns: - None - """ - for key, value in new_data.items(): - if isinstance(value, dict) and key in config and isinstance(config[key], dict): - update_config(config[key], value) - else: - config[key] = value diff --git a/rl_lib/src/data_saver/saver.py b/rl_lib/src/data_saver/saver.py index 1bce4b6..89dd553 100644 --- a/rl_lib/src/data_saver/saver.py +++ b/rl_lib/src/data_saver/saver.py @@ -1,62 +1,75 @@ import os from 
shutil import copy, make_archive + class Saver: - """Хранит в себе пути сохранения этапа обучения и путь резервного копирования. - При инициализации создает папки для сохранения и резервного копирования. - Args: - name: str. Необязательно, название алгоритма - path: str. Путь сохранения - copy_path: str. Путь резервного копирования - """ - def __init__(self, algo_name="None", copy_path="", name="", path="", **kwargs): - self.algo_name = algo_name - self.copy_path = copy_path - self.name = name - self.original_path = os.getcwd() - self.path = path - - self.validate_path() - - self.init_copy_dir() - self.init_save_dir() - - @property - def get_save_path(self): - return self.path - - @property - def get_copy_path(self): - if self.copy_path != "": return self.copy_path - else: return "Path is not defined" - - def init_copy_dir(self): - if self.copy_path != "": - self.copy_path = self.copy_path + self.algo_name + "/" - if not os.path.isdir(self.copy_path): - os.makedirs(self.copy_path) - - def init_save_dir(self): - """Создает путь сохранения и директорию сохранения""" - if self.path == "": self.path = self.original_path + "/models/" + self.algo_name + "/" + self.name + "/" - else: self.path = self.path + self.name + "/" - if not os.path.isdir(self.path): - os.makedirs(self.path) - - def make_copy(self): - """Резервное копирование архива директории""" - copy(self.path +'/' + self.name+'.zip', self.copy_path) - - def make_archive(self): - """Создает архив директории""" - make_archive(base_name=self.name, format='zip', root_dir=self.path) - - def validate_path(self): - assert isinstance(self.algo_name, str), "Неверный тип аргумента, должно быть str" - assert isinstance(self.copy_path, str), "Неверный тип аргумента, должно быть str" - assert isinstance(self.name, str), "Неверный тип аргумента, должно быть str" - assert isinstance(self.path, str), "Неверный тип аргумента, должно быть str" - if len(self.path) > 0: assert self.path[-1] == "/", "В конце пути должен быть /" - if len(self.copy_path) > 0: assert self.copy_path[-1] == "/", "В конце пути должен быть /" - - + """Хранит в себе пути сохранения этапа обучения + и путь резервного копирования. + При инициализации создает папки для сохранения и резервного копирования. + Args: + name: str. Необязательно, название алгоритма + path: str. Путь сохранения + copy_path: str. 
Путь резервного копирования + """ + + def __init__(self, algo_name="None", copy_path="", name="", path="", + **kwargs): + self.algo_name = algo_name + self.copy_path = copy_path + self.name = name + self.original_path = os.getcwd() + self.path = path + + self.validate_path() + + self.init_copy_dir() + self.init_save_dir() + + @property + def get_save_path(self): + return self.path + + @property + def get_copy_path(self): + if self.copy_path != "": + return self.copy_path + else: + return "Path is not defined" + + def init_copy_dir(self): + if self.copy_path != "": + self.copy_path = self.copy_path + self.algo_name + "/" + if not os.path.isdir(self.copy_path): + os.makedirs(self.copy_path) + + def init_save_dir(self): + """Создает путь сохранения и директорию сохранения""" + if self.path == "": + self.path = self.original_path + "/models/" + \ + self.algo_name + "/" + self.name + "/" + else: + self.path = self.path + self.name + "/" + if not os.path.isdir(self.path): + os.makedirs(self.path) + + def make_copy(self): + """Резервное копирование архива директории""" + copy(self.path + '/' + self.name+'.zip', self.copy_path) + + def make_archive(self): + """Создает архив директории""" + make_archive(base_name=self.name, format='zip', root_dir=self.path) + + def validate_path(self): + assert isinstance( + self.algo_name, str), "Неверный тип аргумента, должно быть str" + assert isinstance( + self.copy_path, str), "Неверный тип аргумента, должно быть str" + assert isinstance( + self.name, str), "Неверный тип аргумента, должно быть str" + assert isinstance( + self.path, str), "Неверный тип аргумента, должно быть str" + if len(self.path) > 0: + assert self.path[-1] == "/", "В конце пути должен быть /" + if len(self.copy_path) > 0: + assert self.copy_path[-1] == "/", "В конце пути должен быть /" diff --git a/rl_lib/src/data_saver/tests/test_saver.py b/rl_lib/src/data_saver/tests/test_saver.py index 05d9f03..1ed435c 100644 --- a/rl_lib/src/data_saver/tests/test_saver.py +++ b/rl_lib/src/data_saver/tests/test_saver.py @@ -1,19 +1,23 @@ +from rl_lib.rl_lib.src.algoritms.tests.test_simpe_q import ( + compare_directory_structures, get_directory_structure) + from ..saver import Saver -from rl_lib.rl_lib.src.algoritms.tests.test_simpe_q import get_directory_structure, compare_directory_structures -class Test_Saver: - def __init__(self, **kwargs): - self.saver = Saver(**kwargs) - def test_init(self, path, copy_path): - self.check_structure(self.saver.path, path) - self.check_structure(self.saver.copy_path, copy_path) - print("Тест пройден успешно") +class Test_Saver: + def __init__(self, **kwargs): + self.saver = Saver(**kwargs) + def test_init(self, path, copy_path): + self.check_structure(self.saver.path, path) + self.check_structure(self.saver.copy_path, copy_path) + print("Тест пройден успешно") - def check_structure(self, real_path, corrrect_path): - real_structure = get_directory_structure(real_path) - assert real_path == corrrect_path, "Пути не совпадают" - if corrrect_path != "": - correct_structure = {corrrect_path.replace("/", ""): {"":{}, self.saver.name: {}}} - assert compare_directory_structures(real_structure, correct_structure), "Каталоги разные" + def check_structure(self, real_path, corrrect_path): + real_structure = get_directory_structure(real_path) + assert real_path == corrrect_path, "Пути не совпадают" + if corrrect_path != "": + correct_structure = {corrrect_path.replace( + "/", ""): {"": {}, self.saver.name: {}}} + assert compare_directory_structures( + real_structure, 
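# --- Usage sketch (not part of the patch): how the Saver path logic above
# composes directories. The algo/model names are made-up and the import path is
# assumed from the file location in this diff; requires a writable filesystem.
import os
import tempfile

from rl_lib.src.data_saver.saver import Saver  # assumed import path

with tempfile.TemporaryDirectory() as tmp:
    saver = Saver(algo_name="DQN", name="cart_pole", path=tmp + "/")
    # With an explicit `path`, init_save_dir appends only the model name:
    # <tmp>/cart_pole/
    assert saver.get_save_path == tmp + "/cart_pole/"
    assert os.path.isdir(saver.get_save_path)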
correct_structure), "Каталоги разные" diff --git a/rl_lib/src/data_saver/utils.py b/rl_lib/src/data_saver/utils.py index b1ef00f..7ce4351 100644 --- a/rl_lib/src/data_saver/utils.py +++ b/rl_lib/src/data_saver/utils.py @@ -1,15 +1,29 @@ +import os.path as os_path from pickle import dump, load + from yaml import safe_load -import os.path as os_path + def save_data(path, data): - with open(path+'.data', 'wb') as f: - dump(data, f) + with open(path+'.data', 'wb') as f: + dump(data, f) + def load_data(path): - with open(path+'.data', 'rb') as f: - loaded_data = load(f) - return loaded_data + with open(path+'.data', 'rb') as f: + loaded_data = load(f) + return loaded_data + def load_default_config(path): - return safe_load(open(os_path.join(os_path.dirname(path),"./config.yaml"), "rb")) + file_name = "./config.yaml" + if path.split('/')[-1].split('.')[-1] in 'yaml': + file_name = path.split('/')[-1] + return safe_load( + open( + os_path.join( + os_path.dirname(path), + file_name + ), "rb" + ) + ) \ No newline at end of file diff --git a/rl_lib/src/explore_env/base_explore.py b/rl_lib/src/explore_env/base_explore.py index c741d9d..d1fb054 100644 --- a/rl_lib/src/explore_env/base_explore.py +++ b/rl_lib/src/explore_env/base_explore.py @@ -1,34 +1,36 @@ import abc + class Base_Explore(abc.ABC): - """Абстрактный класс представляющий общий интерфейс для всех классов исследования - нейронной сетью среды обучения - - """ - def __init__(): - pass - - @property - @abc.abstractmethod - def name(self): - """Возвращает имя стратегии""" - - @abc.abstractmethod - def reset(self, ) -> None: - """Выполняет внутренний сброс""" - - @abc.abstractmethod - def save(self, path) -> None: - """Сохраняет какие либо внутренние переменные""" - - @abc.abstractmethod - def load(self, path) -> None: - """Загружает какие либо внутренние переменные""" - - @abc.abstractmethod - def __call__(self, action) -> int: - """Возвращает действие в соответствии с стратегией исследования""" - - @abc.abstractmethod - def test(self, action) -> int: - """Возвращает действие в соответствии с стратегией тестирования""" + """Абстрактный класс представляющий + общий интерфейс для всех классов исследования + нейронной сетью среды обучения + + """ + def __init__(): + pass + + @property + @abc.abstractmethod + def name(self): + """Возвращает имя стратегии""" + + @abc.abstractmethod + def reset(self, ) -> None: + """Выполняет внутренний сброс""" + + @abc.abstractmethod + def save(self, path) -> None: + """Сохраняет какие либо внутренние переменные""" + + @abc.abstractmethod + def load(self, path) -> None: + """Загружает какие либо внутренние переменные""" + + @abc.abstractmethod + def __call__(self, action) -> int: + """Возвращает действие в соответствии с стратегией исследования""" + + @abc.abstractmethod + def test(self, action) -> int: + """Возвращает действие в соответствии с стратегией тестирования""" diff --git a/rl_lib/src/explore_env/epsilon_greedy.py b/rl_lib/src/explore_env/epsilon_greedy.py index 269a7ff..cc843e2 100644 --- a/rl_lib/src/explore_env/epsilon_greedy.py +++ b/rl_lib/src/explore_env/epsilon_greedy.py @@ -1,56 +1,64 @@ -import numpy as np -from tensorflow.math import argmax +import numpy as np from tensorflow.dtypes import int32 +from tensorflow.math import argmax -from ..data_saver.utils import save_data, load_data +from ..data_saver.utils import load_data, save_data from .base_explore import Base_Explore + class Epsilon_Greedy(Base_Explore): """Эпсилон-жадная стратегия исследования Kwargs: - eps_decay_steps: int, Количество 
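# --- Illustrative sketch (not part of the patch): the filename resolution added
# to load_default_config above. If `path` already points at a .yaml file, that
# file is loaded; otherwise "config.yaml" next to the given module is used.
# The paths below are hypothetical.
import os.path as os_path


def resolve_config_path(path: str) -> str:
    file_name = "./config.yaml"
    if path.split('/')[-1].split('.')[-1] in 'yaml':
        file_name = path.split('/')[-1]
    return os_path.join(os_path.dirname(path), file_name)


print(resolve_config_path("/repo/rl_lib/tests/drqn_config.yaml"))
# -> /repo/rl_lib/tests/drqn_config.yaml
print(resolve_config_path("/repo/rl_lib/src/models/model.py"))
# -> /repo/rl_lib/src/models/./config.yaml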
внутренних шагов исследований до установки минимального эпсилон + eps_decay_steps: int, Количество внутренних шагов исследований + до установки минимального эпсилон eps_max: float, Максимальный эпсилон eps_min: float, Минимальный эпсилон eps_test: float, Тестовый эпсилон action_spase: int, Размер пространтства действий axis: int, Ось вычислений """ - def __init__(self, eps_decay_steps=1e6, eps_max=1.0, eps_min=1e-1, eps_test=1e-3, action_space=None, axis=-1, **kwargs): + + def __init__(self, eps_decay_steps=1e6, + eps_max=1.0, eps_min=1e-1, + eps_test=1e-3, action_space=None, + axis=-1, **kwargs): self.eps_desay_steps = eps_decay_steps self.eps_min = eps_min self.eps_max = eps_max self.eps_test = eps_test - assert type(action_space) == int, "Пространство действий должно быть int" - self.action_space = action_space + assert type( + action_space) is int, "Пространство действий должно быть int" + self.action_space = action_space self.axis = axis self._name = "epsilon_greedy_strategy" self.reset() def __call__(self, Q): - self.eps = max(self.eps_min, self.eps_max - (self.eps_max-self.eps_min) * self.count/self.eps_desay_steps) + self.eps = max(self.eps_min, self.eps_max - (self.eps_max - + self.eps_min) * self.count/self.eps_desay_steps) self.count += 1 return self.get_action(self.eps, Q) def get_action(self, eps, Q): - if np.random.random() < eps: return np.random.randint(self.action_space) - else: return argmax(Q, axis=self.axis, output_type=int32) + if np.random.random() < eps: + return np.random.randint(self.action_space) + else: + return argmax(Q, axis=self.axis, output_type=int32) def load(self, path): self.__dict__ = load_data(path+self.name) - + @property def name(self): return self._name - + def reset(self, ): self.count = 0 self.eps = self.eps_max - + def save(self, path): save_data(path+self.name, self.__dict__) def test(self, Q): return self.get_action(self.eps_test, Q) - - diff --git a/rl_lib/src/explore_env/exploration_manager.py b/rl_lib/src/explore_env/exploration_manager.py index ffa5a17..548bda0 100644 --- a/rl_lib/src/explore_env/exploration_manager.py +++ b/rl_lib/src/explore_env/exploration_manager.py @@ -1,50 +1,55 @@ -from .epsilon_greedy import Epsilon_Greedy -from .soft_q import Soft_Q from .base_explore import Base_Explore +from .epsilon_greedy import Epsilon_Greedy from .ou_noise import OU_Noise +from .soft_q import Soft_Q + class ExplorationManager(Base_Explore): - """Выбирает стратегию исследования и выполняет все ее функции - Kwargs: - strategy_name: str, Название стратегии - strategy_config: dict, Параметры стратегии - """ - def __init__(self, strategy_name="epsilon_greedy", strategy_config = {}, **kwargs): - self._config = {"strategy_name": strategy_name, "strategy_config": strategy_config} - - if strategy_name.lower() == "epsilon_greedy": - self.strategy = Epsilon_Greedy(**strategy_config) - - elif strategy_name.lower() == "soft_q": - self.strategy = Soft_Q(**strategy_config) - - elif strategy_name.lower() == "ou_noise": - self.strategy = OU_Noise(**strategy_config) - - else: - assert 0, "Неизвестная стратегия" - - self.strategy_name = self.strategy.name - - def __call__(self, Q): - return self.strategy(Q) - - @property - def config(self): - return self.config - - @property - def name(self): - return self.strategy.name - - def load(self, path): - self.strategy.load(path) - - def reset(self, ): - self.strategy.reset() - - def save(self, path): - self.strategy.save(path) - - def test(self, Q): - return self.strategy.test(Q) + """Выбирает стратегию исследования и 
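# --- Worked numbers (not part of the patch) for the linear schedule used by
# Epsilon_Greedy above: eps = max(eps_min, eps_max - (eps_max - eps_min) * step / eps_decay_steps),
# evaluated with the default hyperparameters from the diff.
def linear_epsilon(step: int,
                   eps_max: float = 1.0,
                   eps_min: float = 0.1,
                   eps_decay_steps: float = 1e6) -> float:
    return max(eps_min, eps_max - (eps_max - eps_min) * step / eps_decay_steps)


for step in (0, 250_000, 500_000, 1_000_000, 2_000_000):
    print(step, round(linear_epsilon(step), 3))
# 0 -> 1.0, 250000 -> 0.775, 500000 -> 0.55, 1000000 -> 0.1, 2000000 -> 0.1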
выполняет все ее функции + Kwargs: + strategy_name: str, Название стратегии + strategy_config: dict, Параметры стратегии + """ + + def __init__(self, strategy_name="epsilon_greedy", + strategy_config={}, + *args, **kwargs): + self._config = {"strategy_name": strategy_name, + "strategy_config": strategy_config} + + if strategy_name.lower() == "epsilon_greedy": + self.strategy = Epsilon_Greedy(**strategy_config) + + elif strategy_name.lower() == "soft_q": + self.strategy = Soft_Q(**strategy_config) + + elif strategy_name.lower() == "ou_noise": + self.strategy = OU_Noise(**strategy_config) + + else: + assert 0, "Неизвестная стратегия" + + self.strategy_name = self.strategy.name + + def __call__(self, Q): + return self.strategy(Q) + + @property + def config(self): + return self.config + + @property + def name(self): + return self.strategy.name + + def load(self, path): + self.strategy.load(path) + + def reset(self, ): + self.strategy.reset() + + def save(self, path): + self.strategy.save(path) + + def test(self, Q): + return self.strategy.test(Q) diff --git a/rl_lib/src/explore_env/ou_noise.py b/rl_lib/src/explore_env/ou_noise.py index d1d24ed..e0d6d31 100644 --- a/rl_lib/src/explore_env/ou_noise.py +++ b/rl_lib/src/explore_env/ou_noise.py @@ -1,21 +1,26 @@ -import numpy as np +import numpy as np from tensorflow import clip_by_value -from ..data_saver.utils import save_data, load_data +from ..data_saver.utils import load_data, save_data from .base_explore import Base_Explore + class OU_Noise_generator: - def __init__(self, mean, sigma , theta=0.15, dt=1e-2, x_initial=None): + def __init__(self, mean, sigma, theta=0.15, dt=1e-2, x_initial=None): self.theta = theta self.mean = mean - self.sigma = sigma + self.sigma = sigma self.dt = dt self.x_initial = x_initial self.reset() def __call__(self): - # Formula taken from https://www.wikipedia.org/wiki/Ornstein-Uhlenbeck_process. - dx = (self.theta * (self.mean - self.x_prev) * self.dt + self.sigma * np.sqrt(self.dt) * np.random.normal(size=self.mean.shape, scale=self.sigma)) + """Formula taken from + https://www.wikipedia.org/wiki/Ornstein-Uhlenbeck_process. + """ + dx = (self.theta * (self.mean - self.x_prev) * self.dt + self.sigma * + np.sqrt(self.dt) * np.random.normal(size=self.mean.shape, + scale=self.sigma)) # Store x into x_prev # Makes next noise dependent on current one self.x_prev += dx @@ -27,49 +32,58 @@ def reset(self): else: self.x_prev = np.zeros_like(self.mean) + class OU_Noise(Base_Explore): - """Шум Орнштейна — Уленбека стратегия исследования, применяется к предсказанным непрерывным действиям. + """Шум Орнштейна — Уленбека стратегия исследования, + применяется к предсказанным непрерывным действиям. 
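# --- Usage sketch (not part of the patch): feeding ExplorationManager from a
# strategy_config dict, the same shape the yaml config provides. The values are
# made-up and the import path is assumed from this diff; only "epsilon_greedy",
# "soft_q" and "ou_noise" are recognised strategy names here.
import numpy as np

from rl_lib.src.explore_env.exploration_manager import ExplorationManager  # assumed path

manager = ExplorationManager(strategy_name="epsilon_greedy",
                             strategy_config={"eps_decay_steps": 1e5,
                                              "eps_min": 0.05,
                                              "action_space": 2})
q_values = np.array([0.1, 0.7], dtype=np.float32)
action = manager(q_values)       # epsilon-greedy action during training
greedy = manager.test(q_values)  # near-greedy action for evaluation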
- Kwargs: + Kwargs: action_spase: int, Размер пространтства действий - alpha: int, Количество внутренних шагов исследований до установки минимального эпсилон + alpha: int, Количество внутренних шагов исследований + до установки минимального эпсилон axis: int, ось вычислений sigma: float, Максимальный эпсилон """ - def __init__(self, action_space = None, - axis=-1, alpha = 0.9, dt = 0.01, - lower_bound = -1.0, mean: np.ndarray = None, - sigma=1.0, theta = 0.15, - upper_bound = 1.0, - **kwargs): - self.action_space = action_space + + def __init__(self, action_space=None, + axis=-1, alpha=0.9, dt=0.01, + lower_bound=-1.0, mean: np.ndarray = None, + sigma=1.0, theta=0.15, + upper_bound=1.0, + **kwargs): + self.action_space = action_space self.alpha = alpha self.axis = axis - self.ou_gen = OU_Noise_generator(np.zeros(action_space) if mean=="None" else mean, sigma , theta=theta, dt=dt, x_initial=None) + self.ou_gen = OU_Noise_generator(np.zeros( + action_space) if mean == "None" else mean, sigma, theta=theta, + dt=dt, x_initial=None) self.eps = self.ou_gen() self.lower_bound = lower_bound - self.sigma = sigma - self._name = "ou_noise" + self.sigma = sigma + self._name = "ou_noise" self.upper_bound = upper_bound - - + def __call__(self, action): action += self.eps self.eps = self.alpha*self.eps + self.ou_gen() - return clip_by_value(action, clip_value_min=self.lower_bound, clip_value_max=self.upper_bound) + return clip_by_value(action, + clip_value_min=self.lower_bound, + clip_value_max=self.upper_bound) def load(self, path): self.__dict__ = load_data(path+self.name) - + @property def name(self): return self._name - + def reset(self, ): self.eps = self.ou_gen() - + def save(self, path): save_data(path+self.name, self.__dict__) def test(self, action): - return clip_by_value(action, clip_value_min=self.lower_bound, clip_value_max=self.upper_bound) \ No newline at end of file + return clip_by_value(action, + clip_value_min=self.lower_bound, + clip_value_max=self.upper_bound) diff --git a/rl_lib/src/explore_env/soft_q.py b/rl_lib/src/explore_env/soft_q.py index 737c8a1..bc87c20 100644 --- a/rl_lib/src/explore_env/soft_q.py +++ b/rl_lib/src/explore_env/soft_q.py @@ -1,50 +1,57 @@ -from tensorflow.keras.activations import softmax -from tensorflow.math import argmax -from tensorflow.dtypes import int32 from tensorflow import expand_dims -from tensorflow.math import log +from tensorflow.dtypes import int32 +from tensorflow.keras.activations import softmax +from tensorflow.math import argmax, log from tensorflow.random import categorical +from rl_lib.src.algoritms.model_free.value_based.base_algo import Base_Algo + +from ..data_saver.utils import load_data, save_data from .base_explore import Base_Explore -from ..data_saver.utils import save_data, load_data -from rl_lib.src.algoritms.base_algo import Base_Algo + class Soft_Q(Base_Explore): - """Больцмановская стратегия исследования - a = softmax(Q/tau) - - Kwargs: - tau: float, Больцмановская температура - axis: int, Ось вычислений - """ - def __init__(self, decay = 0, tau=1.0, axis=-1, **kwargs): - self.decay = decay - self.tau = tau - self.axis = axis - self._name = "soft_q_strategy" - - def __call__(self, Q) -> int: - """Возвращает действие в соответствии с стратегией исследования""" - probability = softmax(expand_dims(Q, 0)/self.tau, axis=self.axis) - self.tau = self.tau * self.decay - return Base_Algo.squeeze_predict(categorical(log(probability), 1, dtype=int32)) - - @property - def name(self): - return self._name - - def load(self, path) -> None: - 
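# --- Worked example (not part of the patch): a textbook Euler-Maruyama step of
# the Ornstein-Uhlenbeck process,
#   x_{t+dt} = x_t + theta * (mu - x_t) * dt + sigma * sqrt(dt) * N(0, 1),
# which is the form OU_Noise_generator above discretises (note the generator
# also passes sigma as the normal's scale, so its noise term effectively scales
# as sigma**2 * sqrt(dt)). All constants below are made-up.
import numpy as np

rng = np.random.default_rng(0)
theta, mu, sigma, dt = 0.15, 0.0, 0.2, 1e-2
x = 0.0
trajectory = []
for _ in range(5):
    x = x + theta * (mu - x) * dt + sigma * np.sqrt(dt) * rng.standard_normal()
    trajectory.append(round(float(x), 4))
print(trajectory)  # mean-reverting noise around mu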
"""Загружает какие либо внутренние переменные""" - self.__dict__ = load_data(path+self.name) - - def reset(self, ) -> None: - """Выполняет внутренний сброс""" - pass - - def save(self, path) -> None: - """Сохраняет какие либо внутренние переменные""" - save_data(path+self.name, self.__dict__) - - def test(self, Q) -> int: - """Возвращает действие в соответствии с стратегией тестирования""" - return argmax(Q, axis=self.axis, output_type=int32) + """Больцмановская стратегия исследования + a = softmax(Q/tau) + + Kwargs: + tau: float, Больцмановская температура + axis: int, Ось вычислений + """ + + def __init__(self, decay=0, tau=1.0, axis=-1, **kwargs): + self.decay = decay + self.tau = tau + self.axis = axis + self._name = "soft_q_strategy" + + def __call__(self, Q) -> int: + """Возвращает действие в соответствии с стратегией исследования""" + probability = softmax(expand_dims(Q, 0)/self.tau, axis=self.axis) + self.tau = self.tau * self.decay + return Base_Algo.squeeze_predict( + categorical( + log(probability), + 1, + dtype=int32) + ) + + @property + def name(self): + return self._name + + def load(self, path) -> None: + """Загружает какие либо внутренние переменные""" + self.__dict__ = load_data(path+self.name) + + def reset(self, ) -> None: + """Выполняет внутренний сброс""" + pass + + def save(self, path) -> None: + """Сохраняет какие либо внутренние переменные""" + save_data(path+self.name, self.__dict__) + + def test(self, Q) -> int: + """Возвращает действие в соответствии с стратегией тестирования""" + return argmax(Q, axis=self.axis, output_type=int32) diff --git a/rl_lib/src/explore_env/tests/test_epsilon_greedy.py b/rl_lib/src/explore_env/tests/test_epsilon_greedy.py index 93d4b80..4f7aea1 100644 --- a/rl_lib/src/explore_env/tests/test_epsilon_greedy.py +++ b/rl_lib/src/explore_env/tests/test_epsilon_greedy.py @@ -1,19 +1,19 @@ from ..epsilon_greedy import Epsilon_Greedy + class Test_Epsilon_Greedy: - def __init__(config): - self.strategy = Epsilon_Gredy(**config) - self.config = config + def __init__(self, config): + self.strategy = Epsilon_Greedy(**config) + self.config = config + + def test_reset(self): + pass + + def test_save(self): + pass - def test_reset(self): - pass + def test_call(self): + pass - def test_save(self): - pass - - def test_call(self): - pass - - def test_test(self): - pass - + def test_test(self): + pass diff --git a/rl_lib/src/explore_env/tests/test_exploration_manager.py b/rl_lib/src/explore_env/tests/test_exploration_manager.py index 368c5a5..0d63a17 100644 --- a/rl_lib/src/explore_env/tests/test_exploration_manager.py +++ b/rl_lib/src/explore_env/tests/test_exploration_manager.py @@ -1,19 +1,19 @@ -from ..exploration_manger import Exploration_Manger +from ..exploration_manager import Exploration_Manger + class Test_Epsilon_Greedy: - def __init__(config): - self.strategy = Exploration_Manger(**config) - self.config = config + def __init__(self, config): + self.strategy = Exploration_Manger(**config) + self.config = config + + def test_reset(self): + pass + + def test_save(self): + pass - def test_reset(self): - pass + def test_call(self): + pass - def test_save(self): - pass - - def test_call(self): - pass - - def test_test(self): - pass - + def test_test(self): + pass diff --git a/rl_lib/src/explore_env/tests/test_soft_q.py b/rl_lib/src/explore_env/tests/test_soft_q.py index 301cbbb..7016b0d 100644 --- a/rl_lib/src/explore_env/tests/test_soft_q.py +++ b/rl_lib/src/explore_env/tests/test_soft_q.py @@ -1,19 +1,19 @@ from ..soft_q import Soft_Q + class 
Test_Epsilon_Greedy: - def __init__(config): - self.strategy = Soft_Q(**config) - self.config = config + def __init__(self, config): + self.strategy = Soft_Q(**config) + self.config = config + + def test_reset(self): + pass + + def test_save(self): + pass - def test_reset(self): - pass + def test_call(self): + pass - def test_save(self): - pass - - def test_call(self): - pass - - def test_test(self): - pass - + def test_test(self): + pass diff --git a/rl_lib/src/gym_wrappers/obsv_wrapper.py b/rl_lib/src/gym_wrappers/obsv_wrapper.py index 4bd195b..8b396e6 100644 --- a/rl_lib/src/gym_wrappers/obsv_wrapper.py +++ b/rl_lib/src/gym_wrappers/obsv_wrapper.py @@ -1,19 +1,25 @@ import gym import numpy as np -class ConvWrapper(gym.Wrapper): + +class ImageNormWrapper(gym.Wrapper): + """Обертка нормализации наблюдений среды, + если это изображения + + Args: + gym (_type_): _description_ + """ def __init__(self, env): super().__init__(env) def reset(self, seed=40, options={}): - observation, info = self.env.reset(seed=40, options={}) - return self.preprocess(observation), info + observation, info = self.env.reset(seed=40, options={}) + return self.preprocess(observation), info def step(self, action): - observation, reward, done, tr , info = self.env.step(action) + observation, reward, done, tr, info = self.env.step(action) return self.preprocess(observation), reward, done, tr, info def preprocess(self, observation): - observation = (observation- 255/2)/(255/2) + observation = (observation - 255/2)/(255/2) return observation.astype(np.float16) - \ No newline at end of file diff --git a/rl_lib/src/models/base_models.py b/rl_lib/src/models/base_models.py index 6a4c6df..36e9abb 100644 --- a/rl_lib/src/models/base_models.py +++ b/rl_lib/src/models/base_models.py @@ -1,167 +1,175 @@ -from ..data_saver.saver import Saver - import abc + import tensorflow as tf class BaseModel(abc.ABC): - """Абстрактный базовый класс, - представляющий общий интерфейс для всех алгоритмов и моделей в RL-Lib. + """Абстрактный базовый класс, + представляющий общий интерфейс для всех алгоритмов и моделей в RL-Lib. + + Model определяет общие методы для ввода, вывода и базовых вычислений, + которые должны быть реализованы в каждом конкретном алгоритме или модели. - Model определяет общие методы для ввода, вывода и базовых вычислений, - которые должны быть реализованы в каждом конкретном алгоритме или модели. + Этот класс служит в качестве основы для всех других классов в RL-Lib + и обеспечивает единый интерфейс для работы с различными моделями. + """ - Этот класс служит в качестве основы для всех других классов в RL-Lib - и обеспечивает единый интерфейс для работы с различными моделями. 
- """ + def __init__(self, **kwargs): + pass - def __init__(self, **kwargs): - pass + @property + @abc.abstractmethod + def input_spec(self) -> tuple: + """Возвращает кортеж размера входных данных Модели""" - @property - @abc.abstractmethod - def input_spec(self) -> tuple: - """Возвращает кортеж размера входных данных Модели""" + @property + @abc.abstractmethod + def output_spec(self) -> tuple: + """Возвращает кортеж размера выходных данных Модели""" - @property - @abc.abstractmethod - def output_spec(self) -> tuple: - """Возвращает кортеж размера выходных данных Модели""" + @abc.abstractmethod + def initial_state(self) -> None: + """Инициализирует внутреннее состояние реккурентной Модели""" - @abc.abstractmethod - def initial_state(self) -> None: - """Инициализирует внутреннее состояние реккурентной Модели""" + @abc.abstractmethod + def _update_next_state(self) -> None: + """Обновляет внутреннее состояние реккурентной Модели""" - @abc.abstractmethod - def _update_next_state(self) -> None: - """Обновляет внутреннее состояние реккурентной Модели""" class ModelIO(abc.ABC): - def __init__(self, config: dict, **kwargs): - super().__init__(**config,**kwargs) - self._config = config - self.name = kwargs.get("name","") - - @property - def config(self) -> dict: - """Возвращает конфигурацию алгоритма""" - return self._config - - @abc.abstractmethod - def save(self, path) -> None: - """Сохраняет модель в директории""" - - @abc.abstractmethod - def load(self, path) -> None: - """Загружает модель из директории""" + def __init__(self, config: dict, **kwargs): + super().__init__(**config, **kwargs) + self._config = config + self.name = kwargs.get("name", "") + + @property + def config(self) -> dict: + """Возвращает конфигурацию алгоритма""" + return self._config + + @abc.abstractmethod + def save(self, path) -> None: + """Сохраняет модель в директории""" + + @abc.abstractmethod + def load(self, path) -> None: + """Загружает модель из директории""" + class ModelNN(abc.ABC): - """Абстрактрный класс, представляющий модель нейронной сети для вычисления градиента, - обновления весов и извлечения слоев, весов, компиляции модели. - - Kwargs: - model: tf.keras.Model - name: str Необязательно, название модели - """ - - def __init__(self, model_config: dict, **kwargs): - super().__init__(**kwargs) - self.model = model_config.get('model', None) - - def __call__(self, inputs: tf.Tensor) -> tf.Tensor: - return self.model(inputs) - - @abc.abstractmethod - def _prediction_processing(self, inputs: tf.Tensor, **kwargs) -> tf.Tensor: - """Обрабатывает выходы модели перед вычислением лоссов - Args: - inputs: tf.Tensor(dtype=tf.float32) - Returns - outputs: tf.Tensor(dtype=tf.float32 - """ - return inputs - - @abc.abstractclassmethod - def loss(self, target: tf.Tensor, predict: tf.Tensor) -> tf.Tensor: - """Вычисляет и возвращает потери в соответствии с функцией потерь""" - - @abc.abstractclassmethod - def make_mask(self) -> tf.Tensor: - """Создает и возвращает маску для выходов с модели""" - - def prediction_processing(self, inputs: tf.Tensor, **kwargs) -> tf.Tensor: - """Обрабатывает выходы модели перед вычислением лоссов - Args: - inputs: tf.Tensor(dtype=tf.float32) - Returns - outputs: tf.Tensor(dtype=tf.float32 + """Абстрактрный класс, + представляющий модель нейронной сети для вычисления градиента, + обновления весов и извлечения слоев, весов, компиляции модели. 
+ + Kwargs: + model: tf.keras.Model + name: str Необязательно, название модели """ - inputs = inputs[0] if isinstance(inputs, list) else inputs - return self._prediction_processing(inputs, **kwargs) - - def set_new_model(self, model: tf.keras.Model, optimizer: tf.keras.optimizers, jit_compile=True) -> None: - self.model = model - self.model.compile(optimizer=optimizer, jit_compile=jit_compile) - - def validate_args(self): - assert isinstance(self.model, tf.keras.Model), "Передан неверный аргумент, должно быть tf.keras.Model" - - @property - def layers(self, ) -> list: - return self.model.layers - - @property - def weights(self, ) -> list: - return self.model.weights - - @property - def summary(self, ) -> None: - print(self.name) - self.model.summary() - - def get_weights(self, ) -> list: - return self.model.get_weights() - - def set_weights(self, weights: list) -> None: - self.model.set_weights(weights) - - @tf.function(reduce_retracing=True, + + def __init__(self, model_config: dict, **kwargs): + super().__init__(**kwargs) + self.model = model_config.get('model', None) + + def __call__(self, inputs: tf.Tensor) -> tf.Tensor: + return self.model(inputs) + + @abc.abstractmethod + def _prediction_processing(self, inputs: tf.Tensor, **kwargs) -> tf.Tensor: + """Обрабатывает выходы модели перед вычислением лоссов + Args: + inputs: tf.Tensor(dtype=tf.float32) + Returns + outputs: tf.Tensor(dtype=tf.float32 + """ + return inputs + + @abc.abstractclassmethod + def loss(self, target: tf.Tensor, predict: tf.Tensor) -> tf.Tensor: + """Вычисляет и возвращает потери в соответствии с функцией потерь""" + + @abc.abstractclassmethod + def make_mask(self) -> tf.Tensor: + """Создает и возвращает маску для выходов с модели""" + + def prediction_processing(self, inputs: tf.Tensor, **kwargs) -> tf.Tensor: + """Обрабатывает выходы модели перед вычислением лоссов + Args: + inputs: tf.Tensor(dtype=tf.float32) + Returns + outputs: tf.Tensor(dtype=tf.float32 + """ + inputs = inputs[0] if isinstance(inputs, list) else inputs + return self._prediction_processing(inputs, **kwargs) + + def set_new_model(self, model: tf.keras.Model, + optimizer: tf.keras.optimizers, + jit_compile=True) -> None: + self.model = model + self.model.compile(optimizer=optimizer, jit_compile=jit_compile) + + def validate_args(self): + assert isinstance( + self.model, tf.keras.Model), """Передан неверный аргумент, + должно быть tf.keras.Model""" + + @property + def layers(self, ) -> list: + return self.model.layers + + @property + def weights(self, ) -> list: + return self.model.weights + + @property + def summary(self, ) -> None: + print(self.name) + self.model.summary() + + def get_weights(self, ) -> list: + return self.model.get_weights() + + def set_weights(self, weights: list) -> None: + self.model.set_weights(weights) + + @tf.function(reduce_retracing=True, jit_compile=False, - experimental_autograph_options = tf.autograph.experimental.Feature.ALL) - def calculate_gradients(self, **kwargs) -> dict: - """ - Вычисляет градиенты, лосс, td-ошибку - - Kwargs: - dict содержащий батч, таргет, маску, опционально приоритетные веса - - Returns: - dict содержащий лоссы и td-ошибку - """ - with tf.GradientTape(persistent=False) as tape: - Q = self.model(kwargs['state'], training=True) - Q = self.prediction_processing(Q, **kwargs) - if len(Q.shape) != len(kwargs['Qtarget'].shape): Q = tf.expand_dims(Q, -1) - - td_error = kwargs['Qtarget'] - Q - loss = self.loss(kwargs['Qtarget'], Q)*kwargs.get('weights', 1.0) - E_loss = tf.reduce_mean(loss, axis=0) - 
gradients = tape.gradient(E_loss, self.model.trainable_variables) - return {'gradients': gradients, 'loss': loss, 'td_error': td_error} - - @tf.function(reduce_retracing=True, + experimental_autograph_options=tf.autograph.experimental.Feature.ALL) + def calculate_gradients(self, **kwargs) -> dict: + """ + Вычисляет градиенты, лосс, td-ошибку + + Kwargs: + dict содержащий батч, таргет, маску, опционально приоритетные веса + + Returns: + dict содержащий лоссы и td-ошибку + """ + with tf.GradientTape(persistent=False) as tape: + Q = self.model(kwargs['state'], training=True) + Q = self.prediction_processing(Q, **kwargs) + if len(Q.shape) != len(kwargs['Qtarget'].shape): + Q = tf.expand_dims(Q, -1) + + td_error = kwargs['Qtarget'] - Q + loss = self.loss(kwargs['Qtarget'], Q)*kwargs.get('weights', 1.0) + E_loss = tf.reduce_mean(loss, axis=0) + gradients = tape.gradient(E_loss, self.model.trainable_variables) + return {'gradients': gradients, 'loss': loss, 'td_error': td_error} + + @tf.function(reduce_retracing=True, jit_compile=False, - experimental_autograph_options = tf.autograph.experimental.Feature.ALL) - def update_weights(self, **kwargs) -> dict: - """ - Выполняет шаг отимизатора - - Kwargs: - dict содержащий батч, таргет, маску, опционально приоритетные веса - - Returns: - dict содержащий лоссы и td-ошибку - """ - gradients, loss, td_error = self.calculate_gradients(**kwargs).values() - self.model.optimizer.apply_gradients(zip(gradients, self.model.trainable_variables)) - return {'loss': loss, 'td_error': td_error} + experimental_autograph_options=tf.autograph.experimental.Feature.ALL) + def update_weights(self, **kwargs) -> dict: + """ + Выполняет шаг отимизатора + + Kwargs: + dict содержащий батч, таргет, маску, опционально приоритетные веса + + Returns: + dict содержащий лоссы и td-ошибку + """ + gradients, loss, td_error = self.calculate_gradients(**kwargs).values() + self.model.optimizer.apply_gradients( + zip(gradients, self.model.trainable_variables)) + return {'loss': loss, 'td_error': td_error} diff --git a/rl_lib/src/models/model.py b/rl_lib/src/models/model.py index 0a3c8c9..1076ad7 100644 --- a/rl_lib/src/models/model.py +++ b/rl_lib/src/models/model.py @@ -1,63 +1,82 @@ -import tensorflow as tf -import numpy as np import abc + +import numpy as np +import tensorflow as tf from tensorflow.keras.models import clone_model -from .base_models import ModelNN, ModelIO, BaseModel from ..optimizers.optimizer import get_optimizer +from .base_models import BaseModel, ModelIO, ModelNN + class Model(ModelNN, ModelIO, BaseModel, abc.ABC): - """Абстрактный класс модели, который соединяет все методы классов ModelNN, ModelIO, BaseModel""" - def __init__(self, **config: dict): - super().__init__(**config) - self.initial_model() - - def _initial_model(self): - input_shape = self._config['model_config']["input_shape"] - action_space = self._config['model_config']["action_space"] - if len(input_shape) == 1: - return self.create_model(input_shape, action_space) - else: - return self.create_model_with_conv(input_shape, action_space) - - def check_input_shape(self, inputs, key=None): - if not isinstance(inputs, (tf.Tensor, np.ndarray)): - for key, inpt in inputs.items() if isinstance(inputs, dict) else enumerate(inputs): - inputs[key] = self.check_input_shape(inpt, key=key) - return inputs - while len(inputs.shape) < len(self.input_spec(key=key)): - inputs = tf.expand_dims(inputs,0) - if len(inputs.shape) > len(self.input_spec(key=key)): assert 0 #inputs.shape не может быть больше входа модели - 
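# --- Minimal sketch (not part of the patch) of the pattern used by
# ModelNN.calculate_gradients / update_weights above: compute a TD error inside
# a GradientTape, reduce it to a scalar loss and apply the optimizer. The tiny
# model, MSE stand-in loss and fake batch are made-up placeholders.
import tensorflow as tf

model = tf.keras.Sequential([tf.keras.layers.Dense(4, activation="relu"),
                             tf.keras.layers.Dense(2)])
optimizer = tf.keras.optimizers.Adam(1e-3)

state = tf.random.normal((32, 3))
q_target = tf.random.normal((32, 2))

with tf.GradientTape() as tape:
    q = model(state, training=True)
    td_error = q_target - q                     # per-element TD error
    loss = tf.reduce_mean(tf.square(td_error))  # scalar loss for the tape

gradients = tape.gradient(loss, model.trainable_variables)
optimizer.apply_gradients(zip(gradients, model.trainable_variables))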
return inputs - - def initial_model(self): - """Инициализирует модель в соответствии с типом алгоритма""" - if str(self.config['model_config']['model']) == 'None': model = self._initial_model() - else: model = clone_model(self.config['model_config']['model']) - optimizer = self.config.get("optimizer_config") - optimizer = get_optimizer(**optimizer) - self.set_new_model(model, optimizer) - - def input_spec(self, key=None): - if key!=None: return self.model.input[key].shape - return self.model.input.shape - - def load(self, path): - self.model = tf.keras.models.load_model(path+self.name+'.h5') - - def output_spec(self): - """Возвращает кортеж размера выходных данных Модели""" - return self.model.layers[-1].output_shape - - def save(self, path): - self.model.save(path+self.name+'.h5') - - @staticmethod - @abc.abstractclassmethod - def create_model(input_shape: tuple, action_space: int) -> tf.keras.Model: - """Создает модель по умолчанию и возвращает tf.keras.Model, архитектура в соответствии с алгоритмом, начальные слои - полносвязные""" - - @staticmethod - @abc.abstractclassmethod - def create_model_with_conv(input_shape: tuple, action_space: int) -> tf.keras.Model: - """Создает модель по умолчанию и возвращает tf.keras.Model, архитектура в соответствии с алгоритмом, начальные слои - сверточные""" + """Абстрактный класс модели, + который соединяет все методы классов ModelNN, ModelIO, BaseModel + """ + + def __init__(self, **config: dict): + super().__init__(**config) + self.initial_model() + + def _initial_model(self): + input_shape = self._config['model_config']["input_shape"] + action_space = self._config['model_config']["action_space"] + if len(input_shape) == 1: + return self.create_model(input_shape, action_space) + else: + return self.create_model_with_conv(input_shape, action_space) + + def check_input_shape(self, inputs, key=None): + if not isinstance(inputs, (tf.Tensor, np.ndarray)): + print(inputs) + for key, inpt in inputs.items() if isinstance(inputs, dict) else enumerate(inputs): + inputs[key] = self.check_input_shape(inpt, key=key) + return inputs + while len(inputs.shape) < len(self.input_spec(key=key)): + inputs = tf.expand_dims(inputs, 0) + if len(inputs.shape) > len(self.input_spec(key=key)): + assert 0 # inputs.shape не может быть больше входа модели + return inputs + + def initial_model(self): + """Инициализирует модель в соответствии с типом алгоритма""" + if str(self.config['model_config']['model']) == 'None': + model = self._initial_model() + else: + model = clone_model(self.config['model_config']['model']) + optimizer = self.config.get("optimizer_config") + optimizer = get_optimizer(**optimizer) + self.set_new_model(model, optimizer) + + def input_spec(self, key=None): + if key is not None: + return self.model.input[key].shape + elif isinstance(self.model.input, list): + if self.lstm_size: + return self.model.input[0].shape + return self.model.input.shape + + def load(self, path): + self.model = tf.keras.models.load_model(path+self.name+'.keras') + + def output_spec(self): + """Возвращает кортеж размера выходных данных Модели""" + return self.model.layers[-1].output_shape + + def save(self, path): + self.model.save(path+self.name+'.keras') + + @staticmethod + @abc.abstractclassmethod + def create_model(input_shape: tuple, + action_space: int) -> tf.keras.Model: + """Создает модель по умолчанию и возвращает tf.keras.Model, + архитектура в соответствии с алгоритмом, начальные слои - полносвязные + """ + + @staticmethod + @abc.abstractclassmethod + def 
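# --- Illustrative sketch (not part of the patch): what check_input_shape above
# does for a plain tensor -- prepend batch (and, for recurrent inputs, time)
# dimensions until the observation's rank matches the model input's rank.
import tensorflow as tf

model_input_shape = tf.TensorShape([None, 4])    # e.g. a Dense model, rank 2
observation = tf.constant([0.1, 0.2, 0.3, 0.4])  # rank 1, a single observation

while len(observation.shape) < len(model_input_shape):
    observation = tf.expand_dims(observation, 0)

print(observation.shape)  # (1, 4) -- now directly usable as model(observation)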
create_model_with_conv(input_shape: tuple, + action_space: int) -> tf.keras.Model: + """Создает модель по умолчанию и возвращает tf.keras.Model, + архитектура в соответствии с алгоритмом, начальные слои - сверточные + """ diff --git a/rl_lib/src/normalizes.py b/rl_lib/src/normalizers.py similarity index 59% rename from rl_lib/src/normalizes.py rename to rl_lib/src/normalizers.py index 6506aa1..9766a87 100644 --- a/rl_lib/src/normalizes.py +++ b/rl_lib/src/normalizers.py @@ -1,8 +1,11 @@ +import numpy as np -def normalize_m1_1(x): + +def normalize_m1_1(x: np.ndarray) -> np.ndarray: """Нормализует RGB изображение в диапазон [-1, 1].""" return x / 127.5 - 1 -def normalize_01(x): + +def normalize_01(x: np.ndarray) -> np.ndarray: """Нормализует RGB изображение в диапазон [0, 1].""" - return x / 255.0 \ No newline at end of file + return x / 255.0 diff --git a/rl_lib/src/optimizers/__init__.py b/rl_lib/src/optimizers/__init__.py index f8835c4..9267e9a 100644 --- a/rl_lib/src/optimizers/__init__.py +++ b/rl_lib/src/optimizers/__init__.py @@ -1,11 +1,12 @@ try: - import tensorflow_addons as tfa + import tensorflow_addons as tfa except ImportError: - try: - import subprocess - import sys - subprocess.check_call([sys.executable, "-m", "pip", "install", "tensorflow_addons"]) - import tensorflow_addons - except ImportError: - print("Не удалось установить и импортировать TENSORFLOW_ADDONS") - raise SystemExit(1) + try: + import subprocess + import sys + subprocess.check_call( + [sys.executable, "-m", "pip", "install", "tensorflow_addons"]) + import tensorflow_addons + except ImportError: + print("Не удалось установить и импортировать TENSORFLOW_ADDONS") + raise SystemExit(1) diff --git a/rl_lib/src/optimizers/optimizer.py b/rl_lib/src/optimizers/optimizer.py index 15561a4..bccc8ef 100644 --- a/rl_lib/src/optimizers/optimizer.py +++ b/rl_lib/src/optimizers/optimizer.py @@ -1,29 +1,31 @@ import tensorflow.keras.optimizers as optimizers import tensorflow_addons as tfa -def get_optimizer(optimizer_name: str = "adam", optimizer_params: dict = {}, custom_optimizer: object = None) -> object: - """Возврщает настроенный оптимизатор. - Доступные оптимизаторы tensorflow: - Adam - LAMB - Adadelta - RMSprop - Args: - optimizer: str: Название оптимизатора - optimizer_params: dict: Параметры оптимизатора - cutom_optimizer: object: Класс кастомного потимизатора - """ - if optimizer_name.lower() == 'adam': - return optimizers.Adam(**optimizer_params) - - elif optimizer_name.lower() == 'lamb': - return tfa.optimizers.LAMB(**optimizer_params) - elif optimizer_name.lower() == 'cutom' and type(custom_optimizer) != None: - return custom_optimizer(**optimizer_params) +def get_optimizer(optimizer_name: str = "adam", optimizer_params: dict = {}, + custom_optimizer: object = None) -> object: + """Возврщает настроенный оптимизатор. 
+ Доступные оптимизаторы tensorflow: + Adam + LAMB + Adadelta + RMSprop + Args: + optimizer: str: Название оптимизатора + optimizer_params: dict: Параметры оптимизатора + cutom_optimizer: object: Класс кастомного потимизатора + """ + if optimizer_name.lower() == 'adam': + return optimizers.Adam(**optimizer_params) - elif optimizer_name.lower() == 'adadelta': - return optimizers.Adam(**optimizer_params) + elif optimizer_name.lower() == 'lamb': + return tfa.optimizers.LAMB(**optimizer_params) - elif optimizer_name.lower() == 'rmsprop': - return optimizers.Adam(**optimizer_params) + elif optimizer_name.lower() == 'cutom' and type(custom_optimizer) is not None: + return custom_optimizer(**optimizer_params) + + elif optimizer_name.lower() == 'adadelta': + return optimizers.Adam(**optimizer_params) + + elif optimizer_name.lower() == 'rmsprop': + return optimizers.Adam(**optimizer_params) diff --git a/rl_lib/src/replay_buffers/dict_array.py b/rl_lib/src/replay_buffers/dict_array.py index 752da93..4ff47bc 100644 --- a/rl_lib/src/replay_buffers/dict_array.py +++ b/rl_lib/src/replay_buffers/dict_array.py @@ -1,79 +1,103 @@ from typing import Any + import numpy as np + class StructArray: - """Структурированный массив""" + """Структурированный массив + """ def __init__(self, shape, dict_keys, dtype=object) -> None: - self.data = np.zeros(shape=shape, - dtype=( - [ - (key, dtype) for key in sorted(dict_keys) - ] - ) - ) self.dict_keys = sorted(dict_keys) + self.data = np.zeros(shape=shape, + dtype=( + [ + (key, dtype) for key in self.dict_keys + ] + ) + ) self.dtype = dtype - + def __getitem__(self, index): data = self.data[index] return {key: np.asarray(data[key]).astype(np.float32) - if isinstance(index, int) else StructArray.stack(data[key], axis=0).astype(np.float32) + if isinstance(index, + int) else StructArray.stack(data[key], + axis=0).astype( + np.float32) for key in self.dict_keys} - + def __setitem__(self, index, values): "values = (state, action, reward, next_state, done, *other_data)" self.data[index] = tuple(values[key] for key in self.dict_keys) @staticmethod def stack(array, axis=0): - return np.stack(array, axis=axis).astype(np.float32) + if isinstance(array, np.ndarray): + if len(array.shape) > 1: + return np.asarray( + tuple(StructArray.stack(array[i]) + for i in range(array.shape[0]) + ) + ) + return np.stack(array, axis=axis).astype(np.float32) + return array + class NonStructArray: """Не структурированный массив""" + def __init__(self, shape, dtype=object) -> None: self.data = np.zeros(shape=shape, dtype=dtype) self.dtype = dtype - + def __getitem__(self, index): return StructArray.stack(self.data[index]) - + def __setitem__(self, index, values): "values = (state, action, reward, next_state, done, *other_data)" self.data[index] = values - + + class DictArray: """ - Класс реализующий сохранение/ извлечение данных в структурированные массивы numpy + Класс реализующий сохранение/ извлечение данных + в структурированные массивы numpy """ + def __init__(self, shape, dtype=object) -> None: self.dtype = dtype self.initialized = False self.shape = shape - self.data = np.zeros((shape[1], ), dtype=object) #В этом массиве мы будем хранить вложенные массивы (s,a,r,s',d) + # В этом массиве мы будем хранить вложенные массивы (s,a,r,s',d) + self.data = np.zeros((shape[1], ), dtype=object) def __getitem__(self, index): return tuple(self.data[i][index] for i in range(self.shape[1])) - + def __setitem__(self, index, values): "values = (state, action, reward, next_state, done, *other_data)" - if not 
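# --- Usage sketch (not part of the patch): feeding the optimizer_config section
# of the yaml config into get_optimizer above. Values are made-up; only the
# parameters shown in the diff (optimizer_name, optimizer_params,
# custom_optimizer) are used, and the import path is assumed.
from rl_lib.src.optimizers.optimizer import get_optimizer  # assumed importable

optimizer_config = {
    "optimizer_name": "adam",
    "optimizer_params": {"learning_rate": 1e-3, "epsilon": 1e-7},
    "custom_optimizer": None,
}
optimizer = get_optimizer(**optimizer_config)
print(type(optimizer).__name__)  # Adam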
self.initialized: self.init_array(values) + if not self.initialized: + self.init_array(values) for i in range(self.shape[1]): self.data[i][index] = values[i] def choose_array_type(self, data): - if isinstance(data, dict): return self.init_struct_array((self.shape[0], ), data.keys()) - else: return self.init_non_struct_array((self.shape[0], )) + if isinstance(data, dict): + return self.init_struct_array((self.shape[0], ), data.keys()) + else: + return self.init_non_struct_array((self.shape[0], )) def init_array(self, data): for i, d in zip(range(self.shape[0]), data): self.data[i] = self.choose_array_type(d) - self.initialized=True + self.initialized = True def init_struct_array(self, shape, dict_keys): return StructArray(shape, dict_keys, dtype=self.dtype) - + def init_non_struct_array(self, shape): return NonStructArray(shape=shape, dtype=self.dtype) - - \ No newline at end of file + @property + def data_array(self): + return tuple(array.data for array in self.data) \ No newline at end of file diff --git a/rl_lib/src/replay_buffers/priority_buffers.py b/rl_lib/src/replay_buffers/priority_buffers.py index bb35b0e..ef6ac5d 100644 --- a/rl_lib/src/replay_buffers/priority_buffers.py +++ b/rl_lib/src/replay_buffers/priority_buffers.py @@ -1,16 +1,18 @@ import numpy as np -from ..data_saver.utils import save_data, load_data + +from ..data_saver.utils import load_data, save_data from .random_buffers import Random_Buffer, Random_Recurrent_Buffer + class Sum_Tree: def __init__(self, size): - self.tree = np.zeros(2*size - 1, dtype = np.float64) + self.tree = np.zeros(2*size - 1, dtype=np.float64) self.size = size self.count = 0 self.real_size = 0 def clear(self, ): - self.tree = np.zeros(self.tree.shape, dtype = np.float64) + self.tree = np.zeros(self.tree.shape, dtype=np.float64) self.count = 0 self.real_size = 0 @@ -19,17 +21,18 @@ def total(self): return self.tree[0] def update(self, data_idx, value): - assert type(data_idx)!=np.array and type(data_idx)!=list and type(data_idx)!=tuple + assert not isinstance(data_idx, np.ndarray) or not isinstance( + data_idx, list) or not isinstance(data_idx, tuple) idx = data_idx + self.size - 1 change = value - self.tree[idx] self.tree[idx] = value parent = (idx - 1) // 2 - idx =[] + idx = [] idx.append(parent) while parent > 0: - parent = (parent - 1) // 2 - idx.append(parent) - parent = np.asarray(idx, dtype = np.int32) + parent = (parent - 1) // 2 + idx.append(parent) + parent = np.asarray(idx, dtype=np.int32) self.tree[parent] += change def add(self, value): @@ -44,14 +47,15 @@ def get(self, s): left = 2 * idx + 1 right = left + 1 while np.any(idx != left): - idx = np.where(s <= self.tree[left], left, right) - s = np.where(s <= self.tree[left], s, s - self.tree[left]) - left = 2 * idx + 1 - left = np.where(left >= self.tree.shape[0], idx, left) - right = np.where(left >= self.tree.shape[0], right, left + 1) + idx = np.where(s <= self.tree[left], left, right) + s = np.where(s <= self.tree[left], s, s - self.tree[left]) + left = 2 * idx + 1 + left = np.where(left >= self.tree.shape[0], idx, left) + right = np.where(left >= self.tree.shape[0], right, left + 1) data_idx = idx - self.size + 1 - return data_idx, self.tree[idx] + return data_idx, self.tree[idx] + class Prioritized_Replay_Buffer(Random_Buffer): ''' @@ -59,7 +63,8 @@ class Prioritized_Replay_Buffer(Random_Buffer): size: int n_step: int discount_factor: float - num_var: int (Кол-во сохраянемых переменных, по умполчанию 5 (s, a, r, d, s')) + num_var: int (Кол-во сохраянемых переменных, + по 
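# --- Usage sketch (not part of the patch): what DictArray above stores -- one
# nested array per element of the transition tuple, with dict observations
# routed to a structured (named-field) numpy array. Shapes and values are
# made-up; the import path is the module path from this diff.
import numpy as np

from rl_lib.src.replay_buffers.dict_array import DictArray  # assumed importable

buffer = DictArray(shape=(8, 5))           # capacity 8, 5 stored variables
obs = {"pov": np.zeros((2, 2), np.float32), "vector": np.ones(3, np.float32)}
next_obs = {"pov": np.ones((2, 2), np.float32), "vector": np.zeros(3, np.float32)}

buffer[0] = (obs, 1, 0.5, 0.0, next_obs)   # (s, a, r, done, s')
state, action, reward, done, next_state = buffer[0]
print(sorted(state.keys()))                # ['pov', 'vector']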
умполчанию 5 (s, a, r, d, s')) eps: float alpha: float beta: float @@ -67,72 +72,80 @@ class Prioritized_Replay_Buffer(Random_Buffer): beta_changing_curve: str max_priority: float ''' + def __init__(self, **kwargs): size = kwargs.get("size", 100000) Random_Buffer.__init__(self, **kwargs) self.name = "Prioritized_Replay_Buffer" - + self.tree = Sum_Tree(size=size) # PER params - self.eps = kwargs.get("eps", 1e-2) - self.alpha = kwargs.get("alpha", 0.6) - self.beta = kwargs.get("beta", 0.4) + self.eps = kwargs.get("eps", 1e-2) + self.alpha = kwargs.get("alpha", 0.6) + self.beta = kwargs.get("beta", 0.4) self.beta_changing = kwargs.get("beta_changing", 5e-4) self.beta_changing_curve = kwargs.get("beta_changing_curve", 'linear') - self.max_priority = kwargs.get("max_priority", 1e-2) + self.max_priority = kwargs.get("max_priority", 1e-2) def clear(self, ): Random_Buffer.clear(self,) self.tree.clear() - def add(self, samples, priority = None): + def add(self, samples, priority=None): '''samples -> tuple(s,a,r,d,s') - priority -> float если передается, то приоритет в буфере выставлется по преданному числу, - по умолчанию вычисляется по self.max_priotiry + priority -> float если передается, + то приоритет в буфере выставлется по преданному числу, + по умолчанию вычисляется по self.max_priotiry ''' if Random_Buffer.add(self, samples): - self.tree.add(self.max_priority if priority == None else priority) + self.tree.add(self.max_priority if priority is None else priority) assert self.count == self.tree.count and self.real_size == self.tree.real_size, "tree and has same real sizes" def sample(self, batch_size): data_idxs, weights = self._get_idx(batch_size) - return {**Random_Buffer.sample(self, batch_size, data_idxs), 'data_idxs': data_idxs, 'weights': weights} - + return {**Random_Buffer.sample(self, batch_size, data_idxs), + 'data_idxs': data_idxs, 'weights': weights} + def update_priorities(self, data_idxs, priorities): priorities = self._calculate_new_priority(priorities) self.max_priority = max(self.max_priority, max(priorities)) for data_idx, priority in zip(data_idxs, priorities): - self.tree.update(data_idx, priority) + self.tree.update(data_idx, priority) def save(self, path): Random_Buffer.save(self, path + "Random_Buffer_") path += self.name save_data(path, { - 'tree': self.tree.tree, - 'tree_count': self.tree.count, - 'tree_real_size': self.tree.real_size - }) + 'tree': self.tree.tree, + 'tree_count': self.tree.count, + 'tree_real_size': self.tree.real_size + }) def load(self, path): - Random_Buffer.load(self, path + "Random_Buffer_") + Random_Buffer.load(self, path + "Random_Buffer_") path += self.name data = load_data(path) self.tree.tree = data['tree'] self.tree.count = data['tree_count'] - self.tree.real_size = data['tree_real_size'] - + self.tree.real_size = data['tree_real_size'] + def _get_idx(self, batch_size): assert self.real_size >= batch_size, "buffer contains less samples than batch size" segment = self.tree.total / batch_size - segment_array = np.random.uniform(segment * np.arange(batch_size), segment * (np.arange(batch_size) + 1)) + segment_array = np.random.uniform( + segment * np.arange(batch_size), + segment * (np.arange(batch_size) + 1) + ) data_idxs, priorities = self.tree.get(segment_array) weights = self._calculate_weights(priorities) if self.beta_changing_curve.lower() == 'exponential': precision = len(str(self.beta_changing).split('.')[1]) - self.beta = round(1 - np.power(np.exp, -self.beta*self.beta_changing), precision) - else: self.beta = min(1, 
self.beta*self.beta_changing) + self.beta = round( + 1 - np.power(np.exp, -self.beta*self.beta_changing), precision) + else: + self.beta = min(1, self.beta*self.beta_changing) return data_idxs, weights @@ -146,51 +159,67 @@ def _calculate_weights(self, weights): weights = weights.astype(np.float32) return weights -class Prioritized_Replay_Recurrent_Buffer(Prioritized_Replay_Buffer, Random_Recurrent_Buffer, Random_Buffer): + +class Prioritized_Replay_Recurrent_Buffer(Prioritized_Replay_Buffer, + Random_Recurrent_Buffer, + Random_Buffer): def __init__(self, **kwargs): kwargs["num_var"] = 7 self.trace_length = kwargs.get("trace_length", 10) Prioritized_Replay_Buffer.__init__(self, **kwargs) Random_Recurrent_Buffer.__init__(self, **kwargs) self.name = "Prioritized_Replay_Recurrent_Buffer" - + kwargs["size"] = self.trace_length - self.trace_window = Random_Buffer(**kwargs) #нужно для того чтобы граничные индексы кольцевого буфера из приоритетного выбора были с историческими данными + """нужно для того чтобы граничные индексы кольцевого буфера + из приоритетного выбора были с историческими данными + """ + self.trace_window = Random_Buffer(**kwargs) def clear(self, ): Prioritized_Replay_Buffer.clear(self,) self.trace_window.clear() - def add(self, samples, priority = None): + def add(self, samples, priority=None): if self.trace_window.real_size != self.trace_length: - self.trace_window.add(samples) + self.trace_window.add(samples) else: - if self.real_size != self.size: self.trace_window.add(samples) - else: self.trace_window.add(self.data[self.count]) - Prioritized_Replay_Buffer.add(self, samples, priority) + if self.real_size != self.size: + self.trace_window.add(samples) + else: + self.trace_window.add(self.data[self.count]) + Prioritized_Replay_Buffer.add(self, samples, priority) def sample(self, batch_size): - if self.data[-1][1] == 0: self.data[-self.trace_length:] = self.trace_window.data - data_idxs, weights = Prioritized_Replay_Buffer._get_idx(self, batch_size) + if self.data[-1][1] == 0: + self.data[-self.trace_length:] = self.trace_window.data.data_array + data_idxs, weights = Prioritized_Replay_Buffer._get_idx( + self, batch_size) data = Random_Recurrent_Buffer.sample(self, batch_size, data_idxs) data = self.add_trace_window(data, data_idxs) - return {**data, 'data_idxs': data_idxs, 'weights': weights} + return {**data, 'data_idxs': data_idxs, 'weights': weights} def add_trace_window(self, data, data_idxs): - error_idx = np.where((data_idxs < self.count + self.trace_length) & (data_idxs > self.count))[0] + error_idx = np.where( + (data_idxs < self.count + self.trace_length) & + (data_idxs > self.count) + )[0] errors = data_idxs[error_idx] count = abs(errors - self.count - self.trace_length) for e, c in zip(error_idx, count): repair_data = self.get_repair_data(c) z = np.arange(c, 0, -1)-1 for key in data.keys(): - if key in ('h_t', 'c_t'): z = np.arange(2, 0, -1)-1 - data[key][e][z] = repair_data[key][-2:] if key in ('h_t', 'c_t') else repair_data[key] + if key in ('h_t', 'c_t'): + z = np.arange(2, 0, -1)-1 + data[key][e][z] = repair_data[key][-2:] if key in ( + 'h_t', 'c_t') else repair_data[key] return data def get_repair_data(self, count): - l = np.arange(count) + 1 - return self.trace_window.sample(count, self.trace_window.count - l) + length = np.arange(count) + 1 + return self.trace_window.sample(count, + self.trace_window.count - length) def save(self, path): Prioritized_Replay_Buffer.save(self, path) @@ -198,4 +227,4 @@ def save(self, path): def load(self, path): 
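# --- Worked example (not part of the patch): the textbook proportional
# prioritized-replay quantities that the Sum_Tree sampling above is built on
# (Schaul et al., 2016). The TD errors, eps, alpha and beta are made-up numbers.
import numpy as np

td_errors = np.array([0.5, 0.1, 2.0, 0.05])
eps, alpha, beta = 1e-2, 0.6, 0.4

priorities = (np.abs(td_errors) + eps) ** alpha   # values stored in the tree
probs = priorities / priorities.sum()             # P(i) = p_i / sum_j p_j
weights = (len(td_errors) * probs) ** (-beta)     # importance-sampling weights
weights /= weights.max()                          # normalise so max weight is 1

print(probs.round(3))    # sampling probabilities, sum to 1
print(weights.round(3))  # largest weight goes to the lowest-priority sample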
Prioritized_Replay_Buffer.load(self, path) - self.trace_window.load(path+'small_buffer_') + self.trace_window.load(path+'small_buffer_') diff --git a/rl_lib/src/replay_buffers/random_buffers.py b/rl_lib/src/replay_buffers/random_buffers.py index 6781362..d302688 100644 --- a/rl_lib/src/replay_buffers/random_buffers.py +++ b/rl_lib/src/replay_buffers/random_buffers.py @@ -1,7 +1,9 @@ import numpy as np -from ..data_saver.utils import save_data, load_data + +from ..data_saver.utils import load_data, save_data from .dict_array import DictArray + class _n_step_buffer: def __init__(self, **kwargs): self.buffer = [[]] @@ -14,92 +16,109 @@ def clear(self, ): def add(self, memory_tuplet): state, action, reward, done, next_state, *another_data = memory_tuplet - if len(self.buffer[0])==0: - self.buffer[-1]= [ state, action, [reward], None, None, *[None for _ in range(len(another_data))]] + if len(self.buffer[0]) == 0: + self.buffer[-1] = [state, action, [reward], None, + None, *[None for _ in range(len(another_data))]] - if len(self.buffer[0][2]) == self.steps-1 or (len(self.buffer[0][2]) == self.steps and self.steps == 1): - self.buffer[0][3]=int(done) - self.buffer[0][4]=next_state - for i in range(len(another_data)): - self.buffer[0][i+5]=another_data[i] + if len( + self.buffer[0][2] + ) == self.steps-1 or ( + len(self.buffer[0][2]) == self.steps and self.steps == 1 + ): + self.buffer[0][3] = int(done) + self.buffer[0][4] = next_state + for i in range(len(another_data)): + self.buffer[0][i+5] = another_data[i] for j in range(len(self.buffer)-1): - self.buffer[j][2].append(reward * self.discount_factor**len(self.buffer[j][2])) + self.buffer[j][2].append( + reward * self.discount_factor**len(self.buffer[j][2])) if len(self.buffer) != 1: - self.buffer[-1]=[state, action, [reward], None, None, *[None for _ in range(len(another_data))]] + self.buffer[-1] = [state, action, [reward], None, + None, *[None for _ in range(len(another_data))]] self.buffer.append([]) if len(self.buffer) > self.steps: - self.buffer[0][2] = sum(self.buffer[0][2]) - return_data = self.buffer[0] - self.buffer = self.buffer[1:] - return return_data + self.buffer[0][2] = sum(self.buffer[0][2]) + return_data = self.buffer[0] + self.buffer = self.buffer[1:] + return return_data return None class Random_Buffer: '''Сохраняет переходы (s,a,r,d,s') и возвращает батчи. - + Аргументы: size: int. Размер буфера n_step: int. 
N-step алгоритм discount_factor: float - num_var: int (Кол-во сохраянемых переменных, по умполчанию 5 (s, a, r, d, s')) + num_var: int, Кол-во сохраянемых переменных, + по умполчанию 5 (s, a, r, d, s') ''' + def __init__(self, **kwargs): n_step = kwargs.get("n_step", 1) self.size = kwargs.get("size", 100000) - discount_factor = kwargs.get("discount_factor", 0.99) + # discount_factor = kwargs.get("discount_factor", 0.99) num_var = kwargs.get("num_var", 5) - # буфер для хранения перехода - self.data = DictArray((self.size, num_var), dtype=object) + # буфер для хранения перехода + self.data = DictArray((self.size, num_var), dtype=object) self.name = "Random_Buffer" - + # размер буфера self.count = 0 self.real_size = 0 - + self.n_step_buffer = _n_step_buffer(**kwargs) if n_step > 1 else None def clear(self, ): - self.data = DictArray(self.data.shape, dtype=object) + self.data = DictArray(self.data.shape, dtype=object) self.count = 0 self.real_size = 0 - if self.n_step_buffer != None: self.n_step_buffer.clear() + if self.n_step_buffer is not None: + self.n_step_buffer.clear() def add(self, samples: tuple, args=None): """Добавляет данные в буфер s,a,r,n_s,d""" - if self.n_step_buffer != None: - result = self.n_step_buffer.add(samples) - if result != None: - return self._add_data(result) - return False + if self.n_step_buffer is not None: + result = self.n_step_buffer.add(samples) + if result is not None: + return self._add_data(result) + return False else: - return self._add_data(samples) - + return self._add_data(samples) + def sample(self, batch_size, idx=None): """Возвращает батч: dict""" - if np.any(idx) == None: - idx = self._get_idx( batch_size) + if np.any(idx) is None: + idx = self._get_idx(batch_size) data = self.data[idx] state, action, reward, done, next_state = data[:5] other_data = {} + if 5 < self.data.shape[1] <= 7: - other_data = {key: data[i] for i, key in zip(range(5,7), ('h_t', 'c_t'))} + other_data = {key: data[i][:, :2] + for i, key in zip(range(5, 7), ('h_t', 'c_t'))} - return {'state': state, 'action': action, 'reward': reward, 'done': done, 'next_state': next_state, **other_data} + return {'state': state, + 'action': action, + 'reward': reward, + 'done': done, + 'next_state': next_state, + **other_data} def save(self, path): path += self.name save_data(path, { - 'data': self.data, - 'count': self.count, - 'size': self.size, - 'real_size': self.real_size - }) + 'data': self.data, + 'count': self.count, + 'size': self.size, + 'real_size': self.real_size + }) def load(self, path): path += self.name @@ -109,15 +128,15 @@ def load(self, path): self.size = data['size'] self.real_size = data['real_size'] - def _add_data(self, samples): self.data[self.count] = samples self.count = (self.count + 1) % self.size self.real_size = min(self.size, self.real_size + 1) - return True + return True def _get_idx(self, batch_size): - return np.random.choice(self.real_size, size = batch_size, replace = False) + return np.random.choice(self.real_size, size=batch_size, replace=False) + class Random_Recurrent_Buffer(Random_Buffer): ''' @@ -125,37 +144,37 @@ class Random_Recurrent_Buffer(Random_Buffer): size: int. Размер буфера n_step: int. N-step алгоритм discount_factor: float - num_var: int (Кол-во сохраняемых переменных, по умполчанию 7 (s, a, r, d, s', h, c)) + num_var: int, Кол-во сохраняемых переменных, + по умполчанию 7 (s, a, r, d, s', h, c) trace_length: int. 
Длина возращаемой последовательности ''' + def __init__(self, **kwargs): kwargs["num_var"] = 7 - super().__init__(**kwargs) + Random_Buffer.__init__(self, **kwargs) self.name = "Random_Recurrent_Buffer" self.trace_length = kwargs.get("trace_length", 10) - + + def _make_linspace(self, idx): + idx = np.linspace(start=idx - self.trace_length, + stop=idx, num=self.trace_length+1, + dtype=int, axis=1)[:, :-1] + return idx + + def _get_idx(self, batch_size, *args, **kwargs): + if self.real_size != self.size: + return self._make_linspace(np.random.randint(low=self.trace_length, + high=self.real_size, size=(batch_size,))) + else: + return self._make_linspace(np.random.randint( + low=-self.size + self.count + self.trace_length, + high=self.count, size=(batch_size,) + )) + def sample(self, batch_size, idx=None): - if np.any(idx) == None: - idx = self._get_idx_rec(batch_size) - - vector_idx = np.linspace(start=idx - self.trace_length, stop=idx, num = self.trace_length+1, dtype = int, axis=1) - mem_idx = vector_idx[:,:2] - - state = self.stack(self.data[vector_idx[:, :-1], 0], batch_size).astype(np.float32) - action = self.stack(self.data[vector_idx[:, :-1], 1], batch_size).astype(np.int32) - reward = self.data[vector_idx[:, :-1], 2].astype(np.float32) - done = self.data[vector_idx[:, :-1], 3].astype(np.float32) - next_state = self.stack(self.data[vector_idx[:, :-1], 4], batch_size).astype(np.float32) - h_t = self.stack(self.data[mem_idx, 5], batch_size).astype(np.float32) - c_t = self.stack(self.data[mem_idx, 6], batch_size).astype(np.float32) - - return {'state': state, 'action': action, 'reward': reward, 'done': done, 'next_state': next_state, 'h_t': h_t, 'c_t': c_t} - - def _get_idx_rec(self, batch_size): - if self.real_size != self.size: - return np.random.randint(low = self.trace_length, high = self.real_size, size=(batch_size,)) - else: - return np.random.randint(low = -self.size + self.count + self.trace_length, high = self.count, size=(batch_size,)) + if idx is not None: + idx = self._make_linspace(idx) + return Random_Buffer.sample(self, batch_size, idx) def stack(self, data, batch_size): return np.asarray([np.stack(data[i]) for i in range(batch_size)]) diff --git a/rl_lib/src/replay_buffers/replay_buffer.py b/rl_lib/src/replay_buffers/replay_buffer.py index ed7dc79..90565d3 100644 --- a/rl_lib/src/replay_buffers/replay_buffer.py +++ b/rl_lib/src/replay_buffers/replay_buffer.py @@ -1,60 +1,68 @@ -from .random_buffers import * -from .priority_buffers import * +from .priority_buffers import (Prioritized_Replay_Recurrent_Buffer, + Prioritized_Replay_Buffer) +from .random_buffers import Random_Recurrent_Buffer, Random_Buffer + class ReplayBuffer: - """Сохраняет переходы и выполняет сэмплирование батчей - - Kwargs: - priority: bool True если приоритетный - recurrent: bool True если рекуррентный - size: int Размер буфера - n_step: int - discount_factor: float - num_var: int (Кол-во сохраянемых переменных, по умполчанию 5 (s, a, r, d, s')) - eps: float - alpha: float - beta: float - beta_changing: float - beta_changing_curve: str - max_priority: float Максимальный приоритет при добавлении новых данных - trace_length: int. 
Длина возращаемой последовательности - """ - - def __init__(self, **kwargs): - self._config = kwargs - if kwargs.get("priority", 0) : - if kwargs.get("recurrent", 0): self.buffer = Prioritized_Replay_Recurrent_Buffer(**kwargs) - else: self.buffer = Prioritized_Replay_Buffer(**kwargs) - else: - if kwargs.get("recurrent", 0): self.buffer = Random_Recurrent_Buffer(**kwargs) - else: self.buffer = Random_Buffer(**kwargs) - - def add(self, *args): - self.buffer.add(*args) - - @property - def config(self): - return self._config - - def clear(self): - self.buffer.clear() - - def load(self, *args): - self.buffer.load(*args) - - @property - def name(self): - return self.buffer.name - - @property - def real_size(self): - return self.buffer.real_size - - def sample(self, *args): - return self.buffer.sample(*args) - - def save(self, *args): - self.buffer.save(*args) - - def update_priorities(self, *args): - self.buffer.update_priorities(*args) + """Сохраняет переходы и выполняет сэмплирование батчей + + Kwargs: + priority: bool True если приоритетный + recurrent: bool True если рекуррентный + size: int Размер буфера + n_step: int + discount_factor: float + num_var: int, Кол-во сохраянемых переменных, + по умполчанию 5 (s, a, r, d, s') + eps: float + alpha: float + beta: float + beta_changing: float + beta_changing_curve: str + max_priority: float, + Максимальный приоритет при добавлении новых данных + trace_length: int. Длина возращаемой последовательности + """ + + def __init__(self, **kwargs): + self._config = kwargs + if kwargs.get("priority", 0): + if kwargs.get("recurrent", 0): + self.buffer = Prioritized_Replay_Recurrent_Buffer(**kwargs) + else: + self.buffer = Prioritized_Replay_Buffer(**kwargs) + else: + if kwargs.get("recurrent", 0): + self.buffer = Random_Recurrent_Buffer(**kwargs) + else: + self.buffer = Random_Buffer(**kwargs) + + def add(self, *args): + self.buffer.add(*args) + + @property + def config(self): + return self._config + + def clear(self): + self.buffer.clear() + + def load(self, *args): + self.buffer.load(*args) + + @property + def name(self): + return self.buffer.name + + @property + def real_size(self): + return self.buffer.real_size + + def sample(self, *args): + return self.buffer.sample(*args) + + def save(self, *args): + self.buffer.save(*args) + + def update_priorities(self, *args): + self.buffer.update_priorities(*args) diff --git a/rl_lib/src/replay_buffers/tests/test_replay_buffer.py b/rl_lib/src/replay_buffers/tests/test_replay_buffer.py index c3ea3e7..083acbc 100644 --- a/rl_lib/src/replay_buffers/tests/test_replay_buffer.py +++ b/rl_lib/src/replay_buffers/tests/test_replay_buffer.py @@ -1,86 +1,105 @@ -from rl_lib.src.replay_buffers.replay_buffer import ReplayBuffer import os -from shutil import rmtree from copy import deepcopy - +from shutil import rmtree + +from rl_lib.src.replay_buffers.replay_buffer import ReplayBuffer + + class Test_Replay_Buffer: - """ - Производит тестирование буфера + """ + Производит тестирование буфера + + buffer_args: + priority: bool True если приоритетный + recurrent: bool True если рекуррентный + size: int Размер буфера + n_step: int + discount_factor: float + num_var: int, Кол-во сохраянемых переменных, + по умполчанию 5 (s, a, r, d, s') + eps: float + alpha: float + beta: float + beta_changing: float + beta_changing_curve: str + max_priority: float Максимальный приоритет + при добавлении новых данных + trace_length: int. 
Длина возращаемой последовательности + """ + + def __init__(self, buffer_args): + self.buffer = ReplayBuffer(**buffer_args) + self.path = os.getcwd() + '/test_replay_buffer/' + if not os.path.isdir(self.path): + os.mkdir(self.path) + + def __exit__(self): + """Удаляет созданную папку с файлами, если есть""" + if os.path.isdir(self.path): + rmtree(self.path) - buffer_args: - priority: bool True если приоритетный - recurrent: bool True если рекуррентный - size: int Размер буфера - n_step: int - discount_factor: float - num_var: int (Кол-во сохраянемых переменных, по умполчанию 5 (s, a, r, d, s')) - eps: float - alpha: float - beta: float - beta_changing: float - beta_changing_curve: str - max_priority: float Максимальный приоритет при добавлении новых данных - trace_length: int. Длина возращаемой последовательности - """ - def __init__(self, buffer_args): - self.buffer = ReplayBuffer(**buffer_args) - self.path = os.getcwd() + '/test_replay_buffer/' - if not os.path.isdir(self.path): - os.mkdir(self.path) + def test_new_init_args(self, **buffer_args): + """Проверяет переинициализацию с новыми аргументами""" + self.buffer = ReplayBuffer(**buffer_args) - def __exit__(self): - """Удаляет созданную папку с файлами, если есть""" - if os.path.isdir(self.path): - rmtree(self.path) - - def test_new_init_args(self, **buffer_args): - """Проверяет переинициализацию с новыми аргументами""" - self.buffer = ReplayBuffer(**buffer_args) + def test_add_data(self): + """Проверяет возможность добавить в буфер данные + """ + pass - def test_add_data(self): - """Проверяет возможность добавить в буфер данные""" - pass + def test_samples(self): + """Сэмплирует батчи из буфера и проверяет размерности, + количество аргументов + """ + pass - def test_samples(self): - """Сэмплирует батчи из буфера и проверяет размерности, количество аргументов""" - pass + def test_save(self): + """Выполняет сохранение буфера и проверяет появился ли файл буфера + """ + self.buffer.save(self.path) + print("Буфер сохранен") + files = os.listdir(self.path) + file_names = [f.split('.')[0] for f in files if os.path.isfile( + os.path.join(self.path, f))] + print("Проверка сохранения буфера") + print(f"Найдено {len(file_names)} файлов: ", *file_names) + assert self.buffer.name in file_names, """Файл не найден, + проверка не пройдена""" + print('Успешно тест сохранения данных') - def test_save(self): - """Выполняет сохранение буфера и проверяет появился ли файл буфера""" - self.buffer.save(self.path) - print("Буфер сохранен") - files = os.listdir(self.path) - file_names = [f.split('.')[0] for f in files if os.path.isfile(os.path.join(self.path, f))] - print("Проверка сохранения буфера") - print(f"Найдено {len(file_names)} файлов: ", *file_names) - assert self.buffer.name in file_names, "Файл не найден, проверка не пройдена" - print('Успешно тест сохранения данных') - - def test_load(self): - """Выполняет test_save, потом загружает и проверяет соответствуют ли загруженные файлы сохраненным""" - self.buffer.save(self.path) - copy_buffer = deepcopy(self.buffer) + def test_load(self): + """Выполняет test_save, потом загружает и + проверяет соответствуют ли загруженные файлы сохраненным + """ + self.buffer.save(self.path) + copy_buffer = deepcopy(self.buffer) - self.buffer.load(self.path) - assert self.check_load_data(copy_buffer.buffer.__dict__, self.buffer.buffer.__dict__), "Файлы загрузки не соответствуют настоящим файлам" - print("Успешный тест зарузки данных") + self.buffer.load(self.path) + assert self.check_load_data( + 
copy_buffer.buffer.__dict__, self.buffer.buffer.__dict__), "Файлы загрузки не соответствуют настоящим файлам" + print("Успешный тест зарузки данных") - def check_load_data(self, real_data: dict, loaded_data: dict) -> bool: - for key, value in real_data.items(): - if key == 'tree': continue - if key == 'trace_window': - if not self.check_load_data(value.__dict__, loaded_data[key].__dict__): return False - continue - if key == 'data': - if loaded_data[key].all() != value.all(): return False - continue - if loaded_data[key] != value: return False - return True + def check_load_data(self, real_data: dict, loaded_data: dict) -> bool: + for key, value in real_data.items(): + if key == 'tree': + continue + if key == 'trace_window': + if not self.check_load_data(value.__dict__, + loaded_data[key].__dict__): + return False + continue + if key == 'data': + if loaded_data[key].all() != value.all(): + return False + continue + if loaded_data[key] != value: + return False + return True - def test_all_buffers(self, buffers: list): - for buffer_type in buffers: - self.test_new_init_args(buffer_type) - self.test_add_data() - self.test_samples() - self.test_save() - self.test_load() + def test_all_buffers(self, buffers: list): + for buffer_type in buffers: + self.test_new_init_args(buffer_type) + self.test_add_data() + self.test_samples() + self.test_save() + self.test_load() diff --git a/rl_lib/tests/dqn_config.yaml b/rl_lib/tests/dqn_config.yaml index b8f9a82..4907a7b 100644 --- a/rl_lib/tests/dqn_config.yaml +++ b/rl_lib/tests/dqn_config.yaml @@ -9,7 +9,7 @@ model_config: n_step: 1 batch_size: 32 double_network: False - priority: False + priority: True tau: 1.0 optimizer_config: diff --git a/rl_lib/tests/drqn_config.yaml b/rl_lib/tests/drqn_config.yaml index 9ff0efb..69e94dc 100644 --- a/rl_lib/tests/drqn_config.yaml +++ b/rl_lib/tests/drqn_config.yaml @@ -1,4 +1,4 @@ -#default DQN config +#default DRQN config model_config: model: None @@ -32,7 +32,7 @@ buffer_config: beta: 0.4 beta_changing: 0.0005 beta_changing_curve: 'linear' - max_priority: 0.01 + max_priority: 0.1 exploration_config: strategy_name: "epsilon_greedy" diff --git a/rl_lib/tests/first_test_ddpg.py b/rl_lib/tests/first_test_ddpg.py index c8fbae8..5700e3f 100644 --- a/rl_lib/tests/first_test_ddpg.py +++ b/rl_lib/tests/first_test_ddpg.py @@ -1,56 +1,75 @@ +import os.path as os_path +import time +import traceback +from pprint import pprint + import gym import numpy as np -import time -import os.path as os_path -from tensorflow.keras import layers import tensorflow as tf -from pprint import pprint -import traceback +from tensorflow.keras import layers -from rl_lib.src.algoritms.ddpg.ddpg import DDPG +from rl_lib.src.algoritms.model_free.continuous_control.ddpg import DDPG from rl_lib.src.data_saver.utils import load_default_config -env = gym.make('CarRacing-v2') +env = gym.make('Walker2d-v4') + +initializer = tf.keras.initializers.RandomUniform( + minval=-3*1e-4, maxval=3*1e-4, seed=40) -initializer = tf.keras.initializers.RandomUniform(minval=-3*1e-4, maxval=3*1e-4, seed=40) def create_conv(): input_layer = layers.Input(shape=env.observation_space.shape, ) - rescaling_layer = layers.experimental.preprocessing.Rescaling(1.0 / 127.5, offset=-1)(input_layer) - cov_layer1 = layers.Conv2D(32, 7, 4, activation='relu', kernel_initializer=initializer)(rescaling_layer) - cov_layer2 = layers.Conv2D(32, 5, 2,activation='relu', kernel_initializer=initializer)(cov_layer1) - cov_layer3 = layers.Conv2D(32, 3, 2,activation='relu', 
kernel_initializer=initializer)(cov_layer2) + rescaling_layer = layers.experimental.preprocessing.Rescaling( + 1.0 / 127.5, offset=-1)(input_layer) + cov_layer1 = layers.Conv2D( + 32, 7, 4, activation='relu', kernel_initializer=initializer)(rescaling_layer) + cov_layer2 = layers.Conv2D( + 32, 5, 2, activation='relu', kernel_initializer=initializer)(cov_layer1) + cov_layer3 = layers.Conv2D( + 32, 3, 2, activation='relu', kernel_initializer=initializer)(cov_layer2) conv_out = layers.Flatten()(cov_layer3) - return tf.keras.Model(inputs=input_layer, outputs=conv_out) + return tf.keras.Model(inputs=input_layer, outputs=conv_out) + def create_model(): """Создает модель tf.keras.Model, архитектура DQN""" input_layer = layers.Input(shape=env.observation_space.shape, ) - conv_out = create_conv()(input_layer) - dence_layer1 = layers.Dense(256, activation='relu', kernel_initializer=initializer)(conv_out) - dence_layer2 = layers.Dense(256, activation='relu', kernel_initializer=initializer)(dence_layer1) - dence_out = layers.Dense(env.action_space.shape[0], activation='tanh', kernel_initializer=initializer)(dence_layer2) + # conv_out = create_conv()(input_layer) + dence_layer1 = layers.Dense( + 256, activation='relu', kernel_initializer=initializer)(input_layer) + dence_layer2 = layers.Dense( + 256, activation='relu', kernel_initializer=initializer)(dence_layer1) + dence_out = layers.Dense( + env.action_space.shape[0], activation='tanh', kernel_initializer=initializer)(dence_layer2) + + dence_out = dence_out * \ + tf.reduce_max((tf.abs(env.action_space.low), env.action_space.high)) - dence_out = dence_out*tf.reduce_max((tf.abs(env.action_space.low), env.action_space.high)) - return tf.keras.Model(inputs=input_layer, outputs=dence_out) + def create_critic_model(): """Создает модель tf.keras.Model, архитектура DQN, начальные слои - сверточные""" input_layer = layers.Input(shape=env.observation_space.shape, ) - obsv_layer = layers.Dense(128, activation='relu', kernel_initializer=initializer)(input_layer) - obsv_layer = layers.Dense(64, activation='relu', kernel_initializer=initializer)(obsv_layer) + obsv_layer = layers.Dense(128, activation='relu', + kernel_initializer=initializer)(input_layer) + obsv_layer = layers.Dense(64, activation='relu', + kernel_initializer=initializer)(obsv_layer) input_action_layer = layers.Input(shape=env.action_space.shape, ) - action_layer = layers.Dense(32, activation='relu', kernel_initializer=initializer)(input_action_layer) - - conv_out = create_conv()(input_layer) - concat = layers.Concatenate()((conv_out, action_layer)) + action_layer = layers.Dense( + 32, activation='relu', kernel_initializer=initializer)(input_action_layer) + + # conv_out = create_conv()(input_layer) + concat = layers.Concatenate()((obsv_layer, action_layer)) flatten = layers.Flatten()(concat) - dence_layer1 = layers.Dense(256, activation='relu', kernel_initializer=initializer)(flatten) - dence_layer2 = layers.Dense(256, activation='relu', kernel_initializer=initializer)(dence_layer1) + dence_layer1 = layers.Dense( + 256, activation='relu', kernel_initializer=initializer)(flatten) + dence_layer2 = layers.Dense( + 256, activation='relu', kernel_initializer=initializer)(dence_layer1) dence_out = layers.Dense(1, activation=None)(dence_layer2) - - return tf.keras.Model(inputs=[input_layer, input_action_layer], outputs=dence_out) + + return tf.keras.Model(inputs=[input_layer, input_action_layer], outputs=dence_out) + config = load_default_config(__file__) @@ -67,6 +86,7 @@ def create_critic_model(): # 
algo.load() pprint(algo.config) + def run(algo): epidodes = 250 steps = 250 @@ -77,7 +97,7 @@ def run(algo): pre_train_steps = 1 copy_weigths_frequency = 1 - #history data + # history data rewards = [] episode_reward = 0 episode_test_reward = 0 @@ -90,7 +110,7 @@ def run(algo): observation, info = env.reset() episode_reward = 0 episode_loss = [] - for step in range(1, steps+1): + while True: # for step in range(1, steps+1): action = algo.get_action(observation) new_observation, reward, done, tr, info = env.step(action) algo.add((observation, action, reward, done, new_observation)) @@ -105,39 +125,40 @@ def run(algo): if done or tr: break - if episode % save_frequency == 0: algo.save() + if episode % save_frequency == 0: + algo.save() rewards.append(episode_reward) - #testing algoritm perfomans - if episode%test_frequency == 0: + # testing algoritm perfomans + if episode % test_frequency == 0: observation, info = env.reset() episode_test_reward = 0 - for test_step in range(1, test_steps+1): + while True: # for test_step in range(1, test_steps+1): action = algo.get_test_action(observation) observation, test_reward, done, tr, info = env.step(action) episode_test_reward += test_reward if done or tr: break - - #print info + # print info print(" Episode %d - Reward = %.3f, episode reward = %.3f, test reward %.3f, Loss = %.6f, Time = %.f sec, Total steps = %.f" % - ( - episode, - np.asarray(rewards[-10:]).mean() if len(rewards) != 0 else 0, - episode_reward, - episode_test_reward, - np.asarray(episode_loss).mean() if len(episode_loss) != 0 else 0, - time.time()-start_time, - count - ) - ) + ( + episode, + np.asarray(rewards[-10:]).mean() if len(rewards) != 0 else 0, + episode_reward, + episode_test_reward, + np.asarray(episode_loss).mean() if len( + episode_loss) != 0 else 0, + time.time()-start_time, + count + ) + ) # algo.load() + if __name__ == "__main__": try: run(algo=algo) - + except Exception: print(traceback.format_exc()) input("Press enter to exit: ") - diff --git a/rl_lib/tests/first_test_dqn.py b/rl_lib/tests/first_test_dqn.py index e8d70be..6779de9 100644 --- a/rl_lib/tests/first_test_dqn.py +++ b/rl_lib/tests/first_test_dqn.py @@ -1,27 +1,30 @@ +import os.path as os_path +import time +import traceback +from pprint import pprint + import gym import numpy as np -import time -import os.path as os_path -from tensorflow.keras import layers import tensorflow as tf -from pprint import pprint -import traceback +from tensorflow.keras import layers -from rl_lib.src.algoritms.dqn.dqn import DQN +from rl_lib.src.algoritms.model_free.value_based import DQN from rl_lib.src.data_saver.utils import load_default_config env = gym.make('CartPole-v0') + def create_model(): """Создает модель tf.keras.Model, архитектура DQN""" input_layer = layers.Input(shape=env.observation_space.shape, ) dence_layer1 = layers.Dense(32, activation='relu')(input_layer) dence_layer2 = layers.Dense(32, activation='relu')(dence_layer1) dence_out = layers.Dense(env.action_space.n, activation=None)(dence_layer2) - + return tf.keras.Model(inputs=input_layer, outputs=dence_out) -config = load_default_config(__file__) + +config = load_default_config("./rl_lib/tests/dqn_config.yaml") pprint(config) config['model_config']['model'] = create_model() config['model_config']['input_shape'] = env.observation_space.shape @@ -30,6 +33,7 @@ def create_model(): pprint(algo.config) + def run(algo): epidodes = 250 steps = 200 @@ -39,7 +43,7 @@ def run(algo): pre_train_steps = 2000 copy_weigths_frequency = 100 - #history data + # history data 
rewards = [] episode_reward = 0 episode_test_reward = 0 @@ -67,10 +71,10 @@ def run(algo): if done: break - algo.save() + algo.save() rewards.append(episode_reward) - #testing algoritm perfomans - if episode%test_frequency == 0: + # testing algoritm perfomans + if episode % test_frequency == 0: observation, info = env.reset() episode_test_reward = 0 for test_step in range(1, test_steps): @@ -79,27 +83,27 @@ def run(algo): episode_test_reward += test_reward if done: break - - #print info + # print info print(" Episode %d - Reward = %.3f, episode reward = %.3f, test reward %.3f, Loss = %.6f, Time = %.f sec, Total steps = %.f" % - ( - episode, - np.asarray(rewards[-10:]).mean() if len(rewards) != 0 else 0, - episode_reward, - episode_test_reward, - np.asarray(episode_loss).mean() if len(episode_loss) != 0 else 0, - time.time()-start_time, - count - ) - ) + ( + episode, + np.asarray(rewards[-10:]).mean() if len(rewards) != 0 else 0, + episode_reward, + episode_test_reward, + np.asarray(episode_loss).mean() if len( + episode_loss) != 0 else 0, + time.time()-start_time, + count + ) + ) algo.load() + if __name__ == "__main__": try: run(algo=algo) - + except Exception as e: print(traceback.format_exc()) input("Press enter to exit: ") - diff --git a/rl_lib/tests/first_test_drqn.py b/rl_lib/tests/first_test_drqn.py index 6f3a8b5..5920524 100644 --- a/rl_lib/tests/first_test_drqn.py +++ b/rl_lib/tests/first_test_drqn.py @@ -1,50 +1,55 @@ +import os.path as os_path +import time +import traceback +from pprint import pprint +import os +os.environ['XLA_FLAGS'] = '--xla_gpu_cuda_data_dir="C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v11.8"' import gym import numpy as np -import time -import os.path as os_path -from tensorflow.keras import layers import tensorflow as tf -from pprint import pprint -import traceback +from tensorflow.keras import layers -from rl_lib.src.algoritms.drqn.drqn import DRQN +from rl_lib.src.algoritms.model_free.value_based import DRQN from rl_lib.src.data_saver.utils import load_default_config env = gym.make('CartPole-v0') -def create_model(lstm_size = 32): + +def create_model(lstm_size=32): """Создает модель tf.keras.Model, архитектура DRQN""" - input_layer = layers.Input(shape= (None, *env.observation_space.shape), ) - h_t_input = layers.Input(shape=(lstm_size, ), ) - c_t_input = layers.Input(shape=(lstm_size, ), ) - + input_layer = layers.Input(shape=(None, *env.observation_space.shape), ) + h_t_input = layers.Input(shape=(lstm_size, ), ) + c_t_input = layers.Input(shape=(lstm_size, ), ) - lstm = layers.LSTM(lstm_size, activation='tanh', recurrent_activation='sigmoid', return_sequences = True, - return_state=True, stateful = False)(input_layer, initial_state = [h_t_input, c_t_input]) + lstm = layers.LSTM(lstm_size, activation='tanh', recurrent_activation='sigmoid', return_sequences=True, + return_state=True, stateful=False)(input_layer, initial_state=[h_t_input, c_t_input]) dence_layer1 = layers.Dense(32, activation='relu')(lstm[0]) dence_out = layers.Dense(env.action_space.n, activation=None)(dence_layer1) - + return tf.keras.Model(inputs=[input_layer, h_t_input, c_t_input], outputs=[dence_out, lstm[1], lstm[2]]) -config = load_default_config("..\\rl_lib\\rl_lib\\tests/") -config['model_config']['model'] = create_model(lstm_size=config['model_config']['lstm_size']) + +config = load_default_config("./rl_lib/tests/drqn_config.yaml") +config['model_config']['model'] = create_model( + lstm_size=config['model_config']['lstm_size']) config['model_config']['input_shape'] = 
env.observation_space.shape config['model_config']['action_space'] = env.action_space.n algo = DRQN(config) pprint(algo.config) + def run(algo): epidodes = 250 steps = 200 train_frequency = 1 test_frequency = 10 test_steps = 200 - pre_train_steps = 2000 + pre_train_steps = 500 copy_weigths_frequency = 100 - #history data + # history data rewards = [] episode_reward = 0 episode_test_reward = 0 @@ -73,10 +78,10 @@ def run(algo): if done: break - algo.save() + algo.save() rewards.append(episode_reward) - #testing algoritm perfomans - if episode%test_frequency == 0: + # testing algoritm perfomans + if episode % test_frequency == 0: observation, info = env.reset() algo.initial_state() episode_test_reward = 0 @@ -86,26 +91,26 @@ def run(algo): episode_test_reward += test_reward if done: break - - #print info + # print info print(" Episode %d - Reward = %.3f, episode reward = %.3f, test reward %.3f, Loss = %.6f, Time = %.f sec, Total steps = %.f" % - ( - episode, - np.asarray(rewards[-10:]).mean() if len(rewards) != 0 else 0, - episode_reward, - episode_test_reward, - np.asarray(episode_loss).mean() if len(episode_loss) != 0 else 0, - time.time()-start_time, - count - ) - ) + ( + episode, + np.asarray(rewards[-10:]).mean() if len(rewards) != 0 else 0, + episode_reward, + episode_test_reward, + np.asarray(episode_loss).mean() if len( + episode_loss) != 0 else 0, + time.time()-start_time, + count + ) + ) + if __name__ == "__main__": try: run(algo=algo) - + except Exception: print(traceback.format_exc()) input("Press enter to exit: ") - diff --git a/setup.py b/setup.py index fa6a509..6629af6 100644 --- a/setup.py +++ b/setup.py @@ -8,5 +8,5 @@ version=os.getenv('PACKAGE_VERSION', '0.1.dev0'), # package_dir={'rl_lib': ''}, packages=find_packages(), - description='A demo version of the reinforcement learning library.', + description='A dev version of the reinforcement learning library.', ) \ No newline at end of file From dc80b8857d7e0cafd5fb5f5485c005ec957499eb Mon Sep 17 00:00:00 2001 From: Ivan_Marshev Date: Mon, 30 Oct 2023 14:12:06 +0300 Subject: [PATCH 10/19] modified: README.md deleted: __init__.py modified: examples/ddpg/car_racing/ddpg_car_racing.py modified: examples/dqn/cart_pole/dqn_cart_pole.py modified: examples/drqn/cart_pole/config.yaml modified: examples/drqn/cart_pole/drqn_cart_pole.py new file: models/QR_DQN/QR_DQN_Model_default_QR_DQN/QR_DQN_Model_default_QR_DQN_action_default_QR_DQN.keras new file: models/QR_DQN/QR_DQN_Model_default_QR_DQN/QR_DQN_Model_default_QR_DQN_target_default_QR_DQN.keras new file: models/QR_DQN/QR_DQN_Model_default_QR_DQN/Random_Buffer.data new file: models/QR_DQN/QR_DQN_Model_default_QR_DQN/epsilon_greedy_strategy.data new file: rl_lib/__init__.py new file: rl_lib/src/algoritms/__init__.py new file: rl_lib/src/algoritms/model_free/continuous_control/__init__.py deleted: rl_lib/src/algoritms/model_free/continuous_control/ddpg/__init__.py modified: rl_lib/src/algoritms/model_free/continuous_control/ddpg/ddpg.py modified: rl_lib/src/algoritms/model_free/value_based/__init__.py modified: rl_lib/src/algoritms/model_free/value_based/base_algo.py modified: rl_lib/src/algoritms/model_free/value_based/drqn/config.yaml modified: rl_lib/src/algoritms/model_free/value_based/drqn/drqn.py new file: rl_lib/src/algoritms/model_free/value_based/qr_dqn/config.yaml new file: rl_lib/src/algoritms/model_free/value_based/qr_dqn/qr_dqn.py modified: rl_lib/src/algoritms/model_free/value_based/simple_q.py modified: rl_lib/src/models/model.py modified: 
rl_lib/src/replay_buffers/replay_buffer.py new file: rl_lib/src/runners/base_runner.py modified: rl_lib/tests/first_test_ddpg.py new file: rl_lib/tests/first_test_qr_dqn.py new file: rl_lib/tests/qr_dqn_config.yaml new file: rl_lib/tests/second_test_dqn_w_runner.py --- README.md | 43 ++-- __init__.py | 0 examples/ddpg/car_racing/ddpg_car_racing.py | 2 +- examples/dqn/cart_pole/dqn_cart_pole.py | 2 +- examples/drqn/cart_pole/config.yaml | 2 +- examples/drqn/cart_pole/drqn_cart_pole.py | 4 +- ...default_QR_DQN_action_default_QR_DQN.keras | Bin 0 -> 38736 bytes ...default_QR_DQN_target_default_QR_DQN.keras | Bin 0 -> 23944 bytes .../Random_Buffer.data | Bin 0 -> 637188 bytes .../epsilon_greedy_strategy.data | Bin 0 -> 216 bytes rl_lib/__init__.py | 3 + rl_lib/src/algoritms/__init__.py | 2 + .../model_free/continuous_control/__init__.py | 1 + .../continuous_control/ddpg/__init__.py | 1 - .../continuous_control/ddpg/ddpg.py | 2 +- .../model_free/value_based/__init__.py | 1 + .../model_free/value_based/base_algo.py | 8 + .../model_free/value_based/drqn/config.yaml | 2 +- .../model_free/value_based/drqn/drqn.py | 4 +- .../model_free/value_based/qr_dqn/config.yaml | 49 ++++ .../model_free/value_based/qr_dqn/qr_dqn.py | 123 ++++++++++ .../model_free/value_based/simple_q.py | 27 +-- rl_lib/src/models/model.py | 1 - rl_lib/src/replay_buffers/replay_buffer.py | 17 +- rl_lib/src/runners/base_runner.py | 217 ++++++++++++++++++ rl_lib/tests/first_test_ddpg.py | 2 +- rl_lib/tests/first_test_qr_dqn.py | 111 +++++++++ rl_lib/tests/qr_dqn_config.yaml | 49 ++++ rl_lib/tests/second_test_dqn_w_runner.py | 45 ++++ 29 files changed, 672 insertions(+), 46 deletions(-) delete mode 100644 __init__.py create mode 100644 models/QR_DQN/QR_DQN_Model_default_QR_DQN/QR_DQN_Model_default_QR_DQN_action_default_QR_DQN.keras create mode 100644 models/QR_DQN/QR_DQN_Model_default_QR_DQN/QR_DQN_Model_default_QR_DQN_target_default_QR_DQN.keras create mode 100644 models/QR_DQN/QR_DQN_Model_default_QR_DQN/Random_Buffer.data create mode 100644 models/QR_DQN/QR_DQN_Model_default_QR_DQN/epsilon_greedy_strategy.data create mode 100644 rl_lib/__init__.py create mode 100644 rl_lib/src/algoritms/__init__.py create mode 100644 rl_lib/src/algoritms/model_free/continuous_control/__init__.py delete mode 100644 rl_lib/src/algoritms/model_free/continuous_control/ddpg/__init__.py create mode 100644 rl_lib/src/algoritms/model_free/value_based/qr_dqn/config.yaml create mode 100644 rl_lib/src/algoritms/model_free/value_based/qr_dqn/qr_dqn.py create mode 100644 rl_lib/src/runners/base_runner.py create mode 100644 rl_lib/tests/first_test_qr_dqn.py create mode 100644 rl_lib/tests/qr_dqn_config.yaml create mode 100644 rl_lib/tests/second_test_dqn_w_runner.py diff --git a/README.md b/README.md index 50af15b..cb719e4 100644 --- a/README.md +++ b/README.md @@ -28,12 +28,13 @@ RL_Lib - это мощный и гибкий инструмент для обу
  • DQN и его модификации
  • DRQN
  • DDPG
+  • QR_DQN
  • ## Базовое использование #### Создание алгоритма по умолчанию (конфиг можно посмотреть в папке алгоритма): ``` -from rl_lib.src.algoritms.dqn.dqn import DQN +from rl_lib import DQN config = {'model_config':{}} config['model_config']['input_shape'] = env.observation_space.shape @@ -42,25 +43,40 @@ config['model_config']['action_space'] = env.action_space.n algo = DQN(config) ``` -#### Создание алгоритма пользовательского алгоритма: +#### Загрузка пользовательской конфигурации алгоритма: ``` -from rl_lib.src.algoritms.dqn.dqn import DQN -from yaml import safe_load +from rl_lib import DQN +from rl_lib import load_default_config -path = #путь к файлу конфигурации +path = #путь к файлу конфигурации, должен оканчиваться на .yaml -config = safe_load( - open( - os_path.join( - os_path.dirname(path),"./config.yaml" - ), - "rb") - ) +config = load_default_config(path) config['model_config']['input_shape'] = env.observation_space.shape config['model_config']['action_space'] = env.action_space.n algo = DQN(config) ``` + +#### Верхнеуровневое API для обучения алгоритма: +``` +from rl_lib import DQN +from rl_lib import load_default_config +from rl_lib import Base_Env_Runner + +path = #путь к файлу конфигурации, должен оканчиваться на .yaml + +config = load_default_config(path) +config['model_config']['input_shape'] = env.observation_space.shape +config['model_config']['action_space'] = env.action_space.n +algo = DQN(config) + +runner = Base_Env_Runner(env=env, + algo=algo, + ...) + +runner.run() +``` + ## Основные методы алгоритма #### Сохранение и загрузка сохраненного алгоритма: ``` @@ -110,7 +126,6 @@ algo.initial_state()
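
For reference, a minimal sketch of using the ReplayBuffer factory from this patch series directly, assuming the rl_lib package is importable. It relies only on what the diffs above show: the constructor kwargs listed in the ReplayBuffer docstring, the (state, action, reward, done, next_state) tuple order used by Random_Buffer, and the dict returned by sample(). Concrete values (observation shape, sizes, priority hyperparameters) are illustrative only, and no methods beyond those visible in the patched files are assumed.

```
import numpy as np

from rl_lib.src.replay_buffers.replay_buffer import ReplayBuffer

# priority=False / recurrent=False selects the plain Random_Buffer backend.
buffer = ReplayBuffer(priority=False,
                      recurrent=False,
                      size=1000,
                      n_step=1,
                      discount_factor=0.99)

obs_shape = (4,)                      # hypothetical observation shape
state = np.zeros(obs_shape, dtype=np.float32)

# Transitions are added as (s, a, r, done, s') tuples -- the order expected
# by Random_Buffer.add / _n_step_buffer.add.
for step in range(100):
    next_state = np.random.randn(*obs_shape).astype(np.float32)
    buffer.add((state, np.random.randint(2), float(step), False, next_state))
    state = next_state

print(buffer.name, buffer.real_size)  # "Random_Buffer", 100

# sample() returns a dict with 'state', 'action', 'reward', 'done',
# 'next_state'; the prioritized/recurrent variants return extra keys
# such as 'weights', 'data_idxs', 'h_t', 'c_t' (see the classes above).
batch = buffer.sample(32)
print(sorted(batch.keys()))

# A prioritized buffer is selected the same way; kwargs follow the
# ReplayBuffer docstring (values here are illustrative only):
# prio = ReplayBuffer(priority=True, recurrent=False, size=1000, n_step=1,
#                     eps=0.01, alpha=0.5, beta=0.4, beta_changing=0.0005,
#                     beta_changing_curve='linear', max_priority=0.1)
```

ReplayBuffer itself only dispatches to one of the four backend classes based on the priority/recurrent flags, so the same add/sample/save/load surface applies to all of them.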