diff --git a/.gitignore b/.gitignore
index b4b3b13..45b9404 100644
--- a/.gitignore
+++ b/.gitignore
@@ -8,3 +8,8 @@ rl_lib/tests/models/
#jupiter notebooks
*.ipynb
+
+
+.vscode
+requirements.txt
+dist
\ No newline at end of file
diff --git a/README.md b/README.md
index 50af15b..cb719e4 100644
--- a/README.md
+++ b/README.md
@@ -28,12 +28,13 @@ RL_Lib - это мощный и гибкий инструмент для обу
DQN и его модификации
DRQN
DDPG
+ QR_DQN
## Базовое использование
#### Создание алгоритма по умолчанию (конфиг можно посмотреть в папке алгоритма):
```
-from rl_lib.src.algoritms.dqn.dqn import DQN
+from rl_lib import DQN
config = {'model_config':{}}
config['model_config']['input_shape'] = env.observation_space.shape
@@ -42,25 +43,40 @@ config['model_config']['action_space'] = env.action_space.n
algo = DQN(config)
```
-#### Создание алгоритма пользовательского алгоритма:
+#### Загрузка пользовательской конфигурации алгоритма:
```
-from rl_lib.src.algoritms.dqn.dqn import DQN
-from yaml import safe_load
+from rl_lib import DQN
+from rl_lib import load_default_config
-path = #путь к файлу конфигурации
+path = #путь к файлу конфигурации, должен оканчиваться на .yaml
-config = safe_load(
- open(
- os_path.join(
- os_path.dirname(path),"./config.yaml"
- ),
- "rb")
- )
+config = load_default_config(path)
config['model_config']['input_shape'] = env.observation_space.shape
config['model_config']['action_space'] = env.action_space.n
algo = DQN(config)
```
+
+#### Высокоуровневый API для обучения алгоритма:
+```
+from rl_lib import DQN
+from rl_lib import load_default_config
+from rl_lib import Base_Env_Runner
+
+path = #путь к файлу конфигурации, должен оканчиваться на .yaml
+
+config = load_default_config(path)
+config['model_config']['input_shape'] = env.observation_space.shape
+config['model_config']['action_space'] = env.action_space.n
+algo = DQN(config)
+
+runner = Base_Env_Runner(env=env,
+ algo=algo,
+ ...)
+
+runner.run()
+```
+
## Основные методы алгоритма
#### Сохранение и загрузка сохраненного алгоритма:
```
@@ -110,7 +126,6 @@ algo.initial_state()
- Реализация алгоритмов:
- - QR-DQN
- IQN
- A2C
- TD3
@@ -118,6 +133,6 @@ algo.initial_state()
- RD2D
- Bandits
- - Добавление LaziFrames в буферы сохранения
+ - Добавление LazyFrames в буферы сохранения
- Написание обертки шагов обучения в среде
- Реализация записи статистики обучения
\ No newline at end of file
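
The high-level API section above introduces `Base_Env_Runner` without showing what `runner.run()` replaces. Below is a minimal sketch of the kind of per-step loop it is assumed to wrap, pieced together from the example scripts and algorithm methods elsewhere in this diff (`get_action`, `add`, `train_step`, `copy_weights`); `env`, `steps` and `algo` are assumed to be defined as in the README snippets, and how often the runner actually calls `train_step`/`copy_weights` may differ.
```
observation, info = env.reset()
for step in range(1, steps + 1):
    action = algo.get_action(observation)                    # action with exploration
    new_observation, reward, done, _, info = env.step(action)
    algo.add((observation, action, reward, done, new_observation))  # store transition
    algo.train_step()                                         # one gradient update
    algo.copy_weights()                                       # target-network update (typically on a schedule)
    observation = new_observation
    if done:
        observation, info = env.reset()
```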
diff --git a/examples/ddpg/car_racing/config.yaml b/examples/ddpg/car_racing/config.yaml
index e7cd5c6..ce89ea7 100644
--- a/examples/ddpg/car_racing/config.yaml
+++ b/examples/ddpg/car_racing/config.yaml
@@ -6,7 +6,7 @@ model_config:
action_space: None
discount_factor : 0.99
n_step: 1
- batch_size: 32
+ batch_size: 16
double_network: False
priority: False
@@ -14,18 +14,18 @@ model_config:
actor_model_config:
model_config:
model: None
- tau: 0.01
+ tau: 0.001
critic_model_config:
model_config:
model: None
- tau: 0.01
+ tau: 0.001
actor_optimizer_config:
optimizer_config:
optimizer_name: "adam"
optimizer_params:
- learning_rate: 0.001
+ learning_rate: 0.0001
epsilon: 0.001
clipnorm: 1.0
custom_optimizer: None
@@ -34,7 +34,7 @@ critic_optimizer_config:
optimizer_config:
optimizer_name: "adam"
optimizer_params:
- learning_rate: 0.002
+ learning_rate: 0.001
epsilon: 0.001
clipnorm: 1.0
custom_optimizer: None
@@ -54,9 +54,14 @@ buffer_config:
exploration_config:
strategy_name: "ou_noise"
strategy_config:
- alpha: 0.5
- sigma: 1.0
+ alpha: 0.0
+ sigma: 0.2
action_space: None
+ upper_bound: None
+ lower_bound: None
+ dt: 0.01
+ mean: None
+ theta: 0.15
data_saver:
path: ""
diff --git a/examples/ddpg/car_racing/ddpg_car_racing.py b/examples/ddpg/car_racing/ddpg_car_racing.py
index ca5ac1a..4d0ea52 100644
--- a/examples/ddpg/car_racing/ddpg_car_racing.py
+++ b/examples/ddpg/car_racing/ddpg_car_racing.py
@@ -7,7 +7,7 @@
from pprint import pprint
import traceback
-from rl_lib.src.algoritms.ddpg.ddpg import DDPG
+from rl_lib import DDPG
from rl_lib.src.data_saver.utils import load_default_config
env = gym.make('CarRacing-v2')
@@ -42,7 +42,7 @@ def create_critic_model():
flatten = layers.Flatten()(concat)
dence_layer1 = layers.Dense(256, activation='relu')(flatten)
dence_layer2 = layers.Dense(256, activation='relu')(dence_layer1)
- dence_out = layers.Dense(env.action_space.shape[0], activation=None)(dence_layer2)
+ dence_out = layers.Dense(1, activation=None)(dence_layer2)
return tf.keras.Model(inputs=[input_layer, input_action_layer], outputs=dence_out)
@@ -81,6 +81,7 @@ def run(algo):
observation, info = env.reset()
episode_reward = 0
+ episode_loss = []
for step in range(1, steps+1):
action = algo.get_action(observation)
new_observation, reward, done, _, info = env.step(action)
diff --git a/examples/dqn/cart_pole/dqn_cart_pole.py b/examples/dqn/cart_pole/dqn_cart_pole.py
index 872f1e4..a3a560f 100644
--- a/examples/dqn/cart_pole/dqn_cart_pole.py
+++ b/examples/dqn/cart_pole/dqn_cart_pole.py
@@ -6,7 +6,7 @@
import tensorflow as tf
from pprint import pprint
-from rl_lib.src.algoritms.dqn.dqn import DQN
+from rl_lib import DQN
from rl_lib.src.data_saver.utils import load_default_config
env = gym.make('CartPole-v0')
@@ -49,6 +49,7 @@ def run(algo):
observation, info = env.reset()
episode_reward = 0
+ episode_loss = []
for step in range(1, steps):
action = algo.get_action(observation)
new_observation, reward, done, _, info = env.step(action)
diff --git a/examples/drqn/cart_pole/config.yaml b/examples/drqn/cart_pole/config.yaml
index 8c22e4a..d8da9bc 100644
--- a/examples/drqn/cart_pole/config.yaml
+++ b/examples/drqn/cart_pole/config.yaml
@@ -1,4 +1,4 @@
-#default DQN config
+#default DRQN config
model_config:
model: None
diff --git a/examples/drqn/cart_pole/drqn_cart_pole.py b/examples/drqn/cart_pole/drqn_cart_pole.py
index e3cce0c..08a445d 100644
--- a/examples/drqn/cart_pole/drqn_cart_pole.py
+++ b/examples/drqn/cart_pole/drqn_cart_pole.py
@@ -6,7 +6,7 @@
import tensorflow as tf
from pprint import pprint
-from rl_lib.src.algoritms.drqn.drqn import DRQN
+from rl_lib import DRQN
from rl_lib.src.data_saver.utils import load_default_config
env = gym.make('CartPole-v0')
@@ -26,7 +26,7 @@ def create_model(lstm_size = 32):
return tf.keras.Model(inputs=[input_layer, h_t_input, c_t_input], outputs=[dence_out, lstm[1], lstm[2]])
-config = load_default_config("..\\rl_lib\\rl_lib\\examples\\drqn\\cart_pole/")
+config = load_default_config(__file__)
config['model_config']['model'] = create_model(lstm_size=config['model_config']['lstm_size'])
config['model_config']['input_shape'] = env.observation_space.shape
config['model_config']['action_space'] = env.action_space.n
@@ -56,6 +56,7 @@ def run(algo):
observation, info = env.reset()
algo.initial_state()
episode_reward = 0
+ episode_loss = []
for step in range(1, steps):
action = algo.get_action(observation)
new_observation, reward, done, _, info = env.step(action)
diff --git a/rl_lib/__init__.py b/rl_lib/__init__.py
new file mode 100644
index 0000000..194b658
--- /dev/null
+++ b/rl_lib/__init__.py
@@ -0,0 +1,3 @@
+from rl_lib.src.algoritms import DDPG, DQN, DRQN, QR_DQN
+from rl_lib.src.data_saver.utils import load_default_config
+from rl_lib.src.runners.base_runner import Base_Env_Runner
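
`load_default_config` is now re-exported at the package top level. A rough sketch of what it is assumed to do, reconstructed from the YAML-loading snippet this diff removes from the README; the real implementation in `rl_lib/src/data_saver/utils.py` may differ in details (e.g. in how a path ending in `.yaml` is handled).
```
import os.path as os_path
from yaml import safe_load

def load_default_config_sketch(path: str) -> dict:
    # Load the config.yaml that sits next to the given file
    # (the example scripts in this diff pass __file__).
    with open(os_path.join(os_path.dirname(path), "config.yaml"), "rb") as f:
        return safe_load(f)
```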
diff --git a/rl_lib/src/algoritms/__init__.py b/rl_lib/src/algoritms/__init__.py
index 8b13789..9653b8f 100644
--- a/rl_lib/src/algoritms/__init__.py
+++ b/rl_lib/src/algoritms/__init__.py
@@ -1 +1,2 @@
-
+from .model_free.continuous_control import DDPG
+from .model_free.value_based import DQN, DRQN, QR_DQN
diff --git a/rl_lib/src/algoritms/a2c/actor_critic.py b/rl_lib/src/algoritms/a2c/actor_critic.py
deleted file mode 100644
index a24669e..0000000
--- a/rl_lib/src/algoritms/a2c/actor_critic.py
+++ /dev/null
@@ -1,146 +0,0 @@
-import tensorflow as tf
-from tensorflow.keras import layers
-import abc
-
-from rl_lib.src.algoritms.dqn.dqn import DQN_Model
-
-class Actor_Model(DQN_Model):
- def __init__(self, config = {},**kwargs):
- config['model_config'] = config['actor_model_config']['model_config']
- config['optimizer_config'] = config['actor_optimizer_config']['optimizer_config']
- super().__init__(config = config, **kwargs)
- self.name = kwargs.get('name', 'error_name') + '_actor_'
-
- def _prediction_processing(self, inputs: tf.Tensor, **kwargs):
- return kwargs['critic_model']([kwargs['state'], inputs])
-
- def loss(self, target: tf.Tensor, predict: tf.Tensor) -> tf.Tensor:
- """Вычисляет и возвращает потери в соответствии с функцией потерь"""
- return tf.reduce_mean(predict, axis = 0) * (-1)
-
-class Critic_Model(DQN_Model):
- def __init__(self, config = {},**kwargs):
- config['model_config'] = config['critic_model_config']['model_config']
- config['optimizer_config'] = config['critic_optimizer_config']['optimizer_config']
- super().__init__(config = config, **kwargs)
- self.name = kwargs.get('name', 'error_name') + '_critic_'
-
- def _prediction_processing(self, inputs: tf.Tensor, **kwargs):
- return inputs
-
- @tf.function(reduce_retracing=True,
- jit_compile=False,
- experimental_autograph_options = tf.autograph.experimental.Feature.ALL)
- def calculate_gradients(self, **kwargs) -> dict:
- """
- Вычисляет градиенты, лосс, td-ошибку
-
- Kwargs:
- dict содержащий батч, таргет, маску, опционально приоритетные веса
-
- Returns:
- dict содержащий лоссы и td-ошибку
- """
- with tf.GradientTape(persistent=False) as tape:
- Q = self.model([kwargs['state'], kwargs['action']], training=True)
- Q = self.prediction_processing(Q, **kwargs)
- if len(Q.shape) != len(kwargs['Qtarget'].shape): Q = tf.expand_dims(Q, -1)
-
- td_error = kwargs['Qtarget'] - Q
- loss = self.loss(kwargs['Qtarget'], Q)*kwargs.get('weights', 1.0)
- gradients = tape.gradient(loss, self.model.trainable_variables)
- loss = tf.reduce_mean(loss, axis=-1)
- return {'gradients': gradients, 'loss': loss, 'td_error': td_error}
-
- @staticmethod
- def create_model(input_shape: tuple, action_space: int) -> tf.keras.Model:
- """Создает модель tf.keras.Model, архитектура DQN"""
- input_layer = layers.Input(shape=input_shape, )
- action_layer = layers.Input(shape=action_space, )
- concat = layers.Concatenate()((input_layer, action_layer))
- flatten = layers.Flatten()(concat)
- dence_layer1 = layers.Dense(256, activation='relu')(flatten)
- dence_layer2 = layers.Dense(256, activation='relu')(dence_layer1)
- dence_out = layers.Dense(action_space, activation=None)(dence_layer2)
-
- return tf.keras.Model(inputs=[input_layer, action_layer], outputs=dence_out)
-
- @staticmethod
- def create_model_with_conv(input_shape: tuple, action_space: int) -> tf.keras.Model:
- """Создает модель tf.keras.Model, архитектура DQN, начальные слои - сверточные"""
- input_layer = layers.Input(shape=input_shape, )
- action_layer = layers.Input(shape=action_space, )
- cov_layer1 = layers.Conv2D(32, 7, activation='relu')(input_layer)
- cov_layer2 = layers.Conv2D(64, 5, activation='relu')(cov_layer1)
- cov_layer3 = layers.Conv2D(64, 3, activation='relu')(cov_layer2)
- conv_out = layers.Flatten()(cov_layer3)
-
- concat = layers.Concatenate()((conv_out, action_layer))
- flatten = layers.Flatten()(concat)
- dence_layer1 = layers.Dense(256, activation='relu')(flatten)
- dence_layer2 = layers.Dense(256, activation='relu')(dence_layer1)
- dence_out = layers.Dense(action_space, activation=None)(dence_layer2)
-
- return tf.keras.Model(inputs=[input_layer, action_layer], outputs=dence_out)
-
-class Actor_Critic_Model(DQN_Model):
- def __init__(self, config = {},**kwargs):
- config['actor_model_config']['model_config']['name'] = config['model_config']['name']
- config['actor_model_config']['model_config']['input_shape'] = config['model_config']['input_shape']
- config['actor_model_config']['model_config']['action_space'] = config['model_config']['action_space']
-
- config['critic_model_config']['model_config']['name'] = config['model_config']['name']
- config['critic_model_config']['model_config']['input_shape'] = config['model_config']['input_shape']
- config['critic_model_config']['model_config']['action_space'] = config['model_config']['action_space']
- self.actor_model = Actor_Model(config=config, **kwargs)
- self.critic_model = Critic_Model(config=config, **kwargs)
-
- def __call__(self, input: tf.Tensor) -> tf.Tensor:
- return self.critic_model([input, self.actor_model(input)])
-
- def update_weights(self, **kwargs):
- _ = self.update_weights_actor(**kwargs)
- return self.update_weights_critic(**kwargs)
-
- def update_weights_actor(self, **kwargs):
- kwargs['critic_model'] = self.critic_model
- loss = self.actor_model.update_weights(**kwargs)
- return {'loss': loss['loss'], 'td_error': loss['td_error']}
-
- def update_weights_critic(self, **kwargs) -> dict:
- loss = self.critic_model.update_weights(**kwargs)
- return {'loss': loss['loss'], 'td_error': loss['td_error']}
-
- def calculate_gradients(self, **kwargs) -> dict:
- kwargs['action'] = self.actor_model(kwargs['next_state'])
- gradients = self.critic_model.calculate_gradients(**kwargs)
- return gradients
-
- def get_weights(self, ) -> dict:
- return {
- 'actor': self.actor_model.get_weights(),
- 'critic': self.critic_model.get_weights()
- }
-
- def input_spec(self):
- return self.actor_model.input_spec()
-
- def load(self, path):
- self.actor_model.load(path)
- self.critic_model.load(path)
-
- def save(self, path):
- self.actor_model.save(path)
- self.critic_model.save(path)
-
- def set_weights(self, weights: dict) -> None:
- self.actor_model.set_weights(weights=weights['actor'])
- self.critic_model.set_weights(weights=weights['critic'])
-
- @property
- def summary(self):
- self.actor_model.summary
- self.critic_model.summary
-
-
-
\ No newline at end of file
diff --git a/rl_lib/src/algoritms/base_algo.py b/rl_lib/src/algoritms/base_algo.py
deleted file mode 100644
index e677388..0000000
--- a/rl_lib/src/algoritms/base_algo.py
+++ /dev/null
@@ -1,108 +0,0 @@
-import tensorflow as tf
-import abc
-from typing import Union
-from copy import copy
-
-from ..data_saver.utils import load_default_config
-from .utils import update_config
-from rl_lib.src.data_saver.saver import Saver
-
-class Base_Algo(Saver, abc.ABC):
- """Базовый абстрактный класс алгоритма.
- Хранит все методы, необходимые для вычислений в каком либо алгоритме.
- """
- def __init__(self, action_model: object, target_model: object, config: dict, default_config_path: str, *args, **kwargs):
- self._config = load_default_config(default_config_path)
- update_config(self._config, config)
-
- self.action_model = action_model(config = copy(self._config), algo_name = kwargs.get("algo_name", "unkown"), name = kwargs.get("name", "unkown_name") + "_action_" + config.get("model_config", {}).get("name", ""))
- self.target_model = target_model(config = copy(self._config), algo_name = kwargs.get("algo_name", "unkown"), name = kwargs.get("name", "unkown_name") + "_target_" + config.get("model_config", {}).get("name", ""))
- super().__init__(**self.config.get('data_saver', {}), **kwargs)
- self.target_model.set_weights(self.action_model.get_weights())
-
-
- @property
- def config(self):
- return self._config
-
- @abc.abstractclassmethod
- def calculate_new_best_action(self) -> tf.Tensor:
- """Вычислеят новое лучшее действие для получения таргета"""
-
- @abc.abstractclassmethod
- def calculate_target(self) -> dict:
- """Вычисляет таргет для обучения"""
-
- @abc.abstractclassmethod
- def get_action(self, observation) -> float:
- """Возвращает действие на основе наблюдения с учетом исследования"""
-
- @abc.abstractclassmethod
- def get_test_action(self, observation) -> float:
- """Возвращает действие на основе наблюдения без исследования"""
-
- @abc.abstractclassmethod
- def get_gradients(self) -> tf.Tensor:
- """Вычисляет градиенты и возвращает их"""
-
- @abc.abstractclassmethod
- def load(self, path) -> None:
- """Загружает алгоритм"""
-
- @abc.abstractclassmethod
- def reset(self) -> None:
- """Сбрасывает внутренние данные модели"""
-
- @abc.abstractclassmethod
- def _train_step(self) -> dict:
- """Вспомогательная train_step"""
-
- @abc.abstractclassmethod
- def train_step(self) -> dict:
- """Вычисляет полный обучающий шаг"""
-
- @abc.abstractclassmethod
- def save(self, path) -> None:
- """Сохраняет алгоритм"""
-
- @abc.abstractclassmethod
- def summary(self) -> None:
- """Выводит архитектуру модели"""
-
- @tf.function(reduce_retracing=None, jit_compile=None, experimental_autograph_options=None)
- def _copy_weights(self, action_model_weights: list, target_model_weights: list, tau: float) -> tf.constant:
- """Копирует веса из модели действия в целевую модель"""
- for a_w, t_w in zip(action_model_weights, target_model_weights):
- new_weights = tf.add(tf.multiply(tau, a_w), tf.multiply((1-tau), t_w))
- t_w.assign(tf.identity(new_weights))
- return tf.constant(1)
-
- def copy_weights(self) -> tf.constant:
- """Копирует веса из модели действия в целевую модель"""
- res = self._copy_weights(self.action_model.weights, self.target_model.weights, self.tau)
- return res
-
- @tf.function(reduce_retracing=True,
- jit_compile=True,
- experimental_autograph_options = tf.autograph.experimental.Feature.ALL)
- def sample_action(self, state: Union[tf.Tensor, tuple]) -> Union[tf.Tensor, list]:
- """Возвращает предсказания модели на основе текущих наблюдений"""
- predict = self.action_model(state)
- if isinstance(predict, list):
- return self.squeeze_predict(predict[0]), predict[1], predict[2]
- return self.squeeze_predict(predict)
-
- @tf.function(reduce_retracing=None, jit_compile=None, experimental_autograph_options=None)
- def set_weights(self, target_weights: list) -> tf.constant:
- """Устанавливает переданные как аргумент веса в основную сеть"""
- for a_w, t_w in zip(self.action_model.weights, target_weights):
- a_w.assign(tf.identity(t_w))
- return tf.constant(1)
-
- @staticmethod
- def squeeze_predict(predict) -> tf.Tensor:
- """Удаляет единичные измерения из предсказаний"""
- while len(predict.shape)>=1 and predict.shape[0] == 1:
- predict = tf.squeeze(predict, axis=0)
- return predict
-
diff --git a/rl_lib/src/algoritms/ddpg/ddpg.py b/rl_lib/src/algoritms/ddpg/ddpg.py
deleted file mode 100644
index b21701f..0000000
--- a/rl_lib/src/algoritms/ddpg/ddpg.py
+++ /dev/null
@@ -1,63 +0,0 @@
-from typing import Any
-import tensorflow as tf
-from tensorflow.keras import layers
-
-from rl_lib.src.models.model import Model
-from rl_lib.src.algoritms.simple_q import SimpleQ
-from rl_lib.src.algoritms.a2c.actor_critic import Actor_Critic_Model
-
-
-
-class DDPG_Model(Actor_Critic_Model):
- def __init__(self, config = {},**kwargs):
- super().__init__(config=config, **kwargs)
-
-class DDPG(SimpleQ):
- def __init__(self, config):
- self.actor_tau = config['actor_model_config']['model_config']['tau']
- self.critic_tau = config['critic_model_config']['model_config']['tau']
- super().__init__(DDPG_Model, DDPG_Model, config, default_config_path=__file__, algo_name = "DDPG_Model", name = "DDPG_Model_" + config.get('model_config','').get('name',''))
-
- def _prediction_processing(self, input_data):
- pass
-
- def _update_next_state(self, state, action):
- pass
-
- def initial_state(self):
- pass
-
- def get_batch(self, ):
- batch = super().get_batch()
- batch['reward'] = tf.reshape(batch['reward'], (self.batch_size, 1))
- batch['done'] = tf.reshape(batch['done'], (self.batch_size, 1))
- return batch
-
- def get_best_action(self, Qaction, Qtarget):
- return Qtarget
-
- def _train_step(self, **batch) -> dict:
- """Вспомогательная train_step"""
- batch = self.choice_model_for_double_calculates(**batch)
- batch['batch_dims'] = self.batch_dims
- if self.priority: batch['weights'] = tf.expand_dims(batch['weights'], -1)
- if batch['p_double'] > 0.5:
- self.action_model.update_weights_actor(**batch)
- return self.action_model.update_weights_critic(**batch)
- else:
- self.target_model.update_weights_actor(**batch)
- return self.target_model.update_weights_critic(**batch)
-
- def copy_weights(self) -> tf.constant:
- """Копирует веса из модели действия в целевую модель"""
- _ = self._copy_weights(self.action_model.actor_model.weights, self.target_model.actor_model.weights, self.actor_tau)
- _ = self._copy_weights(self.action_model.critic_model.weights, self.target_model.critic_model.weights, self.critic_tau)
- return tf.constant(1)
-
- @tf.function(reduce_retracing=True,
- jit_compile=True,
- experimental_autograph_options = tf.autograph.experimental.Feature.ALL)
- def sample_action(self, state: tf.Tensor) -> tf.Tensor:
- """Возвращает предсказания модели на основе текущих наблюдений"""
- predict = self.action_model.actor_model(state)
- return self.squeeze_predict(predict)
\ No newline at end of file
diff --git a/rl_lib/src/algoritms/dqn/dqn.py b/rl_lib/src/algoritms/dqn/dqn.py
deleted file mode 100644
index c42f4fa..0000000
--- a/rl_lib/src/algoritms/dqn/dqn.py
+++ /dev/null
@@ -1,68 +0,0 @@
-import tensorflow as tf
-from tensorflow.keras import layers
-
-from rl_lib.src.models.model import Model
-from rl_lib.src.algoritms.simple_q import SimpleQ
-
-class DQN_Model(Model):
- def __init__(self, config = {},**kwargs):
- super().__init__(model_config = config.get('model_config', {}), config = config, **kwargs)
-
- def _prediction_processing(self, inputs: tf.Tensor, **kwargs):
- mask = self.make_mask(tf.cast(kwargs['action'], dtype = tf.int32))
- if len(inputs.shape) != len(mask.shape): mask = tf.expand_dims(mask, -1)
- return tf.reduce_sum(tf.multiply(inputs, mask), axis=kwargs['batch_dims'])
-
- def loss(self, target: tf.Tensor, predict: tf.Tensor) -> tf.Tensor:
- """Вычисляет и возвращает потери в соответствии с функцией потерь"""
- return tf.math.squared_difference(target, predict)
-
- def make_mask(self, action) -> tf.Tensor:
- """Создает маску по действиям """
- return tf.one_hot(action, self.output_spec()[-1])
-
- def _update_next_state(self, state, action):
- pass
-
- def initial_state(self):
- pass
-
- @staticmethod
- def create_model(input_shape: tuple, action_space: int) -> tf.keras.Model:
- """Создает модель tf.keras.Model, архитектура DQN"""
- input_layer = layers.Input(shape=input_shape, )
- dence_layer1 = layers.Dense(256, activation='relu')(input_layer)
- dence_layer2 = layers.Dense(256, activation='relu')(dence_layer1)
- dence_out = layers.Dense(action_space, activation=None)(dence_layer2)
-
- return tf.keras.Model(inputs=input_layer, outputs=dence_out)
-
- @staticmethod
- def create_model_with_conv(input_shape: tuple, action_space: int) -> tf.keras.Model:
- """Создает модель tf.keras.Model, архитектура DQN, начальные слои - сверточные"""
- input_layer = layers.Input(shape=input_shape, )
- cov_layer1 = layers.Conv2D(32, 7, activation='relu')(input_layer)
- cov_layer2 = layers.Conv2D(64, 5, activation='relu')(cov_layer1)
- cov_layer3 = layers.Conv2D(64, 3, activation='relu')(cov_layer2)
- conv_out = layers.Flatten()(cov_layer3)
-
- dence_layer1 = layers.Dense(256, activation='relu')(conv_out)
- dence_layer2 = layers.Dense(256, activation='relu')(dence_layer1)
- dence_out = layers.Dense(action_space, activation=None)(dence_layer2)
-
- return tf.keras.Model(inputs=input_layer, outputs=dence_out)
-
-class DQN(SimpleQ):
- def __init__(self, config):
- super().__init__(DQN_Model, DQN_Model, config, default_config_path=__file__, algo_name = "DQN", name = "DQN_Model_" + config.get('model_config','').get('name',''))
-
- def _prediction_processing(self, input_data):
- pass
-
- def _update_next_state(self, state, action):
- pass
-
- def initial_state(self):
- pass
-
-
diff --git a/rl_lib/src/algoritms/drqn/drqn.py b/rl_lib/src/algoritms/drqn/drqn.py
deleted file mode 100644
index 5d9294c..0000000
--- a/rl_lib/src/algoritms/drqn/drqn.py
+++ /dev/null
@@ -1,138 +0,0 @@
-import tensorflow as tf
-from tensorflow.keras import layers
-import numpy as np
-
-from rl_lib.src.models.model import Model
-from rl_lib.src.algoritms.simple_q import SimpleQ
-
-class DRQN_Model(Model):
- def __init__(self, config = {},**kwargs):
- super().__init__(model_config = config.get('model_config', {}), config = config, default_config_path=__file__, **kwargs)
- self.h_t, self.c_t, self.new_h_t, self.new_c_t = None, None, None, None
- self.lstm_size = config['model_config'].get("lstm_size", 64)
-
- def __call__(self, inputs: tf.Tensor) -> tf.Tensor:
- return super().__call__([inputs, self.h_t, self.c_t] if not isinstance(inputs, list) else inputs)
-
- def _initial_model(self):
- input_shape = self._config['model_config']["input_shape"]
- action_space = self._config['model_config']["action_space"]
- if len(input_shape) == 1:
- return self.create_model(input_shape, action_space, self.lstm_size)
- else:
- return self.create_model_with_conv(input_shape, action_space, self.lstm_size)
-
- def initial_state(self):
- """Инициализирует внутреннее состояние рекуррентной сети"""
- self.h_t = tf.zeros((1, self.lstm_size),dtype=tf.float32)
- self.c_t = self.h_t
-
- def get_states(self) -> tuple:
- """Возвращает кортеж внутренних состояний реккурентной сети"""
- return tf.squeeze(self.h_t.numpy()), tf.squeeze(self.c_t.numpy())
-
- def loss(self, target: tf.Tensor, predict: tf.Tensor) -> tf.Tensor:
- """Вычисляет и возвращает потери в соответствии с функцией потерь"""
- return tf.math.squared_difference(target, predict)
-
- def make_mask(self, action) -> tf.Tensor:
- """Создает маску по действиям """
- return tf.one_hot(action, self.output_spec()[-1])
-
- def _prediction_processing(self, inputs: tf.Tensor, **kwargs):
- mask = self.make_mask(kwargs['action'])
- while len(inputs.shape) < len(mask.shape): mask = tf.expand_dims(mask, -1)
- return tf.reduce_sum(tf.multiply(inputs, mask), axis=kwargs['batch_dims'])[:, kwargs['recurrent_skip']:]
-
-
- def _update_next_state(self):
- """Обновляет внутреннее состояние рекуррентной сети"""
- self.h_t, self.c_t = self.new_h_t, self.new_c_t
-
- @staticmethod
- def create_model(input_shape: tuple, action_space: int, lstm_size: int) -> tf.keras.Model:
- """Создает модель tf.keras.Model, архитектура DRQN"""
- input_layer = layers.Input(shape=input_shape, )
- h_t_input = layers.Input(shape=(lstm_size, ), )
- c_t_input = layers.Input(shape=(lstm_size, ), )
-
- lstm = layers.LSTM(lstm_size, activation='tanh', recurrent_activation='sigmoid', return_sequences = True,
- return_state=True, stateful = False)(input_layer, initial_state = [h_t_input, c_t_input])
- dence_layer1 = layers.Dense(256, activation='relu')(input_layer)
- dence_layer2 = layers.Dense(128, activation='relu')(dence_layer1)
- dence_out = layers.Dense(action_space, activation=None)(dence_layer2)
-
- return tf.keras.Model(inputs=[input_layer, h_t_input, c_t_input], outputs=[dence_out, lstm[1], lstm[2]])
-
- @staticmethod
- def create_model_with_conv(input_shape: tuple, action_space: int, lstm_size: int) -> tf.keras.Model:
- """Создает модель tf.keras.Model, архитектура DRQN, начальные слои - сверточные"""
- input_layer = layers.Input(shape=input_shape, )
- h_t_input = layers.Input(shape=(lstm_size, ), )
- c_t_input = layers.Input(shape=(lstm_size, ), )
-
- cov_layer1 = layers.Conv2D(32, 7, activation='relu')(input_layer)
- cov_layer2 = layers.Conv2D(64, 5, activation='relu')(cov_layer1)
- cov_layer3 = layers.Conv2D(64, 3, activation='relu')(cov_layer2)
- conv_out = layers.Flatten()(cov_layer3)
- lstm = layers.LSTM(lstm_size, activation='tanh', recurrent_activation='sigmoid', return_sequences = True,
- return_state=True, stateful = False)(conv_out, initial_state = [h_t_input, c_t_input])
- dence_layer1 = layers.Dense(256, activation='relu')(lstm[0])
- dence_layer2 = layers.Dense(128, activation='relu')(dence_layer1)
- dence_out = layers.Dense(action_space, activation=None)(dence_layer2)
-
- return tf.keras.Model(inputs=[input_layer, h_t_input, c_t_input], outputs=[dence_out, lstm[1], lstm[2]])
-
-class DRQN(SimpleQ):
- def __init__(self, config):
- super().__init__(DRQN_Model, DRQN_Model, config, default_config_path=__file__, algo_name = "DRQN", name = "DRQN_Model_" + config.get('model_config','').get('name',''))
-
- self.initial_state()
- self.recurrent_skip = self.config['buffer_config']['recurrent_skip']
- self.trace_length = self.config['buffer_config']['trace_length']
- self.recurrent = True
- self.batch_dims = 2
-
- def add(self, data: tuple, priority = None) -> None:
- """
- Добавляет переходы в буфер
- Аргументы:
- data: tuple(state, action, reward, done, next_state)
- priority: np.array (только для приоритетных буферов)
- """
- super().add((*data, *self.action_model.get_states()), priority)
- self._update_next_state()
-
- def initial_state(self):
- """Сбравсывает внутренне состояние lstm"""
- self.action_model.initial_state()
-
- def _get_action(self, observation: tf.Tensor) -> tf.Tensor:
- """Возвращает ценность дейтсвий Q(s,a) всех действий на основе наблюдения"""
- predict = super()._get_action(observation)
- action, self.action_model.new_h_t, self.action_model.new_c_t = predict
- return action
-
- def get_test_action(self, observation: tf.Tensor) -> float:
- action = super().get_test_action(observation)
- self._update_next_state()
- return action
-
- def get_batch(self, ):
- batch = super().get_batch()
-
- new_h_t, new_c_t = tf.squeeze(batch['h_t'][:, 1:],axis=1), tf.squeeze(batch['c_t'][:, 1:],axis=1)
- h_t, c_t = tf.squeeze(batch['h_t'][:, :-1],axis=1), tf.squeeze(batch['c_t'][:, :-1],axis=1)
- batch['state'] = [batch['state'], h_t, c_t]
- batch['next_state'] = [batch['next_state'], new_h_t, new_c_t]
- batch['recurrent_skip'] = self.recurrent_skip
- batch['trace_length'] = self.trace_length
-
- if self.priority: batch['weights'] = np.repeat(np.expand_dims(batch['weights'], -1), self.trace_length-self.recurrent_skip, axis=1)
- return batch
-
- def _update_next_state(self):
- """Обновляет внутреннее состояние lstm новым состоянием lstm"""
- self.action_model._update_next_state()
-
-
diff --git a/__init__.py b/rl_lib/src/algoritms/model_free/__init__.py
similarity index 100%
rename from __init__.py
rename to rl_lib/src/algoritms/model_free/__init__.py
diff --git a/rl_lib/src/algoritms/model_free/continuous_control/__init__.py b/rl_lib/src/algoritms/model_free/continuous_control/__init__.py
new file mode 100644
index 0000000..fca471c
--- /dev/null
+++ b/rl_lib/src/algoritms/model_free/continuous_control/__init__.py
@@ -0,0 +1 @@
+from .ddpg.ddpg import DDPG, DDPG_Model
diff --git a/rl_lib/src/algoritms/model_free/continuous_control/ddpg/__init__.py b/rl_lib/src/algoritms/model_free/continuous_control/ddpg/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/rl_lib/src/algoritms/ddpg/config.yaml b/rl_lib/src/algoritms/model_free/continuous_control/ddpg/config.yaml
similarity index 87%
rename from rl_lib/src/algoritms/ddpg/config.yaml
rename to rl_lib/src/algoritms/model_free/continuous_control/ddpg/config.yaml
index c7bab98..e0b26dd 100644
--- a/rl_lib/src/algoritms/ddpg/config.yaml
+++ b/rl_lib/src/algoritms/model_free/continuous_control/ddpg/config.yaml
@@ -6,7 +6,7 @@ model_config:
action_space: None
discount_factor : 0.99
n_step: 1
- batch_size: 32
+ batch_size: 16
double_network: False
priority: False
@@ -14,18 +14,18 @@ model_config:
actor_model_config:
model_config:
model: None
- tau: 0.01
+ tau: 0.001
critic_model_config:
model_config:
model: None
- tau: 0.01
+ tau: 0.001
actor_optimizer_config:
optimizer_config:
optimizer_name: "adam"
optimizer_params:
- learning_rate: 0.001
+ learning_rate: 0.0001
epsilon: 0.001
clipnorm: 1.0
custom_optimizer: None
@@ -34,7 +34,7 @@ critic_optimizer_config:
optimizer_config:
optimizer_name: "adam"
optimizer_params:
- learning_rate: 0.002
+ learning_rate: 0.001
epsilon: 0.001
clipnorm: 1.0
custom_optimizer: None
@@ -54,11 +54,14 @@ buffer_config:
exploration_config:
strategy_name: "ou_noise"
strategy_config:
- alpha: 0.9
- sigma: 1.0
+ alpha: 0.0
+ sigma: 0.2
action_space: None
upper_bound: None
lower_bound: None
+ dt: 0.01
+ mean: None
+ theta: 0.15
data_saver:
path: ""
diff --git a/rl_lib/src/algoritms/model_free/continuous_control/ddpg/ddpg.py b/rl_lib/src/algoritms/model_free/continuous_control/ddpg/ddpg.py
new file mode 100644
index 0000000..139acf6
--- /dev/null
+++ b/rl_lib/src/algoritms/model_free/continuous_control/ddpg/ddpg.py
@@ -0,0 +1,72 @@
+from typing import Any
+
+import tensorflow as tf
+from tensorflow.keras import layers
+
+from ...policy_gradient.a2c.actor_critic import Actor_Critic_Model
+from ...value_based.simple_q import SimpleQ
+
+
+class DDPG_Model(Actor_Critic_Model):
+ def __init__(self, config={}, **kwargs):
+ super().__init__(config=config, **kwargs)
+
+
+class DDPG(SimpleQ):
+ def __init__(self, config):
+ self.actor_tau = config['actor_model_config']['model_config']['tau']
+ self.critic_tau = config['critic_model_config']['model_config']['tau']
+ super().__init__(DDPG_Model, DDPG_Model,
+ config, default_config_path=__file__,
+ algo_name="DDPG_Model",
+ name=("DDPG_Model_" +
+ config.get('model_config', '').get('name', '')))
+
+ def _prediction_processing(self, input_data):
+ pass
+
+ def _update_next_state(self, state, action):
+ pass
+
+ def initial_state(self):
+ pass
+
+ def get_batch(self, ):
+ batch = super().get_batch()
+ batch['reward'] = tf.reshape(batch['reward'], (self.batch_size, 1))
+ batch['done'] = tf.reshape(batch['done'], (self.batch_size, 1))
+ return batch
+
+ def get_best_action(self, Qaction, Qtarget):
+ return Qtarget
+
+ def _train_step(self, **batch) -> dict:
+ """Вспомогательная train_step"""
+ batch = self.choice_model_for_double_calculates(**batch)
+ batch['batch_dims'] = self.BATCH_DIMS
+ if self.priority:
+ batch['weights'] = tf.expand_dims(batch['weights'], -1)
+ if batch['p_double'] > 0.5:
+ self.action_model.update_weights_actor(**batch)
+ return self.action_model.update_weights_critic(**batch)
+ else:
+ self.target_model.update_weights_actor(**batch)
+ return self.target_model.update_weights_critic(**batch)
+
+ def copy_weights(self) -> tf.constant:
+ """Копирует веса из модели действия в целевую модель"""
+ _ = self._copy_weights(self.action_model.actor_model.weights,
+ self.target_model.actor_model.weights,
+ self.actor_tau)
+ _ = self._copy_weights(self.action_model.critic_model.weights,
+ self.target_model.critic_model.weights,
+ self.critic_tau)
+ return tf.constant(1)
+
+ @tf.function(reduce_retracing=True,
+ jit_compile=True,
+ experimental_autograph_options=tf.autograph.experimental.Feature.ALL)
+ def sample_action(self, state: tf.Tensor) -> tf.Tensor:
+ """Возвращает предсказания модели на основе текущих наблюдений"""
+ predict = self.action_model.actor_model(state)
+ return self.squeeze_predict(predict)
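
`get_best_action` above simply returns the target network's estimate, so the bootstrap target presumably ends up being the standard DDPG target. It is shown here only as the textbook formula; how `SimpleQ` actually assembles it is not visible in this diff, so treat the wiring as an assumption.
```
def ddpg_target_sketch(reward, done, next_state, target_actor, target_critic,
                       gamma=0.99):
    next_action = target_actor(next_state)              # mu'(s')
    q_next = target_critic([next_state, next_action])   # Q'(s', mu'(s'))
    # y = r + gamma * (1 - done) * Q'(s', mu'(s'))
    return reward + gamma * (1.0 - done) * q_next
```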
diff --git a/rl_lib/src/algoritms/model_free/policy_gradient/__init__.py b/rl_lib/src/algoritms/model_free/policy_gradient/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/rl_lib/src/algoritms/model_free/policy_gradient/a2c/__init__.py b/rl_lib/src/algoritms/model_free/policy_gradient/a2c/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/rl_lib/src/algoritms/model_free/policy_gradient/a2c/actor_critic.py b/rl_lib/src/algoritms/model_free/policy_gradient/a2c/actor_critic.py
new file mode 100644
index 0000000..14e6421
--- /dev/null
+++ b/rl_lib/src/algoritms/model_free/policy_gradient/a2c/actor_critic.py
@@ -0,0 +1,180 @@
+import abc
+import tensorflow as tf
+from tensorflow.keras import layers
+
+from rl_lib.src.models.model import Model
+from rl_lib.src.algoritms.model_free.value_based import DQN_Model
+
+
+class _abc_Actor_Model(Model):
+ def __init__(self, config: dict = {}, **kwargs):
+ config['model_config'] = config['actor_model_config']['model_config']
+ config['optimizer_config'] = config['actor_optimizer_config']['optimizer_config']
+ super().__init__(config=config, **kwargs)
+ self.name = kwargs.get('name', 'error_name') + '_actor_'
+
+
+class _abc_Critric_Model(Model):
+ def __init__(self, config={}, **kwargs):
+ config['model_config'] = config['critic_model_config']['model_config']
+ config['optimizer_config'] = config['critic_optimizer_config']['optimizer_config']
+ super().__init__(config=config, **kwargs)
+ self.name = kwargs.get('name', 'error_name') + '_critic_'
+
+
+class _abc_Actor_Critic_Model(Model):
+ def __init__(self, actor_model: object = None, critic_model: object = None,
+ config: dict = {}, **kwargs):
+ config['actor_model_config']['model_config']['name'] = config['model_config']['name']
+ config['actor_model_config']['model_config']['input_shape'] = config['model_config']['input_shape']
+ config['actor_model_config']['model_config']['action_space'] = config['model_config']['action_space']
+
+ config['critic_model_config']['model_config']['name'] = config['model_config']['name']
+ config['critic_model_config']['model_config']['input_shape'] = config['model_config']['input_shape']
+ config['critic_model_config']['model_config']['action_space'] = config['model_config']['action_space']
+ self.actor_model = actor_model(config=config, **kwargs)
+ self.critic_model = critic_model(config=config, **kwargs)
+
+ @abc.abstractmethod
+ def calculate_gradients(self, **kwargs) -> dict:
+ "Функция кастомного вычисления градиентов"
+
+ def update_weights(self, **kwargs):
+ _ = self.update_weights_actor(**kwargs)
+ return self.update_weights_critic(**kwargs)
+
+ def update_weights_actor(self, **kwargs):
+ kwargs['critic_model'] = self.critic_model.model
+ loss = self.actor_model.update_weights(**kwargs)
+ return {'loss': loss['loss'], 'td_error': loss['td_error']}
+
+ def update_weights_critic(self, **kwargs) -> dict:
+ loss = self.critic_model.update_weights(**kwargs)
+ return {'loss': loss['loss'], 'td_error': loss['td_error']}
+
+ def get_weights(self, ) -> dict:
+ return {
+ 'actor': self.actor_model.get_weights(),
+ 'critic': self.critic_model.get_weights()
+ }
+
+ def input_spec(self, key=None):
+ return self.actor_model.input_spec(key=key)
+
+ def load(self, path):
+ self.actor_model.load(path)
+ self.critic_model.load(path)
+
+ def save(self, path):
+ self.actor_model.save(path)
+ self.critic_model.save(path)
+
+ def set_weights(self, weights: dict) -> None:
+ self.actor_model.set_weights(weights=weights['actor'])
+ self.critic_model.set_weights(weights=weights['critic'])
+
+ @property
+ def summary(self):
+ self.actor_model.summary
+ self.critic_model.summary
+
+
+class Actor_Model(_abc_Actor_Model, DQN_Model):
+ def __init__(self, config={}, **kwargs):
+ super().__init__(config=config, **kwargs)
+
+ def _prediction_processing(self, inputs: tf.Tensor, **kwargs):
+ return kwargs['critic_model']([kwargs['state'], inputs])
+
+ def loss(self, target: tf.Tensor, predict: tf.Tensor) -> tf.Tensor:
+ """Вычисляет и возвращает потери в соответствии с функцией потерь"""
+ return tf.reduce_mean(predict, axis=0) * (-1)
+
+
+class Critic_Model(_abc_Critric_Model, DQN_Model):
+ def __init__(self, config={}, **kwargs):
+ super().__init__(config=config, **kwargs)
+
+ def _prediction_processing(self, inputs: tf.Tensor, **kwargs):
+ return inputs
+
+ @tf.function(reduce_retracing=True,
+ jit_compile=False,
+ experimental_autograph_options=tf.autograph.experimental.Feature.ALL)
+ def calculate_gradients(self, **kwargs) -> dict:
+ """
+ Вычисляет градиенты, лосс, td-ошибку
+
+ Kwargs:
+ dict содержащий батч, таргет, маску, опционально приоритетные веса
+
+ Returns:
+ dict содержащий лоссы и td-ошибку
+ """
+ with tf.GradientTape(persistent=False) as tape:
+ Q = self.model([kwargs['state'], kwargs['action']], training=True)
+ Q = self.prediction_processing(Q, **kwargs)
+ if len(Q.shape) != len(kwargs['Qtarget'].shape):
+ Q = tf.expand_dims(Q, -1)
+
+ td_error = kwargs['Qtarget'] - Q
+ loss = self.loss(kwargs['Qtarget'], Q)*kwargs.get('weights', 1.0)
+ E_loss = tf.reduce_mean(loss, axis=0)
+ gradients = tape.gradient(E_loss, self.model.trainable_variables)
+ loss = tf.reduce_mean(loss, axis=-1)
+ return {'gradients': gradients, 'loss': loss, 'td_error': td_error}
+
+ @staticmethod
+ def create_model(input_shape: tuple, action_space: int) -> tf.keras.Model:
+ """Создает модель tf.keras.Model, архитектура DQN"""
+ input_layer = layers.Input(shape=input_shape, )
+ action_layer = layers.Input(shape=action_space, )
+ concat = layers.Concatenate()((input_layer, action_layer))
+ flatten = layers.Flatten()(concat)
+ dence_layer1 = layers.Dense(256, activation='relu')(flatten)
+ dence_layer2 = layers.Dense(256, activation='relu')(dence_layer1)
+ dence_out = layers.Dense(action_space, activation=None)(dence_layer2)
+
+ return tf.keras.Model(
+ inputs=[input_layer, action_layer],
+ outputs=dence_out
+ )
+
+ @staticmethod
+ def create_model_with_conv(input_shape: tuple,
+ action_space: int) -> tf.keras.Model:
+ """Создает модель tf.keras.Model, архитектура DQN,
+ начальные слои - сверточные"""
+ input_layer = layers.Input(shape=input_shape, )
+ action_layer = layers.Input(shape=action_space, )
+ cov_layer1 = layers.Conv2D(32, 7, activation='relu')(input_layer)
+ cov_layer2 = layers.Conv2D(64, 5, activation='relu')(cov_layer1)
+ cov_layer3 = layers.Conv2D(64, 3, activation='relu')(cov_layer2)
+ conv_out = layers.Flatten()(cov_layer3)
+
+ concat = layers.Concatenate()((conv_out, action_layer))
+ flatten = layers.Flatten()(concat)
+ dence_layer1 = layers.Dense(256, activation='relu')(flatten)
+ dence_layer2 = layers.Dense(256, activation='relu')(dence_layer1)
+ dence_out = layers.Dense(action_space, activation=None)(dence_layer2)
+
+ return tf.keras.Model(
+ inputs=[input_layer, action_layer],
+ outputs=dence_out
+ )
+
+
+class Actor_Critic_Model(_abc_Actor_Critic_Model, DQN_Model):
+ def __init__(self, config={}, **kwargs):
+ super().__init__(actor_model=Actor_Model,
+ critic_model=Critic_Model,
+ config=config,
+ **kwargs)
+
+ def __call__(self, input: tf.Tensor) -> tf.Tensor:
+ return self.critic_model([input, self.actor_model(input)])
+
+ def calculate_gradients(self, **kwargs) -> dict:
+ kwargs['action'] = self.actor_model(kwargs['next_state'])
+ gradients = self.critic_model.calculate_gradients(**kwargs)
+ return gradients
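
`Actor_Model` above defines its loss as the negated mean of the critic's output for the actor's own actions, i.e. the deterministic policy-gradient update. A standalone TensorFlow sketch of that update follows; `actor`, `critic`, `states` and the optimizer handling are illustrative, since the library routes this through its `Model` wrapper.
```
import tensorflow as tf

def actor_update_sketch(actor, critic, states, optimizer):
    with tf.GradientTape() as tape:
        actions = actor(states, training=True)
        q_values = critic([states, actions], training=True)
        loss = -tf.reduce_mean(q_values)   # maximize Q(s, mu(s)) by minimizing -Q
    grads = tape.gradient(loss, actor.trainable_variables)
    optimizer.apply_gradients(zip(grads, actor.trainable_variables))
    return loss
```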
diff --git a/rl_lib/src/algoritms/model_free/policy_gradient/ppo/__init__.py b/rl_lib/src/algoritms/model_free/policy_gradient/ppo/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/rl_lib/src/algoritms/model_free/policy_gradient/ppo/ppo.py b/rl_lib/src/algoritms/model_free/policy_gradient/ppo/ppo.py
new file mode 100644
index 0000000..ad3a819
--- /dev/null
+++ b/rl_lib/src/algoritms/model_free/policy_gradient/ppo/ppo.py
@@ -0,0 +1,14 @@
+from ..a2c.actor_critic import _abc_Actor_Model, _abc_Critric_Model, _abc_Actor_Critic_Model
+
+
+class PPO_Actor_Model(_abc_Actor_Model):
+ def __init__(self, config=..., **kwargs):
+ super().__init__(config, **kwargs)
+
+class PPO_Critic_Model(_abc_Critric_Model):
+ def __init__(self, config=..., **kwargs):
+ super().__init__(config, **kwargs)
+
+class PPO_Model(_abc_Actor_Critic_Model):
+ def __init__(self, actor_model: object = None, critic_model: object = None, config: dict = ..., **kwargs):
+ super().__init__(actor_model, critic_model, config, **kwargs)
\ No newline at end of file
diff --git a/rl_lib/src/algoritms/model_free/value_based/__init__.py b/rl_lib/src/algoritms/model_free/value_based/__init__.py
new file mode 100644
index 0000000..9b3f352
--- /dev/null
+++ b/rl_lib/src/algoritms/model_free/value_based/__init__.py
@@ -0,0 +1,3 @@
+from .dqn.dqn import DQN, DQN_Model
+from .drqn.drqn import DRQN, DRQN_Model
+from .qr_dqn.qr_dqn import QR_DQN, QR_DQN_Model
\ No newline at end of file
diff --git a/rl_lib/src/algoritms/model_free/value_based/base_algo.py b/rl_lib/src/algoritms/model_free/value_based/base_algo.py
new file mode 100644
index 0000000..9c74235
--- /dev/null
+++ b/rl_lib/src/algoritms/model_free/value_based/base_algo.py
@@ -0,0 +1,144 @@
+import abc
+from copy import copy
+from typing import Union
+
+import tensorflow as tf
+
+from rl_lib.src.data_saver.saver import Saver
+
+from rl_lib.src.data_saver.utils import load_default_config
+from .utils import update_config
+
+
+class Base_Algo(Saver, abc.ABC):
+ """Базовый абстрактный класс алгоритма.
+    Хранит все методы, необходимые для вычислений в каком-либо алгоритме.
+ """
+
+ def __init__(self, action_model: object,
+ target_model: object,
+ config: dict,
+ default_config_path: str,
+ *args, **kwargs):
+ self._config = load_default_config(default_config_path)
+ update_config(self._config, config)
+
+ self.action_model = action_model(
+ config=copy(self._config),
+ algo_name=kwargs.get("algo_name", "unkown"),
+ name=(kwargs.get("name", "unkown_name") +
+ "_action_" +
+ config.get("model_config", {}).get("name", ""))
+ )
+ self.target_model = target_model(
+ config=copy(self._config),
+ algo_name=kwargs.get("algo_name", "unkown"),
+ name=(kwargs.get("name", "unkown_name") +
+ "_target_" +
+ config.get("model_config", {}).get("name", ""))
+ )
+ super().__init__(**self.config.get('data_saver', {}), **kwargs)
+ self.target_model.set_weights(self.action_model.get_weights())
+
+ @property
+ def config(self):
+ return self._config
+
+ @abc.abstractclassmethod
+ def calculate_new_best_action(self) -> tf.Tensor:
+        """Вычисляет новое лучшее действие для получения таргета"""
+
+ @abc.abstractclassmethod
+ def calculate_target(self) -> dict:
+ """Вычисляет таргет для обучения"""
+
+ @abc.abstractclassmethod
+ def get_action(self, observation) -> float:
+ """Возвращает действие на основе наблюдения с учетом исследования"""
+
+ @abc.abstractclassmethod
+ def get_test_action(self, observation) -> float:
+ """Возвращает действие на основе наблюдения без исследования"""
+
+ @abc.abstractclassmethod
+ def get_gradients(self) -> tf.Tensor:
+ """Вычисляет градиенты и возвращает их"""
+
+ @abc.abstractclassmethod
+ def load(self, path) -> None:
+ """Загружает алгоритм"""
+
+ @abc.abstractclassmethod
+ def reset(self) -> None:
+ """Сбрасывает внутренние данные модели"""
+
+ @abc.abstractclassmethod
+ def _train_step(self) -> dict:
+ """Вспомогательная train_step"""
+
+ @abc.abstractclassmethod
+ def train_step(self) -> dict:
+ """Вычисляет полный обучающий шаг"""
+
+ @abc.abstractclassmethod
+ def save(self, path) -> None:
+ """Сохраняет алгоритм"""
+
+ @abc.abstractclassmethod
+ def summary(self) -> None:
+ """Выводит архитектуру модели"""
+
+ @tf.function(reduce_retracing=None,
+ jit_compile=None,
+ experimental_autograph_options=None)
+ def _copy_weights(self, action_model_weights: list,
+ target_model_weights: list,
+ tau: float) -> tf.constant:
+ """Копирует веса из модели действия в целевую модель"""
+ for a_w, t_w in zip(action_model_weights, target_model_weights):
+ new_weights = tf.add(tf.multiply(tau, a_w),
+ tf.multiply((1-tau), t_w))
+ t_w.assign(tf.identity(new_weights))
+ return tf.constant(1)
+
+ def copy_weights(self) -> tf.constant:
+ """Копирует веса из модели действия в целевую модель"""
+ res = self._copy_weights(
+ self.action_model.weights, self.target_model.weights, self.tau)
+ return res
+
+ def _expand_dims_like(self, tensor: tf.Tensor,
+ tensor_like: tf.Tensor) -> tf.Tensor:
+ len_tensor_like_shape = len(tensor_like.shape)
+
+ while len(tensor.shape) < len_tensor_like_shape:
+ tensor = tf.expand_dims(tensor, axis=-1)
+ return tensor
+
+ @tf.function(reduce_retracing=True,
+ jit_compile=True,
+ experimental_autograph_options=tf.autograph.experimental.Feature.ALL)
+ def sample_action(self,
+ state: Union[tf.Tensor, tuple]
+ ) -> Union[tf.Tensor, list]:
+ """Возвращает предсказания модели на основе текущих наблюдений"""
+ predict = self.action_model(state)
+ if isinstance(predict, list):
+ return self.squeeze_predict(predict[0]), predict[1], predict[2]
+ return self.squeeze_predict(predict)
+
+ @tf.function(reduce_retracing=None,
+ jit_compile=None,
+ experimental_autograph_options=None)
+ def set_weights(self, target_weights: list) -> tf.constant:
+ """Устанавливает переданные как аргумент веса в основную сеть"""
+ for a_w, t_w in zip(self.action_model.weights, target_weights):
+ a_w.assign(tf.identity(t_w))
+ return tf.constant(1)
+
+ @staticmethod
+ def squeeze_predict(predict) -> tf.Tensor:
+ """Удаляет единичные измерения из предсказаний"""
+ while len(predict.shape) >= 1 and predict.shape[0] == 1:
+ predict = tf.squeeze(predict, axis=0)
+ return predict
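
`_copy_weights` above implements the usual Polyak soft update: `target <- tau * action + (1 - tau) * target`. With `tau: 1.0` (as in the QR_DQN config added later in this diff) this reduces to a hard copy, while the DDPG configs use `tau: 0.001` for slow tracking. A plain restatement for reference:
```
def soft_update_sketch(action_weights, target_weights, tau):
    # Polyak averaging of two lists of weight arrays.
    return [tau * a + (1.0 - tau) * t
            for a, t in zip(action_weights, target_weights)]
```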
diff --git a/rl_lib/src/algoritms/model_free/value_based/dqn/__init__.py b/rl_lib/src/algoritms/model_free/value_based/dqn/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/rl_lib/src/algoritms/dqn/config.yaml b/rl_lib/src/algoritms/model_free/value_based/dqn/config.yaml
similarity index 100%
rename from rl_lib/src/algoritms/dqn/config.yaml
rename to rl_lib/src/algoritms/model_free/value_based/dqn/config.yaml
diff --git a/rl_lib/src/algoritms/model_free/value_based/dqn/dqn.py b/rl_lib/src/algoritms/model_free/value_based/dqn/dqn.py
new file mode 100644
index 0000000..79683fc
--- /dev/null
+++ b/rl_lib/src/algoritms/model_free/value_based/dqn/dqn.py
@@ -0,0 +1,84 @@
+import tensorflow as tf
+from tensorflow.keras import layers
+
+from rl_lib.src.models.model import Model
+
+from ..simple_q import SimpleQ
+
+
+class DQN_Model(Model):
+ def __init__(self, config={}, **kwargs):
+ super().__init__(
+ model_config=config.get('model_config', {}),
+ config=config,
+ **kwargs)
+
+ def _prediction_processing(self, inputs: tf.Tensor, **kwargs):
+ mask = self.make_mask(tf.cast(kwargs['action'], dtype=tf.int32))
+ if len(inputs.shape) != len(mask.shape):
+ mask = tf.expand_dims(mask, -1)
+ return tf.reduce_sum(
+ tf.multiply(inputs, mask),
+ axis=kwargs['batch_dims']
+ )
+
+ def loss(self, target: tf.Tensor, predict: tf.Tensor) -> tf.Tensor:
+ """Вычисляет и возвращает потери в соответствии с функцией потерь"""
+ return tf.math.squared_difference(target, predict)
+
+ def make_mask(self, action) -> tf.Tensor:
+ """Создает маску по действиям """
+ return tf.one_hot(action, self.output_spec()[-1])
+
+ def _update_next_state(self, state, action):
+ pass
+
+ def initial_state(self):
+ pass
+
+ @staticmethod
+ def create_model(input_shape: tuple, action_space: int) -> tf.keras.Model:
+ """Создает модель tf.keras.Model, архитектура DQN"""
+ input_layer = layers.Input(shape=input_shape, )
+ dence_layer1 = layers.Dense(256, activation='relu')(input_layer)
+ dence_layer2 = layers.Dense(256, activation='relu')(dence_layer1)
+ dence_out = layers.Dense(action_space, activation=None)(dence_layer2)
+
+ return tf.keras.Model(inputs=input_layer, outputs=dence_out)
+
+ @staticmethod
+ def create_model_with_conv(input_shape: tuple,
+ action_space: int
+ ) -> tf.keras.Model:
+ """Создает модель tf.keras.Model, архитектура DQN,
+ начальные слои - сверточные"""
+ input_layer = layers.Input(shape=input_shape, )
+ cov_layer1 = layers.Conv2D(32, 7, activation='relu')(input_layer)
+ cov_layer2 = layers.Conv2D(64, 5, activation='relu')(cov_layer1)
+ cov_layer3 = layers.Conv2D(64, 3, activation='relu')(cov_layer2)
+ conv_out = layers.Flatten()(cov_layer3)
+
+ dence_layer1 = layers.Dense(256, activation='relu')(conv_out)
+ dence_layer2 = layers.Dense(256, activation='relu')(dence_layer1)
+ dence_out = layers.Dense(action_space, activation=None)(dence_layer2)
+
+ return tf.keras.Model(inputs=input_layer, outputs=dence_out)
+
+
+class DQN(SimpleQ):
+ def __init__(self, config):
+ super().__init__(DQN_Model, DQN_Model, config,
+ default_config_path=__file__,
+ algo_name="DQN",
+ name=("DQN_Model_" +
+ config.get('model_config', '').get('name', ''))
+ )
+
+ def _prediction_processing(self, input_data):
+ pass
+
+ def _update_next_state(self, state, action):
+ pass
+
+ def initial_state(self):
+ pass
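
For readers new to the masking in `DQN_Model._prediction_processing` above: the one-hot mask plus `reduce_sum` just picks Q(s, a) for the action actually taken. `tf.gather` is shown below only as an equivalent check; the library itself uses the mask form.
```
import tensorflow as tf

q_values = tf.constant([[1.0, 2.0, 3.0],
                        [4.0, 5.0, 6.0]])          # batch of Q(s, .)
actions = tf.constant([2, 0])                      # actions taken
mask = tf.one_hot(actions, q_values.shape[-1])
q_taken = tf.reduce_sum(q_values * mask, axis=-1)  # -> [3.0, 4.0]
same = tf.gather(q_values, actions, batch_dims=1)  # identical result
```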
diff --git a/rl_lib/src/algoritms/model_free/value_based/drqn/__init__.py b/rl_lib/src/algoritms/model_free/value_based/drqn/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/rl_lib/src/algoritms/drqn/config.yaml b/rl_lib/src/algoritms/model_free/value_based/drqn/config.yaml
similarity index 97%
rename from rl_lib/src/algoritms/drqn/config.yaml
rename to rl_lib/src/algoritms/model_free/value_based/drqn/config.yaml
index 98f48c8..4e3755b 100644
--- a/rl_lib/src/algoritms/drqn/config.yaml
+++ b/rl_lib/src/algoritms/model_free/value_based/drqn/config.yaml
@@ -1,4 +1,4 @@
-#default DQN config
+#default DRQN config
model_config:
model: None
diff --git a/rl_lib/src/algoritms/model_free/value_based/drqn/drqn.py b/rl_lib/src/algoritms/model_free/value_based/drqn/drqn.py
new file mode 100644
index 0000000..66c941d
--- /dev/null
+++ b/rl_lib/src/algoritms/model_free/value_based/drqn/drqn.py
@@ -0,0 +1,174 @@
+import numpy as np
+import tensorflow as tf
+from tensorflow.keras import layers
+
+from ..simple_q import SimpleQ
+from rl_lib.src.models.model import Model
+
+
+class DRQN_Model(Model):
+ def __init__(self, config={}, **kwargs):
+ super().__init__(model_config=config.get('model_config', {}),
+ config=config, default_config_path=__file__,
+ **kwargs)
+ self.h_t, self.c_t, self.new_h_t, self.new_c_t = None, None, None, None
+ self.lstm_size = config['model_config'].get("lstm_size", 64)
+
+ def __call__(self, inputs: tf.Tensor) -> tf.Tensor:
+ return super().__call__(
+ [inputs, self.h_t, self.c_t]
+ if not isinstance(inputs, list) else inputs
+ )
+
+ def _initial_model(self):
+ input_shape = self._config['model_config']["input_shape"]
+ action_space = self._config['model_config']["action_space"]
+ if len(input_shape) == 1:
+ return self.create_model(input_shape, action_space, self.lstm_size)
+ else:
+ return self.create_model_with_conv(input_shape,
+ action_space,
+ self.lstm_size)
+
+ def initial_state(self):
+ """Инициализирует внутреннее состояние рекуррентной сети"""
+ self.h_t = tf.zeros((1, self.lstm_size), dtype=tf.float32)
+ self.c_t = self.h_t
+
+ def get_states(self) -> tuple:
+ """Возвращает кортеж внутренних состояний реккурентной сети"""
+        """Возвращает кортеж внутренних состояний рекуррентной сети"""
+
+ def loss(self, target: tf.Tensor, predict: tf.Tensor) -> tf.Tensor:
+ """Вычисляет и возвращает потери в соответствии с функцией потерь"""
+ return tf.math.squared_difference(target, predict)
+
+ def make_mask(self, action) -> tf.Tensor:
+ """Создает маску по действиям """
+ return tf.one_hot(tf.cast(action, tf.int32), self.output_spec()[-1])
+
+ def _prediction_processing(self, inputs: tf.Tensor, **kwargs):
+ mask = self.make_mask(kwargs['action'])
+ while len(inputs.shape) < len(mask.shape):
+ mask = tf.expand_dims(mask, -1)
+ return tf.reduce_sum(
+ tf.multiply(inputs, mask),
+ axis=kwargs['batch_dims'])[:, kwargs['recurrent_skip']:]
+
+ def _update_next_state(self):
+ """Обновляет внутреннее состояние рекуррентной сети"""
+ self.h_t, self.c_t = self.new_h_t, self.new_c_t
+
+ @staticmethod
+ def create_model(input_shape: tuple,
+ action_space: int,
+ lstm_size: int
+ ) -> tf.keras.Model:
+ """Создает модель tf.keras.Model, архитектура DRQN"""
+ input_layer = layers.Input(shape=input_shape, )
+ h_t_input = layers.Input(shape=(lstm_size, ), )
+ c_t_input = layers.Input(shape=(lstm_size, ), )
+
+ lstm = layers.LSTM(lstm_size, activation='tanh',
+ recurrent_activation='sigmoid',
+ return_sequences=True,
+ return_state=True, stateful=False)(input_layer,
+ initial_state=[h_t_input, c_t_input])
+ dence_layer1 = layers.Dense(256, activation='relu')(input_layer)
+ dence_layer2 = layers.Dense(128, activation='relu')(dence_layer1)
+ dence_out = layers.Dense(action_space, activation=None)(dence_layer2)
+
+ return tf.keras.Model(
+ inputs=[input_layer, h_t_input, c_t_input],
+ outputs=[dence_out, lstm[1], lstm[2]]
+ )
+
+ @staticmethod
+ def create_model_with_conv(input_shape: tuple,
+ action_space: int,
+ lstm_size: int) -> tf.keras.Model:
+ """Создает модель tf.keras.Model, архитектура DRQN,
+ начальные слои - сверточные"""
+ input_layer = layers.Input(shape=input_shape, )
+ h_t_input = layers.Input(shape=(lstm_size, ), )
+ c_t_input = layers.Input(shape=(lstm_size, ), )
+
+ cov_layer1 = layers.Conv2D(32, 7, activation='relu')(input_layer)
+ cov_layer2 = layers.Conv2D(64, 5, activation='relu')(cov_layer1)
+ cov_layer3 = layers.Conv2D(64, 3, activation='relu')(cov_layer2)
+ conv_out = layers.Flatten()(cov_layer3)
+ lstm = layers.LSTM(lstm_size, activation='tanh',
+ recurrent_activation='sigmoid',
+ return_sequences=True,
+ return_state=True, stateful=False)(conv_out,
+ initial_state=[h_t_input, c_t_input])
+ dence_layer1 = layers.Dense(256, activation='relu')(lstm[0])
+ dence_layer2 = layers.Dense(128, activation='relu')(dence_layer1)
+ dence_out = layers.Dense(action_space, activation=None)(dence_layer2)
+
+ return tf.keras.Model(
+ inputs=[input_layer, h_t_input, c_t_input],
+ outputs=[dence_out, lstm[1], lstm[2]]
+ )
+
+
+class DRQN(SimpleQ):
+ def __init__(self, config):
+ super().__init__(DRQN_Model, DRQN_Model, config,
+ default_config_path=__file__,
+ algo_name="DRQN",
+ name=("DRQN_Model_" +
+ config.get('model_config', '').get('name', ''))
+ )
+
+ self.initial_state()
+ self.recurrent_skip = self.config['buffer_config']['recurrent_skip']
+ self.trace_length = self.config['buffer_config']['trace_length']
+ self.RECURRENT = True
+ self.BATCH_DIMS = 2
+
+ def add(self, data: tuple, priority=None) -> None:
+ """
+ Добавляет переходы в буфер
+ Аргументы:
+ data: tuple(state, action, reward, done, next_state)
+ priority: np.array (только для приоритетных буферов)
+ """
+ super().add((*data, *self.action_model.get_states()), priority)
+ self._update_next_state()
+
+ def initial_state(self):
+ """Сбравсывает внутренне состояние lstm"""
+ self.action_model.initial_state()
+
+ def _get_action(self, observation: tf.Tensor) -> tf.Tensor:
+ """Возвращает ценность дейтсвий Q(s,a) всех действий
+ на основе наблюдения"""
+ predict = super()._get_action(observation)
+ action, self.action_model.new_h_t, self.action_model.new_c_t = predict
+ return action
+
+ def get_test_action(self, observation: tf.Tensor) -> float:
+ action = super().get_test_action(observation)
+ self._update_next_state()
+ return action
+
+ def get_batch(self, ):
+ batch = super().get_batch()
+ new_h_t, new_c_t = batch['h_t'][:, 1], batch['c_t'][:, 1]
+ h_t, c_t = batch['h_t'][:, 0], batch['c_t'][:, 0]
+ batch['state'] = [batch['state'], h_t, c_t]
+ batch['next_state'] = [batch['next_state'], new_h_t, new_c_t]
+ batch['recurrent_skip'] = self.recurrent_skip
+ batch['trace_length'] = self.trace_length
+
+ if self.priority:
+ batch['weights'] = np.repeat(np.expand_dims(
+ batch['weights'], -1),
+ self.trace_length-self.recurrent_skip,
+ axis=1)
+ return batch
+
+ def _update_next_state(self):
+ """Обновляет внутреннее состояние lstm новым состоянием lstm"""
+ self.action_model._update_next_state()
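Для наглядности ниже приведен небольшой автономный набросок (не код библиотеки), показывающий механику переноса состояния LSTM между вызовами, на которую опирается DRQN_Model; размеры входа и lstm_size здесь условные.
```
import tensorflow as tf
from tensorflow.keras import layers

lstm_size = 8
inp = layers.Input(shape=(1, 4))        # один шаг последовательности, 4 признака
h_in = layers.Input(shape=(lstm_size,))
c_in = layers.Input(shape=(lstm_size,))
seq, h_out, c_out = layers.LSTM(lstm_size,
                                return_sequences=True,
                                return_state=True)(inp, initial_state=[h_in, c_in])
model = tf.keras.Model([inp, h_in, c_in], [seq, h_out, c_out])

# состояние переносится между вызовами вручную, как это делает DRQN_Model
h = tf.zeros((1, lstm_size))
c = tf.zeros((1, lstm_size))
for _ in range(3):
    obs = tf.random.normal((1, 1, 4))
    _, h, c = model([obs, h, c])
```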
diff --git a/rl_lib/src/algoritms/model_free/value_based/qr_dqn/__init__.py b/rl_lib/src/algoritms/model_free/value_based/qr_dqn/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/rl_lib/src/algoritms/model_free/value_based/qr_dqn/config.yaml b/rl_lib/src/algoritms/model_free/value_based/qr_dqn/config.yaml
new file mode 100644
index 0000000..54d71a3
--- /dev/null
+++ b/rl_lib/src/algoritms/model_free/value_based/qr_dqn/config.yaml
@@ -0,0 +1,49 @@
+#default QR_DQN config
+
+model_config:
+ model: None
+ name: "default_QR_DQN"
+ input_shape: None
+ action_space: None
+ discount_factor : 0.99
+ n_step: 1
+ batch_size: 32
+ double_network: False
+ priority: False
+ tau: 1.0
+ num_atoms: 200
+ hubber_k: 1.0
+
+optimizer_config:
+ optimizer_name: "adam"
+ optimizer_params:
+ learning_rate: 0.0001
+ epsilon: 0.001
+ clipnorm: 1.0
+ custom_optimizer: None
+
+buffer_config:
+ size: 100000
+ priority: False
+ n_step: None
+ discount_factor : None
+ eps: None
+ alpha: None
+ beta: None
+ beta_changing: None
+ beta_changing_curve: None
+ max_priority: None
+
+exploration_config:
+ strategy_name: "epsilon_greedy"
+ strategy_config:
+ eps_decay_steps: 100000
+ eps_max: 1.0
+ eps_min: 0.1
+ eps_test: 0.001
+ action_space: None
+
+data_saver:
+ path: ""
+ copy_path: ""
+
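Конфиг по умолчанию можно прочитать напрямую через PyYAML; значения, записанные как None без кавычек, PyYAML разбирает как строку "None", а не как Python None. Путь к файлу взят из диффа, рабочая директория в наброске условная.
```
from yaml import safe_load

with open("rl_lib/src/algoritms/model_free/value_based/qr_dqn/config.yaml") as f:
    cfg = safe_load(f)

print(cfg["model_config"]["num_atoms"])            # 200
print(cfg["exploration_config"]["strategy_name"])  # epsilon_greedy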
diff --git a/rl_lib/src/algoritms/model_free/value_based/qr_dqn/qr_dqn.py b/rl_lib/src/algoritms/model_free/value_based/qr_dqn/qr_dqn.py
new file mode 100644
index 0000000..f56f946
--- /dev/null
+++ b/rl_lib/src/algoritms/model_free/value_based/qr_dqn/qr_dqn.py
@@ -0,0 +1,123 @@
+import tensorflow as tf
+from tensorflow.keras import layers
+
+from rl_lib.src.models.model import Model
+from ..simple_q import SimpleQ
+
+class QR_DQN_Model(Model):
+ def __init__(self, config={}, **kwargs):
+ super().__init__(
+ model_config=config.get('model_config', {}),
+ config=config,
+ **kwargs)
+ self.num_atoms = config['model_config'].get("num_atoms", 200)
+        self.implicit_tau = tf.reshape(
+            [i/self.num_atoms for i in range(1, self.num_atoms+1)],
+            (1, 1, self.num_atoms))
+
+ self.hubber_k = config['model_config'].get("hubber_k", 1.0)
+
+ def _prediction_processing(self, inputs: tf.Tensor, **kwargs):
+ mask = self.make_mask(tf.cast(kwargs['action'], dtype=tf.int32))
+ if len(inputs.shape) != len(mask.shape):
+ mask = tf.expand_dims(mask, -1)
+ return tf.reduce_sum(
+ tf.multiply(inputs, mask),
+ axis=kwargs['batch_dims']
+ )
+
+ def loss(self, target: tf.Tensor, predict: tf.Tensor) -> tf.Tensor:
+ """Вычисляет и возвращает потери в соответствии с функцией потерь"""
+ error = target - predict
+ huber_loss = self.huber_loss_func(error, k=self.hubber_k)
+        # квантильный вес |tau - 1(error < 0)|; деление на k дало бы
+        # потерю IQN, но при k=0 возникло бы деление на ноль
+        quantill_loss = tf.abs(
+            self.implicit_tau - tf.cast(error < 0, dtype=tf.float32)
+        ) * huber_loss
+        quantill_loss = tf.reduce_mean(quantill_loss, axis=-1)  # 1/N' и сумма по N'
+        quantill_loss = tf.reduce_sum(quantill_loss, -1)  # сумма по N
+ return quantill_loss
+
+ def huber_loss_func(self, error, k=1.0):
+        return tf.where(tf.abs(error) <= k,
+                        0.5 * tf.square(error),
+                        k * (tf.abs(error) - 0.5 * k))
+
+ def make_mask(self, action) -> tf.Tensor:
+ """Создает маску по действиям """
+ return tf.one_hot(action, self.output_spec()[-2])
+
+ def _update_next_state(self, state, action):
+ pass
+
+ def initial_state(self):
+ pass
+
+ @staticmethod
+ def create_model(input_shape: tuple, action_space: int,
+ quantile_dim: int = 200) -> tf.keras.Model:
+ """Создает модель tf.keras.Model, архитектура DQN"""
+ input_layer = layers.Input(shape=input_shape, )
+ dence_layer1 = layers.Dense(256, activation='relu')(input_layer)
+ dence_layer2 = layers.Dense(256, activation='relu')(dence_layer1)
+ dence_out = layers.Dense(action_space * quantile_dim,
+ activation=None)(dence_layer2)
+
+ out = layers.Reshape((action_space, quantile_dim))(dence_out)
+ return tf.keras.Model(inputs=input_layer, outputs=out)
+
+ @staticmethod
+ def create_model_with_conv(input_shape: tuple,
+ action_space: int,
+ quantile_dim: int = 200) -> tf.keras.Model:
+ """Создает модель tf.keras.Model, архитектура DQN,
+ начальные слои - сверточные"""
+ input_layer = layers.Input(shape=input_shape, )
+ cov_layer1 = layers.Conv2D(32, 7, activation='relu')(input_layer)
+ cov_layer2 = layers.Conv2D(64, 5, activation='relu')(cov_layer1)
+ cov_layer3 = layers.Conv2D(64, 3, activation='relu')(cov_layer2)
+ conv_out = layers.Flatten()(cov_layer3)
+
+ dence_layer1 = layers.Dense(256, activation='relu')(conv_out)
+ dence_layer2 = layers.Dense(256, activation='relu')(dence_layer1)
+ dence_out = layers.Dense(action_space * quantile_dim,
+ activation=None)(dence_layer2)
+
+ out = layers.Reshape((action_space, quantile_dim))(dence_out)
+ return tf.keras.Model(inputs=input_layer, outputs=out)
+
+
+class QR_DQN(SimpleQ):
+ def __init__(self, config):
+        super().__init__(QR_DQN_Model, QR_DQN_Model, config,
+                         default_config_path=__file__,
+                         algo_name="QR_DQN",
+                         name=("QR_DQN_Model_" +
+                               config.get('model_config', {}).get('name', ''))
+                         )
+ self.BATCH_DIMS = 1
+ self.IND_AXIS = 1
+ self.MEAN_AXIS = 2
+
+ def _prediction_processing(self, input_data):
+ pass
+
+ def _update_next_state(self, state, action):
+ pass
+
+ def initial_state(self):
+ pass
+
+ def get_best_action(self, Z_action: tf.Tensor, Z_target: tf.Tensor):
+ q = tf.reduce_mean(Z_action, axis=self.MEAN_AXIS)
+ ind = tf.expand_dims(tf.argmax(q, axis=self.IND_AXIS),-1)
+ Z_target = tf.gather(Z_target, ind, batch_dims=self.BATCH_DIMS)
+ return Z_target
+
+ def _train_step(self, **batch) -> dict:
+ result = super()._train_step(**batch)
+ result['td_error'] = tf.reduce_mean(
+ tf.reduce_mean(
+ result['td_error'], -1
+ ), -1
+ )
+ return result
+
+ def _get_action(self, observation: tf.Tensor) -> tf.Tensor:
+ return tf.reduce_mean(
+ super()._get_action(observation),
+ axis=-1
+ )
\ No newline at end of file
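Квантильная Huber-потеря из QR_DQN_Model.loss в виде автономного наброска с условными размерами тензоров (batch=1, num_atoms=3); в самом алгоритме target и predict приходят из таргет- и онлайн-сети.
```
import tensorflow as tf

num_atoms = 3
k = 1.0
tau = tf.reshape([i / num_atoms for i in range(1, num_atoms + 1)],
                 (1, 1, num_atoms))                # уровни квантилей
target = tf.constant([[[0.0, 1.0, 2.0]]])          # квантили таргет-сети
predict = tf.constant([[[0.5, 0.5, 0.5]]])         # квантили онлайн-сети
error = target - predict
huber = tf.where(tf.abs(error) <= k,
                 0.5 * tf.square(error),
                 k * (tf.abs(error) - 0.5 * k))
loss = tf.abs(tau - tf.cast(error < 0, tf.float32)) * huber
loss = tf.reduce_sum(tf.reduce_mean(loss, axis=-1), axis=-1)
print(loss.numpy())
```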
diff --git a/rl_lib/src/algoritms/model_free/value_based/simple_q.py b/rl_lib/src/algoritms/model_free/value_based/simple_q.py
new file mode 100644
index 0000000..d9059ea
--- /dev/null
+++ b/rl_lib/src/algoritms/model_free/value_based/simple_q.py
@@ -0,0 +1,218 @@
+from copy import copy
+
+import numpy as np
+import tensorflow as tf
+
+from rl_lib.src.explore_env.exploration_manager import ExplorationManager
+from rl_lib.src.replay_buffers.replay_buffer import ReplayBuffer
+
+from .base_algo import Base_Algo
+
+
+class SimpleQ(Base_Algo, ):
+ """Произовдит все вычисления необходимые для Q-learning
+ """
+
+ def __init__(self, action_model: object,
+ target_model: object,
+ config: dict,
+ **kwargs):
+
+ Base_Algo.__init__(self, action_model, target_model, config, **kwargs)
+ # print(self.config)
+ config = link_data_inside_the_config(self.config)
+ self.buffer = ReplayBuffer(**config.get("buffer_config", {}))
+ self.exploration = ExplorationManager(
+ **config.get("exploration_config", {}))
+
+ self.discount_factor = self.config['model_config']['discount_factor']
+ self.n_step = self.config['model_config']['n_step']
+
+ self.batch_size = self.config['model_config'].get("batch_size")
+ self.double_network = self.config['model_config'].get("double_network")
+ self.priority = self.config['model_config'].get("priority")
+ self.tau = self.config['model_config'].get("tau")
+
+ self.RECURRENT = False
+ self.BATCH_DIMS = 1
+ self.IND_AXIS = -1
+
+ def add(self, data: tuple, priority=None) -> None:
+ """
+ Добавляет переходы в буфер
+ Аргументы:
+ data: tuple(state, action, reward, done, next_state)
+ priority: np.array (только для приоритетных буферов)
+ """
+ self.buffer.add(data, priority)
+
+ def calculate_double_q(self, **kwargs):
+ Qaction = self.action_model(kwargs['next_state'])
+ Qtarget = self.target_model(kwargs['next_state'])
+ Qaction = Qaction[0] if isinstance(Qaction, list) else Qaction
+ Qtarget = Qtarget[0] if isinstance(Qtarget, list) else Qtarget
+ if kwargs["p_double"] < 0.5:
+ Qtarget = self.get_best_action(Qtarget, Qaction)
+ else:
+ Qtarget = self.get_best_action(Qaction, Qtarget)
+ return Qtarget
+
+ def calculate_gradients(self, batch=None):
+        if batch is None:
+            batch = self.get_batch()
+            batch['batch_dims'] = self.BATCH_DIMS
+        batch = self.choice_model_for_double_calculates(**batch)
+ return (self.action_model.calculate_gradients(**batch)
+ if batch['p_double'] > 0.5
+ else self.target_model.calculate_gradients(**batch))
+
+ def calculate_new_best_action(self, **kwargs) -> tf.Tensor:
+ """Вычислеят новое лучшее действие для получения таргета"""
+ if self.double_network:
+ Qtarget = self.calculate_double_q(**kwargs)
+ else:
+ Qtarget = self.target_model(kwargs['next_state'])
+ Qtarget = Qtarget[0] if isinstance(Qtarget, list) else Qtarget
+ Qtarget = self.get_best_action(Qtarget, Qtarget)
+ return Qtarget
+
+ @tf.function(reduce_retracing=True,
+ jit_compile=False,
+ experimental_autograph_options=tf.autograph.experimental.Feature.ALL)
+ def calculate_target(self, **kwargs):
+ Qtarget = self.calculate_new_best_action(**kwargs)
+ dones = tf.ones_like(kwargs['done'], dtype=tf.dtypes.float32)
+ dones = dones - kwargs['done']
+ Qtarget = self._expand_dims_like(kwargs['reward'], Qtarget) + \
+ (self.discount_factor**self.n_step) * Qtarget * \
+ self._expand_dims_like(dones, Qtarget)
+ if self.RECURRENT:
+ Qtarget = Qtarget[:, kwargs.get('recurrent_skip', 10):]
+ return Qtarget
+
+ def check_fullness_buffer(self):
+ """Проверяет наполненость буфера,
+ возвращает true если в буфере элементов больше батча
+ """
+ if self.buffer.real_size > self.batch_size:
+ return True
+ else:
+ return False
+
+ def choice_model_for_double_calculates(self, **batch):
+ batch['p_double'] = tf.random.uniform(
+ (1,), minval=0.0, maxval=1.0) if self.double_network else 1.
+ batch['Qtarget'] = self.calculate_target(**batch)
+ return batch
+
+ def _get_action(self, observation: tf.Tensor) -> tf.Tensor:
+ """Возвращает ценность дейтсвий Q(s,a) всех действий
+ на основе наблюдения
+ """
+ return self.sample_action(
+ self.action_model.check_input_shape(
+ copy(observation)
+ )
+ )
+
+ def get_action(self, observation: tf.Tensor) -> float:
+ """Возвращает действие на основе наблюдения с учетом исследования"""
+ action = self.exploration(self._get_action(observation))
+ if isinstance(action, int):
+ return int(action)
+ else:
+ return action.numpy()
+
+ def get_test_action(self, observation: tf.Tensor) -> float:
+ """Возвращает действие на основе наблюдения без исследования"""
+ action = self.exploration.test(self._get_action(observation))
+ if isinstance(action, int):
+ return int(action)
+ else:
+ return action.numpy()
+
+ def get_batch(self):
+ """Получает батч из буфера"""
+ return self.buffer.sample(self.batch_size)
+
+ def get_batch_and_td_error(self):
+ batch = self.get_batch()
+ batch['batch_dims'] = self.BATCH_DIMS
+ batch['p_double'] = 1.
+ td_error = self.calculate_gradients(batch)['td_error']
+ return {'td_error': td_error.numpy(), 'batch': batch}
+
+ def get_best_action(self, Qaction: tf.Tensor, Qtarget: tf.Tensor):
+ ind = tf.argmax(Qaction, axis=self.IND_AXIS)
+ Qtarget = tf.gather(Qtarget, ind, batch_dims=self.BATCH_DIMS)
+ return Qtarget
+
+ def get_gradients(self) -> tf.Tensor:
+ """Вычисляет градиенты и возвращает их"""
+ batch = self.get_batch()
+ batch['batch_dims'] = self.BATCH_DIMS
+ batch['p_double'] = 1.
+ return self.calculate_gradients(batch)['gradients']
+
+ def load(self, ) -> None:
+ """Загружает алгоритм"""
+ self.action_model.load(self.path)
+ self.target_model.load(self.path)
+ self.buffer.load(self.path)
+ self.exploration.load(self.path)
+
+ def reset(self) -> None:
+ """Сбрасывает внутренние данные модели"""
+ self.buffer.reset()
+ self.exploration.reset()
+ self.initial_model()
+
+ def _train_step(self, **batch) -> dict:
+ """Вспомогательная train_step"""
+ batch = self.choice_model_for_double_calculates(**batch)
+ batch['batch_dims'] = self.BATCH_DIMS
+ return (self.action_model.update_weights(**batch)
+ if batch['p_double'] > 0.5
+ else self.target_model.update_weights(**batch))
+
+ def train_step(self) -> np.array:
+ """Вычисляет полный обучающий шаг"""
+ if not self.check_fullness_buffer():
+ return 0
+ batch = self.get_batch()
+ result = self._train_step(**batch)
+ td_error = result['td_error'].numpy()
+ loss = result['loss'].numpy()
+ assert not np.all(np.isnan(td_error)), "td_error не может быть nan"
+ if self.priority:
+ self.buffer.update_priorities(
+ batch['data_idxs'], loss
+ if not self.RECURRENT
+ else loss[:, -1])
+ if self.tau != 1:
+ _ = self.copy_weights()
+ return np.mean(td_error)
+
+ def save(self) -> None:
+ """Сохраняет алгоритм"""
+ self.action_model.save(self.path)
+ self.target_model.save(self.path)
+ self.buffer.save(self.path)
+ self.exploration.save(self.path)
+
+ def summary(self) -> None:
+ """Выводит архитектуру модели"""
+ self.action_model.summary
+
+
+def link_data_inside_the_config(config):
+ # print(config)
+ discount_factor = config['model_config']['discount_factor']
+ n_step = config['model_config']['n_step']
+ action_space = config['model_config']['action_space']
+ priority = config['model_config']['priority']
+
+ config['buffer_config']['priority'] = priority
+ config['buffer_config']['discount_factor'] = discount_factor
+ config['buffer_config']['n_step'] = n_step
+ config['exploration_config']['strategy_config']['action_space'] = action_space
+ return config
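Таргет в SimpleQ.calculate_target считается по стандартной формуле n-шагового Q-обучения; ниже упрощенный набросок без expand_dims и рекуррентной обрезки.
```
import numpy as np

# Q_target = r + gamma^n * (1 - done) * max_a' Q(s', a')
gamma, n_step = 0.99, 1
reward = np.array([1.0, 0.5])
done = np.array([0.0, 1.0])
q_next_best = np.array([2.0, 3.0])   # лучшее действие уже выбрано (get_best_action)

q_target = reward + (gamma ** n_step) * q_next_best * (1.0 - done)
print(q_target)   # [2.98 0.5 ]
```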
diff --git a/rl_lib/src/algoritms/tests/config.yaml b/rl_lib/src/algoritms/model_free/value_based/tests/config.yaml
similarity index 100%
rename from rl_lib/src/algoritms/tests/config.yaml
rename to rl_lib/src/algoritms/model_free/value_based/tests/config.yaml
diff --git a/rl_lib/src/algoritms/model_free/value_based/tests/test_simpe_q.py b/rl_lib/src/algoritms/model_free/value_based/tests/test_simpe_q.py
new file mode 100644
index 0000000..51837c9
--- /dev/null
+++ b/rl_lib/src/algoritms/model_free/value_based/tests/test_simpe_q.py
@@ -0,0 +1,83 @@
+import os
+
+from rl_lib.src.models.model import Model
+
+from ..simple_q import SimpleQ
+
+
+class Simple_Model(Model):
+ def __init__(self, config={}, **kwargs):
+ super().__init__(model_config=config.get('model_config', {}),
+ config=config, default_config_path=__file__,
+ **kwargs)
+
+ def _prediction_processing(self, input_data):
+ pass
+
+ def _update_next_state(self, state, action):
+ pass
+
+ def initial_state(self):
+ pass
+
+ @staticmethod
+ def create_model(input_shape: tuple, action_space: int):
+ pass
+
+ @staticmethod
+ def create_model_with_conv(input_shape: tuple, action_space: int):
+ pass
+
+ def set_new_model(self, *args):
+ pass
+
+
+class Test_Simple_Q():
+ def __init__(self, config):
+ self.simple_q = SimpleQ(Simple_Model, Simple_Model, config,
+ default_config_path=__file__,
+ algo_name="SimpleQ", name="Simple_Model")
+
+ def test_save(self):
+ self.simple_q.save()
+ real_structure = get_directory_structure(self.simple_q.path)
+        assert self.simple_q.path != \
+            self.simple_q.config['data_saver']['path'], \
+            "Путь сохранения не должен совпадать с путем из конфигурации"
+ correct_structure = {self.simple_q.name:
+ {
+ self.simple_q.exploration.name + ".data": None,
+ self.simple_q.buffer.name + ".data": None,
+ self.simple_q.action_model.name + ".keras": None,
+ self.simple_q.target_model.name + ".keras": None,
+ }
+ }
+ assert compare_directory_structures(
+ real_structure, correct_structure), "Каталоги разные"
+
+
+def compare_directory_structures(dir_structure1: dict,
+ dir_structure2: dict) -> bool:
+ """Проверяет одинаковые ли структуры каталогов"""
+ if dir_structure1.keys() != dir_structure2.keys():
+ return False
+
+ for key in dir_structure1.keys():
+ if isinstance(dir_structure1[key], dict) and isinstance(dir_structure2[key], dict):
+ if not compare_directory_structures(dir_structure1[key],
+ dir_structure2[key]):
+ return False
+ elif dir_structure1[key] != dir_structure2[key]:
+ return False
+
+ return True
+
+
+def get_directory_structure(path: str) -> dict:
+ """Получает всю структуру переданного каталога"""
+ structure = {}
+ for dirpath, dirnames, filenames in os.walk(path):
+ current_level = structure
+ for dirname in dirpath.split(os.sep):
+ current_level = current_level.setdefault(dirname, {})
+ for filename in filenames:
+ current_level[filename] = None
+ return structure
diff --git a/rl_lib/src/algoritms/model_free/value_based/utils.py b/rl_lib/src/algoritms/model_free/value_based/utils.py
new file mode 100644
index 0000000..385c493
--- /dev/null
+++ b/rl_lib/src/algoritms/model_free/value_based/utils.py
@@ -0,0 +1,13 @@
+def update_config(config: dict, new_data: dict) -> None:
+ """Обвновляет конфигурацию по умолчанию
+ Args:
+ config: dict: Конфигурация, которую надо обновить
+ new_data: dict: Конфигурация с новыми данными
+ Returns:
+ None
+ """
+ for key, value in new_data.items():
+ if isinstance(value, dict) and key in config and isinstance(config[key], dict):
+ update_config(config[key], value)
+ else:
+ config[key] = value
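Набросок использования update_config: словарь обновляется рекурсивно и на месте (предполагается, что пакет rl_lib установлен и модуль доступен по указанному пути).
```
from rl_lib.src.algoritms.model_free.value_based.utils import update_config

config = {"model_config": {"batch_size": 32, "tau": 1.0},
          "data_saver": {"path": ""}}
new_data = {"model_config": {"batch_size": 16}}

update_config(config, new_data)   # объединение выполняется на месте
print(config["model_config"])     # {'batch_size': 16, 'tau': 1.0}
```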
diff --git a/rl_lib/src/algoritms/simple_q.py b/rl_lib/src/algoritms/simple_q.py
deleted file mode 100644
index 5a0e331..0000000
--- a/rl_lib/src/algoritms/simple_q.py
+++ /dev/null
@@ -1,181 +0,0 @@
-import tensorflow as tf
-import numpy as np
-
-from .base_algo import Base_Algo
-from rl_lib.src.replay_buffers.replay_buffer import ReplayBuffer
-from rl_lib.src.explore_env.exploration_manager import ExplorationManager
-
-class SimpleQ(Base_Algo, ):
- """Произовдит все вычисления необходимые для Q-learning
- """
- def __init__(self, action_model: object, target_model: object, config: dict, **kwargs):
-
- Base_Algo.__init__(self, action_model, target_model, config, **kwargs)
- # print(self.config)
- config = link_data_inside_the_config(self.config)
- self.buffer = ReplayBuffer(**config.get("buffer_config", {}))
- self.exploration = ExplorationManager(**config.get("exploration_config", {}))
-
- self.discount_factor = self.config['model_config']['discount_factor']
- self.n_step = self.config['model_config']['n_step']
-
- self.batch_size = self.config['model_config'].get("batch_size")
- self.double_network = self.config['model_config'].get("double_network")
- self.priority = self.config['model_config'].get("priority")
- self.tau = self.config['model_config'].get("tau")
-
- self.recurrent = False
- self.batch_dims = 1
- self.ind_axis = -1
-
- def add(self, data: tuple, priority = None) -> None:
- """
- Добавляет переходы в буфер
- Аргументы:
- data: tuple(state, action, reward, done, next_state)
- priority: np.array (только для приоритетных буферов)
- """
- self.buffer.add(data, priority)
-
- def calculate_double_q(self, **kwargs):
- Qaction = self.action_model(kwargs['next_state'])
- Qtarget = self.target_model(kwargs['next_state'])
- Qaction = Qaction[0] if isinstance(Qaction, list) else Qaction
- Qtarget = Qtarget[0] if isinstance(Qtarget, list) else Qtarget
- if kwargs["p_double"] < 0.5 : Qtarget = self.get_best_action(Qtarget, Qaction)
- else: Qtarget = self.get_best_action(Qaction, Qtarget)
- return Qtarget
-
- def calculate_gradients(self, batch = None):
- if batch == None: batch = self.choice_model_for_double_calculates(**batch)
- batch = self.choice_model_for_double_calculates(**batch)
- return self.action_model.calculate_gradients(**batch) if batch['p_double'] > 0.5 else self.target_model.calculate_gradients(**batch)
-
- def calculate_new_best_action(self, **kwargs) -> tf.Tensor:
- """Вычислеят новое лучшее действие для получения таргета"""
- if self.double_network:
- Qtarget = self.calculate_double_q(**kwargs)
- else:
- Qtarget = self.target_model(kwargs['next_state'])
- Qtarget = Qtarget[0] if isinstance(Qtarget, list) else Qtarget
- Qtarget = self.get_best_action(Qtarget, Qtarget)
- return Qtarget
-
- @tf.function(reduce_retracing=True,
- jit_compile=False,
- experimental_autograph_options = tf.autograph.experimental.Feature.ALL)
- def calculate_target(self, **kwargs):
- Qtarget = self.calculate_new_best_action(**kwargs)
- dones = tf.ones(kwargs['done'].shape, dtype=tf.dtypes.float32)
- dones = dones - kwargs['done']
- Qtarget = kwargs['reward'] + (self.discount_factor**self.n_step) * Qtarget * dones
- if self.recurrent:
- Qtarget = Qtarget[:, kwargs.get('recurrent_skip', 10):]
- return Qtarget
-
- def check_fullness_buffer(self):
- """Проверяет наполненость буфера, возвращает true если в буфере элементов больше батча"""
- if self.buffer.real_size > self.batch_size: return True
- else: return False
-
- def choice_model_for_double_calculates(self, **batch):
- batch['p_double'] = tf.random.uniform((1,), minval = 0.0, maxval = 1.0) if self.double_network else 1.
- batch['Qtarget'] = self.calculate_target(**batch)
- return batch
-
- def _get_action(self, observation: tf.Tensor) -> tf.Tensor:
- """Возвращает ценность дейтсвий Q(s,a) всех действий на основе наблюдения"""
- return self.sample_action(self.action_model.check_input_shape(observation))
-
- def get_action(self, observation: tf.Tensor) -> float:
- """Возвращает действие на основе наблюдения с учетом исследования"""
- action = self.exploration(self._get_action(observation))
- if isinstance(action, int): return int(action)
- else: return action.numpy()
-
- def get_test_action(self, observation: tf.Tensor) -> float:
- """Возвращает действие на основе наблюдения без исследования"""
- action = self.exploration.test(self._get_action(observation))
- if isinstance(action, int): return int(action)
- else: return action.numpy()
-
- def get_batch(self):
- """Получает батч из буфера"""
- return self.buffer.sample(self.batch_size)
-
- def get_batch_and_td_error(self):
- batch = self.get_batch()
- batch['batch_dims'] = self.batch_dims
- batch['p_double'] = 1.
- td_error = self.calculate_gradients(batch)['td_error']
- return {'td_error': td_error.numpy(), 'batch': batch}
-
- def get_best_action(self, Qaction, Qtarget):
- ind = tf.argmax(Qaction, axis=self.ind_axis)
- Qtarget = tf.gather(Qtarget, ind, batch_dims=self.batch_dims)
- return Qtarget
-
- def get_gradients(self) -> tf.Tensor:
- """Вычисляет градиенты и возвращает их"""
- batch = self.get_batch()
- batch['batch_dims'] = self.batch_dims
- batch['p_double'] = 1.
- return self.calculate_gradients(batch)['gradients']
-
- def load(self, ) -> None:
- """Загружает алгоритм"""
- self.action_model.load(self.path)
- self.target_model.load(self.path)
- self.buffer.load(self.path)
- self.exploration.load(self.path)
-
- def reset(self) -> None:
- """Сбрасывает внутренние данные модели"""
- self.buffer.reset()
- self.exploration.reset()
- self.initial_model()
-
- def _train_step(self, **batch) -> dict:
- """Вспомогательная train_step"""
- batch = self.choice_model_for_double_calculates(**batch)
- batch['batch_dims'] = self.batch_dims
- return self.action_model.update_weights(**batch) if batch['p_double'] > 0.5 else self.target_model.update_weights(**batch)
-
- def train_step(self) -> np.array:
- """Вычисляет полный обучающий шаг"""
- if not self.check_fullness_buffer(): return 0
- batch = self.get_batch()
- result = self._train_step(**batch)
- td_error = result['td_error'].numpy()
- loss = result['loss'].numpy()
- assert not np.all(np.isnan(td_error)), "td_error не может быть nan"
- if self.priority: self.buffer.update_priorities(batch['data_idxs'], loss if not self.recurrent else loss[:, -1])
- if self.tau != 1:
- _ = self.copy_weights()
- return np.mean(td_error)
-
- def save(self) -> None:
- """Сохраняет алгоритм"""
- self.action_model.save(self.path)
- self.target_model.save(self.path)
- self.buffer.save(self.path)
- self.exploration.save(self.path)
-
- def summary(self) -> None:
- """Выводит архитектуру модели"""
- self.action_model.summary
-
-
-
-def link_data_inside_the_config(config):
- # print(config)
- discount_factor = config['model_config']['discount_factor']
- n_step = config['model_config']['n_step']
- action_space = config['model_config']['action_space']
- priority = config['model_config']['priority']
-
- config['buffer_config']['priority'] = priority
- config['buffer_config']['discount_factor'] = discount_factor
- config['buffer_config']['n_step'] = n_step
- config['exploration_config']['strategy_config']['action_space'] = action_space
- return config
diff --git a/rl_lib/src/algoritms/tests/test_simpe_q.py b/rl_lib/src/algoritms/tests/test_simpe_q.py
deleted file mode 100644
index 73fd998..0000000
--- a/rl_lib/src/algoritms/tests/test_simpe_q.py
+++ /dev/null
@@ -1,72 +0,0 @@
-from ..simple_q import SimpleQ
-from rl_lib.src.models.model import Model
-
-import os
-
-class Simple_Model(Model):
- def __init__(self, config = {},**kwargs):
- super().__init__(model_config = config.get('model_config', {}), config = config, default_config_path=__file__, **kwargs)
-
- def _prediction_processing(self, input_data):
- pass
-
- def _update_next_state(self, state, action):
- pass
-
- def initial_state(self):
- pass
-
- @staticmethod
- def create_model(input_shape: tuple, action_space: int):
- pass
-
- @staticmethod
- def create_model_with_conv(input_shape: tuple, action_space: int):
- pass
-
-
- def set_new_model(self, *args):
- pass
-
-class Test_Simple_Q():
- def __init__(self, config):
- self.simple_q = SimpleQ(Simple_Model, Simple_Model, config, default_config_path=__file__, algo_name = "SimpleQ", name = "Simple_Model")
-
- def test_save(self):
- self.simple_q.save()
- real_structure = get_directory_structure(self.simple_q.path)
- assert self.simple_q.path != self.simple_q.config['data_saver']['path'], "Пути не совпадают"
- correct_structure = {self.simple_q.name:
- {
- self.simple_q.exploration.name + ".data": None,
- self.simple_q.buffer.name + ".data": None,
- self.simple_q.action_model.name + ".h5": None,
- self.simple_q.target_model.name + ".h5": None,
- }
- }
- assert compare_directory_structures(real_structure, correct_structure), "Каталоги разные"
-
-def compare_directory_structures(dir_structure1: dict, dir_structure2: dict) -> bool:
- """Проверяет одинаковые ли структуры каталогов"""
- if dir_structure1.keys() != dir_structure2.keys():
- return False
-
- for key in dir_structure1.keys():
- if isinstance(dir_structure1[key], dict) and isinstance(dir_structure2[key], dict):
- if not compare_directory_structures(dir_structure1[key], dir_structure2[key]):
- return False
- elif dir_structure1[key] != dir_structure2[key]:
- return False
-
- return True
-
-def get_directory_structure(path: str) -> dict:
- """Получает всю структуру переданного каталога"""
- structure = {}
- for dirpath, dirnames, filenames in os.walk(path):
- current_level = structure
- for dirname in dirpath.split(os.sep):
- current_level = current_level.setdefault(dirname, {})
- for filename in filenames:
- current_level[filename] = None
- return structure
diff --git a/rl_lib/src/algoritms/utils.py b/rl_lib/src/algoritms/utils.py
deleted file mode 100644
index 033bd3c..0000000
--- a/rl_lib/src/algoritms/utils.py
+++ /dev/null
@@ -1,13 +0,0 @@
-def update_config(config: dict, new_data: dict) -> None:
- """Обвновляет конфигурацию по умолчанию
- Args:
- config: dict: Конфигурация, которую надо обновить
- new_data: dict: Конфигурация с новыми данными
- Returns:
- None
- """
- for key, value in new_data.items():
- if isinstance(value, dict) and key in config and isinstance(config[key], dict):
- update_config(config[key], value)
- else:
- config[key] = value
diff --git a/rl_lib/src/data_saver/saver.py b/rl_lib/src/data_saver/saver.py
index 1bce4b6..89dd553 100644
--- a/rl_lib/src/data_saver/saver.py
+++ b/rl_lib/src/data_saver/saver.py
@@ -1,62 +1,75 @@
import os
from shutil import copy, make_archive
+
class Saver:
- """Хранит в себе пути сохранения этапа обучения и путь резервного копирования.
- При инициализации создает папки для сохранения и резервного копирования.
- Args:
- name: str. Необязательно, название алгоритма
- path: str. Путь сохранения
- copy_path: str. Путь резервного копирования
- """
- def __init__(self, algo_name="None", copy_path="", name="", path="", **kwargs):
- self.algo_name = algo_name
- self.copy_path = copy_path
- self.name = name
- self.original_path = os.getcwd()
- self.path = path
-
- self.validate_path()
-
- self.init_copy_dir()
- self.init_save_dir()
-
- @property
- def get_save_path(self):
- return self.path
-
- @property
- def get_copy_path(self):
- if self.copy_path != "": return self.copy_path
- else: return "Path is not defined"
-
- def init_copy_dir(self):
- if self.copy_path != "":
- self.copy_path = self.copy_path + self.algo_name + "/"
- if not os.path.isdir(self.copy_path):
- os.makedirs(self.copy_path)
-
- def init_save_dir(self):
- """Создает путь сохранения и директорию сохранения"""
- if self.path == "": self.path = self.original_path + "/models/" + self.algo_name + "/" + self.name + "/"
- else: self.path = self.path + self.name + "/"
- if not os.path.isdir(self.path):
- os.makedirs(self.path)
-
- def make_copy(self):
- """Резервное копирование архива директории"""
- copy(self.path +'/' + self.name+'.zip', self.copy_path)
-
- def make_archive(self):
- """Создает архив директории"""
- make_archive(base_name=self.name, format='zip', root_dir=self.path)
-
- def validate_path(self):
- assert isinstance(self.algo_name, str), "Неверный тип аргумента, должно быть str"
- assert isinstance(self.copy_path, str), "Неверный тип аргумента, должно быть str"
- assert isinstance(self.name, str), "Неверный тип аргумента, должно быть str"
- assert isinstance(self.path, str), "Неверный тип аргумента, должно быть str"
- if len(self.path) > 0: assert self.path[-1] == "/", "В конце пути должен быть /"
- if len(self.copy_path) > 0: assert self.copy_path[-1] == "/", "В конце пути должен быть /"
-
-
+ """Хранит в себе пути сохранения этапа обучения
+ и путь резервного копирования.
+ При инициализации создает папки для сохранения и резервного копирования.
+ Args:
+ name: str. Необязательно, название алгоритма
+ path: str. Путь сохранения
+ copy_path: str. Путь резервного копирования
+ """
+
+ def __init__(self, algo_name="None", copy_path="", name="", path="",
+ **kwargs):
+ self.algo_name = algo_name
+ self.copy_path = copy_path
+ self.name = name
+ self.original_path = os.getcwd()
+ self.path = path
+
+ self.validate_path()
+
+ self.init_copy_dir()
+ self.init_save_dir()
+
+ @property
+ def get_save_path(self):
+ return self.path
+
+ @property
+ def get_copy_path(self):
+ if self.copy_path != "":
+ return self.copy_path
+ else:
+ return "Path is not defined"
+
+ def init_copy_dir(self):
+ if self.copy_path != "":
+ self.copy_path = self.copy_path + self.algo_name + "/"
+ if not os.path.isdir(self.copy_path):
+ os.makedirs(self.copy_path)
+
+ def init_save_dir(self):
+ """Создает путь сохранения и директорию сохранения"""
+ if self.path == "":
+ self.path = self.original_path + "/models/" + \
+ self.algo_name + "/" + self.name + "/"
+ else:
+ self.path = self.path + self.name + "/"
+ if not os.path.isdir(self.path):
+ os.makedirs(self.path)
+
+ def make_copy(self):
+ """Резервное копирование архива директории"""
+ copy(self.path + '/' + self.name+'.zip', self.copy_path)
+
+ def make_archive(self):
+ """Создает архив директории"""
+ make_archive(base_name=self.name, format='zip', root_dir=self.path)
+
+ def validate_path(self):
+ assert isinstance(
+ self.algo_name, str), "Неверный тип аргумента, должно быть str"
+ assert isinstance(
+ self.copy_path, str), "Неверный тип аргумента, должно быть str"
+ assert isinstance(
+ self.name, str), "Неверный тип аргумента, должно быть str"
+ assert isinstance(
+ self.path, str), "Неверный тип аргумента, должно быть str"
+ if len(self.path) > 0:
+ assert self.path[-1] == "/", "В конце пути должен быть /"
+ if len(self.copy_path) > 0:
+ assert self.copy_path[-1] == "/", "В конце пути должен быть /"
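Набросок использования Saver (путь импорта соответствует расположению файла в репозитории; каталоги создаются на диске при инициализации, пути обязаны оканчиваться на "/").
```
from rl_lib.src.data_saver.saver import Saver

saver = Saver(algo_name="DQN", name="my_run", path="./models/")
print(saver.get_save_path)   # ./models/my_run/
saver.make_archive()         # соберет my_run.zip из каталога сохранения
```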
diff --git a/rl_lib/src/data_saver/tests/test_saver.py b/rl_lib/src/data_saver/tests/test_saver.py
index 05d9f03..1ed435c 100644
--- a/rl_lib/src/data_saver/tests/test_saver.py
+++ b/rl_lib/src/data_saver/tests/test_saver.py
@@ -1,19 +1,23 @@
+from rl_lib.src.algoritms.model_free.value_based.tests.test_simpe_q import (
+ compare_directory_structures, get_directory_structure)
+
from ..saver import Saver
-from rl_lib.rl_lib.src.algoritms.tests.test_simpe_q import get_directory_structure, compare_directory_structures
-class Test_Saver:
- def __init__(self, **kwargs):
- self.saver = Saver(**kwargs)
- def test_init(self, path, copy_path):
- self.check_structure(self.saver.path, path)
- self.check_structure(self.saver.copy_path, copy_path)
- print("Тест пройден успешно")
+class Test_Saver:
+ def __init__(self, **kwargs):
+ self.saver = Saver(**kwargs)
+ def test_init(self, path, copy_path):
+ self.check_structure(self.saver.path, path)
+ self.check_structure(self.saver.copy_path, copy_path)
+ print("Тест пройден успешно")
- def check_structure(self, real_path, corrrect_path):
- real_structure = get_directory_structure(real_path)
- assert real_path == corrrect_path, "Пути не совпадают"
- if corrrect_path != "":
- correct_structure = {corrrect_path.replace("/", ""): {"":{}, self.saver.name: {}}}
- assert compare_directory_structures(real_structure, correct_structure), "Каталоги разные"
+ def check_structure(self, real_path, corrrect_path):
+ real_structure = get_directory_structure(real_path)
+ assert real_path == corrrect_path, "Пути не совпадают"
+ if corrrect_path != "":
+ correct_structure = {corrrect_path.replace(
+ "/", ""): {"": {}, self.saver.name: {}}}
+ assert compare_directory_structures(
+ real_structure, correct_structure), "Каталоги разные"
diff --git a/rl_lib/src/data_saver/utils.py b/rl_lib/src/data_saver/utils.py
index b1ef00f..7ce4351 100644
--- a/rl_lib/src/data_saver/utils.py
+++ b/rl_lib/src/data_saver/utils.py
@@ -1,15 +1,29 @@
+import os.path as os_path
from pickle import dump, load
+
from yaml import safe_load
-import os.path as os_path
+
def save_data(path, data):
- with open(path+'.data', 'wb') as f:
- dump(data, f)
+ with open(path+'.data', 'wb') as f:
+ dump(data, f)
+
def load_data(path):
- with open(path+'.data', 'rb') as f:
- loaded_data = load(f)
- return loaded_data
+ with open(path+'.data', 'rb') as f:
+ loaded_data = load(f)
+ return loaded_data
+
def load_default_config(path):
- return safe_load(open(os_path.join(os_path.dirname(path),"./config.yaml"), "rb"))
+ file_name = "./config.yaml"
+    if path.split('/')[-1].split('.')[-1] == 'yaml':
+ file_name = path.split('/')[-1]
+ return safe_load(
+ open(
+ os_path.join(
+ os_path.dirname(path),
+ file_name
+ ), "rb"
+ )
+ )
\ No newline at end of file
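Набросок round-trip сохранения через save_data/load_data; к переданному пути автоматически добавляется расширение .data.
```
from rl_lib.src.data_saver.utils import save_data, load_data

save_data("checkpoint_state", {"step": 10, "eps": 0.3})   # создаст checkpoint_state.data
print(load_data("checkpoint_state"))                      # {'step': 10, 'eps': 0.3}
```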
diff --git a/rl_lib/src/explore_env/base_explore.py b/rl_lib/src/explore_env/base_explore.py
index c741d9d..d1fb054 100644
--- a/rl_lib/src/explore_env/base_explore.py
+++ b/rl_lib/src/explore_env/base_explore.py
@@ -1,34 +1,36 @@
import abc
+
class Base_Explore(abc.ABC):
- """Абстрактный класс представляющий общий интерфейс для всех классов исследования
- нейронной сетью среды обучения
-
- """
- def __init__():
- pass
-
- @property
- @abc.abstractmethod
- def name(self):
- """Возвращает имя стратегии"""
-
- @abc.abstractmethod
- def reset(self, ) -> None:
- """Выполняет внутренний сброс"""
-
- @abc.abstractmethod
- def save(self, path) -> None:
- """Сохраняет какие либо внутренние переменные"""
-
- @abc.abstractmethod
- def load(self, path) -> None:
- """Загружает какие либо внутренние переменные"""
-
- @abc.abstractmethod
- def __call__(self, action) -> int:
- """Возвращает действие в соответствии с стратегией исследования"""
-
- @abc.abstractmethod
- def test(self, action) -> int:
- """Возвращает действие в соответствии с стратегией тестирования"""
+ """Абстрактный класс представляющий
+ общий интерфейс для всех классов исследования
+ нейронной сетью среды обучения
+
+ """
+ def __init__():
+ pass
+
+ @property
+ @abc.abstractmethod
+ def name(self):
+ """Возвращает имя стратегии"""
+
+ @abc.abstractmethod
+ def reset(self, ) -> None:
+ """Выполняет внутренний сброс"""
+
+ @abc.abstractmethod
+ def save(self, path) -> None:
+ """Сохраняет какие либо внутренние переменные"""
+
+ @abc.abstractmethod
+ def load(self, path) -> None:
+ """Загружает какие либо внутренние переменные"""
+
+ @abc.abstractmethod
+ def __call__(self, action) -> int:
+ """Возвращает действие в соответствии с стратегией исследования"""
+
+ @abc.abstractmethod
+ def test(self, action) -> int:
+ """Возвращает действие в соответствии с стратегией тестирования"""
diff --git a/rl_lib/src/explore_env/epsilon_greedy.py b/rl_lib/src/explore_env/epsilon_greedy.py
index 269a7ff..cc843e2 100644
--- a/rl_lib/src/explore_env/epsilon_greedy.py
+++ b/rl_lib/src/explore_env/epsilon_greedy.py
@@ -1,56 +1,64 @@
-import numpy as np
-from tensorflow.math import argmax
+import numpy as np
from tensorflow.dtypes import int32
+from tensorflow.math import argmax
-from ..data_saver.utils import save_data, load_data
+from ..data_saver.utils import load_data, save_data
from .base_explore import Base_Explore
+
class Epsilon_Greedy(Base_Explore):
"""Эпсилон-жадная стратегия исследования
Kwargs:
- eps_decay_steps: int, Количество внутренних шагов исследований до установки минимального эпсилон
+ eps_decay_steps: int, Количество внутренних шагов исследований
+ до установки минимального эпсилон
eps_max: float, Максимальный эпсилон
eps_min: float, Минимальный эпсилон
eps_test: float, Тестовый эпсилон
action_spase: int, Размер пространтства действий
axis: int, Ось вычислений
"""
- def __init__(self, eps_decay_steps=1e6, eps_max=1.0, eps_min=1e-1, eps_test=1e-3, action_space=None, axis=-1, **kwargs):
+
+ def __init__(self, eps_decay_steps=1e6,
+ eps_max=1.0, eps_min=1e-1,
+ eps_test=1e-3, action_space=None,
+ axis=-1, **kwargs):
self.eps_desay_steps = eps_decay_steps
self.eps_min = eps_min
self.eps_max = eps_max
self.eps_test = eps_test
- assert type(action_space) == int, "Пространство действий должно быть int"
- self.action_space = action_space
+ assert type(
+ action_space) is int, "Пространство действий должно быть int"
+ self.action_space = action_space
self.axis = axis
self._name = "epsilon_greedy_strategy"
self.reset()
def __call__(self, Q):
- self.eps = max(self.eps_min, self.eps_max - (self.eps_max-self.eps_min) * self.count/self.eps_desay_steps)
+ self.eps = max(self.eps_min, self.eps_max - (self.eps_max -
+ self.eps_min) * self.count/self.eps_desay_steps)
self.count += 1
return self.get_action(self.eps, Q)
def get_action(self, eps, Q):
- if np.random.random() < eps: return np.random.randint(self.action_space)
- else: return argmax(Q, axis=self.axis, output_type=int32)
+ if np.random.random() < eps:
+ return np.random.randint(self.action_space)
+ else:
+ return argmax(Q, axis=self.axis, output_type=int32)
def load(self, path):
self.__dict__ = load_data(path+self.name)
-
+
@property
def name(self):
return self._name
-
+
def reset(self, ):
self.count = 0
self.eps = self.eps_max
-
+
def save(self, path):
save_data(path+self.name, self.__dict__)
def test(self, Q):
return self.get_action(self.eps_test, Q)
-
-
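Линейное затухание эпсилон из Epsilon_Greedy.__call__ в виде автономного наброска.
```
eps_max, eps_min, eps_decay_steps = 1.0, 0.1, 100_000

def eps_at(step):
    # линейное снижение от eps_max до eps_min за eps_decay_steps шагов
    return max(eps_min, eps_max - (eps_max - eps_min) * step / eps_decay_steps)

print(eps_at(0), eps_at(50_000), eps_at(200_000))   # 1.0 0.55 0.1
```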
diff --git a/rl_lib/src/explore_env/exploration_manager.py b/rl_lib/src/explore_env/exploration_manager.py
index ffa5a17..548bda0 100644
--- a/rl_lib/src/explore_env/exploration_manager.py
+++ b/rl_lib/src/explore_env/exploration_manager.py
@@ -1,50 +1,55 @@
-from .epsilon_greedy import Epsilon_Greedy
-from .soft_q import Soft_Q
from .base_explore import Base_Explore
+from .epsilon_greedy import Epsilon_Greedy
from .ou_noise import OU_Noise
+from .soft_q import Soft_Q
+
class ExplorationManager(Base_Explore):
- """Выбирает стратегию исследования и выполняет все ее функции
- Kwargs:
- strategy_name: str, Название стратегии
- strategy_config: dict, Параметры стратегии
- """
- def __init__(self, strategy_name="epsilon_greedy", strategy_config = {}, **kwargs):
- self._config = {"strategy_name": strategy_name, "strategy_config": strategy_config}
-
- if strategy_name.lower() == "epsilon_greedy":
- self.strategy = Epsilon_Greedy(**strategy_config)
-
- elif strategy_name.lower() == "soft_q":
- self.strategy = Soft_Q(**strategy_config)
-
- elif strategy_name.lower() == "ou_noise":
- self.strategy = OU_Noise(**strategy_config)
-
- else:
- assert 0, "Неизвестная стратегия"
-
- self.strategy_name = self.strategy.name
-
- def __call__(self, Q):
- return self.strategy(Q)
-
- @property
- def config(self):
- return self.config
-
- @property
- def name(self):
- return self.strategy.name
-
- def load(self, path):
- self.strategy.load(path)
-
- def reset(self, ):
- self.strategy.reset()
-
- def save(self, path):
- self.strategy.save(path)
-
- def test(self, Q):
- return self.strategy.test(Q)
+ """Выбирает стратегию исследования и выполняет все ее функции
+ Kwargs:
+ strategy_name: str, Название стратегии
+ strategy_config: dict, Параметры стратегии
+ """
+
+ def __init__(self, strategy_name="epsilon_greedy",
+ strategy_config={},
+ *args, **kwargs):
+ self._config = {"strategy_name": strategy_name,
+ "strategy_config": strategy_config}
+
+ if strategy_name.lower() == "epsilon_greedy":
+ self.strategy = Epsilon_Greedy(**strategy_config)
+
+ elif strategy_name.lower() == "soft_q":
+ self.strategy = Soft_Q(**strategy_config)
+
+ elif strategy_name.lower() == "ou_noise":
+ self.strategy = OU_Noise(**strategy_config)
+
+ else:
+ assert 0, "Неизвестная стратегия"
+
+ self.strategy_name = self.strategy.name
+
+ def __call__(self, Q):
+ return self.strategy(Q)
+
+ @property
+ def config(self):
+        return self._config
+
+ @property
+ def name(self):
+ return self.strategy.name
+
+ def load(self, path):
+ self.strategy.load(path)
+
+ def reset(self, ):
+ self.strategy.reset()
+
+ def save(self, path):
+ self.strategy.save(path)
+
+ def test(self, Q):
+ return self.strategy.test(Q)
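Набросок выбора стратегии через ExplorationManager (путь импорта соответствует расположению модуля; параметры стратегии здесь условные).
```
import numpy as np

from rl_lib.src.explore_env.exploration_manager import ExplorationManager

manager = ExplorationManager(strategy_name="epsilon_greedy",
                             strategy_config={"action_space": 4,
                                              "eps_decay_steps": 1000})
q_values = np.array([0.1, 0.5, 0.2, 0.0])
action = manager(q_values)        # исследование: случайное действие или argmax
greedy = manager.test(q_values)   # почти жадный выбор с eps_test
```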
diff --git a/rl_lib/src/explore_env/ou_noise.py b/rl_lib/src/explore_env/ou_noise.py
index f99f44d..e0d6d31 100644
--- a/rl_lib/src/explore_env/ou_noise.py
+++ b/rl_lib/src/explore_env/ou_noise.py
@@ -1,46 +1,89 @@
-import numpy as np
+import numpy as np
from tensorflow import clip_by_value
-from ..data_saver.utils import save_data, load_data
+from ..data_saver.utils import load_data, save_data
from .base_explore import Base_Explore
+
+class OU_Noise_generator:
+ def __init__(self, mean, sigma, theta=0.15, dt=1e-2, x_initial=None):
+ self.theta = theta
+ self.mean = mean
+ self.sigma = sigma
+ self.dt = dt
+ self.x_initial = x_initial
+ self.reset()
+
+ def __call__(self):
+ """Formula taken from
+ https://www.wikipedia.org/wiki/Ornstein-Uhlenbeck_process.
+ """
+        dx = (self.theta * (self.mean - self.x_prev) * self.dt
+              + self.sigma * np.sqrt(self.dt)
+              * np.random.normal(size=self.mean.shape))
+ # Store x into x_prev
+ # Makes next noise dependent on current one
+ self.x_prev += dx
+ return self.x_prev
+
+ def reset(self):
+ if self.x_initial is not None:
+ self.x_prev = self.x_initial
+ else:
+ self.x_prev = np.zeros_like(self.mean)
+
+
class OU_Noise(Base_Explore):
- """Шум Орнштейна — Уленбека стратегия исследования, применяется к предсказанным непрерывным действиям.
+ """Шум Орнштейна — Уленбека стратегия исследования,
+ применяется к предсказанным непрерывным действиям.
- Kwargs:
+ Kwargs:
action_spase: int, Размер пространтства действий
- alpha: int, Количество внутренних шагов исследований до установки минимального эпсилон
+        alpha: float, Коэффициент затухания накопленного шума
+        (0 отключает накопление предыдущего шума)
axis: int, ось вычислений
sigma: float, Максимальный эпсилон
"""
- def __init__(self, action_space = None, axis=-1, alpha = 0.9, lower_bound = -1.0, sigma=1.0, upper_bound = 1.0,**kwargs):
- self.action_space = action_space
+
+ def __init__(self, action_space=None,
+ axis=-1, alpha=0.9, dt=0.01,
+ lower_bound=-1.0, mean: np.ndarray = None,
+ sigma=1.0, theta=0.15,
+ upper_bound=1.0,
+ **kwargs):
+ self.action_space = action_space
self.alpha = alpha
self.axis = axis
- self.eps = np.random.normal(size=self.action_space, scale = sigma)
+        mean = (np.zeros(action_space)
+                if mean is None or isinstance(mean, str)
+                else np.asarray(mean))
+        self.ou_gen = OU_Noise_generator(mean, sigma, theta=theta,
+                                         dt=dt, x_initial=None)
+ self.eps = self.ou_gen()
self.lower_bound = lower_bound
- self.sigma = sigma
- self._name = "ou_noise"
+ self.sigma = sigma
+ self._name = "ou_noise"
self.upper_bound = upper_bound
-
-
+
def __call__(self, action):
action += self.eps
- self.eps = self.alpha*self.eps + np.random.normal(size=self.action_space, scale = self.sigma)
- return clip_by_value(action, clip_value_min=self.lower_bound, clip_value_max=self.upper_bound)
+ self.eps = self.alpha*self.eps + self.ou_gen()
+ return clip_by_value(action,
+ clip_value_min=self.lower_bound,
+ clip_value_max=self.upper_bound)
def load(self, path):
self.__dict__ = load_data(path+self.name)
-
+
@property
def name(self):
return self._name
-
+
def reset(self, ):
- self.eps = np.random.normal(size=self.action_space, scale = self.sigma)
-
+ self.eps = self.ou_gen()
+
def save(self, path):
save_data(path+self.name, self.__dict__)
def test(self, action):
- return clip_by_value(action, clip_value_min=self.lower_bound, clip_value_max=self.upper_bound)
\ No newline at end of file
+ return clip_by_value(action,
+ clip_value_min=self.lower_bound,
+ clip_value_max=self.upper_bound)
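Набросок использования генератора шума Орнштейна-Уленбека; размер пространства действий и границы клиппинга здесь условные.
```
import numpy as np

from rl_lib.src.explore_env.ou_noise import OU_Noise_generator

gen = OU_Noise_generator(mean=np.zeros(2), sigma=0.2, theta=0.15, dt=0.01)
action = np.array([0.3, -0.7])
noisy_action = np.clip(action + gen(), -1.0, 1.0)   # шум добавляется к непрерывному действию
print(noisy_action)
```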
diff --git a/rl_lib/src/explore_env/soft_q.py b/rl_lib/src/explore_env/soft_q.py
index 737c8a1..bc87c20 100644
--- a/rl_lib/src/explore_env/soft_q.py
+++ b/rl_lib/src/explore_env/soft_q.py
@@ -1,50 +1,57 @@
-from tensorflow.keras.activations import softmax
-from tensorflow.math import argmax
-from tensorflow.dtypes import int32
from tensorflow import expand_dims
-from tensorflow.math import log
+from tensorflow.dtypes import int32
+from tensorflow.keras.activations import softmax
+from tensorflow.math import argmax, log
from tensorflow.random import categorical
+from rl_lib.src.algoritms.model_free.value_based.base_algo import Base_Algo
+
+from ..data_saver.utils import load_data, save_data
from .base_explore import Base_Explore
-from ..data_saver.utils import save_data, load_data
-from rl_lib.src.algoritms.base_algo import Base_Algo
+
class Soft_Q(Base_Explore):
- """Больцмановская стратегия исследования
- a = softmax(Q/tau)
-
- Kwargs:
- tau: float, Больцмановская температура
- axis: int, Ось вычислений
- """
- def __init__(self, decay = 0, tau=1.0, axis=-1, **kwargs):
- self.decay = decay
- self.tau = tau
- self.axis = axis
- self._name = "soft_q_strategy"
-
- def __call__(self, Q) -> int:
- """Возвращает действие в соответствии с стратегией исследования"""
- probability = softmax(expand_dims(Q, 0)/self.tau, axis=self.axis)
- self.tau = self.tau * self.decay
- return Base_Algo.squeeze_predict(categorical(log(probability), 1, dtype=int32))
-
- @property
- def name(self):
- return self._name
-
- def load(self, path) -> None:
- """Загружает какие либо внутренние переменные"""
- self.__dict__ = load_data(path+self.name)
-
- def reset(self, ) -> None:
- """Выполняет внутренний сброс"""
- pass
-
- def save(self, path) -> None:
- """Сохраняет какие либо внутренние переменные"""
- save_data(path+self.name, self.__dict__)
-
- def test(self, Q) -> int:
- """Возвращает действие в соответствии с стратегией тестирования"""
- return argmax(Q, axis=self.axis, output_type=int32)
+ """Больцмановская стратегия исследования
+ a = softmax(Q/tau)
+
+ Kwargs:
+ tau: float, Больцмановская температура
+ axis: int, Ось вычислений
+ """
+
+ def __init__(self, decay=0, tau=1.0, axis=-1, **kwargs):
+ self.decay = decay
+ self.tau = tau
+ self.axis = axis
+ self._name = "soft_q_strategy"
+
+ def __call__(self, Q) -> int:
+ """Возвращает действие в соответствии с стратегией исследования"""
+ probability = softmax(expand_dims(Q, 0)/self.tau, axis=self.axis)
+ self.tau = self.tau * self.decay
+ return Base_Algo.squeeze_predict(
+ categorical(
+ log(probability),
+ 1,
+ dtype=int32)
+ )
+
+ @property
+ def name(self):
+ return self._name
+
+ def load(self, path) -> None:
+ """Загружает какие либо внутренние переменные"""
+ self.__dict__ = load_data(path+self.name)
+
+ def reset(self, ) -> None:
+ """Выполняет внутренний сброс"""
+ pass
+
+ def save(self, path) -> None:
+ """Сохраняет какие либо внутренние переменные"""
+ save_data(path+self.name, self.__dict__)
+
+ def test(self, Q) -> int:
+ """Возвращает действие в соответствии с стратегией тестирования"""
+ return argmax(Q, axis=self.axis, output_type=int32)
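В Soft_Q выбор действия выполняется через tf.random.categorical; ниже эквивалентный по смыслу автономный набросок на numpy.
```
import numpy as np

rng = np.random.default_rng(0)

def soft_q_action(q, tau=1.0):
    # a ~ softmax(Q / tau); вычитание максимума стабилизирует softmax
    logits = q / tau
    p = np.exp(logits - logits.max())
    p = p / p.sum()
    return rng.choice(len(q), p=p)

q = np.array([1.0, 2.0, 0.5])
print(soft_q_action(q, tau=1.0))   # стохастический выбор, чаще всего действие 1
```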
diff --git a/rl_lib/src/explore_env/tests/test_epsilon_greedy.py b/rl_lib/src/explore_env/tests/test_epsilon_greedy.py
index 93d4b80..4f7aea1 100644
--- a/rl_lib/src/explore_env/tests/test_epsilon_greedy.py
+++ b/rl_lib/src/explore_env/tests/test_epsilon_greedy.py
@@ -1,19 +1,19 @@
from ..epsilon_greedy import Epsilon_Greedy
+
class Test_Epsilon_Greedy:
- def __init__(config):
- self.strategy = Epsilon_Gredy(**config)
- self.config = config
+ def __init__(self, config):
+ self.strategy = Epsilon_Greedy(**config)
+ self.config = config
+
+ def test_reset(self):
+ pass
+
+ def test_save(self):
+ pass
- def test_reset(self):
- pass
+ def test_call(self):
+ pass
- def test_save(self):
- pass
-
- def test_call(self):
- pass
-
- def test_test(self):
- pass
-
+ def test_test(self):
+ pass
diff --git a/rl_lib/src/explore_env/tests/test_exploration_manager.py b/rl_lib/src/explore_env/tests/test_exploration_manager.py
index 368c5a5..0d63a17 100644
--- a/rl_lib/src/explore_env/tests/test_exploration_manager.py
+++ b/rl_lib/src/explore_env/tests/test_exploration_manager.py
@@ -1,19 +1,19 @@
-from ..exploration_manger import Exploration_Manger
+from ..exploration_manager import ExplorationManager
+
class Test_Epsilon_Greedy:
- def __init__(config):
- self.strategy = Exploration_Manger(**config)
- self.config = config
+ def __init__(self, config):
+        self.strategy = ExplorationManager(**config)
+ self.config = config
+
+ def test_reset(self):
+ pass
+
+ def test_save(self):
+ pass
- def test_reset(self):
- pass
+ def test_call(self):
+ pass
- def test_save(self):
- pass
-
- def test_call(self):
- pass
-
- def test_test(self):
- pass
-
+ def test_test(self):
+ pass
diff --git a/rl_lib/src/explore_env/tests/test_soft_q.py b/rl_lib/src/explore_env/tests/test_soft_q.py
index 301cbbb..7016b0d 100644
--- a/rl_lib/src/explore_env/tests/test_soft_q.py
+++ b/rl_lib/src/explore_env/tests/test_soft_q.py
@@ -1,19 +1,19 @@
from ..soft_q import Soft_Q
+
class Test_Epsilon_Greedy:
- def __init__(config):
- self.strategy = Soft_Q(**config)
- self.config = config
+ def __init__(self, config):
+ self.strategy = Soft_Q(**config)
+ self.config = config
+
+ def test_reset(self):
+ pass
+
+ def test_save(self):
+ pass
- def test_reset(self):
- pass
+ def test_call(self):
+ pass
- def test_save(self):
- pass
-
- def test_call(self):
- pass
-
- def test_test(self):
- pass
-
+ def test_test(self):
+ pass
diff --git a/rl_lib/src/gym_wrappers/__init__.py b/rl_lib/src/gym_wrappers/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/rl_lib/src/gym_wrappers/obsv_wrapper.py b/rl_lib/src/gym_wrappers/obsv_wrapper.py
new file mode 100644
index 0000000..8b396e6
--- /dev/null
+++ b/rl_lib/src/gym_wrappers/obsv_wrapper.py
@@ -0,0 +1,25 @@
+import gym
+import numpy as np
+
+
+class ImageNormWrapper(gym.Wrapper):
+ """Обертка нормализации наблюдений среды,
+ если это изображения
+
+ Args:
+        env (gym.Env): оборачиваемая среда
+ """
+ def __init__(self, env):
+ super().__init__(env)
+
+    def reset(self, seed=40, options={}):
+        observation, info = self.env.reset(seed=seed, options=options)
+ return self.preprocess(observation), info
+
+ def step(self, action):
+ observation, reward, done, tr, info = self.env.step(action)
+ return self.preprocess(observation), reward, done, tr, info
+
+ def preprocess(self, observation):
+ observation = (observation - 255/2)/(255/2)
+ return observation.astype(np.float16)
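Набросок использования ImageNormWrapper; название среды условное, подойдет любая среда gym с наблюдениями-изображениями uint8 и новым API reset/step.
```
import gym

from rl_lib.src.gym_wrappers.obsv_wrapper import ImageNormWrapper

env = ImageNormWrapper(gym.make("CarRacing-v2"))   # название среды — условный пример
obs, info = env.reset()
print(obs.dtype, obs.min(), obs.max())             # float16, значения примерно в [-1, 1]
```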
diff --git a/rl_lib/src/models/base_models.py b/rl_lib/src/models/base_models.py
index e77bf9b..36e9abb 100644
--- a/rl_lib/src/models/base_models.py
+++ b/rl_lib/src/models/base_models.py
@@ -1,166 +1,175 @@
-from ..data_saver.saver import Saver
-
import abc
+
import tensorflow as tf
class BaseModel(abc.ABC):
- """Абстрактный базовый класс,
- представляющий общий интерфейс для всех алгоритмов и моделей в RL-Lib.
+ """Абстрактный базовый класс,
+ представляющий общий интерфейс для всех алгоритмов и моделей в RL-Lib.
+
+ Model определяет общие методы для ввода, вывода и базовых вычислений,
+ которые должны быть реализованы в каждом конкретном алгоритме или модели.
- Model определяет общие методы для ввода, вывода и базовых вычислений,
- которые должны быть реализованы в каждом конкретном алгоритме или модели.
+ Этот класс служит в качестве основы для всех других классов в RL-Lib
+ и обеспечивает единый интерфейс для работы с различными моделями.
+ """
- Этот класс служит в качестве основы для всех других классов в RL-Lib
- и обеспечивает единый интерфейс для работы с различными моделями.
- """
+ def __init__(self, **kwargs):
+ pass
- def __init__(self, **kwargs):
- pass
+ @property
+ @abc.abstractmethod
+ def input_spec(self) -> tuple:
+ """Возвращает кортеж размера входных данных Модели"""
- @property
- @abc.abstractmethod
- def input_spec(self) -> tuple:
- """Возвращает кортеж размера входных данных Модели"""
+ @property
+ @abc.abstractmethod
+ def output_spec(self) -> tuple:
+ """Возвращает кортеж размера выходных данных Модели"""
- @property
- @abc.abstractmethod
- def output_spec(self) -> tuple:
- """Возвращает кортеж размера выходных данных Модели"""
+ @abc.abstractmethod
+ def initial_state(self) -> None:
+ """Инициализирует внутреннее состояние реккурентной Модели"""
- @abc.abstractmethod
- def initial_state(self) -> None:
- """Инициализирует внутреннее состояние реккурентной Модели"""
+ @abc.abstractmethod
+ def _update_next_state(self) -> None:
+ """Обновляет внутреннее состояние реккурентной Модели"""
- @abc.abstractmethod
- def _update_next_state(self) -> None:
- """Обновляет внутреннее состояние реккурентной Модели"""
class ModelIO(abc.ABC):
- def __init__(self, config: dict, **kwargs):
- super().__init__(**config,**kwargs)
- self._config = config
- self.name = kwargs.get("name","")
-
- @property
- def config(self) -> dict:
- """Возвращает конфигурацию алгоритма"""
- return self._config
-
- @abc.abstractmethod
- def save(self, path) -> None:
- """Сохраняет модель в директории"""
-
- @abc.abstractmethod
- def load(self, path) -> None:
- """Загружает модель из директории"""
+ def __init__(self, config: dict, **kwargs):
+ super().__init__(**config, **kwargs)
+ self._config = config
+ self.name = kwargs.get("name", "")
+
+ @property
+ def config(self) -> dict:
+ """Возвращает конфигурацию алгоритма"""
+ return self._config
+
+ @abc.abstractmethod
+ def save(self, path) -> None:
+ """Сохраняет модель в директории"""
+
+ @abc.abstractmethod
+ def load(self, path) -> None:
+ """Загружает модель из директории"""
+
class ModelNN(abc.ABC):
- """Абстрактрный класс, представляющий модель нейронной сети для вычисления градиента,
- обновления весов и извлечения слоев, весов, компиляции модели.
-
- Kwargs:
- model: tf.keras.Model
- name: str Необязательно, название модели
- """
-
- def __init__(self, model_config: dict, **kwargs):
- super().__init__(**kwargs)
- self.model = model_config.get('model', None)
-
- def __call__(self, inputs: tf.Tensor) -> tf.Tensor:
- return self.model(inputs)
-
- @abc.abstractmethod
- def _prediction_processing(self, inputs: tf.Tensor, **kwargs) -> tf.Tensor:
- """Обрабатывает выходы модели перед вычислением лоссов
- Args:
- inputs: tf.Tensor(dtype=tf.float32)
- Returns
- outputs: tf.Tensor(dtype=tf.float32
- """
- return inputs
-
- @abc.abstractclassmethod
- def loss(self, target: tf.Tensor, predict: tf.Tensor) -> tf.Tensor:
- """Вычисляет и возвращает потери в соответствии с функцией потерь"""
-
- @abc.abstractclassmethod
- def make_mask(self) -> tf.Tensor:
- """Создает и возвращает маску для выходов с модели"""
-
- def prediction_processing(self, inputs: tf.Tensor, **kwargs) -> tf.Tensor:
- """Обрабатывает выходы модели перед вычислением лоссов
- Args:
- inputs: tf.Tensor(dtype=tf.float32)
- Returns
- outputs: tf.Tensor(dtype=tf.float32
+    """Абстрактный класс,
+ представляющий модель нейронной сети для вычисления градиента,
+ обновления весов и извлечения слоев, весов, компиляции модели.
+
+ Kwargs:
+ model: tf.keras.Model
+ name: str Необязательно, название модели
"""
- inputs = inputs[0] if isinstance(inputs, list) else inputs
- return self._prediction_processing(inputs, **kwargs)
-
- def set_new_model(self, model: tf.keras.Model, optimizer: tf.keras.optimizers, jit_compile=True) -> None:
- self.model = model
- self.model.compile(optimizer=optimizer, jit_compile=jit_compile)
-
- def validate_args(self):
- assert isinstance(self.model, tf.keras.Model), "Передан неверный аргумент, должно быть tf.keras.Model"
-
- @property
- def layers(self, ) -> list:
- return self.model.layers
-
- @property
- def weights(self, ) -> list:
- return self.model.weights
-
- @property
- def summary(self, ) -> None:
- print(self.name)
- self.model.summary()
-
- def get_weights(self, ) -> list:
- return self.model.get_weights()
-
- def set_weights(self, weights: list) -> None:
- self.model.set_weights(weights)
-
- @tf.function(reduce_retracing=True,
+
+ def __init__(self, model_config: dict, **kwargs):
+ super().__init__(**kwargs)
+ self.model = model_config.get('model', None)
+
+ def __call__(self, inputs: tf.Tensor) -> tf.Tensor:
+ return self.model(inputs)
+
+ @abc.abstractmethod
+ def _prediction_processing(self, inputs: tf.Tensor, **kwargs) -> tf.Tensor:
+ """Обрабатывает выходы модели перед вычислением лоссов
+ Args:
+ inputs: tf.Tensor(dtype=tf.float32)
+        Returns:
+            outputs: tf.Tensor(dtype=tf.float32)
+ """
+ return inputs
+
+    @abc.abstractmethod
+ def loss(self, target: tf.Tensor, predict: tf.Tensor) -> tf.Tensor:
+ """Вычисляет и возвращает потери в соответствии с функцией потерь"""
+
+    @abc.abstractmethod
+ def make_mask(self) -> tf.Tensor:
+ """Создает и возвращает маску для выходов с модели"""
+
+ def prediction_processing(self, inputs: tf.Tensor, **kwargs) -> tf.Tensor:
+ """Обрабатывает выходы модели перед вычислением лоссов
+ Args:
+ inputs: tf.Tensor(dtype=tf.float32)
+        Returns:
+            outputs: tf.Tensor(dtype=tf.float32)
+ """
+ inputs = inputs[0] if isinstance(inputs, list) else inputs
+ return self._prediction_processing(inputs, **kwargs)
+
+ def set_new_model(self, model: tf.keras.Model,
+ optimizer: tf.keras.optimizers,
+ jit_compile=True) -> None:
+ self.model = model
+ self.model.compile(optimizer=optimizer, jit_compile=jit_compile)
+
+ def validate_args(self):
+        assert isinstance(self.model, tf.keras.Model), \
+            "Передан неверный аргумент, должно быть tf.keras.Model"
+
+ @property
+ def layers(self, ) -> list:
+ return self.model.layers
+
+ @property
+ def weights(self, ) -> list:
+ return self.model.weights
+
+ @property
+ def summary(self, ) -> None:
+ print(self.name)
+ self.model.summary()
+
+ def get_weights(self, ) -> list:
+ return self.model.get_weights()
+
+ def set_weights(self, weights: list) -> None:
+ self.model.set_weights(weights)
+
+ @tf.function(reduce_retracing=True,
jit_compile=False,
- experimental_autograph_options = tf.autograph.experimental.Feature.ALL)
- def calculate_gradients(self, **kwargs) -> dict:
- """
- Вычисляет градиенты, лосс, td-ошибку
-
- Kwargs:
- dict содержащий батч, таргет, маску, опционально приоритетные веса
-
- Returns:
- dict содержащий лоссы и td-ошибку
- """
- with tf.GradientTape(persistent=False) as tape:
- Q = self.model(kwargs['state'], training=True)
- Q = self.prediction_processing(Q, **kwargs)
- if len(Q.shape) != len(kwargs['Qtarget'].shape): Q = tf.expand_dims(Q, -1)
-
- td_error = kwargs['Qtarget'] - Q
- loss = self.loss(kwargs['Qtarget'], Q)*kwargs.get('weights', 1.0)
- gradients = tape.gradient(loss, self.model.trainable_variables)
- return {'gradients': gradients, 'loss': loss, 'td_error': td_error}
-
- @tf.function(reduce_retracing=True,
+ experimental_autograph_options=tf.autograph.experimental.Feature.ALL)
+ def calculate_gradients(self, **kwargs) -> dict:
+ """
+ Вычисляет градиенты, лосс, td-ошибку
+
+ Kwargs:
+ dict содержащий батч, таргет, маску, опционально приоритетные веса
+
+ Returns:
+ dict содержащий лоссы и td-ошибку
+ """
+ with tf.GradientTape(persistent=False) as tape:
+ Q = self.model(kwargs['state'], training=True)
+ Q = self.prediction_processing(Q, **kwargs)
+ if len(Q.shape) != len(kwargs['Qtarget'].shape):
+ Q = tf.expand_dims(Q, -1)
+
+ td_error = kwargs['Qtarget'] - Q
+ loss = self.loss(kwargs['Qtarget'], Q)*kwargs.get('weights', 1.0)
+ E_loss = tf.reduce_mean(loss, axis=0)
+ gradients = tape.gradient(E_loss, self.model.trainable_variables)
+ return {'gradients': gradients, 'loss': loss, 'td_error': td_error}
+
+ @tf.function(reduce_retracing=True,
jit_compile=False,
- experimental_autograph_options = tf.autograph.experimental.Feature.ALL)
- def update_weights(self, **kwargs) -> dict:
- """
- Выполняет шаг отимизатора
-
- Kwargs:
- dict содержащий батч, таргет, маску, опционально приоритетные веса
-
- Returns:
- dict содержащий лоссы и td-ошибку
- """
- gradients, loss, td_error = self.calculate_gradients(**kwargs).values()
- self.model.optimizer.apply_gradients(zip(gradients, self.model.trainable_variables))
- return {'loss': loss, 'td_error': td_error}
+ experimental_autograph_options=tf.autograph.experimental.Feature.ALL)
+ def update_weights(self, **kwargs) -> dict:
+ """
+        Выполняет шаг оптимизатора
+
+ Kwargs:
+ dict содержащий батч, таргет, маску, опционально приоритетные веса
+
+ Returns:
+ dict содержащий лоссы и td-ошибку
+ """
+ gradients, loss, td_error = self.calculate_gradients(**kwargs).values()
+ self.model.optimizer.apply_gradients(
+ zip(gradients, self.model.trainable_variables))
+ return {'loss': loss, 'td_error': td_error}
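
The reworked `ModelNN.calculate_gradients` above keeps the per-sample loss (useful for priority updates) and averages it only for the gradient step. Below is a minimal standalone sketch of that pattern; the two-layer network, the Huber loss and the batch tensors are placeholder assumptions, not the library's defaults:

```
import tensorflow as tf

# Placeholder network standing in for a ModelNN subclass's self.model.
model = tf.keras.Sequential([tf.keras.layers.Dense(32, activation="relu",
                                                   input_shape=(8,)),
                             tf.keras.layers.Dense(2)])
model.compile(optimizer=tf.keras.optimizers.Adam(1e-3))

state = tf.random.normal((16, 8))      # batch of states
q_target = tf.random.normal((16, 2))   # Qtarget from the target network
weights = tf.ones((16,))               # PER importance weights (1.0 when off)

with tf.GradientTape() as tape:
    q = model(state, training=True)
    td_error = q_target - q                               # kept for priority updates
    loss = tf.keras.losses.huber(q_target, q) * weights   # per-sample loss
    mean_loss = tf.reduce_mean(loss, axis=0)              # E_loss in the hunk above

grads = tape.gradient(mean_loss, model.trainable_variables)
model.optimizer.apply_gradients(zip(grads, model.trainable_variables))
print(float(mean_loss), float(tf.reduce_mean(tf.abs(td_error))))
```
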
diff --git a/rl_lib/src/models/model.py b/rl_lib/src/models/model.py
index cbc96cd..18cb757 100644
--- a/rl_lib/src/models/model.py
+++ b/rl_lib/src/models/model.py
@@ -1,67 +1,81 @@
-import tensorflow as tf
-import numpy as np
import abc
+
+import numpy as np
+import tensorflow as tf
from tensorflow.keras.models import clone_model
-from .base_models import ModelNN, ModelIO, BaseModel
from ..optimizers.optimizer import get_optimizer
+from .base_models import BaseModel, ModelIO, ModelNN
+
class Model(ModelNN, ModelIO, BaseModel, abc.ABC):
- """Абстрактный класс модели, который соединяет все методы классов ModelNN, ModelIO, BaseModel"""
- def __init__(self, **config: dict):
- super().__init__(**config)
- self.initial_model()
+ """Абстрактный класс модели,
+ который соединяет все методы классов ModelNN, ModelIO, BaseModel
+ """
- def _initial_model(self):
- input_shape = self._config['model_config']["input_shape"]
- action_space = self._config['model_config']["action_space"]
- if len(input_shape) == 1:
- return self.create_model(input_shape, action_space)
- else:
- return self.create_model_with_conv(input_shape, action_space)
-
- def check_input_shape(self, inputs):
- if not isinstance(inputs, (tf.Tensor, np.ndarray)):
- if isinstance(inputs, dict):
- for key, inpt in inputs.items():
- inputs[key] = self.check_input_shape(inpt)
+ def __init__(self, **config: dict):
+ super().__init__(**config)
+ self.initial_model()
+
+ def _initial_model(self):
+ input_shape = self._config['model_config']["input_shape"]
+ action_space = self._config['model_config']["action_space"]
+ if len(input_shape) == 1:
+ return self.create_model(input_shape, action_space)
+ else:
+ return self.create_model_with_conv(input_shape, action_space)
+
+ def check_input_shape(self, inputs, key=None):
+ if not isinstance(inputs, (tf.Tensor, np.ndarray)):
+ for key, inpt in inputs.items() if isinstance(inputs, dict) else enumerate(inputs):
+ inputs[key] = self.check_input_shape(inpt, key=key)
+ return inputs
+ while len(inputs.shape) < len(self.input_spec(key=key)):
+ inputs = tf.expand_dims(inputs, 0)
+ if len(inputs.shape) > len(self.input_spec(key=key)):
+ assert 0 # inputs.shape не может быть больше входа модели
return inputs
- elif isinstance(inputs, list):
- for key, inpt in enumerate(inputs):
- inputs[key] = self.check_input_shape(inpt)
- return inputs
- while len(inputs.shape) < len(self.input_spec()):
- inputs = tf.expand_dims(inputs,0)
- if len(inputs.shape) > len(self.input_spec()): assert 0 #inputs.shape не может быть больше входа модели
- return inputs
-
- def initial_model(self):
- """Инициализирует модель в соответствии с типом алгоритма"""
- if str(self.config['model_config']['model']) == 'None': model = self._initial_model()
- else: model = clone_model(self.config['model_config']['model'])
- optimizer = self.config.get("optimizer_config")
- optimizer = get_optimizer(**optimizer)
- self.set_new_model(model, optimizer)
- def input_spec(self):
- return self.model.layers[0].input_shape[0]
+ def initial_model(self):
+ """Инициализирует модель в соответствии с типом алгоритма"""
+ if str(self.config['model_config']['model']) == 'None':
+ model = self._initial_model()
+ else:
+ model = clone_model(self.config['model_config']['model'])
+ optimizer = self.config.get("optimizer_config")
+ optimizer = get_optimizer(**optimizer)
+ self.set_new_model(model, optimizer)
+
+ def input_spec(self, key=None):
+ if key is not None:
+ return self.model.input[key].shape
+ elif isinstance(self.model.input, list):
+ if self.lstm_size:
+ return self.model.input[0].shape
+ return self.model.input.shape
+
+ def load(self, path):
+ self.model = tf.keras.models.load_model(path+self.name+'.keras')
+
+ def output_spec(self):
+ """Возвращает кортеж размера выходных данных Модели"""
+ return self.model.layers[-1].output_shape
+
+ def save(self, path):
+ self.model.save(path+self.name+'.keras')
- def load(self, path):
- self.model = tf.keras.models.load_model(path+self.name+'.h5')
-
- def output_spec(self):
- """Возвращает кортеж размера выходных данных Модели"""
- return self.model.layers[-1].output_shape
+ @staticmethod
+    @abc.abstractmethod
+ def create_model(input_shape: tuple,
+ action_space: int) -> tf.keras.Model:
+ """Создает модель по умолчанию и возвращает tf.keras.Model,
+ архитектура в соответствии с алгоритмом, начальные слои - полносвязные
+ """
- def save(self, path):
- self.model.save(path+self.name+'.h5')
-
- @staticmethod
- @abc.abstractclassmethod
- def create_model(input_shape: tuple, action_space: int) -> tf.keras.Model:
- """Создает модель по умолчанию и возвращает tf.keras.Model, архитектура в соответствии с алгоритмом, начальные слои - полносвязные"""
-
- @staticmethod
- @abc.abstractclassmethod
- def create_model_with_conv(input_shape: tuple, action_space: int) -> tf.keras.Model:
- """Создает модель по умолчанию и возвращает tf.keras.Model, архитектура в соответствии с алгоритмом, начальные слои - сверточные"""
+ @staticmethod
+    @abc.abstractmethod
+ def create_model_with_conv(input_shape: tuple,
+ action_space: int) -> tf.keras.Model:
+ """Создает модель по умолчанию и возвращает tf.keras.Model,
+ архитектура в соответствии с алгоритмом, начальные слои - сверточные
+ """
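
`check_input_shape` and the new keyed `input_spec` exist so that a single observation (or a dict of observations) receives the missing batch/time axes before reaching the network. A simplified standalone illustration of that rank-padding idea; the toy model below is an assumption, not library code:

```
import numpy as np
import tensorflow as tf

model = tf.keras.Sequential([tf.keras.layers.Dense(2, input_shape=(4,))])

def expand_to_input_rank(x, input_shape):
    # prepend axes until the rank matches the model input, as check_input_shape does
    x = tf.convert_to_tensor(x, dtype=tf.float32)
    while len(x.shape) < len(input_shape):
        x = tf.expand_dims(x, 0)
    return x

obs = np.zeros(4, dtype=np.float32)                      # single observation, rank 1
batched = expand_to_input_rank(obs, model.input.shape)   # becomes (1, 4)
print(model(batched).shape)                              # (1, 2)
```
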
diff --git a/rl_lib/src/normalizers.py b/rl_lib/src/normalizers.py
new file mode 100644
index 0000000..9766a87
--- /dev/null
+++ b/rl_lib/src/normalizers.py
@@ -0,0 +1,11 @@
+import numpy as np
+
+
+def normalize_m1_1(x: np.ndarray) -> np.ndarray:
+ """Нормализует RGB изображение в диапазон [-1, 1]."""
+ return x / 127.5 - 1
+
+
+def normalize_01(x: np.ndarray) -> np.ndarray:
+ """Нормализует RGB изображение в диапазон [0, 1]."""
+ return x / 255.0
diff --git a/rl_lib/src/optimizers/__init__.py b/rl_lib/src/optimizers/__init__.py
index f8835c4..9267e9a 100644
--- a/rl_lib/src/optimizers/__init__.py
+++ b/rl_lib/src/optimizers/__init__.py
@@ -1,11 +1,12 @@
try:
- import tensorflow_addons as tfa
+ import tensorflow_addons as tfa
except ImportError:
- try:
- import subprocess
- import sys
- subprocess.check_call([sys.executable, "-m", "pip", "install", "tensorflow_addons"])
- import tensorflow_addons
- except ImportError:
- print("Не удалось установить и импортировать TENSORFLOW_ADDONS")
- raise SystemExit(1)
+ try:
+ import subprocess
+ import sys
+ subprocess.check_call(
+ [sys.executable, "-m", "pip", "install", "tensorflow_addons"])
+ import tensorflow_addons
+ except ImportError:
+ print("Не удалось установить и импортировать TENSORFLOW_ADDONS")
+ raise SystemExit(1)
diff --git a/rl_lib/src/optimizers/optimizer.py b/rl_lib/src/optimizers/optimizer.py
index 15561a4..bccc8ef 100644
--- a/rl_lib/src/optimizers/optimizer.py
+++ b/rl_lib/src/optimizers/optimizer.py
@@ -1,29 +1,31 @@
import tensorflow.keras.optimizers as optimizers
import tensorflow_addons as tfa
-def get_optimizer(optimizer_name: str = "adam", optimizer_params: dict = {}, custom_optimizer: object = None) -> object:
- """Возврщает настроенный оптимизатор.
- Доступные оптимизаторы tensorflow:
- Adam
- LAMB
- Adadelta
- RMSprop
- Args:
- optimizer: str: Название оптимизатора
- optimizer_params: dict: Параметры оптимизатора
- cutom_optimizer: object: Класс кастомного потимизатора
- """
- if optimizer_name.lower() == 'adam':
- return optimizers.Adam(**optimizer_params)
-
- elif optimizer_name.lower() == 'lamb':
- return tfa.optimizers.LAMB(**optimizer_params)
- elif optimizer_name.lower() == 'cutom' and type(custom_optimizer) != None:
- return custom_optimizer(**optimizer_params)
+def get_optimizer(optimizer_name: str = "adam", optimizer_params: dict = {},
+ custom_optimizer: object = None) -> object:
+    """Возвращает настроенный оптимизатор.
+ Доступные оптимизаторы tensorflow:
+ Adam
+ LAMB
+ Adadelta
+ RMSprop
+ Args:
+ optimizer: str: Название оптимизатора
+ optimizer_params: dict: Параметры оптимизатора
+        custom_optimizer: object: Класс кастомного оптимизатора
+ """
+ if optimizer_name.lower() == 'adam':
+ return optimizers.Adam(**optimizer_params)
- elif optimizer_name.lower() == 'adadelta':
- return optimizers.Adam(**optimizer_params)
+ elif optimizer_name.lower() == 'lamb':
+ return tfa.optimizers.LAMB(**optimizer_params)
- elif optimizer_name.lower() == 'rmsprop':
- return optimizers.Adam(**optimizer_params)
+    elif optimizer_name.lower() == 'custom' and custom_optimizer is not None:
+        return custom_optimizer(**optimizer_params)
+
+    elif optimizer_name.lower() == 'adadelta':
+        return optimizers.Adadelta(**optimizer_params)
+
+    elif optimizer_name.lower() == 'rmsprop':
+        return optimizers.RMSprop(**optimizer_params)
diff --git a/rl_lib/src/perfomans.py b/rl_lib/src/perfomans.py
new file mode 100644
index 0000000..028c544
--- /dev/null
+++ b/rl_lib/src/perfomans.py
@@ -0,0 +1,37 @@
+import time
+import concurrent.futures as pool
+
+
+def execution_time(func):
+ "Декоратор считающий время выполнения функции"
+ def wrapper(*args, **kwargs):
+ s_t = time.time()
+ result = func(*args, **kwargs)
+ print("Время выполнения функции %s = " % (func.__name__),
+ time.time() - s_t, "сек.")
+ return result
+ return wrapper
+
+
+def run_as_multithread(func):
+ """Запускает задачу мультипоточно
+ Args:
+ func: функция
+        input_data: входные данные функции
+ Returns:
+ iterable: результат выполнения функции
+ """
+ def wrapper(*args, **kwargs):
+ try:
+ with pool.ThreadPoolExecutor() as executer:
+ result = list()
+ for data in kwargs.get("input_data", []):
+ future = executer.submit(func, *(*args, data), **kwargs)
+ result.append(future)
+ return tuple(res.result() for res in pool.as_completed(result))
+        except Exception:
+ result = list()
+ for data in kwargs.get("input_data", []):
+ result.append(func(*(*args, data), **kwargs))
+ return result
+ return wrapper
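
Both decorators stack on ordinary functions; `run_as_multithread` fans the `input_data` keyword out over a thread pool, so the wrapped function must tolerate receiving that keyword. A small sketch (the `square` function is an illustrative assumption; with threads the result order is not guaranteed):

```
from rl_lib.src.perfomans import execution_time, run_as_multithread

@execution_time
@run_as_multithread
def square(x, **kwargs):        # **kwargs absorbs the input_data keyword
    return x * x

print(square(input_data=[1, 2, 3, 4]))   # e.g. (1, 4, 9, 16) in completion order
```
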
diff --git a/rl_lib/src/replay_buffers/dict_array.py b/rl_lib/src/replay_buffers/dict_array.py
new file mode 100644
index 0000000..4ff47bc
--- /dev/null
+++ b/rl_lib/src/replay_buffers/dict_array.py
@@ -0,0 +1,103 @@
+from typing import Any
+
+import numpy as np
+
+
+class StructArray:
+ """Структурированный массив
+ """
+ def __init__(self, shape, dict_keys, dtype=object) -> None:
+ self.dict_keys = sorted(dict_keys)
+ self.data = np.zeros(shape=shape,
+ dtype=(
+ [
+ (key, dtype) for key in self.dict_keys
+ ]
+ )
+ )
+ self.dtype = dtype
+
+ def __getitem__(self, index):
+ data = self.data[index]
+ return {key: np.asarray(data[key]).astype(np.float32)
+ if isinstance(index,
+ int) else StructArray.stack(data[key],
+ axis=0).astype(
+ np.float32)
+ for key in self.dict_keys}
+
+ def __setitem__(self, index, values):
+ "values = (state, action, reward, next_state, done, *other_data)"
+ self.data[index] = tuple(values[key] for key in self.dict_keys)
+
+ @staticmethod
+ def stack(array, axis=0):
+ if isinstance(array, np.ndarray):
+ if len(array.shape) > 1:
+ return np.asarray(
+ tuple(StructArray.stack(array[i])
+ for i in range(array.shape[0])
+ )
+ )
+ return np.stack(array, axis=axis).astype(np.float32)
+ return array
+
+
+class NonStructArray:
+    """Неструктурированный массив"""
+
+ def __init__(self, shape, dtype=object) -> None:
+ self.data = np.zeros(shape=shape, dtype=dtype)
+ self.dtype = dtype
+
+ def __getitem__(self, index):
+ return StructArray.stack(self.data[index])
+
+ def __setitem__(self, index, values):
+ "values = (state, action, reward, next_state, done, *other_data)"
+ self.data[index] = values
+
+
+class DictArray:
+ """
+    Класс, реализующий сохранение/извлечение данных
+ в структурированные массивы numpy
+ """
+
+ def __init__(self, shape, dtype=object) -> None:
+ self.dtype = dtype
+ self.initialized = False
+ self.shape = shape
+ # В этом массиве мы будем хранить вложенные массивы (s,a,r,s',d)
+ self.data = np.zeros((shape[1], ), dtype=object)
+
+ def __getitem__(self, index):
+ return tuple(self.data[i][index] for i in range(self.shape[1]))
+
+ def __setitem__(self, index, values):
+ "values = (state, action, reward, next_state, done, *other_data)"
+ if not self.initialized:
+ self.init_array(values)
+ for i in range(self.shape[1]):
+ self.data[i][index] = values[i]
+
+ def choose_array_type(self, data):
+ if isinstance(data, dict):
+ return self.init_struct_array((self.shape[0], ), data.keys())
+ else:
+ return self.init_non_struct_array((self.shape[0], ))
+
+ def init_array(self, data):
+ for i, d in zip(range(self.shape[0]), data):
+ self.data[i] = self.choose_array_type(d)
+ self.initialized = True
+
+ def init_struct_array(self, shape, dict_keys):
+ return StructArray(shape, dict_keys, dtype=self.dtype)
+
+ def init_non_struct_array(self, shape):
+ return NonStructArray(shape=shape, dtype=self.dtype)
+
+ @property
+ def data_array(self):
+ return tuple(array.data for array in self.data)
\ No newline at end of file
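
`DictArray` lets a replay buffer keep dict observations (e.g. image plus vector) next to scalar fields in one slot-indexed store. A minimal sketch, assuming the module is importable as `rl_lib.src.replay_buffers.dict_array`:

```
import numpy as np

from rl_lib.src.replay_buffers.dict_array import DictArray

buf = DictArray((8, 5))                               # 8 slots, 5 stored variables
obs = {"image": np.zeros((4, 4)), "vector": np.ones(3)}
next_obs = {"image": np.ones((4, 4)), "vector": np.zeros(3)}

buf[0] = (obs, 1, 0.5, 0, next_obs)                   # (s, a, r, d, s')
state, action, reward, done, next_state = buf[0]
print(sorted(state.keys()), action, reward)           # ['image', 'vector'] 1 0.5
```
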
diff --git a/rl_lib/src/replay_buffers/priority_buffers.py b/rl_lib/src/replay_buffers/priority_buffers.py
index bb35b0e..ef6ac5d 100644
--- a/rl_lib/src/replay_buffers/priority_buffers.py
+++ b/rl_lib/src/replay_buffers/priority_buffers.py
@@ -1,16 +1,18 @@
import numpy as np
-from ..data_saver.utils import save_data, load_data
+
+from ..data_saver.utils import load_data, save_data
from .random_buffers import Random_Buffer, Random_Recurrent_Buffer
+
class Sum_Tree:
def __init__(self, size):
- self.tree = np.zeros(2*size - 1, dtype = np.float64)
+ self.tree = np.zeros(2*size - 1, dtype=np.float64)
self.size = size
self.count = 0
self.real_size = 0
def clear(self, ):
- self.tree = np.zeros(self.tree.shape, dtype = np.float64)
+ self.tree = np.zeros(self.tree.shape, dtype=np.float64)
self.count = 0
self.real_size = 0
@@ -19,17 +21,18 @@ def total(self):
return self.tree[0]
def update(self, data_idx, value):
- assert type(data_idx)!=np.array and type(data_idx)!=list and type(data_idx)!=tuple
+        assert not isinstance(data_idx, (np.ndarray, list, tuple))
idx = data_idx + self.size - 1
change = value - self.tree[idx]
self.tree[idx] = value
parent = (idx - 1) // 2
- idx =[]
+ idx = []
idx.append(parent)
while parent > 0:
- parent = (parent - 1) // 2
- idx.append(parent)
- parent = np.asarray(idx, dtype = np.int32)
+ parent = (parent - 1) // 2
+ idx.append(parent)
+ parent = np.asarray(idx, dtype=np.int32)
self.tree[parent] += change
def add(self, value):
@@ -44,14 +47,15 @@ def get(self, s):
left = 2 * idx + 1
right = left + 1
while np.any(idx != left):
- idx = np.where(s <= self.tree[left], left, right)
- s = np.where(s <= self.tree[left], s, s - self.tree[left])
- left = 2 * idx + 1
- left = np.where(left >= self.tree.shape[0], idx, left)
- right = np.where(left >= self.tree.shape[0], right, left + 1)
+ idx = np.where(s <= self.tree[left], left, right)
+ s = np.where(s <= self.tree[left], s, s - self.tree[left])
+ left = 2 * idx + 1
+ left = np.where(left >= self.tree.shape[0], idx, left)
+ right = np.where(left >= self.tree.shape[0], right, left + 1)
data_idx = idx - self.size + 1
- return data_idx, self.tree[idx]
+ return data_idx, self.tree[idx]
+
class Prioritized_Replay_Buffer(Random_Buffer):
'''
@@ -59,7 +63,8 @@ class Prioritized_Replay_Buffer(Random_Buffer):
size: int
n_step: int
discount_factor: float
- num_var: int (Кол-во сохраянемых переменных, по умполчанию 5 (s, a, r, d, s'))
+    num_var: int (Кол-во сохраняемых переменных,
+        по умолчанию 5 (s, a, r, d, s'))
eps: float
alpha: float
beta: float
@@ -67,72 +72,80 @@ class Prioritized_Replay_Buffer(Random_Buffer):
beta_changing_curve: str
max_priority: float
'''
+
def __init__(self, **kwargs):
size = kwargs.get("size", 100000)
Random_Buffer.__init__(self, **kwargs)
self.name = "Prioritized_Replay_Buffer"
-
+
self.tree = Sum_Tree(size=size)
# PER params
- self.eps = kwargs.get("eps", 1e-2)
- self.alpha = kwargs.get("alpha", 0.6)
- self.beta = kwargs.get("beta", 0.4)
+ self.eps = kwargs.get("eps", 1e-2)
+ self.alpha = kwargs.get("alpha", 0.6)
+ self.beta = kwargs.get("beta", 0.4)
self.beta_changing = kwargs.get("beta_changing", 5e-4)
self.beta_changing_curve = kwargs.get("beta_changing_curve", 'linear')
- self.max_priority = kwargs.get("max_priority", 1e-2)
+ self.max_priority = kwargs.get("max_priority", 1e-2)
def clear(self, ):
Random_Buffer.clear(self,)
self.tree.clear()
- def add(self, samples, priority = None):
+ def add(self, samples, priority=None):
'''samples -> tuple(s,a,r,d,s')
- priority -> float если передается, то приоритет в буфере выставлется по преданному числу,
- по умолчанию вычисляется по self.max_priotiry
+ priority -> float если передается,
+        то приоритет в буфере выставляется по переданному числу,
+        по умолчанию вычисляется по self.max_priority
'''
if Random_Buffer.add(self, samples):
- self.tree.add(self.max_priority if priority == None else priority)
+ self.tree.add(self.max_priority if priority is None else priority)
assert self.count == self.tree.count and self.real_size == self.tree.real_size, "tree and has same real sizes"
def sample(self, batch_size):
data_idxs, weights = self._get_idx(batch_size)
- return {**Random_Buffer.sample(self, batch_size, data_idxs), 'data_idxs': data_idxs, 'weights': weights}
-
+ return {**Random_Buffer.sample(self, batch_size, data_idxs),
+ 'data_idxs': data_idxs, 'weights': weights}
+
def update_priorities(self, data_idxs, priorities):
priorities = self._calculate_new_priority(priorities)
self.max_priority = max(self.max_priority, max(priorities))
for data_idx, priority in zip(data_idxs, priorities):
- self.tree.update(data_idx, priority)
+ self.tree.update(data_idx, priority)
def save(self, path):
Random_Buffer.save(self, path + "Random_Buffer_")
path += self.name
save_data(path, {
- 'tree': self.tree.tree,
- 'tree_count': self.tree.count,
- 'tree_real_size': self.tree.real_size
- })
+ 'tree': self.tree.tree,
+ 'tree_count': self.tree.count,
+ 'tree_real_size': self.tree.real_size
+ })
def load(self, path):
- Random_Buffer.load(self, path + "Random_Buffer_")
+ Random_Buffer.load(self, path + "Random_Buffer_")
path += self.name
data = load_data(path)
self.tree.tree = data['tree']
self.tree.count = data['tree_count']
- self.tree.real_size = data['tree_real_size']
-
+ self.tree.real_size = data['tree_real_size']
+
def _get_idx(self, batch_size):
assert self.real_size >= batch_size, "buffer contains less samples than batch size"
segment = self.tree.total / batch_size
- segment_array = np.random.uniform(segment * np.arange(batch_size), segment * (np.arange(batch_size) + 1))
+ segment_array = np.random.uniform(
+ segment * np.arange(batch_size),
+ segment * (np.arange(batch_size) + 1)
+ )
data_idxs, priorities = self.tree.get(segment_array)
weights = self._calculate_weights(priorities)
if self.beta_changing_curve.lower() == 'exponential':
precision = len(str(self.beta_changing).split('.')[1])
- self.beta = round(1 - np.power(np.exp, -self.beta*self.beta_changing), precision)
- else: self.beta = min(1, self.beta*self.beta_changing)
+            self.beta = round(
+                1 - np.exp(-self.beta*self.beta_changing), precision)
+ else:
+ self.beta = min(1, self.beta*self.beta_changing)
return data_idxs, weights
@@ -146,51 +159,67 @@ def _calculate_weights(self, weights):
weights = weights.astype(np.float32)
return weights
-class Prioritized_Replay_Recurrent_Buffer(Prioritized_Replay_Buffer, Random_Recurrent_Buffer, Random_Buffer):
+
+class Prioritized_Replay_Recurrent_Buffer(Prioritized_Replay_Buffer,
+ Random_Recurrent_Buffer,
+ Random_Buffer):
def __init__(self, **kwargs):
kwargs["num_var"] = 7
self.trace_length = kwargs.get("trace_length", 10)
Prioritized_Replay_Buffer.__init__(self, **kwargs)
Random_Recurrent_Buffer.__init__(self, **kwargs)
self.name = "Prioritized_Replay_Recurrent_Buffer"
-
+
kwargs["size"] = self.trace_length
- self.trace_window = Random_Buffer(**kwargs) #нужно для того чтобы граничные индексы кольцевого буфера из приоритетного выбора были с историческими данными
+        # нужно для того, чтобы граничные индексы кольцевого буфера
+        # из приоритетного выбора были с историческими данными
+ self.trace_window = Random_Buffer(**kwargs)
def clear(self, ):
Prioritized_Replay_Buffer.clear(self,)
self.trace_window.clear()
- def add(self, samples, priority = None):
+ def add(self, samples, priority=None):
if self.trace_window.real_size != self.trace_length:
- self.trace_window.add(samples)
+ self.trace_window.add(samples)
else:
- if self.real_size != self.size: self.trace_window.add(samples)
- else: self.trace_window.add(self.data[self.count])
- Prioritized_Replay_Buffer.add(self, samples, priority)
+ if self.real_size != self.size:
+ self.trace_window.add(samples)
+ else:
+ self.trace_window.add(self.data[self.count])
+ Prioritized_Replay_Buffer.add(self, samples, priority)
def sample(self, batch_size):
- if self.data[-1][1] == 0: self.data[-self.trace_length:] = self.trace_window.data
- data_idxs, weights = Prioritized_Replay_Buffer._get_idx(self, batch_size)
+ if self.data[-1][1] == 0:
+ self.data[-self.trace_length:] = self.trace_window.data.data_array
+ data_idxs, weights = Prioritized_Replay_Buffer._get_idx(
+ self, batch_size)
data = Random_Recurrent_Buffer.sample(self, batch_size, data_idxs)
data = self.add_trace_window(data, data_idxs)
- return {**data, 'data_idxs': data_idxs, 'weights': weights}
+ return {**data, 'data_idxs': data_idxs, 'weights': weights}
def add_trace_window(self, data, data_idxs):
- error_idx = np.where((data_idxs < self.count + self.trace_length) & (data_idxs > self.count))[0]
+ error_idx = np.where(
+ (data_idxs < self.count + self.trace_length) &
+ (data_idxs > self.count)
+ )[0]
errors = data_idxs[error_idx]
count = abs(errors - self.count - self.trace_length)
for e, c in zip(error_idx, count):
repair_data = self.get_repair_data(c)
z = np.arange(c, 0, -1)-1
for key in data.keys():
- if key in ('h_t', 'c_t'): z = np.arange(2, 0, -1)-1
- data[key][e][z] = repair_data[key][-2:] if key in ('h_t', 'c_t') else repair_data[key]
+ if key in ('h_t', 'c_t'):
+ z = np.arange(2, 0, -1)-1
+ data[key][e][z] = repair_data[key][-2:] if key in (
+ 'h_t', 'c_t') else repair_data[key]
return data
def get_repair_data(self, count):
- l = np.arange(count) + 1
- return self.trace_window.sample(count, self.trace_window.count - l)
+ length = np.arange(count) + 1
+ return self.trace_window.sample(count,
+ self.trace_window.count - length)
def save(self, path):
Prioritized_Replay_Buffer.save(self, path)
@@ -198,4 +227,4 @@ def save(self, path):
def load(self, path):
Prioritized_Replay_Buffer.load(self, path)
- self.trace_window.load(path+'small_buffer_')
+ self.trace_window.load(path+'small_buffer_')
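
The prioritized buffer keeps the `Sum_Tree` and the data store in lockstep: `add` assigns `max_priority` unless a priority is passed, `sample` returns `data_idxs` and importance `weights` alongside the batch, and `update_priorities` writes the fresh |td-error| back into the tree. A usage sketch with made-up shapes and priorities, assuming the package import path shown in the diff header:

```
import numpy as np

from rl_lib.src.replay_buffers.priority_buffers import Prioritized_Replay_Buffer

buffer = Prioritized_Replay_Buffer(size=1000, n_step=1, discount_factor=0.99)

for _ in range(64):
    s = np.random.rand(4).astype(np.float32)
    s1 = np.random.rand(4).astype(np.float32)
    buffer.add((s, np.random.randint(2), 1.0, False, s1))   # (s, a, r, d, s')

batch = buffer.sample(32)
print(sorted(batch.keys()))   # includes 'data_idxs' and 'weights'

# after the training step, refresh priorities with the new |td-error|
buffer.update_priorities(batch['data_idxs'], np.abs(np.random.rand(32)))
```
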
diff --git a/rl_lib/src/replay_buffers/random_buffers.py b/rl_lib/src/replay_buffers/random_buffers.py
index 0bc5157..fc3ee07 100644
--- a/rl_lib/src/replay_buffers/random_buffers.py
+++ b/rl_lib/src/replay_buffers/random_buffers.py
@@ -1,5 +1,8 @@
import numpy as np
-from ..data_saver.utils import save_data, load_data
+
+from ..data_saver.utils import load_data, save_data
+from .dict_array import DictArray
+
class _n_step_buffer:
def __init__(self, **kwargs):
@@ -13,94 +16,107 @@ def clear(self, ):
def add(self, memory_tuplet):
state, action, reward, done, next_state, *another_data = memory_tuplet
- if len(self.buffer[0])==0:
- self.buffer[-1]= [ state, action, [reward], None, None, *[None for _ in range(len(another_data))]]
+ if len(self.buffer[0]) == 0:
+ self.buffer[-1] = [state, action, [reward], None,
+ None, *[None for _ in range(len(another_data))]]
- if len(self.buffer[0][2]) == self.steps-1 or (len(self.buffer[0][2]) == self.steps and self.steps == 1):
- self.buffer[0][3]=int(done)
- self.buffer[0][4]=next_state
- for i in range(len(another_data)):
- self.buffer[0][i+5]=another_data[i]
+ if len(
+ self.buffer[0][2]
+ ) == self.steps-1 or (
+ len(self.buffer[0][2]) == self.steps and self.steps == 1
+ ):
+ self.buffer[0][3] = int(done)
+ self.buffer[0][4] = next_state
+ for i in range(len(another_data)):
+ self.buffer[0][i+5] = another_data[i]
for j in range(len(self.buffer)-1):
- self.buffer[j][2].append(reward * self.discount_factor**len(self.buffer[j][2]))
+ self.buffer[j][2].append(
+ reward * self.discount_factor**len(self.buffer[j][2]))
if len(self.buffer) != 1:
- self.buffer[-1]=[state, action, [reward], None, None, *[None for _ in range(len(another_data))]]
+ self.buffer[-1] = [state, action, [reward], None,
+ None, *[None for _ in range(len(another_data))]]
self.buffer.append([])
if len(self.buffer) > self.steps:
- self.buffer[0][2] = sum(self.buffer[0][2])
- return_data = self.buffer[0]
- self.buffer = self.buffer[1:]
- return return_data
+ self.buffer[0][2] = sum(self.buffer[0][2])
+ return_data = self.buffer[0]
+ self.buffer = self.buffer[1:]
+ return return_data
return None
class Random_Buffer:
'''Сохраняет переходы (s,a,r,d,s') и возвращает батчи.
-
+
Аргументы:
size: int. Размер буфера
n_step: int. N-step алгоритм
discount_factor: float
- num_var: int (Кол-во сохраянемых переменных, по умполчанию 5 (s, a, r, d, s'))
+        num_var: int, Кол-во сохраняемых переменных,
+            по умолчанию 5 (s, a, r, d, s')
'''
+
def __init__(self, **kwargs):
n_step = kwargs.get("n_step", 1)
self.size = kwargs.get("size", 100000)
- discount_factor = kwargs.get("discount_factor", 0.99)
+ # discount_factor = kwargs.get("discount_factor", 0.99)
num_var = kwargs.get("num_var", 5)
- # буфер для хранения перехода
- self.data = np.zeros((self.size, num_var), dtype=object)
+ self.hash_table = kwargs.get("var_names",
+ {"state": 0,
+ "action": 1,
+ "reward": 2,
+ "done": 3,
+ "next_state": 4}
+ )
+
+ # буфер для хранения перехода
+ self.data = DictArray((self.size, num_var), dtype=object)
self.name = "Random_Buffer"
-
+
# размер буфера
self.count = 0
self.real_size = 0
-
+
self.n_step_buffer = _n_step_buffer(**kwargs) if n_step > 1 else None
def clear(self, ):
- self.data = np.zeros(self.data.shape, dtype=object)
+ self.data = DictArray(self.data.shape, dtype=object)
self.count = 0
self.real_size = 0
- if self.n_step_buffer != None: self.n_step_buffer.clear()
+ if self.n_step_buffer is not None:
+ self.n_step_buffer.clear()
def add(self, samples: tuple, args=None):
- """Добавляет данные в буфер"""
- if self.n_step_buffer != None:
- result = self.n_step_buffer.add(samples)
- if result != None:
- return self._add_data(result)
- return False
+ """Добавляет данные в буфер s,a,r,n_s,d,
+ индексы данных должны быть равны индексам в hash_table.
+ Автоматической проверки нет"""
+ if self.n_step_buffer is not None:
+ result = self.n_step_buffer.add(samples)
+ if result is not None:
+ return self._add_data(result)
+ return False
else:
- return self._add_data(samples)
-
+ return self._add_data(samples)
+
def sample(self, batch_size, idx=None):
"""Возвращает батч: dict"""
- if np.any(idx) == None:
- idx = self._get_idx( batch_size)
- state = np.stack(self.data[idx, 0], axis=0).astype(np.float32)
- action = np.stack(self.data[idx, 1], axis=0).astype(np.float32)
- reward = self.data[idx, 2].astype(np.float32)
- done = self.data[idx, 3].astype(np.float32)
- next_state = np.stack(self.data[idx, 4], axis=0).astype(np.float32)
- other_data = {}
- if 5 < self.data.shape[1] <= 7:
- other_data = {key: np.stack(self.data[idx, i], axis=0).astype(np.float32) for i, key in zip(range(5,7), ('h_t', 'c_t'))}
-
- return {'state': state, 'action': action, 'reward': reward, 'done': done, 'next_state': next_state, **other_data}
+        if idx is None:
+ idx = self._get_idx(batch_size)
+ data = self.data[idx]
+ return {key: data[val]
+ for key, val in self.hash_table.items()}
def save(self, path):
path += self.name
save_data(path, {
- 'data': self.data,
- 'count': self.count,
- 'size': self.size,
- 'real_size': self.real_size
- })
+ 'data': self.data,
+ 'count': self.count,
+ 'size': self.size,
+ 'real_size': self.real_size
+ })
def load(self, path):
path += self.name
@@ -110,15 +126,15 @@ def load(self, path):
self.size = data['size']
self.real_size = data['real_size']
-
def _add_data(self, samples):
- self.data[self.count, :] = samples
+ self.data[self.count] = samples
self.count = (self.count + 1) % self.size
self.real_size = min(self.size, self.real_size + 1)
- return True
+ return True
def _get_idx(self, batch_size):
- return np.random.choice(self.real_size, size = batch_size, replace = False)
+ return np.random.choice(self.real_size, size=batch_size, replace=False)
+
class Random_Recurrent_Buffer(Random_Buffer):
'''
@@ -126,37 +142,44 @@ class Random_Recurrent_Buffer(Random_Buffer):
size: int. Размер буфера
n_step: int. N-step алгоритм
discount_factor: float
- num_var: int (Кол-во сохраняемых переменных, по умполчанию 7 (s, a, r, d, s', h, c))
+ num_var: int, Кол-во сохраняемых переменных,
+            по умолчанию 7 (s, a, r, d, s', h, c)
trace_length: int. Длина возращаемой последовательности
'''
+
def __init__(self, **kwargs):
kwargs["num_var"] = 7
- super().__init__(**kwargs)
+ kwargs["var_names"] = {"state": 0,
+ "action": 1,
+ "reward": 2,
+ "done": 3,
+ "next_state": 4,
+ "h_t": 5,
+ "c_t": 6}
+ Random_Buffer.__init__(self, **kwargs)
self.name = "Random_Recurrent_Buffer"
self.trace_length = kwargs.get("trace_length", 10)
+ def _make_linspace(self, idx):
+ idx = np.linspace(start=idx - self.trace_length,
+ stop=idx, num=self.trace_length+1,
+ dtype=int, axis=1)[:, :-1]
+ return idx
+
+ def _get_idx(self, batch_size, *args, **kwargs):
+ if self.real_size != self.size:
+ return self._make_linspace(np.random.randint(low=self.trace_length,
+ high=self.real_size, size=(batch_size,)))
+ else:
+ return self._make_linspace(np.random.randint(
+ low=-self.size + self.count + self.trace_length,
+ high=self.count, size=(batch_size,)
+ ))
+
def sample(self, batch_size, idx=None):
- if np.any(idx) == None:
- idx = self._get_idx_rec(batch_size)
-
- vector_idx = np.linspace(start=idx - self.trace_length, stop=idx, num = self.trace_length+1, dtype = int, axis=1)
- mem_idx = vector_idx[:,:2]
-
- state = self.stack(self.data[vector_idx[:, :-1], 0], batch_size).astype(np.float32)
- action = self.stack(self.data[vector_idx[:, :-1], 1], batch_size).astype(np.int32)
- reward = self.data[vector_idx[:, :-1], 2].astype(np.float32)
- done = self.data[vector_idx[:, :-1], 3].astype(np.float32)
- next_state = self.stack(self.data[vector_idx[:, :-1], 4], batch_size).astype(np.float32)
- h_t = self.stack(self.data[mem_idx, 5], batch_size).astype(np.float32)
- c_t = self.stack(self.data[mem_idx, 6], batch_size).astype(np.float32)
-
- return {'state': state, 'action': action, 'reward': reward, 'done': done, 'next_state': next_state, 'h_t': h_t, 'c_t': c_t}
-
- def _get_idx_rec(self, batch_size):
- if self.real_size != self.size:
- return np.random.randint(low = self.trace_length, high = self.real_size, size=(batch_size,))
- else:
- return np.random.randint(low = -self.size + self.count + self.trace_length, high = self.count, size=(batch_size,))
+ if idx is not None:
+ idx = self._make_linspace(idx)
+ return Random_Buffer.sample(self, batch_size, idx)
def stack(self, data, batch_size):
return np.asarray([np.stack(data[i]) for i in range(batch_size)])
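
With the switch to `DictArray`, `Random_Buffer.sample` now assembles the batch dict from `hash_table` instead of hard-coded column indices. A small sketch (indices are passed explicitly here only to keep the example deterministic; by default the buffer draws random indices itself):

```
import numpy as np

from rl_lib.src.replay_buffers.random_buffers import Random_Buffer

buffer = Random_Buffer(size=100, n_step=1)
for _ in range(10):
    s, s1 = np.random.rand(4), np.random.rand(4)
    buffer.add((s, 0, 1.0, False, s1))                # (s, a, r, d, s')

batch = buffer.sample(4, np.arange(4))
# keys follow the default hash_table: state, action, reward, done, next_state
print({k: np.shape(v) for k, v in batch.items()})
```
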
diff --git a/rl_lib/src/replay_buffers/replay_buffer.py b/rl_lib/src/replay_buffers/replay_buffer.py
index ed7dc79..e2503a4 100644
--- a/rl_lib/src/replay_buffers/replay_buffer.py
+++ b/rl_lib/src/replay_buffers/replay_buffer.py
@@ -1,60 +1,71 @@
-from .random_buffers import *
-from .priority_buffers import *
+import threading
+
+from .priority_buffers import (Prioritized_Replay_Buffer,
+ Prioritized_Replay_Recurrent_Buffer)
+from .random_buffers import Random_Buffer, Random_Recurrent_Buffer
+
class ReplayBuffer:
- """Сохраняет переходы и выполняет сэмплирование батчей
-
- Kwargs:
- priority: bool True если приоритетный
- recurrent: bool True если рекуррентный
- size: int Размер буфера
- n_step: int
- discount_factor: float
- num_var: int (Кол-во сохраянемых переменных, по умполчанию 5 (s, a, r, d, s'))
- eps: float
- alpha: float
- beta: float
- beta_changing: float
- beta_changing_curve: str
- max_priority: float Максимальный приоритет при добавлении новых данных
- trace_length: int. Длина возращаемой последовательности
- """
-
- def __init__(self, **kwargs):
- self._config = kwargs
- if kwargs.get("priority", 0) :
- if kwargs.get("recurrent", 0): self.buffer = Prioritized_Replay_Recurrent_Buffer(**kwargs)
- else: self.buffer = Prioritized_Replay_Buffer(**kwargs)
- else:
- if kwargs.get("recurrent", 0): self.buffer = Random_Recurrent_Buffer(**kwargs)
- else: self.buffer = Random_Buffer(**kwargs)
-
- def add(self, *args):
- self.buffer.add(*args)
-
- @property
- def config(self):
- return self._config
-
- def clear(self):
- self.buffer.clear()
-
- def load(self, *args):
- self.buffer.load(*args)
-
- @property
- def name(self):
- return self.buffer.name
-
- @property
- def real_size(self):
- return self.buffer.real_size
-
- def sample(self, *args):
- return self.buffer.sample(*args)
-
- def save(self, *args):
- self.buffer.save(*args)
-
- def update_priorities(self, *args):
- self.buffer.update_priorities(*args)
+ """Сохраняет переходы и выполняет сэмплирование батчей
+
+ Kwargs:
+ priority: bool True если приоритетный
+ recurrent: bool True если рекуррентный
+ size: int Размер буфера
+ n_step: int
+ discount_factor: float
+        num_var: int, Кол-во сохраняемых переменных,
+            по умолчанию 5 (s, a, r, d, s')
+ eps: float
+ alpha: float
+ beta: float
+ beta_changing: float
+ beta_changing_curve: str
+ max_priority: float,
+ Максимальный приоритет при добавлении новых данных
+        trace_length: int. Длина возвращаемой последовательности
+ """
+
+ def __init__(self, **kwargs):
+ self._config = kwargs
+ if kwargs.get("priority", 0):
+ if kwargs.get("recurrent", 0):
+ self.buffer = Prioritized_Replay_Recurrent_Buffer(**kwargs)
+ else:
+ self.buffer = Prioritized_Replay_Buffer(**kwargs)
+ else:
+ if kwargs.get("recurrent", 0):
+ self.buffer = Random_Recurrent_Buffer(**kwargs)
+ else:
+ self.buffer = Random_Buffer(**kwargs)
+ self.lock = threading.Lock()
+
+ def add(self, *args):
+        with self.lock:
+            self.buffer.add(*args)
+
+ @property
+ def config(self):
+ return self._config
+
+ def clear(self):
+ self.buffer.clear()
+
+ def load(self, *args):
+ self.buffer.load(*args)
+
+ @property
+ def name(self):
+ return self.buffer.name
+
+ @property
+ def real_size(self):
+        with self.lock:
+            return self.buffer.real_size
+
+ def sample(self, *args):
+        with self.lock:
+            return self.buffer.sample(*args)
+
+ def save(self, *args):
+ self.buffer.save(*args)
+
+ def update_priorities(self, *args):
+        with self.lock:
+            self.buffer.update_priorities(*args)
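
`ReplayBuffer` is a thin facade: the `priority`/`recurrent` flags pick the concrete buffer, and the new `threading.Lock` makes `add`, `sample`, `real_size` and `update_priorities` safe to call from a collector thread and a learner thread at once. A construction sketch with illustrative sizes:

```
from rl_lib.src.replay_buffers.replay_buffer import ReplayBuffer

buffer = ReplayBuffer(priority=True, recurrent=False,
                      size=50000, n_step=1, discount_factor=0.99)
print(buffer.name)   # Prioritized_Replay_Buffer
```
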
diff --git a/rl_lib/src/replay_buffers/tests/test_replay_buffer.py b/rl_lib/src/replay_buffers/tests/test_replay_buffer.py
index c3ea3e7..083acbc 100644
--- a/rl_lib/src/replay_buffers/tests/test_replay_buffer.py
+++ b/rl_lib/src/replay_buffers/tests/test_replay_buffer.py
@@ -1,86 +1,105 @@
-from rl_lib.src.replay_buffers.replay_buffer import ReplayBuffer
import os
-from shutil import rmtree
from copy import deepcopy
-
+from shutil import rmtree
+
+from rl_lib.src.replay_buffers.replay_buffer import ReplayBuffer
+
+
class Test_Replay_Buffer:
- """
- Производит тестирование буфера
+ """
+ Производит тестирование буфера
+
+ buffer_args:
+ priority: bool True если приоритетный
+ recurrent: bool True если рекуррентный
+ size: int Размер буфера
+ n_step: int
+ discount_factor: float
+        num_var: int, Кол-во сохраняемых переменных,
+            по умолчанию 5 (s, a, r, d, s')
+ eps: float
+ alpha: float
+ beta: float
+ beta_changing: float
+ beta_changing_curve: str
+ max_priority: float Максимальный приоритет
+ при добавлении новых данных
+        trace_length: int. Длина возвращаемой последовательности
+ """
+
+ def __init__(self, buffer_args):
+ self.buffer = ReplayBuffer(**buffer_args)
+ self.path = os.getcwd() + '/test_replay_buffer/'
+ if not os.path.isdir(self.path):
+ os.mkdir(self.path)
+
+ def __exit__(self):
+ """Удаляет созданную папку с файлами, если есть"""
+ if os.path.isdir(self.path):
+ rmtree(self.path)
- buffer_args:
- priority: bool True если приоритетный
- recurrent: bool True если рекуррентный
- size: int Размер буфера
- n_step: int
- discount_factor: float
- num_var: int (Кол-во сохраянемых переменных, по умполчанию 5 (s, a, r, d, s'))
- eps: float
- alpha: float
- beta: float
- beta_changing: float
- beta_changing_curve: str
- max_priority: float Максимальный приоритет при добавлении новых данных
- trace_length: int. Длина возращаемой последовательности
- """
- def __init__(self, buffer_args):
- self.buffer = ReplayBuffer(**buffer_args)
- self.path = os.getcwd() + '/test_replay_buffer/'
- if not os.path.isdir(self.path):
- os.mkdir(self.path)
+ def test_new_init_args(self, **buffer_args):
+ """Проверяет переинициализацию с новыми аргументами"""
+ self.buffer = ReplayBuffer(**buffer_args)
- def __exit__(self):
- """Удаляет созданную папку с файлами, если есть"""
- if os.path.isdir(self.path):
- rmtree(self.path)
-
- def test_new_init_args(self, **buffer_args):
- """Проверяет переинициализацию с новыми аргументами"""
- self.buffer = ReplayBuffer(**buffer_args)
+ def test_add_data(self):
+ """Проверяет возможность добавить в буфер данные
+ """
+ pass
- def test_add_data(self):
- """Проверяет возможность добавить в буфер данные"""
- pass
+ def test_samples(self):
+ """Сэмплирует батчи из буфера и проверяет размерности,
+ количество аргументов
+ """
+ pass
- def test_samples(self):
- """Сэмплирует батчи из буфера и проверяет размерности, количество аргументов"""
- pass
+ def test_save(self):
+ """Выполняет сохранение буфера и проверяет появился ли файл буфера
+ """
+ self.buffer.save(self.path)
+ print("Буфер сохранен")
+ files = os.listdir(self.path)
+ file_names = [f.split('.')[0] for f in files if os.path.isfile(
+ os.path.join(self.path, f))]
+ print("Проверка сохранения буфера")
+ print(f"Найдено {len(file_names)} файлов: ", *file_names)
+ assert self.buffer.name in file_names, """Файл не найден,
+ проверка не пройдена"""
+ print('Успешно тест сохранения данных')
- def test_save(self):
- """Выполняет сохранение буфера и проверяет появился ли файл буфера"""
- self.buffer.save(self.path)
- print("Буфер сохранен")
- files = os.listdir(self.path)
- file_names = [f.split('.')[0] for f in files if os.path.isfile(os.path.join(self.path, f))]
- print("Проверка сохранения буфера")
- print(f"Найдено {len(file_names)} файлов: ", *file_names)
- assert self.buffer.name in file_names, "Файл не найден, проверка не пройдена"
- print('Успешно тест сохранения данных')
-
- def test_load(self):
- """Выполняет test_save, потом загружает и проверяет соответствуют ли загруженные файлы сохраненным"""
- self.buffer.save(self.path)
- copy_buffer = deepcopy(self.buffer)
+ def test_load(self):
+ """Выполняет test_save, потом загружает и
+ проверяет соответствуют ли загруженные файлы сохраненным
+ """
+ self.buffer.save(self.path)
+ copy_buffer = deepcopy(self.buffer)
- self.buffer.load(self.path)
- assert self.check_load_data(copy_buffer.buffer.__dict__, self.buffer.buffer.__dict__), "Файлы загрузки не соответствуют настоящим файлам"
- print("Успешный тест зарузки данных")
+ self.buffer.load(self.path)
+ assert self.check_load_data(
+ copy_buffer.buffer.__dict__, self.buffer.buffer.__dict__), "Файлы загрузки не соответствуют настоящим файлам"
+        print("Успешный тест загрузки данных")
- def check_load_data(self, real_data: dict, loaded_data: dict) -> bool:
- for key, value in real_data.items():
- if key == 'tree': continue
- if key == 'trace_window':
- if not self.check_load_data(value.__dict__, loaded_data[key].__dict__): return False
- continue
- if key == 'data':
- if loaded_data[key].all() != value.all(): return False
- continue
- if loaded_data[key] != value: return False
- return True
+ def check_load_data(self, real_data: dict, loaded_data: dict) -> bool:
+ for key, value in real_data.items():
+ if key == 'tree':
+ continue
+ if key == 'trace_window':
+ if not self.check_load_data(value.__dict__,
+ loaded_data[key].__dict__):
+ return False
+ continue
+ if key == 'data':
+ if loaded_data[key].all() != value.all():
+ return False
+ continue
+ if loaded_data[key] != value:
+ return False
+ return True
- def test_all_buffers(self, buffers: list):
- for buffer_type in buffers:
- self.test_new_init_args(buffer_type)
- self.test_add_data()
- self.test_samples()
- self.test_save()
- self.test_load()
+ def test_all_buffers(self, buffers: list):
+ for buffer_type in buffers:
+ self.test_new_init_args(buffer_type)
+ self.test_add_data()
+ self.test_samples()
+ self.test_save()
+ self.test_load()
diff --git a/rl_lib/src/runners/__init__.py b/rl_lib/src/runners/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/rl_lib/src/runners/base_runner.py b/rl_lib/src/runners/base_runner.py
new file mode 100644
index 0000000..bb0b0d3
--- /dev/null
+++ b/rl_lib/src/runners/base_runner.py
@@ -0,0 +1,219 @@
+import abc
+import time
+import traceback
+
+import numpy as np
+from gym import Env
+
+
+class Abc_Base_Env_Runner(abc.ABC):
+ def __init__(self, env: Env, algo) -> None:
+ self.env = env
+ self.algo = algo
+
+ @abc.abstractmethod
+ def _run():
+ "Основная логика обучения"
+ pass
+
+ @abc.abstractmethod
+ def train():
+ "Запускает обучения алгоритма в среде"
+ pass
+
+ @abc.abstractmethod
+ def test():
+ "Запускает тестирование алгоритма в среде"
+ pass
+
+ def save(self):
+        "Сохраняет текущие параметры обучения"
+ self.algo.save()
+
+ def load(self):
+        "Загружает текущие параметры обучения"
+ self.algo.load()
+
+ def run(self):
+ "Запуск процесса обучения нейронной сети в текущей среде"
+ try:
+ self._run()
+ except Exception as e:
+ print(traceback.format_exc())
+ input("Press enter to exit: ")
+
+
+def run_episode(func):
+ "Обертка выполнения эпизода в среде. Работает только с self."
+
+ def wrapper(self, *args, **kwargs):
+ tr = False
+ observation, _ = self.env.reset()
+ self.algo.initial_state()
+ episode_reward = 0
+ other_info = []
+ for env_step in range(1, kwargs.get("steps")):
+ env_step_result, _other_info = func(self, observation)
+ observation, reward, done = env_step_result[:3]
+ if self.new_step_api:
+ tr = env_step_result[3]
+ episode_reward += reward
+            if _other_info:
+                other_info.append(_other_info)
+ if done or tr:
+ break
+ return episode_reward, other_info
+ return wrapper
+
+
+class Base_Env_Runner(Abc_Base_Env_Runner):
+ """Этот класс реализует в себе все методы для обучения нейронной сети.
+ Для запуска обучения просто нужно передать
+ все параметры обучения алгоритма, среду и алгоритм.
+
+ Args:
+ env: gym.Env
+ algo: Any, какой либо алгоритм из rl_lib.src.algoritms
+ """
+
+ def __init__(self, env: Env, algo,
+ episodes: int = None,
+ env_steps: int = None,
+ env_test_steps: int = None,
+ pre_train_steps: int = None,
+ test_counts: int = 1,
+ train_frequency: int = None,
+ test_frequency: int = None,
+ copy_weigths_frequency: int = 1,
+ new_step_api: bool = False,
+ save_frequency: int = -1,
+ *args, **kwargs) -> None:
+ super().__init__(env, algo)
+ self.episodes = episodes
+ self.env_steps = env_steps
+ self.env_test_steps = env_test_steps
+ self.pre_train_steps = pre_train_steps
+ self.test_counts = test_counts
+
+ self.train_frequency = train_frequency
+ self.test_frequency = test_frequency
+ self.copy_weigths_frequency = copy_weigths_frequency
+ self.save_frequency = save_frequency
+
+ self.new_step_api = new_step_api
+
+ self.counter = 0
+ self.episode_num = 0
+ self._check_params()
+
+ def _check_params(self):
+ "Проверяет все параметры обучения"
+ assert isinstance(
+ self.episodes, int), "Кол-во эпизодов должно быть int"
+ assert isinstance(
+ self.env_steps, int), "Кол-во шагов в среде должно быть int"
+ assert isinstance(self.env_test_steps,
+ int), "Кол-во тестовых шагов в среде должно быть int"
+ assert isinstance(self.pre_train_steps,
+ int), "Кол-во претреин шагов должно быть int"
+
+ assert isinstance(self.train_frequency,
+ int), "Частота обучения должна быть int"
+ assert isinstance(self.test_frequency, int) \
+ or self.test_frequency is None, "Частота тестов должна быть int"
+ assert isinstance(self.copy_weigths_frequency,
+ int), "Частота копирования весов должна быть int"
+ assert isinstance(self.save_frequency,
+ int), "Частота сохранения должна быть int"
+
+ def _single_explore_step(self, observation):
+ action = self.algo.get_action(observation)
+ env_step_result = self.env.step(action)
+ self.algo.add(
+ (observation, action, *env_step_result[1:3], env_step_result[0]))
+ self.counter += 1
+ return env_step_result
+
+ def _train_step(self):
+ td_error = self.algo.train_step()
+ if self.counter % self.copy_weigths_frequency == 0:
+ res = self.algo.copy_weights()
+ assert res, "Ошибка копирования весов"
+ return td_error
+
+ @run_episode
+ def _train_episode(self, observation=None):
+ td_error = None
+ env_step_result = self._single_explore_step(observation=observation)
+ if self.counter % self.train_frequency == 0 \
+ and self.counter > self.pre_train_steps:
+ td_error = self._train_step()
+ return env_step_result, td_error
+
+ def _single_test_step(self, observation):
+ action = self.algo.get_test_action(observation)
+ env_step_result = self.env.step(action)
+ return env_step_result
+
+ @run_episode
+ def _single_test_episode(self, observation=None) -> float:
+ episode_test_reward = self._single_test_step(observation=observation)
+ return episode_test_reward, None
+
+ def _print_info(self,
+ episode: int,
+ all_rewards: list,
+ episode_reward: float,
+ avg_test_reward: float,
+ all_td_error: list,
+ start_time: float,
+ counter: int):
+ print(" Episode %d - Reward = %.3f, episode reward = %.3f, test reward %.3f, Loss = %.6f, Time = %.f sec, Total steps = %.f" %
+ (
+ episode,
+ np.asarray(
+ all_rewards[-10:]).mean() if len(all_rewards) != 0 else 0,
+ episode_reward,
+ avg_test_reward,
+ np.asarray(all_td_error).mean() if len(
+ all_td_error) != 0 else 0,
+ time.time()-start_time,
+ counter
+ )
+ )
+
+ def _run(self):
+ all_td_error = []
+ all_rewards = []
+ avg_test_reward = 0
+ for episode in range(self.episodes):
+ start_time = time.time()
+ episode_reward, td_error = self.train()
+ if episode % self.test_frequency == 0:
+ avg_test_reward = self.test()
+ if self.save_frequency > 0 and \
+ episode % self.save_frequency == 0:
+ self.save()
+ all_rewards.append(episode_reward)
+ if td_error:
+ all_td_error.extend(td_error)
+ self._print_info(
+ episode=episode,
+ all_rewards=all_rewards,
+ episode_reward=episode_reward,
+ avg_test_reward=avg_test_reward,
+ all_td_error=all_td_error,
+ start_time=start_time,
+ counter=self.counter
+ )
+
+ def train(self):
+ return self._train_episode(self,
+ observation=None,
+ steps=self.env_steps)
+
+ def test(self):
+ test_reward = []
+ for _ in range(self.test_counts):
+ test_reward.append(self._single_test_episode(
+ self, observation=None, steps=self.env_test_steps)[0])
+ return sum(test_reward)/len(test_reward)
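
The `run_episode` wrapper hides the reset/step/accumulate loop and the difference between the 4-tuple and 5-tuple `env.step` return values (`new_step_api`). Below is a standalone sketch of that loop, assuming the gym >= 0.26 `reset()`/`step()` signatures the runner itself relies on; `CartPole-v1` and the random policy are illustrative only:

```
import gym

def run_episode_sketch(env, policy, steps, new_step_api=False):
    # simplified version of run_episode: reset, step, accumulate reward, stop on done
    observation, _ = env.reset()
    episode_reward, terminated, truncated = 0.0, False, False
    for _ in range(1, steps):
        result = env.step(policy(observation))
        observation, reward, terminated = result[:3]
        if new_step_api:                 # 5-tuple API also reports truncation
            truncated = result[3]
        episode_reward += reward
        if terminated or truncated:
            break
    return episode_reward

env = gym.make("CartPole-v1")
print(run_episode_sketch(env, lambda obs: env.action_space.sample(),
                         steps=200, new_step_api=True))
```
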
diff --git a/rl_lib/tests/config.yaml b/rl_lib/tests/config.yaml
index 9ff0efb..afd1539 100644
--- a/rl_lib/tests/config.yaml
+++ b/rl_lib/tests/config.yaml
@@ -1,47 +1,62 @@
-#default DQN config
+#default DDPG config
model_config:
- model: None
- name: "_cart_pole"
+ name: "_test_CarRacing"
input_shape: None
action_space: None
- lstm_size: 64
- discount_factor : 0.9
- n_step: 3
- priority: True
- batch_size: 32
+ discount_factor : 0.99
+ n_step: 1
+ batch_size: 16
double_network: False
- tau: 1.0
-
-optimizer_config:
- optimizer_name: "adam"
- optimizer_params:
- learning_rate: 0.01
- epsilon: 0.001
- clipnorm: 1.0
- custom_optimizer: None
+ priority: False
+
+
+actor_model_config:
+ model_config:
+ model: None
+ tau: 0.001
+
+critic_model_config:
+ model_config:
+ model: None
+ tau: 0.001
+
+actor_optimizer_config:
+ optimizer_config:
+ optimizer_name: "adam"
+ optimizer_params:
+ learning_rate: 0.0001
+ epsilon: 0.001
+ clipnorm: 1.0
+ custom_optimizer: None
+
+critic_optimizer_config:
+ optimizer_config:
+ optimizer_name: "adam"
+ optimizer_params:
+ learning_rate: 0.001
+ epsilon: 0.001
+ clipnorm: 1.0
+ custom_optimizer: None
buffer_config:
size: 100000
- priority: True
- recurrent: True
- trace_length: 10
- recurrent_skip: 5
eps: 0.01
alpha: 0.5
beta: 0.4
beta_changing: 0.0005
beta_changing_curve: 'linear'
- max_priority: 0.01
+ max_priority: 0.1
exploration_config:
- strategy_name: "epsilon_greedy"
+ strategy_name: "ou_noise"
strategy_config:
- eps_decay_steps: 5000
- eps_max: 1.0
- eps_min: 0.01
- eps_test: 0.001
+ alpha: 0.0
+ sigma: 0.2
action_space: None
+ dt: 0.01
+ mean: None
+ theta: 0.15
data_saver:
path: "..\\rl_lib\\rl_lib\\tests\\models/"
diff --git a/rl_lib/tests/ddpg_config.yaml b/rl_lib/tests/ddpg_config.yaml
deleted file mode 100644
index 408d396..0000000
--- a/rl_lib/tests/ddpg_config.yaml
+++ /dev/null
@@ -1,61 +0,0 @@
-#default DDPG config
-
-model_config:
- name: "_test_Pendulum"
- input_shape: None
- action_space: None
- discount_factor : 0.9
- n_step: 1
- batch_size: 64
- double_network: False
- priority: False
-
-
-actor_model_config:
- model_config:
- model: None
- tau: 0.01
-
-critic_model_config:
- model_config:
- model: None
- tau: 0.01
-
-actor_optimizer_config:
- optimizer_config:
- optimizer_name: "adam"
- optimizer_params:
- learning_rate: 0.001
- epsilon: 0.001
- clipnorm: 1.0
- custom_optimizer: None
-
-critic_optimizer_config:
- optimizer_config:
- optimizer_name: "adam"
- optimizer_params:
- learning_rate: 0.002
- epsilon: 0.001
- clipnorm: 1.0
- custom_optimizer: None
-
-buffer_config:
- size: 50000
- eps: 0.01
- alpha: 0.5
- beta: 0.4
- beta_changing: 0.0005
- beta_changing_curve: 'linear'
- max_priority: 0.1
-
-exploration_config:
- strategy_name: "ou_noise"
- strategy_config:
- alpha: 0.3
- sigma: 0.2
- action_space: None
-
-data_saver:
- path: "..\\rl_lib\\rl_lib\\tests\\models/"
- copy_path: ""
-
diff --git a/rl_lib/tests/dqn_config.yaml b/rl_lib/tests/dqn_config.yaml
index b7d2c72..4907a7b 100644
--- a/rl_lib/tests/dqn_config.yaml
+++ b/rl_lib/tests/dqn_config.yaml
@@ -8,8 +8,8 @@ model_config:
discount_factor : 0.9
n_step: 1
batch_size: 32
- double_network: True
- priority: False
+ double_network: False
+ priority: True
tau: 1.0
optimizer_config:
diff --git a/rl_lib/tests/drqn_config.yaml b/rl_lib/tests/drqn_config.yaml
new file mode 100644
index 0000000..91b8c90
--- /dev/null
+++ b/rl_lib/tests/drqn_config.yaml
@@ -0,0 +1,49 @@
+#default DRQN config
+
+model_config:
+ model: None
+ name: "_cart_pole"
+ input_shape: None
+ action_space: None
+ lstm_size: 64
+ discount_factor : 0.9
+ n_step: 1
+ priority: False
+ batch_size: 32
+ double_network: False
+ tau: 1.0
+
+optimizer_config:
+ optimizer_name: "adam"
+ optimizer_params:
+ learning_rate: 0.01
+ epsilon: 0.001
+ clipnorm: 1.0
+ custom_optimizer: None
+
+buffer_config:
+ size: 10000
+ priority: True
+ recurrent: True
+ trace_length: 10
+ recurrent_skip: 5
+ eps: 0.01
+ alpha: 0.5
+ beta: 0.4
+ beta_changing: 0.0005
+ beta_changing_curve: 'linear'
+ max_priority: 0.1
+
+exploration_config:
+ strategy_name: "epsilon_greedy"
+ strategy_config:
+ eps_decay_steps: 5000
+ eps_max: 1.0
+ eps_min: 0.01
+ eps_test: 0.001
+ action_space: None
+
+data_saver:
+ path: "..\\rl_lib\\rl_lib\\tests\\models/"
+ copy_path: ""
+
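
The recurrent buffer keys above (trace_length: 10, recurrent_skip: 5) suggest that stored episodes are cut into overlapping, fixed-length traces for LSTM training. That slicing semantics is an assumption; a short sketch of what it would look like:

# Assumed trace slicing for a recurrent (DRQN) replay buffer.
def make_traces(episode_transitions, trace_length=10, recurrent_skip=5):
    traces = []
    for start in range(0, len(episode_transitions) - trace_length + 1, recurrent_skip):
        traces.append(episode_transitions[start:start + trace_length])
    return traces

# A 25-step episode with trace_length=10 and recurrent_skip=5 yields
# traces starting at steps 0, 5, 10 and 15.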
diff --git a/rl_lib/tests/first_test_ddpg.py b/rl_lib/tests/first_test_ddpg.py
index 9d21a68..83eed35 100644
--- a/rl_lib/tests/first_test_ddpg.py
+++ b/rl_lib/tests/first_test_ddpg.py
@@ -1,52 +1,75 @@
+import os.path as os_path
+import time
+import traceback
+from pprint import pprint
+
import gym
import numpy as np
-import time
-import os.path as os_path
-from tensorflow.keras import layers
import tensorflow as tf
-from pprint import pprint
-import traceback
+from tensorflow.keras import layers
-from rl_lib.src.algoritms.ddpg.ddpg import DDPG
+from rl_lib import DDPG
from rl_lib.src.data_saver.utils import load_default_config
-env = gym.make('BipedalWalker-v3')
+env = gym.make('Walker2d-v4')
+
+initializer = tf.keras.initializers.RandomUniform(
+ minval=-3*1e-4, maxval=3*1e-4, seed=40)
+
def create_conv():
input_layer = layers.Input(shape=env.observation_space.shape, )
- cov_layer1 = layers.Conv2D(16, 7, activation='relu')(input_layer)
- cov_layer2 = layers.Conv2D(32, 5, activation='relu')(cov_layer1)
- conv_out = layers.Flatten()(cov_layer2)
- return tf.keras.Model(inputs=input_layer, outputs=conv_out)
+ rescaling_layer = layers.experimental.preprocessing.Rescaling(
+ 1.0 / 127.5, offset=-1)(input_layer)
+ cov_layer1 = layers.Conv2D(
+ 32, 7, 4, activation='relu', kernel_initializer=initializer)(rescaling_layer)
+ cov_layer2 = layers.Conv2D(
+ 32, 5, 2, activation='relu', kernel_initializer=initializer)(cov_layer1)
+ cov_layer3 = layers.Conv2D(
+ 32, 3, 2, activation='relu', kernel_initializer=initializer)(cov_layer2)
+ conv_out = layers.Flatten()(cov_layer3)
+ return tf.keras.Model(inputs=input_layer, outputs=conv_out)
+
def create_model():
"""Создает модель tf.keras.Model, архитектура DQN"""
input_layer = layers.Input(shape=env.observation_space.shape, )
# conv_out = create_conv()(input_layer)
- dence_layer1 = layers.Dense(64, activation='relu')(input_layer)
- dence_layer2 = layers.Dense(64, activation='relu')(dence_layer1)
- dence_out = layers.Dense(env.action_space.shape[0], activation='tanh')(dence_layer2)
+ dence_layer1 = layers.Dense(
+ 256, activation='relu', kernel_initializer=initializer)(input_layer)
+ dence_layer2 = layers.Dense(
+ 256, activation='relu', kernel_initializer=initializer)(dence_layer1)
+ dence_out = layers.Dense(
+ env.action_space.shape[0], activation='tanh', kernel_initializer=initializer)(dence_layer2)
+
+ dence_out = dence_out * \
+ tf.reduce_max((tf.abs(env.action_space.low), env.action_space.high))
- dence_out = dence_out*tf.reduce_max((tf.abs(env.action_space.low), env.action_space.high))
-
return tf.keras.Model(inputs=input_layer, outputs=dence_out)
+
def create_critic_model():
"""Создает модель tf.keras.Model, архитектура DQN, начальные слои - сверточные"""
input_layer = layers.Input(shape=env.observation_space.shape, )
- obsv_layer = layers.Dense(16, activation='relu')(input_layer)
- obsv_layer = layers.Dense(32, activation='relu')(obsv_layer)
+ obsv_layer = layers.Dense(128, activation='relu',
+ kernel_initializer=initializer)(input_layer)
+ obsv_layer = layers.Dense(64, activation='relu',
+ kernel_initializer=initializer)(obsv_layer)
input_action_layer = layers.Input(shape=env.action_space.shape, )
- action_layer = layers.Dense(32, activation='relu')(input_action_layer)
-
+ action_layer = layers.Dense(
+ 32, activation='relu', kernel_initializer=initializer)(input_action_layer)
+
# conv_out = create_conv()(input_layer)
- concat = layers.Concatenate()((input_layer, action_layer))
+ concat = layers.Concatenate()((obsv_layer, action_layer))
flatten = layers.Flatten()(concat)
- dence_layer1 = layers.Dense(64, activation='relu')(flatten)
- dence_layer2 = layers.Dense(64, activation='relu')(dence_layer1)
- dence_out = layers.Dense(env.action_space.shape[0], activation=None)(dence_layer2)
-
- return tf.keras.Model(inputs=[input_layer, input_action_layer], outputs=dence_out)
+ dence_layer1 = layers.Dense(
+ 256, activation='relu', kernel_initializer=initializer)(flatten)
+ dence_layer2 = layers.Dense(
+ 256, activation='relu', kernel_initializer=initializer)(dence_layer1)
+ dence_out = layers.Dense(1, activation=None)(dence_layer2)
+
+ return tf.keras.Model(inputs=[input_layer, input_action_layer], outputs=dence_out)
+
config = load_default_config(__file__)
@@ -60,18 +83,21 @@ def create_critic_model():
algo = DDPG(config)
+# algo.load()
pprint(algo.config)
+
def run(algo):
epidodes = 250
- steps = 500
+ steps = 250
train_frequency = 1
- test_frequency = 10
- test_steps = 500
- pre_train_steps = 1000
+ test_frequency = 30
+ save_frequency = 10
+ test_steps = 250
+ pre_train_steps = 1
copy_weigths_frequency = 1
- #history data
+ # history data
rewards = []
episode_reward = 0
episode_test_reward = 0
@@ -83,9 +109,10 @@ def run(algo):
observation, info = env.reset()
episode_reward = 0
- for step in range(1, steps+1):
+ episode_loss = []
+ while True: # for step in range(1, steps+1):
action = algo.get_action(observation)
- new_observation, reward, done, _, info = env.step(action)
+ new_observation, reward, done, tr, info = env.step(action)
algo.add((observation, action, reward, done, new_observation))
episode_reward += reward
count += 1
@@ -95,42 +122,43 @@ def run(algo):
if count % copy_weigths_frequency == 0:
res = algo.copy_weights()
observation = new_observation
- if done:
+ if done or tr:
break
- algo.save()
+ if episode % save_frequency == 0:
+ algo.save()
rewards.append(episode_reward)
- #testing algoritm perfomans
- if episode%test_frequency == 0:
+ # testing algorithm performance
+ if episode % test_frequency == 0:
observation, info = env.reset()
episode_test_reward = 0
- for test_step in range(1, test_steps+1):
+ while True: # for test_step in range(1, test_steps+1):
action = algo.get_test_action(observation)
- observation, test_reward, done, _, info = env.step(action)
+ observation, test_reward, done, tr, info = env.step(action)
episode_test_reward += test_reward
- if done:
+ if done or tr:
break
-
- #print info
+ # print info
print(" Episode %d - Reward = %.3f, episode reward = %.3f, test reward %.3f, Loss = %.6f, Time = %.f sec, Total steps = %.f" %
- (
- episode,
- np.asarray(rewards[-10:]).mean() if len(rewards) != 0 else 0,
- episode_reward,
- episode_test_reward,
- np.asarray(episode_loss).mean() if len(episode_loss) != 0 else 0,
- time.time()-start_time,
- count
- )
- )
- algo.load()
+ (
+ episode,
+ np.asarray(rewards[-10:]).mean() if len(rewards) != 0 else 0,
+ episode_reward,
+ episode_test_reward,
+ np.asarray(episode_loss).mean() if len(
+ episode_loss) != 0 else 0,
+ time.time()-start_time,
+ count
+ )
+ )
+ # algo.load()
+
if __name__ == "__main__":
try:
run(algo=algo)
-
+
except Exception:
print(traceback.format_exc())
input("Press enter to exit: ")
-
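
The switch from "done, _" to "done, tr" above follows the five-value step API of recent Gym/Gymnasium releases, where env.step returns separate terminated and truncated flags and an episode should stop when either one is set. A minimal loop showing the convention (independent of rl_lib):

import gym

env = gym.make("Pendulum-v1")  # any Gym >= 0.26 environment
obs, info = env.reset()
while True:
    action = env.action_space.sample()
    obs, reward, terminated, truncated, info = env.step(action)
    if terminated or truncated:  # true terminal state vs. time-limit truncation
        break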
diff --git a/rl_lib/tests/first_test_dqn.py b/rl_lib/tests/first_test_dqn.py
index b03f6ae..6779de9 100644
--- a/rl_lib/tests/first_test_dqn.py
+++ b/rl_lib/tests/first_test_dqn.py
@@ -1,27 +1,30 @@
+import os.path as os_path
+import time
+import traceback
+from pprint import pprint
+
import gym
import numpy as np
-import time
-import os.path as os_path
-from tensorflow.keras import layers
import tensorflow as tf
-from pprint import pprint
-import traceback
+from tensorflow.keras import layers
-from rl_lib.src.algoritms.dqn.dqn import DQN
+from rl_lib.src.algoritms.model_free.value_based import DQN
from rl_lib.src.data_saver.utils import load_default_config
env = gym.make('CartPole-v0')
+
def create_model():
"""Создает модель tf.keras.Model, архитектура DQN"""
input_layer = layers.Input(shape=env.observation_space.shape, )
dence_layer1 = layers.Dense(32, activation='relu')(input_layer)
dence_layer2 = layers.Dense(32, activation='relu')(dence_layer1)
dence_out = layers.Dense(env.action_space.n, activation=None)(dence_layer2)
-
+
return tf.keras.Model(inputs=input_layer, outputs=dence_out)
-config = load_default_config(__file__)
+
+config = load_default_config("./rl_lib/tests/dqn_config.yaml")
pprint(config)
config['model_config']['model'] = create_model()
config['model_config']['input_shape'] = env.observation_space.shape
@@ -30,6 +33,7 @@ def create_model():
pprint(algo.config)
+
def run(algo):
epidodes = 250
steps = 200
@@ -39,7 +43,7 @@ def run(algo):
pre_train_steps = 2000
copy_weigths_frequency = 100
- #history data
+ # history data
rewards = []
episode_reward = 0
episode_test_reward = 0
@@ -51,6 +55,7 @@ def run(algo):
observation, info = env.reset()
episode_reward = 0
+ episode_loss = []
for step in range(1, steps):
action = algo.get_action(observation)
new_observation, reward, done, _, info = env.step(action)
@@ -66,10 +71,10 @@ def run(algo):
if done:
break
- # algo.save()
+ algo.save()
rewards.append(episode_reward)
- #testing algoritm perfomans
- if episode%test_frequency == 0:
+ # testing algorithm performance
+ if episode % test_frequency == 0:
observation, info = env.reset()
episode_test_reward = 0
for test_step in range(1, test_steps):
@@ -78,27 +83,27 @@ def run(algo):
episode_test_reward += test_reward
if done:
break
-
- #print info
+ # print info
print(" Episode %d - Reward = %.3f, episode reward = %.3f, test reward %.3f, Loss = %.6f, Time = %.f sec, Total steps = %.f" %
- (
- episode,
- np.asarray(rewards[-10:]).mean() if len(rewards) != 0 else 0,
- episode_reward,
- episode_test_reward,
- np.asarray(episode_loss).mean() if len(episode_loss) != 0 else 0,
- time.time()-start_time,
- count
- )
- )
+ (
+ episode,
+ np.asarray(rewards[-10:]).mean() if len(rewards) != 0 else 0,
+ episode_reward,
+ episode_test_reward,
+ np.asarray(episode_loss).mean() if len(
+ episode_loss) != 0 else 0,
+ time.time()-start_time,
+ count
+ )
+ )
algo.load()
+
if __name__ == "__main__":
try:
run(algo=algo)
-
+
except Exception as e:
print(traceback.format_exc())
input("Press enter to exit: ")
-
diff --git a/rl_lib/tests/first_test_drqn.py b/rl_lib/tests/first_test_drqn.py
index e3afbdd..274a136 100644
--- a/rl_lib/tests/first_test_drqn.py
+++ b/rl_lib/tests/first_test_drqn.py
@@ -1,50 +1,55 @@
+import os.path as os_path
+import time
+import traceback
+from pprint import pprint
+import os
+os.environ['XLA_FLAGS'] = '--xla_gpu_cuda_data_dir="C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v11.8"'
import gym
import numpy as np
-import time
-import os.path as os_path
-from tensorflow.keras import layers
import tensorflow as tf
-from pprint import pprint
-import traceback
+from tensorflow.keras import layers
-from rl_lib.src.algoritms.drqn.drqn import DRQN
+from rl_lib.src.algoritms.model_free.value_based import DRQN
from rl_lib.src.data_saver.utils import load_default_config
env = gym.make('CartPole-v0')
-def create_model(lstm_size = 32):
+
+def create_model(lstm_size=32):
"""Создает модель tf.keras.Model, архитектура DRQN"""
- input_layer = layers.Input(shape= (None, *env.observation_space.shape), )
- h_t_input = layers.Input(shape=(lstm_size, ), )
- c_t_input = layers.Input(shape=(lstm_size, ), )
-
+ input_layer = layers.Input(shape=(None, *env.observation_space.shape), )
+ h_t_input = layers.Input(shape=(lstm_size, ), )
+ c_t_input = layers.Input(shape=(lstm_size, ), )
- lstm = layers.LSTM(lstm_size, activation='tanh', recurrent_activation='sigmoid', return_sequences = True,
- return_state=True, stateful = False)(input_layer, initial_state = [h_t_input, c_t_input])
+ lstm = layers.LSTM(lstm_size, activation='tanh', recurrent_activation='sigmoid', return_sequences=True,
+ return_state=True, stateful=False)(input_layer, initial_state=[h_t_input, c_t_input])
dence_layer1 = layers.Dense(32, activation='relu')(lstm[0])
dence_out = layers.Dense(env.action_space.n, activation=None)(dence_layer1)
-
+
return tf.keras.Model(inputs=[input_layer, h_t_input, c_t_input], outputs=[dence_out, lstm[1], lstm[2]])
-config = load_default_config("..\\rl_lib\\rl_lib\\tests/")
-config['model_config']['model'] = create_model(lstm_size=config['model_config']['lstm_size'])
+
+config = load_default_config("./rl_lib/tests/drqn_config.yaml")
+config['model_config']['model'] = create_model(
+ lstm_size=config['model_config']['lstm_size'])
config['model_config']['input_shape'] = env.observation_space.shape
config['model_config']['action_space'] = env.action_space.n
algo = DRQN(config)
pprint(algo.config)
+
def run(algo):
epidodes = 250
steps = 200
train_frequency = 1
test_frequency = 10
test_steps = 200
- pre_train_steps = 2000
+ pre_train_steps = 2500
copy_weigths_frequency = 100
- #history data
+ # history data
rewards = []
episode_reward = 0
episode_test_reward = 0
@@ -57,6 +62,7 @@ def run(algo):
observation, info = env.reset()
algo.initial_state()
episode_reward = 0
+ episode_loss = []
for step in range(1, steps):
action = algo.get_action(observation)
new_observation, reward, done, _, info = env.step(action)
@@ -72,10 +78,10 @@ def run(algo):
if done:
break
- algo.save()
+ algo.save()
rewards.append(episode_reward)
- #testing algoritm perfomans
- if episode%test_frequency == 0:
+ # testing algorithm performance
+ if episode % test_frequency == 0:
observation, info = env.reset()
algo.initial_state()
episode_test_reward = 0
@@ -85,26 +91,26 @@ def run(algo):
episode_test_reward += test_reward
if done:
break
-
- #print info
+ # print info
print(" Episode %d - Reward = %.3f, episode reward = %.3f, test reward %.3f, Loss = %.6f, Time = %.f sec, Total steps = %.f" %
- (
- episode,
- np.asarray(rewards[-10:]).mean() if len(rewards) != 0 else 0,
- episode_reward,
- episode_test_reward,
- np.asarray(episode_loss).mean() if len(episode_loss) != 0 else 0,
- time.time()-start_time,
- count
- )
- )
+ (
+ episode,
+ np.asarray(rewards[-10:]).mean() if len(rewards) != 0 else 0,
+ episode_reward,
+ episode_test_reward,
+ np.asarray(episode_loss).mean() if len(
+ episode_loss) != 0 else 0,
+ time.time()-start_time,
+ count
+ )
+ )
+
if __name__ == "__main__":
try:
run(algo=algo)
-
+
except Exception:
print(traceback.format_exc())
input("Press enter to exit: ")
-
diff --git a/rl_lib/tests/first_test_qr_dqn.py b/rl_lib/tests/first_test_qr_dqn.py
new file mode 100644
index 0000000..4998678
--- /dev/null
+++ b/rl_lib/tests/first_test_qr_dqn.py
@@ -0,0 +1,111 @@
+import os.path as os_path
+import time
+import traceback
+from pprint import pprint
+
+import gym
+import numpy as np
+import tensorflow as tf
+from tensorflow.keras import layers
+
+from rl_lib.src.algoritms.model_free.value_based import QR_DQN
+from rl_lib.src.data_saver.utils import load_default_config
+
+env = gym.make('CartPole-v0')
+
+config = load_default_config("./rl_lib/tests/qr_dqn_config.yaml")
+
+def create_model():
+ """Создает модель tf.keras.Model, архитектура DQN"""
+ input_layer = layers.Input(shape=env.observation_space.shape, )
+ dence_layer1 = layers.Dense(32, activation='relu')(input_layer)
+ dence_layer2 = layers.Dense(32, activation='relu')(dence_layer1)
+
+ dence_out = layers.Dense(env.action_space.n * config['model_config']['num_atoms'],
+ activation=None)(dence_layer2)
+
+ out = layers.Reshape((env.action_space.n, config['model_config']['num_atoms']))(dence_out)
+ return tf.keras.Model(inputs=input_layer, outputs=out)
+
+
+pprint(config)
+config['model_config']['model'] = create_model()
+config['model_config']['input_shape'] = env.observation_space.shape
+config['model_config']['action_space'] = env.action_space.n
+algo = QR_DQN(config)
+
+pprint(algo.config)
+
+def run(algo):
+ epidodes = 250
+ steps = 200
+ train_frequency = 1
+ test_frequency = 10
+ test_steps = 200
+ pre_train_steps = 2000
+ copy_weigths_frequency = 100
+
+ # history data
+ rewards = []
+ episode_reward = 0
+ episode_test_reward = 0
+ episode_loss = []
+ count = 0
+
+ for episode in range(1, epidodes):
+ start_time = time.time()
+
+ observation, info = env.reset()
+ episode_reward = 0
+ episode_loss = []
+ for step in range(1, steps):
+ action = algo.get_action(observation)
+ new_observation, reward, done, _, info = env.step(action)
+ algo.add((observation, action, reward, done, new_observation))
+ episode_reward += reward
+ count += 1
+ if count % train_frequency == 0 and count > pre_train_steps:
+ td_error = algo.train_step()
+ episode_loss.append(td_error)
+ if count % copy_weigths_frequency == 0:
+ res = algo.copy_weights()
+ observation = new_observation
+ if done:
+ break
+
+ algo.save()
+ rewards.append(episode_reward)
+ # testing algorithm performance
+ if episode % test_frequency == 0:
+ observation, info = env.reset()
+ episode_test_reward = 0
+ for test_step in range(1, test_steps):
+ action = algo.get_test_action(observation)
+ observation, test_reward, done, _, info = env.step(action)
+ episode_test_reward += test_reward
+ if done:
+ break
+
+ # print info
+ print(" Episode %d - Reward = %.3f, episode reward = %.3f, test reward %.3f, Loss = %.6f, Time = %.f sec, Total steps = %.f" %
+ (
+ episode,
+ np.asarray(rewards[-10:]).mean() if len(rewards) != 0 else 0,
+ episode_reward,
+ episode_test_reward,
+ np.asarray(episode_loss).mean() if len(
+ episode_loss) != 0 else 0,
+ time.time()-start_time,
+ count
+ )
+ )
+ algo.load()
+
+
+if __name__ == "__main__":
+ try:
+ run(algo=algo)
+
+ except Exception as e:
+ print(traceback.format_exc())
+ input("Press enter to exit: ")
diff --git a/rl_lib/tests/qr_dqn_config.yaml b/rl_lib/tests/qr_dqn_config.yaml
new file mode 100644
index 0000000..da23722
--- /dev/null
+++ b/rl_lib/tests/qr_dqn_config.yaml
@@ -0,0 +1,49 @@
+#default QR_DQN config
+
+model_config:
+ model: None
+ name: "default_QR_DQN"
+ input_shape: None
+ action_space: None
+ discount_factor : 0.99
+ n_step: 1
+ batch_size: 32
+ double_network: False
+ priority: False
+ tau: 1.0
+ num_atoms: 4
+ hubber_k: 1.0
+
+optimizer_config:
+ optimizer_name: "adam"
+ optimizer_params:
+ learning_rate: 0.01
+ epsilon: 0.001
+ clipnorm: 1.0
+ custom_optimizer: None
+
+buffer_config:
+ size: 10000
+ priority: False
+ n_step: None
+ discount_factor : None
+ eps: None
+ alpha: None
+ beta: None
+ beta_changing: None
+ beta_changing_curve: None
+ max_priority: None
+
+exploration_config:
+ strategy_name: "epsilon_greedy"
+ strategy_config:
+ eps_decay_steps: 5000
+ eps_max: 1.0
+ eps_min: 0.01
+ eps_test: 0.001
+ action_space: None
+
+data_saver:
+ path: ""
+ copy_path: ""
+
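
In the QR_DQN config above, num_atoms is the number of quantiles the network predicts per action (the new test model reshapes its output to (action_space, num_atoms)); hubber_k is presumably the Huber threshold of the quantile-regression loss. For reference, a sketch of the standard QR-DQN quantile Huber loss and greedy action selection, as an illustration rather than rl_lib's code:

import numpy as np

def greedy_action(quantiles):
    # quantiles: (n_actions, n_atoms); act greedily w.r.t. the mean over quantiles
    return int(np.argmax(quantiles.mean(axis=-1)))

def quantile_huber_loss(pred, target, k=1.0):
    # pred, target: (n_atoms,) quantile estimates of the chosen action / TD target
    n = pred.shape[0]
    taus = (np.arange(n) + 0.5) / n                    # quantile midpoints
    u = target[None, :] - pred[:, None]                # pairwise TD errors, shape (n, n)
    huber = np.where(np.abs(u) <= k, 0.5 * u ** 2, k * (np.abs(u) - 0.5 * k))
    weight = np.abs(taus[:, None] - (u < 0).astype(np.float32))
    return (weight * huber / k).mean(axis=1).sum()     # mean over target atoms, sum over predicted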
diff --git a/rl_lib/tests/second_test_dqn_w_runner.py b/rl_lib/tests/second_test_dqn_w_runner.py
new file mode 100644
index 0000000..d1c35d0
--- /dev/null
+++ b/rl_lib/tests/second_test_dqn_w_runner.py
@@ -0,0 +1,45 @@
+from pprint import pprint
+
+import gym
+import tensorflow as tf
+from tensorflow.keras import layers
+
+from rl_lib import DQN
+from rl_lib import load_default_config
+from rl_lib import Base_Env_Runner
+
+env = gym.make('CartPole-v0')
+
+
+def create_model():
+ """Создает модель tf.keras.Model, архитектура DQN"""
+ input_layer = layers.Input(shape=env.observation_space.shape, )
+ dence_layer1 = layers.Dense(32, activation='relu')(input_layer)
+ dence_layer2 = layers.Dense(32, activation='relu')(dence_layer1)
+ dence_out = layers.Dense(env.action_space.n, activation=None)(dence_layer2)
+
+ return tf.keras.Model(inputs=input_layer, outputs=dence_out)
+
+
+config = load_default_config("./rl_lib/tests/dqn_config.yaml")
+pprint(config)
+config['model_config']['model'] = create_model()
+config['model_config']['input_shape'] = env.observation_space.shape
+config['model_config']['action_space'] = env.action_space.n
+algo = DQN(config)
+
+pprint(algo.config)
+
+runner = Base_Env_Runner(env=env,
+ algo=algo,
+ episodes=250,
+ env_steps=200,
+ env_test_steps=200,
+ pre_train_steps=2000,
+ test_counts=4,
+ train_frequency=4,
+ test_frequency=10,
+ copy_weigths_frequency=100,
+ new_step_api=True)
+
+runner.run()
diff --git a/rl_lib/tests/second_test_drqn_w_runner.py b/rl_lib/tests/second_test_drqn_w_runner.py
new file mode 100644
index 0000000..4126543
--- /dev/null
+++ b/rl_lib/tests/second_test_drqn_w_runner.py
@@ -0,0 +1,49 @@
+from pprint import pprint
+import os
+os.environ['XLA_FLAGS'] = '--xla_gpu_cuda_data_dir="C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v11.8"'
+import gym
+import numpy as np
+import tensorflow as tf
+from tensorflow.keras import layers
+
+from rl_lib import DRQN, load_default_config, Base_Env_Runner
+
+env = gym.make('CartPole-v0')
+
+
+def create_model(lstm_size=32):
+ """Создает модель tf.keras.Model, архитектура DRQN"""
+
+ input_layer = layers.Input(shape=(None, *env.observation_space.shape), )
+ h_t_input = layers.Input(shape=(lstm_size, ), )
+ c_t_input = layers.Input(shape=(lstm_size, ), )
+
+ lstm = layers.LSTM(lstm_size, activation='tanh', recurrent_activation='sigmoid', return_sequences=True,
+ return_state=True, stateful=False)(input_layer, initial_state=[h_t_input, c_t_input])
+ dence_layer1 = layers.Dense(32, activation='relu')(lstm[0])
+ dence_out = layers.Dense(env.action_space.n, activation=None)(dence_layer1)
+
+ return tf.keras.Model(inputs=[input_layer, h_t_input, c_t_input], outputs=[dence_out, lstm[1], lstm[2]])
+
+
+config = load_default_config("./rl_lib/tests/drqn_config.yaml")
+config['model_config']['model'] = create_model(
+ lstm_size=config['model_config']['lstm_size'])
+config['model_config']['input_shape'] = env.observation_space.shape
+config['model_config']['action_space'] = env.action_space.n
+algo = DRQN(config)
+
+pprint(algo.config)
+runner = Base_Env_Runner(env=env,
+ algo=algo,
+ episodes=250,
+ env_steps=200,
+ env_test_steps=200,
+ pre_train_steps=2000,
+ test_counts=1,
+ train_frequency=1,
+ test_frequency=10,
+ copy_weigths_frequency=100,
+ new_step_api=True)
+
+runner.run()
diff --git a/setup.py b/setup.py
index fa6a509..8d0986d 100644
--- a/setup.py
+++ b/setup.py
@@ -2,11 +2,22 @@
import os
+def find_yaml_files(root):
+ yaml_files = []
+ for foldername, subfolders, filenames in os.walk(root):
+ for filename in filenames:
+ if filename.endswith('.yaml'):
+ yaml_files.append(os.path.relpath(os.path.join(foldername, filename), root))
+ return yaml_files
+
if __name__ == '__main__':
setup(
name='rl_lib',
- version=os.getenv('PACKAGE_VERSION', '0.1.dev0'),
+ version=os.getenv('PACKAGE_VERSION', '0.2.dev0'),
# package_dir={'rl_lib': ''},
packages=find_packages(),
- description='A demo version of the reinforcement learning library.',
+ description='A dev version of the reinforcement learning library.',
+ package_data={
+ '': find_yaml_files('rl_lib'),
+ },
)
\ No newline at end of file
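
For reference, the new find_yaml_files helper walks the package tree and returns every *.yaml path relative to the given root; under package_data the empty-string key applies those patterns to the packages found by find_packages(), so the YAML defaults can be bundled into the built distribution. A small standalone illustration of what the helper returns (paths taken from this diff):

import os

def find_yaml_files(root):
    yaml_files = []
    for foldername, _, filenames in os.walk(root):
        for filename in filenames:
            if filename.endswith('.yaml'):
                yaml_files.append(os.path.relpath(os.path.join(foldername, filename), root))
    return yaml_files

print(find_yaml_files('rl_lib'))
# With the files added in this diff, the list includes e.g.
# 'tests/dqn_config.yaml', 'tests/drqn_config.yaml' and 'tests/qr_dqn_config.yaml'.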