Commit 4ce1e348 authored by Nicola Gatto's avatar Nicola Gatto

Fix RL tests

parent 0cc00c55
Pipeline #159711 failed with stages
......@@ -17,7 +17,7 @@ configuration CartPoleDQN {
use_double_dqn : false
loss : euclidean
loss : huber
replay_memory : buffer{
memory_size : 10000
......
......@@ -8,5 +8,5 @@ implementation Critic(state, action) {
FullyConnected(units=300)
) ->
Add() ->
Relu();
Relu()
}
\ No newline at end of file
......@@ -23,7 +23,7 @@ configuration TorcsDQN {
use_double_dqn : true
loss : euclidean
loss : huber
replay_memory : buffer{
memory_size : 1000000
......
......@@ -3,8 +3,9 @@ import h5py
import mxnet as mx
import logging
import sys
from mxnet import nd
class cartpole_master_dqnDataLoader:
class CNNDataLoader_cartpole_master_dqn:
_input_names_ = ['state']
_output_names_ = ['qvalues_label']
......@@ -14,21 +15,38 @@ class cartpole_master_dqnDataLoader:
def load_data(self, batch_size):
    """Create MXNet iterators over the train/test HDF5 datasets.

    Parameters
    ----------
    batch_size : int
        Batch size used for both iterators.

    Returns
    -------
    tuple
        (train_iter, test_iter, data_mean, data_std); test_iter is None
        when no test file exists, and data_mean/data_std map each input
        name to an nd.array of per-feature statistics.
    """
    train_h5, test_h5 = self.load_h5_files()

    train_data = {}
    data_mean = {}
    data_std = {}
    for input_name in self._input_names_:
        train_data[input_name] = train_h5[input_name]
        data_mean[input_name] = nd.array(train_h5[input_name][:].mean(axis=0))
        # epsilon keeps later normalization from dividing by zero
        data_std[input_name] = nd.array(train_h5[input_name][:].std(axis=0) + 1e-5)

    train_label = {}
    for output_name in self._output_names_:
        train_label[output_name] = train_h5[output_name]

    train_iter = mx.io.NDArrayIter(data=train_data,
                                   label=train_label,
                                   batch_size=batch_size)

    test_iter = None
    if test_h5 is not None:
        test_data = {}
        for input_name in self._input_names_:
            test_data[input_name] = test_h5[input_name]

        test_label = {}
        for output_name in self._output_names_:
            test_label[output_name] = test_h5[output_name]

        test_iter = mx.io.NDArrayIter(data=test_data,
                                      label=test_label,
                                      batch_size=batch_size)

    return train_iter, test_iter, data_mean, data_std
def load_h5_files(self):
......@@ -36,21 +54,39 @@ class cartpole_master_dqnDataLoader:
test_h5 = None
train_path = self._data_dir + "train.h5"
test_path = self._data_dir + "test.h5"
if os.path.isfile(train_path):
train_h5 = h5py.File(train_path, 'r')
if not (self._input_names_[0] in train_h5 and self._output_names_[0] in train_h5):
logging.error("The HDF5 file '" + os.path.abspath(train_path) + "' has to contain the datasets: "
+ "'" + self._input_names_[0] + "', '" + self._output_names_[0] + "'")
sys.exit(1)
test_iter = None
for input_name in self._input_names_:
if not input_name in train_h5:
logging.error("The HDF5 file '" + os.path.abspath(train_path) + "' has to contain the dataset "
+ "'" + input_name + "'")
sys.exit(1)
for output_name in self._output_names_:
if not output_name in train_h5:
logging.error("The HDF5 file '" + os.path.abspath(train_path) + "' has to contain the dataset "
+ "'" + output_name + "'")
sys.exit(1)
if os.path.isfile(test_path):
test_h5 = h5py.File(test_path, 'r')
if not (self._input_names_[0] in test_h5 and self._output_names_[0] in test_h5):
logging.error("The HDF5 file '" + os.path.abspath(test_path) + "' has to contain the datasets: "
+ "'" + self._input_names_[0] + "', '" + self._output_names_[0] + "'")
sys.exit(1)
for input_name in self._input_names_:
if not input_name in test_h5:
logging.error("The HDF5 file '" + os.path.abspath(test_path) + "' has to contain the dataset "
+ "'" + input_name + "'")
sys.exit(1)
for output_name in self._output_names_:
if not output_name in test_h5:
logging.error("The HDF5 file '" + os.path.abspath(test_path) + "' has to contain the dataset "
+ "'" + output_name + "'")
sys.exit(1)
else:
logging.warning("Couldn't load test set. File '" + os.path.abspath(test_path) + "' does not exist.")
return train_h5, test_h5
else:
logging.error("Data loading failure. File '" + os.path.abspath(train_path) + "' does not exist.")
......
......@@ -101,7 +101,6 @@ class Net_0(gluon.HybridBlock):
self.fc3_ = gluon.nn.Dense(units=2, use_bias=True)
# fc3_, output shape: {[2,1,1]}
self.last_layers['qvalues'] = 'linear'
def hybrid_forward(self, F, state):
......
......@@ -81,7 +81,7 @@ if __name__ == "__main__":
'qnet':qnet_creator.net,
'use_fix_target': True,
'target_update_interval': 200,
'loss_function': 'euclidean',
'loss': 'huber',
'optimizer': 'rmsprop',
'optimizer_params': {
'learning_rate': 0.001 },
......
......@@ -114,6 +114,8 @@ class Agent(object):
agent_session_file = os.path.join(session_dir, 'agent.p')
logger = self._logger
self._training_stats.save_stats(self._output_directory, episode=self._current_episode)
self._make_pickle_ready(session_dir)
with open(agent_session_file, 'wb') as f:
......@@ -177,6 +179,9 @@ class Agent(object):
return states, actions, rewards, next_states, terminals
def evaluate(self, target=None, sample_games=100, verbose=True):
if sample_games <= 0:
return 0
target = self._target_score if target is None else target
if target:
target_achieved = 0
......@@ -268,8 +273,9 @@ class Agent(object):
def _save_net(self, net, filename, filedir=None):
filedir = self._output_directory if filedir is None else filedir
filename = os.path.join(filedir, filename + '.params')
net.save_parameters(filename)
filename = os.path.join(filedir, filename)
net.save_parameters(filename + '.params')
net.export(filename, epoch=0)
def save_best_network(self, path, epoch=0):
self._logger.info(
......@@ -367,6 +373,8 @@ class DdpgAgent(Agent):
def _make_pickle_ready(self, session_dir):
super(DdpgAgent, self)._make_pickle_ready(session_dir)
self._save_net(self._actor, 'current_actor')
self._save_net(self._actor, 'actor', session_dir)
self._actor = None
self._save_net(self._critic, 'critic', session_dir)
......@@ -457,9 +465,9 @@ class DdpgAgent(Agent):
else:
self._training_stats = DdpgTrainingStats(episodes)
# Initialize target Q' and mu'
self._actor_target = self._copy_actor()
self._critic_target = self._copy_critic()
# Initialize target Q' and mu'
self._actor_target = self._copy_actor()
self._critic_target = self._copy_critic()
# Initialize l2 loss for critic network
l2_loss = gluon.loss.L2Loss()
......@@ -540,7 +548,7 @@ class DdpgAgent(Agent):
actor_qvalues = tmp_critic(states, self._actor(states))
# For maximizing qvalues we have to multiply with -1
# as we use a minimizer
actor_loss = -1 * actor_qvalues
actor_loss = -1 * actor_qvalues.mean()
actor_loss.backward()
trainer_actor.step(self._minibatch_size)
......@@ -732,6 +740,7 @@ class DqnAgent(Agent):
def _make_pickle_ready(self, session_dir):
super(DqnAgent, self)._make_pickle_ready(session_dir)
self._save_net(self._qnet, 'current_qnet')
self._save_net(self._qnet, 'qnet', session_dir)
self._qnet = None
self._save_net(self._target_qnet, 'target_net', session_dir)
......@@ -897,4 +906,4 @@ class DqnAgent(Agent):
def _save_current_as_best_net(self):
    """Remember the current Q-network as the best-performing one so far.

    NOTE(review): copy_net is assumed to handle batching of the raw
    state shape itself (the explicit (1,)+shape prefix was dropped in
    the newer call) — confirm against copy_net's signature.
    """
    self._best_net = copy_net(
        self._qnet, self._state_dim, ctx=self._ctx)
......@@ -168,5 +168,5 @@ class OrnsteinUhlenbeckStrategy(BaseStrategy):
def select_action(self, values):
    """Blend the greedy action with Ornstein-Uhlenbeck exploration noise.

    The current epsilon interpolates between exploitation (*values*) and
    exploration (the evolved noise state); the result is clipped to the
    configured action bounds.
    """
    noise = self._evolve_state()
    # epsilon-weighted mix instead of purely additive noise, so the
    # action magnitude stays comparable while epsilon decays
    action = (1.0 - self.cur_eps) * values + (self.cur_eps * noise)
    return np.clip(action, self._action_low, self._action_high)
......@@ -127,13 +127,15 @@ class TrainingStats(object):
else:
return self._all_total_rewards[0]
def save(self, path, episode=None):
    """Write the collected statistics as .npy files into *path*.

    Parameters
    ----------
    path : str
        Destination directory (must already exist).
    episode : int, optional
        Number of leading episodes to persist; defaults to the
        configured maximum, i.e. a fully completed run.
    """
    if episode is None:
        episode = self._max_episodes
    # slice so an aborted run only saves the episodes actually played
    np.save(os.path.join(path, 'total_rewards'), self._all_total_rewards[:episode])
    np.save(os.path.join(path, 'eps'), self._all_eps[:episode])
    np.save(os.path.join(path, 'time'), self._all_time[:episode])
    np.save(
        os.path.join(path, 'mean_reward'),
        self._all_mean_reward_last_100_episodes[:episode])
def _log_episode(self, episode, start_time, training_steps, eps, reward):
self.add_eps(episode, eps)
......@@ -170,33 +172,43 @@ class DqnTrainingStats(TrainingStats):
self._logger.info(info)
return avg_reward
def save_stats(self, path, episode=None):
    """Plot DQN reward/loss/epsilon curves and persist stats up to *episode*.

    Parameters
    ----------
    path : str
        Output directory for stats.pdf and the .npy files.
    episode : int, optional
        Number of completed episodes to include; defaults to the
        configured maximum (a fully finished training run).
    """
    if episode is None:
        episode = self._max_episodes

    # slice once so an aborted run only plots played episodes
    all_total_rewards = self._all_total_rewards[:episode]
    all_avg_loss = self._all_avg_loss[:episode]
    all_eps = self._all_eps[:episode]
    all_mean_reward_last_100_episodes = \
        self._all_mean_reward_last_100_episodes[:episode]

    fig = plt.figure(figsize=(20, 20))

    sub_rewards = fig.add_subplot(221)
    sub_rewards.set_title('Total Rewards per episode')
    sub_rewards.plot(np.arange(episode), all_total_rewards)

    sub_loss = fig.add_subplot(222)
    sub_loss.set_title('Avg. Loss per episode')
    sub_loss.plot(np.arange(episode), all_avg_loss)

    sub_eps = fig.add_subplot(223)
    sub_eps.set_title('Epsilon per episode')
    sub_eps.plot(np.arange(episode), all_eps)

    sub_rewards = fig.add_subplot(224)
    sub_rewards.set_title('Avg. mean reward of last 100 episodes')
    sub_rewards.plot(np.arange(episode),
                     all_mean_reward_last_100_episodes)

    self.save(path, episode=episode)
    plt.savefig(os.path.join(path, 'stats.pdf'))
def save(self, path, episode=None):
    """Save DQN statistics: base stats plus the average loss per episode.

    Parameters
    ----------
    path : str
        Destination directory.
    episode : int, optional
        Number of leading episodes to persist; defaults to the
        configured maximum.
    """
    if episode is None:
        episode = self._max_episodes
    super(DqnTrainingStats, self).save(path, episode=episode)
    np.save(os.path.join(path, 'avg_loss'), self._all_avg_loss[:episode])
class DdpgTrainingStats(TrainingStats):
......@@ -233,44 +245,56 @@ class DdpgTrainingStats(TrainingStats):
self.logger.info(info)
return avg_reward
def save(self, path, episode=None):
    """Save DDPG statistics: base stats plus critic/actor losses and q-values.

    Parameters
    ----------
    path : str
        Destination directory.
    episode : int, optional
        Number of leading episodes to persist; defaults to the
        configured maximum.
    """
    if episode is None:
        episode = self._max_episodes
    super(DdpgTrainingStats, self).save(path, episode=episode)
    np.save(os.path.join(path, 'avg_critic_loss'),
            self._all_avg_critic_loss[:episode])
    np.save(os.path.join(path, 'avg_actor_loss'),
            self._all_avg_actor_loss[:episode])
    np.save(os.path.join(path, 'avg_qvalues'),
            self._all_avg_qvalues[:episode])
def save_stats(self, path, episode=None):
    """Plot DDPG training curves and persist the stats up to *episode*.

    Parameters
    ----------
    path : str
        Output directory for stats.pdf and the .npy files.
    episode : int, optional
        Number of completed episodes to include; defaults to the
        configured maximum (a fully finished training run).
    """
    if episode is None:
        episode = self._max_episodes

    # slice once so an aborted run only plots played episodes
    all_total_rewards = self._all_total_rewards[:episode]
    all_avg_actor_loss = self._all_avg_actor_loss[:episode]
    all_avg_critic_loss = self._all_avg_critic_loss[:episode]
    all_avg_qvalues = self._all_avg_qvalues[:episode]
    all_eps = self._all_eps[:episode]
    all_mean_reward_last_100_episodes = \
        self._all_mean_reward_last_100_episodes[:episode]

    fig = plt.figure(figsize=(120, 120))

    sub_rewards = fig.add_subplot(321)
    sub_rewards.set_title('Total Rewards per episode')
    sub_rewards.plot(np.arange(episode), all_total_rewards)

    sub_actor_loss = fig.add_subplot(322)
    sub_actor_loss.set_title('Avg. Actor Loss per episode')
    sub_actor_loss.plot(np.arange(episode), all_avg_actor_loss)

    sub_critic_loss = fig.add_subplot(323)
    sub_critic_loss.set_title('Avg. Critic Loss per episode')
    sub_critic_loss.plot(np.arange(episode), all_avg_critic_loss)

    sub_qvalues = fig.add_subplot(324)
    sub_qvalues.set_title('Avg. QValues per episode')
    sub_qvalues.plot(np.arange(episode), all_avg_qvalues)

    sub_eps = fig.add_subplot(325)
    sub_eps.set_title('Epsilon per episode')
    sub_eps.plot(np.arange(episode), all_eps)

    sub_rewards = fig.add_subplot(326)
    sub_rewards.set_title('Avg. mean reward of last 100 episodes')
    sub_rewards.plot(np.arange(episode),
                     all_mean_reward_last_100_episodes)

    self.save(path, episode=episode)
    plt.savefig(os.path.join(path, 'stats.pdf'))
import mxnet as mx
import logging
import os
from CNNNet_mountaincar_master_actor import Net
from CNNNet_mountaincar_master_actor import Net_0
class CNNCreator_mountaincar_master_actor:
    """Builds, checkpoints and exports the mountaincar actor network(s)."""

    _model_dir_ = "model/mountaincar.agent.MountaincarActor/"
    _model_prefix_ = "model"
    # raw (unbatched) shape of the single 'state' input
    _input_shapes_ = [(2,)]

    def __init__(self):
        self.weight_initializer = mx.init.Normal()
        # maps network index -> HybridBlock; populated by construct()
        self.networks = {}

    def get_input_shapes(self):
        """Return the declared input shapes (without batch dimension)."""
        return self._input_shapes_

    def load(self, context):
        """Restore the newest checkpoint for every constructed network.

        Stale '_newest' export files are removed first so they cannot be
        mistaken for training checkpoints; then the highest-numbered
        .params file per network is loaded.

        Returns
        -------
        int or None
            The smallest last-epoch number over all networks (0 as soon
            as any network has no checkpoint); None if no networks were
            constructed yet.
        """
        earliestLastEpoch = None
        for i, network in self.networks.items():
            lastEpoch = 0
            param_file = None

            # best-effort removal of previously exported '_newest' files
            try:
                os.remove(self._model_dir_ + self._model_prefix_ + "_" + str(i) + "_newest-0000.params")
            except OSError:
                pass
            try:
                os.remove(self._model_dir_ + self._model_prefix_ + "_" + str(i) + "_newest-symbol.json")
            except OSError:
                pass

            if os.path.isdir(self._model_dir_):
                for file in os.listdir(self._model_dir_):
                    if ".params" in file and self._model_prefix_ + "_" + str(i) in file:
                        epochStr = file.replace(".params", "").replace(self._model_prefix_ + "_" + str(i) + "-", "")
                        epoch = int(epochStr)
                        if epoch > lastEpoch:
                            lastEpoch = epoch
                            param_file = file

            if param_file is None:
                earliestLastEpoch = 0
            else:
                logging.info("Loading checkpoint: " + param_file)
                network.load_parameters(self._model_dir_ + param_file)
                if earliestLastEpoch is None or lastEpoch < earliestLastEpoch:
                    earliestLastEpoch = lastEpoch
        return earliestLastEpoch

    def construct(self, context, data_mean=None, data_std=None):
        """Instantiate, initialize, hybridize and export network 0."""
        self.networks[0] = Net_0(data_mean=data_mean, data_std=data_std)
        self.networks[0].collect_params().initialize(self.weight_initializer, ctx=context)
        self.networks[0].hybridize()
        # one dummy forward pass fixes parameter shapes before export
        self.networks[0](mx.nd.zeros((1, 2,), ctx=context))

        if not os.path.exists(self._model_dir_):
            os.makedirs(self._model_dir_)

        for i, network in self.networks.items():
            network.export(self._model_dir_ + self._model_prefix_ + "_" + str(i), epoch=0)
......@@ -3,8 +3,9 @@ import h5py
import mxnet as mx
import logging
import sys
from mxnet import nd
class mountaincar_master_actorDataLoader:
class CNNDataLoader_mountaincar_master_actor:
_input_names_ = ['state']
_output_names_ = ['action_label']
......@@ -14,21 +15,38 @@ class mountaincar_master_actorDataLoader:
def load_data(self, batch_size):
    """Create MXNet iterators over the train/test HDF5 datasets.

    Parameters
    ----------
    batch_size : int
        Batch size used for both iterators.

    Returns
    -------
    tuple
        (train_iter, test_iter, data_mean, data_std); test_iter is None
        when no test file exists, and data_mean/data_std map each input
        name to an nd.array of per-feature statistics.
    """
    train_h5, test_h5 = self.load_h5_files()

    train_data = {}
    data_mean = {}
    data_std = {}
    for input_name in self._input_names_:
        train_data[input_name] = train_h5[input_name]
        data_mean[input_name] = nd.array(train_h5[input_name][:].mean(axis=0))
        # epsilon keeps later normalization from dividing by zero
        data_std[input_name] = nd.array(train_h5[input_name][:].std(axis=0) + 1e-5)

    train_label = {}
    for output_name in self._output_names_:
        train_label[output_name] = train_h5[output_name]

    train_iter = mx.io.NDArrayIter(data=train_data,
                                   label=train_label,
                                   batch_size=batch_size)

    test_iter = None
    if test_h5 is not None:
        test_data = {}
        for input_name in self._input_names_:
            test_data[input_name] = test_h5[input_name]

        test_label = {}
        for output_name in self._output_names_:
            test_label[output_name] = test_h5[output_name]

        test_iter = mx.io.NDArrayIter(data=test_data,
                                      label=test_label,
                                      batch_size=batch_size)

    return train_iter, test_iter, data_mean, data_std
def load_h5_files(self):
......@@ -36,21 +54,39 @@ class mountaincar_master_actorDataLoader:
test_h5 = None
train_path = self._data_dir + "train.h5"
test_path = self._data_dir + "test.h5"
if os.path.isfile(train_path):
train_h5 = h5py.File(train_path, 'r')
if not (self._input_names_[0] in train_h5 and self._output_names_[0] in train_h5):
logging.error("The HDF5 file '" + os.path.abspath(train_path) + "' has to contain the datasets: "
+ "'" + self._input_names_[0] + "', '" + self._output_names_[0] + "'")
sys.exit(1)
test_iter = None
for input_name in self._input_names_:
if not input_name in train_h5:
logging.error("The HDF5 file '" + os.path.abspath(train_path) + "' has to contain the dataset "
+ "'" + input_name + "'")
sys.exit(1)
for output_name in self._output_names_:
if not output_name in train_h5:
logging.error("The HDF5 file '" + os.path.abspath(train_path) + "' has to contain the dataset "
+ "'" + output_name + "'")
sys.exit(1)
if os.path.isfile(test_path):
test_h5 = h5py.File(test_path, 'r')
if not (self._input_names_[0] in test_h5 and self._output_names_[0] in test_h5):
logging.error("The HDF5 file '" + os.path.abspath(test_path) + "' has to contain the datasets: "
+ "'" + self._input_names_[0] + "', '" + self._output_names_[0] + "'")
sys.exit(1)
for input_name in self._input_names_:
if not input_name in test_h5:
logging.error("The HDF5 file '" + os.path.abspath(test_path) + "' has to contain the dataset "
+ "'" + input_name + "'")
sys.exit(1)
for output_name in self._output_names_:
if not output_name in test_h5:
logging.error("The HDF5 file '" + os.path.abspath(test_path) + "' has to contain the dataset "
+ "'" + output_name + "'")
sys.exit(1)
else:
logging.warning("Couldn't load test set. File '" + os.path.abspath(test_path) + "' does not exist.")
return train_h5, test_h5
else:
logging.error("Data loading failure. File '" + os.path.abspath(train_path) + "' does not exist.")
......
......@@ -2,6 +2,16 @@ import mxnet as mx
import numpy as np
from mxnet import gluon
class OneHot(gluon.HybridBlock):
    """Layer that one-hot encodes the argmax of its input.

    The input is reduced along axis 1 via argmax, then expanded to a
    one-hot vector of length *size*.
    """

    def __init__(self, size, **kwargs):
        super(OneHot, self).__init__(**kwargs)
        with self.name_scope():
            # depth of the produced one-hot vectors
            self.size = size

    def hybrid_forward(self, F, x):
        # F is the ndarray or symbol API depending on hybridization state
        return F.one_hot(indices=F.argmax(data=x, axis=1), depth=self.size)
class Softmax(gluon.HybridBlock):
def __init__(self, **kwargs):
super(Softmax, self).__init__(**kwargs)
......@@ -68,15 +78,17 @@ class NoNormalization(gluon.HybridBlock):
return x
class Net(gluon.HybridBlock):
class Net_0(gluon.HybridBlock):
def __init__(self, data_mean=None, data_std=None, **kwargs):
super(Net, self).__init__(**kwargs)
super(Net_0, self).__init__(**kwargs)