Commit a51b42b6 authored by Nicola Gatto

Adjust tests

parent cc31bd8e
@@ -214,11 +214,12 @@ public class GenerationTest extends AbstractSymtabTest {
                        "HelperA.h",
                        "start_training.sh",
                        "reinforcement_learning/__init__.py",
                        "reinforcement_learning/action_policy.py",
                        "reinforcement_learning/strategy.py",
                        "reinforcement_learning/agent.py",
                        "reinforcement_learning/environment.py",
                        "reinforcement_learning/replay_memory.py",
                        "reinforcement_learning/util.py"
                        "reinforcement_learning/util.py",
                        "reinforcement_learning/cnnarch_logger.py"
                )
        );
    }
@@ -262,11 +263,12 @@ public class GenerationTest extends AbstractSymtabTest {
                        "reward/pylib/armanpy/armanpy_3d.i",
                        "reward/pylib/armanpy/numpy.i",
                        "reinforcement_learning/__init__.py",
                        "reinforcement_learning/action_policy.py",
                        "reinforcement_learning/strategy.py",
                        "reinforcement_learning/agent.py",
                        "reinforcement_learning/environment.py",
                        "reinforcement_learning/replay_memory.py",
                        "reinforcement_learning/util.py",
                        "reinforcement_learning/cnnarch_logger.py",
                        "reinforcement_learning/torcs_agent_dqn_reward_executor.py"
                )
        );
@@ -292,5 +294,33 @@ public class GenerationTest extends AbstractSymtabTest {
        String[] args = {"-m", "src/test/resources/models/reinforcementModel", "-r", "mountaincar.Master", "-b", "GLUON", "-f", "n", "-c", "n"};
        EMADLGeneratorCli.main(args);
        assertEquals(0, Log.getFindings().stream().filter(Finding::isError).count());
        checkFilesAreEqual(
                Paths.get("./target/generated-sources-emadl"),
                Paths.get("./src/test/resources/target_code/gluon/reinforcementModel/mountaincar"),
                Arrays.asList(
                        "mountaincar_master.cpp",
                        "mountaincar_master.h",
                        "mountaincar_master_actor.h",
                        "CMakeLists.txt",
                        "CNNBufferFile.h",
                        "CNNCreator_mountaincar_master_actor.py",
                        "CNNNet_mountaincar_master_actor.py",
                        "CNNPredictor_mountaincar_master_actor.h",
                        "CNNTrainer_mountaincar_master_actor.py",
                        "CNNTranslator.h",
                        "HelperA.h",
                        "start_training.sh",
                        "reinforcement_learning/__init__.py",
                        "reinforcement_learning/CNNCreator_MountaincarCritic.py",
                        "reinforcement_learning/CNNNet_MountaincarCritic.py",
                        "reinforcement_learning/strategy.py",
                        "reinforcement_learning/agent.py",
                        "reinforcement_learning/environment.py",
                        "reinforcement_learning/replay_memory.py",
                        "reinforcement_learning/util.py",
                        "reinforcement_learning/cnnarch_logger.py"
                )
        );
    }
}
@@ -24,7 +24,7 @@ configuration CartPoleDQN {
        sample_size : 32
    }
    action_selection : epsgreedy{
    strategy : epsgreedy{
        epsilon : 1.0
        min_epsilon : 0.01
        epsilon_decay_method: linear
@@ -14,21 +14,26 @@ configuration MountaincarActor {
    snapshot_interval : 20
    loss : euclidean
    replay_memory : buffer{
        memory_size : 10000
        sample_size : 32
        memory_size : 1000000
        sample_size : 64
    }
    action_selection : epsgreedy{
    strategy : ornstein_uhlenbeck{
        epsilon : 1.0
        min_epsilon : 0.01
        epsilon_decay_method: linear
        epsilon_decay : 0.01
        mu: (0.0)
        theta: (0.15)
        sigma: (0.3)
    }
    actor_optimizer : adam {
        learning_rate : 0.0001
    }
    optimizer : rmsprop{
    critic_optimizer : adam {
        learning_rate : 0.001
    }
}
\ No newline at end of file
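The strategy : ornstein_uhlenbeck block above configures temporally correlated exploration noise for the continuous-action Mountaincar actor. The generated strategy.py is not included in this diff, so the following is only a hedged sketch of how mu, theta and sigma typically enter an Ornstein-Uhlenbeck process (class and method names here are hypothetical); the epsilon values above are then typically used to anneal or scale this noise over training.

import numpy as np

class OrnsteinUhlenbeckSketch(object):
    """Illustrative OU noise process; not the generated strategy.py implementation."""
    def __init__(self, action_dim=1, mu=0.0, theta=0.15, sigma=0.3):
        self.mu = mu * np.ones(action_dim)
        self.theta = theta
        self.sigma = sigma
        self.state = np.copy(self.mu)

    def reset(self):
        # Restart the process at the mean, e.g. at the start of an episode
        self.state = np.copy(self.mu)

    def sample(self):
        # dx = theta * (mu - x) + sigma * N(0, 1); the result is added to the actor output
        dx = self.theta * (self.mu - self.state) \
            + self.sigma * np.random.randn(len(self.state))
        self.state = self.state + dx
        return self.state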
@@ -8,8 +8,5 @@ implementation Critic(state, action) {
        FullyConnected(units=300)
    ) ->
    Add() ->
    Relu() ->
    FullyConnected(units=1) ->
    Tanh() ->
    critic
    Relu()
}
\ No newline at end of file
@@ -30,7 +30,7 @@ configuration TorcsDQN {
        sample_size : 32
    }
    action_selection : epsgreedy{
    strategy : epsgreedy{
        epsilon : 1.0
        min_epsilon : 0.01
        epsilon_decay_method: linear
import os
import h5py
import mxnet as mx
import logging
import sys
class cartpole_master_dqnDataLoader:
    _input_names_ = ['state']
    _output_names_ = ['qvalues_label']
    def __init__(self):
        self._data_dir = "data/"
    def load_data(self, batch_size):
        train_h5, test_h5 = self.load_h5_files()
        data_mean = train_h5[self._input_names_[0]][:].mean(axis=0)
        data_std = train_h5[self._input_names_[0]][:].std(axis=0) + 1e-5
        train_iter = mx.io.NDArrayIter(train_h5[self._input_names_[0]],
                                       train_h5[self._output_names_[0]],
                                       batch_size=batch_size,
                                       data_name=self._input_names_[0],
                                       label_name=self._output_names_[0])
        test_iter = None
        if test_h5 != None:
            test_iter = mx.io.NDArrayIter(test_h5[self._input_names_[0]],
                                          test_h5[self._output_names_[0]],
                                          batch_size=batch_size,
                                          data_name=self._input_names_[0],
                                          label_name=self._output_names_[0])
        return train_iter, test_iter, data_mean, data_std
    def load_h5_files(self):
        train_h5 = None
        test_h5 = None
        train_path = self._data_dir + "train.h5"
        test_path = self._data_dir + "test.h5"
        if os.path.isfile(train_path):
            train_h5 = h5py.File(train_path, 'r')
            if not (self._input_names_[0] in train_h5 and self._output_names_[0] in train_h5):
                logging.error("The HDF5 file '" + os.path.abspath(train_path) + "' has to contain the datasets: "
                              + "'" + self._input_names_[0] + "', '" + self._output_names_[0] + "'")
                sys.exit(1)
            test_iter = None
            if os.path.isfile(test_path):
                test_h5 = h5py.File(test_path, 'r')
                if not (self._input_names_[0] in test_h5 and self._output_names_[0] in test_h5):
                    logging.error("The HDF5 file '" + os.path.abspath(test_path) + "' has to contain the datasets: "
                                  + "'" + self._input_names_[0] + "', '" + self._output_names_[0] + "'")
                    sys.exit(1)
            else:
                logging.warning("Couldn't load test set. File '" + os.path.abspath(test_path) + "' does not exist.")
            return train_h5, test_h5
        else:
            logging.error("Data loading failure. File '" + os.path.abspath(train_path) + "' does not exist.")
            sys.exit(1)
\ No newline at end of file
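For context, a minimal usage sketch of the generated loader above, assuming data/train.h5 (and optionally data/test.h5) exist and contain the 'state' and 'qvalues_label' datasets; note that the loader returns the normalization statistics but does not apply them itself.

import mxnet as mx

loader = cartpole_master_dqnDataLoader()
train_iter, test_iter, data_mean, data_std = loader.load_data(batch_size=32)

mean_nd = mx.nd.array(data_mean)
std_nd = mx.nd.array(data_std)

for batch in train_iter:
    # mx.io.NDArrayIter yields DataBatch objects with .data and .label lists
    states = (batch.data[0] - mean_nd) / std_nd
    qvalue_labels = batch.label[0]
    # ... forward/backward pass on the network would go here ...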
from reinforcement_learning.agent import DqnAgent
from reinforcement_learning.util import AgentSignalHandler
from reinforcement_learning.cnnarch_logger import ArchLogger
import reinforcement_learning.environment
import CNNCreator_cartpole_master_dqn
@@ -9,9 +10,6 @@ import re
import logging
import mxnet as mx
session_output_dir = 'session'
agent_name='cartpole_master_dqn'
session_param_output = os.path.join(session_output_dir, agent_name)
def resume_session():
    session_param_output = os.path.join(session_output_dir, agent_name)
@@ -32,60 +30,73 @@ def resume_session():
                break
    return resume_session, resume_directory
if __name__ == "__main__":
    agent_name='cartpole_master_dqn'
    # Prepare output directory and logger
    output_directory = 'model_output'\
        + '/' + agent_name\
        + '/' + time.strftime(
            '%Y-%m-%d-%H-%M-%S', time.localtime(time.time()))
    ArchLogger.set_output_directory(output_directory)
    ArchLogger.set_logger_name(agent_name)
    ArchLogger.set_output_level(ArchLogger.INFO)
    env = reinforcement_learning.environment.GymEnvironment('CartPole-v0')
    context = mx.cpu()
    net_creator = CNNCreator_cartpole_master_dqn.CNNCreator_cartpole_master_dqn()
    net_creator.construct(context)
    replay_memory_params = {
        'method':'buffer',
        'memory_size':10000,
        'sample_size':32,
        'state_dtype':'float32',
        'action_dtype':'uint8',
        'rewards_dtype':'float32'
    }
    context = mx.cpu()
    qnet_creator = CNNCreator_cartpole_master_dqn.CNNCreator_cartpole_master_dqn()
    qnet_creator.construct(context)
    policy_params = {
        'method':'epsgreedy',
        'epsilon': 1,
        'min_epsilon': 0.01,
        'epsilon_decay_method': 'linear',
        'epsilon_decay': 0.01,
    agent_params = {
        'environment': env,
        'replay_memory_params': {
            'method':'buffer',
            'memory_size':10000,
            'sample_size':32,
            'state_dtype':'float32',
            'action_dtype':'float32',
            'rewards_dtype':'float32'
        },
        'strategy_params': {
            'method':'epsgreedy',
            'epsilon': 1,
            'min_epsilon': 0.01,
            'epsilon_decay_method': 'linear',
            'epsilon_decay': 0.01,
        },
        'agent_name': agent_name,
        'verbose': True,
        'state_dim': (4,),
        'action_dim': (2,),
        'ctx': 'cpu',
        'discount_factor': 0.999,
        'training_episodes': 160,
        'train_interval': 1,
        'snapshot_interval': 20,
        'max_episode_step': 250,
        'target_score': 185.5,
        'qnet':qnet_creator.net,
        'use_fix_target': True,
        'target_update_interval': 200,
        'loss_function': 'euclidean',
        'optimizer': 'rmsprop',
        'optimizer_params': {
            'learning_rate': 0.001 },
        'double_dqn': False,
    }
    resume_session, resume_directory = resume_session()
    resume, resume_directory = resume_session()
    if resume_session:
        agent = DqnAgent.resume_from_session(resume_directory, net_creator.net, env)
    if resume:
        resume_agent_params = {
            'session_dir': resume_directory,
            'environment': env,
            'net': qnet_creator.net,
        }
        agent = DqnAgent.resume_from_session(**resume_agent_params)
    else:
        agent = DqnAgent(
            network = net_creator.net,
            environment=env,
            replay_memory_params=replay_memory_params,
            policy_params=policy_params,
            state_dim=net_creator.get_input_shapes()[0],
            ctx='cpu',
            discount_factor=0.999,
            loss_function='euclidean',
            optimizer='rmsprop',
            optimizer_params={
                'learning_rate': 0.001 },
            training_episodes=160,
            train_interval=1,
            use_fix_target=True,
            target_update_interval=200,
            double_dqn = False,
            snapshot_interval=20,
            agent_name=agent_name,
            max_episode_step=250,
            output_directory=session_output_dir,
            verbose=True,
            live_plot = True,
            make_logfile=True,
            target_score=185.5
        )
        agent = DqnAgent(**agent_params)
    signal_handler = AgentSignalHandler()
    signal_handler.register_agent(agent)
@@ -93,4 +104,4 @@ if __name__ == "__main__":
    train_successful = agent.train()
    if train_successful:
        agent.save_best_network(net_creator._model_dir_ + net_creator._model_prefix_ + '_newest', epoch=0)
\ No newline at end of file
        agent.save_best_network(qnet_creator._model_dir_ + qnet_creator._model_prefix_ + '_newest', epoch=0)
import numpy as np
class ActionPolicyBuilder(object):
    def __init__(self):
        pass
    def build_by_params(self,
                        method='epsgreedy',
                        epsilon=0.5,
                        min_epsilon=0.05,
                        epsilon_decay_method='no',
                        epsilon_decay=0.0,
                        action_dim=None):
        if epsilon_decay_method == 'linear':
            decay = LinearDecay(eps_decay=epsilon_decay, min_eps=min_epsilon)
        else:
            decay = NoDecay()
        if method == 'epsgreedy':
            assert action_dim is not None
            assert len(action_dim) == 1
            return EpsilonGreedyActionPolicy(eps=epsilon,
                number_of_actions=action_dim[0], decay=decay)
        else:
            assert action_dim is not None
            assert len(action_dim) == 1
            return GreedyActionPolicy()
class EpsilonGreedyActionPolicy(object):
    def __init__(self, eps, number_of_actions, decay):
        self.eps = eps
        self.cur_eps = eps
        self.__number_of_actions = number_of_actions
        self.__decay_method = decay
    def select_action(self, values):
        do_exploration = (np.random.rand() < self.cur_eps)
        if do_exploration:
            action = np.random.randint(low=0, high=self.__number_of_actions)
        else:
            action = values.asnumpy().argmax()
        return action
    def decay(self):
        self.cur_eps = self.__decay_method.decay(self.cur_eps)
class GreedyActionPolicy(object):
    def __init__(self):
        pass
    def select_action(self, values):
        return values.asnumpy().argmax()
    def decay(self):
        pass
class NoDecay(object):
    def __init__(self):
        pass
    def decay(self, cur_eps):
        return cur_eps
class LinearDecay(object):
    def __init__(self, eps_decay, min_eps=0):
        self.eps_decay = eps_decay
        self.min_eps = min_eps
    def decay(self, cur_eps):
        return max(cur_eps - self.eps_decay, self.min_eps)
\ No newline at end of file
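The removed action_policy.py above (superseded by strategy.py in this commit) exposes a small builder API; a short usage sketch with illustrative values, mirroring the CartPole settings seen in the generated trainer:

import mxnet as mx

builder = ActionPolicyBuilder()
policy = builder.build_by_params(
    method='epsgreedy',
    epsilon=1.0,
    min_epsilon=0.01,
    epsilon_decay_method='linear',
    epsilon_decay=0.01,
    action_dim=(2,))  # two discrete actions, as in the CartPole model

q_values = mx.nd.array([0.1, 0.9])       # dummy Q-values for a single state
action = policy.select_action(q_values)  # random action with probability cur_eps
policy.decay()                           # cur_eps -> max(cur_eps - 0.01, 0.01)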
@@ -2,118 +2,382 @@ import mxnet as mx
import numpy as np
import time
import os
import logging
import sys
import util
import matplotlib.pyplot as plt
import pyprind
from cnnarch_logger import ArchLogger
from replay_memory import ReplayMemoryBuilder
from action_policy import ActionPolicyBuilder
from util import copy_net, get_loss_function
from strategy import StrategyBuilder
from util import copy_net, get_loss_function,\
    copy_net_with_two_inputs, DdpgTrainingStats, DqnTrainingStats,\
    make_directory_if_not_exist
from mxnet import nd, gluon, autograd
class DqnAgent(object):
    def __init__(self,
        network,
class Agent(object):
    def __init__(
        self,
        environment,
        replay_memory_params,
        policy_params,
        strategy_params,
        state_dim,
        action_dim,
        ctx=None,
        discount_factor=.9,
        loss_function='euclidean',
        optimizer='rmsprop',
        optimizer_params = {'learning_rate':0.09},
        training_episodes=50,
        train_interval=1,
        use_fix_target=False,
        double_dqn = False,
        target_update_interval=10,
        start_training=0,
        snapshot_interval=200,
        agent_name='Dqn_agent',
        agent_name='Agent',
        max_episode_step=99999,
        evaluation_samples=1000,
        output_directory='model_parameters',
        verbose=True,
        live_plot = True,
        make_logfile=True,
        target_score=None):
        assert 0 < discount_factor <= 1
        assert train_interval > 0
        assert target_update_interval > 0
        assert snapshot_interval > 0
        assert max_episode_step > 0
        assert training_episodes > 0
        assert replay_memory_params is not None
        assert type(state_dim) is tuple
        self.__ctx = mx.gpu() if ctx == 'gpu' else mx.cpu()
        self.__qnet = network
        self.__environment = environment
        self.__discount_factor = discount_factor
        self.__training_episodes = training_episodes
        self.__train_interval = train_interval
        self.__verbose = verbose
        self.__state_dim = state_dim
        self.__action_dim = self.__qnet(nd.random_normal(shape=((1,) + self.__state_dim), ctx=self.__ctx)).shape[1:]
        target_score=None
    ):
        assert 0 < discount_factor <= 1,\
            'Discount factor must be between 0 and 1'
        assert train_interval > 0, 'Train interval must be greater 0'
        assert snapshot_interval > 0, 'Snapshot interval must be greater 0'
        assert max_episode_step > 0,\
            'Maximal steps per episode must be greater 0'
        assert training_episodes > 0, 'Trainings episode must be greater 0'
        assert replay_memory_params is not None,\
            'Replay memory parameter not set'
        assert type(state_dim) is tuple, 'State dimension is not a tuple'
        assert type(action_dim) is tuple, 'Action dimension is not a tuple'
        self._logger = ArchLogger.get_logger()
        self._ctx = mx.gpu() if ctx == 'gpu' else mx.cpu()
        self._environment = environment
        self._discount_factor = discount_factor
        self._training_episodes = training_episodes
        self._train_interval = train_interval
        self._verbose = verbose
        self._state_dim = state_dim
        replay_memory_params['state_dim'] = state_dim
        self.__replay_memory_params = replay_memory_params
        replay_memory_params['action_dim'] = action_dim
        self._replay_memory_params = replay_memory_params
        rm_builder = ReplayMemoryBuilder()
        self.__memory = rm_builder.build_by_params(**replay_memory_params)
        self.__minibatch_size = self.__memory.sample_size
        policy_params['action_dim'] = self.__action_dim
        self.__policy_params = policy_params
        p_builder = ActionPolicyBuilder()
        self.__policy = p_builder.build_by_params(**policy_params)
        self.__target_update_interval = target_update_interval
        self.__target_qnet = copy_net(self.__qnet, self.__state_dim, ctx=self.__ctx)
        self.__loss_function_str = loss_function
        self.__loss_function = get_loss_function(loss_function)
        self.__agent_name = agent_name
        self.__snapshot_interval = snapshot_interval
        self.__creation_time = time.time()
        self.__max_episode_step = max_episode_step
        self.__optimizer = optimizer
        self.__optimizer_params = optimizer_params
        self.__make_logfile = make_logfile
        self.__double_dqn = double_dqn
        self.__use_fix_target = use_fix_target
        self.__live_plot = live_plot
        self.__user_given_directory = output_directory
        self.__target_score = target_score
        self.__interrupt_flag = False
        self._memory = rm_builder.build_by_params(**replay_memory_params)
        self._minibatch_size = self._memory.sample_size
        self._action_dim = action_dim
        strategy_params['action_dim'] = self._action_dim
        self._strategy_params = strategy_params
        strategy_builder = StrategyBuilder()
        self._strategy = strategy_builder.build_by_params(**strategy_params)
        self._agent_name = agent_name
        self._snapshot_interval = snapshot_interval
        self._creation_time = time.time()
        self._max_episode_step = max_episode_step
        self._start_training = start_training
        self._output_directory = output_directory
        self._target_score = target_score
        self._evaluation_samples = evaluation_samples
        self._best_avg_score = -np.infty
        self._best_net = None
        self._interrupt_flag = False
        self._training_stats = None
        # Training Context
        self.__current_episode = 0
        self.__total_steps = 0
        # Initialize best network
        self.__best_net = copy_net(self.__qnet, self.__state_dim, self.__ctx)
        self.__best_avg_score = None
        self._current_episode = 0
        self._total_steps = 0
        # Gluon Trainer definition
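The excerpt ends inside the constructor; the training loop itself is not part of this hunk. Purely as a hedged illustration (not the author's train() implementation), the use_fix_target, target_update_interval and discount_factor parameters above typically drive a fixed-target Q-learning update of the following shape:

from mxnet import nd

def dqn_targets_sketch(target_qnet, rewards, next_states, terminals, discount_factor):
    # Bellman target with a frozen target network:
    #   y = r                                    for terminal transitions
    #   y = r + gamma * max_a' Q_target(s', a')  otherwise
    next_q = target_qnet(next_states)        # shape: (batch_size, num_actions)
    max_next_q = nd.max(next_q, axis=1)
    return rewards + (1.0 - terminals) * discount_factor * max_next_q

# Every target_update_interval steps the target network would be refreshed,
# e.g. via copy_net(qnet, state_dim, ctx) as imported from util above.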