Commit 02e3446f authored by Nicola Gatto

Add test with reward topic

parent 776dffd5
Pipeline #147044 passed with stages in 3 minutes and 9 seconds
@@ -222,7 +222,6 @@ public class GenerationTest extends AbstractSymtabTest {
         trainGenerator.generate(modelPath, "ReinforcementConfig3", trainedArchitecture);
-        /*
         assertTrue(Log.getFindings().isEmpty());
         checkFilesAreEqual(
                 Paths.get("./target/generated-sources-cnnarch"),
@@ -238,7 +237,7 @@ public class GenerationTest extends AbstractSymtabTest {
                 "reinforcement_learning/util.py",
                 "reinforcement_learning/cnnarch_logger.py"
             )
-        );*/
+        );
     }
......
from reinforcement_learning.agent import DqnAgent
from reinforcement_learning.util import AgentSignalHandler
from reinforcement_learning.cnnarch_logger import ArchLogger
import reinforcement_learning.environment
import CNNCreator_reinforcementConfig3
import os
import sys
import re
import time
import numpy as np
import mxnet as mx
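# Helper that looks for an interrupted training session: it scans the given
# sessions directory for timestamped sub-directories containing a
# '.interrupted_session' folder and asks the user (via raw_input, so this
# script targets Python 2) whether training should resume from that snapshot.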
def resume_session(sessions_dir):
resume_session = False
resume_directory = None
if os.path.isdir(sessions_dir):
regex = re.compile(r'\d\d\d\d-\d\d-\d\d-\d\d-\d\d')
dir_content = os.listdir(sessions_dir)
session_files = filter(regex.search, dir_content)
session_files.sort(reverse=True)
for d in session_files:
interrupted_session_dir = os.path.join(sessions_dir, d, '.interrupted_session')
if os.path.isdir(interrupted_session_dir):
resume = raw_input('Interrupted session from {} found. Do you want to resume? (y/n) '.format(d))
if resume == 'y':
resume_session = True
resume_directory = interrupted_session_dir
break
return resume_session, resume_directory
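# Entry point of the generated trainer: configures logging, the ROS environment
# and the Q-network, builds (or resumes) a DQN agent and starts training.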
if __name__ == "__main__":
agent_name = 'reinforcement_agent'
# Prepare output directory and logger
all_output_dir = os.path.join('model', agent_name)
output_directory = os.path.join(
all_output_dir,
time.strftime('%Y-%m-%d-%H-%M-%S',
time.localtime(time.time())))
ArchLogger.set_output_directory(output_directory)
ArchLogger.set_logger_name(agent_name)
ArchLogger.set_output_level(ArchLogger.INFO)
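    # ROS topics used by the RosEnvironment. This test configuration exercises
    # the reward topic added by this commit: rewards are received on the
    # '/environment/reward' topic.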
env_params = {
'ros_node_name': 'reinforcementConfig3TrainerNode',
'state_topic': '/environment/state',
'action_topic': '/environment/action',
'reset_topic': '/environment/reset',
'reward_topic': '/environment/reward',
}
env = reinforcement_learning.environment.RosEnvironment(**env_params)
context = mx.cpu()
qnet_creator = CNNCreator_reinforcementConfig3.CNNCreator_reinforcementConfig3()
qnet_creator.construct(context)
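    # DQN hyperparameters: a replay buffer of 1,000,000 transitions sampled in
    # minibatches of 64, linearly decaying epsilon-greedy exploration, a fixed
    # target network updated every 500 steps, Huber loss, the Adam optimizer
    # and double DQN.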
agent_params = {
'environment': env,
'replay_memory_params': {
'method': 'buffer',
'memory_size': 1000000,
'sample_size': 64,
'state_dtype': 'float32',
'action_dtype': 'float32',
'rewards_dtype': 'float32'
},
'strategy_params': {
'method':'epsgreedy',
'epsilon': 1,
'min_epsilon': 0.02,
'epsilon_decay_method': 'linear',
'epsilon_decay': 0.0001,
},
'agent_name': agent_name,
'verbose': True,
'output_directory': output_directory,
'state_dim': (8,),
'action_dim': (3,),
'discount_factor': 0.99999,
'training_episodes': 1000,
'train_interval': 1,
'snapshot_interval': 500,
'max_episode_step': 10000,
'target_score': 35000,
'qnet':qnet_creator.net,
'use_fix_target': True,
'target_update_interval': 500,
'loss_function': 'huber_loss',
'optimizer': 'adam',
'optimizer_params': {
'learning_rate': 0.001 },
'double_dqn': True,
}
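    # Resume an interrupted session if one is found, otherwise create a fresh
    # agent. The signal handler is registered so the agent can snapshot its
    # state when training is interrupted.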
resume, resume_directory = resume_session(all_output_dir)
if resume:
output_directory, _ = os.path.split(resume_directory)
ArchLogger.set_output_directory(output_directory)
resume_agent_params = {
'session_dir': resume_directory,
'environment': env,
'net': qnet_creator.net,
}
agent = DqnAgent.resume_from_session(**resume_agent_params)
else:
agent = DqnAgent(**agent_params)
signal_handler = AgentSignalHandler()
signal_handler.register_agent(agent)
train_successful = agent.train()
if train_successful:
agent.save_best_network(qnet_creator._model_dir_ + qnet_creator._model_prefix_ + '_newest', epoch=0)
......
import mxnet as mx
import numpy as np
import time
import os
import sys
import util
import matplotlib.pyplot as plt
import pyprind
from cnnarch_logger import ArchLogger
from replay_memory import ReplayMemoryBuilder
from strategy import StrategyBuilder
from util import copy_net, get_loss_function,\
copy_net_with_two_inputs, DdpgTrainingStats, DqnTrainingStats,\
make_directory_if_not_exist
from mxnet import nd, gluon, autograd
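# Base class for the generated reinforcement learning agents. It wires up the
# replay memory, the exploration strategy and the logger, and provides the
# shared bookkeeping for training, evaluation, snapshots and session resuming.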
class Agent(object):
def __init__(
self,
environment,
replay_memory_params,
strategy_params,
state_dim,
action_dim,
ctx=None,
discount_factor=.9,
training_episodes=50,
train_interval=1,
start_training=0,
snapshot_interval=200,
agent_name='Agent',
max_episode_step=99999,
evaluation_samples=1000,
output_directory='model_parameters',
verbose=True,
target_score=None
):
assert 0 < discount_factor <= 1,\
'Discount factor must be between 0 and 1'
        assert train_interval > 0, 'Train interval must be greater than 0'
        assert snapshot_interval > 0, 'Snapshot interval must be greater than 0'
        assert max_episode_step > 0,\
            'Maximum steps per episode must be greater than 0'
        assert training_episodes > 0, 'Training episodes must be greater than 0'
assert replay_memory_params is not None,\
'Replay memory parameter not set'
assert type(state_dim) is tuple, 'State dimension is not a tuple'
assert type(action_dim) is tuple, 'Action dimension is not a tuple'
self._logger = ArchLogger.get_logger()
self._ctx = mx.gpu() if ctx == 'gpu' else mx.cpu()
self._environment = environment
self._discount_factor = discount_factor
self._training_episodes = training_episodes
self._train_interval = train_interval
self._verbose = verbose
self._state_dim = state_dim
replay_memory_params['state_dim'] = state_dim
replay_memory_params['action_dim'] = action_dim
self._replay_memory_params = replay_memory_params
rm_builder = ReplayMemoryBuilder()
self._memory = rm_builder.build_by_params(**replay_memory_params)
self._minibatch_size = self._memory.sample_size
self._action_dim = action_dim
strategy_params['action_dim'] = self._action_dim
self._strategy_params = strategy_params
strategy_builder = StrategyBuilder()
self._strategy = strategy_builder.build_by_params(**strategy_params)
self._agent_name = agent_name
self._snapshot_interval = snapshot_interval
self._creation_time = time.time()
self._max_episode_step = max_episode_step
self._start_training = start_training
self._output_directory = output_directory
self._target_score = target_score
self._evaluation_samples = evaluation_samples
self._best_avg_score = -np.infty
self._best_net = None
self._interrupt_flag = False
self._training_stats = None
# Training Context
self._current_episode = 0
self._total_steps = 0
@property
def current_episode(self):
return self._current_episode
@property
def environment(self):
return self._environment
def save_config_file(self):
import json
make_directory_if_not_exist(self._output_directory)
filename = os.path.join(self._output_directory, 'config.json')
config = self._make_config_dict()
with open(filename, mode='w') as fp:
json.dump(config, fp, indent=4)
def set_interrupt_flag(self, interrupt):
self._interrupt_flag = interrupt
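    # Persist the full agent state (pickled agent object plus network
    # parameters) into a '.interrupted_session' directory so that training can
    # be resumed later via resume_from_session.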
def _interrupt_training(self):
import pickle
        self._logger.info('Training interrupted; storing state for resuming')
session_dir = self._get_session_dir()
agent_session_file = os.path.join(session_dir, 'agent.p')
logger = self._logger
self._make_pickle_ready(session_dir)
with open(agent_session_file, 'wb') as f:
pickle.dump(self, f, protocol=2)
logger.info('State successfully stored')
def _make_pickle_ready(self, session_dir):
del self._training_stats.logger
self._logger = None
self._environment.close()
self._environment = None
self._save_net(self._best_net, 'best_net', session_dir)
self._best_net = None
def _make_config_dict(self):
config = dict()
config['state_dim'] = self._state_dim
config['action_dim'] = self._action_dim
config['ctx'] = str(self._ctx)
config['discount_factor'] = self._discount_factor
config['strategy_params'] = self._strategy_params
config['replay_memory_params'] = self._replay_memory_params
config['training_episodes'] = self._training_episodes
config['start_training'] = self._start_training
config['evaluation_samples'] = self._evaluation_samples
config['train_interval'] = self._train_interval
config['snapshot_interval'] = self._snapshot_interval
config['agent_name'] = self._agent_name
config['max_episode_step'] = self._max_episode_step
config['output_directory'] = self._output_directory
config['verbose'] = self._verbose
config['target_score'] = self._target_score
return config
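    # Map generator-style optimizer options (weight_decay, learning_rate_decay,
    # learning_rate_minimum, step_size) onto the corresponding MXNet arguments
    # (wd and an lr_scheduler built from a FactorScheduler).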
def _adjust_optimizer_params(self, optimizer_params):
if 'weight_decay' in optimizer_params:
optimizer_params['wd'] = optimizer_params['weight_decay']
del optimizer_params['weight_decay']
if 'learning_rate_decay' in optimizer_params:
min_learning_rate = 1e-8
if 'learning_rate_minimum' in optimizer_params:
min_learning_rate = optimizer_params['learning_rate_minimum']
del optimizer_params['learning_rate_minimum']
optimizer_params['lr_scheduler'] = mx.lr_scheduler.FactorScheduler(
optimizer_params['step_size'],
factor=optimizer_params['learning_rate_decay'],
stop_factor_lr=min_learning_rate)
del optimizer_params['step_size']
del optimizer_params['learning_rate_decay']
return optimizer_params
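    # Draw a random minibatch from the replay memory and convert it to
    # NDArrays on the agent's context.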
def _sample_from_memory(self):
states, actions, rewards, next_states, terminals\
= self._memory.sample(batch_size=self._minibatch_size)
states = nd.array(states, ctx=self._ctx)
actions = nd.array(actions, ctx=self._ctx)
rewards = nd.array(rewards, ctx=self._ctx)
next_states = nd.array(next_states, ctx=self._ctx)
terminals = nd.array(terminals, ctx=self._ctx)
return states, actions, rewards, next_states, terminals
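    # Play sample_games episodes with the agent's current policy and report the
    # average reward; if a target score is set, also report the percentage of
    # games in which it was reached.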
def evaluate(self, target=None, sample_games=100, verbose=True):
target = self._target_score if target is None else target
if target:
target_achieved = 0
total_reward = 0
self._logger.info('Sampling from {} games...'.format(sample_games))
for g in pyprind.prog_bar(range(sample_games)):
state = self._environment.reset()
step = 0
game_reward = 0
terminal = False
while not terminal and (step < self._max_episode_step):
action = self.get_next_action(state)
state, reward, terminal, _ = self._environment.step(action)
game_reward += reward
step += 1
if verbose:
info = 'Game %d: Reward %f' % (g, game_reward)
self._logger.debug(info)
if target:
if game_reward >= target:
target_achieved += 1
total_reward += game_reward
avg_reward = float(total_reward)/float(sample_games)
info = 'Avg. Reward: %f' % avg_reward
if target:
target_achieved_ratio = int(
(float(target_achieved)/float(sample_games))*100)
info += '; Target Achieved in %d%% of games'\
% (target_achieved_ratio)
if verbose:
self._logger.info(info)
return avg_reward
def _do_snapshot_if_in_interval(self, episode):
do_snapshot =\
(episode != 0 and (episode % self._snapshot_interval == 0))
if do_snapshot:
self.save_parameters(episode=episode)
self._evaluate()
def _evaluate(self, verbose=True):
avg_reward = self.evaluate(
sample_games=self._evaluation_samples, verbose=False)
info = 'Evaluation -> Average Reward in {} games: {}'.format(
self._evaluation_samples, avg_reward)
if self._best_avg_score is None or self._best_avg_score <= avg_reward:
self._save_current_as_best_net()
self._best_avg_score = avg_reward
if verbose:
self._logger.info(info)
def _is_target_reached(self, avg_reward):
return self._target_score is not None\
and avg_reward > self._target_score
def _do_training(self):
return (self._total_steps % self._train_interval == 0) and\
(self._memory.is_sample_possible(self._minibatch_size)) and\
(self._current_episode >= self._start_training)
def _check_interrupt_routine(self):
if self._interrupt_flag:
self._interrupt_flag = False
self._interrupt_training()
return True
return False
def _save_parameters(self, net, episode=None, filename='dqn-agent-params'):
assert self._output_directory
assert isinstance(net, gluon.HybridBlock)
make_directory_if_not_exist(self._output_directory)
if(episode is not None):
self._logger.info(
'Saving model parameters after episode %d' % episode)
filename = filename + '-ep{}'.format(episode)
else:
self._logger.info('Saving model parameters')
self._save_net(net, filename)
def _save_net(self, net, filename, filedir=None):
filedir = self._output_directory if filedir is None else filedir
filename = os.path.join(filedir, filename + '.params')
net.save_parameters(filename)
def save_best_network(self, path, epoch=0):
self._logger.info(
'Saving best network with average reward of {}'.format(
self._best_avg_score))
self._best_net.export(path, epoch=epoch)
def _get_session_dir(self):
session_dir = os.path.join(
self._output_directory, '.interrupted_session')
make_directory_if_not_exist(session_dir)
return session_dir
def _save_current_as_best_net(self):
raise NotImplementedError
def get_next_action(self, state):
raise NotImplementedError
def save_parameters(self, episode):
raise NotImplementedError
def train(self, episodes=None):
raise NotImplementedError
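# Deep Deterministic Policy Gradient (DDPG) agent: trains an actor and a critic
# network together with soft-updated target copies of both.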
class DdpgAgent(Agent):
def __init__(
self,
actor,
critic,
environment,
replay_memory_params,
strategy_params,
state_dim,
action_dim,
soft_target_update_rate=.001,
actor_optimizer='adam',
actor_optimizer_params={'learning_rate': 0.0001},
critic_optimizer='adam',
critic_optimizer_params={'learning_rate': 0.001},
ctx=None,
discount_factor=.9,
training_episodes=50,
start_training=20,
train_interval=1,
snapshot_interval=200,
agent_name='DdpgAgent',
max_episode_step=9999,
evaluation_samples=100,
output_directory='model_parameters',
verbose=True,
target_score=None
):
super(DdpgAgent, self).__init__(
environment=environment, replay_memory_params=replay_memory_params,
strategy_params=strategy_params, state_dim=state_dim,
action_dim=action_dim, ctx=ctx, discount_factor=discount_factor,
training_episodes=training_episodes, start_training=start_training,
train_interval=train_interval,
snapshot_interval=snapshot_interval, agent_name=agent_name,
max_episode_step=max_episode_step,
output_directory=output_directory, verbose=verbose,
target_score=target_score, evaluation_samples=evaluation_samples)
        assert critic is not None, 'Critic is not set'
        assert actor is not None, 'Actor is not set'
        assert soft_target_update_rate > 0,\
            'Soft target update rate must be greater than zero'
assert actor_optimizer is not None, 'No actor optimizer set'
assert critic_optimizer is not None, 'No critic optimizer set'
self._actor = actor
self._critic = critic
self._actor_target = self._copy_actor()
self._critic_target = self._copy_critic()
self._actor_optimizer = actor_optimizer
self._actor_optimizer_params = self._adjust_optimizer_params(
actor_optimizer_params)
self._critic_optimizer = critic_optimizer
self._critic_optimizer_params = self._adjust_optimizer_params(
critic_optimizer_params)
self._soft_target_update_rate = soft_target_update_rate
self._logger.info(
'Agent created with following parameters: {}'.format(
self._make_config_dict()))
self._best_net = self._copy_actor()
self._training_stats = DdpgTrainingStats(self._training_episodes)
def _make_pickle_ready(self, session_dir):
super(DdpgAgent, self)._make_pickle_ready(session_dir)
self._save_net(self._actor, 'actor', session_dir)
self._actor = None
self._save_net(self._critic, 'critic', session_dir)
self._critic = None
self._save_net(self._actor_target, 'actor_target', session_dir)
self._actor_target = None
self._save_net(self._critic_target, 'critic_target', session_dir)
self._critic_target = None
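    # Restore a pickled agent from an interrupted session directory and
    # re-attach the environment as well as the actor/critic networks with
    # their saved parameters.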
@classmethod
def resume_from_session(cls, session_dir, actor, critic, environment):
import pickle
if not os.path.exists(session_dir):
raise ValueError('Session directory does not exist')
files = dict()
files['agent'] = os.path.join(session_dir, 'agent.p')
files['best_net_params'] = os.path.join(session_dir, 'best_net.params')
files['actor_net_params'] = os.path.join(session_dir, 'actor.params')
files['actor_target_net_params'] = os.path.join(
session_dir, 'actor_target.params')
files['critic_net_params'] = os.path.join(session_dir, 'critic.params')
files['critic_target_net_params'] = os.path.join(
session_dir, 'critic_target.params')
for file in files.values():
if not os.path.exists(file):
raise ValueError(
'Session directory is not complete: {} is missing'
.format(file))
with open(files['agent'], 'rb') as f:
agent = pickle.load(f)
agent._environment = environment
agent._actor = actor
agent._actor.load_parameters(files['actor_net_params'], agent._ctx)
agent._actor.hybridize()
agent._actor(nd.random_normal(
shape=((1,) + agent._state_dim), ctx=agent._ctx))
agent._best_net = copy_net(agent._actor, agent._state_dim, agent._ctx)
agent._best_net.load_parameters(files['best_net_params'], agent._ctx)
agent._actor_target = copy_net(
agent._actor, agent._state_dim, agent._ctx)
agent._actor_target.load_parameters(files['actor_target_net_params'])
agent._critic =