diff --git a/src/main/resources/templates/gluon/reinforcement/Trainer.ftl b/src/main/resources/templates/gluon/reinforcement/Trainer.ftl index 5a43200c645a5a66c1d0670a1c156e046d6bbb80..57b2967c84e1b9a1fe8e8e781e6a684cad5c1964 100644 --- a/src/main/resources/templates/gluon/reinforcement/Trainer.ftl +++ b/src/main/resources/templates/gluon/reinforcement/Trainer.ftl @@ -110,14 +110,14 @@ if __name__ == "__main__": <#if config.rlAlgorithm == "dqn"> qnet_creator = CNNCreator_${config.instanceName}.CNNCreator_${config.instanceName}() qnet_creator.setWeightInitializer(initializer) - qnet_creator.construct(context) + qnet_creator.construct([context]) <#elseif config.rlAlgorithm=="ddpg" || config.rlAlgorithm=="td3"> actor_creator = CNNCreator_${config.instanceName}.CNNCreator_${config.instanceName}() actor_creator.setWeightInitializer(initializer) - actor_creator.construct(context) + actor_creator.construct([context]) critic_creator = CNNCreator_${criticInstanceName}() critic_creator.setWeightInitializer(critic_initializer) - critic_creator.construct(context) + critic_creator.construct([context]) agent_params = { @@ -157,6 +157,9 @@ if __name__ == "__main__": <#if (config.evaluationSamples)??> 'evaluation_samples': ${config.evaluationSamples}, +<#if (config.selfPlay)??> + 'self_play': '${config.selfPlay}', + <#if (config.outputDirectory)??> 'output_directory': ${config.outputDirectory}, @@ -201,4 +204,4 @@ if __name__ == "__main__": agent.export_best_network(path=qnet_creator._model_dir_ + qnet_creator._model_prefix_ + '_0_newest', epoch=0) <#else> agent.export_best_network(path=actor_creator._model_dir_ + actor_creator._model_prefix_ + '_0_newest', epoch=0) - \ No newline at end of file + diff --git a/src/main/resources/templates/gluon/reinforcement/agent/Agent.ftl b/src/main/resources/templates/gluon/reinforcement/agent/Agent.ftl index bf4690a41e038d9d537722e9cae418fd7204f525..628e1e9496b73042c2c0437c1ca61d6b056bfd0d 100644 --- a/src/main/resources/templates/gluon/reinforcement/agent/Agent.ftl +++ b/src/main/resources/templates/gluon/reinforcement/agent/Agent.ftl @@ -5,6 +5,8 @@ import mxnet as mx import numpy as np import time import os +import subprocess #library to call shell script +import shlex #library to pass variable to shell script import sys import util import matplotlib.pyplot as plt @@ -35,6 +37,7 @@ class Agent(object): agent_name='Agent', max_episode_step=99999, evaluation_samples=1000, + self_play='no', output_directory='model_parameters', verbose=True, target_score=None @@ -81,6 +84,7 @@ class Agent(object): self._target_score = target_score self._evaluation_samples = evaluation_samples + self._self_play = self_play self._best_avg_score = -np.infty self._best_net = None @@ -144,6 +148,7 @@ class Agent(object): config['training_episodes'] = self._training_episodes config['start_training'] = self._start_training config['evaluation_samples'] = self._evaluation_samples + config['self_play'] = self._self_play config['train_interval'] = self._train_interval config['snapshot_interval'] = self._snapshot_interval config['agent_name'] = self._agent_name @@ -228,7 +233,19 @@ class Agent(object): if do_snapshot: self.save_parameters(episode=episode) self._evaluate() - + if self._self_play == 'yes': + print("Self Play activated") + path = os.getcwd() + file = os.path.join(os.getcwd(), os.listdir(os.getcwd())[0]) + home_path = os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(file)))))) + os.chdir(home_path) + snapshot_param = 
str(self._snapshot_interval) + subprocess.check_call(['./update_agent.sh', snapshot_param]) + os.chdir(path) + elif self._self_play == 'no': + print("Self play deactivated") + + def _evaluate(self, verbose=True): avg_reward = self.evaluate( sample_games=self._evaluation_samples, verbose=False) @@ -325,6 +342,7 @@ class DdpgAgent(Agent): agent_name='DdpgAgent', max_episode_step=9999, evaluation_samples=100, + self_play='no', output_directory='model_parameters', verbose=True, target_score=None @@ -338,7 +356,7 @@ class DdpgAgent(Agent): snapshot_interval=snapshot_interval, agent_name=agent_name, max_episode_step=max_episode_step, output_directory=output_directory, verbose=verbose, - target_score=target_score, evaluation_samples=evaluation_samples) + target_score=target_score, evaluation_samples=evaluation_samples, self_play=self_play) assert critic is not None, 'Critic not set' assert actor is not None, 'Actor is not set' assert soft_target_update_rate > 0,\ @@ -447,7 +465,7 @@ class DdpgAgent(Agent): def get_next_action(self, state): action = self._actor(nd.array([state], ctx=self._ctx)) - return action[0].asnumpy() + return action[0][0][0].asnumpy() def save_parameters(self, episode): self._export_net( @@ -527,19 +545,19 @@ class DdpgAgent(Agent): actor_target_actions = self._actor_target(next_states) critic_target_qvalues = self._critic_target( - next_states, actor_target_actions) + next_states, actor_target_actions[0][0]) rewards = rewards.reshape(self._minibatch_size, 1) terminals = terminals.reshape(self._minibatch_size, 1) # y = r_i + discount * Q'(s_(i+1), mu'(s_(i+1))) y = rewards + (1.0 - terminals) * self._discount_factor\ - * critic_target_qvalues + * critic_target_qvalues[0][0] # Train the critic network with autograd.record(): qvalues = self._critic(states, actions) - critic_loss = l2_loss(qvalues, y) + critic_loss = l2_loss(qvalues[0][0], y) critic_loss.backward() trainer_critic.step(self._minibatch_size) @@ -551,7 +569,7 @@ class DdpgAgent(Agent): # For maximizing qvalues we have to multiply with -1 # as we use a minimizer actor_loss = -tmp_critic( - states, self._actor(states)).mean() + states, self._actor(states)[0][0])[0][0].mean() actor_loss.backward() trainer_actor.step(self._minibatch_size) @@ -569,7 +587,7 @@ class DdpgAgent(Agent): episode_actor_loss +=\ np.sum(actor_loss.asnumpy()) / self._minibatch_size episode_avg_q_value +=\ - np.sum(qvalues.asnumpy()) / self._minibatch_size + np.sum(qvalues[0][0].asnumpy()) / self._minibatch_size training_steps += 1 @@ -667,6 +685,7 @@ class TwinDelayedDdpgAgent(DdpgAgent): agent_name='DdpgAgent', max_episode_step=9999, evaluation_samples=100, + self_play='no', output_directory='model_parameters', verbose=True, target_score=None, @@ -684,6 +703,7 @@ class TwinDelayedDdpgAgent(DdpgAgent): max_episode_step=max_episode_step, output_directory=output_directory, verbose=verbose, target_score=target_score, evaluation_samples=evaluation_samples, + self_play=self_play, critic=critic, soft_target_update_rate=soft_target_update_rate, actor=actor, actor_optimizer=actor_optimizer, actor_optimizer_params=actor_optimizer_params, @@ -886,7 +906,7 @@ class TwinDelayedDdpgAgent(DdpgAgent): ctx=self._ctx ) target_action = np.clip( - self._actor_target(next_states) + clipped_noise, + self._actor_target(next_states)[0][0] + clipped_noise, self._strategy._action_low, self._strategy._action_high) @@ -894,9 +914,9 @@ class TwinDelayedDdpgAgent(DdpgAgent): terminals = terminals.reshape(self._minibatch_size, 1) target_qvalues1 = 
self._critic_target(next_states, - target_action) + target_action)[0][0] target_qvalues2 = self._critic2_target(next_states, - target_action) + target_action)[0][0] target_qvalues = nd.minimum(target_qvalues1, target_qvalues2) y = rewards + (1 - terminals) * self._discount_factor\ @@ -904,13 +924,13 @@ class TwinDelayedDdpgAgent(DdpgAgent): with autograd.record(): qvalues1 = self._critic(states, actions) - critic1_loss = l2_loss(qvalues1, y) + critic1_loss = l2_loss(qvalues1[0][0], y) critic1_loss.backward() trainer_critic.step(self._minibatch_size) with autograd.record(): qvalues2 = self._critic2(states, actions) - critic2_loss = l2_loss(qvalues2, y) + critic2_loss = l2_loss(qvalues2[0][0], y) critic2_loss.backward() trainer_critic2.step(self._minibatch_size) @@ -920,7 +940,7 @@ class TwinDelayedDdpgAgent(DdpgAgent): tmp_critic = self._copy_critic() with autograd.record(): actor_loss = -tmp_critic( - states, self._actor(states)).mean() + states, self._actor(states)[0][0])[0][0].mean() actor_loss.backward() trainer_actor.step(self._minibatch_size) @@ -1018,6 +1038,7 @@ class DqnAgent(Agent): target_update_interval=10, snapshot_interval=200, evaluation_samples=100, + self_play='no', agent_name='Dqn_agent', max_episode_step=99999, output_directory='model_parameters', @@ -1033,7 +1054,8 @@ class DqnAgent(Agent): snapshot_interval=snapshot_interval, agent_name=agent_name, max_episode_step=max_episode_step, output_directory=output_directory, verbose=verbose, - target_score=target_score, evaluation_samples=evaluation_samples) + target_score=target_score, evaluation_samples=evaluation_samples, + self_play=self_play) self._qnet = qnet self._target_update_interval = target_update_interval diff --git a/src/test/resources/target_code/ReinforcementConfig1/reinforcement_learning/agent.py b/src/test/resources/target_code/ReinforcementConfig1/reinforcement_learning/agent.py index 027af51e2c15058f27ed83ceff8bf07b95e3ccc6..7eb80333c96885860e05269674e24f70285a5511 100644 --- a/src/test/resources/target_code/ReinforcementConfig1/reinforcement_learning/agent.py +++ b/src/test/resources/target_code/ReinforcementConfig1/reinforcement_learning/agent.py @@ -3,6 +3,8 @@ import mxnet as mx import numpy as np import time import os +import subprocess #library to call shell script +import shlex #library to pass variable to shell script import sys import util import matplotlib.pyplot as plt @@ -33,6 +35,7 @@ class Agent(object): agent_name='Agent', max_episode_step=99999, evaluation_samples=1000, + self_play='no', output_directory='model_parameters', verbose=True, target_score=None @@ -79,6 +82,7 @@ class Agent(object): self._target_score = target_score self._evaluation_samples = evaluation_samples + self._self_play = self_play self._best_avg_score = -np.infty self._best_net = None @@ -142,6 +146,7 @@ class Agent(object): config['training_episodes'] = self._training_episodes config['start_training'] = self._start_training config['evaluation_samples'] = self._evaluation_samples + config['self_play'] = self._self_play config['train_interval'] = self._train_interval config['snapshot_interval'] = self._snapshot_interval config['agent_name'] = self._agent_name @@ -226,7 +231,19 @@ class Agent(object): if do_snapshot: self.save_parameters(episode=episode) self._evaluate() - + if self._self_play == 'yes': + print("Self Play activated") + path = os.getcwd() + file = os.path.join(os.getcwd(), os.listdir(os.getcwd())[0]) + home_path = 
os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(file)))))) + os.chdir(home_path) + snapshot_param = str(self._snapshot_interval) + subprocess.check_call(['./update_agent.sh', snapshot_param]) + os.chdir(path) + elif self._self_play == 'no': + print("Self play deactivated") + + def _evaluate(self, verbose=True): avg_reward = self.evaluate( sample_games=self._evaluation_samples, verbose=False) @@ -323,6 +340,7 @@ class DdpgAgent(Agent): agent_name='DdpgAgent', max_episode_step=9999, evaluation_samples=100, + self_play='no', output_directory='model_parameters', verbose=True, target_score=None @@ -336,7 +354,7 @@ class DdpgAgent(Agent): snapshot_interval=snapshot_interval, agent_name=agent_name, max_episode_step=max_episode_step, output_directory=output_directory, verbose=verbose, - target_score=target_score, evaluation_samples=evaluation_samples) + target_score=target_score, evaluation_samples=evaluation_samples, self_play=self_play) assert critic is not None, 'Critic not set' assert actor is not None, 'Actor is not set' assert soft_target_update_rate > 0,\ @@ -445,7 +463,7 @@ class DdpgAgent(Agent): def get_next_action(self, state): action = self._actor(nd.array([state], ctx=self._ctx)) - return action[0].asnumpy() + return action[0][0][0].asnumpy() def save_parameters(self, episode): self._export_net( @@ -665,6 +683,7 @@ class TwinDelayedDdpgAgent(DdpgAgent): agent_name='DdpgAgent', max_episode_step=9999, evaluation_samples=100, + self_play='no', output_directory='model_parameters', verbose=True, target_score=None, @@ -682,6 +701,7 @@ class TwinDelayedDdpgAgent(DdpgAgent): max_episode_step=max_episode_step, output_directory=output_directory, verbose=verbose, target_score=target_score, evaluation_samples=evaluation_samples, + self_play=self_play, critic=critic, soft_target_update_rate=soft_target_update_rate, actor=actor, actor_optimizer=actor_optimizer, actor_optimizer_params=actor_optimizer_params, @@ -884,7 +904,7 @@ class TwinDelayedDdpgAgent(DdpgAgent): ctx=self._ctx ) target_action = np.clip( - self._actor_target(next_states) + clipped_noise, + self._actor_target(next_states)[0][0] + clipped_noise, self._strategy._action_low, self._strategy._action_high) @@ -892,9 +912,9 @@ class TwinDelayedDdpgAgent(DdpgAgent): terminals = terminals.reshape(self._minibatch_size, 1) target_qvalues1 = self._critic_target(next_states, - target_action) + target_action)[0][0] target_qvalues2 = self._critic2_target(next_states, - target_action) + target_action)[0][0] target_qvalues = nd.minimum(target_qvalues1, target_qvalues2) y = rewards + (1 - terminals) * self._discount_factor\ @@ -902,13 +922,13 @@ class TwinDelayedDdpgAgent(DdpgAgent): with autograd.record(): qvalues1 = self._critic(states, actions) - critic1_loss = l2_loss(qvalues1, y) + critic1_loss = l2_loss(qvalues1[0][0], y) critic1_loss.backward() trainer_critic.step(self._minibatch_size) with autograd.record(): qvalues2 = self._critic2(states, actions) - critic2_loss = l2_loss(qvalues2, y) + critic2_loss = l2_loss(qvalues2[0][0], y) critic2_loss.backward() trainer_critic2.step(self._minibatch_size) @@ -918,7 +938,7 @@ class TwinDelayedDdpgAgent(DdpgAgent): tmp_critic = self._copy_critic() with autograd.record(): actor_loss = -tmp_critic( - states, self._actor(states)).mean() + states, self._actor(states)[0][0])[0][0].mean() actor_loss.backward() trainer_actor.step(self._minibatch_size) @@ -1016,6 +1036,7 @@ class DqnAgent(Agent): target_update_interval=10, snapshot_interval=200, 
evaluation_samples=100, + self_play='no', agent_name='Dqn_agent', max_episode_step=99999, output_directory='model_parameters', @@ -1031,7 +1052,8 @@ class DqnAgent(Agent): snapshot_interval=snapshot_interval, agent_name=agent_name, max_episode_step=max_episode_step, output_directory=output_directory, verbose=verbose, - target_score=target_score, evaluation_samples=evaluation_samples) + target_score=target_score, evaluation_samples=evaluation_samples, + self_play=self_play) self._qnet = qnet self._target_update_interval = target_update_interval diff --git a/src/test/resources/target_code/ReinforcementConfig2/CNNTrainer_reinforcementConfig2.py b/src/test/resources/target_code/ReinforcementConfig2/CNNTrainer_reinforcementConfig2.py index 60d238c8722686beab1123e92c03aa31c5ad2afc..2471b5c34a8fe6cd145960eb832674d2bccb44bb 100644 --- a/src/test/resources/target_code/ReinforcementConfig2/CNNTrainer_reinforcementConfig2.py +++ b/src/test/resources/target_code/ReinforcementConfig2/CNNTrainer_reinforcementConfig2.py @@ -50,7 +50,7 @@ if __name__ == "__main__": initializer = mx.init.Normal() qnet_creator = CNNCreator_reinforcementConfig2.CNNCreator_reinforcementConfig2() qnet_creator.setWeightInitializer(initializer) - qnet_creator.construct(context) + qnet_creator.construct([context]) agent_params = { 'environment': env, diff --git a/src/test/resources/target_code/ReinforcementConfig2/reinforcement_learning/agent.py b/src/test/resources/target_code/ReinforcementConfig2/reinforcement_learning/agent.py index 027af51e2c15058f27ed83ceff8bf07b95e3ccc6..e4bf6072e6027bdec7eceec19d187f1e805500b7 100644 --- a/src/test/resources/target_code/ReinforcementConfig2/reinforcement_learning/agent.py +++ b/src/test/resources/target_code/ReinforcementConfig2/reinforcement_learning/agent.py @@ -3,6 +3,8 @@ import mxnet as mx import numpy as np import time import os +import subprocess #library to call shell script +import shlex #library to pass variable to shell script import sys import util import matplotlib.pyplot as plt @@ -33,6 +35,7 @@ class Agent(object): agent_name='Agent', max_episode_step=99999, evaluation_samples=1000, + self_play='no', output_directory='model_parameters', verbose=True, target_score=None @@ -79,6 +82,7 @@ class Agent(object): self._target_score = target_score self._evaluation_samples = evaluation_samples + self._self_play = self_play self._best_avg_score = -np.infty self._best_net = None @@ -142,6 +146,7 @@ class Agent(object): config['training_episodes'] = self._training_episodes config['start_training'] = self._start_training config['evaluation_samples'] = self._evaluation_samples + config['self_play'] = self._self_play config['train_interval'] = self._train_interval config['snapshot_interval'] = self._snapshot_interval config['agent_name'] = self._agent_name @@ -226,7 +231,19 @@ class Agent(object): if do_snapshot: self.save_parameters(episode=episode) self._evaluate() - + if self._self_play == 'yes': + print("Self Play activated") + path = os.getcwd() + file = os.path.join(os.getcwd(), os.listdir(os.getcwd())[0]) + home_path = os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(file)))))) + os.chdir(home_path) + snapshot_param = str(self._snapshot_interval) + subprocess.check_call(['./update_agent.sh', snapshot_param]) + os.chdir(path) + elif self._self_play == 'no': + print("Self play deactivated") + + def _evaluate(self, verbose=True): avg_reward = self.evaluate( sample_games=self._evaluation_samples, verbose=False) @@ -323,6 +340,7 @@ class 
DdpgAgent(Agent): agent_name='DdpgAgent', max_episode_step=9999, evaluation_samples=100, + self_play='no', output_directory='model_parameters', verbose=True, target_score=None @@ -336,7 +354,7 @@ class DdpgAgent(Agent): snapshot_interval=snapshot_interval, agent_name=agent_name, max_episode_step=max_episode_step, output_directory=output_directory, verbose=verbose, - target_score=target_score, evaluation_samples=evaluation_samples) + target_score=target_score, evaluation_samples=evaluation_samples, self_play=self_play) assert critic is not None, 'Critic not set' assert actor is not None, 'Actor is not set' assert soft_target_update_rate > 0,\ @@ -445,7 +463,7 @@ class DdpgAgent(Agent): def get_next_action(self, state): action = self._actor(nd.array([state], ctx=self._ctx)) - return action[0].asnumpy() + return action[0][0][0].asnumpy() def save_parameters(self, episode): self._export_net( @@ -525,19 +543,19 @@ class DdpgAgent(Agent): actor_target_actions = self._actor_target(next_states) critic_target_qvalues = self._critic_target( - next_states, actor_target_actions) + next_states, actor_target_actions[0][0]) rewards = rewards.reshape(self._minibatch_size, 1) terminals = terminals.reshape(self._minibatch_size, 1) # y = r_i + discount * Q'(s_(i+1), mu'(s_(i+1))) y = rewards + (1.0 - terminals) * self._discount_factor\ - * critic_target_qvalues + * critic_target_qvalues[0][0] # Train the critic network with autograd.record(): qvalues = self._critic(states, actions) - critic_loss = l2_loss(qvalues, y) + critic_loss = l2_loss(qvalues[0][0], y) critic_loss.backward() trainer_critic.step(self._minibatch_size) @@ -549,7 +567,7 @@ class DdpgAgent(Agent): # For maximizing qvalues we have to multiply with -1 # as we use a minimizer actor_loss = -tmp_critic( - states, self._actor(states)).mean() + states, self._actor(states)[0][0])[0][0].mean() actor_loss.backward() trainer_actor.step(self._minibatch_size) @@ -567,7 +585,7 @@ class DdpgAgent(Agent): episode_actor_loss +=\ np.sum(actor_loss.asnumpy()) / self._minibatch_size episode_avg_q_value +=\ - np.sum(qvalues.asnumpy()) / self._minibatch_size + np.sum(qvalues[0][0].asnumpy()) / self._minibatch_size training_steps += 1 @@ -665,6 +683,7 @@ class TwinDelayedDdpgAgent(DdpgAgent): agent_name='DdpgAgent', max_episode_step=9999, evaluation_samples=100, + self_play='no', output_directory='model_parameters', verbose=True, target_score=None, @@ -682,6 +701,7 @@ class TwinDelayedDdpgAgent(DdpgAgent): max_episode_step=max_episode_step, output_directory=output_directory, verbose=verbose, target_score=target_score, evaluation_samples=evaluation_samples, + self_play=self_play, critic=critic, soft_target_update_rate=soft_target_update_rate, actor=actor, actor_optimizer=actor_optimizer, actor_optimizer_params=actor_optimizer_params, @@ -884,7 +904,7 @@ class TwinDelayedDdpgAgent(DdpgAgent): ctx=self._ctx ) target_action = np.clip( - self._actor_target(next_states) + clipped_noise, + self._actor_target(next_states)[0][0] + clipped_noise, self._strategy._action_low, self._strategy._action_high) @@ -892,9 +912,9 @@ class TwinDelayedDdpgAgent(DdpgAgent): terminals = terminals.reshape(self._minibatch_size, 1) target_qvalues1 = self._critic_target(next_states, - target_action) + target_action)[0][0] target_qvalues2 = self._critic2_target(next_states, - target_action) + target_action)[0][0] target_qvalues = nd.minimum(target_qvalues1, target_qvalues2) y = rewards + (1 - terminals) * self._discount_factor\ @@ -902,13 +922,13 @@ class TwinDelayedDdpgAgent(DdpgAgent): with 
autograd.record(): qvalues1 = self._critic(states, actions) - critic1_loss = l2_loss(qvalues1, y) + critic1_loss = l2_loss(qvalues1[0][0], y) critic1_loss.backward() trainer_critic.step(self._minibatch_size) with autograd.record(): qvalues2 = self._critic2(states, actions) - critic2_loss = l2_loss(qvalues2, y) + critic2_loss = l2_loss(qvalues2[0][0], y) critic2_loss.backward() trainer_critic2.step(self._minibatch_size) @@ -918,7 +938,7 @@ class TwinDelayedDdpgAgent(DdpgAgent): tmp_critic = self._copy_critic() with autograd.record(): actor_loss = -tmp_critic( - states, self._actor(states)).mean() + states, self._actor(states)[0][0])[0][0].mean() actor_loss.backward() trainer_actor.step(self._minibatch_size) @@ -1016,6 +1036,7 @@ class DqnAgent(Agent): target_update_interval=10, snapshot_interval=200, evaluation_samples=100, + self_play='no', agent_name='Dqn_agent', max_episode_step=99999, output_directory='model_parameters', @@ -1031,7 +1052,8 @@ class DqnAgent(Agent): snapshot_interval=snapshot_interval, agent_name=agent_name, max_episode_step=max_episode_step, output_directory=output_directory, verbose=verbose, - target_score=target_score, evaluation_samples=evaluation_samples) + target_score=target_score, evaluation_samples=evaluation_samples, + self_play=self_play) self._qnet = qnet self._target_update_interval = target_update_interval diff --git a/src/test/resources/target_code/ReinforcementConfig3/CNNTrainer_reinforcementConfig3.py b/src/test/resources/target_code/ReinforcementConfig3/CNNTrainer_reinforcementConfig3.py index ceaf617306654badf2bc95e8495bb7954c920186..3f7505e188b3dba64dbe1a869e187086090d71f0 100644 --- a/src/test/resources/target_code/ReinforcementConfig3/CNNTrainer_reinforcementConfig3.py +++ b/src/test/resources/target_code/ReinforcementConfig3/CNNTrainer_reinforcementConfig3.py @@ -57,7 +57,7 @@ if __name__ == "__main__": initializer = mx.init.Normal() qnet_creator = CNNCreator_reinforcementConfig3.CNNCreator_reinforcementConfig3() qnet_creator.setWeightInitializer(initializer) - qnet_creator.construct(context) + qnet_creator.construct([context]) agent_params = { 'environment': env, diff --git a/src/test/resources/target_code/ReinforcementConfig3/reinforcement_learning/agent.py b/src/test/resources/target_code/ReinforcementConfig3/reinforcement_learning/agent.py index 027af51e2c15058f27ed83ceff8bf07b95e3ccc6..e4bf6072e6027bdec7eceec19d187f1e805500b7 100644 --- a/src/test/resources/target_code/ReinforcementConfig3/reinforcement_learning/agent.py +++ b/src/test/resources/target_code/ReinforcementConfig3/reinforcement_learning/agent.py @@ -3,6 +3,8 @@ import mxnet as mx import numpy as np import time import os +import subprocess #library to call shell script +import shlex #library to pass variable to shell script import sys import util import matplotlib.pyplot as plt @@ -33,6 +35,7 @@ class Agent(object): agent_name='Agent', max_episode_step=99999, evaluation_samples=1000, + self_play='no', output_directory='model_parameters', verbose=True, target_score=None @@ -79,6 +82,7 @@ class Agent(object): self._target_score = target_score self._evaluation_samples = evaluation_samples + self._self_play = self_play self._best_avg_score = -np.infty self._best_net = None @@ -142,6 +146,7 @@ class Agent(object): config['training_episodes'] = self._training_episodes config['start_training'] = self._start_training config['evaluation_samples'] = self._evaluation_samples + config['self_play'] = self._self_play config['train_interval'] = self._train_interval config['snapshot_interval'] = 
self._snapshot_interval config['agent_name'] = self._agent_name @@ -226,7 +231,19 @@ class Agent(object): if do_snapshot: self.save_parameters(episode=episode) self._evaluate() - + if self._self_play == 'yes': + print("Self Play activated") + path = os.getcwd() + file = os.path.join(os.getcwd(), os.listdir(os.getcwd())[0]) + home_path = os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(file)))))) + os.chdir(home_path) + snapshot_param = str(self._snapshot_interval) + subprocess.check_call(['./update_agent.sh', snapshot_param]) + os.chdir(path) + elif self._self_play == 'no': + print("Self play deactivated") + + def _evaluate(self, verbose=True): avg_reward = self.evaluate( sample_games=self._evaluation_samples, verbose=False) @@ -323,6 +340,7 @@ class DdpgAgent(Agent): agent_name='DdpgAgent', max_episode_step=9999, evaluation_samples=100, + self_play='no', output_directory='model_parameters', verbose=True, target_score=None @@ -336,7 +354,7 @@ class DdpgAgent(Agent): snapshot_interval=snapshot_interval, agent_name=agent_name, max_episode_step=max_episode_step, output_directory=output_directory, verbose=verbose, - target_score=target_score, evaluation_samples=evaluation_samples) + target_score=target_score, evaluation_samples=evaluation_samples, self_play=self_play) assert critic is not None, 'Critic not set' assert actor is not None, 'Actor is not set' assert soft_target_update_rate > 0,\ @@ -445,7 +463,7 @@ class DdpgAgent(Agent): def get_next_action(self, state): action = self._actor(nd.array([state], ctx=self._ctx)) - return action[0].asnumpy() + return action[0][0][0].asnumpy() def save_parameters(self, episode): self._export_net( @@ -525,19 +543,19 @@ class DdpgAgent(Agent): actor_target_actions = self._actor_target(next_states) critic_target_qvalues = self._critic_target( - next_states, actor_target_actions) + next_states, actor_target_actions[0][0]) rewards = rewards.reshape(self._minibatch_size, 1) terminals = terminals.reshape(self._minibatch_size, 1) # y = r_i + discount * Q'(s_(i+1), mu'(s_(i+1))) y = rewards + (1.0 - terminals) * self._discount_factor\ - * critic_target_qvalues + * critic_target_qvalues[0][0] # Train the critic network with autograd.record(): qvalues = self._critic(states, actions) - critic_loss = l2_loss(qvalues, y) + critic_loss = l2_loss(qvalues[0][0], y) critic_loss.backward() trainer_critic.step(self._minibatch_size) @@ -549,7 +567,7 @@ class DdpgAgent(Agent): # For maximizing qvalues we have to multiply with -1 # as we use a minimizer actor_loss = -tmp_critic( - states, self._actor(states)).mean() + states, self._actor(states)[0][0])[0][0].mean() actor_loss.backward() trainer_actor.step(self._minibatch_size) @@ -567,7 +585,7 @@ class DdpgAgent(Agent): episode_actor_loss +=\ np.sum(actor_loss.asnumpy()) / self._minibatch_size episode_avg_q_value +=\ - np.sum(qvalues.asnumpy()) / self._minibatch_size + np.sum(qvalues[0][0].asnumpy()) / self._minibatch_size training_steps += 1 @@ -665,6 +683,7 @@ class TwinDelayedDdpgAgent(DdpgAgent): agent_name='DdpgAgent', max_episode_step=9999, evaluation_samples=100, + self_play='no', output_directory='model_parameters', verbose=True, target_score=None, @@ -682,6 +701,7 @@ class TwinDelayedDdpgAgent(DdpgAgent): max_episode_step=max_episode_step, output_directory=output_directory, verbose=verbose, target_score=target_score, evaluation_samples=evaluation_samples, + self_play=self_play, critic=critic, soft_target_update_rate=soft_target_update_rate, actor=actor, 
actor_optimizer=actor_optimizer, actor_optimizer_params=actor_optimizer_params, @@ -884,7 +904,7 @@ class TwinDelayedDdpgAgent(DdpgAgent): ctx=self._ctx ) target_action = np.clip( - self._actor_target(next_states) + clipped_noise, + self._actor_target(next_states)[0][0] + clipped_noise, self._strategy._action_low, self._strategy._action_high) @@ -892,9 +912,9 @@ class TwinDelayedDdpgAgent(DdpgAgent): terminals = terminals.reshape(self._minibatch_size, 1) target_qvalues1 = self._critic_target(next_states, - target_action) + target_action)[0][0] target_qvalues2 = self._critic2_target(next_states, - target_action) + target_action)[0][0] target_qvalues = nd.minimum(target_qvalues1, target_qvalues2) y = rewards + (1 - terminals) * self._discount_factor\ @@ -902,13 +922,13 @@ class TwinDelayedDdpgAgent(DdpgAgent): with autograd.record(): qvalues1 = self._critic(states, actions) - critic1_loss = l2_loss(qvalues1, y) + critic1_loss = l2_loss(qvalues1[0][0], y) critic1_loss.backward() trainer_critic.step(self._minibatch_size) with autograd.record(): qvalues2 = self._critic2(states, actions) - critic2_loss = l2_loss(qvalues2, y) + critic2_loss = l2_loss(qvalues2[0][0], y) critic2_loss.backward() trainer_critic2.step(self._minibatch_size) @@ -918,7 +938,7 @@ class TwinDelayedDdpgAgent(DdpgAgent): tmp_critic = self._copy_critic() with autograd.record(): actor_loss = -tmp_critic( - states, self._actor(states)).mean() + states, self._actor(states)[0][0])[0][0].mean() actor_loss.backward() trainer_actor.step(self._minibatch_size) @@ -1016,6 +1036,7 @@ class DqnAgent(Agent): target_update_interval=10, snapshot_interval=200, evaluation_samples=100, + self_play='no', agent_name='Dqn_agent', max_episode_step=99999, output_directory='model_parameters', @@ -1031,7 +1052,8 @@ class DqnAgent(Agent): snapshot_interval=snapshot_interval, agent_name=agent_name, max_episode_step=max_episode_step, output_directory=output_directory, verbose=verbose, - target_score=target_score, evaluation_samples=evaluation_samples) + target_score=target_score, evaluation_samples=evaluation_samples, + self_play=self_play) self._qnet = qnet self._target_update_interval = target_update_interval diff --git a/src/test/resources/target_code/ddpg/CNNTrainer_actorNetwork.py b/src/test/resources/target_code/ddpg/CNNTrainer_actorNetwork.py index 1553269ed814da0567436590654a9c87178a5a68..fce2601137c16bc95c715c9fbc7ce126c8757656 100644 --- a/src/test/resources/target_code/ddpg/CNNTrainer_actorNetwork.py +++ b/src/test/resources/target_code/ddpg/CNNTrainer_actorNetwork.py @@ -52,10 +52,10 @@ if __name__ == "__main__": critic_initializer = mx.init.Normal() actor_creator = CNNCreator_actorNetwork.CNNCreator_actorNetwork() actor_creator.setWeightInitializer(initializer) - actor_creator.construct(context) + actor_creator.construct([context]) critic_creator = CNNCreator_CriticNetwork() critic_creator.setWeightInitializer(critic_initializer) - critic_creator.construct(context) + critic_creator.construct([context]) agent_params = { 'environment': env, diff --git a/src/test/resources/target_code/ddpg/reinforcement_learning/agent.py b/src/test/resources/target_code/ddpg/reinforcement_learning/agent.py index 027af51e2c15058f27ed83ceff8bf07b95e3ccc6..e4bf6072e6027bdec7eceec19d187f1e805500b7 100644 --- a/src/test/resources/target_code/ddpg/reinforcement_learning/agent.py +++ b/src/test/resources/target_code/ddpg/reinforcement_learning/agent.py @@ -3,6 +3,8 @@ import mxnet as mx import numpy as np import time import os +import subprocess #library to call 
shell script +import shlex #library to pass variable to shell script import sys import util import matplotlib.pyplot as plt @@ -33,6 +35,7 @@ class Agent(object): agent_name='Agent', max_episode_step=99999, evaluation_samples=1000, + self_play='no', output_directory='model_parameters', verbose=True, target_score=None @@ -79,6 +82,7 @@ class Agent(object): self._target_score = target_score self._evaluation_samples = evaluation_samples + self._self_play = self_play self._best_avg_score = -np.infty self._best_net = None @@ -142,6 +146,7 @@ class Agent(object): config['training_episodes'] = self._training_episodes config['start_training'] = self._start_training config['evaluation_samples'] = self._evaluation_samples + config['self_play'] = self._self_play config['train_interval'] = self._train_interval config['snapshot_interval'] = self._snapshot_interval config['agent_name'] = self._agent_name @@ -226,7 +231,19 @@ class Agent(object): if do_snapshot: self.save_parameters(episode=episode) self._evaluate() - + if self._self_play == 'yes': + print("Self Play activated") + path = os.getcwd() + file = os.path.join(os.getcwd(), os.listdir(os.getcwd())[0]) + home_path = os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(file)))))) + os.chdir(home_path) + snapshot_param = str(self._snapshot_interval) + subprocess.check_call(['./update_agent.sh', snapshot_param]) + os.chdir(path) + elif self._self_play == 'no': + print("Self play deactivated") + + def _evaluate(self, verbose=True): avg_reward = self.evaluate( sample_games=self._evaluation_samples, verbose=False) @@ -323,6 +340,7 @@ class DdpgAgent(Agent): agent_name='DdpgAgent', max_episode_step=9999, evaluation_samples=100, + self_play='no', output_directory='model_parameters', verbose=True, target_score=None @@ -336,7 +354,7 @@ class DdpgAgent(Agent): snapshot_interval=snapshot_interval, agent_name=agent_name, max_episode_step=max_episode_step, output_directory=output_directory, verbose=verbose, - target_score=target_score, evaluation_samples=evaluation_samples) + target_score=target_score, evaluation_samples=evaluation_samples, self_play=self_play) assert critic is not None, 'Critic not set' assert actor is not None, 'Actor is not set' assert soft_target_update_rate > 0,\ @@ -445,7 +463,7 @@ class DdpgAgent(Agent): def get_next_action(self, state): action = self._actor(nd.array([state], ctx=self._ctx)) - return action[0].asnumpy() + return action[0][0][0].asnumpy() def save_parameters(self, episode): self._export_net( @@ -525,19 +543,19 @@ class DdpgAgent(Agent): actor_target_actions = self._actor_target(next_states) critic_target_qvalues = self._critic_target( - next_states, actor_target_actions) + next_states, actor_target_actions[0][0]) rewards = rewards.reshape(self._minibatch_size, 1) terminals = terminals.reshape(self._minibatch_size, 1) # y = r_i + discount * Q'(s_(i+1), mu'(s_(i+1))) y = rewards + (1.0 - terminals) * self._discount_factor\ - * critic_target_qvalues + * critic_target_qvalues[0][0] # Train the critic network with autograd.record(): qvalues = self._critic(states, actions) - critic_loss = l2_loss(qvalues, y) + critic_loss = l2_loss(qvalues[0][0], y) critic_loss.backward() trainer_critic.step(self._minibatch_size) @@ -549,7 +567,7 @@ class DdpgAgent(Agent): # For maximizing qvalues we have to multiply with -1 # as we use a minimizer actor_loss = -tmp_critic( - states, self._actor(states)).mean() + states, self._actor(states)[0][0])[0][0].mean() actor_loss.backward() 
trainer_actor.step(self._minibatch_size) @@ -567,7 +585,7 @@ class DdpgAgent(Agent): episode_actor_loss +=\ np.sum(actor_loss.asnumpy()) / self._minibatch_size episode_avg_q_value +=\ - np.sum(qvalues.asnumpy()) / self._minibatch_size + np.sum(qvalues[0][0].asnumpy()) / self._minibatch_size training_steps += 1 @@ -665,6 +683,7 @@ class TwinDelayedDdpgAgent(DdpgAgent): agent_name='DdpgAgent', max_episode_step=9999, evaluation_samples=100, + self_play='no', output_directory='model_parameters', verbose=True, target_score=None, @@ -682,6 +701,7 @@ class TwinDelayedDdpgAgent(DdpgAgent): max_episode_step=max_episode_step, output_directory=output_directory, verbose=verbose, target_score=target_score, evaluation_samples=evaluation_samples, + self_play=self_play, critic=critic, soft_target_update_rate=soft_target_update_rate, actor=actor, actor_optimizer=actor_optimizer, actor_optimizer_params=actor_optimizer_params, @@ -884,7 +904,7 @@ class TwinDelayedDdpgAgent(DdpgAgent): ctx=self._ctx ) target_action = np.clip( - self._actor_target(next_states) + clipped_noise, + self._actor_target(next_states)[0][0] + clipped_noise, self._strategy._action_low, self._strategy._action_high) @@ -892,9 +912,9 @@ class TwinDelayedDdpgAgent(DdpgAgent): terminals = terminals.reshape(self._minibatch_size, 1) target_qvalues1 = self._critic_target(next_states, - target_action) + target_action)[0][0] target_qvalues2 = self._critic2_target(next_states, - target_action) + target_action)[0][0] target_qvalues = nd.minimum(target_qvalues1, target_qvalues2) y = rewards + (1 - terminals) * self._discount_factor\ @@ -902,13 +922,13 @@ class TwinDelayedDdpgAgent(DdpgAgent): with autograd.record(): qvalues1 = self._critic(states, actions) - critic1_loss = l2_loss(qvalues1, y) + critic1_loss = l2_loss(qvalues1[0][0], y) critic1_loss.backward() trainer_critic.step(self._minibatch_size) with autograd.record(): qvalues2 = self._critic2(states, actions) - critic2_loss = l2_loss(qvalues2, y) + critic2_loss = l2_loss(qvalues2[0][0], y) critic2_loss.backward() trainer_critic2.step(self._minibatch_size) @@ -918,7 +938,7 @@ class TwinDelayedDdpgAgent(DdpgAgent): tmp_critic = self._copy_critic() with autograd.record(): actor_loss = -tmp_critic( - states, self._actor(states)).mean() + states, self._actor(states)[0][0])[0][0].mean() actor_loss.backward() trainer_actor.step(self._minibatch_size) @@ -1016,6 +1036,7 @@ class DqnAgent(Agent): target_update_interval=10, snapshot_interval=200, evaluation_samples=100, + self_play='no', agent_name='Dqn_agent', max_episode_step=99999, output_directory='model_parameters', @@ -1031,7 +1052,8 @@ class DqnAgent(Agent): snapshot_interval=snapshot_interval, agent_name=agent_name, max_episode_step=max_episode_step, output_directory=output_directory, verbose=verbose, - target_score=target_score, evaluation_samples=evaluation_samples) + target_score=target_score, evaluation_samples=evaluation_samples, + self_play=self_play) self._qnet = qnet self._target_update_interval = target_update_interval diff --git a/src/test/resources/target_code/reinforcement_learning/agent.py b/src/test/resources/target_code/reinforcement_learning/agent.py index 027af51e2c15058f27ed83ceff8bf07b95e3ccc6..7eb80333c96885860e05269674e24f70285a5511 100644 --- a/src/test/resources/target_code/reinforcement_learning/agent.py +++ b/src/test/resources/target_code/reinforcement_learning/agent.py @@ -3,6 +3,8 @@ import mxnet as mx import numpy as np import time import os +import subprocess #library to call shell script +import shlex #library to 
pass variable to shell script import sys import util import matplotlib.pyplot as plt @@ -33,6 +35,7 @@ class Agent(object): agent_name='Agent', max_episode_step=99999, evaluation_samples=1000, + self_play='no', output_directory='model_parameters', verbose=True, target_score=None @@ -79,6 +82,7 @@ class Agent(object): self._target_score = target_score self._evaluation_samples = evaluation_samples + self._self_play = self_play self._best_avg_score = -np.infty self._best_net = None @@ -142,6 +146,7 @@ class Agent(object): config['training_episodes'] = self._training_episodes config['start_training'] = self._start_training config['evaluation_samples'] = self._evaluation_samples + config['self_play'] = self._self_play config['train_interval'] = self._train_interval config['snapshot_interval'] = self._snapshot_interval config['agent_name'] = self._agent_name @@ -226,7 +231,19 @@ class Agent(object): if do_snapshot: self.save_parameters(episode=episode) self._evaluate() - + if self._self_play == 'yes': + print("Self Play activated") + path = os.getcwd() + file = os.path.join(os.getcwd(), os.listdir(os.getcwd())[0]) + home_path = os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(file)))))) + os.chdir(home_path) + snapshot_param = str(self._snapshot_interval) + subprocess.check_call(['./update_agent.sh', snapshot_param]) + os.chdir(path) + elif self._self_play == 'no': + print("Self play deactivated") + + def _evaluate(self, verbose=True): avg_reward = self.evaluate( sample_games=self._evaluation_samples, verbose=False) @@ -323,6 +340,7 @@ class DdpgAgent(Agent): agent_name='DdpgAgent', max_episode_step=9999, evaluation_samples=100, + self_play='no', output_directory='model_parameters', verbose=True, target_score=None @@ -336,7 +354,7 @@ class DdpgAgent(Agent): snapshot_interval=snapshot_interval, agent_name=agent_name, max_episode_step=max_episode_step, output_directory=output_directory, verbose=verbose, - target_score=target_score, evaluation_samples=evaluation_samples) + target_score=target_score, evaluation_samples=evaluation_samples, self_play=self_play) assert critic is not None, 'Critic not set' assert actor is not None, 'Actor is not set' assert soft_target_update_rate > 0,\ @@ -445,7 +463,7 @@ class DdpgAgent(Agent): def get_next_action(self, state): action = self._actor(nd.array([state], ctx=self._ctx)) - return action[0].asnumpy() + return action[0][0][0].asnumpy() def save_parameters(self, episode): self._export_net( @@ -665,6 +683,7 @@ class TwinDelayedDdpgAgent(DdpgAgent): agent_name='DdpgAgent', max_episode_step=9999, evaluation_samples=100, + self_play='no', output_directory='model_parameters', verbose=True, target_score=None, @@ -682,6 +701,7 @@ class TwinDelayedDdpgAgent(DdpgAgent): max_episode_step=max_episode_step, output_directory=output_directory, verbose=verbose, target_score=target_score, evaluation_samples=evaluation_samples, + self_play=self_play, critic=critic, soft_target_update_rate=soft_target_update_rate, actor=actor, actor_optimizer=actor_optimizer, actor_optimizer_params=actor_optimizer_params, @@ -884,7 +904,7 @@ class TwinDelayedDdpgAgent(DdpgAgent): ctx=self._ctx ) target_action = np.clip( - self._actor_target(next_states) + clipped_noise, + self._actor_target(next_states)[0][0] + clipped_noise, self._strategy._action_low, self._strategy._action_high) @@ -892,9 +912,9 @@ class TwinDelayedDdpgAgent(DdpgAgent): terminals = terminals.reshape(self._minibatch_size, 1) target_qvalues1 = self._critic_target(next_states, - 
target_action) + target_action)[0][0] target_qvalues2 = self._critic2_target(next_states, - target_action) + target_action)[0][0] target_qvalues = nd.minimum(target_qvalues1, target_qvalues2) y = rewards + (1 - terminals) * self._discount_factor\ @@ -902,13 +922,13 @@ class TwinDelayedDdpgAgent(DdpgAgent): with autograd.record(): qvalues1 = self._critic(states, actions) - critic1_loss = l2_loss(qvalues1, y) + critic1_loss = l2_loss(qvalues1[0][0], y) critic1_loss.backward() trainer_critic.step(self._minibatch_size) with autograd.record(): qvalues2 = self._critic2(states, actions) - critic2_loss = l2_loss(qvalues2, y) + critic2_loss = l2_loss(qvalues2[0][0], y) critic2_loss.backward() trainer_critic2.step(self._minibatch_size) @@ -918,7 +938,7 @@ class TwinDelayedDdpgAgent(DdpgAgent): tmp_critic = self._copy_critic() with autograd.record(): actor_loss = -tmp_critic( - states, self._actor(states)).mean() + states, self._actor(states)[0][0])[0][0].mean() actor_loss.backward() trainer_actor.step(self._minibatch_size) @@ -1016,6 +1036,7 @@ class DqnAgent(Agent): target_update_interval=10, snapshot_interval=200, evaluation_samples=100, + self_play='no', agent_name='Dqn_agent', max_episode_step=99999, output_directory='model_parameters', @@ -1031,7 +1052,8 @@ class DqnAgent(Agent): snapshot_interval=snapshot_interval, agent_name=agent_name, max_episode_step=max_episode_step, output_directory=output_directory, verbose=verbose, - target_score=target_score, evaluation_samples=evaluation_samples) + target_score=target_score, evaluation_samples=evaluation_samples, + self_play=self_play) self._qnet = qnet self._target_update_interval = target_update_interval diff --git a/src/test/resources/target_code/ros-ddpg/CNNTrainer_rosActorNetwork.py b/src/test/resources/target_code/ros-ddpg/CNNTrainer_rosActorNetwork.py index 3e5e431909e6eb49ebda29244bd7334cdc756ae5..d06024699af740739c528c8310d213d6ff54b911 100644 --- a/src/test/resources/target_code/ros-ddpg/CNNTrainer_rosActorNetwork.py +++ b/src/test/resources/target_code/ros-ddpg/CNNTrainer_rosActorNetwork.py @@ -59,10 +59,10 @@ if __name__ == "__main__": critic_initializer = mx.init.Normal() actor_creator = CNNCreator_rosActorNetwork.CNNCreator_rosActorNetwork() actor_creator.setWeightInitializer(initializer) - actor_creator.construct(context) + actor_creator.construct([context]) critic_creator = CNNCreator_RosCriticNetwork() critic_creator.setWeightInitializer(critic_initializer) - critic_creator.construct(context) + critic_creator.construct([context]) agent_params = { 'environment': env, diff --git a/src/test/resources/target_code/ros-ddpg/reinforcement_learning/agent.py b/src/test/resources/target_code/ros-ddpg/reinforcement_learning/agent.py index 21ebaf397506d33cc199d49a663c06b3215a7e91..e4bf6072e6027bdec7eceec19d187f1e805500b7 100644 --- a/src/test/resources/target_code/ros-ddpg/reinforcement_learning/agent.py +++ b/src/test/resources/target_code/ros-ddpg/reinforcement_learning/agent.py @@ -3,6 +3,8 @@ import mxnet as mx import numpy as np import time import os +import subprocess #library to call shell script +import shlex #library to pass variable to shell script import sys import util import matplotlib.pyplot as plt @@ -33,6 +35,7 @@ class Agent(object): agent_name='Agent', max_episode_step=99999, evaluation_samples=1000, + self_play='no', output_directory='model_parameters', verbose=True, target_score=None @@ -79,6 +82,7 @@ class Agent(object): self._target_score = target_score self._evaluation_samples = evaluation_samples + self._self_play = 
self_play self._best_avg_score = -np.infty self._best_net = None @@ -142,6 +146,7 @@ class Agent(object): config['training_episodes'] = self._training_episodes config['start_training'] = self._start_training config['evaluation_samples'] = self._evaluation_samples + config['self_play'] = self._self_play config['train_interval'] = self._train_interval config['snapshot_interval'] = self._snapshot_interval config['agent_name'] = self._agent_name @@ -226,7 +231,19 @@ class Agent(object): if do_snapshot: self.save_parameters(episode=episode) self._evaluate() - + if self._self_play == 'yes': + print("Self Play activated") + path = os.getcwd() + file = os.path.join(os.getcwd(), os.listdir(os.getcwd())[0]) + home_path = os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(file)))))) + os.chdir(home_path) + snapshot_param = str(self._snapshot_interval) + subprocess.check_call(['./update_agent.sh', snapshot_param]) + os.chdir(path) + elif self._self_play == 'no': + print("Self play deactivated") + + def _evaluate(self, verbose=True): avg_reward = self.evaluate( sample_games=self._evaluation_samples, verbose=False) @@ -323,6 +340,7 @@ class DdpgAgent(Agent): agent_name='DdpgAgent', max_episode_step=9999, evaluation_samples=100, + self_play='no', output_directory='model_parameters', verbose=True, target_score=None @@ -336,7 +354,7 @@ class DdpgAgent(Agent): snapshot_interval=snapshot_interval, agent_name=agent_name, max_episode_step=max_episode_step, output_directory=output_directory, verbose=verbose, - target_score=target_score, evaluation_samples=evaluation_samples) + target_score=target_score, evaluation_samples=evaluation_samples, self_play=self_play) assert critic is not None, 'Critic not set' assert actor is not None, 'Actor is not set' assert soft_target_update_rate > 0,\ @@ -445,7 +463,7 @@ class DdpgAgent(Agent): def get_next_action(self, state): action = self._actor(nd.array([state], ctx=self._ctx)) - return action[0].asnumpy() + return action[0][0][0].asnumpy() def save_parameters(self, episode): self._export_net( @@ -525,19 +543,19 @@ class DdpgAgent(Agent): actor_target_actions = self._actor_target(next_states) critic_target_qvalues = self._critic_target( - next_states, actor_target_actions) + next_states, actor_target_actions[0][0]) rewards = rewards.reshape(self._minibatch_size, 1) terminals = terminals.reshape(self._minibatch_size, 1) # y = r_i + discount * Q'(s_(i+1), mu'(s_(i+1))) y = rewards + (1.0 - terminals) * self._discount_factor\ - * critic_target_qvalues + * critic_target_qvalues[0][0] # Train the critic network with autograd.record(): qvalues = self._critic(states, actions) - critic_loss = l2_loss(qvalues, y) + critic_loss = l2_loss(qvalues[0][0], y) critic_loss.backward() trainer_critic.step(self._minibatch_size) @@ -549,7 +567,7 @@ class DdpgAgent(Agent): # For maximizing qvalues we have to multiply with -1 # as we use a minimizer actor_loss = -tmp_critic( - states, self._actor(states)).mean() + states, self._actor(states)[0][0])[0][0].mean() actor_loss.backward() trainer_actor.step(self._minibatch_size) @@ -567,7 +585,7 @@ class DdpgAgent(Agent): episode_actor_loss +=\ np.sum(actor_loss.asnumpy()) / self._minibatch_size episode_avg_q_value +=\ - np.sum(qvalues.asnumpy()) / self._minibatch_size + np.sum(qvalues[0][0].asnumpy()) / self._minibatch_size training_steps += 1 @@ -665,6 +683,7 @@ class TwinDelayedDdpgAgent(DdpgAgent): agent_name='DdpgAgent', max_episode_step=9999, evaluation_samples=100, + self_play='no', 
output_directory='model_parameters', verbose=True, target_score=None, @@ -682,6 +701,7 @@ class TwinDelayedDdpgAgent(DdpgAgent): max_episode_step=max_episode_step, output_directory=output_directory, verbose=verbose, target_score=target_score, evaluation_samples=evaluation_samples, + self_play=self_play, critic=critic, soft_target_update_rate=soft_target_update_rate, actor=actor, actor_optimizer=actor_optimizer, actor_optimizer_params=actor_optimizer_params, @@ -884,7 +904,7 @@ class TwinDelayedDdpgAgent(DdpgAgent): ctx=self._ctx ) target_action = np.clip( - self._actor_target(next_states) + clipped_noise, + self._actor_target(next_states)[0][0] + clipped_noise, self._strategy._action_low, self._strategy._action_high) @@ -892,9 +912,9 @@ class TwinDelayedDdpgAgent(DdpgAgent): terminals = terminals.reshape(self._minibatch_size, 1) target_qvalues1 = self._critic_target(next_states, - target_action) + target_action)[0][0] target_qvalues2 = self._critic2_target(next_states, - target_action) + target_action)[0][0] target_qvalues = nd.minimum(target_qvalues1, target_qvalues2) y = rewards + (1 - terminals) * self._discount_factor\ @@ -902,13 +922,13 @@ class TwinDelayedDdpgAgent(DdpgAgent): with autograd.record(): qvalues1 = self._critic(states, actions) - critic1_loss = l2_loss(qvalues1, y) + critic1_loss = l2_loss(qvalues1[0][0], y) critic1_loss.backward() trainer_critic.step(self._minibatch_size) with autograd.record(): qvalues2 = self._critic2(states, actions) - critic2_loss = l2_loss(qvalues2, y) + critic2_loss = l2_loss(qvalues2[0][0], y) critic2_loss.backward() trainer_critic2.step(self._minibatch_size) @@ -918,7 +938,7 @@ class TwinDelayedDdpgAgent(DdpgAgent): tmp_critic = self._copy_critic() with autograd.record(): actor_loss = -tmp_critic( - states, self._actor(states)).mean() + states, self._actor(states)[0][0])[0][0].mean() actor_loss.backward() trainer_actor.step(self._minibatch_size) @@ -1016,6 +1036,7 @@ class DqnAgent(Agent): target_update_interval=10, snapshot_interval=200, evaluation_samples=100, + self_play='no', agent_name='Dqn_agent', max_episode_step=99999, output_directory='model_parameters', @@ -1031,7 +1052,8 @@ class DqnAgent(Agent): snapshot_interval=snapshot_interval, agent_name=agent_name, max_episode_step=max_episode_step, output_directory=output_directory, verbose=verbose, - target_score=target_score, evaluation_samples=evaluation_samples) + target_score=target_score, evaluation_samples=evaluation_samples, + self_play=self_play) self._qnet = qnet self._target_update_interval = target_update_interval @@ -1267,4 +1289,4 @@ class DqnAgent(Agent): def _save_current_as_best_net(self): self._best_net = copy_net( - self._qnet, self._state_dim, ctx=self._ctx) \ No newline at end of file + self._qnet, self._state_dim, ctx=self._ctx) diff --git a/src/test/resources/target_code/td3/CNNTrainer_tD3Config.py b/src/test/resources/target_code/td3/CNNTrainer_tD3Config.py index b055ca7ddcc6254342a6b346b0715b0a544aa66c..4a6871009b85898163f8a616221571289119c60b 100644 --- a/src/test/resources/target_code/td3/CNNTrainer_tD3Config.py +++ b/src/test/resources/target_code/td3/CNNTrainer_tD3Config.py @@ -52,10 +52,10 @@ if __name__ == "__main__": critic_initializer = mx.init.Normal() actor_creator = CNNCreator_tD3Config.CNNCreator_tD3Config() actor_creator.setWeightInitializer(initializer) - actor_creator.construct(context) + actor_creator.construct([context]) critic_creator = CNNCreator_CriticNetwork() critic_creator.setWeightInitializer(critic_initializer) - 
critic_creator.construct(context) + critic_creator.construct([context]) agent_params = { 'environment': env, diff --git a/src/test/resources/target_code/td3/reinforcement_learning/agent.py b/src/test/resources/target_code/td3/reinforcement_learning/agent.py index 027af51e2c15058f27ed83ceff8bf07b95e3ccc6..e4bf6072e6027bdec7eceec19d187f1e805500b7 100644 --- a/src/test/resources/target_code/td3/reinforcement_learning/agent.py +++ b/src/test/resources/target_code/td3/reinforcement_learning/agent.py @@ -3,6 +3,8 @@ import mxnet as mx import numpy as np import time import os +import subprocess #library to call shell script +import shlex #library to pass variable to shell script import sys import util import matplotlib.pyplot as plt @@ -33,6 +35,7 @@ class Agent(object): agent_name='Agent', max_episode_step=99999, evaluation_samples=1000, + self_play='no', output_directory='model_parameters', verbose=True, target_score=None @@ -79,6 +82,7 @@ class Agent(object): self._target_score = target_score self._evaluation_samples = evaluation_samples + self._self_play = self_play self._best_avg_score = -np.infty self._best_net = None @@ -142,6 +146,7 @@ class Agent(object): config['training_episodes'] = self._training_episodes config['start_training'] = self._start_training config['evaluation_samples'] = self._evaluation_samples + config['self_play'] = self._self_play config['train_interval'] = self._train_interval config['snapshot_interval'] = self._snapshot_interval config['agent_name'] = self._agent_name @@ -226,7 +231,19 @@ class Agent(object): if do_snapshot: self.save_parameters(episode=episode) self._evaluate() - + if self._self_play == 'yes': + print("Self Play activated") + path = os.getcwd() + file = os.path.join(os.getcwd(), os.listdir(os.getcwd())[0]) + home_path = os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(file)))))) + os.chdir(home_path) + snapshot_param = str(self._snapshot_interval) + subprocess.check_call(['./update_agent.sh', snapshot_param]) + os.chdir(path) + elif self._self_play == 'no': + print("Self play deactivated") + + def _evaluate(self, verbose=True): avg_reward = self.evaluate( sample_games=self._evaluation_samples, verbose=False) @@ -323,6 +340,7 @@ class DdpgAgent(Agent): agent_name='DdpgAgent', max_episode_step=9999, evaluation_samples=100, + self_play='no', output_directory='model_parameters', verbose=True, target_score=None @@ -336,7 +354,7 @@ class DdpgAgent(Agent): snapshot_interval=snapshot_interval, agent_name=agent_name, max_episode_step=max_episode_step, output_directory=output_directory, verbose=verbose, - target_score=target_score, evaluation_samples=evaluation_samples) + target_score=target_score, evaluation_samples=evaluation_samples, self_play=self_play) assert critic is not None, 'Critic not set' assert actor is not None, 'Actor is not set' assert soft_target_update_rate > 0,\ @@ -445,7 +463,7 @@ class DdpgAgent(Agent): def get_next_action(self, state): action = self._actor(nd.array([state], ctx=self._ctx)) - return action[0].asnumpy() + return action[0][0][0].asnumpy() def save_parameters(self, episode): self._export_net( @@ -525,19 +543,19 @@ class DdpgAgent(Agent): actor_target_actions = self._actor_target(next_states) critic_target_qvalues = self._critic_target( - next_states, actor_target_actions) + next_states, actor_target_actions[0][0]) rewards = rewards.reshape(self._minibatch_size, 1) terminals = terminals.reshape(self._minibatch_size, 1) # y = r_i + discount * Q'(s_(i+1), mu'(s_(i+1))) y = rewards 
+ (1.0 - terminals) * self._discount_factor\ - * critic_target_qvalues + * critic_target_qvalues[0][0] # Train the critic network with autograd.record(): qvalues = self._critic(states, actions) - critic_loss = l2_loss(qvalues, y) + critic_loss = l2_loss(qvalues[0][0], y) critic_loss.backward() trainer_critic.step(self._minibatch_size) @@ -549,7 +567,7 @@ class DdpgAgent(Agent): # For maximizing qvalues we have to multiply with -1 # as we use a minimizer actor_loss = -tmp_critic( - states, self._actor(states)).mean() + states, self._actor(states)[0][0])[0][0].mean() actor_loss.backward() trainer_actor.step(self._minibatch_size) @@ -567,7 +585,7 @@ class DdpgAgent(Agent): episode_actor_loss +=\ np.sum(actor_loss.asnumpy()) / self._minibatch_size episode_avg_q_value +=\ - np.sum(qvalues.asnumpy()) / self._minibatch_size + np.sum(qvalues[0][0].asnumpy()) / self._minibatch_size training_steps += 1 @@ -665,6 +683,7 @@ class TwinDelayedDdpgAgent(DdpgAgent): agent_name='DdpgAgent', max_episode_step=9999, evaluation_samples=100, + self_play='no', output_directory='model_parameters', verbose=True, target_score=None, @@ -682,6 +701,7 @@ class TwinDelayedDdpgAgent(DdpgAgent): max_episode_step=max_episode_step, output_directory=output_directory, verbose=verbose, target_score=target_score, evaluation_samples=evaluation_samples, + self_play=self_play, critic=critic, soft_target_update_rate=soft_target_update_rate, actor=actor, actor_optimizer=actor_optimizer, actor_optimizer_params=actor_optimizer_params, @@ -884,7 +904,7 @@ class TwinDelayedDdpgAgent(DdpgAgent): ctx=self._ctx ) target_action = np.clip( - self._actor_target(next_states) + clipped_noise, + self._actor_target(next_states)[0][0] + clipped_noise, self._strategy._action_low, self._strategy._action_high) @@ -892,9 +912,9 @@ class TwinDelayedDdpgAgent(DdpgAgent): terminals = terminals.reshape(self._minibatch_size, 1) target_qvalues1 = self._critic_target(next_states, - target_action) + target_action)[0][0] target_qvalues2 = self._critic2_target(next_states, - target_action) + target_action)[0][0] target_qvalues = nd.minimum(target_qvalues1, target_qvalues2) y = rewards + (1 - terminals) * self._discount_factor\ @@ -902,13 +922,13 @@ class TwinDelayedDdpgAgent(DdpgAgent): with autograd.record(): qvalues1 = self._critic(states, actions) - critic1_loss = l2_loss(qvalues1, y) + critic1_loss = l2_loss(qvalues1[0][0], y) critic1_loss.backward() trainer_critic.step(self._minibatch_size) with autograd.record(): qvalues2 = self._critic2(states, actions) - critic2_loss = l2_loss(qvalues2, y) + critic2_loss = l2_loss(qvalues2[0][0], y) critic2_loss.backward() trainer_critic2.step(self._minibatch_size) @@ -918,7 +938,7 @@ class TwinDelayedDdpgAgent(DdpgAgent): tmp_critic = self._copy_critic() with autograd.record(): actor_loss = -tmp_critic( - states, self._actor(states)).mean() + states, self._actor(states)[0][0])[0][0].mean() actor_loss.backward() trainer_actor.step(self._minibatch_size) @@ -1016,6 +1036,7 @@ class DqnAgent(Agent): target_update_interval=10, snapshot_interval=200, evaluation_samples=100, + self_play='no', agent_name='Dqn_agent', max_episode_step=99999, output_directory='model_parameters', @@ -1031,7 +1052,8 @@ class DqnAgent(Agent): snapshot_interval=snapshot_interval, agent_name=agent_name, max_episode_step=max_episode_step, output_directory=output_directory, verbose=verbose, - target_score=target_score, evaluation_samples=evaluation_samples) + target_score=target_score, evaluation_samples=evaluation_samples, + self_play=self_play) 
        self._qnet = qnet
        self._target_update_interval = target_update_interval
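Note on the new self-play hook: the snippet below is a minimal, standalone Python sketch of the block this patch adds to Agent after the snapshot/evaluation step. It mirrors the generated code (walk six directory levels up from the first entry of the current working directory, call ./update_agent.sh with the snapshot interval, then restore the working directory). The function name, the levels_up parameter, and the try/finally are conveniences of this sketch, not part of the generated code, and update_agent.sh itself is assumed to exist in the project root; it is not included in this patch.

import os
import subprocess


def run_self_play_update(snapshot_interval, levels_up=6, script='./update_agent.sh'):
    # Sketch of the self-play hook added in this patch. It only does something
    # useful if an executable update_agent.sh sits 'levels_up' directories above
    # the first entry of the current working directory, as the generated code assumes.
    cwd = os.getcwd()
    anchor = os.path.join(cwd, os.listdir(cwd)[0])
    home_path = anchor
    for _ in range(levels_up):
        # equivalent to the six nested os.path.dirname() calls in the generated code
        home_path = os.path.dirname(home_path)
    try:
        os.chdir(home_path)
        subprocess.check_call([script, str(snapshot_interval)])
    finally:
        os.chdir(cwd)  # the generated code switches back to the previous directory after the call

In a generated trainer this is driven by the new constructor argument, e.g. agent_params = {..., 'self_play': 'yes', ...}: only the exact string 'yes' activates the hook, 'no' (the default) just prints that self-play is deactivated, and any other value is silently ignored.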