Commit bcf8b105 authored by Nicola Gatto

Implement eps decay per step

parent 863f3e93
3 merge requests: !20 Implemented layer variables and RNN layer, !19 Integrate TD3 Algorithm and Gaussian Noise, !18 Integrate TD3 Algorithm and Gaussian Noise
Pipeline #159281 failed
@@ -504,6 +504,7 @@ class DdpgAgent(Agent):
                 # actor and exploration noise N according to strategy
                 action = self._strategy.select_action(
                     self.get_next_action(state))
+                self._strategy.decay(self._current_episode)

                 # Execute action a and observe reward r and next state ns
                 next_state, reward, terminal, _ = \
@@ -545,10 +546,10 @@ class DdpgAgent(Agent):
                     # with critic parameters
                     tmp_critic = self._copy_critic()
                     with autograd.record():
-                        actor_qvalues = tmp_critic(states, self._actor(states))
                         # For maximizing qvalues we have to multiply with -1
                         # as we use a minimizer
-                        actor_loss = -1 * actor_qvalues.mean()
+                        actor_loss = -tmp_critic(
+                            states, self._actor(states)).mean()
                     actor_loss.backward()
                     trainer_actor.step(self._minibatch_size)
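The refactored update only inlines the intermediate actor_qvalues tensor; the sign flip and the gradient step are unchanged. For readers unfamiliar with the pattern, here is a minimal standalone sketch of the same deterministic-policy-gradient actor update in MXNet Gluon; the toy actor/critic networks and the concatenated state-action input are stand-ins, not code from this repository:

from mxnet import autograd, gluon, nd

actor = gluon.nn.Dense(2, activation='tanh')    # toy policy network pi(s)
critic = gluon.nn.Dense(1)                      # toy Q-network Q(s, a)
actor.initialize()
critic.initialize()
trainer_actor = gluon.Trainer(actor.collect_params(), 'adam',
                              {'learning_rate': 1e-3})

states = nd.random.normal(shape=(8, 4))         # minibatch of states
with autograd.record():
    # Maximizing Q(s, pi(s)) with a minimizer means minimizing its negative mean
    actions = actor(states)
    actor_loss = -critic(nd.concat(states, actions, dim=1)).mean()
actor_loss.backward()
trainer_actor.step(8)                           # 8 = minibatch size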
@@ -594,7 +595,6 @@ class DdpgAgent(Agent):
                 self._strategy.cur_eps, episode_reward)

             self._do_snapshot_if_in_interval(self._current_episode)
-            self._strategy.decay(self._current_episode)

             if self._is_target_reached(avg_reward):
                 self._logger.info(
@@ -1190,6 +1190,7 @@ class DqnAgent(Agent):
                 # 1. Choose an action based on current game state and policy
                 q_values = self._qnet(nd.array([state], ctx=self._ctx))
                 action = self._strategy.select_action(q_values[0])
+                self._strategy.decay(self._current_episode)

                 # 2. Play the game for a single step
                 next_state, reward, terminal, _ =\
@@ -1226,8 +1227,6 @@ class DqnAgent(Agent):
                 self._current_episode, start, training_steps,
                 episode_loss, self._strategy.cur_eps, episode_reward)

-            self._strategy.decay(self._current_episode)
-
             if self._is_target_reached(avg_reward):
                 self._logger.info(
                     'Target score is reached in average; Training is stopped')
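Taken together, the agent-side changes above move the strategy.decay() call from the end of an episode into the per-step loop of both DdpgAgent and DqnAgent; whether epsilon actually shrinks on every step or still only once per episode is decided inside the decay object (see the LinearDecay changes below). As a rough illustration of why this matters, with made-up numbers that are not from the repository, a linear schedule behaves very differently under the two regimes:

# Illustrative numbers only: eps starts at 1.0, eps_decay = 0.01, floor at 0.05.
eps, eps_decay, min_eps = 1.0, 0.01, 0.05

# Per-episode decay: one decrement per episode, regardless of episode length.
for episode in range(100):
    eps = max(eps - eps_decay, min_eps)
print(eps)   # 0.05, reached only after roughly 95 episodes

# Per-step decay: one decrement per environment step, so a single 200-step
# episode already drives eps all the way down to the floor.
eps = 1.0
for step in range(200):
    eps = max(eps - eps_decay, min_eps)
print(eps)   # 0.05 within one episode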
@@ -13,6 +13,7 @@ class StrategyBuilder(object):
             epsilon_decay_method='no',
             epsilon_decay=0.0,
             epsilon_decay_start=0,
+            epsilon_decay_per_step=False,
             action_dim=None,
             action_low=None,
             action_high=None,
@@ -24,7 +25,8 @@
         if epsilon_decay_method == 'linear':
             decay = LinearDecay(
                 eps_decay=epsilon_decay, min_eps=min_epsilon,
-                decay_start=epsilon_decay_start)
+                decay_start=epsilon_decay_start,
+                decay_per_step=epsilon_decay_per_step)
         else:
             decay = NoDecay()
@@ -76,17 +78,27 @@ class NoDecay(BaseDecay):
 class LinearDecay(BaseDecay):

-    def __init__(self, eps_decay, min_eps=0, decay_start=0):
+    def __init__(self, eps_decay, min_eps=0, decay_start=0, decay_per_step=False):
         super(LinearDecay, self).__init__()
         self.eps_decay = eps_decay
         self.min_eps = min_eps
         self.decay_start = decay_start
+        self.decay_per_step = decay_per_step
+        self.last_episode = -1

-    def decay(self, cur_eps, episode):
-        if episode < self.decay_start:
-            return cur_eps
+    def do_decay(self, episode):
+        if self.decay_per_step:
+            do = (episode >= self.decay_start)
+        else:
+            do = ((self.last_episode != episode) and (episode >= self.decay_start))
+        self.last_episode = episode
+        return do
+
+    def decay(self, cur_eps, episode):
+        if self.do_decay(episode):
+            return max(cur_eps - self.eps_decay, self.min_eps)
+        else:
+            return cur_eps


 class BaseStrategy(object):
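The new do_decay() bookkeeping is what lets LinearDecay.decay() be called once per step: with decay_per_step=False it still decays only on the first call of each new episode (tracked via last_episode), while decay_per_step=True decays on every call. A small usage sketch, assuming BaseDecay can be constructed with no arguments as the code above implies:

per_episode = LinearDecay(eps_decay=0.1, min_eps=0.0, decay_start=0)
per_step = LinearDecay(eps_decay=0.1, min_eps=0.0, decay_start=0,
                       decay_per_step=True)

eps_a = eps_b = 1.0
for _ in range(3):                        # three steps within episode 0
    eps_a = per_episode.decay(eps_a, 0)
    eps_b = per_step.decay(eps_b, 0)

print(eps_a)   # 0.9 -> only the first call in episode 0 decayed
print(eps_b)   # 0.7 -> every call decayed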
@@ -16,6 +16,9 @@
 <#if (config.strategy.epsilon_decay_start)??>
     'epsilon_decay_start': ${config.strategy.epsilon_decay_start},
 </#if>
+<#if (config.strategy.epsilon_decay_per_step)??>
+    'epsilon_decay_per_step': ${config.strategy.epsilon_decay_per_step?string('True', 'False')},
+</#if>
 <#if (config.strategy.method)?? && (config.strategy.method=="ornstein_uhlenbeck")>
 <#if (config.strategy.action_low)?? >
     'action_low': ${config.strategy.action_low},
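The FreeMarker branch above emits one more entry into the strategy-configuration dict of the generated trainer script. Purely as an illustration (the surrounding template is not shown here, and the variable name and the other keys and values are assumptions), the generated output would look roughly like:

strategy_params = {
    'method': 'epsgreedy',            # assumed example values
    'epsilon': 1.0,
    'min_epsilon': 0.01,
    'epsilon_decay_method': 'linear',
    'epsilon_decay': 0.0001,
    'epsilon_decay_start': 20,
    'epsilon_decay_per_step': True,   # the entry added by this commit
}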