Commit bcf8b105 authored by Nicola Gatto

Implement eps decay per step

parent 863f3e93
3 merge requests: !20 Implemented layer variables and RNN layer, !19 Integrate TD3 Algorithm and Gaussian Noise, !18 Integrate TD3 Algorithm and Gaussian Noise
Pipeline #159281 failed
@@ -504,6 +504,7 @@ class DdpgAgent(Agent):
                 # actor and exploration noise N according to strategy
                 action = self._strategy.select_action(
                     self.get_next_action(state))
+                self._strategy.decay(self._current_episode)
 
                 # Execute action a and observe reward r and next state ns
                 next_state, reward, terminal, _ = \
@@ -545,10 +546,10 @@
                     # with critic parameters
                     tmp_critic = self._copy_critic()
                     with autograd.record():
-                        actor_qvalues = tmp_critic(states, self._actor(states))
-                        # For maximizing qvalues we have to multiply with -1
-                        # as we use a minimizer
-                        actor_loss = -1 * actor_qvalues.mean()
+                        actor_loss = -tmp_critic(
+                            states, self._actor(states)).mean()
                     actor_loss.backward()
                     trainer_actor.step(self._minibatch_size)
@@ -594,7 +595,6 @@ class DdpgAgent(Agent):
                 self._strategy.cur_eps, episode_reward)
 
             self._do_snapshot_if_in_interval(self._current_episode)
-            self._strategy.decay(self._current_episode)
 
             if self._is_target_reached(avg_reward):
                 self._logger.info(
@@ -1190,6 +1190,7 @@ class DqnAgent(Agent):
                 # 1. Choose an action based on current game state and policy
                 q_values = self._qnet(nd.array([state], ctx=self._ctx))
                 action = self._strategy.select_action(q_values[0])
+                self._strategy.decay(self._current_episode)
 
                 # 2. Play the game for a single step
                 next_state, reward, terminal, _ =\
@@ -1226,8 +1227,6 @@ class DqnAgent(Agent):
                 self._current_episode, start, training_steps,
                 episode_loss, self._strategy.cur_eps, episode_reward)
 
-            self._strategy.decay(self._current_episode)
 
             if self._is_target_reached(avg_reward):
                 self._logger.info(
                     'Target score is reached in average; Training is stopped')
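
Both DdpgAgent and DqnAgent get the same change here: the strategy's decay hook is now invoked right after select_action on every environment step, and the separate per-episode call at the end of the episode loop is removed. A rough, self-contained sketch of that call order (the strategy stub and the loop below are illustrative placeholders, not code from this commit):

import random


class StubStrategy(object):
    """Toy epsilon-greedy stand-in with the same decay/select_action surface."""

    def __init__(self, eps=1.0, eps_decay=0.01, min_eps=0.05):
        self.cur_eps = eps
        self._eps_decay = eps_decay
        self._min_eps = min_eps

    def select_action(self, values):
        # epsilon-greedy over a list of action values
        if random.random() < self.cur_eps:
            return random.randrange(len(values))
        return max(range(len(values)), key=lambda i: values[i])

    def decay(self, episode):
        self.cur_eps = max(self.cur_eps - self._eps_decay, self._min_eps)


def run_episode(strategy, episode, steps=10):
    for _ in range(steps):
        action = strategy.select_action([0.0, 1.0])
        # new behaviour: decay is triggered after every single step ...
        strategy.decay(episode)
    # ... instead of once here, after the episode has finished


strategy = StubStrategy()
run_episode(strategy, episode=0)
print(strategy.cur_eps)  # roughly 0.9 after ten per-step decays of 0.01
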
@@ -13,6 +13,7 @@ class StrategyBuilder(object):
             epsilon_decay_method='no',
             epsilon_decay=0.0,
             epsilon_decay_start=0,
+            epsilon_decay_per_step=False,
             action_dim=None,
             action_low=None,
             action_high=None,
@@ -24,7 +25,8 @@
         if epsilon_decay_method == 'linear':
             decay = LinearDecay(
                 eps_decay=epsilon_decay, min_eps=min_epsilon,
-                decay_start=epsilon_decay_start)
+                decay_start=epsilon_decay_start,
+                decay_per_step=epsilon_decay_per_step)
         else:
             decay = NoDecay()
@@ -76,17 +78,27 @@ class NoDecay(BaseDecay):
 
 
 class LinearDecay(BaseDecay):
-    def __init__(self, eps_decay, min_eps=0, decay_start=0):
+    def __init__(self, eps_decay, min_eps=0, decay_start=0, decay_per_step=False):
         super(LinearDecay, self).__init__()
         self.eps_decay = eps_decay
         self.min_eps = min_eps
         self.decay_start = decay_start
+        self.decay_per_step = decay_per_step
+        self.last_episode = -1
 
-    def decay(self, cur_eps, episode):
-        if episode < self.decay_start:
-            return cur_eps
+    def do_decay(self, episode):
+        if self.decay_per_step:
+            do = (episode >= self.decay_start)
+        else:
+            do = ((self.last_episode != episode) and (episode >= self.decay_start))
+        self.last_episode = episode
+        return do
+
+    def decay(self, cur_eps, episode):
+        if self.do_decay(episode):
+            return max(cur_eps - self.eps_decay, self.min_eps)
+        else:
+            return cur_eps
 
 
 class BaseStrategy(object):
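
With the LinearDecay shown above, decay_per_step controls how those per-step decay() calls are interpreted: with decay_per_step=True, eps shrinks on every call once decay_start is reached; with the default False, the last_episode bookkeeping ensures eps shrinks at most once per episode, preserving the old per-episode behaviour even though the agents now call decay() every step. A small re-typed sketch to illustrate the difference (the original class subclasses BaseDecay, which is omitted here):

class LinearDecay(object):
    # Re-typed from the diff above for illustration only.
    def __init__(self, eps_decay, min_eps=0, decay_start=0, decay_per_step=False):
        self.eps_decay = eps_decay
        self.min_eps = min_eps
        self.decay_start = decay_start
        self.decay_per_step = decay_per_step
        self.last_episode = -1

    def do_decay(self, episode):
        if self.decay_per_step:
            do = (episode >= self.decay_start)
        else:
            do = ((self.last_episode != episode) and (episode >= self.decay_start))
        self.last_episode = episode
        return do

    def decay(self, cur_eps, episode):
        if self.do_decay(episode):
            return max(cur_eps - self.eps_decay, self.min_eps)
        else:
            return cur_eps


def simulate(decay_per_step, episodes=2, steps_per_episode=5):
    # Mimics the agents' new behaviour of calling decay() on every step.
    dec = LinearDecay(eps_decay=0.1, min_eps=0.1, decay_per_step=decay_per_step)
    eps = 1.0
    for episode in range(episodes):
        for _ in range(steps_per_episode):
            eps = dec.decay(eps, episode)
    return eps


print(simulate(decay_per_step=False))  # roughly 0.8: one decrement per episode
print(simulate(decay_per_step=True))   # 0.1: decremented every step, floored at min_eps
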
@@ -16,6 +16,9 @@
 <#if (config.strategy.epsilon_decay_start)??>
         'epsilon_decay_start': ${config.strategy.epsilon_decay_start},
 </#if>
+<#if (config.strategy.epsilon_decay_per_step)??>
+        'epsilon_decay_per_step': ${config.strategy.epsilon_decay_per_step?string('True', 'False')},
+</#if>
 <#if (config.strategy.method)?? && (config.strategy.method=="ornstein_uhlenbeck")>
 <#if (config.strategy.action_low)?? >
         'action_low': ${config.strategy.action_low},
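
For a training configuration that actually sets these options, the template lines above would emit strategy parameters roughly like the fragment below. The concrete values, the surrounding variable name, and any keys other than the two added in the diff are made-up examples, not output taken from the generator:

strategy_params = {  # name of the surrounding dict is an assumption
    'epsilon_decay_method': 'linear',
    'epsilon_decay': 0.01,
    'epsilon_decay_start': 50,
    'epsilon_decay_per_step': True,
}
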