#
# Copyright (c) 2017 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from typing import Union
import numpy as np
from rl_coach.agents.policy_optimization_agent import PolicyOptimizationAgent
from rl_coach.architectures.embedder_parameters import InputEmbedderParameters
from rl_coach.architectures.head_parameters import ACERPolicyHeadParameters, QHeadParameters
from rl_coach.architectures.middleware_parameters import FCMiddlewareParameters
from rl_coach.base_parameters import AlgorithmParameters, NetworkParameters, AgentParameters
from rl_coach.core_types import Batch
from rl_coach.exploration_policies.categorical import CategoricalParameters
from rl_coach.memories.episodic.episodic_experience_replay import EpisodicExperienceReplayParameters
from rl_coach.spaces import DiscreteActionSpace
from rl_coach.utils import eps, last_sample
class ACERAlgorithmParameters(AlgorithmParameters):
"""
:param num_steps_between_gradient_updates: (int)
Every num_steps_between_gradient_updates transitions will be considered as a single batch and used for
accumulating gradients. This is also the number of steps used for bootstrapping according to the n-step formulation.
:param ratio_of_replay: (int)
The average number of off-policy training iterations performed for each on-policy ACER iteration
(the actual number is sampled from a Poisson distribution with this mean).
:param num_transitions_to_start_replay: (int)
Number of environment steps until ACER starts to train off-policy from the experience replay.
This emulates a heat-up phase where the agent learns only on-policy until there are enough transitions in
the experience replay to start the off-policy training.
:param rate_for_copying_weights_to_target: (float)
The rate of the exponential moving average for the average policy, which is used for the trust region optimization.
The target network in this algorithm is used as the average policy.
:param importance_weight_truncation: (float)
The clipping constant for the importance weight truncation (not used in the Q-retrace calculation).
:param use_trust_region_optimization: (bool)
If set to True, the gradients of the network will be modified with a term dependent on the KL divergence between
the average policy and the current one, to bound the change of the policy during the network update.
:param max_KL_divergence: (float)
The upper bound parameter for the trust region optimization; use_trust_region_optimization needs to be set to True
for this parameter to have an effect.
:param beta_entropy: (float)
An entropy regularization term can be added to the loss function in order to control exploration. This term
is weighted using the beta value defined by beta_entropy.
"""
def __init__(self):
super().__init__()
self.apply_gradients_every_x_episodes = 5
self.num_steps_between_gradient_updates = 5000
self.ratio_of_replay = 4
self.num_transitions_to_start_replay = 10000
self.rate_for_copying_weights_to_target = 0.01
self.importance_weight_truncation = 10.0
self.use_trust_region_optimization = True
self.max_KL_divergence = 1.0
self.beta_entropy = 0
class ACERNetworkParameters(NetworkParameters):
def __init__(self):
super().__init__()
self.input_embedders_parameters = {'observation': InputEmbedderParameters()}
self.middleware_parameters = FCMiddlewareParameters()
self.heads_parameters = [QHeadParameters(loss_weight=0.5), ACERPolicyHeadParameters(loss_weight=1.0)]
self.optimizer_type = 'Adam'
self.async_training = True
self.clip_gradients = 40.0
self.create_target_network = True
class ACERAgentParameters(AgentParameters):
def __init__(self):
super().__init__(algorithm=ACERAlgorithmParameters(),
exploration={DiscreteActionSpace: CategoricalParameters()},
memory=EpisodicExperienceReplayParameters(),
networks={"main": ACERNetworkParameters()})
@property
def path(self):
return 'rl_coach.agents.acer_agent:ACERAgent'
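# A minimal sketch of how these parameter classes are typically wired into a preset.
# This is an illustrative, commented-out example (not part of the agent module); the
# GymVectorEnvironment / BasicRLGraphManager / ScheduleParameters names follow rl_coach's
# preset conventions, and the environment level and step counts are placeholders.
#
#     from rl_coach.agents.acer_agent import ACERAgentParameters
#     from rl_coach.base_parameters import VisualizationParameters
#     from rl_coach.core_types import TrainingSteps, EnvironmentSteps
#     from rl_coach.environments.gym_environment import GymVectorEnvironment
#     from rl_coach.graph_managers.basic_rl_graph_manager import BasicRLGraphManager
#     from rl_coach.graph_managers.graph_manager import ScheduleParameters
#
#     agent_params = ACERAgentParameters()
#     agent_params.algorithm.num_steps_between_gradient_updates = 20
#     agent_params.algorithm.num_transitions_to_start_replay = 1000
#
#     schedule_params = ScheduleParameters()
#     schedule_params.improve_steps = TrainingSteps(100000)
#     schedule_params.heatup_steps = EnvironmentSteps(0)
#     schedule_params.evaluation_steps = EnvironmentSteps(0)
#     schedule_params.steps_between_evaluation_periods = EnvironmentSteps(10000)
#
#     graph_manager = BasicRLGraphManager(agent_params=agent_params,
#                                         env_params=GymVectorEnvironment(level='CartPole-v0'),
#                                         schedule_params=schedule_params,
#                                         vis_params=VisualizationParameters())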
# Actor-Critic with Experience Replay - https://arxiv.org/abs/1611.01224
class ACERAgent(PolicyOptimizationAgent):
def __init__(self, agent_parameters, parent: Union['LevelManager', 'CompositeAgent'] = None):
super().__init__(agent_parameters, parent)
# signals definition
self.q_loss = self.register_signal('Q Loss')
self.policy_loss = self.register_signal('Policy Loss')
self.probability_loss = self.register_signal('Probability Loss')
self.bias_correction_loss = self.register_signal('Bias Correction Loss')
self.unclipped_grads = self.register_signal('Grads (unclipped)')
self.V_Values = self.register_signal('Values')
self.kl_divergence = self.register_signal('KL Divergence')
def _learn_from_batch(self, batch):
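# extra tensors from the ACER policy head (head index 1), fetched for logging only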
fetches = [self.networks['main'].online_network.output_heads[1].probability_loss,
self.networks['main'].online_network.output_heads[1].bias_correction_loss,
self.networks['main'].online_network.output_heads[1].kl_divergence]
# batch contains a list of transitions to learn from
network_keys = self.ap.network_wrappers['main'].input_embedders_parameters.keys()
# get the values for the current states
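# the online network predicts Q values and policy probabilities; the target network holds the
# exponential moving average of the policy (the 'average policy' used for trust region optimization)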
Q_values, policy_prob = self.networks['main'].online_network.predict(batch.states(network_keys))
avg_policy_prob = self.networks['main'].target_network.predict(batch.states(network_keys))[1]
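# value of each state under the current policy: V(s) = sum_a pi(a|s) * Q(s, a)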
current_state_values = np.sum(policy_prob * Q_values, axis=1)
actions = batch.actions()
num_transitions = batch.size
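# Q_head_targets aliases Q_values; the Q-retrace targets computed below are written in place for the
# taken actions, so the remaining entries keep the network's own predictions as their targets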
Q_head_targets = Q_values
Q_i = Q_values[np.arange(num_transitions), actions]
mu = batch.info('all_action_probabilities')
rho = policy_prob / (mu + eps)
rho_i = rho[np.arange(batch.size), actions]
rho_bar = np.minimum(1.0, rho_i)
if batch.game_overs()[-1]:
Qret = 0
else:
result = self.networks['main'].online_network.predict(last_sample(batch.next_states(network_keys)))
Qret = np.sum(result[0] * result[1], axis=1)[0]
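# Q-retrace recursion (Retrace with lambda=1), computed backwards over the n-step batch:
# Qret(s_t, a_t) = r_t + gamma * [rho_bar_{t+1} * (Qret(s_{t+1}, a_{t+1}) - Q(s_{t+1}, a_{t+1})) + V(s_{t+1})]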
for i in reversed(range(num_transitions)):
Qret = batch.rewards()[i] + self.ap.algorithm.discount * Qret
Q_head_targets[i, actions[i]] = Qret
Qret = rho_bar[i] * (Qret - Q_i[i]) + current_state_values[i]
Q_retrace = Q_head_targets[np.arange(num_transitions), actions]
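# Q_retrace holds the retrace targets of the actions that were actually taken; the policy head uses
# them in the truncated importance-sampling term of the policy gradient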
# train
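# inputs named 'output_1_<i>' feed the additional placeholders of the second head (the ACER policy head);
# the targets list provides one target array per head: the Q-retrace targets for the Q head and the
# current state values for the policy head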
result = self.networks['main'].train_and_sync_networks({**batch.states(network_keys),
'output_1_0': actions,
'output_1_1': rho,
'output_1_2': rho_i,
'output_1_3': Q_values,
'output_1_4': Q_retrace,
'output_1_5': avg_policy_prob},
[Q_head_targets, current_state_values],
additional_fetches=fetches)
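# softly update the target network towards the online weights; with rate_for_copying_weights_to_target
# this maintains the exponential moving average that serves as the average policy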
for network in self.networks.values():
network.update_target_network(self.ap.algorithm.rate_for_copying_weights_to_target)
# logging
total_loss, losses, unclipped_grads, fetch_result = result[:4]
self.q_loss.add_sample(losses[0])
self.policy_loss.add_sample(losses[1])
self.probability_loss.add_sample(fetch_result[0])
self.bias_correction_loss.add_sample(fetch_result[1])
self.unclipped_grads.add_sample(unclipped_grads)
self.V_Values.add_sample(current_state_values)
self.kl_divergence.add_sample(fetch_result[2])
return total_loss, losses, unclipped_grads
def learn_from_batch(self, batch):
# perform on-policy training iteration
total_loss, losses, unclipped_grads = self._learn_from_batch(batch)
if self.ap.algorithm.ratio_of_replay > 0 \
and self.memory.num_transitions() > self.ap.algorithm.num_transitions_to_start_replay:
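# following the ACER paper, the number of off-policy (replay) iterations is sampled from a
# Poisson distribution whose mean is the replay ratio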
n = np.random.poisson(self.ap.algorithm.ratio_of_replay)
# perform n off-policy training iterations
for _ in range(n):
new_batch = Batch(self.call_memory('sample', (self.ap.algorithm.num_steps_between_gradient_updates, True)))
result = self._learn_from_batch(new_batch)
total_loss += result[0]
losses += result[1]
unclipped_grads += result[2]
return total_loss, losses, unclipped_grads
def get_prediction(self, states):
tf_input_state = self.prepare_batch_for_inference(states, "main")
return self.networks['main'].online_network.predict(tf_input_state)[1:]  # index 0 is the Q head output; return only the action probabilities