Source code for rl_coach.agents.acer_agent

#
# Copyright (c) 2017 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

from typing import Union
import numpy as np

from rl_coach.agents.policy_optimization_agent import PolicyOptimizationAgent
from rl_coach.architectures.embedder_parameters import InputEmbedderParameters
from rl_coach.architectures.head_parameters import ACERPolicyHeadParameters, QHeadParameters
from rl_coach.architectures.middleware_parameters import FCMiddlewareParameters
from rl_coach.base_parameters import AlgorithmParameters, NetworkParameters, AgentParameters
from rl_coach.core_types import Batch
from rl_coach.exploration_policies.categorical import CategoricalParameters
from rl_coach.memories.episodic.episodic_experience_replay import EpisodicExperienceReplayParameters
from rl_coach.spaces import DiscreteActionSpace
from rl_coach.utils import eps, last_sample


class ACERAlgorithmParameters(AlgorithmParameters):
    """
    :param num_steps_between_gradient_updates: (int)
        Every num_steps_between_gradient_updates transitions will be considered as a single batch and used for
        accumulating gradients. This is also the number of steps used for bootstrapping according to the n-step
        formulation.

    :param ratio_of_replay: (int)
        The number of off-policy training iterations in each ACER iteration.

    :param num_transitions_to_start_replay: (int)
        Number of environment steps until ACER starts to train off-policy from the experience replay.
        This emulates a heat-up phase where the agent learns only on-policy until there are enough transitions
        in the experience replay to start the off-policy training.

    :param rate_for_copying_weights_to_target: (float)
        The rate of the exponential moving average for the average policy which is used for the trust region
        optimization. The target network in this algorithm is used as the average policy.

    :param importance_weight_truncation: (float)
        The clipping constant for the importance weight truncation (not used in the Q-retrace calculation).

    :param use_trust_region_optimization: (bool)
        If set to True, the gradients of the network will be modified with a term dependent on the KL divergence
        between the average policy and the current one, to bound the change of the policy during the network update.

    :param max_KL_divergence: (float)
        The upper bound parameter for the trust region optimization. use_trust_region_optimization needs to be set
        to True for this parameter to have an effect.

    :param beta_entropy: (float)
        An entropy regularization term can be added to the loss function in order to control exploration. This term
        is weighted using the beta value defined by beta_entropy.
    """
    def __init__(self):
        super().__init__()
        self.apply_gradients_every_x_episodes = 5
        self.num_steps_between_gradient_updates = 5000
        self.ratio_of_replay = 4
        self.num_transitions_to_start_replay = 10000
        self.rate_for_copying_weights_to_target = 0.01
        self.importance_weight_truncation = 10.0
        self.use_trust_region_optimization = True
        self.max_KL_divergence = 1.0
        self.beta_entropy = 0
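
In practice these defaults are adjusted from a preset rather than by editing this module. Below is a minimal sketch of that pattern; the attribute names are the ones defined in ACERAlgorithmParameters above, the override values are illustrative, and the surrounding preset wiring (graph manager, environment parameters, schedule) is assumed and omitted.

    from rl_coach.agents.acer_agent import ACERAgentParameters

    # start from the ACER defaults and override a few algorithm knobs
    agent_params = ACERAgentParameters()
    agent_params.algorithm.num_steps_between_gradient_updates = 20    # shorter n-step rollouts (illustrative value)
    agent_params.algorithm.ratio_of_replay = 4                        # ~4 off-policy updates per on-policy update
    agent_params.algorithm.num_transitions_to_start_replay = 10000    # heat-up phase before replay training starts
    agent_params.algorithm.beta_entropy = 0.01                        # enable entropy regularization (default is 0)
    agent_params.algorithm.use_trust_region_optimization = True
    agent_params.algorithm.max_KL_divergence = 1.0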
class ACERNetworkParameters(NetworkParameters):
    def __init__(self):
        super().__init__()
        self.input_embedders_parameters = {'observation': InputEmbedderParameters()}
        self.middleware_parameters = FCMiddlewareParameters()
        self.heads_parameters = [QHeadParameters(loss_weight=0.5), ACERPolicyHeadParameters(loss_weight=1.0)]
        self.optimizer_type = 'Adam'
        self.async_training = True
        self.clip_gradients = 40.0
        self.create_target_network = True


class ACERAgentParameters(AgentParameters):
    def __init__(self):
        super().__init__(algorithm=ACERAlgorithmParameters(),
                         exploration={DiscreteActionSpace: CategoricalParameters()},
                         memory=EpisodicExperienceReplayParameters(),
                         networks={"main": ACERNetworkParameters()})

    @property
    def path(self):
        return 'rl_coach.agents.acer_agent:ACERAgent'


# Actor-Critic with Experience Replay - https://arxiv.org/abs/1611.01224
class ACERAgent(PolicyOptimizationAgent):
    def __init__(self, agent_parameters, parent: Union['LevelManager', 'CompositeAgent']=None):
        super().__init__(agent_parameters, parent)

        # signals definition
        self.q_loss = self.register_signal('Q Loss')
        self.policy_loss = self.register_signal('Policy Loss')
        self.probability_loss = self.register_signal('Probability Loss')
        self.bias_correction_loss = self.register_signal('Bias Correction Loss')
        self.unclipped_grads = self.register_signal('Grads (unclipped)')
        self.V_Values = self.register_signal('Values')
        self.kl_divergence = self.register_signal('KL Divergence')

    def _learn_from_batch(self, batch):
        fetches = [self.networks['main'].online_network.output_heads[1].probability_loss,
                   self.networks['main'].online_network.output_heads[1].bias_correction_loss,
                   self.networks['main'].online_network.output_heads[1].kl_divergence]

        # batch contains a list of transitions to learn from
        network_keys = self.ap.network_wrappers['main'].input_embedders_parameters.keys()

        # get the values for the current states
        Q_values, policy_prob = self.networks['main'].online_network.predict(batch.states(network_keys))
        avg_policy_prob = self.networks['main'].target_network.predict(batch.states(network_keys))[1]
        current_state_values = np.sum(policy_prob * Q_values, axis=1)

        actions = batch.actions()
        num_transitions = batch.size

        Q_head_targets = Q_values

        Q_i = Q_values[np.arange(num_transitions), actions]

        mu = batch.info('all_action_probabilities')
        rho = policy_prob / (mu + eps)
        rho_i = rho[np.arange(batch.size), actions]
        rho_bar = np.minimum(1.0, rho_i)

        if batch.game_overs()[-1]:
            Qret = 0
        else:
            result = self.networks['main'].online_network.predict(last_sample(batch.next_states(network_keys)))
            Qret = np.sum(result[0] * result[1], axis=1)[0]

        for i in reversed(range(num_transitions)):
            Qret = batch.rewards()[i] + self.ap.algorithm.discount * Qret
            Q_head_targets[i, actions[i]] = Qret
            Qret = rho_bar[i] * (Qret - Q_i[i]) + current_state_values[i]

        Q_retrace = Q_head_targets[np.arange(num_transitions), actions]

        # train
        result = self.networks['main'].train_and_sync_networks({**batch.states(network_keys),
                                                                'output_1_0': actions,
                                                                'output_1_1': rho,
                                                                'output_1_2': rho_i,
                                                                'output_1_3': Q_values,
                                                                'output_1_4': Q_retrace,
                                                                'output_1_5': avg_policy_prob},
                                                               [Q_head_targets, current_state_values],
                                                               additional_fetches=fetches)

        for network in self.networks.values():
            network.update_target_network(self.ap.algorithm.rate_for_copying_weights_to_target)

        # logging
        total_loss, losses, unclipped_grads, fetch_result = result[:4]
        self.q_loss.add_sample(losses[0])
        self.policy_loss.add_sample(losses[1])
        self.probability_loss.add_sample(fetch_result[0])
        self.bias_correction_loss.add_sample(fetch_result[1])
        self.unclipped_grads.add_sample(unclipped_grads)
        self.V_Values.add_sample(current_state_values)
        self.kl_divergence.add_sample(fetch_result[2])

        return total_loss, losses, unclipped_grads

    def learn_from_batch(self, batch):
        # perform on-policy training iteration
        total_loss, losses, unclipped_grads = self._learn_from_batch(batch)

        if self.ap.algorithm.ratio_of_replay > 0 \
                and self.memory.num_transitions() > self.ap.algorithm.num_transitions_to_start_replay:
            n = np.random.poisson(self.ap.algorithm.ratio_of_replay)
            # perform n off-policy training iterations
            for _ in range(n):
                new_batch = Batch(self.call_memory('sample',
                                                   (self.ap.algorithm.num_steps_between_gradient_updates, True)))
                result = self._learn_from_batch(new_batch)
                total_loss += result[0]
                losses += result[1]
                unclipped_grads += result[2]

        return total_loss, losses, unclipped_grads

    def get_prediction(self, states):
        tf_input_state = self.prepare_batch_for_inference(states, "main")
        return self.networks['main'].online_network.predict(tf_input_state)[1:]  # index 0 is the Q values head
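
For reference, the Q-retrace targets built by the backward sweep in _learn_from_batch can be reproduced with plain NumPy. The sketch below mirrors that loop; it is an illustrative re-derivation, not part of the Coach API, and the input names (q_values, policy_prob, behavior_prob, bootstrap_value) are hypothetical arguments chosen for clarity.

    import numpy as np

    def q_retrace_targets(rewards, actions, q_values, policy_prob, behavior_prob,
                          bootstrap_value, terminal, discount, eps=1e-8):
        """Backward Q-retrace recursion, mirroring ACERAgent._learn_from_batch above."""
        n = len(rewards)
        state_values = np.sum(policy_prob * q_values, axis=1)     # V(s) = sum_a pi(a|s) * Q(s, a)
        q_i = q_values[np.arange(n), actions]                     # Q(s_i, a_i)
        rho_i = policy_prob[np.arange(n), actions] / (behavior_prob[np.arange(n), actions] + eps)
        rho_bar = np.minimum(1.0, rho_i)                          # truncated importance weights

        targets = q_values.copy()
        q_ret = 0.0 if terminal else bootstrap_value              # bootstrap from the value of the last next state
        for i in reversed(range(n)):
            q_ret = rewards[i] + discount * q_ret
            targets[i, actions[i]] = q_ret
            # propagate the correction backwards: Qret <- rho_bar * (Qret - Q(s_i, a_i)) + V(s_i)
            q_ret = rho_bar[i] * (q_ret - q_i[i]) + state_values[i]
        return targets

The only intentional difference from the agent code is the explicit copy of q_values before writing the targets; the agent reuses the predicted Q_values array in place, which is equivalent for a single update.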