:param agent_parameters: A AgentParameters class instance with all the agent parameters """super().__init__()
+ # use seed
+ ifagent_parameters.task_parameters.seedisnotNone:
+ random.seed(agent_parameters.task_parameters.seed)
+ np.random.seed(agent_parameters.task_parameters.seed)
+ else:
+ # we need to seed the RNG since the different processes are initialized with the same parent seed
+ random.seed()
+ np.random.seed()
+
self.ap=agent_parametersself.task_id=self.ap.task_parameters.task_indexself.is_chief=self.task_id==0
@@ -229,10 +241,10 @@
Source code for rl_coach.agents.agent
andself.ap.memory.shared_memoryifself.shared_memory:self.shared_memory_scratchpad=self.ap.task_parameters.shared_memory_scratchpad
- self.name=agent_parameters.nameself.parent=parentself.parent_level_manager=None
- self.full_name_id=agent_parameters.full_name_id=self.name
+ # TODO this needs to be sorted out. Why the duplicates for the agent's name?
+ self.full_name_id=agent_parameters.full_name_id=self.name=agent_parameters.nameiftype(agent_parameters.task_parameters)==DistributedTaskParameters:screen.log_title("Creating agent - name: {} task id: {} (may take up to 30 seconds due to "
@@ -264,9 +276,17 @@
Source code for rl_coach.agents.agent
self.memory.set_memory_backend(self.memory_backend)ifagent_parameters.memory.load_memory_from_file_path:
- screen.log_title("Loading replay buffer from pickle. Pickle path: {}"
- .format(agent_parameters.memory.load_memory_from_file_path))
- self.memory.load(agent_parameters.memory.load_memory_from_file_path)
+ ifisinstance(agent_parameters.memory.load_memory_from_file_path,PickledReplayBuffer):
+ screen.log_title("Loading a pickled replay buffer. Pickled file path: {}"
+ .format(agent_parameters.memory.load_memory_from_file_path.filepath))
+ self.memory.load_pickled(agent_parameters.memory.load_memory_from_file_path.filepath)
+ elifisinstance(agent_parameters.memory.load_memory_from_file_path,CsvDataset):
+ screen.log_title("Loading a replay buffer from a CSV file. CSV file path: {}"
+ .format(agent_parameters.memory.load_memory_from_file_path.filepath))
+ self.memory.load_csv(agent_parameters.memory.load_memory_from_file_path)
+ else:
+ raiseValueError('Trying to load a replay buffer using an unsupported method - {}. '
+ .format(agent_parameters.memory.load_memory_from_file_path))ifself.shared_memoryandself.is_chief:self.shared_memory_scratchpad.add(self.memory_lookup_name,self.memory)
@@ -327,6 +347,7 @@
self.discounted_return=self.register_signal('Discounted Return')ifisinstance(self.in_action_space,GoalsSpace):self.distance_from_goal=self.register_signal('Distance From Goal',dump_one_value_per_step=True)
- # use seed
- ifself.ap.task_parameters.seedisnotNone:
- random.seed(self.ap.task_parameters.seed)
- np.random.seed(self.ap.task_parameters.seed)
- else:
- # we need to seed the RNG since the different processes are initialized with the same parent seed
- random.seed()
- np.random.seed()
+
+ # batch rl
+ self.ope_manager=OpeManager()ifself.ap.is_batch_rl_trainingelseNone@propertydefparent(self)->'LevelManager':
@@ -408,6 +424,7 @@
self.accumulated_shaped_rewards_across_evaluation_episodes=0self.num_successes_across_evaluation_episodes=0self.num_evaluation_episodes_completed=0
- ifself.ap.is_a_highest_level_agentorself.ap.task_parameters.verbosity=="high":
+
+ # TODO verbosity was mistakenly removed from task_parameters on release 0.11.0, need to bring it back
+ # if self.ap.is_a_highest_level_agent or self.ap.task_parameters.verbosity == "high":
+ ifself.ap.is_a_highest_level_agent:screen.log_title("{}: Starting evaluation phase".format(self.name))elifending_evaluation:# we write to the next episode, because it could be that the current episode was already written# to disk and then we won't write it again
- self.agent_logger.set_current_time(self.current_episode+1)
+ self.agent_logger.set_current_time(self.get_current_time()+1)
+
evaluation_reward=self.accumulated_rewards_across_evaluation_episodes/self.num_evaluation_episodes_completedself.agent_logger.create_signal_value('Evaluation Reward',evaluation_reward)
@@ -577,9 +598,11 @@
Source code for rl_coach.agents.agent
success_rate=self.num_successes_across_evaluation_episodes/self.num_evaluation_episodes_completedself.agent_logger.create_signal_value("Success Rate",
- success_rate
- )
- ifself.ap.is_a_highest_level_agentorself.ap.task_parameters.verbosity=="high":
+ success_rate)
+
+ # TODO verbosity was mistakenly removed from task_parameters on release 0.11.0, need to bring it back
+ # if self.ap.is_a_highest_level_agent or self.ap.task_parameters.verbosity == "high":
+ ifself.ap.is_a_highest_level_agent:screen.log_title("{}: Finished evaluation phase. Success rate = {}, Avg Total Reward = {}".format(self.name,np.round(success_rate,2),np.round(evaluation_reward,2)))
@@ -652,8 +675,11 @@
Source code for rl_coach.agents.agent
:return: None """# log all the signals to file
- self.agent_logger.set_current_time(self.current_episode)
+ current_time=self.get_current_time()
+ self.agent_logger.set_current_time(current_time)self.agent_logger.create_signal_value('Training Iter',self.training_iteration)
+ self.agent_logger.create_signal_value('Episode #',self.current_episode)
+ self.agent_logger.create_signal_value('Epoch',self.training_epoch)self.agent_logger.create_signal_value('In Heatup',int(self._phase==RunPhase.HEATUP))self.agent_logger.create_signal_value('ER #Transitions',self.call_memory('num_transitions'))self.agent_logger.create_signal_value('ER #Episodes',self.call_memory('length'))
@@ -666,12 +692,17 @@
Source code for rl_coach.agents.agent
ifself._phase==RunPhase.TRAINelsenp.nan)self.agent_logger.create_signal_value('Update Target Network',0,overwrite=False)
- self.agent_logger.update_wall_clock_time(self.current_episode)
-
- ifself._phase!=RunPhase.TEST:
- self.agent_logger.create_signal_value('Evaluation Reward',np.nan,overwrite=False)
- self.agent_logger.create_signal_value('Shaped Evaluation Reward',np.nan,overwrite=False)
- self.agent_logger.create_signal_value('Success Rate',np.nan,overwrite=False)
+ self.agent_logger.update_wall_clock_time(current_time)
+
+ # The following signals are created with meaningful values only when an evaluation phase is completed.
+ # Creating with default NaNs for any HEATUP/TRAIN/TEST episode which is not the last in an evaluation phase
+ self.agent_logger.create_signal_value('Evaluation Reward',np.nan,overwrite=False)
+ self.agent_logger.create_signal_value('Shaped Evaluation Reward',np.nan,overwrite=False)
+ self.agent_logger.create_signal_value('Success Rate',np.nan,overwrite=False)
+ self.agent_logger.create_signal_value('Inverse Propensity Score',np.nan,overwrite=False)
+ self.agent_logger.create_signal_value('Direct Method Reward',np.nan,overwrite=False)
+ self.agent_logger.create_signal_value('Doubly Robust',np.nan,overwrite=False)
+ self.agent_logger.create_signal_value('Sequential Doubly Robust',np.nan,overwrite=False)forsignalinself.episode_signals:self.agent_logger.create_signal_value("{}/Mean".format(signal.name),signal.get_mean())
@@ -680,8 +711,7 @@
self.total_reward_in_current_episode>=self.spaces.reward.reward_success_threshold:self.num_successes_across_evaluation_episodes+=1
- ifself.ap.visualization.dump_csv:
+ ifself.ap.visualization.dump_csvand \
+ self.parent_level_manager.parent_graph_manager.time_metric==TimeTypes.EpisodeNumber:self.update_log()
- ifself.ap.is_a_highest_level_agentorself.ap.task_parameters.verbosity=="high":
+ # TODO verbosity was mistakenly removed from task_parameters on release 0.11.0, need to bring it back
+ # if self.ap.is_a_highest_level_agent or self.ap.task_parameters.verbosity == "high":
+ ifself.ap.is_a_highest_level_agent:self.log_to_screen()
"""loss=0ifself._should_train():
+ self.training_epoch+=1fornetworkinself.networks.values():network.set_is_training(True)
- fortraining_stepinrange(self.ap.algorithm.num_consecutive_training_steps):
- # TODO: this should be network dependent
- network_parameters=list(self.ap.network_wrappers.values())[0]
+ # At the moment we only support a single batch size for all the networks
+ networks_parameters=list(self.ap.network_wrappers.values())
+ assertall(net.batch_size==networks_parameters[0].batch_sizefornetinnetworks_parameters)
+
+ batch_size=networks_parameters[0].batch_size
+ # we either go sequentially through the entire replay buffer in the batch RL mode,
+ # or sample randomly for the basic RL case.
+ training_schedule=self.call_memory('get_shuffled_data_generator',batch_size)if \
+ self.ap.is_batch_rl_trainingelse[self.call_memory('sample',batch_size)for_in
+ range(self.ap.algorithm.num_consecutive_training_steps)]
+
+ forbatchintraining_schedule:# update countersself.training_iteration+=1
-
- # sample a batch and train on it
- batch=self.call_memory('sample',network_parameters.batch_size)ifself.pre_network_filterisnotNone:batch=self.pre_network_filter.filter(batch,update_internal_state=False,deep_copy=False)
@@ -853,15 +893,19 @@
Source code for rl_coach.agents.agent
batch=Batch(batch)total_loss,losses,unclipped_grads=self.learn_from_batch(batch)loss+=total_loss
+
self.unclipped_grads.add_sample(unclipped_grads)
- # TODO: the learning rate decay should be done through the network instead of here
+ # TODO: this only deals with the main network (if exists), need to do the same for other networks
+ # for instance, for DDPG, the LR signal is currently not shown. Probably should be done through the
+ # network directly instead of here# decay learning rate
- ifnetwork_parameters.learning_rate_decay_rate!=0:
+ if'main'inself.ap.network_wrappersand \
+ self.ap.network_wrappers['main'].learning_rate_decay_rate!=0:self.curr_learning_rate.add_sample(self.networks['main'].sess.run(self.networks['main'].online_network.current_learning_rate))else:
- self.curr_learning_rate.add_sample(network_parameters.learning_rate)
+ self.curr_learning_rate.add_sample(networks_parameters[0].learning_rate)ifany([network.has_targetfornetworkinself.networks.values()]) \
andself._should_update_online_weights_to_target():
@@ -877,6 +921,12 @@
Source code for rl_coach.agents.agent
ifself.imitation:self.log_to_screen()
+ ifself.ap.visualization.dump_csvand \
+ self.parent_level_manager.parent_graph_manager.time_metric==TimeTypes.Epoch:
+ # in BatchRL, or imitation learning, the agent never acts, so we have to get the stats out here.
+ # we dump the data out every epoch
+ self.update_log()
+
fornetworkinself.networks.values():network.set_is_training(False)
@@ -919,10 +969,11 @@
[docs]defact(self,action:Union[None,ActionType]=None)->ActionInfo:""" Given the agents current knowledge, decide on the next action to apply to the environment
+ :param action: An action to take, overriding whatever the current policy is :return: An ActionInfo object, which contains the action and any additional info from the action decision process """ifself.phase==RunPhase.TRAINandself.ap.algorithm.num_consecutive_playing_steps.num_steps==0:
@@ -935,20 +986,28 @@
Source code for rl_coach.agents.agent
self.current_episode_steps_counter+=1# decide on the action
- ifself.phase==RunPhase.HEATUPandnotself.ap.algorithm.heatup_using_network_decisions:
- # random action
- self.last_action_info=self.spaces.action.sample_with_info()
- else:
- # informed action
- ifself.pre_network_filterisnotNone:
- # before choosing an action, first use the pre_network_filter to filter out the current state
- update_filter_internal_state=self.phaseisnotRunPhase.TEST
- curr_state=self.run_pre_network_filter_for_inference(self.curr_state,update_filter_internal_state)
-
+ ifactionisNone:
+ ifself.phase==RunPhase.HEATUPandnotself.ap.algorithm.heatup_using_network_decisions:
+ # random action
+ action=self.spaces.action.sample_with_info()else:
- curr_state=self.curr_state
- self.last_action_info=self.choose_action(curr_state)
+ # informed action
+ ifself.pre_network_filterisnotNone:
+ # before choosing an action, first use the pre_network_filter to filter out the current state
+ update_filter_internal_state=self.phaseisnotRunPhase.TEST
+ curr_state=self.run_pre_network_filter_for_inference(self.curr_state,update_filter_internal_state)
+
+ else:
+ curr_state=self.curr_state
+ action=self.choose_action(curr_state)
+ assertisinstance(action,ActionInfo)
+ self.last_action_info=action
+
+ # output filters are explicitly applied after recording self.last_action_info. This is
+ # because the output filters may change the representation of the action so that the agent
+ # can no longer use the transition in it's replay buffer. It is possible that these filters
+ # could be moved to the environment instead, but they are here now for historical reasons.filtered_action_info=self.output_filter.filter(self.last_action_info)returnfiltered_action_info
@@ -1030,37 +1089,35 @@
Source code for rl_coach.agents.agent
# make agent specific changes to the transition if neededtransition=self.update_transition_before_adding_to_replay_buffer(transition)
- # merge the intrinsic reward in
- ifself.ap.algorithm.scale_external_reward_by_intrinsic_reward_value:
- transition.reward=transition.reward*(1+self.last_action_info.action_intrinsic_reward)
- else:
- transition.reward=transition.reward+self.last_action_info.action_intrinsic_reward
-
- # sum up the total shaped reward
- self.total_shaped_reward_in_current_episode+=transition.reward
- self.total_reward_in_current_episode+=env_response.reward
- self.shaped_reward.add_sample(transition.reward)
- self.reward.add_sample(env_response.reward)
-
# add action info to transitioniftype(self.parent).__name__=='CompositeAgent':transition.add_info(self.parent.last_action_info.__dict__)else:transition.add_info(self.last_action_info.__dict__)
- # create and store the transition
- ifself.phasein[RunPhase.TRAIN,RunPhase.HEATUP]:
- # for episodic memories we keep the transitions in a local buffer until the episode is ended.
- # for regular memories we insert the transitions directly to the memory
- self.current_episode_buffer.insert(transition)
- ifnotisinstance(self.memory,EpisodicExperienceReplay) \
- andnotself.ap.algorithm.store_transitions_only_when_episodes_are_terminated:
- self.call_memory('store',transition)
+ self.total_reward_in_current_episode+=env_response.reward
+ self.reward.add_sample(env_response.reward)
- ifself.ap.visualization.dump_in_episode_signals:
- self.update_step_in_episode_log()
+ returnself.observe_transition(transition)
- returntransition.game_over
+ defobserve_transition(self,transition):
+ # sum up the total shaped reward
+ self.total_shaped_reward_in_current_episode+=transition.reward
+ self.shaped_reward.add_sample(transition.reward)
+
+ # create and store the transition
+ ifself.phasein[RunPhase.TRAIN,RunPhase.HEATUP]:
+ # for episodic memories we keep the transitions in a local buffer until the episode is ended.
+ # for regular memories we insert the transitions directly to the memory
+ self.current_episode_buffer.insert(transition)
+ ifnotisinstance(self.memory,EpisodicExperienceReplay) \
+ andnotself.ap.algorithm.store_transitions_only_when_episodes_are_terminated:
+ self.call_memory('store',transition)
+
+ ifself.ap.visualization.dump_in_episode_signals:
+ self.update_step_in_episode_log()
+
+ returntransition.game_over
- # TODO-remove - this is a temporary flow, used by the trainer worker, duplicated from observe() - need to create
- # an external trainer flow reusing the existing flow and methods [e.g. observe(), step(), act()]
-
[docs]defemulate_observe_on_trainer(self,transition:Transition)->bool:
- """
- This emulates the observe using the transition obtained from the rollout worker on the training worker
- in case of distributed training.
- Given a response from the environment, distill the observation from it and store it for later use.
- The response should be a dictionary containing the performed action, the new observation and measurements,
- the reward, a game over flag and any additional information necessary.
- :return:
- """
-
- # sum up the total shaped reward
- self.total_shaped_reward_in_current_episode+=transition.reward
- self.total_reward_in_current_episode+=transition.reward
- self.shaped_reward.add_sample(transition.reward)
- self.reward.add_sample(transition.reward)
-
- # create and store the transition
- ifself.phasein[RunPhase.TRAIN,RunPhase.HEATUP]:
- # for episodic memories we keep the transitions in a local buffer until the episode is ended.
- # for regular memories we insert the transitions directly to the memory
- self.current_episode_buffer.insert(transition)
- ifnotisinstance(self.memory,EpisodicExperienceReplay) \
- andnotself.ap.algorithm.store_transitions_only_when_episodes_are_terminated:
- self.call_memory('store',transition)
-
- ifself.ap.visualization.dump_in_episode_signals:
- self.update_step_in_episode_log()
-
- returntransition.game_over
-
- # TODO-remove - this is a temporary flow, used by the trainer worker, duplicated from observe() - need to create
- # an external trainer flow reusing the existing flow and methods [e.g. observe(), step(), act()]
-
[docs]defemulate_act_on_trainer(self,transition:Transition)->ActionInfo:
- """
- This emulates the act using the transition obtained from the rollout worker on the training worker
- in case of distributed training.
- Given the agents current knowledge, decide on the next action to apply to the environment
- :return: an action and a dictionary containing any additional info from the action decision process
- """
- ifself.phase==RunPhase.TRAINandself.ap.algorithm.num_consecutive_playing_steps.num_steps==0:
- # This agent never plays while training (e.g. behavioral cloning)
- returnNone
-
- # count steps (only when training or if we are in the evaluation worker)
- ifself.phase!=RunPhase.TESTorself.ap.task_parameters.evaluate_only:
- self.total_steps_counter+=1
- self.current_episode_steps_counter+=1
-
- self.last_action_info=transition.action
-
- returnself.last_action_info
+#
+# Copyright (c) 2017 Intel Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+fromtypingimportUnion,List,Dict
+
+importnumpyasnp
+
+fromrl_coach.core_typesimportEnvResponse,ActionInfo,RunPhase,PredictionType,ActionType,Transition
+fromrl_coach.saverimportSaverCollection
+
+
+classAgentInterface(object):
+ def__init__(self):
+ self._phase=RunPhase.HEATUP
+ self._parent=None
+ self.spaces=None
+
+ @property
+ defparent(self):
+ """
+ Get the parent class of the agent
+ :return: the current phase
+ """
+ returnself._parent
+
+ @parent.setter
+ defparent(self,val):
+ """
+ Change the parent class of the agent
+ :param val: the new parent
+ :return: None
+ """
+ self._parent=val
+
+ @property
+ defphase(self)->RunPhase:
+ """
+ Get the phase of the agent
+ :return: the current phase
+ """
+ returnself._phase
+
+ @phase.setter
+ defphase(self,val:RunPhase):
+ """
+ Change the phase of the agent
+ :param val: the new phase
+ :return: None
+ """
+ self._phase=val
+
+ defreset_internal_state(self)->None:
+ """
+ Reset the episode parameters for the agent
+ :return: None
+ """
+ raiseNotImplementedError("")
+
+ deftrain(self)->Union[float,List]:
+ """
+ Train the agents network
+ :return: The loss of the training
+ """
+ raiseNotImplementedError("")
+
+ defact(self)->ActionInfo:
+ """
+ Get a decision of the next action to take.
+ The action is dependent on the current state which the agent holds from resetting the environment or from
+ the observe function.
+ :return: A tuple containing the actual action and additional info on the action
+ """
+ raiseNotImplementedError("")
+
+ defobserve(self,env_response:EnvResponse)->bool:
+ """
+ Gets a response from the environment.
+ Processes this information for later use. For example, create a transition and store it in memory.
+ The action info (a class containing any info the agent wants to store regarding its action decision process) is
+ stored by the agent itself when deciding on the action.
+ :param env_response: a EnvResponse containing the response from the environment
+ :return: a done signal which is based on the agent knowledge. This can be different from the done signal from
+ the environment. For example, an agent can decide to finish the episode each time it gets some
+ intrinsic reward
+ """
+ raiseNotImplementedError("")
+
+ defsave_checkpoint(self,checkpoint_prefix:str)->None:
+ """
+ Save the model of the agent to the disk. This can contain the network parameters, the memory of the agent, etc.
+ :param checkpoint_prefix: The prefix of the checkpoint file to save
+ :return: None
+ """
+ raiseNotImplementedError("")
+
+ defget_predictions(self,states:Dict,prediction_type:PredictionType)->np.ndarray:
+ """
+ Get a prediction from the agent with regard to the requested prediction_type. If the agent cannot predict this
+ type of prediction_type, or if there is more than possible way to do so, raise a ValueException.
+ :param states:
+ :param prediction_type:
+ :return: the agent's prediction
+ """
+ raiseNotImplementedError("")
+
+ defset_incoming_directive(self,action:ActionType)->None:
+ """
+ Pass a higher level command (directive) to the agent.
+ For example, a higher level agent can set the goal of the agent.
+ :param action: the directive to pass to the agent
+ :return: None
+ """
+ raiseNotImplementedError("")
+
+ defcollect_savers(self,parent_path_suffix:str)->SaverCollection:
+ """
+ Collect all of agent savers
+ :param parent_path_suffix: path suffix of the parent of the agent
+ (could be name of level manager or composite agent)
+ :return: collection of all agent savers
+ """
+ raiseNotImplementedError("")
+
+ defhandle_episode_ended(self)->None:
+ """
+ Make any changes needed when each episode is ended.
+ This includes incrementing counters, updating full episode dependent values, updating logs, etc.
+ This function is called right after each episode is ended.
+
+ :return: None
+ """
+ raiseNotImplementedError("")
+
+ defrun_off_policy_evaluation(self)->None:
+ """
+ Run off-policy evaluation estimators to evaluate the trained policy performance against a dataset.
+ Should only be implemented for off-policy RL algorithms.
+
+ :return: None
+ """
+ raiseNotImplementedError("")
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/docs/_modules/rl_coach/agents/categorical_dqn_agent.html b/docs/_modules/rl_coach/agents/categorical_dqn_agent.html
index 667a484eb..0b8183153 100644
--- a/docs/_modules/rl_coach/agents/categorical_dqn_agent.html
+++ b/docs/_modules/rl_coach/agents/categorical_dqn_agent.html
@@ -264,8 +264,8 @@
Source code for rl_coach.agents.categorical_dqn_agent
# prediction's format is (batch,actions,atoms)
defget_all_q_values_for_states(self,states:StateType):ifself.exploration_policy.requires_action_values():
- prediction=self.get_prediction(states)
- q_values=self.distribution_prediction_to_q_values(prediction)
+ q_values=self.get_prediction(states,
+ outputs=[self.networks['main'].online_network.output_heads[0].q_values])else:q_values=Nonereturnq_values
@@ -280,11 +280,14 @@
Source code for rl_coach.agents.categorical_dqn_agent
(self.networks['main'].online_network,batch.states(network_keys))])
+ # add Q value samples for logging
+ self.q_values.add_sample(self.distribution_prediction_to_q_values(TD_targets))
+
# select the optimal actions for the next statetarget_actions=np.argmax(self.distribution_prediction_to_q_values(distributional_q_st_plus_1),axis=1)
- m=np.zeros((self.ap.network_wrappers['main'].batch_size,self.z_values.size))
+ m=np.zeros((batch.size,self.z_values.size))
- batches=np.arange(self.ap.network_wrappers['main'].batch_size)
+ batches=np.arange(batch.size)# an alternative to the for loop. 3.7x perf improvement vs. the same code done with for looping.# only 10% speedup overall - leaving commented out as the code is not as clear.
@@ -297,7 +300,7 @@
Source code for rl_coach.agents.categorical_dqn_agent
self.networks['main'].online_network.output_heads[1].likelihood_ratio,self.networks['main'].online_network.output_heads[1].clipped_likelihood_ratio]
+ # TODO-fixme if batch.size / self.ap.network_wrappers['main'].batch_size is not an integer, we do not train on
+ # some of the dataforiinrange(int(batch.size/self.ap.network_wrappers['main'].batch_size)):start=i*self.ap.network_wrappers['main'].batch_sizeend=(i+1)*self.ap.network_wrappers['main'].batch_size
diff --git a/docs/_modules/rl_coach/agents/dfp_agent.html b/docs/_modules/rl_coach/agents/dfp_agent.html
index 36aca38b5..447c31e74 100644
--- a/docs/_modules/rl_coach/agents/dfp_agent.html
+++ b/docs/_modules/rl_coach/agents/dfp_agent.html
@@ -326,13 +326,13 @@
Source code for rl_coach.agents.dfp_agent
network_inputs=batch.states(network_keys)network_inputs['goal']=np.repeat(np.expand_dims(self.current_goal,0),
- self.ap.network_wrappers['main'].batch_size,axis=0)
+ batch.size,axis=0)# get the current outputs of the networktargets=self.networks['main'].online_network.predict(network_inputs)# change the targets for the taken actions
- foriinrange(self.ap.network_wrappers['main'].batch_size):
+ foriinrange(batch.size):targets[i,batch.actions()[i]]=batch[i].info['future_measurements'].flatten()result=self.networks['main'].train_and_sync_networks(network_inputs,targets)
diff --git a/docs/_modules/rl_coach/agents/dqn_agent.html b/docs/_modules/rl_coach/agents/dqn_agent.html
index 5024209d1..01cd007f7 100644
--- a/docs/_modules/rl_coach/agents/dqn_agent.html
+++ b/docs/_modules/rl_coach/agents/dqn_agent.html
@@ -250,6 +250,9 @@
else:assertTrue,'The available values for targets_horizon are: 1-Step, N-Step'
+ # add Q value samples for logging
+ self.q_values.add_sample(state_value_head_targets)
+
# trainresult=self.networks['main'].online_network.accumulate_gradients(batch.states(network_keys),[state_value_head_targets])
diff --git a/docs/_modules/rl_coach/agents/nec_agent.html b/docs/_modules/rl_coach/agents/nec_agent.html
index 5f1e0f459..51d49fd3e 100644
--- a/docs/_modules/rl_coach/agents/nec_agent.html
+++ b/docs/_modules/rl_coach/agents/nec_agent.html
@@ -313,7 +313,7 @@
Source code for rl_coach.agents.nec_agent
TD_targets=self.networks['main'].online_network.predict(batch.states(network_keys))bootstrapped_return_from_old_policy=batch.n_step_discounted_rewards()# only update the action that we have actually done in this transition
- foriinrange(self.ap.network_wrappers['main'].batch_size):
+ foriinrange(batch.size):TD_targets[i,batch.actions()[i]]=bootstrapped_return_from_old_policy[i]# set the gradients to fetch for the DND update
@@ -342,7 +342,7 @@
(self.networks['main'].online_network,batch.states(network_keys))])
+ # add Q value samples for logging
+ self.q_values.add_sample(self.get_q_values(current_quantiles))
+
# get the optimal actions to take for the next statestarget_actions=np.argmax(self.get_q_values(next_state_quantiles),axis=1)# calculate the Bellman update
- batch_idx=list(range(self.ap.network_wrappers['main'].batch_size))
+ batch_idx=list(range(batch.size))TD_targets=batch.rewards(True)+(1.0-batch.game_overs(True))*self.ap.algorithm.discount \
*next_state_quantiles[batch_idx,target_actions]
@@ -283,9 +286,9 @@
Source code for rl_coach.agents.qr_dqn_agent
# calculate the cumulative quantile probabilities and reorder them to fit the sorted quantiles order
cumulative_probabilities=np.array(range(self.ap.algorithm.atoms+1))/float(self.ap.algorithm.atoms)# tau_iquantile_midpoints=0.5*(cumulative_probabilities[1:]+cumulative_probabilities[:-1])# tau^hat_i
- quantile_midpoints=np.tile(quantile_midpoints,(self.ap.network_wrappers['main'].batch_size,1))
+ quantile_midpoints=np.tile(quantile_midpoints,(batch.size,1))sorted_quantiles=np.argsort(current_quantiles[batch_idx,batch.actions()])
- foridxinrange(self.ap.network_wrappers['main'].batch_size):
+ foridxinrange(batch.size):quantile_midpoints[idx,:]=quantile_midpoints[idx,sorted_quantiles[idx]]# train
diff --git a/docs/_modules/rl_coach/agents/rainbow_dqn_agent.html b/docs/_modules/rl_coach/agents/rainbow_dqn_agent.html
index 542e9e397..601dd1350 100644
--- a/docs/_modules/rl_coach/agents/rainbow_dqn_agent.html
+++ b/docs/_modules/rl_coach/agents/rainbow_dqn_agent.html
@@ -240,9 +240,12 @@
Source code for rl_coach.agents.rainbow_dqn_agent
def __init__(self):super().__init__()self.algorithm=RainbowDQNAlgorithmParameters()
+
+ # ParameterNoiseParameters is changing the network wrapper parameters. This line needs to be done first.
+ self.network_wrappers={"main":RainbowDQNNetworkParameters()}
+
self.exploration=ParameterNoiseParameters(self)self.memory=PrioritizedExperienceReplayParameters()
- self.network_wrappers={"main":RainbowDQNNetworkParameters()}@propertydefpath(self):
@@ -275,11 +278,14 @@
Source code for rl_coach.agents.rainbow_dqn_agent
(self.networks['main'].online_network,batch.states(network_keys))])
+ # add Q value samples for logging
+ self.q_values.add_sample(self.distribution_prediction_to_q_values(TD_targets))
+
# only update the action that we have actually done in this transition (using the Double-DQN selected actions)target_actions=ddqn_selected_actions
- m=np.zeros((self.ap.network_wrappers['main'].batch_size,self.z_values.size))
+ m=np.zeros((batch.size,self.z_values.size))
- batches=np.arange(self.ap.network_wrappers['main'].batch_size)
+ batches=np.arange(batch.size)forjinrange(self.z_values.size):# we use batch.info('should_bootstrap_next_state') instead of (1 - batch.game_overs()) since with n-step,# we will not bootstrap for the last n-step transitions in the episode
diff --git a/docs/_modules/rl_coach/agents/soft_actor_critic_agent.html b/docs/_modules/rl_coach/agents/soft_actor_critic_agent.html
new file mode 100644
index 000000000..33668b2fe
--- /dev/null
+++ b/docs/_modules/rl_coach/agents/soft_actor_critic_agent.html
@@ -0,0 +1,554 @@
+
+
+
+
+
+
+
+
+
+
+ rl_coach.agents.soft_actor_critic_agent — Reinforcement Learning Coach 0.11.0 documentation
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
Source code for rl_coach.agents.soft_actor_critic_agent
+#
+# Copyright (c) 2019 Intel Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+fromtypingimportUnion
+importcopy
+importnumpyasnp
+fromcollectionsimportOrderedDict
+
+fromrl_coach.agents.agentimportAgent
+fromrl_coach.agents.policy_optimization_agentimportPolicyOptimizationAgent
+
+fromrl_coach.architectures.head_parametersimportSACQHeadParameters,SACPolicyHeadParameters,VHeadParameters
+fromrl_coach.architectures.middleware_parametersimportFCMiddlewareParameters
+fromrl_coach.base_parametersimportAlgorithmParameters,NetworkParameters,AgentParameters,EmbedderScheme,MiddlewareScheme
+fromrl_coach.core_typesimportActionInfo,EnvironmentSteps,RunPhase
+fromrl_coach.exploration_policies.additive_noiseimportAdditiveNoiseParameters
+fromrl_coach.memories.non_episodic.experience_replayimportExperienceReplayParameters
+fromrl_coach.architectures.embedder_parametersimportInputEmbedderParameters
+fromrl_coach.spacesimportBoxActionSpace
+
+
+# There are 3 networks in SAC implementation. All have the same topology but parameters are not shared.
+# The networks are:
+# 1. State Value Network - SACValueNetwork
+# 2. Soft Q Value Network - SACCriticNetwork
+# 3. Policy Network - SACPolicyNetwork - currently supporting only Gaussian Policy
+
+
+# 1. State Value Network - SACValueNetwork
+# this is the state value network in SAC.
+# The network is trained to predict (regression) the state value in the max-entropy settings
+# The objective to be minimized is given in equation (5) in the paper:
+#
+# J(psi)= E_(s~D)[0.5*(V_psi(s)-y(s))^2]
+# where y(s) = E_(a~pi)[Q_theta(s,a)-log(pi(a|s))]
+
+
+# Default parameters for value network:
+# topology :
+# input embedder : EmbedderScheme.Medium (Dense(256)) , relu activation
+# middleware : EmbedderScheme.Medium (Dense(256)) , relu activation
+
+
+classSACValueNetworkParameters(NetworkParameters):
+ def__init__(self):
+ super().__init__()
+ self.input_embedders_parameters={'observation':InputEmbedderParameters(activation_function='relu')}
+ self.middleware_parameters=FCMiddlewareParameters(activation_function='relu')
+ self.heads_parameters=[VHeadParameters(initializer='xavier')]
+ self.rescale_gradient_from_head_by_factor=[1]
+ self.optimizer_type='Adam'
+ self.batch_size=256
+ self.async_training=False
+ self.learning_rate=0.0003# 3e-4 see appendix D in the paper
+ self.create_target_network=True# tau is set in SoftActorCriticAlgorithmParameters.rate_for_copying_weights_to_target
+
+
+# 2. Soft Q Value Network - SACCriticNetwork
+# the whole network is built in the SACQHeadParameters. we use empty input embedder and middleware
+classSACCriticNetworkParameters(NetworkParameters):
+ def__init__(self):
+ super().__init__()
+ self.input_embedders_parameters={'observation':InputEmbedderParameters(scheme=EmbedderScheme.Empty)}
+ self.middleware_parameters=FCMiddlewareParameters(scheme=MiddlewareScheme.Empty)
+ self.heads_parameters=[SACQHeadParameters()]# SACQHeadParameters includes the topology of the head
+ self.rescale_gradient_from_head_by_factor=[1]
+ self.optimizer_type='Adam'
+ self.batch_size=256
+ self.async_training=False
+ self.learning_rate=0.0003
+ self.create_target_network=False
+
+
+# 3. policy Network
+# Default parameters for policy network:
+# topology :
+# input embedder : EmbedderScheme.Medium (Dense(256)) , relu activation
+# middleware : EmbedderScheme = [Dense(256)] , relu activation --> scheme should be overridden in preset
+classSACPolicyNetworkParameters(NetworkParameters):
+ def__init__(self):
+ super().__init__()
+ self.input_embedders_parameters={'observation':InputEmbedderParameters(activation_function='relu')}
+ self.middleware_parameters=FCMiddlewareParameters(activation_function='relu')
+ self.heads_parameters=[SACPolicyHeadParameters()]
+ self.rescale_gradient_from_head_by_factor=[1]
+ self.optimizer_type='Adam'
+ self.batch_size=256
+ self.async_training=False
+ self.learning_rate=0.0003
+ self.create_target_network=False
+ self.l2_regularization=0# weight decay regularization. not used in the original paper
+
+
+# Algorithm Parameters
+
+
[docs]classSoftActorCriticAlgorithmParameters(AlgorithmParameters):
+ """
+ :param num_steps_between_copying_online_weights_to_target: (StepMethod)
+ The number of steps between copying the online network weights to the target network weights.
+
+ :param rate_for_copying_weights_to_target: (float)
+ When copying the online network weights to the target network weights, a soft update will be used, which
+ weight the new online network weights by rate_for_copying_weights_to_target. (Tau as defined in the paper)
+
+ :param use_deterministic_for_evaluation: (bool)
+ If True, during the evaluation phase, action are chosen deterministically according to the policy mean
+ and not sampled from the policy distribution.
+ """
+ def__init__(self):
+ super().__init__()
+ self.num_steps_between_copying_online_weights_to_target=EnvironmentSteps(1)
+ self.rate_for_copying_weights_to_target=0.005
+ self.use_deterministic_for_evaluation=True# evaluate agent using deterministic policy (i.e. take the mean value)
+
+
+classSoftActorCriticAgentParameters(AgentParameters):
+ def__init__(self):
+ super().__init__(algorithm=SoftActorCriticAlgorithmParameters(),
+ exploration=AdditiveNoiseParameters(),
+ memory=ExperienceReplayParameters(),# SAC doesnt use episodic related data
+ # network wrappers:
+ networks=OrderedDict([("policy",SACPolicyNetworkParameters()),
+ ("q",SACCriticNetworkParameters()),
+ ("v",SACValueNetworkParameters())]))
+
+ @property
+ defpath(self):
+ return'rl_coach.agents.soft_actor_critic_agent:SoftActorCriticAgent'
+
+
+# Soft Actor Critic - https://arxiv.org/abs/1801.01290
+classSoftActorCriticAgent(PolicyOptimizationAgent):
+ def__init__(self,agent_parameters,parent:Union['LevelManager','CompositeAgent']=None):
+ super().__init__(agent_parameters,parent)
+ self.last_gradient_update_step_idx=0
+
+ # register signals to track (in learn_from_batch)
+ self.policy_means=self.register_signal('Policy_mu_avg')
+ self.policy_logsig=self.register_signal('Policy_logsig')
+ self.policy_logprob_sampled=self.register_signal('Policy_logp_sampled')
+ self.policy_grads=self.register_signal('Policy_grads_sumabs')
+
+ self.q1_values=self.register_signal("Q1")
+ self.TD_err1=self.register_signal("TD err1")
+ self.q2_values=self.register_signal("Q2")
+ self.TD_err2=self.register_signal("TD err2")
+ self.v_tgt_ns=self.register_signal('V_tgt_ns')
+ self.v_onl_ys=self.register_signal('V_onl_ys')
+ self.action_signal=self.register_signal("actions")
+
+ deflearn_from_batch(self,batch):
+ #########################################
+ # need to update the following networks:
+ # 1. actor (policy)
+ # 2. state value (v)
+ # 3. critic (q1 and q2)
+ # 4. target network - probably already handled by V
+
+ #########################################
+ # define the networks to be used
+
+ # State Value Network
+ value_network=self.networks['v']
+ value_network_keys=self.ap.network_wrappers['v'].input_embedders_parameters.keys()
+
+ # Critic Network
+ q_network=self.networks['q'].online_network
+ q_head=q_network.output_heads[0]
+ q_network_keys=self.ap.network_wrappers['q'].input_embedders_parameters.keys()
+
+ # Actor (policy) Network
+ policy_network=self.networks['policy'].online_network
+ policy_network_keys=self.ap.network_wrappers['policy'].input_embedders_parameters.keys()
+
+ ##########################################
+ # 1. updating the actor - according to (13) in the paper
+ policy_inputs=copy.copy(batch.states(policy_network_keys))
+ policy_results=policy_network.predict(policy_inputs)
+
+ policy_mu,policy_std,sampled_raw_actions,sampled_actions,sampled_actions_logprob, \
+ sampled_actions_logprob_mean=policy_results
+
+ self.policy_means.add_sample(policy_mu)
+ self.policy_logsig.add_sample(policy_std)
+ self.policy_logprob_sampled.add_sample(sampled_actions_logprob_mean)
+
+ # get the state-action values for the replayed states and their corresponding actions from the policy
+ q_inputs=copy.copy(batch.states(q_network_keys))
+ q_inputs['output_0_0']=sampled_actions
+ log_target=q_network.predict(q_inputs)[0].squeeze()
+
+ # log internal q values
+ q1_vals,q2_vals=q_network.predict(q_inputs,outputs=[q_head.q1_output,q_head.q2_output])
+ self.q1_values.add_sample(q1_vals)
+ self.q2_values.add_sample(q2_vals)
+
+ # calculate the gradients according to (13)
+ # get the gradients of log_prob w.r.t the weights (parameters) - indicated as phi in the paper
+ initial_feed_dict={policy_network.gradients_weights_ph[5]:np.array(1.0)}
+ dlogp_dphi=policy_network.predict(policy_inputs,
+ outputs=policy_network.weighted_gradients[5],
+ initial_feed_dict=initial_feed_dict)
+
+ # calculate dq_da
+ dq_da=q_network.predict(q_inputs,
+ outputs=q_network.gradients_wrt_inputs[1]['output_0_0'])
+
+ # calculate da_dphi
+ initial_feed_dict={policy_network.gradients_weights_ph[3]:dq_da}
+ dq_dphi=policy_network.predict(policy_inputs,
+ outputs=policy_network.weighted_gradients[3],
+ initial_feed_dict=initial_feed_dict)
+
+ # now given dlogp_dphi, dq_dphi we need to calculate the policy gradients according to (13)
+ policy_grads=[dlogp_dphi[l]-dq_dphi[l]forlinrange(len(dlogp_dphi))]
+
+ # apply the gradients to policy networks
+ policy_network.apply_gradients(policy_grads)
+ grads_sumabs=np.sum([np.sum(np.abs(policy_grads[l]))forlinrange(len(policy_grads))])
+ self.policy_grads.add_sample(grads_sumabs)
+
+ ##########################################
+ # 2. updating the state value online network weights
+ # done by calculating the targets for the v head according to (5) in the paper
+ # value_targets = log_targets-sampled_actions_logprob
+ value_inputs=copy.copy(batch.states(value_network_keys))
+ value_targets=log_target-sampled_actions_logprob
+
+ self.v_onl_ys.add_sample(value_targets)
+
+ # call value_network apply gradients with this target
+ value_loss=value_network.online_network.train_on_batch(value_inputs,value_targets[:,None])[0]
+
+ ##########################################
+ # 3. updating the critic (q networks)
+ # updating q networks according to (7) in the paper
+
+ # define the input to the q network: state has been already updated previously, but now we need
+ # the actions from the batch (and not those sampled by the policy)
+ q_inputs['output_0_0']=batch.actions(len(batch.actions().shape)==1)
+
+ # define the targets : scale_reward * reward + (1-terminal)*discount*v_target_next_state
+ # define v_target_next_state
+ value_inputs=copy.copy(batch.next_states(value_network_keys))
+ v_target_next_state=value_network.target_network.predict(value_inputs)
+ self.v_tgt_ns.add_sample(v_target_next_state)
+ # Note: reward is assumed to be rescaled by RewardRescaleFilter in the preset parameters
+ TD_targets=batch.rewards(expand_dims=True)+ \
+ (1.0-batch.game_overs(expand_dims=True))*self.ap.algorithm.discount*v_target_next_state
+
+ # call critic network update
+ result=q_network.train_on_batch(q_inputs,TD_targets,additional_fetches=[q_head.q1_loss,q_head.q2_loss])
+ total_loss,losses,unclipped_grads=result[:3]
+ q1_loss,q2_loss=result[3]
+ self.TD_err1.add_sample(q1_loss)
+ self.TD_err2.add_sample(q2_loss)
+
+ ##########################################
+ # 4. updating the value target network
+ # I just need to set the parameter rate_for_copying_weights_to_target in the agent parameters to be 1-tau
+ # where tau is the hyper parameter as defined in sac original implementation
+
+ returntotal_loss,losses,unclipped_grads
+
+ defget_prediction(self,states):
+ """
+ get the mean and stdev of the policy distribution given 'states'
+ :param states: the states for which we need to sample actions from the policy
+ :return: mean and stdev
+ """
+ tf_input_state=self.prepare_batch_for_inference(states,'policy')
+ returnself.networks['policy'].online_network.predict(tf_input_state)
+
+ deftrain(self):
+ # since the algorithm works with experience replay buffer (non-episodic),
+ # we cant use the policy optimization train method. we need Agent.train
+ # note that since in Agent.train there is no apply_gradients, we need to do it in learn from batch
+ returnAgent.train(self)
+
+ defchoose_action(self,curr_state):
+ """
+ choose_action - chooses the most likely action
+ if 'deterministic' - take the mean of the policy which is the prediction of the policy network.
+ else - use the exploration policy
+ :param curr_state:
+ :return: action wrapped in ActionInfo
+ """
+ ifnotisinstance(self.spaces.action,BoxActionSpace):
+ raiseValueError("SAC works only for continuous control problems")
+ # convert to batch so we can run it through the network
+ tf_input_state=self.prepare_batch_for_inference(curr_state,'policy')
+ # use the online network for prediction
+ policy_network=self.networks['policy'].online_network
+ policy_head=policy_network.output_heads[0]
+ result=policy_network.predict(tf_input_state,
+ outputs=[policy_head.policy_mean,policy_head.actions])
+ action_mean,action_sample=result
+
+ # if using deterministic policy, take the mean values. else, use exploration policy to sample from the pdf
+ ifself.phase==RunPhase.TESTandself.ap.algorithm.use_deterministic_for_evaluation:
+ action=action_mean[0]
+ else:
+ action=action_sample[0]
+
+ self.action_signal.add_sample(action)
+
+ action_info=ActionInfo(action=action)
+ returnaction_info
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/docs/_modules/rl_coach/agents/value_optimization_agent.html b/docs/_modules/rl_coach/agents/value_optimization_agent.html
index 02aacb458..730229c98 100644
--- a/docs/_modules/rl_coach/agents/value_optimization_agent.html
+++ b/docs/_modules/rl_coach/agents/value_optimization_agent.html
@@ -193,16 +193,17 @@
Source code for rl_coach.agents.value_optimization_agent
# See the License for the specific language governing permissions and
# limitations under the License.#
-
+fromcollectionsimportOrderedDictfromtypingimportUnionimportnumpyasnpfromrl_coach.agents.agentimportAgent
-fromrl_coach.core_typesimportActionInfo,StateType
+fromrl_coach.core_typesimportActionInfo,StateType,Batch
+fromrl_coach.loggerimportscreenfromrl_coach.memories.non_episodic.prioritized_experience_replayimportPrioritizedExperienceReplayfromrl_coach.spacesimportDiscreteActionSpace
-
+fromcopyimportdeepcopy## This is an abstract agent - there is no learn_from_batch method ##
@@ -229,8 +230,9 @@
Source code for rl_coach.agents.value_optimization_agent
Source code for rl_coach.agents.value_optimization_agent
# this is for bootstrapped dqn
iftype(actions_q_values)==listandlen(actions_q_values)>0:actions_q_values=self.exploration_policy.last_action_values
- actions_q_values=actions_q_values.squeeze()# store the q values statistics for loggingself.q_values.add_sample(actions_q_values)
+
+ actions_q_values=actions_q_values.squeeze()
+
fori,q_valueinenumerate(actions_q_values):self.q_value_for_action[i].add_sample(q_value)
@@ -276,6 +280,77 @@
Source code for rl_coach.agents.value_optimization_agent
def learn_from_batch(self,batch):raiseNotImplementedError("ValueOptimizationAgent is an abstract agent. Not to be used directly.")
+
+ defrun_off_policy_evaluation(self):
+ """
+ Run the off-policy evaluation estimators to get a prediction for the performance of the current policy based on
+ an evaluation dataset, which was collected by another policy(ies).
+ :return: None
+ """
+ assertself.ope_manager
+ dataset_as_episodes=self.call_memory('get_all_complete_episodes_from_to',
+ (self.call_memory('get_last_training_set_episode_id')+1,
+ self.call_memory('num_complete_episodes')))
+ iflen(dataset_as_episodes)==0:
+ raiseValueError('train_to_eval_ratio is too high causing the evaluation set to be empty. '
+ 'Consider decreasing its value.')
+
+ ips,dm,dr,seq_dr=self.ope_manager.evaluate(
+ dataset_as_episodes=dataset_as_episodes,
+ batch_size=self.ap.network_wrappers['main'].batch_size,
+ discount_factor=self.ap.algorithm.discount,
+ reward_model=self.networks['reward_model'].online_network,
+ q_network=self.networks['main'].online_network,
+ network_keys=list(self.ap.network_wrappers['main'].input_embedders_parameters.keys()))
+
+ # get the estimators out to the screen
+ log=OrderedDict()
+ log['Epoch']=self.training_epoch
+ log['IPS']=ips
+ log['DM']=dm
+ log['DR']=dr
+ log['Sequential-DR']=seq_dr
+ screen.log_dict(log,prefix='Off-Policy Evaluation')
+
+ # get the estimators out to dashboard
+ self.agent_logger.set_current_time(self.get_current_time()+1)
+ self.agent_logger.create_signal_value('Inverse Propensity Score',ips)
+ self.agent_logger.create_signal_value('Direct Method Reward',dm)
+ self.agent_logger.create_signal_value('Doubly Robust',dr)
+ self.agent_logger.create_signal_value('Sequential Doubly Robust',seq_dr)
+
+ defget_reward_model_loss(self,batch:Batch):
+ network_keys=self.ap.network_wrappers['reward_model'].input_embedders_parameters.keys()
+ current_rewards_prediction_for_all_actions=self.networks['reward_model'].online_network.predict(
+ batch.states(network_keys))
+ current_rewards_prediction_for_all_actions[range(batch.size),batch.actions()]=batch.rewards()
+
+ returnself.networks['reward_model'].train_and_sync_networks(
+ batch.states(network_keys),current_rewards_prediction_for_all_actions)[0]
+
+ defimprove_reward_model(self,epochs:int):
+ """
+ Train a reward model to be used by the doubly-robust estimator
+
+ :param epochs: The total number of epochs to use for training a reward model
+ :return: None
+ """
+ batch_size=self.ap.network_wrappers['reward_model'].batch_size
+
+ # this is fitted from the training dataset
+ forepochinrange(epochs):
+ loss=0
+ total_transitions_processed=0
+ fori,batchinenumerate(self.call_memory('get_shuffled_data_generator',batch_size)):
+ batch=Batch(batch)
+ loss+=self.get_reward_model_loss(batch)
+ total_transitions_processed+=batch.size
+
+ log=OrderedDict()
+ log['Epoch']=epoch
+ log['loss']=loss/total_transitions_processed
+ screen.log_dict(log,prefix='Training Reward Model')
+
# distributed agents paramsself.share_statistics_between_workers=True
- # intrinsic reward
- self.scale_external_reward_by_intrinsic_reward_value=False
-
# n-step returnsself.n_step=-1# calculate the total return (no bootstrap, by default)
@@ -470,7 +468,8 @@
Source code for rl_coach.base_parameters
batch_size=32,replace_mse_with_huber_loss=False,create_target_network=False,
- tensorflow_support=True):
+ tensorflow_support=True,
+ softmax_temperature=1):""" :param force_cpu: Force the neural networks to run on the CPU even if a GPU is available
@@ -553,6 +552,8 @@
Source code for rl_coach.base_parameters
online network at will. :param tensorflow_support: A flag which specifies if the network is supported by the TensorFlow framework.
+ :param softmax_temperature:
+ If a softmax is present in the network head output, use this temperature """super().__init__()self.framework=Frameworks.tensorflow
@@ -583,16 +584,19 @@
[docs]classTaskParameters(Parameters):
- def__init__(self,framework_type:Frameworks=Frameworks.tensorflow,evaluate_only:bool=False,use_cpu:bool=False,
+ def__init__(self,framework_type:Frameworks=Frameworks.tensorflow,evaluate_only:int=None,use_cpu:bool=False,experiment_path='/tmp',seed=None,checkpoint_save_secs=None,checkpoint_restore_dir=None,
- checkpoint_save_dir=None,export_onnx_graph:bool=False,apply_stop_condition:bool=False,
- num_gpu:int=1):
+ checkpoint_restore_path=None,checkpoint_save_dir=None,export_onnx_graph:bool=False,
+ apply_stop_condition:bool=False,num_gpu:int=1):""" :param framework_type: deep learning framework type. currently only tensorflow is supported
- :param evaluate_only: the task will be used only for evaluating the model
+ :param evaluate_only: if not None, the task will be used only for evaluating the model for the given number of steps.
+ A value of 0 means that task will be evaluated for an infinite number of steps. :param use_cpu: use the cpu for this task :param experiment_path: the path to the directory which will store all the experiment outputs :param seed: a seed to use for the random numbers generator :param checkpoint_save_secs: the number of seconds between each checkpoint saving
- :param checkpoint_restore_dir: the directory to restore the checkpoints from
+ :param checkpoint_restore_dir:
+ [DEPECRATED - will be removed in one of the next releases - switch to checkpoint_restore_path]
+ the dir to restore the checkpoints from
+ :param checkpoint_restore_path: the path to restore the checkpoints from :param checkpoint_save_dir: the directory to store the checkpoints in :param export_onnx_graph: If set to True, this will export an onnx graph each time a checkpoint is saved :param apply_stop_condition: If set to True, this will apply the stop condition defined by reaching a target success rate
@@ -753,7 +762,13 @@
Source code for rl_coach.base_parameters
self.use_cpu=use_cpuself.experiment_path=experiment_pathself.checkpoint_save_secs=checkpoint_save_secs
- self.checkpoint_restore_dir=checkpoint_restore_dir
+ ifcheckpoint_restore_dir:
+ screen.warning('TaskParameters.checkpoint_restore_dir is DEPECRATED and will be removed in one of the next '
+ 'releases. Please switch to using TaskParameters.checkpoint_restore_path, with your '
+ 'directory path. ')
+ self.checkpoint_restore_path=checkpoint_restore_dir
+ else:
+ self.checkpoint_restore_path=checkpoint_restore_pathself.checkpoint_save_dir=checkpoint_save_dirself.seed=seedself.export_onnx_graph=export_onnx_graph
@@ -763,13 +778,14 @@
Source code for rl_coach.base_parameters
[docs]classDistributedTaskParameters(TaskParameters):def__init__(self,framework_type:Frameworks,parameters_server_hosts:str,worker_hosts:str,job_type:str,
- task_index:int,evaluate_only:bool=False,num_tasks:int=None,
+ task_index:int,evaluate_only:int=None,num_tasks:int=None,num_training_tasks:int=None,use_cpu:bool=False,experiment_path=None,dnd=None,
- shared_memory_scratchpad=None,seed=None,checkpoint_save_secs=None,checkpoint_restore_dir=None,
+ shared_memory_scratchpad=None,seed=None,checkpoint_save_secs=None,checkpoint_restore_path=None,checkpoint_save_dir=None,export_onnx_graph:bool=False,apply_stop_condition:bool=False):""" :param framework_type: deep learning framework type. currently only tensorflow is supported
- :param evaluate_only: the task will be used only for evaluating the model
+ :param evaluate_only: if not None, the task will be used only for evaluating the model for the given number of steps.
+ A value of 0 means that task will be evaluated for an infinite number of steps. :param parameters_server_hosts: comma-separated list of hostname:port pairs to which the parameter servers are assigned :param worker_hosts: comma-separated list of hostname:port pairs to which the workers are assigned
@@ -782,7 +798,7 @@
Source code for rl_coach.base_parameters
:param dnd: an external DND to use for NEC. This is a workaround needed for a shared DND not using the scratchpad. :param seed: a seed to use for the random numbers generator :param checkpoint_save_secs: the number of seconds between each checkpoint saving
- :param checkpoint_restore_dir: the directory to restore the checkpoints from
+ :param checkpoint_restore_path: the path to restore the checkpoints from :param checkpoint_save_dir: the directory to store the checkpoints in :param export_onnx_graph: If set to True, this will export an onnx graph each time a checkpoint is saved :param apply_stop_condition: If set to True, this will apply the stop condition defined by reaching a target success rate
@@ -790,7 +806,7 @@
# See the License for the specific language governing permissions and# limitations under the License.#
-
+fromcollectionsimportnamedtupleimportcopy
+importmathfromenumimportEnumfromrandomimportshufflefromtypingimportList,Union,Dict,Any,Type
@@ -218,6 +219,17 @@
defnum_steps(self,val:int)->None:self._num_steps=val
+ def__eq__(self,other):
+ returnself.num_steps==other.num_steps
+
+ def__truediv__(self,other):
+ """
+ divide this step method with other. If other is an integer, returns an object of the same
+ type as self. If other is the same type of self, returns an integer. In either case, any
+ floating point value is rounded up under the assumption that if we are dividing Steps, we
+ would rather overestimate than underestimate.
+ """
+ ifisinstance(other,type(self)):
+ returnmath.ceil(self.num_steps/other.num_steps)
+ elifisinstance(other,int):
+ returntype(self)(math.ceil(self.num_steps/other))
+ else:
+ raiseTypeError("cannot divide {} by {}".format(type(self),type(other)))
+
+ def__rtruediv__(self,other):
+ """
+ divide this step method with other. If other is an integer, returns an object of the same
+ type as self. If other is the same type of self, returns an integer. In either case, any
+ floating point value is rounded up under the assumption that if we are dividing Steps, we
+ would rather overestimate than underestimate.
+ """
+ ifisinstance(other,type(self)):
+ returnmath.ceil(other.num_steps/self.num_steps)
+ elifisinstance(other,int):
+ returntype(self)(math.ceil(other/self.num_steps))
+ else:
+ raiseTypeError("cannot divide {} by {}".format(type(other),type(self)))
+
classFrames(StepMethod):def__init__(self,num_steps):
@@ -429,6 +472,9 @@
"""def__init__(self,action:ActionType,all_action_probabilities:float=0,
- action_value:float=0.,state_value:float=0.,max_action_value:float=None,
- action_intrinsic_reward:float=0):
+ action_value:float=0.,state_value:float=0.,max_action_value:float=None):""" :param action: the action :param all_action_probabilities: the probability that the action was given when selecting it
@@ -520,8 +565,6 @@
Source code for rl_coach.core_types
:param max_action_value: in case this is an action that was selected randomly, this is the value of the action that received the maximum value. if no value is given, the action is assumed to be the action with the maximum value
- :param action_intrinsic_reward: can contain any intrinsic reward that the agent wants to add to this action
- selection """self.action=actionself.all_action_probabilities=all_action_probabilities
@@ -530,8 +573,7 @@
returnTrueelse:returnFalse
+
+
+# TODO move to a NamedTuple, once we move to Python3.6
+# https://stackoverflow.com/questions/34269772/type-hints-in-namedtuple/34269877
+classCsvDataset(object):
+ def__init__(self,filepath:str,is_episodic:bool=True):
+ self.filepath=filepath
+ self.is_episodic=is_episodic
+
+
+classPickledReplayBuffer(object):
+ def__init__(self,filepath:str):
+ self.filepath=filepath
Source code for rl_coach.data_stores.s3_data_store
return Truedefsave_to_store(self):
+ self._save_to_store(self.params.checkpoint_dir)
+
+ def_save_to_store(self,checkpoint_dir):""" save_to_store() uploads the policy checkpoint, gifs and videos to the S3 data store. It reads the checkpoint state files and uploads only the latest checkpoint files to S3. It is used by the trainer in Coach when used in the distributed mode.
@@ -268,24 +271,32 @@
Source code for rl_coach.data_stores.s3_data_store
Source code for rl_coach.data_stores.s3_data_store
if self.params.expt_dirandos.path.exists(os.path.join(self.params.expt_dir,'gifs')):forfilenameinos.listdir(os.path.join(self.params.expt_dir,'gifs')):self.mc.fput_object(self.params.bucket_name,filename,os.path.join(self.params.expt_dir,'gifs',filename))
+
exceptResponseErrorase:print("Got exception: %s\n while saving to S3",e)
@@ -337,6 +349,18 @@
Source code for rl_coach.data_stores.s3_data_store
Source code for rl_coach.environments.gym_environment
random.seed(self.seed)# frame skip and max between consecutive frames
- self.is_robotics_env='robotics'instr(self.env.unwrapped.__class__)self.is_mujoco_env='mujoco'instr(self.env.unwrapped.__class__)self.is_roboschool_env='roboschool'instr(self.env.unwrapped.__class__)self.is_atari_env='Atari'instr(self.env.unwrapped.__class__)
@@ -501,7 +500,7 @@
Source code for rl_coach.environments.gym_environment
Source code for rl_coach.environments.gym_environment
# initialize the number of lives
self._update_ale_lives()
- def_set_mujoco_camera(self,camera_idx:int):
- """
- This function can be used to set the camera for rendering the mujoco simulator
- :param camera_idx: The index of the camera to use. Should be defined in the model
- :return: None
- """
- ifself.env.unwrapped.viewerisnotNoneandself.env.unwrapped.viewer.cam.fixedcamid!=camera_idx:
- frommujoco_py.generatedimportconst
- self.env.unwrapped.viewer.cam.type=const.CAMERA_FIXED
- self.env.unwrapped.viewer.cam.fixedcamid=camera_idx
-
- def_get_robotics_image(self):
- self.env.render()
- image=self.env.unwrapped._get_viewer().read_pixels(1600,900,depth=False)[::-1,:,:]
- image=scipy.misc.imresize(image,(270,480,3))
- returnimage
-
def_render(self):self.env.render(mode='human')
- # required for setting up a fixed camera for mujoco
- ifself.is_mujoco_envandnotself.is_roboschool_env:
- self._set_mujoco_camera(0)defget_rendered_image(self):
- ifself.is_robotics_env:
- # necessary for fetch since the rendered image is cropped to an irrelevant part of the simulator
- image=self._get_robotics_image()
- else:
- image=self.env.render(mode='rgb_array')
- # required for setting up a fixed camera for mujoco
- ifself.is_mujoco_envandnotself.is_roboschool_env:
- self._set_mujoco_camera(0)
+ image=self.env.render(mode='rgb_array')returnimagedefget_target_success_rate(self)->float:
diff --git a/docs/_modules/rl_coach/memories/episodic/episodic_experience_replay.html b/docs/_modules/rl_coach/memories/episodic/episodic_experience_replay.html
index 8fc232522..507e9a0af 100644
--- a/docs/_modules/rl_coach/memories/episodic/episodic_experience_replay.html
+++ b/docs/_modules/rl_coach/memories/episodic/episodic_experience_replay.html
@@ -179,6 +179,7 @@
Source code for rl_coach.memories.episodic.episodic_experience_replay
#
+## Copyright (c) 2017 Intel Corporation## Licensed under the Apache License, Version 2.0 (the "License");
@@ -193,14 +194,19 @@
Source code for rl_coach.memories.episodic.episodic_experience_replay
# See the License for the specific language governing permissions and
# limitations under the License.#
+importast
+importmath
-fromtypingimportList,Tuple,Union,Dict,Any
-
+importpandasaspd
+fromtypingimportList,Tuple,Unionimportnumpyasnp
+importrandomfromrl_coach.core_typesimportTransition,Episode
+fromrl_coach.loggerimportscreenfromrl_coach.memories.memoryimportMemory,MemoryGranularity,MemoryParameters
-fromrl_coach.utilsimportReaderWriterLock
+fromrl_coach.utilsimportReaderWriterLock,ProgressBar
+fromrl_coach.core_typesimportCsvDatasetclassEpisodicExperienceReplayParameters(MemoryParameters):
@@ -208,6 +214,7 @@
Source code for rl_coach.memories.episodic.episodic_experience_replay
super().__init__()self.max_size=(MemoryGranularity.Transitions,1000000)self.n_step=-1
+ self.train_to_eval_ratio=1# for OPE we'll want a value < 1@propertydefpath(self):
@@ -220,7 +227,9 @@
Source code for rl_coach.memories.episodic.episodic_experience_replay
calculations of total return and other values that depend on the sequential behavior of the transitions
in the episode. """
- def__init__(self,max_size:Tuple[MemoryGranularity,int]=(MemoryGranularity.Transitions,1000000),n_step=-1):
+
+ def__init__(self,max_size:Tuple[MemoryGranularity,int]=(MemoryGranularity.Transitions,1000000),n_step=-1,
+ train_to_eval_ratio:int=1):""" :param max_size: the maximum number of transitions or episodes to hold in the memory """
@@ -232,8 +241,11 @@
Source code for rl_coach.memories.episodic.episodic_experience_replay
self._num_transitions=0self._num_transitions_in_complete_episodes=0self.reader_writer_lock=ReaderWriterLock()
+ self.last_training_set_episode_id=None# used in batch-rl
+ self.last_training_set_transition_id=None# used in batch-rl
+ self.train_to_eval_ratio=train_to_eval_ratio# used in batch-rl
- deflength(self,lock:bool=False)->int:
+ deflength(self,lock:bool=False)->int:""" Get the number of episodes in the ER (even if they are not complete) """
@@ -255,6 +267,9 @@
Source code for rl_coach.memories.episodic.episodic_experience_replay
def num_transitions_in_complete_episodes(self):returnself._num_transitions_in_complete_episodes
+ defget_last_training_set_episode_id(self):
+ returnself.last_training_set_episode_id
+
defsample(self,size:int,is_consecutive_transitions=False)->List[Transition]:""" Sample a batch of transitions from the replay buffer. If the requested size is larger than the number
@@ -272,7 +287,7 @@
Source code for rl_coach.memories.episodic.episodic_experience_replay
Source code for rl_coach.memories.episodic.episodic_experience_replay
return batch
+ defget_episode_for_transition(self,transition:Transition)->(int,Episode):
+ """
+ Get the episode from which that transition came from.
+ :param transition: The transition to lookup the episode for
+ :return: (Episode number, the episode) or (-1, None) if could not find a matching episode.
+ """
+
+ fori,episodeinenumerate(self._buffer):
+ iftransitioninepisode.transitions:
+ returni,episode
+ return-1,None
+
+ defshuffle_episodes(self):
+ """
+ Shuffle all the episodes in the replay buffer
+ :return:
+ """
+ random.shuffle(self._buffer)
+ self.transitions=[tforeinself._bufferfortine.transitions]
+
+ defget_shuffled_data_generator(self,size:int)->List[Transition]:
+ """
+ Get an generator for iterating through the shuffled replay buffer, for processing the data in epochs.
+ If the requested size is larger than the number of samples available in the replay buffer then the batch will
+ return empty. The last returned batch may be smaller than the size requested, to accommodate for all the
+ transitions in the replay buffer.
+
+ :param size: the size of the batch to return
+ :return: a batch (list) of selected transitions from the replay buffer
+ """
+ self.reader_writer_lock.lock_writing()
+ ifself.last_training_set_transition_idisNone:
+ ifself.train_to_eval_ratio<0orself.train_to_eval_ratio>=1:
+ raiseValueError('train_to_eval_ratio should be in the (0, 1] range.')
+
+ transition=self.transitions[round(self.train_to_eval_ratio*self.num_transitions_in_complete_episodes())]
+ episode_num,episode=self.get_episode_for_transition(transition)
+ self.last_training_set_episode_id=episode_num
+ self.last_training_set_transition_id= \
+ len([tforeinself.get_all_complete_episodes_from_to(0,self.last_training_set_episode_id+1)fortine])
+
+ shuffled_transition_indices=list(range(self.last_training_set_transition_id))
+ random.shuffle(shuffled_transition_indices)
+
+ # The last batch drawn will usually be < batch_size (=the size variable)
+ foriinrange(math.ceil(len(shuffled_transition_indices)/size)):
+ sample_data=[self.transitions[j]forjinshuffled_transition_indices[i*size:(i+1)*size]]
+ self.reader_writer_lock.release_writing()
+
+ yieldsample_data
+
+ defget_all_complete_episodes_transitions(self)->List[Transition]:
+ """
+ Get all the transitions from all the complete episodes in the buffer
+ :return: a list of transitions
+ """
+ returnself.transitions[:self.num_transitions_in_complete_episodes()]
+
+ defget_all_complete_episodes(self)->List[Episode]:
+ """
+ Get all the transitions from all the complete episodes in the buffer
+ :return: a list of transitions
+ """
+ returnself.get_all_complete_episodes_from_to(0,self.num_complete_episodes())
+
+ defget_all_complete_episodes_from_to(self,start_episode_id,end_episode_id)->List[Episode]:
+ """
+ Get all the transitions from all the complete episodes in the buffer matching the given episode range
+ :return: a list of transitions
+ """
+ returnself._buffer[start_episode_id:end_episode_id]
+
def_enforce_max_length(self)->None:""" Make sure that the size of the replay buffer does not pass the maximum size allowed.
@@ -368,7 +455,7 @@
Source code for rl_coach.memories.episodic.episodic_experience_replay
self.reader_writer_lock.release_writing_and_reading()
- defstore_episode(self,episode:Episode,lock:bool=True)->None:
+ defstore_episode(self,episode:Episode,lock:bool=True)->None:""" Store a new episode in the memory. :param episode: the new episode to store
@@ -391,7 +478,7 @@
Source code for rl_coach.memories.episodic.episodic_experience_replay
if lock:self.reader_writer_lock.release_writing_and_reading()
- defget_episode(self,episode_index:int,lock:bool=True)->Union[None,Episode]:
+ defget_episode(self,episode_index:int,lock:bool=True)->Union[None,Episode]:""" Returns the episode in the given index. If the episode does not exist, returns None instead. :param episode_index: the index of the episode to return
@@ -436,7 +523,7 @@
Source code for rl_coach.memories.episodic.episodic_experience_replay
self.reader_writer_lock.release_writing_and_reading()# for API compatibility
- defget(self,episode_index:int,lock:bool=True)->Union[None,Episode]:
+ defget(self,episode_index:int,lock:bool=True)->Union[None,Episode]:""" Returns the episode in the given index. If the episode does not exist, returns None instead. :param episode_index: the index of the episode to return
@@ -494,7 +581,51 @@
Source code for rl_coach.memories.episodic.episodic_experience_replay
mean =np.mean([transition.rewardfortransitioninself.transitions])self.reader_writer_lock.release_writing()
- returnmean
+ returnmean
+
+ defload_csv(self,csv_dataset:CsvDataset)->None:
+ """
+ Restore the replay buffer contents from a csv file.
+ The csv file is assumed to include a list of transitions.
+ :param csv_dataset: A construct which holds the dataset parameters
+ """
+ df=pd.read_csv(csv_dataset.filepath)
+ iflen(df)>self.max_size[1]:
+ screen.warning("Warning! The number of transitions to load into the replay buffer ({}) is "
+ "bigger than the max size of the replay buffer ({}). The excessive transitions will "
+ "not be stored.".format(len(df),self.max_size[1]))
+
+ episode_ids=df['episode_id'].unique()
+ progress_bar=ProgressBar(len(episode_ids))
+ state_columns=[colforcolindf.columnsifcol.startswith('state_feature')]
+
+ fore_idinepisode_ids:
+ progress_bar.update(e_id)
+ df_episode_transitions=df[df['episode_id']==e_id]
+ episode=Episode()
+ for(_,current_transition),(_,next_transition)inzip(df_episode_transitions[:-1].iterrows(),
+ df_episode_transitions[1:].iterrows()):
+ state=np.array([current_transition[col]forcolinstate_columns])
+ next_state=np.array([next_transition[col]forcolinstate_columns])
+
+ episode.insert(
+ Transition(state={'observation':state},
+ action=current_transition['action'],reward=current_transition['reward'],
+ next_state={'observation':next_state},game_over=False,
+ info={'all_action_probabilities':
+ ast.literal_eval(current_transition['all_action_probabilities'])}))
+
+ # Set the last transition to end the episode
+ ifcsv_dataset.is_episodic:
+ episode.get_last_transition().game_over=True
+
+ self.store_episode(episode)
+
+ # close the progress bar
+ progress_bar.update(len(episode_ids))
+ progress_bar.close()
+
+ self.shuffle_episodes()
Source code for rl_coach.memories.non_episodic.experience_replay
# limitations under the License.
#
-fromtypingimportList,Tuple,Union,Dict,Any
+fromtypingimportList,Tuple,Unionimportpickle
-importsys
-importtime
+importrandom
+importmathimportnumpyasnp
@@ -252,7 +252,6 @@
Source code for rl_coach.memories.non_episodic.experience_replay
Sample a batch of transitions form the replay buffer. If the requested size is larger than the number
of samples available in the replay buffer then the batch will return empty. :param size: the size of the batch to sample
- :param beta: the beta parameter used for importance sampling :return: a batch (list) of selected transitions from the replay buffer """self.reader_writer_lock.lock_writing()
@@ -272,6 +271,28 @@
Source code for rl_coach.memories.non_episodic.experience_replay
self.reader_writer_lock.release_writing()returnbatch
+ defget_shuffled_data_generator(self,size:int)->List[Transition]:
+ """
+ Get an generator for iterating through the shuffled replay buffer, for processing the data in epochs.
+ If the requested size is larger than the number of samples available in the replay buffer then the batch will
+ return empty. The last returned batch may be smaller than the size requested, to accommodate for all the
+ transitions in the replay buffer.
+
+ :param size: the size of the batch to return
+ :return: a batch (list) of selected transitions from the replay buffer
+ """
+ self.reader_writer_lock.lock_writing()
+ shuffled_transition_indices=list(range(len(self.transitions)))
+ random.shuffle(shuffled_transition_indices)
+
+ # we deliberately drop some of the ending data which is left after dividing to batches of size `size`
+ # for i in range(math.ceil(len(shuffled_transition_indices) / size)):
+ foriinrange(int(len(shuffled_transition_indices)/size)):
+ sample_data=[self.transitions[j]forjinshuffled_transition_indices[i*size:(i+1)*size]]
+ self.reader_writer_lock.release_writing()
+
+ yieldsample_data
+
def_enforce_max_length(self)->None:""" Make sure that the size of the replay buffer does not pass the maximum size allowed.
@@ -395,7 +416,7 @@
Source code for rl_coach.memories.non_episodic.experience_replay
with open(file_path,'wb')asfile:pickle.dump(self.transitions,file)
- defload(self,file_path:str)->None:
+ defload_pickled(self,file_path:str)->None:""" Restore the replay buffer contents from a pickle file. The pickle file is assumed to include a list of transitions.
@@ -418,6 +439,7 @@
Source code for rl_coach.memories.non_episodic.experience_replay
Source code for rl_coach.orchestrators.kubernetes_orchestrator
self.s3_access_key=os.environ.get('ACCESS_KEY_ID')self.s3_secret_key=os.environ.get('SECRET_ACCESS_KEY')
- defsetup(self)->bool:
+ defsetup(self,crd=None)->bool:""" Deploys the memory backend and data stores if required. """
@@ -308,6 +308,9 @@
Source code for rl_coach.orchestrators.kubernetes_orchestrator
return Falseifself.params.data_store_params.store_type=="nfs":self.nfs_pvc=self.data_store.get_info()
+
+ # Upload checkpoints in checkpoint_restore_dir (if provided) to the data store
+ self.data_store.setup_checkpoint_dir(crd)returnTruedefdeploy_trainer(self)->bool:
@@ -321,7 +324,6 @@
Source code for rl_coach.orchestrators.kubernetes_orchestrator
"""def__init__(self,state:StateSpace,
- goal:ObservationSpace,
+ goal:Union[ObservationSpace,None],action:ActionSpace,reward:RewardSpace):self.state=state
diff --git a/docs/_sources/components/agents/index.rst.txt b/docs/_sources/components/agents/index.rst.txt
index 62aaf0eb3..476bc7a6c 100644
--- a/docs/_sources/components/agents/index.rst.txt
+++ b/docs/_sources/components/agents/index.rst.txt
@@ -21,6 +21,7 @@ A detailed description of those algorithms can be found by navigating to each of
imitation/cil
policy_optimization/cppo
policy_optimization/ddpg
+ policy_optimization/sac
other/dfp
value_optimization/double_dqn
value_optimization/dqn
diff --git a/docs/_sources/components/agents/policy_optimization/acer.rst.txt b/docs/_sources/components/agents/policy_optimization/acer.rst.txt
index 7808443fa..d16a0e728 100644
--- a/docs/_sources/components/agents/policy_optimization/acer.rst.txt
+++ b/docs/_sources/components/agents/policy_optimization/acer.rst.txt
@@ -38,6 +38,7 @@ Each update perform the following procedure:
.. math:: \text{where} \quad \bar{\rho}_{t} = \min{\left\{c,\rho_t\right\}},\quad \rho_t=\frac{\pi (a_t \mid s_t)}{\mu (a_t \mid s_t)}
3. **Accumulate gradients:**
+
:math:`\bullet` **Policy gradients (with bias correction):**
.. math:: \hat{g}_t^{policy} & = & \bar{\rho}_{t} \nabla \log \pi (a_t \mid s_t) [Q^{ret}(s_t,a_t) - V(s_t)] \\
diff --git a/docs/_sources/components/agents/policy_optimization/sac.rst.txt b/docs/_sources/components/agents/policy_optimization/sac.rst.txt
new file mode 100644
index 000000000..418c87eac
--- /dev/null
+++ b/docs/_sources/components/agents/policy_optimization/sac.rst.txt
@@ -0,0 +1,49 @@
+Soft Actor-Critic
+============
+
+**Actions space:** Continuous
+
+**References:** `Soft Actor-Critic: Off-Policy Maximum Entropy Deep Reinforcement Learning with a Stochastic Actor `_
+
+Network Structure
+-----------------
+
+.. image:: /_static/img/design_imgs/sac.png
+ :align: center
+
+Algorithm Description
+---------------------
+
+Choosing an action - Continuous actions
++++++++++++++++++++++++++++++++++++++
+
+The policy network is used in order to predict mean and log std for each action. While training, a sample is taken
+from a Gaussian distribution with these mean and std values. When testing, the agent can choose deterministically
+by picking the mean value or sample from a gaussian distribution like in training.
+
+Training the network
+++++++++++++++++++++
+Start by sampling a batch :math:`B` of transitions from the experience replay.
+
+* To train the **Q network**, use the following targets:
+
+ .. math:: y_t^Q=r(s_t,a_t)+\gamma \cdot V(s_{t+1})
+
+ The state value used in the above target is acquired by running the target state value network.
+
+* To train the **State Value network**, use the following targets:
+
+ .. math:: y_t^V = \min_{i=1,2}Q_i(s_t,\tilde{a}) - log\pi (\tilde{a} \vert s),\,\,\,\, \tilde{a} \sim \pi(\cdot \vert s_t)
+
+ The state value network is trained using a sample-based approximation of the connection between and state value and state
+ action values, The actions used for constructing the target are **not** sampled from the replay buffer, but rather sampled
+ from the current policy.
+
+* To train the **actor network**, use the following equation:
+
+ .. math:: \nabla_{\theta} J \approx \nabla_{\theta} \frac{1}{\vert B \vert} \sum_{s_t\in B} \left( Q \left(s_t, \tilde{a}_\theta(s_t)\right) - log\pi_{\theta}(\tilde{a}_{\theta}(s_t)\vert s_t) \right),\,\,\,\, \tilde{a} \sim \pi(\cdot \vert s_t)
+
+After every training step, do a soft update of the V target network's weights from the online networks.
+
+
+.. autoclass:: rl_coach.agents.soft_actor_critic_agent.SoftActorCriticAlgorithmParameters
\ No newline at end of file
diff --git a/docs/_sources/selecting_an_algorithm.rst.txt b/docs/_sources/selecting_an_algorithm.rst.txt
index e56f80298..e4027def8 100644
--- a/docs/_sources/selecting_an_algorithm.rst.txt
+++ b/docs/_sources/selecting_an_algorithm.rst.txt
@@ -198,6 +198,14 @@ The algorithms are ordered by their release date in descending order.
improve stability it also employs bias correction and trust region optimization techniques.
+
+
+ SAC
+
+ Soft Actor-Critic is an algorithm which optimizes a stochastic policy in an off-policy way.
+ One of the key features of SAC is that it solves a maximum entropy reinforcement learning problem.
+
+
framework_type – deep learning framework type. currently only tensorflow is supported
-
evaluate_only – the task will be used only for evaluating the model
+
evaluate_only – if not None, the task will be used only for evaluating the model for the given number of steps.
+A value of 0 means that task will be evaluated for an infinite number of steps.
use_cpu – use the cpu for this task
experiment_path – the path to the directory which will store all the experiment outputs
seed – a seed to use for the random numbers generator
checkpoint_save_secs – the number of seconds between each checkpoint saving
-
checkpoint_restore_dir – the directory to restore the checkpoints from
+
checkpoint_restore_dir – [DEPECRATED - will be removed in one of the next releases - switch to checkpoint_restore_path]
+the dir to restore the checkpoints from
+
checkpoint_restore_path – the path to restore the checkpoints from
checkpoint_save_dir – the directory to store the checkpoints in
export_onnx_graph – If set to True, this will export an onnx graph each time a checkpoint is saved
apply_stop_condition – If set to True, this will apply the stop condition defined by reaching a target success rate
framework_type – deep learning framework type. currently only tensorflow is supported
-
evaluate_only – the task will be used only for evaluating the model
+
evaluate_only – if not None, the task will be used only for evaluating the model for the given number of steps.
+A value of 0 means that task will be evaluated for an infinite number of steps.
parameters_server_hosts – comma-separated list of hostname:port pairs to which the parameter servers are
assigned
worker_hosts – comma-separated list of hostname:port pairs to which the workers are assigned
This emulates the act using the transition obtained from the rollout worker on the training worker
-in case of distributed training.
-Given the agents current knowledge, decide on the next action to apply to the environment
-:return: an action and a dictionary containing any additional info from the action decision process
This emulates the observe using the transition obtained from the rollout worker on the training worker
-in case of distributed training.
-Given a response from the environment, distill the observation from it and store it for later use.
-The response should be a dictionary containing the performed action, the new observation and measurements,
-the reward, a game over flag and any additional information necessary.
-:return:
Convert curr_state into input tensors tensorflow is expecting. i.e. if we have several inputs states, stack all
observations together, measurements together, etc.
Run off-policy evaluation estimators to evaluate the trained policy performance against a dataset.
+Should only be implemented for off-policy RL algorithms.
Training the network
- Next
+ Next Previous
diff --git a/docs/components/agents/policy_optimization/pg.html b/docs/components/agents/policy_optimization/pg.html
index 0a2e4fd8e..c37f65fb5 100644
--- a/docs/components/agents/policy_optimization/pg.html
+++ b/docs/components/agents/policy_optimization/pg.html
@@ -114,6 +114,7 @@
The policy network is used in order to predict mean and log std for each action. While training, a sample is taken
+from a Gaussian distribution with these mean and std values. When testing, the agent can choose deterministically
+by picking the mean value or sample from a gaussian distribution like in training.
The state value network is trained using a sample-based approximation of the connection between and state value and state
+action values, The actions used for constructing the target are not sampled from the replay buffer, but rather sampled
+from the current policy.
+
+
To train the actor network, use the following equation:
num_steps_between_copying_online_weights_to_target – (StepMethod)
+The number of steps between copying the online network weights to the target network weights.
+
rate_for_copying_weights_to_target – (float)
+When copying the online network weights to the target network weights, a soft update will be used, which
+weight the new online network weights by rate_for_copying_weights_to_target. (Tau as defined in the paper)
+
use_deterministic_for_evaluation – (bool)
+If True, during the evaluation phase, action are chosen deterministically according to the policy mean
+and not sampled from the policy distribution.
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/docs/components/agents/value_optimization/bs_dqn.html b/docs/components/agents/value_optimization/bs_dqn.html
index 8f9283a23..1e1929bee 100644
--- a/docs/components/agents/value_optimization/bs_dqn.html
+++ b/docs/components/agents/value_optimization/bs_dqn.html
@@ -123,6 +123,7 @@
A replay buffer that stores episodes of transitions. The additional structure allows performing various
calculations of total return and other values that depend on the sequential behavior of the transitions
in the episode.
+
+ SAC
+
+ Soft Actor-Critic is an algorithm which optimizes a stochastic policy in an off-policy way.
+ One of the key features of SAC is that it solves a maximum entropy reinforcement learning problem.
+
+
This emulates the act using the transition obtained from the rollout worker on the training worker
-in case of distributed training.
-Given the agents current knowledge, decide on the next action to apply to the environment
-:return: an action and a dictionary containing any additional info from the action decision process
This emulates the observe using the transition obtained from the rollout worker on the training worker
-in case of distributed training.
-Given a response from the environment, distill the observation from it and store it for later use.
-The response should be a dictionary containing the performed action, the new observation and measurements,
-the reward, a game over flag and any additional information necessary.
-:return:
Convert curr_state into input tensors tensorflow is expecting. i.e. if we have several inputs states, stack all
observations together, measurements together, etc.
Run the off-policy evaluation estimators to get a prediction for the performance of the current policy based on
+an evaluation dataset, which was collected by another policy(ies).
+:return: None
+
+ SAC
+
+ Soft Actor-Critic is an algorithm which optimizes a stochastic policy in an off-policy way.
+ One of the key features of SAC is that it solves a maximum entropy reinforcement learning problem.
+
+
DDPG
diff --git a/img/algorithms.png b/img/algorithms.png
index 983df679b..b3310c076 100644
Binary files a/img/algorithms.png and b/img/algorithms.png differ
diff --git a/rl_coach/agents/soft_actor_critic_agent.py b/rl_coach/agents/soft_actor_critic_agent.py
new file mode 100644
index 000000000..9187124a2
--- /dev/null
+++ b/rl_coach/agents/soft_actor_critic_agent.py
@@ -0,0 +1,321 @@
+#
+# Copyright (c) 2019 Intel Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+from typing import Union
+import copy
+import numpy as np
+from collections import OrderedDict
+
+from rl_coach.agents.agent import Agent
+from rl_coach.agents.policy_optimization_agent import PolicyOptimizationAgent
+
+from rl_coach.architectures.head_parameters import SACQHeadParameters,SACPolicyHeadParameters,VHeadParameters
+from rl_coach.architectures.middleware_parameters import FCMiddlewareParameters
+from rl_coach.base_parameters import AlgorithmParameters, NetworkParameters, AgentParameters, EmbedderScheme, MiddlewareScheme
+from rl_coach.core_types import ActionInfo, EnvironmentSteps, RunPhase
+from rl_coach.exploration_policies.additive_noise import AdditiveNoiseParameters
+from rl_coach.memories.non_episodic.experience_replay import ExperienceReplayParameters
+from rl_coach.architectures.embedder_parameters import InputEmbedderParameters
+from rl_coach.spaces import BoxActionSpace
+
+
+# There are 3 networks in SAC implementation. All have the same topology but parameters are not shared.
+# The networks are:
+# 1. State Value Network - SACValueNetwork
+# 2. Soft Q Value Network - SACCriticNetwork
+# 3. Policy Network - SACPolicyNetwork - currently supporting only Gaussian Policy
+
+
+# 1. State Value Network - SACValueNetwork
+# this is the state value network in SAC.
+# The network is trained to predict (regression) the state value in the max-entropy settings
+# The objective to be minimized is given in equation (5) in the paper:
+#
+# J(psi)= E_(s~D)[0.5*(V_psi(s)-y(s))^2]
+# where y(s) = E_(a~pi)[Q_theta(s,a)-log(pi(a|s))]
+
+
+# Default parameters for value network:
+# topology :
+# input embedder : EmbedderScheme.Medium (Dense(256)) , relu activation
+# middleware : EmbedderScheme.Medium (Dense(256)) , relu activation
+
+
+class SACValueNetworkParameters(NetworkParameters):
+ def __init__(self):
+ super().__init__()
+ self.input_embedders_parameters = {'observation': InputEmbedderParameters(activation_function='relu')}
+ self.middleware_parameters = FCMiddlewareParameters(activation_function='relu')
+ self.heads_parameters = [VHeadParameters(initializer='xavier')]
+ self.rescale_gradient_from_head_by_factor = [1]
+ self.optimizer_type = 'Adam'
+ self.batch_size = 256
+ self.async_training = False
+ self.learning_rate = 0.0003 # 3e-4 see appendix D in the paper
+ self.create_target_network = True # tau is set in SoftActorCriticAlgorithmParameters.rate_for_copying_weights_to_target
+
+
+# 2. Soft Q Value Network - SACCriticNetwork
+# the whole network is built in the SACQHeadParameters. we use empty input embedder and middleware
+class SACCriticNetworkParameters(NetworkParameters):
+ def __init__(self):
+ super().__init__()
+ self.input_embedders_parameters = {'observation': InputEmbedderParameters(scheme=EmbedderScheme.Empty)}
+ self.middleware_parameters = FCMiddlewareParameters(scheme=MiddlewareScheme.Empty)
+ self.heads_parameters = [SACQHeadParameters()] # SACQHeadParameters includes the topology of the head
+ self.rescale_gradient_from_head_by_factor = [1]
+ self.optimizer_type = 'Adam'
+ self.batch_size = 256
+ self.async_training = False
+ self.learning_rate = 0.0003
+ self.create_target_network = False
+
+
+# 3. policy Network
+# Default parameters for policy network:
+# topology :
+# input embedder : EmbedderScheme.Medium (Dense(256)) , relu activation
+# middleware : EmbedderScheme = [Dense(256)] , relu activation --> scheme should be overridden in preset
+class SACPolicyNetworkParameters(NetworkParameters):
+ def __init__(self):
+ super().__init__()
+ self.input_embedders_parameters = {'observation': InputEmbedderParameters(activation_function='relu')}
+ self.middleware_parameters = FCMiddlewareParameters(activation_function='relu')
+ self.heads_parameters = [SACPolicyHeadParameters()]
+ self.rescale_gradient_from_head_by_factor = [1]
+ self.optimizer_type = 'Adam'
+ self.batch_size = 256
+ self.async_training = False
+ self.learning_rate = 0.0003
+ self.create_target_network = False
+ self.l2_regularization = 0 # weight decay regularization. not used in the original paper
+
+
+# Algorithm Parameters
+
+class SoftActorCriticAlgorithmParameters(AlgorithmParameters):
+ """
+ :param num_steps_between_copying_online_weights_to_target: (StepMethod)
+ The number of steps between copying the online network weights to the target network weights.
+
+ :param rate_for_copying_weights_to_target: (float)
+ When copying the online network weights to the target network weights, a soft update will be used, which
+ weight the new online network weights by rate_for_copying_weights_to_target. (Tau as defined in the paper)
+
+ :param use_deterministic_for_evaluation: (bool)
+ If True, during the evaluation phase, action are chosen deterministically according to the policy mean
+ and not sampled from the policy distribution.
+ """
+ def __init__(self):
+ super().__init__()
+ self.num_steps_between_copying_online_weights_to_target = EnvironmentSteps(1)
+ self.rate_for_copying_weights_to_target = 0.005
+ self.use_deterministic_for_evaluation = True # evaluate agent using deterministic policy (i.e. take the mean value)
+
+
+class SoftActorCriticAgentParameters(AgentParameters):
+ def __init__(self):
+ super().__init__(algorithm=SoftActorCriticAlgorithmParameters(),
+ exploration=AdditiveNoiseParameters(),
+ memory=ExperienceReplayParameters(), # SAC doesnt use episodic related data
+ # network wrappers:
+ networks=OrderedDict([("policy", SACPolicyNetworkParameters()),
+ ("q", SACCriticNetworkParameters()),
+ ("v", SACValueNetworkParameters())]))
+
+ @property
+ def path(self):
+ return 'rl_coach.agents.soft_actor_critic_agent:SoftActorCriticAgent'
+
+
+# Soft Actor Critic - https://arxiv.org/abs/1801.01290
+class SoftActorCriticAgent(PolicyOptimizationAgent):
+ def __init__(self, agent_parameters, parent: Union['LevelManager', 'CompositeAgent']=None):
+ super().__init__(agent_parameters, parent)
+ self.last_gradient_update_step_idx = 0
+
+ # register signals to track (in learn_from_batch)
+ self.policy_means = self.register_signal('Policy_mu_avg')
+ self.policy_logsig = self.register_signal('Policy_logsig')
+ self.policy_logprob_sampled = self.register_signal('Policy_logp_sampled')
+ self.policy_grads = self.register_signal('Policy_grads_sumabs')
+
+ self.q1_values = self.register_signal("Q1")
+ self.TD_err1 = self.register_signal("TD err1")
+ self.q2_values = self.register_signal("Q2")
+ self.TD_err2 = self.register_signal("TD err2")
+ self.v_tgt_ns = self.register_signal('V_tgt_ns')
+ self.v_onl_ys = self.register_signal('V_onl_ys')
+ self.action_signal = self.register_signal("actions")
+
+ def learn_from_batch(self, batch):
+ #########################################
+ # need to update the following networks:
+ # 1. actor (policy)
+ # 2. state value (v)
+ # 3. critic (q1 and q2)
+ # 4. target network - probably already handled by V
+
+ #########################################
+ # define the networks to be used
+
+ # State Value Network
+ value_network = self.networks['v']
+ value_network_keys = self.ap.network_wrappers['v'].input_embedders_parameters.keys()
+
+ # Critic Network
+ q_network = self.networks['q'].online_network
+ q_head = q_network.output_heads[0]
+ q_network_keys = self.ap.network_wrappers['q'].input_embedders_parameters.keys()
+
+ # Actor (policy) Network
+ policy_network = self.networks['policy'].online_network
+ policy_network_keys = self.ap.network_wrappers['policy'].input_embedders_parameters.keys()
+
+ ##########################################
+ # 1. updating the actor - according to (13) in the paper
+ policy_inputs = copy.copy(batch.states(policy_network_keys))
+ policy_results = policy_network.predict(policy_inputs)
+
+ policy_mu, policy_std, sampled_raw_actions, sampled_actions, sampled_actions_logprob, \
+ sampled_actions_logprob_mean = policy_results
+
+ self.policy_means.add_sample(policy_mu)
+ self.policy_logsig.add_sample(policy_std)
+ self.policy_logprob_sampled.add_sample(sampled_actions_logprob_mean)
+
+ # get the state-action values for the replayed states and their corresponding actions from the policy
+ q_inputs = copy.copy(batch.states(q_network_keys))
+ q_inputs['output_0_0'] = sampled_actions
+ log_target = q_network.predict(q_inputs)[0].squeeze()
+
+ # log internal q values
+ q1_vals, q2_vals = q_network.predict(q_inputs, outputs=[q_head.q1_output, q_head.q2_output])
+ self.q1_values.add_sample(q1_vals)
+ self.q2_values.add_sample(q2_vals)
+
+ # calculate the gradients according to (13)
+ # get the gradients of log_prob w.r.t the weights (parameters) - indicated as phi in the paper
+ initial_feed_dict = {policy_network.gradients_weights_ph[5]: np.array(1.0)}
+ dlogp_dphi = policy_network.predict(policy_inputs,
+ outputs=policy_network.weighted_gradients[5],
+ initial_feed_dict=initial_feed_dict)
+
+ # calculate dq_da
+ dq_da = q_network.predict(q_inputs,
+ outputs=q_network.gradients_wrt_inputs[1]['output_0_0'])
+
+ # calculate da_dphi
+ initial_feed_dict = {policy_network.gradients_weights_ph[3]: dq_da}
+ dq_dphi = policy_network.predict(policy_inputs,
+ outputs=policy_network.weighted_gradients[3],
+ initial_feed_dict=initial_feed_dict)
+
+ # now given dlogp_dphi, dq_dphi we need to calculate the policy gradients according to (13)
+ policy_grads = [dlogp_dphi[l] - dq_dphi[l] for l in range(len(dlogp_dphi))]
+
+ # apply the gradients to policy networks
+ policy_network.apply_gradients(policy_grads)
+ grads_sumabs = np.sum([np.sum(np.abs(policy_grads[l])) for l in range(len(policy_grads))])
+ self.policy_grads.add_sample(grads_sumabs)
+
+ ##########################################
+ # 2. updating the state value online network weights
+ # done by calculating the targets for the v head according to (5) in the paper
+ # value_targets = log_targets-sampled_actions_logprob
+ value_inputs = copy.copy(batch.states(value_network_keys))
+ value_targets = log_target - sampled_actions_logprob
+
+ self.v_onl_ys.add_sample(value_targets)
+
+ # call value_network apply gradients with this target
+ value_loss = value_network.online_network.train_on_batch(value_inputs, value_targets[:,None])[0]
+
+ ##########################################
+ # 3. updating the critic (q networks)
+ # updating q networks according to (7) in the paper
+
+ # define the input to the q network: state has been already updated previously, but now we need
+ # the actions from the batch (and not those sampled by the policy)
+ q_inputs['output_0_0'] = batch.actions(len(batch.actions().shape) == 1)
+
+ # define the targets : scale_reward * reward + (1-terminal)*discount*v_target_next_state
+ # define v_target_next_state
+ value_inputs = copy.copy(batch.next_states(value_network_keys))
+ v_target_next_state = value_network.target_network.predict(value_inputs)
+ self.v_tgt_ns.add_sample(v_target_next_state)
+ # Note: reward is assumed to be rescaled by RewardRescaleFilter in the preset parameters
+ TD_targets = batch.rewards(expand_dims=True) + \
+ (1.0 - batch.game_overs(expand_dims=True)) * self.ap.algorithm.discount * v_target_next_state
+
+ # call critic network update
+ result = q_network.train_on_batch(q_inputs, TD_targets, additional_fetches=[q_head.q1_loss, q_head.q2_loss])
+ total_loss, losses, unclipped_grads = result[:3]
+ q1_loss, q2_loss = result[3]
+ self.TD_err1.add_sample(q1_loss)
+ self.TD_err2.add_sample(q2_loss)
+
+ ##########################################
+ # 4. updating the value target network
+ # I just need to set the parameter rate_for_copying_weights_to_target in the agent parameters to be 1-tau
+ # where tau is the hyper parameter as defined in sac original implementation
+
+ return total_loss, losses, unclipped_grads
+
+ def get_prediction(self, states):
+ """
+ get the mean and stdev of the policy distribution given 'states'
+ :param states: the states for which we need to sample actions from the policy
+ :return: mean and stdev
+ """
+ tf_input_state = self.prepare_batch_for_inference(states, 'policy')
+ return self.networks['policy'].online_network.predict(tf_input_state)
+
+ def train(self):
+ # since the algorithm works with experience replay buffer (non-episodic),
+ # we cant use the policy optimization train method. we need Agent.train
+ # note that since in Agent.train there is no apply_gradients, we need to do it in learn from batch
+ return Agent.train(self)
+
+ def choose_action(self, curr_state):
+ """
+ choose_action - chooses the most likely action
+ if 'deterministic' - take the mean of the policy which is the prediction of the policy network.
+ else - use the exploration policy
+ :param curr_state:
+ :return: action wrapped in ActionInfo
+ """
+ if not isinstance(self.spaces.action, BoxActionSpace):
+ raise ValueError("SAC works only for continuous control problems")
+ # convert to batch so we can run it through the network
+ tf_input_state = self.prepare_batch_for_inference(curr_state, 'policy')
+ # use the online network for prediction
+ policy_network = self.networks['policy'].online_network
+ policy_head = policy_network.output_heads[0]
+ result = policy_network.predict(tf_input_state,
+ outputs=[policy_head.policy_mean, policy_head.actions])
+ action_mean, action_sample = result
+
+ # if using deterministic policy, take the mean values. else, use exploration policy to sample from the pdf
+ if self.phase == RunPhase.TEST and self.ap.algorithm.use_deterministic_for_evaluation:
+ action = action_mean[0]
+ else:
+ action = action_sample[0]
+
+ self.action_signal.add_sample(action)
+
+ action_info = ActionInfo(action=action)
+ return action_info
diff --git a/rl_coach/architectures/head_parameters.py b/rl_coach/architectures/head_parameters.py
index eba8c8a09..b94754990 100644
--- a/rl_coach/architectures/head_parameters.py
+++ b/rl_coach/architectures/head_parameters.py
@@ -36,7 +36,6 @@ def path(self):
return 'rl_coach.architectures.tensorflow_components.heads:' + self.parameterized_class_name
-
class PPOHeadParameters(HeadParameters):
def __init__(self, activation_function: str ='tanh', name: str='ppo_head_params',
num_output_head_copies: int = 1, rescale_gradient_from_head_by_factor: float = 1.0,
@@ -50,11 +49,12 @@ def __init__(self, activation_function: str ='tanh', name: str='ppo_head_params'
class VHeadParameters(HeadParameters):
def __init__(self, activation_function: str ='relu', name: str='v_head_params',
num_output_head_copies: int = 1, rescale_gradient_from_head_by_factor: float = 1.0,
- loss_weight: float = 1.0, dense_layer=None):
+ loss_weight: float = 1.0, dense_layer=None, initializer='normalized_columns'):
super().__init__(parameterized_class_name="VHead", activation_function=activation_function, name=name,
dense_layer=dense_layer, num_output_head_copies=num_output_head_copies,
rescale_gradient_from_head_by_factor=rescale_gradient_from_head_by_factor,
loss_weight=loss_weight)
+ self.initializer = initializer
class CategoricalQHeadParameters(HeadParameters):
@@ -196,3 +196,17 @@ def __init__(self, activation_function: str ='relu', name: str='acer_policy_head
dense_layer=dense_layer, num_output_head_copies=num_output_head_copies,
rescale_gradient_from_head_by_factor=rescale_gradient_from_head_by_factor,
loss_weight=loss_weight)
+
+
+class SACPolicyHeadParameters(HeadParameters):
+ def __init__(self, activation_function: str ='relu', name: str='sac_policy_head_params', dense_layer=None):
+ super().__init__(parameterized_class_name='SACPolicyHead', activation_function=activation_function, name=name,
+ dense_layer=dense_layer)
+
+
+class SACQHeadParameters(HeadParameters):
+ def __init__(self, activation_function: str ='relu', name: str='sac_q_head_params', dense_layer=None,
+ layers_sizes: tuple = (256, 256)):
+ super().__init__(parameterized_class_name='SACQHead', activation_function=activation_function, name=name,
+ dense_layer=dense_layer)
+ self.network_layers_sizes = layers_sizes
diff --git a/rl_coach/architectures/tensorflow_components/heads/__init__.py b/rl_coach/architectures/tensorflow_components/heads/__init__.py
index 1d9a174e4..ade92f2a9 100644
--- a/rl_coach/architectures/tensorflow_components/heads/__init__.py
+++ b/rl_coach/architectures/tensorflow_components/heads/__init__.py
@@ -12,6 +12,8 @@
from .rainbow_q_head import RainbowQHead
from .v_head import VHead
from .acer_policy_head import ACERPolicyHead
+from .sac_head import SACPolicyHead
+from .sac_q_head import SACQHead
from .classification_head import ClassificationHead
from .cil_head import RegressionHead
@@ -30,6 +32,8 @@
'RainbowQHead',
'VHead',
'ACERPolicyHead',
- 'ClassificationHead'
+ 'SACPolicyHead',
+ 'SACQHead',
+ 'ClassificationHead',
'RegressionHead'
]
diff --git a/rl_coach/architectures/tensorflow_components/heads/sac_head.py b/rl_coach/architectures/tensorflow_components/heads/sac_head.py
new file mode 100644
index 000000000..aad9bfcd0
--- /dev/null
+++ b/rl_coach/architectures/tensorflow_components/heads/sac_head.py
@@ -0,0 +1,107 @@
+#
+# Copyright (c) 2019 Intel Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+import tensorflow as tf
+
+from rl_coach.architectures.tensorflow_components.layers import Dense
+from rl_coach.architectures.tensorflow_components.heads.head import Head
+from rl_coach.base_parameters import AgentParameters
+from rl_coach.core_types import ActionProbabilities
+from rl_coach.spaces import SpacesDefinition
+from rl_coach.utils import eps
+
+LOG_SIG_CAP_MAX = 2
+LOG_SIG_CAP_MIN = -20
+
+
+class SACPolicyHead(Head):
+ def __init__(self, agent_parameters: AgentParameters, spaces: SpacesDefinition, network_name: str,
+ head_idx: int = 0, loss_weight: float = 1., is_local: bool = True, activation_function: str='relu',
+ squash: bool = True, dense_layer=Dense):
+ super().__init__(agent_parameters, spaces, network_name, head_idx, loss_weight, is_local, activation_function,
+ dense_layer=dense_layer)
+ self.name = 'sac_policy_head'
+ self.return_type = ActionProbabilities
+ self.num_actions = self.spaces.action.shape # continuous actions
+ self.squash = squash # squashing using tanh
+
+ def _build_module(self, input_layer):
+ self.given_raw_actions = tf.placeholder(tf.float32, [None, self.num_actions], name="actions")
+ self.input = [self.given_raw_actions]
+ self.output = []
+
+ # build the network
+ self._build_continuous_net(input_layer, self.spaces.action)
+
+ def _squash_correction(self,actions):
+ '''
+ correct squash operation (in case of bounded actions) according to appendix C in the paper.
+ NOTE : this correction assume the squash is done with tanh.
+ :param actions: unbounded actions
+ :return: the correction to be applied to the log_prob of the actions, assuming tanh squash
+ '''
+ if not self.squash:
+ return 0
+ return tf.reduce_sum(tf.log(1 - tf.tanh(actions) ** 2 + eps), axis=1)
+
+ def _build_continuous_net(self, input_layer, action_space):
+ num_actions = action_space.shape[0]
+
+ self.policy_mu_and_logsig = self.dense_layer(2*num_actions)(input_layer, name='policy_mu_logsig')
+ self.policy_mean = tf.identity(self.policy_mu_and_logsig[..., :num_actions], name='policy_mean')
+ self.policy_log_std = tf.clip_by_value(self.policy_mu_and_logsig[..., num_actions:],
+ LOG_SIG_CAP_MIN, LOG_SIG_CAP_MAX,name='policy_log_std')
+
+ self.output.append(self.policy_mean) # output[0]
+ self.output.append(self.policy_log_std) # output[1]
+
+ # define the distributions for the policy
+ # Tensorflow's multivariate normal distribution supports reparameterization
+ tfd = tf.contrib.distributions
+ self.policy_distribution = tfd.MultivariateNormalDiag(loc=self.policy_mean,
+ scale_diag=tf.exp(self.policy_log_std))
+
+ # define network outputs
+ # note that tensorflow supports reparametrization.
+ # i.e. policy_action_sample is a tensor through which gradients can flow
+ self.raw_actions = self.policy_distribution.sample()
+
+ if self.squash:
+ self.actions = tf.tanh(self.raw_actions)
+ # correct log_prob in case of squash (see appendix C in the paper)
+ squash_correction = self._squash_correction(self.raw_actions)
+ else:
+ self.actions = self.raw_actions
+ squash_correction = 0
+
+ # policy_action_logprob is a tensor through which gradients can flow
+ self.sampled_actions_logprob = self.policy_distribution.log_prob(self.raw_actions) - squash_correction
+ self.sampled_actions_logprob_mean = tf.reduce_mean(self.sampled_actions_logprob)
+
+ self.output.append(self.raw_actions) # output[2] : sampled raw action (before squash)
+ self.output.append(self.actions) # output[3] : squashed (if needed) version of sampled raw_actions
+ self.output.append(self.sampled_actions_logprob) # output[4]: log prob of sampled action (squash corrected)
+ self.output.append(self.sampled_actions_logprob_mean) # output[5]: mean of log prob of sampled actions (squash corrected)
+
+ def __str__(self):
+ result = [
+ "policy head:"
+ "\t\tDense (num outputs = 256)",
+ "\t\tDense (num outputs = 256)",
+ "\t\tDense (num outputs = {0})".format(2*self.num_actions),
+ "policy_mu = output[:num_actions], policy_std = output[num_actions:]"
+ ]
+ return '\n'.join(result)
diff --git a/rl_coach/architectures/tensorflow_components/heads/sac_q_head.py b/rl_coach/architectures/tensorflow_components/heads/sac_q_head.py
new file mode 100644
index 000000000..cc2d95d49
--- /dev/null
+++ b/rl_coach/architectures/tensorflow_components/heads/sac_q_head.py
@@ -0,0 +1,116 @@
+#
+# Copyright (c) 2019 Intel Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+import tensorflow as tf
+
+from rl_coach.architectures.tensorflow_components.layers import Dense
+from rl_coach.architectures.tensorflow_components.heads.head import Head
+from rl_coach.base_parameters import AgentParameters
+from rl_coach.core_types import QActionStateValue
+from rl_coach.spaces import SpacesDefinition, BoxActionSpace
+
+
+class SACQHead(Head):
+ def __init__(self, agent_parameters: AgentParameters, spaces: SpacesDefinition, network_name: str,
+ head_idx: int = 0, loss_weight: float = 1., is_local: bool = True, activation_function: str='relu',
+ dense_layer=Dense):
+ super().__init__(agent_parameters, spaces, network_name, head_idx, loss_weight, is_local, activation_function,
+ dense_layer=dense_layer)
+ self.name = 'q_values_head'
+ if isinstance(self.spaces.action, BoxActionSpace):
+ self.num_actions = self.spaces.action.shape # continuous actions
+ else:
+ raise ValueError(
+ 'SACQHead does not support action spaces of type: {class_name}'.format(
+ class_name=self.spaces.action.__class__.__name__,
+ )
+ )
+ self.return_type = QActionStateValue
+ # extract the topology from the SACQHeadParameters
+ self.network_layers_sizes = agent_parameters.network_wrappers['q'].heads_parameters[0].network_layers_sizes
+
+ def _build_module(self, input_layer):
+ # SAC Q network is basically 2 networks running in parallel on the same input (state , action)
+ # state is the observation fed through the input_layer, action is fed through placeholder to the header
+ # each is calculating q value : q1(s,a) and q2(s,a)
+ # the output of the head is min(q1,q2)
+ self.actions = tf.placeholder(tf.float32, [None, self.num_actions], name="actions")
+ self.target = tf.placeholder(tf.float32, [None, 1], name="q_targets")
+ self.input = [self.actions]
+ self.output = []
+ # Note (1) : in the author's implementation of sac (in rllab) they summarize the embedding of observation and
+ # action (broadcasting the bias) in the first layer of the network.
+
+ # build q1 network head
+ with tf.variable_scope("q1_head"):
+ layer_size = self.network_layers_sizes[0]
+ qi_obs_emb = self.dense_layer(layer_size)(input_layer, activation=self.activation_function)
+ qi_act_emb = self.dense_layer(layer_size)(self.actions, activation=self.activation_function)
+ qi_output = qi_obs_emb + qi_act_emb # merging the inputs by summarizing them (see Note (1))
+ for layer_size in self.network_layers_sizes[1:]:
+ qi_output = self.dense_layer(layer_size)(qi_output, activation=self.activation_function)
+ # the output layer
+ self.q1_output = self.dense_layer(1)(qi_output, name='q1_output')
+
+ # build q2 network head
+ with tf.variable_scope("q2_head"):
+ layer_size = self.network_layers_sizes[0]
+ qi_obs_emb = self.dense_layer(layer_size)(input_layer, activation=self.activation_function)
+ qi_act_emb = self.dense_layer(layer_size)(self.actions, activation=self.activation_function)
+ qi_output = qi_obs_emb + qi_act_emb # merging the inputs by summarizing them (see Note (1))
+ for layer_size in self.network_layers_sizes[1:]:
+ qi_output = self.dense_layer(layer_size)(qi_output, activation=self.activation_function)
+ # the output layer
+ self.q2_output = self.dense_layer(1)(qi_output, name='q2_output')
+
+ # take the minimum as the network's output. this is the log_target (in the original implementation)
+ self.q_output = tf.minimum(self.q1_output, self.q2_output, name='q_output')
+ # the policy gradients
+ # self.q_output_mean = tf.reduce_mean(self.q1_output) # option 1: use q1
+ self.q_output_mean = tf.reduce_mean(self.q_output) # option 2: use min(q1,q2)
+
+ self.output.append(self.q_output)
+ self.output.append(self.q_output_mean)
+
+ # defining the loss
+ self.q1_loss = 0.5*tf.reduce_mean(tf.square(self.q1_output - self.target))
+ self.q2_loss = 0.5*tf.reduce_mean(tf.square(self.q2_output - self.target))
+ # eventually both losses are depends on different parameters so we can sum them up
+ self.loss = self.q1_loss+self.q2_loss
+ tf.losses.add_loss(self.loss)
+
+ def __str__(self):
+ result = [
+ "q1 output"
+ "\t\tDense (num outputs = 256)",
+ "\t\tDense (num outputs = 256)",
+ "\t\tDense (num outputs = 1)",
+ "q2 output"
+ "\t\tDense (num outputs = 256)",
+ "\t\tDense (num outputs = 256)",
+ "\t\tDense (num outputs = 1)",
+ "min(Q1,Q2)"
+ ]
+ return '\n'.join(result)
+
+
+
+
+
+
+
+
+
diff --git a/rl_coach/architectures/tensorflow_components/heads/v_head.py b/rl_coach/architectures/tensorflow_components/heads/v_head.py
index 07dbf2538..62bfba03b 100644
--- a/rl_coach/architectures/tensorflow_components/heads/v_head.py
+++ b/rl_coach/architectures/tensorflow_components/heads/v_head.py
@@ -26,7 +26,7 @@
class VHead(Head):
def __init__(self, agent_parameters: AgentParameters, spaces: SpacesDefinition, network_name: str,
head_idx: int = 0, loss_weight: float = 1., is_local: bool = True, activation_function: str='relu',
- dense_layer=Dense):
+ dense_layer=Dense, initializer='normalized_columns'):
super().__init__(agent_parameters, spaces, network_name, head_idx, loss_weight, is_local, activation_function,
dense_layer=dense_layer)
self.name = 'v_values_head'
@@ -37,10 +37,15 @@ def __init__(self, agent_parameters: AgentParameters, spaces: SpacesDefinition,
else:
self.loss_type = tf.losses.mean_squared_error
+ self.initializer = initializer
+
def _build_module(self, input_layer):
# Standard V Network
- self.output = self.dense_layer(1)(input_layer, name='output',
- kernel_initializer=normalized_columns_initializer(1.0))
+ if self.initializer == 'normalized_columns':
+ self.output = self.dense_layer(1)(input_layer, name='output',
+ kernel_initializer=normalized_columns_initializer(1.0))
+ elif self.initializer == 'xavier' or self.initializer is None:
+ self.output = self.dense_layer(1)(input_layer, name='output')
def __str__(self):
result = [
diff --git a/rl_coach/presets/Mujoco_SAC.py b/rl_coach/presets/Mujoco_SAC.py
new file mode 100644
index 000000000..89220a99a
--- /dev/null
+++ b/rl_coach/presets/Mujoco_SAC.py
@@ -0,0 +1,71 @@
+from rl_coach.agents.soft_actor_critic_agent import SoftActorCriticAgentParameters
+from rl_coach.architectures.layers import Dense
+from rl_coach.base_parameters import VisualizationParameters, PresetValidationParameters
+from rl_coach.core_types import EnvironmentEpisodes, EnvironmentSteps
+from rl_coach.filters.filter import InputFilter
+from rl_coach.filters.reward.reward_rescale_filter import RewardRescaleFilter
+from rl_coach.environments.environment import SingleLevelSelection
+from rl_coach.environments.gym_environment import GymVectorEnvironment, mujoco_v2
+from rl_coach.graph_managers.basic_rl_graph_manager import BasicRLGraphManager
+from rl_coach.graph_managers.graph_manager import ScheduleParameters
+
+
+####################
+# Graph Scheduling #
+####################
+
+# see graph_manager.py for possible schedule parameters
+schedule_params = ScheduleParameters()
+schedule_params.improve_steps = EnvironmentSteps(3000000)
+schedule_params.steps_between_evaluation_periods = EnvironmentSteps(1000)
+schedule_params.evaluation_steps = EnvironmentEpisodes(1)
+schedule_params.heatup_steps = EnvironmentSteps(10000)
+
+
+#########
+# Agent #
+#########
+agent_params = SoftActorCriticAgentParameters()
+# override default parameters:
+# value (v) networks parameters
+agent_params.network_wrappers['v'].batch_size = 256
+agent_params.network_wrappers['v'].learning_rate = 0.0003
+agent_params.network_wrappers['v'].middleware_parameters.scheme = [Dense(256)]
+
+# critic (q) network parameters
+agent_params.network_wrappers['q'].heads_parameters[0].network_layers_sizes = (256, 256)
+agent_params.network_wrappers['q'].batch_size = 256
+agent_params.network_wrappers['q'].learning_rate = 0.0003
+
+# actor (policy) network parameters
+agent_params.network_wrappers['policy'].batch_size = 256
+agent_params.network_wrappers['policy'].learning_rate = 0.0003
+agent_params.network_wrappers['policy'].middleware_parameters.scheme = [Dense(256)]
+
+# Input Filter
+# SAC requires reward scaling for Mujoco environments.
+# according to the paper:
+# Hopper, Walker-2d, HalfCheetah, Ant - requires scaling of 5
+# Humanoid - requires scaling of 20
+
+agent_params.input_filter = InputFilter()
+agent_params.input_filter.add_reward_filter('rescale', RewardRescaleFilter(5))
+
+###############
+# Environment #
+###############
+env_params = GymVectorEnvironment(level=SingleLevelSelection(mujoco_v2))
+
+########
+# Test #
+########
+preset_validation_params = PresetValidationParameters()
+preset_validation_params.test = True
+preset_validation_params.min_reward_threshold = 400
+preset_validation_params.max_episodes_to_achieve_reward = 2200
+preset_validation_params.reward_test_level = 'inverted_pendulum'
+preset_validation_params.trace_test_levels = ['inverted_pendulum', 'hopper']
+
+graph_manager = BasicRLGraphManager(agent_params=agent_params, env_params=env_params,
+ schedule_params=schedule_params, vis_params=VisualizationParameters(),
+ preset_validation_params=preset_validation_params)
diff --git a/setup.py b/setup.py
index b76ffb89b..deff1df3a 100644
--- a/setup.py
+++ b/setup.py
@@ -85,7 +85,7 @@
setup(
name='rl-coach' if not slim_package else 'rl-coach-slim',
- version='0.11.2',
+ version='0.12.0',
description='Reinforcement Learning Coach enables easy experimentation with state of the art Reinforcement Learning algorithms.',
url='https://github.com/NervanaSystems/coach',
author='Intel AI Lab',