From f0e03de6d4a1ba4660953ad5611b3902e39741fb Mon Sep 17 00:00:00 2001 From: Megan Klaiber Date: Tue, 21 Jul 2020 00:44:42 +0200 Subject: [PATCH 01/81] Added working her version, Online sampling is missing. --- stable_baselines3/her/her.py | 419 +++++++++++++++++++++++++++ stable_baselines3/her/obs_wrapper.py | 78 +++++ tests/test_her.py | 106 +++++++ 3 files changed, 603 insertions(+) create mode 100644 stable_baselines3/her/her.py create mode 100644 stable_baselines3/her/obs_wrapper.py create mode 100644 tests/test_her.py diff --git a/stable_baselines3/her/her.py b/stable_baselines3/her/her.py new file mode 100644 index 0000000000..ae3683b404 --- /dev/null +++ b/stable_baselines3/her/her.py @@ -0,0 +1,419 @@ +from enum import Enum +from inspect import signature +from typing import Any, Callable, Dict, Optional, Type, Union + +import numpy as np +import torch as th + +from stable_baselines3.common.buffers import ReplayBuffer +from stable_baselines3.common.callbacks import BaseCallback +from stable_baselines3.common.noise import ActionNoise +from stable_baselines3.common.off_policy_algorithm import OffPolicyAlgorithm +from stable_baselines3.common.policies import BasePolicy +from stable_baselines3.common.type_aliases import GymEnv, MaybeCallback, RolloutReturn +from stable_baselines3.common.vec_env import VecEnv +from stable_baselines3.her.obs_wrapper import ObsWrapper + + +class GoalSelectionStrategy(Enum): + """ + The strategies for selecting new goals when + creating artificial transitions. + """ + + # Select a goal that was achieved + # after the current step, in the same episode + FUTURE = 0 + # Select the goal that was achieved + # at the end of the episode + FINAL = 1 + # Select a goal that was achieved in the episode + EPISODE = 2 + # Select a goal that was achieved + # at some point in the training procedure + # (and that is present in the replay buffer) + RANDOM = 3 + + +# For convenience +# that way, we can use string to select a strategy +KEY_TO_GOAL_STRATEGY = { + "future": GoalSelectionStrategy.FUTURE, + "final": GoalSelectionStrategy.FINAL, + "episode": GoalSelectionStrategy.EPISODE, + "random": GoalSelectionStrategy.RANDOM, +} + + +class HER(OffPolicyAlgorithm): + """ + Hindsight Experience Replay (HER) + + :param policy: (BasePolicy) The policy model to use. + :param env: (VecEnv) The environment to learn from. + :param model: (OffPolicyAlgorithm) Off policy model which will be used with hindsight experience replay. (SAC, TD3) + :param n_goals: (int) Number of sampled goals for replay. + :param goal_strategy: (GoalSelectionStrategy or str) Strategy for sampling goals for replay. + One of ['episode', 'final', 'future', 'random'] + :param learning_rate: (float or callable) learning rate for the optimizer, + it can be a function of the current progress remaining (from 1 to 0) + :param buffer_size: (int) size of the replay buffer + :param learning_starts: (int) how many steps of the model to collect transitions for before learning starts + :param batch_size: (int) Minibatch size for each gradient update + :param tau: (float) the soft update coefficient ("Polyak update", between 0 and 1) + :param gamma: (float) the discount factor + :param train_freq: (int) Update the model every ``train_freq`` steps. + :param gradient_steps: (int) How many gradient update after each step + :param n_episodes_rollout: (int) Update the model every ``n_episodes_rollout`` episodes. 
+ Note that this cannot be used at the same time as ``train_freq`` + :param action_noise: (ActionNoise) the action noise type (None by default), this can help + for hard exploration problem. Cf common.noise for the different action noise type. + :param optimize_memory_usage: (bool) Enable a memory efficient variant of the replay buffer + at a cost of more complexity. + See https://github.com/DLR-RM/stable-baselines3/issues/37#issuecomment-637501195 + :param policy_kwargs: Additional arguments to be passed to the policy on creation + :param tensorboard_log: (str) the log location for tensorboard (if None, no logging) + :param verbose: The verbosity level: 0 none, 1 training information, 2 debug + :param device: Device on which the code should run. + By default, it will try to use a Cuda compatible device and fallback to cpu + if it is not possible. + :param support_multi_env: Whether the algorithm supports training + with multiple environments (as in A2C) + :param create_eval_env: Whether to create a second environment that will be + used for evaluating the agent periodically. (Only available when passing string for the environment) + :param monitor_wrapper: When creating an environment, whether to wrap it + or not in a Monitor wrapper. + :param seed: Seed for the pseudo random generators + :param use_sde: Whether to use State Dependent Exploration (SDE) + instead of action noise exploration (default: False) + :param sde_sample_freq: Sample a new noise matrix every n steps when using gSDE + Default: -1 (only sample at the beginning of the rollout) + :param use_sde_at_warmup: (bool) Whether to use gSDE instead of uniform sampling + during the warm up phase (before learning starts) + :param sde_support: (bool) Whether the model support gSDE or not + """ + + def __init__( + self, + policy: Type[BasePolicy], + env: VecEnv, + model: Type[OffPolicyAlgorithm], + n_goals: int = 5, + goal_strategy: Union[GoalSelectionStrategy, str] = "final", + learning_rate: Union[float, Callable] = 3e-4, + buffer_size: int = int(1e6), + learning_starts: int = 100, + batch_size: int = 256, + tau: float = 0.005, + gamma: float = 0.99, + train_freq: int = 1, + gradient_steps: int = 1, + n_episodes_rollout: int = -1, + action_noise: Optional[ActionNoise] = None, + optimize_memory_usage: bool = False, + policy_kwargs: Dict[str, Any] = None, + tensorboard_log: Optional[str] = None, + verbose: int = 0, + device: Union[th.device, str] = "auto", + support_multi_env: bool = False, + create_eval_env: bool = False, + monitor_wrapper: bool = True, + seed: Optional[int] = None, + use_sde: bool = False, + sde_sample_freq: int = -1, + use_sde_at_warmup: bool = False, + sde_support: bool = True, + *args, + **kwargs + ): + + if isinstance(goal_strategy, str): + self.goal_strategy = KEY_TO_GOAL_STRATEGY[goal_strategy.lower()] + else: + self.goal_strategy = goal_strategy + + assert isinstance( + self.goal_strategy, GoalSelectionStrategy + ), "Invalid goal selection strategy," "please use one of {}".format(list(GoalSelectionStrategy)) + + self.env = ObsWrapper(env) + + # get arguments for the model initialization + model_signature = signature(model.__init__) + arguments = locals() + model_init_dict = { + key: arguments[key] + for key in model_signature.parameters.keys() + if key in arguments and key != "self" and key != "env" + } + + super(HER, self).__init__( + policy, + self.env, + BasePolicy, + learning_rate, + buffer_size, + learning_starts, + batch_size, + tau, + gamma, + train_freq, + gradient_steps, + 
n_episodes_rollout, + action_noise, + optimize_memory_usage, + policy_kwargs, + tensorboard_log, + verbose, + device, + support_multi_env, + create_eval_env, + monitor_wrapper, + seed, + use_sde, + sde_sample_freq, + use_sde_at_warmup, + sde_support, + ) + + # model initialization + self.model = model(env=self.env, **model_init_dict, **kwargs) + + # storage for transitions of current episode + self.episode_storage = [] + self.n_goals = n_goals + + def learn( + self, + total_timesteps: int, + callback: MaybeCallback = None, + log_interval: int = 4, + eval_env: Optional[GymEnv] = None, + eval_freq: int = -1, + n_eval_episodes: int = 5, + tb_log_name: str = "run", + eval_log_path: Optional[str] = None, + reset_num_timesteps: bool = True, + ) -> "OffPolicyAlgorithm": + + total_timesteps, callback = self.model._setup_learn( + total_timesteps, eval_env, callback, eval_freq, n_eval_episodes, eval_log_path, reset_num_timesteps, tb_log_name + ) + + callback.on_training_start(locals(), globals()) + + while self.model.num_timesteps < total_timesteps: + + rollout = self.collect_rollouts( + self.env, + n_episodes=self.model.n_episodes_rollout, + n_steps=self.model.train_freq, + action_noise=self.model.action_noise, + callback=callback, + learning_starts=self.model.learning_starts, + replay_buffer=self.model.replay_buffer, + log_interval=log_interval, + ) + + if rollout.continue_training is False: + break + + if self.model.num_timesteps > 0 and self.model.num_timesteps > self.model.learning_starts: + # If no `gradient_steps` is specified, + # do as many gradients steps as steps performed during the rollout + gradient_steps = self.gradient_steps if self.gradient_steps > 0 else rollout.episode_timesteps + self.train(batch_size=self.batch_size, gradient_steps=gradient_steps) + + callback.on_training_end() + + return self + + def collect_rollouts( + self, + env: VecEnv, + callback: BaseCallback, + n_episodes: int = 1, + n_steps: int = -1, + action_noise: Optional[ActionNoise] = None, + learning_starts: int = 0, + replay_buffer: Optional[ReplayBuffer] = None, + log_interval: Optional[int] = None, + ) -> RolloutReturn: + """ + Collect experiences and store them into a ReplayBuffer. + + :param env: (VecEnv) The training environment + :param callback: (BaseCallback) Callback that will be called at each step + (and at the beginning and end of the rollout) + :param n_episodes: (int) Number of episodes to use to collect rollout data + You can also specify a ``n_steps`` instead + :param n_steps: (int) Number of steps to use to collect rollout data + You can also specify a ``n_episodes`` instead. + :param action_noise: (Optional[ActionNoise]) Action noise that will be used for exploration + Required for deterministic policy (e.g. TD3). This can also be used + in addition to the stochastic policy for SAC. + :param learning_starts: (int) Number of steps before learning for the warm-up phase. 
+ :param replay_buffer: (ReplayBuffer) + :param log_interval: (int) Log data every ``log_interval`` episodes + :return: (RolloutReturn) + """ + episode_rewards, total_timesteps = [], [] + total_steps, total_episodes = 0, 0 + + assert isinstance(env, VecEnv), "You must pass a VecEnv" + assert env.num_envs == 1, "OffPolicyAlgorithm only support single environment" + + if self.use_sde: + self.model.actor.reset_noise() + + callback.on_rollout_start() + continue_training = True + + while total_steps < n_steps or total_episodes < n_episodes: + done = False + episode_reward, episode_timesteps = 0.0, 0 + + while not done: + # concatenate observation and (desired) goal + observation = self.model._last_obs + self.model._last_obs = np.concatenate([observation["observation"], observation["desired_goal"]], axis=1) + + if self.use_sde and self.sde_sample_freq > 0 and total_steps % self.sde_sample_freq == 0: + # Sample a new noise matrix + self.model.actor.reset_noise() + + # Select action randomly or according to policy + action, buffer_action = self.model._sample_action(learning_starts, action_noise) + + # Rescale and perform action + new_obs, reward, done, infos = env.step(action) + + # Only stop training if return value is False, not when it is None. + if callback.on_step() is False: + return RolloutReturn(0.0, total_steps, total_episodes, continue_training=False) + + episode_reward += reward + + # Retrieve reward and episode length if using Monitor wrapper + self.model._update_info_buffer(infos, done) + + # Store episode in episode storage + if replay_buffer is not None: + # Store only the unnormalized version + if self.model._vec_normalize_env is not None: + new_obs_ = self.model._vec_normalize_env.get_original_obs() + reward_ = self.model._vec_normalize_env.get_original_reward() + else: + # Avoid changing the original ones + self.model._last_original_obs, new_obs_, reward_ = observation, new_obs, reward + + # add current transition to episode storage + self.episode_storage.append((self.model._last_original_obs, buffer_action, reward_, new_obs_, done)) + + self.model._last_obs = new_obs + # Save the unnormalized observation + if self.model._vec_normalize_env is not None: + self.model._last_original_obs = new_obs_ + + self.model.num_timesteps += 1 + episode_timesteps += 1 + total_steps += 1 + self.model._update_current_progress_remaining(self.model.num_timesteps, self.model._total_timesteps) + + # For DQN, check if the target network should be updated + # and update the exploration schedule + # For SAC/TD3, the update is done as the same time as the gradient update + # see https://github.com/hill-a/stable-baselines/issues/900 + self.model._on_step() + + if 0 < n_steps <= total_steps: + break + + if done: + # store episode in replay buffer + self.store_transitions() + # clear storage for current episode + self.episode_storage = [] + + total_episodes += 1 + self.model._episode_num += 1 + episode_rewards.append(episode_reward) + total_timesteps.append(episode_timesteps) + + if action_noise is not None: + action_noise.reset() + + # Log training infos + if log_interval is not None and self.model._episode_num % log_interval == 0: + self.model._dump_logs() + + mean_reward = np.mean(episode_rewards) if total_episodes > 0 else 0.0 + + callback.on_rollout_end() + + return RolloutReturn(mean_reward, total_steps, total_episodes, continue_training) + + def train(self, gradient_steps: int, batch_size: int) -> None: + self.model.train(gradient_steps=gradient_steps, batch_size=batch_size) + + def 
sample_goals(self, sample_idx: int) -> Union[np.ndarray, None]: + """ + Sample a goal based on goal_strategy. + + :param sample_idx: (int) Index of current transition. + :return: (np.ndarray or None) Return sampled goal. + """ + if self.goal_strategy == GoalSelectionStrategy.FINAL: + # replay with final state of current episode + return self.episode_storage[-1][0]["achieved_goal"] + elif self.goal_strategy == GoalSelectionStrategy.FUTURE: + # replay with random state which comes from the same episode and was observed after current transition + # we have no transition after last transition of episode + if (sample_idx + 1) < len(self.episode_storage): + index = np.random.choice(np.arange(sample_idx + 1, len(self.episode_storage))) + return self.episode_storage[index][0]["achieved_goal"] + elif self.goal_strategy == GoalSelectionStrategy.EPISODE: + # replay with random state which comes from the same episode as current transition + index = np.random.choice(np.arange(len(self.episode_storage))) + return self.episode_storage[index][0]["achieved_goal"] + elif self.goal_strategy == GoalSelectionStrategy.RANDOM: + # replay with random state from the entire replay buffer + index = np.random.choice(np.arange(self.model.replay_buffer.size())) + obs = self.model.replay_buffer.observations[index] + # get only the observation part + obs_array = obs[:, : self.env.obs_dim] + return obs_array + else: + raise ValueError("Strategy for sampling goals not supported!") + + def store_transitions(self) -> None: + """ + Store current episode in replay buffer. Sample additional goals and store new transitions in replay buffer. + """ + + # iterate over current episodes transitions + for idx, trans in enumerate(self.episode_storage): + + observation, action, reward, new_observation, done = trans + + # concatenate observation with (desired) goal + obs = np.concatenate([observation["observation"], observation["desired_goal"]], axis=1) + new_obs = np.concatenate([new_observation["observation"], new_observation["desired_goal"]], axis=1) + + # store data in replay buffer + self.model.replay_buffer.add(obs, new_obs, action, reward, done) + + # sample set of additional goals + sampled_goals = [sample for sample in (self.sample_goals(idx) for i in range(self.n_goals)) if sample is not None] + + # iterate over sampled goals and store new transitions in replay buffer + for goal in sampled_goals: + # compute new reward with new goal + new_reward = self.env.env_method("compute_reward", new_observation["achieved_goal"], goal, None) + + # concatenate observation with (desired) goal + obs = np.concatenate([observation["observation"], goal], axis=1) + new_obs = np.concatenate([new_observation["observation"], goal], axis=1) + + # store data in replay buffer + self.model.replay_buffer.add(obs, new_obs, action, new_reward, done) diff --git a/stable_baselines3/her/obs_wrapper.py b/stable_baselines3/her/obs_wrapper.py new file mode 100644 index 0000000000..e59f40f939 --- /dev/null +++ b/stable_baselines3/her/obs_wrapper.py @@ -0,0 +1,78 @@ +from typing import List, Optional, Sequence, Union + +import numpy as np +from gym import spaces + +from stable_baselines3.common.vec_env import VecEnv + + +class ObsWrapper(VecEnv): + """ + Wrapper for a VecEnv which overrides the observation space for Hindsight Experience Replay to support dict observations. + + :param env: (VecEnv) The vectorized environment to wrap. 
+ """ + + def __init__(self, venv: VecEnv): + super(ObsWrapper, self).__init__( + num_envs=venv.num_envs, observation_space=venv.observation_space, action_space=venv.action_space + ) + + self.venv = venv + + self.spaces = list(venv.observation_space.spaces.values()) + + # get dimensions of observation and goal + if isinstance(self.spaces[0], spaces.Discrete): + self.obs_dim = 1 + self.goal_dim = 1 + else: + goal_space_shape = venv.observation_space.spaces["achieved_goal"].shape + self.obs_dim = venv.observation_space.spaces["observation"].shape[0] + self.goal_dim = goal_space_shape[0] + + # new observation space with concatenated observation and (desired) goal + # for the different types of spaces + if isinstance(self.spaces[0], spaces.Box): + low_values = np.concatenate( + [venv.observation_space["observation"].low, venv.observation_space["desired_goal"].low] + ) + high_values = np.concatenate( + [venv.observation_space["observation"].high, venv.observation_space["desired_goal"].high] + ) + self.observation_space = spaces.Box(low_values, high_values, dtype=np.float32) + elif isinstance(self.spaces[0], spaces.MultiBinary): + total_dim = self.obs_dim + self.goal_dim + self.observation_space = spaces.MultiBinary(total_dim) + elif isinstance(self.spaces[0], spaces.Discrete): + dimensions = [venv.observation_space.spaces["observation"].n, venv.observation_space.spaces["desired_goal"].n] + self.observation_space = spaces.MultiDiscrete(dimensions) + else: + raise NotImplementedError("{} space is not supported".format(type(self.spaces[0]))) + + def reset(self): + return self.venv.reset() + + def step_async(self, actions): + self.venv.step_async(actions) + + def step_wait(self): + return self.venv.step_wait() + + def close(self): + return self.venv.close() + + def get_attr(self, attr_name, indices=None): + return self.venv.get_attr(attr_name, indices) + + def set_attr(self, attr_name, value, indices=None): + return self.venv.set_attr(attr_name, value, indices) + + def env_method(self, method_name, *method_args, indices=None, **method_kwargs): + return self.venv.env_method(method_name, *method_args, indices=indices, **method_kwargs) + + def get_images(self) -> Sequence[np.ndarray]: + return self.venv.get_images() + + def seed(self, seed: Optional[int] = None) -> List[Union[None, int]]: + return self.venv.seed(seed) diff --git a/tests/test_her.py b/tests/test_her.py new file mode 100644 index 0000000000..96e36c654e --- /dev/null +++ b/tests/test_her.py @@ -0,0 +1,106 @@ +import numpy as np +import pytest +import torch as th + +from stable_baselines3 import TD3, SAC, DDPG +from stable_baselines3.common.bit_flipping_env import BitFlippingEnv +from stable_baselines3.common.noise import OrnsteinUhlenbeckActionNoise +from stable_baselines3.common.vec_env import DummyVecEnv +from stable_baselines3.her.her import HER, GoalSelectionStrategy +from stable_baselines3.sac.policies import SACPolicy +from stable_baselines3.td3.policies import TD3Policy +from stable_baselines3.td3.policies import CnnPolicy, MlpPolicy + + +@pytest.mark.parametrize("model_class, policy, sde_support", [(SAC, SACPolicy, True), (TD3, TD3Policy, False), (DDPG, MlpPolicy, False)]) +def test_her(model_class, policy, sde_support): + """ + Test Hindsight Experience Replay. 
+ """ + + env = BitFlippingEnv(continuous=True) + env = DummyVecEnv([lambda: env]) + + # Create action noise + n_actions = env.action_space.shape[0] + action_noise = OrnsteinUhlenbeckActionNoise(np.zeros(n_actions,), 0.2 * np.ones((n_actions,))) + + model = HER( + policy, + env, + model_class, + n_goals=5, + goal_strategy="random", + action_noise=action_noise, + verbose=1, + tau=0.05, + batch_size=128, + learning_rate=0.001, + policy_kwargs=dict(net_arch=[256]), + buffer_size=int(1e6), + gamma=0.98, + gradient_steps=40, + sde_support=sde_support + ) + + model.learn(total_timesteps=1, callback=None) + + # Evaluate the agent + n_eval_episodes = 5 + n_episodes = 0 + episode_rewards = [] + episode_reward = 0.0 + + eval_env = BitFlippingEnv(continuous=True) + + observation = eval_env.reset() + + while n_episodes < n_eval_episodes: + + obs = np.concatenate([observation["observation"], observation["desired_goal"]]) + + with th.no_grad(): + obs_ = th.FloatTensor(np.array(obs).reshape(1, -1)).to(model.model.device) + action = model.model.policy.predict(obs_)[0][0] + + observation, reward, done, _ = eval_env.step(action) + + # Render the env + #eval_env.render() + + episode_reward += reward + + if done: + n_episodes += 1 + observation = eval_env.reset() + episode_rewards.append(episode_reward) + episode_reward = 0.0 + + eval_env.close() + print(f"Mean reward: {np.mean(episode_rewards)} +/- {np.std(episode_rewards)}") + + #assert np.mean(episode_rewards) > -50, "The environment is not solved" + + +@pytest.mark.parametrize( + "goal_strategy", + [ + "final", + "episode", + "future", + "random", + GoalSelectionStrategy.FUTURE, + GoalSelectionStrategy.RANDOM, + GoalSelectionStrategy.EPISODE, + GoalSelectionStrategy.FINAL, + ], +) +def test_goal_strategy(goal_strategy): + """ + Test different goal strategies. + """ + env = BitFlippingEnv(continuous=True) + env = DummyVecEnv([lambda: env]) + + model = HER(SACPolicy, env, SAC, goal_strategy=goal_strategy) + model.learn(total_timesteps=50, callback=None) From f2b06450737ff1c5c37ef2d05b2bcc717400e13e Mon Sep 17 00:00:00 2001 From: Megan Klaiber Date: Tue, 21 Jul 2020 12:36:22 +0200 Subject: [PATCH 02/81] Updated test_her. --- tests/test_her.py | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/tests/test_her.py b/tests/test_her.py index 96e36c654e..a75eee9484 100644 --- a/tests/test_her.py +++ b/tests/test_her.py @@ -9,28 +9,29 @@ from stable_baselines3.her.her import HER, GoalSelectionStrategy from stable_baselines3.sac.policies import SACPolicy from stable_baselines3.td3.policies import TD3Policy -from stable_baselines3.td3.policies import CnnPolicy, MlpPolicy +from stable_baselines3.td3.policies import MlpPolicy -@pytest.mark.parametrize("model_class, policy, sde_support", [(SAC, SACPolicy, True), (TD3, TD3Policy, False), (DDPG, MlpPolicy, False)]) +@pytest.mark.parametrize("model_class, policy, sde_support", + [(SAC, SACPolicy, True), (TD3, TD3Policy, False), (DDPG, MlpPolicy, False)]) def test_her(model_class, policy, sde_support): """ Test Hindsight Experience Replay. 
""" - env = BitFlippingEnv(continuous=True) + env = BitFlippingEnv(n_bits=4, continuous=True) env = DummyVecEnv([lambda: env]) # Create action noise n_actions = env.action_space.shape[0] - action_noise = OrnsteinUhlenbeckActionNoise(np.zeros(n_actions,), 0.2 * np.ones((n_actions,))) + action_noise = OrnsteinUhlenbeckActionNoise(np.zeros(n_actions, ), 0.2 * np.ones((n_actions,))) model = HER( policy, env, model_class, n_goals=5, - goal_strategy="random", + goal_strategy="future", action_noise=action_noise, verbose=1, tau=0.05, @@ -43,7 +44,7 @@ def test_her(model_class, policy, sde_support): sde_support=sde_support ) - model.learn(total_timesteps=1, callback=None) + model.learn(total_timesteps=500, callback=None) # Evaluate the agent n_eval_episodes = 5 @@ -51,7 +52,7 @@ def test_her(model_class, policy, sde_support): episode_rewards = [] episode_reward = 0.0 - eval_env = BitFlippingEnv(continuous=True) + eval_env = BitFlippingEnv(n_bits=4, continuous=True) observation = eval_env.reset() @@ -66,7 +67,7 @@ def test_her(model_class, policy, sde_support): observation, reward, done, _ = eval_env.step(action) # Render the env - #eval_env.render() + # eval_env.render() episode_reward += reward @@ -77,9 +78,6 @@ def test_her(model_class, policy, sde_support): episode_reward = 0.0 eval_env.close() - print(f"Mean reward: {np.mean(episode_rewards)} +/- {np.std(episode_rewards)}") - - #assert np.mean(episode_rewards) > -50, "The environment is not solved" @pytest.mark.parametrize( From f7d5f88228128a4f9f4e56bdbe4d5dadefaabf95 Mon Sep 17 00:00:00 2001 From: Megan Klaiber Date: Thu, 23 Jul 2020 08:43:16 +0200 Subject: [PATCH 03/81] Added first version of online her sampling. Still problems with tensor dimensions. --- .../her/goal_selection_strategy.py | 31 ++++ stable_baselines3/her/her.py | 52 +++--- stable_baselines3/her/her_replay_buffer.py | 152 ++++++++++++++++++ tests/test_her.py | 13 +- 4 files changed, 210 insertions(+), 38 deletions(-) create mode 100644 stable_baselines3/her/goal_selection_strategy.py create mode 100644 stable_baselines3/her/her_replay_buffer.py diff --git a/stable_baselines3/her/goal_selection_strategy.py b/stable_baselines3/her/goal_selection_strategy.py new file mode 100644 index 0000000000..09f3bfda6c --- /dev/null +++ b/stable_baselines3/her/goal_selection_strategy.py @@ -0,0 +1,31 @@ +from enum import Enum + + +class GoalSelectionStrategy(Enum): + """ + The strategies for selecting new goals when + creating artificial transitions. 
+ """ + + # Select a goal that was achieved + # after the current step, in the same episode + FUTURE = 0 + # Select the goal that was achieved + # at the end of the episode + FINAL = 1 + # Select a goal that was achieved in the episode + EPISODE = 2 + # Select a goal that was achieved + # at some point in the training procedure + # (and that is present in the replay buffer) + RANDOM = 3 + + +# For convenience +# that way, we can use string to select a strategy +KEY_TO_GOAL_STRATEGY = { + "future": GoalSelectionStrategy.FUTURE, + "final": GoalSelectionStrategy.FINAL, + "episode": GoalSelectionStrategy.EPISODE, + "random": GoalSelectionStrategy.RANDOM, +} \ No newline at end of file diff --git a/stable_baselines3/her/her.py b/stable_baselines3/her/her.py index ae3683b404..c8c54e83b3 100644 --- a/stable_baselines3/her/her.py +++ b/stable_baselines3/her/her.py @@ -12,39 +12,11 @@ from stable_baselines3.common.policies import BasePolicy from stable_baselines3.common.type_aliases import GymEnv, MaybeCallback, RolloutReturn from stable_baselines3.common.vec_env import VecEnv +from stable_baselines3.her.goal_selection_strategy import GoalSelectionStrategy, KEY_TO_GOAL_STRATEGY +from stable_baselines3.her.her_replay_buffer import HerReplayBuffer from stable_baselines3.her.obs_wrapper import ObsWrapper -class GoalSelectionStrategy(Enum): - """ - The strategies for selecting new goals when - creating artificial transitions. - """ - - # Select a goal that was achieved - # after the current step, in the same episode - FUTURE = 0 - # Select the goal that was achieved - # at the end of the episode - FINAL = 1 - # Select a goal that was achieved in the episode - EPISODE = 2 - # Select a goal that was achieved - # at some point in the training procedure - # (and that is present in the replay buffer) - RANDOM = 3 - - -# For convenience -# that way, we can use string to select a strategy -KEY_TO_GOAL_STRATEGY = { - "future": GoalSelectionStrategy.FUTURE, - "final": GoalSelectionStrategy.FINAL, - "episode": GoalSelectionStrategy.EPISODE, - "random": GoalSelectionStrategy.RANDOM, -} - - class HER(OffPolicyAlgorithm): """ Hindsight Experience Replay (HER) @@ -55,6 +27,9 @@ class HER(OffPolicyAlgorithm): :param n_goals: (int) Number of sampled goals for replay. :param goal_strategy: (GoalSelectionStrategy or str) Strategy for sampling goals for replay. One of ['episode', 'final', 'future', 'random'] + :param online_sampling: (bool) Sample HER transitions online. + :her_ratio: (int) The ratio between HER replays and regular replays (e.g. 
k = 4 -> 4 times + as many HER replays as regular replays are used) :param learning_rate: (float or callable) learning rate for the optimizer, it can be a function of the current progress remaining (from 1 to 0) :param buffer_size: (int) size of the replay buffer @@ -100,6 +75,8 @@ def __init__( model: Type[OffPolicyAlgorithm], n_goals: int = 5, goal_strategy: Union[GoalSelectionStrategy, str] = "final", + online_sampling: bool = False, + her_ratio: int = 2, learning_rate: Union[float, Callable] = 3e-4, buffer_size: int = int(1e6), learning_starts: int = 100, @@ -114,7 +91,7 @@ def __init__( policy_kwargs: Dict[str, Any] = None, tensorboard_log: Optional[str] = None, verbose: int = 0, - device: Union[th.device, str] = "auto", + device: Union[th.device, str] = "cpu", support_multi_env: bool = False, create_eval_env: bool = False, monitor_wrapper: bool = True, @@ -179,6 +156,10 @@ def __init__( # model initialization self.model = model(env=self.env, **model_init_dict, **kwargs) + self.online_sampling = online_sampling + if self.online_sampling: + self.model.replay_buffer = HerReplayBuffer(self.env, buffer_size, self.goal_strategy, self.env.observation_space, self.env.action_space, device, self.n_envs, her_ratio) + # storage for transitions of current episode self.episode_storage = [] self.n_goals = n_goals @@ -330,8 +311,12 @@ def collect_rollouts( break if done: - # store episode in replay buffer - self.store_transitions() + + if self.online_sampling: + self.model.replay_buffer.add(self.episode_storage) + else: + # store episode in replay buffer + self.store_transitions() # clear storage for current episode self.episode_storage = [] @@ -369,6 +354,7 @@ def sample_goals(self, sample_idx: int) -> Union[np.ndarray, None]: elif self.goal_strategy == GoalSelectionStrategy.FUTURE: # replay with random state which comes from the same episode and was observed after current transition # we have no transition after last transition of episode + if (sample_idx + 1) < len(self.episode_storage): index = np.random.choice(np.arange(sample_idx + 1, len(self.episode_storage))) return self.episode_storage[index][0]["achieved_goal"] diff --git a/stable_baselines3/her/her_replay_buffer.py b/stable_baselines3/her/her_replay_buffer.py new file mode 100644 index 0000000000..208355340f --- /dev/null +++ b/stable_baselines3/her/her_replay_buffer.py @@ -0,0 +1,152 @@ +from typing import Union, Optional + +import numpy as np +import torch as th +from gym import spaces + +from stable_baselines3.common.buffers import BaseBuffer + +from stable_baselines3.common.type_aliases import ReplayBufferSamples +from stable_baselines3.common.vec_env import VecNormalize, VecEnv +from stable_baselines3.her.goal_selection_strategy import GoalSelectionStrategy + + +class HerReplayBuffer(BaseBuffer): + """ + Replay Buffer for online Hindsight Experience Replay (HER) + + :param env: (VecEnv) The training environment + :param buffer_size: (int) The size of the buffer measured in transitions. + :param goal_strategy: (GoalSelectionStrategy ) Strategy for sampling goals for replay. + One of ['episode', 'final', 'future', 'random'] + :param observation_space: (spaces.Space) Observation space + :param action_space: (spaces.Space) Action space + :param device: (Union[th.device, str]) PyTorch device + to which the values will be converted + :param n_envs: (int) Number of parallel environments + :param her_ratio: (int) The ratio between HER replays and regular replays (e.g. 
k = 4 -> 4 times + as many HER replays as regular replays are used) + """ + + def __init__(self, env: VecEnv, buffer_size: int, goal_strategy: GoalSelectionStrategy, + observation_space: spaces.Space, + action_space: spaces.Space, + device: Union[th.device, str] = "cpu", + n_envs: int = 1, her_ratio: int = 2): + + super(HerReplayBuffer, self).__init__(buffer_size, observation_space, action_space, device, n_envs) + + self.env = env + self.size = buffer_size + + # buffer with episodes + self.buffer = [] + self.goal_strategy = goal_strategy + self.her_ratio = 1 - (1. / (1 + her_ratio)) + + # memory management + # current size in episodes + self.current_size = 0 + self.n_transitions_stored = 0 + + def sample(self, batch_size: int, env: Optional[VecNormalize] = None) -> ReplayBufferSamples: + """Returns a dict {key: array(batch_size x shapes[key])} + """ + return self._sample_transitions(batch_size) + + def _sample_transitions(self, batch_size: int): + # batch size in transitions + + # Select which episodes and time steps to use. + episode_idxs = np.random.randint(0, self.current_size, batch_size) + buffer = np.array(self.buffer) + episode_lengths = np.array([len(ep) for ep in buffer[episode_idxs]]) + t_samples = np.array([np.random.choice(np.arange(ep_len)) for ep_len in episode_lengths]) + + transitions = np.array([buffer[ep][trans] for ep, trans in zip(episode_idxs, t_samples)]) + + her_idxs = np.where(np.random.uniform(size=batch_size) < self.her_ratio)[0] + her_episode_lenghts = episode_lengths[her_idxs] + + # get new goals with goal selection strategy + if self.goal_strategy == GoalSelectionStrategy.FINAL: + # replay with final state of current episode + last_transitions = buffer[episode_idxs[her_idxs]][:, -1][:, 0] + her_new_goals = [trans['achieved_goal'] for trans in last_transitions] + elif self.goal_strategy == GoalSelectionStrategy.FUTURE: + # replay with random state which comes from the same episode and was observed after current transition + # we have no transition after last transition of episode + her_new_goals = [] + for idx, length in zip(her_idxs, her_episode_lenghts): + if t_samples[idx] + 1 < length: + index = np.random.choice(np.arange(t_samples[idx] + 1, length)) + her_new_goals.append(buffer[episode_idxs[idx]][index][0]["achieved_goal"]) + else: + # delete index from her indices where we have no transition after current one + her_idxs = her_idxs[her_idxs != idx] + elif self.goal_strategy == GoalSelectionStrategy.EPISODE: + # replay with random state which comes from the same episode as current transition + index = np.array([np.random.choice(np.arange(ep_len)) for ep_len in her_episode_lenghts]) + episode_transitions = buffer[episode_idxs[her_idxs], index][:, 0] + her_new_goals = [trans['achieved_goal'] for trans in episode_transitions] + elif self.goal_strategy == GoalSelectionStrategy.RANDOM: + # replay with random state from the entire replay buffer + ep_idx = np.random.randint(0, self.current_size, len(her_idxs)) + state_idx = [np.random.choice(np.arange(len(ep))) for ep in buffer[ep_idx]] + random_transitions = buffer[ep_idx][state_idx][:, 0][:, 0] + her_new_goals = [trans['achieved_goal'] for trans in random_transitions] + else: + raise ValueError("Strategy for sampling goals not supported!") + + # assign new goals as desired_goals + for idx, goal in enumerate(her_new_goals): + transitions[her_idxs][:, 0][idx]["desired_goal"] = goal + + observations, actions, rewards, new_observations, dones = list(zip(*transitions)) + + # compute new reward with new goal + 
achieved_goals = [new_obs['achieved_goal'] for new_obs in np.array(new_observations)[her_idxs]] + new_rewards = np.array(rewards) + new_rewards[her_idxs] = [self.env.env_method("compute_reward", ag, her_new_goals, None) for ag, new_goal in zip(achieved_goals, her_new_goals)] + + # concatenate observation with (desired) goal + obs = [np.concatenate([o["observation"], o["desired_goal"]], axis=1) for o in observations] + new_obs = [np.concatenate([new_o["observation"], new_o["desired_goal"]], axis=1) for new_o in new_observations] + + data = (np.array(obs)[:,0,:], np.array(actions), np.array(new_obs)[:,0,:], np.array(dones, dtype=int), rewards) + + return ReplayBufferSamples(*tuple(map(self.to_torch, data))) + + def add(self, episode): + episode_length = len(episode) + + if self.n_transitions_stored + episode_length <= self.size: + self.buffer.append(episode) + # update replay size + self.current_size += 1 + self.n_transitions_stored += episode_length + elif self.full: + idx = np.random.randint(0, self.size) + + if len(self.buffer[idx]) == episode_length: + self.buffer[idx] = episode + elif len(self.buffer[idx]) > episode_length: + self.buffer[idx] = episode + self.n_transitions_stored -= (self.buffer[idx] - episode_length) + + if self.n_transitions_stored == self.size: + self.full = True + else: + self.full = False + + def get_current_episode_size(self): + return self.current_size + + def get_current_size(self): + return self.n_transitions_stored + + def get_transitions_stored(self): + return self.n_transitions_stored + + def clear_buffer(self): + self.buffer = [] diff --git a/tests/test_her.py b/tests/test_her.py index a75eee9484..311bd2595b 100644 --- a/tests/test_her.py +++ b/tests/test_her.py @@ -14,7 +14,8 @@ @pytest.mark.parametrize("model_class, policy, sde_support", [(SAC, SACPolicy, True), (TD3, TD3Policy, False), (DDPG, MlpPolicy, False)]) -def test_her(model_class, policy, sde_support): +@pytest.mark.parametrize("online_sampling", [True, False]) +def test_her(model_class, policy, sde_support, online_sampling): """ Test Hindsight Experience Replay. """ @@ -32,6 +33,7 @@ def test_her(model_class, policy, sde_support): model_class, n_goals=5, goal_strategy="future", + online_sampling=online_sampling, action_noise=action_noise, verbose=1, tau=0.05, @@ -91,14 +93,15 @@ def test_her(model_class, policy, sde_support): GoalSelectionStrategy.RANDOM, GoalSelectionStrategy.EPISODE, GoalSelectionStrategy.FINAL, - ], + ] ) -def test_goal_strategy(goal_strategy): +@pytest.mark.parametrize("online_sampling", [True, False]) +def test_goal_strategy(goal_strategy, online_sampling): """ Test different goal strategies. 
""" env = BitFlippingEnv(continuous=True) env = DummyVecEnv([lambda: env]) - model = HER(SACPolicy, env, SAC, goal_strategy=goal_strategy) - model.learn(total_timesteps=50, callback=None) + model = HER(SACPolicy, env, SAC, goal_strategy=goal_strategy, online_sampling=online_sampling) + model.learn(total_timesteps=200, callback=None) From 88771b8ec5765028d61c0781b870c5cdb7483e04 Mon Sep 17 00:00:00 2001 From: Antonin Raffin Date: Thu, 23 Jul 2020 11:05:35 +0200 Subject: [PATCH 04/81] Reformat --- .../her/goal_selection_strategy.py | 2 +- stable_baselines3/her/her.py | 13 +++++- stable_baselines3/her/her_replay_buffer.py | 46 ++++++++++++------- tests/test_her.py | 16 +++---- 4 files changed, 50 insertions(+), 27 deletions(-) diff --git a/stable_baselines3/her/goal_selection_strategy.py b/stable_baselines3/her/goal_selection_strategy.py index 09f3bfda6c..5f434be277 100644 --- a/stable_baselines3/her/goal_selection_strategy.py +++ b/stable_baselines3/her/goal_selection_strategy.py @@ -28,4 +28,4 @@ class GoalSelectionStrategy(Enum): "final": GoalSelectionStrategy.FINAL, "episode": GoalSelectionStrategy.EPISODE, "random": GoalSelectionStrategy.RANDOM, -} \ No newline at end of file +} diff --git a/stable_baselines3/her/her.py b/stable_baselines3/her/her.py index c8c54e83b3..01ae37dfa1 100644 --- a/stable_baselines3/her/her.py +++ b/stable_baselines3/her/her.py @@ -12,7 +12,7 @@ from stable_baselines3.common.policies import BasePolicy from stable_baselines3.common.type_aliases import GymEnv, MaybeCallback, RolloutReturn from stable_baselines3.common.vec_env import VecEnv -from stable_baselines3.her.goal_selection_strategy import GoalSelectionStrategy, KEY_TO_GOAL_STRATEGY +from stable_baselines3.her.goal_selection_strategy import KEY_TO_GOAL_STRATEGY, GoalSelectionStrategy from stable_baselines3.her.her_replay_buffer import HerReplayBuffer from stable_baselines3.her.obs_wrapper import ObsWrapper @@ -158,7 +158,16 @@ def __init__( self.online_sampling = online_sampling if self.online_sampling: - self.model.replay_buffer = HerReplayBuffer(self.env, buffer_size, self.goal_strategy, self.env.observation_space, self.env.action_space, device, self.n_envs, her_ratio) + self.model.replay_buffer = HerReplayBuffer( + self.env, + buffer_size, + self.goal_strategy, + self.env.observation_space, + self.env.action_space, + device, + self.n_envs, + her_ratio, + ) # storage for transitions of current episode self.episode_storage = [] diff --git a/stable_baselines3/her/her_replay_buffer.py b/stable_baselines3/her/her_replay_buffer.py index 208355340f..c6bd566104 100644 --- a/stable_baselines3/her/her_replay_buffer.py +++ b/stable_baselines3/her/her_replay_buffer.py @@ -1,13 +1,12 @@ -from typing import Union, Optional +from typing import Optional, Union import numpy as np import torch as th from gym import spaces from stable_baselines3.common.buffers import BaseBuffer - from stable_baselines3.common.type_aliases import ReplayBufferSamples -from stable_baselines3.common.vec_env import VecNormalize, VecEnv +from stable_baselines3.common.vec_env import VecEnv, VecNormalize from stable_baselines3.her.goal_selection_strategy import GoalSelectionStrategy @@ -28,11 +27,17 @@ class HerReplayBuffer(BaseBuffer): as many HER replays as regular replays are used) """ - def __init__(self, env: VecEnv, buffer_size: int, goal_strategy: GoalSelectionStrategy, - observation_space: spaces.Space, - action_space: spaces.Space, - device: Union[th.device, str] = "cpu", - n_envs: int = 1, her_ratio: int = 2): + def __init__( + self, + 
env: VecEnv, + buffer_size: int, + goal_strategy: GoalSelectionStrategy, + observation_space: spaces.Space, + action_space: spaces.Space, + device: Union[th.device, str] = "cpu", + n_envs: int = 1, + her_ratio: int = 2, + ): super(HerReplayBuffer, self).__init__(buffer_size, observation_space, action_space, device, n_envs) @@ -42,7 +47,7 @@ def __init__(self, env: VecEnv, buffer_size: int, goal_strategy: GoalSelectionSt # buffer with episodes self.buffer = [] self.goal_strategy = goal_strategy - self.her_ratio = 1 - (1. / (1 + her_ratio)) + self.her_ratio = 1 - (1.0 / (1 + her_ratio)) # memory management # current size in episodes @@ -72,7 +77,7 @@ def _sample_transitions(self, batch_size: int): if self.goal_strategy == GoalSelectionStrategy.FINAL: # replay with final state of current episode last_transitions = buffer[episode_idxs[her_idxs]][:, -1][:, 0] - her_new_goals = [trans['achieved_goal'] for trans in last_transitions] + her_new_goals = [trans["achieved_goal"] for trans in last_transitions] elif self.goal_strategy == GoalSelectionStrategy.FUTURE: # replay with random state which comes from the same episode and was observed after current transition # we have no transition after last transition of episode @@ -88,13 +93,13 @@ def _sample_transitions(self, batch_size: int): # replay with random state which comes from the same episode as current transition index = np.array([np.random.choice(np.arange(ep_len)) for ep_len in her_episode_lenghts]) episode_transitions = buffer[episode_idxs[her_idxs], index][:, 0] - her_new_goals = [trans['achieved_goal'] for trans in episode_transitions] + her_new_goals = [trans["achieved_goal"] for trans in episode_transitions] elif self.goal_strategy == GoalSelectionStrategy.RANDOM: # replay with random state from the entire replay buffer ep_idx = np.random.randint(0, self.current_size, len(her_idxs)) state_idx = [np.random.choice(np.arange(len(ep))) for ep in buffer[ep_idx]] random_transitions = buffer[ep_idx][state_idx][:, 0][:, 0] - her_new_goals = [trans['achieved_goal'] for trans in random_transitions] + her_new_goals = [trans["achieved_goal"] for trans in random_transitions] else: raise ValueError("Strategy for sampling goals not supported!") @@ -105,15 +110,24 @@ def _sample_transitions(self, batch_size: int): observations, actions, rewards, new_observations, dones = list(zip(*transitions)) # compute new reward with new goal - achieved_goals = [new_obs['achieved_goal'] for new_obs in np.array(new_observations)[her_idxs]] + achieved_goals = [new_obs["achieved_goal"] for new_obs in np.array(new_observations)[her_idxs]] new_rewards = np.array(rewards) - new_rewards[her_idxs] = [self.env.env_method("compute_reward", ag, her_new_goals, None) for ag, new_goal in zip(achieved_goals, her_new_goals)] + new_rewards[her_idxs] = [ + self.env.env_method("compute_reward", ag, her_new_goals, None) + for ag, new_goal in zip(achieved_goals, her_new_goals) + ] # concatenate observation with (desired) goal obs = [np.concatenate([o["observation"], o["desired_goal"]], axis=1) for o in observations] new_obs = [np.concatenate([new_o["observation"], new_o["desired_goal"]], axis=1) for new_o in new_observations] - data = (np.array(obs)[:,0,:], np.array(actions), np.array(new_obs)[:,0,:], np.array(dones, dtype=int), rewards) + data = ( + np.array(obs)[:, 0, :], + np.array(actions, dtype=np.float32), + np.array(new_obs)[:, 0, :], + np.array(dones, dtype=np.bool), + rewards, + ) return ReplayBufferSamples(*tuple(map(self.to_torch, data))) @@ -132,7 +146,7 @@ def add(self, 
episode): self.buffer[idx] = episode elif len(self.buffer[idx]) > episode_length: self.buffer[idx] = episode - self.n_transitions_stored -= (self.buffer[idx] - episode_length) + self.n_transitions_stored -= self.buffer[idx] - episode_length if self.n_transitions_stored == self.size: self.full = True diff --git a/tests/test_her.py b/tests/test_her.py index 311bd2595b..6430d348aa 100644 --- a/tests/test_her.py +++ b/tests/test_her.py @@ -2,18 +2,18 @@ import pytest import torch as th -from stable_baselines3 import TD3, SAC, DDPG +from stable_baselines3 import DDPG, SAC, TD3 from stable_baselines3.common.bit_flipping_env import BitFlippingEnv from stable_baselines3.common.noise import OrnsteinUhlenbeckActionNoise from stable_baselines3.common.vec_env import DummyVecEnv from stable_baselines3.her.her import HER, GoalSelectionStrategy from stable_baselines3.sac.policies import SACPolicy -from stable_baselines3.td3.policies import TD3Policy -from stable_baselines3.td3.policies import MlpPolicy +from stable_baselines3.td3.policies import MlpPolicy, TD3Policy -@pytest.mark.parametrize("model_class, policy, sde_support", - [(SAC, SACPolicy, True), (TD3, TD3Policy, False), (DDPG, MlpPolicy, False)]) +@pytest.mark.parametrize( + "model_class, policy, sde_support", [(SAC, SACPolicy, True), (TD3, TD3Policy, False), (DDPG, MlpPolicy, False)] +) @pytest.mark.parametrize("online_sampling", [True, False]) def test_her(model_class, policy, sde_support, online_sampling): """ @@ -25,7 +25,7 @@ def test_her(model_class, policy, sde_support, online_sampling): # Create action noise n_actions = env.action_space.shape[0] - action_noise = OrnsteinUhlenbeckActionNoise(np.zeros(n_actions, ), 0.2 * np.ones((n_actions,))) + action_noise = OrnsteinUhlenbeckActionNoise(np.zeros(n_actions,), 0.2 * np.ones((n_actions,))) model = HER( policy, @@ -43,7 +43,7 @@ def test_her(model_class, policy, sde_support, online_sampling): buffer_size=int(1e6), gamma=0.98, gradient_steps=40, - sde_support=sde_support + sde_support=sde_support, ) model.learn(total_timesteps=500, callback=None) @@ -93,7 +93,7 @@ def test_her(model_class, policy, sde_support, online_sampling): GoalSelectionStrategy.RANDOM, GoalSelectionStrategy.EPISODE, GoalSelectionStrategy.FINAL, - ] + ], ) @pytest.mark.parametrize("online_sampling", [True, False]) def test_goal_strategy(goal_strategy, online_sampling): From 2e436a29cbb6ef6b1ba76b98b8b22ef044f0ac77 Mon Sep 17 00:00:00 2001 From: Antonin Raffin Date: Thu, 23 Jul 2020 11:27:48 +0200 Subject: [PATCH 05/81] Fixed tests --- stable_baselines3/her/her_replay_buffer.py | 4 ++-- tests/test_her.py | 6 ++++-- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/stable_baselines3/her/her_replay_buffer.py b/stable_baselines3/her/her_replay_buffer.py index c6bd566104..0b3d64b080 100644 --- a/stable_baselines3/her/her_replay_buffer.py +++ b/stable_baselines3/her/her_replay_buffer.py @@ -123,9 +123,9 @@ def _sample_transitions(self, batch_size: int): data = ( np.array(obs)[:, 0, :], - np.array(actions, dtype=np.float32), + np.array(actions, dtype=self.action_space.dtype)[:, 0, :], np.array(new_obs)[:, 0, :], - np.array(dones, dtype=np.bool), + np.array(dones, dtype=np.int8), rewards, ) diff --git a/tests/test_her.py b/tests/test_her.py index 6430d348aa..fa14904068 100644 --- a/tests/test_her.py +++ b/tests/test_her.py @@ -39,10 +39,12 @@ def test_her(model_class, policy, sde_support, online_sampling): tau=0.05, batch_size=128, learning_rate=0.001, - policy_kwargs=dict(net_arch=[256]), + 
policy_kwargs=dict(net_arch=[64]), buffer_size=int(1e6), gamma=0.98, - gradient_steps=40, + gradient_steps=1, + train_freq=1, + n_episodes_rollout=-1, sde_support=sde_support, ) From c0a82fc142ab4feb069dddd39f027083a83732a1 Mon Sep 17 00:00:00 2001 From: Megan Klaiber Date: Thu, 23 Jul 2020 14:53:46 +0200 Subject: [PATCH 06/81] Added some comments. --- stable_baselines3/her/her.py | 6 ++-- stable_baselines3/her/her_replay_buffer.py | 39 ++++++++++++++++------ tests/test_her.py | 11 +++++- 3 files changed, 41 insertions(+), 15 deletions(-) diff --git a/stable_baselines3/her/her.py b/stable_baselines3/her/her.py index 01ae37dfa1..a486a8a7a4 100644 --- a/stable_baselines3/her/her.py +++ b/stable_baselines3/her/her.py @@ -1,4 +1,3 @@ -from enum import Enum from inspect import signature from typing import Any, Callable, Dict, Optional, Type, Union @@ -156,6 +155,7 @@ def __init__( # model initialization self.model = model(env=self.env, **model_init_dict, **kwargs) + # if we sample her transitions online use custom replay buffer self.online_sampling = online_sampling if self.online_sampling: self.model.replay_buffer = HerReplayBuffer( @@ -226,7 +226,7 @@ def collect_rollouts( n_steps: int = -1, action_noise: Optional[ActionNoise] = None, learning_starts: int = 0, - replay_buffer: Optional[ReplayBuffer] = None, + replay_buffer: Union[ReplayBuffer, HerReplayBuffer] = None, log_interval: Optional[int] = None, ) -> RolloutReturn: """ @@ -243,7 +243,7 @@ def collect_rollouts( Required for deterministic policy (e.g. TD3). This can also be used in addition to the stochastic policy for SAC. :param learning_starts: (int) Number of steps before learning for the warm-up phase. - :param replay_buffer: (ReplayBuffer) + :param replay_buffer: (ReplayBuffer or HerReplayBuffer) :param log_interval: (int) Log data every ``log_interval`` episodes :return: (RolloutReturn) """ diff --git a/stable_baselines3/her/her_replay_buffer.py b/stable_baselines3/her/her_replay_buffer.py index 0b3d64b080..2e611ae829 100644 --- a/stable_baselines3/her/her_replay_buffer.py +++ b/stable_baselines3/her/her_replay_buffer.py @@ -55,34 +55,44 @@ def __init__( self.n_transitions_stored = 0 def sample(self, batch_size: int, env: Optional[VecNormalize] = None) -> ReplayBufferSamples: - """Returns a dict {key: array(batch_size x shapes[key])} + """ + :param batch_size: (int) Number of element to sample + :param env: (Optional[VecNormalize]) associated gym VecEnv + to normalize the observations/rewards when sampling + :return: (ReplayBufferSamples) """ return self._sample_transitions(batch_size) - def _sample_transitions(self, batch_size: int): - # batch size in transitions + def _sample_transitions(self, batch_size: int) -> ReplayBufferSamples: + """ + :param batch_size: (int) Number of element to sample + :return: (ReplayBufferSamples) + """ # Select which episodes and time steps to use. 
episode_idxs = np.random.randint(0, self.current_size, batch_size) buffer = np.array(self.buffer) + # get episode lengths for selecting timesteps episode_lengths = np.array([len(ep) for ep in buffer[episode_idxs]]) + # select timesteps t_samples = np.array([np.random.choice(np.arange(ep_len)) for ep_len in episode_lengths]) - + # get selected timesteps transitions = np.array([buffer[ep][trans] for ep, trans in zip(episode_idxs, t_samples)]) - + # get her samples indices with her_ratio her_idxs = np.where(np.random.uniform(size=batch_size) < self.her_ratio)[0] + # her samples episode lengths her_episode_lenghts = episode_lengths[her_idxs] # get new goals with goal selection strategy if self.goal_strategy == GoalSelectionStrategy.FINAL: # replay with final state of current episode - last_transitions = buffer[episode_idxs[her_idxs]][:, -1][:, 0] + last_transitions = buffer[episode_idxs[her_idxs], -1][:, 0] her_new_goals = [trans["achieved_goal"] for trans in last_transitions] elif self.goal_strategy == GoalSelectionStrategy.FUTURE: # replay with random state which comes from the same episode and was observed after current transition - # we have no transition after last transition of episode her_new_goals = [] for idx, length in zip(her_idxs, her_episode_lenghts): + # we have no transition after last transition of episode if t_samples[idx] + 1 < length: index = np.random.choice(np.arange(t_samples[idx] + 1, length)) her_new_goals.append(buffer[episode_idxs[idx]][index][0]["achieved_goal"]) @@ -98,7 +108,7 @@ def _sample_transitions(self, batch_size: int): # replay with random state from the entire replay buffer ep_idx = np.random.randint(0, self.current_size, len(her_idxs)) state_idx = [np.random.choice(np.arange(len(ep))) for ep in buffer[ep_idx]] - random_transitions = buffer[ep_idx][state_idx][:, 0][:, 0] + random_transitions = buffer[ep_idx, state_idx][:, 0] her_new_goals = [trans["achieved_goal"] for trans in random_transitions] else: raise ValueError("Strategy for sampling goals not supported!") @@ -109,7 +119,7 @@ def _sample_transitions(self, batch_size: int): observations, actions, rewards, new_observations, dones = list(zip(*transitions)) - # compute new reward with new goal + # compute new rewards with new goal achieved_goals = [new_obs["achieved_goal"] for new_obs in np.array(new_observations)[her_idxs]] new_rewards = np.array(rewards) new_rewards[her_idxs] = [ @@ -126,21 +136,28 @@ def _sample_transitions(self, batch_size: int): np.array(actions, dtype=self.action_space.dtype)[:, 0, :], np.array(new_obs)[:, 0, :], np.array(dones, dtype=np.int8), - rewards, + new_rewards, ) return ReplayBufferSamples(*tuple(map(self.to_torch, data))) def add(self, episode): + """ + Add episode to replay buffer + + :param episode: (list) Episode to store. 
+ """ episode_length = len(episode) + # check if replay buffer has enough space for all transitions of episode if self.n_transitions_stored + episode_length <= self.size: self.buffer.append(episode) # update replay size self.current_size += 1 self.n_transitions_stored += episode_length elif self.full: - idx = np.random.randint(0, self.size) + # if replay buffer is full take random stored episode and replace it + idx = np.random.randint(0, self.current_size) if len(self.buffer[idx]) == episode_length: self.buffer[idx] = episode diff --git a/tests/test_her.py b/tests/test_her.py index fa14904068..2aae177154 100644 --- a/tests/test_her.py +++ b/tests/test_her.py @@ -105,5 +105,14 @@ def test_goal_strategy(goal_strategy, online_sampling): env = BitFlippingEnv(continuous=True) env = DummyVecEnv([lambda: env]) - model = HER(SACPolicy, env, SAC, goal_strategy=goal_strategy, online_sampling=online_sampling) + model = HER( + SACPolicy, + env, + SAC, + goal_strategy=goal_strategy, + online_sampling=online_sampling, + gradient_steps=1, + train_freq=1, + n_episodes_rollout=-1, + ) model.learn(total_timesteps=200, callback=None) From e6263b2dd0cd2d9f369d2061305037dc91a0c9ad Mon Sep 17 00:00:00 2001 From: Megan Klaiber Date: Thu, 23 Jul 2020 15:04:01 +0200 Subject: [PATCH 07/81] Updated changelog. --- docs/misc/changelog.rst | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/docs/misc/changelog.rst b/docs/misc/changelog.rst index 722e9e71e9..bfa0329eba 100644 --- a/docs/misc/changelog.rst +++ b/docs/misc/changelog.rst @@ -25,6 +25,7 @@ New Features: - Refactored opening paths for saving and loading to use strings, pathlib or io.BufferedIOBase (@PartiallyTyped) - Added ``DDPG`` algorithm as a special case of ``TD3``. - Introduced ``BaseModel`` abstract parent for ``BasePolicy``, which critics inherit from. +- Added Hindsight Experience Replay ``HER``. (@megan-klaiber) Bug Fixes: ^^^^^^^^^^ @@ -355,4 +356,4 @@ And all the contributors: @Miffyli @dwiel @miguelrass @qxcv @jaberkow @eavelardev @ruifeng96150 @pedrohbtp @srivatsankrishnan @evilsocket @MarvineGothic @jdossgollin @SyllogismRXS @rusu24edward @jbulow @Antymon @seheevic @justinkterry @edbeeching @flodorner @KuKuXia @NeoExtended @PartiallyTyped @mmcenta @richardwu @kinalmehta @rolandgvc @tkelestemur @mloo3 -@tirafesi @blurLake @koulakis @joeljosephjin +@tirafesi @blurLake @koulakis @joeljosephjin @megan-klaiber From 257b8fcebe3e6024fc94e530477e83f1c659c438 Mon Sep 17 00:00:00 2001 From: Antonin Raffin Date: Thu, 23 Jul 2020 15:13:53 +0200 Subject: [PATCH 08/81] Add missing init file --- stable_baselines3/her/__init__.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 stable_baselines3/her/__init__.py diff --git a/stable_baselines3/her/__init__.py b/stable_baselines3/her/__init__.py new file mode 100644 index 0000000000..e69de29bb2 From 90f6e2c071565dc50d384edbbf9bfe6e393399ac Mon Sep 17 00:00:00 2001 From: Megan Klaiber Date: Thu, 23 Jul 2020 16:32:48 +0200 Subject: [PATCH 09/81] Fixed some small bugs. 
--- stable_baselines3/her/her.py | 6 ++++-- stable_baselines3/her/her_replay_buffer.py | 24 +++++++++++++++------- 2 files changed, 21 insertions(+), 9 deletions(-) diff --git a/stable_baselines3/her/her.py b/stable_baselines3/her/her.py index a486a8a7a4..d8500b7ce9 100644 --- a/stable_baselines3/her/her.py +++ b/stable_baselines3/her/her.py @@ -320,9 +320,11 @@ def collect_rollouts( break if done: - if self.online_sampling: - self.model.replay_buffer.add(self.episode_storage) + observations, actions, rewards, next_observations, done = zip(*self.episode_storage) + self.model.replay_buffer.add(observations, next_observations, actions, rewards, done) + # self.model.replay_buffer.add(self.episode_storage) + else: # store episode in replay buffer self.store_transitions() diff --git a/stable_baselines3/her/her_replay_buffer.py b/stable_baselines3/her/her_replay_buffer.py index 2e611ae829..8c60286f76 100644 --- a/stable_baselines3/her/her_replay_buffer.py +++ b/stable_baselines3/her/her_replay_buffer.py @@ -42,10 +42,12 @@ def __init__( super(HerReplayBuffer, self).__init__(buffer_size, observation_space, action_space, device, n_envs) self.env = env - self.size = buffer_size + self.buffer_size = buffer_size # buffer with episodes self.buffer = [] + # TODO just for typing reason , need another solution + self.observations = np.zeros((self.buffer_size, self.n_envs,) + self.obs_shape, dtype=observation_space.dtype) self.goal_strategy = goal_strategy self.her_ratio = 1 - (1.0 / (1 + her_ratio)) @@ -117,10 +119,10 @@ def _sample_transitions(self, batch_size: int) -> ReplayBufferSamples: for idx, goal in enumerate(her_new_goals): transitions[her_idxs][:, 0][idx]["desired_goal"] = goal - observations, actions, rewards, new_observations, dones = list(zip(*transitions)) + observations, actions, rewards, next_observations, dones = list(zip(*transitions)) # compute new rewards with new goal - achieved_goals = [new_obs["achieved_goal"] for new_obs in np.array(new_observations)[her_idxs]] + achieved_goals = [new_obs["achieved_goal"] for new_obs in np.array(next_observations)[her_idxs]] new_rewards = np.array(rewards) new_rewards[her_idxs] = [ self.env.env_method("compute_reward", ag, her_new_goals, None) @@ -129,7 +131,7 @@ def _sample_transitions(self, batch_size: int) -> ReplayBufferSamples: # concatenate observation with (desired) goal obs = [np.concatenate([o["observation"], o["desired_goal"]], axis=1) for o in observations] - new_obs = [np.concatenate([new_o["observation"], new_o["desired_goal"]], axis=1) for new_o in new_observations] + new_obs = [np.concatenate([new_o["observation"], new_o["desired_goal"]], axis=1) for new_o in next_observations] data = ( np.array(obs)[:, 0, :], @@ -141,16 +143,24 @@ def _sample_transitions(self, batch_size: int) -> ReplayBufferSamples: return ReplayBufferSamples(*tuple(map(self.to_torch, data))) - def add(self, episode): + def add(self, obs: np.ndarray, next_obs: np.ndarray, action: np.ndarray, reward: np.ndarray, done: np.ndarray) -> None: """ Add episode to replay buffer + :param obs: + :param next_obs: + :param action: + :param reward: + :param done: + :param episode: (list) Episode to store. 
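Besides trimming the constructor down to the HER-specific arguments, this patch reworks ``ObsWrapper`` into a ``VecEnvWrapper`` that flattens the goal-based ``Dict`` observation space into a single ``Box`` by concatenating the ``observation`` and ``desired_goal`` sub-spaces. A minimal standalone illustration of that flattening (plain ``gym`` spaces with hypothetical shapes, no VecEnv involved):

    import numpy as np
    from gym import spaces

    # Hypothetical goal-based observation space, as a GoalEnv would expose it.
    dict_space = spaces.Dict(
        {
            "observation": spaces.Box(-1.0, 1.0, shape=(3,), dtype=np.float32),
            "achieved_goal": spaces.Box(-1.0, 1.0, shape=(2,), dtype=np.float32),
            "desired_goal": spaces.Box(-1.0, 1.0, shape=(2,), dtype=np.float32),
        }
    )

    # Flattened space: bounds are the concatenated bounds of the two sub-spaces.
    low = np.concatenate([dict_space.spaces["observation"].low, dict_space.spaces["desired_goal"].low])
    high = np.concatenate([dict_space.spaces["observation"].high, dict_space.spaces["desired_goal"].high])
    flat_space = spaces.Box(low, high, dtype=np.float32)

    # The same concatenation is applied to every observation before it reaches
    # the wrapped off-policy model (in the real code along axis=1, since VecEnv
    # observations carry a leading batch dimension).
    obs = dict_space.sample()
    flat_obs = np.concatenate([obs["observation"], obs["desired_goal"]])
    assert flat_space.contains(flat_obs)

The wrapper in this patch applies the same idea to ``MultiBinary`` and ``MultiDiscrete`` sub-spaces as well; only the ``Box`` case is sketched here.
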
""" + episode = list(zip(obs, action, reward, next_obs, done)) + episode_length = len(episode) # check if replay buffer has enough space for all transitions of episode - if self.n_transitions_stored + episode_length <= self.size: + if self.n_transitions_stored + episode_length <= self.size(): self.buffer.append(episode) # update replay size self.current_size += 1 @@ -165,7 +175,7 @@ def add(self, episode): self.buffer[idx] = episode self.n_transitions_stored -= self.buffer[idx] - episode_length - if self.n_transitions_stored == self.size: + if self.n_transitions_stored == self.size(): self.full = True else: self.full = False From 7b22e68936db76eb1b1741c0470545ead836fde1 Mon Sep 17 00:00:00 2001 From: Megan Klaiber Date: Wed, 29 Jul 2020 12:54:50 +0200 Subject: [PATCH 10/81] Reduced arguments for HER, small changes. --- stable_baselines3/her/her.py | 161 +++++---------------- stable_baselines3/her/her_replay_buffer.py | 44 +++--- stable_baselines3/her/obs_wrapper.py | 41 ++---- tests/test_her.py | 7 +- 4 files changed, 77 insertions(+), 176 deletions(-) diff --git a/stable_baselines3/her/her.py b/stable_baselines3/her/her.py index d8500b7ce9..89f586d09e 100644 --- a/stable_baselines3/her/her.py +++ b/stable_baselines3/her/her.py @@ -1,8 +1,7 @@ -from inspect import signature -from typing import Any, Callable, Dict, Optional, Type, Union +from typing import Callable, Optional, Type, Union +import gym import numpy as np -import torch as th from stable_baselines3.common.buffers import ReplayBuffer from stable_baselines3.common.callbacks import BaseCallback @@ -31,40 +30,6 @@ class HER(OffPolicyAlgorithm): as many HER replays as regular replays are used) :param learning_rate: (float or callable) learning rate for the optimizer, it can be a function of the current progress remaining (from 1 to 0) - :param buffer_size: (int) size of the replay buffer - :param learning_starts: (int) how many steps of the model to collect transitions for before learning starts - :param batch_size: (int) Minibatch size for each gradient update - :param tau: (float) the soft update coefficient ("Polyak update", between 0 and 1) - :param gamma: (float) the discount factor - :param train_freq: (int) Update the model every ``train_freq`` steps. - :param gradient_steps: (int) How many gradient update after each step - :param n_episodes_rollout: (int) Update the model every ``n_episodes_rollout`` episodes. - Note that this cannot be used at the same time as ``train_freq`` - :param action_noise: (ActionNoise) the action noise type (None by default), this can help - for hard exploration problem. Cf common.noise for the different action noise type. - :param optimize_memory_usage: (bool) Enable a memory efficient variant of the replay buffer - at a cost of more complexity. - See https://github.com/DLR-RM/stable-baselines3/issues/37#issuecomment-637501195 - :param policy_kwargs: Additional arguments to be passed to the policy on creation - :param tensorboard_log: (str) the log location for tensorboard (if None, no logging) - :param verbose: The verbosity level: 0 none, 1 training information, 2 debug - :param device: Device on which the code should run. - By default, it will try to use a Cuda compatible device and fallback to cpu - if it is not possible. - :param support_multi_env: Whether the algorithm supports training - with multiple environments (as in A2C) - :param create_eval_env: Whether to create a second environment that will be - used for evaluating the agent periodically. 
(Only available when passing string for the environment) - :param monitor_wrapper: When creating an environment, whether to wrap it - or not in a Monitor wrapper. - :param seed: Seed for the pseudo random generators - :param use_sde: Whether to use State Dependent Exploration (SDE) - instead of action noise exploration (default: False) - :param sde_sample_freq: Sample a new noise matrix every n steps when using gSDE - Default: -1 (only sample at the beginning of the rollout) - :param use_sde_at_warmup: (bool) Whether to use gSDE instead of uniform sampling - during the warm up phase (before learning starts) - :param sde_support: (bool) Whether the model support gSDE or not """ def __init__( @@ -73,104 +38,53 @@ def __init__( env: VecEnv, model: Type[OffPolicyAlgorithm], n_goals: int = 5, - goal_strategy: Union[GoalSelectionStrategy, str] = "final", + goal_strategy: Union[GoalSelectionStrategy, str] = "future", online_sampling: bool = False, her_ratio: int = 2, learning_rate: Union[float, Callable] = 3e-4, - buffer_size: int = int(1e6), - learning_starts: int = 100, - batch_size: int = 256, - tau: float = 0.005, - gamma: float = 0.99, - train_freq: int = 1, - gradient_steps: int = 1, - n_episodes_rollout: int = -1, - action_noise: Optional[ActionNoise] = None, - optimize_memory_usage: bool = False, - policy_kwargs: Dict[str, Any] = None, - tensorboard_log: Optional[str] = None, - verbose: int = 0, - device: Union[th.device, str] = "cpu", - support_multi_env: bool = False, - create_eval_env: bool = False, - monitor_wrapper: bool = True, - seed: Optional[int] = None, - use_sde: bool = False, - sde_sample_freq: int = -1, - use_sde_at_warmup: bool = False, - sde_support: bool = True, *args, - **kwargs + **kwargs, ): + self.env = env + # check if wrapper for dict support is needed + if isinstance(env.observation_space, gym.spaces.dict.Dict): + self.env = ObsWrapper(env) + + super(HER, self).__init__( + policy=BasePolicy, env=self.env, policy_base=BasePolicy, learning_rate=learning_rate, sde_support=False + ) + + # model initialization + self.model = model(policy=policy, env=self.env, learning_rate=learning_rate, *args, **kwargs) + + # convert goal_strategy into GoalSelectionStrategy if string if isinstance(goal_strategy, str): self.goal_strategy = KEY_TO_GOAL_STRATEGY[goal_strategy.lower()] else: self.goal_strategy = goal_strategy + # check if goal_strategy is valid assert isinstance( self.goal_strategy, GoalSelectionStrategy - ), "Invalid goal selection strategy," "please use one of {}".format(list(GoalSelectionStrategy)) - - self.env = ObsWrapper(env) - - # get arguments for the model initialization - model_signature = signature(model.__init__) - arguments = locals() - model_init_dict = { - key: arguments[key] - for key in model_signature.parameters.keys() - if key in arguments and key != "self" and key != "env" - } - - super(HER, self).__init__( - policy, - self.env, - BasePolicy, - learning_rate, - buffer_size, - learning_starts, - batch_size, - tau, - gamma, - train_freq, - gradient_steps, - n_episodes_rollout, - action_noise, - optimize_memory_usage, - policy_kwargs, - tensorboard_log, - verbose, - device, - support_multi_env, - create_eval_env, - monitor_wrapper, - seed, - use_sde, - sde_sample_freq, - use_sde_at_warmup, - sde_support, - ) - - # model initialization - self.model = model(env=self.env, **model_init_dict, **kwargs) + ), f"Invalid goal selection strategy, please use one of {list(GoalSelectionStrategy)}" # if we sample her transitions online use custom replay buffer 
self.online_sampling = online_sampling if self.online_sampling: self.model.replay_buffer = HerReplayBuffer( self.env, - buffer_size, + self.model.buffer_size, self.goal_strategy, self.env.observation_space, self.env.action_space, - device, + self.model.device, self.n_envs, her_ratio, ) # storage for transitions of current episode - self.episode_storage = [] + self.__episode_storage = [] self.n_goals = n_goals def learn( @@ -247,6 +161,7 @@ def collect_rollouts( :param log_interval: (int) Log data every ``log_interval`` episodes :return: (RolloutReturn) """ + episode_rewards, total_timesteps = [], [] total_steps, total_episodes = 0, 0 @@ -298,7 +213,7 @@ def collect_rollouts( self.model._last_original_obs, new_obs_, reward_ = observation, new_obs, reward # add current transition to episode storage - self.episode_storage.append((self.model._last_original_obs, buffer_action, reward_, new_obs_, done)) + self.__episode_storage.append((self.model._last_original_obs, buffer_action, reward_, new_obs_, done)) self.model._last_obs = new_obs # Save the unnormalized observation @@ -321,15 +236,15 @@ def collect_rollouts( if done: if self.online_sampling: - observations, actions, rewards, next_observations, done = zip(*self.episode_storage) + observations, actions, rewards, next_observations, done = zip(*self.__episode_storage) self.model.replay_buffer.add(observations, next_observations, actions, rewards, done) - # self.model.replay_buffer.add(self.episode_storage) + # self.model.replay_buffer.add(self.__episode_storage) else: # store episode in replay buffer - self.store_transitions() + self.__store_transitions() # clear storage for current episode - self.episode_storage = [] + self.__episode_storage = [] total_episodes += 1 self.model._episode_num += 1 @@ -361,35 +276,37 @@ def sample_goals(self, sample_idx: int) -> Union[np.ndarray, None]: """ if self.goal_strategy == GoalSelectionStrategy.FINAL: # replay with final state of current episode - return self.episode_storage[-1][0]["achieved_goal"] + return self.__episode_storage[-1][0]["achieved_goal"] elif self.goal_strategy == GoalSelectionStrategy.FUTURE: # replay with random state which comes from the same episode and was observed after current transition # we have no transition after last transition of episode - if (sample_idx + 1) < len(self.episode_storage): - index = np.random.choice(np.arange(sample_idx + 1, len(self.episode_storage))) - return self.episode_storage[index][0]["achieved_goal"] + if (sample_idx + 1) < len(self.__episode_storage): + index = np.random.choice(np.arange(sample_idx + 1, len(self.__episode_storage))) + return self.__episode_storage[index][0]["achieved_goal"] elif self.goal_strategy == GoalSelectionStrategy.EPISODE: # replay with random state which comes from the same episode as current transition - index = np.random.choice(np.arange(len(self.episode_storage))) - return self.episode_storage[index][0]["achieved_goal"] + index = np.random.choice(np.arange(len(self.__episode_storage))) + return self.__episode_storage[index][0]["achieved_goal"] elif self.goal_strategy == GoalSelectionStrategy.RANDOM: # replay with random state from the entire replay buffer index = np.random.choice(np.arange(self.model.replay_buffer.size())) obs = self.model.replay_buffer.observations[index] # get only the observation part - obs_array = obs[:, : self.env.obs_dim] + # TODO + obs_dim = self.env.observation_space.shape[0] // 2 + obs_array = obs[:, :obs_dim] return obs_array else: raise ValueError("Strategy for sampling goals not supported!") - 
def store_transitions(self) -> None: + def __store_transitions(self) -> None: """ Store current episode in replay buffer. Sample additional goals and store new transitions in replay buffer. """ # iterate over current episodes transitions - for idx, trans in enumerate(self.episode_storage): + for idx, trans in enumerate(self.__episode_storage): observation, action, reward, new_observation, done = trans diff --git a/stable_baselines3/her/her_replay_buffer.py b/stable_baselines3/her/her_replay_buffer.py index 8c60286f76..7282530ce7 100644 --- a/stable_baselines3/her/her_replay_buffer.py +++ b/stable_baselines3/her/her_replay_buffer.py @@ -52,9 +52,8 @@ def __init__( self.her_ratio = 1 - (1.0 / (1 + her_ratio)) # memory management - # current size in episodes - self.current_size = 0 - self.n_transitions_stored = 0 + self.__n_episodes_stored = 0 + self.__n_transitions_stored = 0 def sample(self, batch_size: int, env: Optional[VecNormalize] = None) -> ReplayBufferSamples: """ @@ -72,7 +71,7 @@ def _sample_transitions(self, batch_size: int) -> ReplayBufferSamples: """ # Select which episodes and time steps to use. - episode_idxs = np.random.randint(0, self.current_size, batch_size) + episode_idxs = np.random.randint(0, self.n_episodes_stored, batch_size) buffer = np.array(self.buffer) # get episode lengths for selecting timesteps episode_lengths = np.array([len(ep) for ep in buffer[episode_idxs]]) @@ -108,7 +107,7 @@ def _sample_transitions(self, batch_size: int) -> ReplayBufferSamples: her_new_goals = [trans["achieved_goal"] for trans in episode_transitions] elif self.goal_strategy == GoalSelectionStrategy.RANDOM: # replay with random state from the entire replay buffer - ep_idx = np.random.randint(0, self.current_size, len(her_idxs)) + ep_idx = np.random.randint(0, self.n_episodes_stored, len(her_idxs)) state_idx = [np.random.choice(np.arange(len(ep))) for ep in buffer[ep_idx]] random_transitions = buffer[ep_idx, state_idx][:, 0] her_new_goals = [trans["achieved_goal"] for trans in random_transitions] @@ -125,13 +124,15 @@ def _sample_transitions(self, batch_size: int) -> ReplayBufferSamples: achieved_goals = [new_obs["achieved_goal"] for new_obs in np.array(next_observations)[her_idxs]] new_rewards = np.array(rewards) new_rewards[her_idxs] = [ - self.env.env_method("compute_reward", ag, her_new_goals, None) - for ag, new_goal in zip(achieved_goals, her_new_goals) + self.env.env_method("compute_reward", achieved_goal, her_new_goals, None) + for achieved_goal, new_goal in zip(achieved_goals, her_new_goals) ] # concatenate observation with (desired) goal - obs = [np.concatenate([o["observation"], o["desired_goal"]], axis=1) for o in observations] - new_obs = [np.concatenate([new_o["observation"], new_o["desired_goal"]], axis=1) for new_o in next_observations] + obs = [np.concatenate([obs_["observation"], obs_["desired_goal"]], axis=1) for obs_ in observations] + new_obs = [ + np.concatenate([new_obs_["observation"], new_obs_["desired_goal"]], axis=1) for new_obs_ in next_observations + ] data = ( np.array(obs)[:, 0, :], @@ -163,11 +164,11 @@ def add(self, obs: np.ndarray, next_obs: np.ndarray, action: np.ndarray, reward: if self.n_transitions_stored + episode_length <= self.size(): self.buffer.append(episode) # update replay size - self.current_size += 1 + self.n_episodes_stored += 1 self.n_transitions_stored += episode_length elif self.full: # if replay buffer is full take random stored episode and replace it - idx = np.random.randint(0, self.current_size) + idx = np.random.randint(0, 
self.n_episodes_stored) if len(self.buffer[idx]) == episode_length: self.buffer[idx] = episode @@ -180,14 +181,23 @@ def add(self, obs: np.ndarray, next_obs: np.ndarray, action: np.ndarray, reward: else: self.full = False - def get_current_episode_size(self): - return self.current_size + @property + def n_episodes_stored(self): + return self.__n_episodes_stored - def get_current_size(self): - return self.n_transitions_stored + @n_episodes_stored.setter + def n_episodes_stored(self, n): + self.__n_episodes_stored = n - def get_transitions_stored(self): - return self.n_transitions_stored + @property + def n_transitions_stored(self): + return self.__n_transitions_stored + + @n_transitions_stored.setter + def n_transitions_stored(self, n): + self.__n_transitions_stored = n def clear_buffer(self): self.buffer = [] + self.n_episodes_stored = 0 + self.n_transitions_stored = 0 diff --git a/stable_baselines3/her/obs_wrapper.py b/stable_baselines3/her/obs_wrapper.py index e59f40f939..1a909968c0 100644 --- a/stable_baselines3/her/obs_wrapper.py +++ b/stable_baselines3/her/obs_wrapper.py @@ -1,12 +1,12 @@ -from typing import List, Optional, Sequence, Union +from typing import Union, Tuple import numpy as np from gym import spaces -from stable_baselines3.common.vec_env import VecEnv +from stable_baselines3.common.vec_env import VecEnv, VecEnvWrapper -class ObsWrapper(VecEnv): +class ObsWrapper(VecEnvWrapper): """ Wrapper for a VecEnv which overrides the observation space for Hindsight Experience Replay to support dict observations. @@ -14,9 +14,7 @@ class ObsWrapper(VecEnv): """ def __init__(self, venv: VecEnv): - super(ObsWrapper, self).__init__( - num_envs=venv.num_envs, observation_space=venv.observation_space, action_space=venv.action_space - ) + super(ObsWrapper, self).__init__(venv, venv.observation_space, venv.action_space) self.venv = venv @@ -35,10 +33,10 @@ def __init__(self, venv: VecEnv): # for the different types of spaces if isinstance(self.spaces[0], spaces.Box): low_values = np.concatenate( - [venv.observation_space["observation"].low, venv.observation_space["desired_goal"].low] + [venv.observation_space.spaces["observation"].low, venv.observation_space.spaces["desired_goal"].low] ) high_values = np.concatenate( - [venv.observation_space["observation"].high, venv.observation_space["desired_goal"].high] + [venv.observation_space.spaces["observation"].high, venv.observation_space.spaces["desired_goal"].high] ) self.observation_space = spaces.Box(low_values, high_values, dtype=np.float32) elif isinstance(self.spaces[0], spaces.MultiBinary): @@ -48,31 +46,10 @@ def __init__(self, venv: VecEnv): dimensions = [venv.observation_space.spaces["observation"].n, venv.observation_space.spaces["desired_goal"].n] self.observation_space = spaces.MultiDiscrete(dimensions) else: - raise NotImplementedError("{} space is not supported".format(type(self.spaces[0]))) + raise NotImplementedError(f"{type(self.spaces[0])} space is not supported") - def reset(self): + def reset(self) -> Union[int, float]: return self.venv.reset() - def step_async(self, actions): - self.venv.step_async(actions) - - def step_wait(self): + def step_wait(self) -> Tuple[Union[int, float], float, bool, dict]: return self.venv.step_wait() - - def close(self): - return self.venv.close() - - def get_attr(self, attr_name, indices=None): - return self.venv.get_attr(attr_name, indices) - - def set_attr(self, attr_name, value, indices=None): - return self.venv.set_attr(attr_name, value, indices) - - def env_method(self, method_name, 
*method_args, indices=None, **method_kwargs): - return self.venv.env_method(method_name, *method_args, indices=indices, **method_kwargs) - - def get_images(self) -> Sequence[np.ndarray]: - return self.venv.get_images() - - def seed(self, seed: Optional[int] = None) -> List[Union[None, int]]: - return self.venv.seed(seed) diff --git a/tests/test_her.py b/tests/test_her.py index 2aae177154..4a4531bca7 100644 --- a/tests/test_her.py +++ b/tests/test_her.py @@ -11,11 +11,9 @@ from stable_baselines3.td3.policies import MlpPolicy, TD3Policy -@pytest.mark.parametrize( - "model_class, policy, sde_support", [(SAC, SACPolicy, True), (TD3, TD3Policy, False), (DDPG, MlpPolicy, False)] -) +@pytest.mark.parametrize("model_class, policy", [(SAC, SACPolicy), (TD3, TD3Policy), (DDPG, MlpPolicy)]) @pytest.mark.parametrize("online_sampling", [True, False]) -def test_her(model_class, policy, sde_support, online_sampling): +def test_her(model_class, policy, online_sampling): """ Test Hindsight Experience Replay. """ @@ -45,7 +43,6 @@ def test_her(model_class, policy, sde_support, online_sampling): gradient_steps=1, train_freq=1, n_episodes_rollout=-1, - sde_support=sde_support, ) model.learn(total_timesteps=500, callback=None) From 501b1c47cd02a0eef96c721ad8b4f5b95f5d8933 Mon Sep 17 00:00:00 2001 From: Megan Klaiber Date: Mon, 3 Aug 2020 16:09:51 +0200 Subject: [PATCH 11/81] Added getattr. Fixed bug for online sampling. --- stable_baselines3/her/her.py | 132 +++++++++++++-------- stable_baselines3/her/her_replay_buffer.py | 30 +++-- 2 files changed, 101 insertions(+), 61 deletions(-) diff --git a/stable_baselines3/her/her.py b/stable_baselines3/her/her.py index 89f586d09e..a655adb304 100644 --- a/stable_baselines3/her/her.py +++ b/stable_baselines3/her/her.py @@ -1,7 +1,7 @@ from typing import Callable, Optional, Type, Union -import gym import numpy as np +from stable_baselines3.common.base_class import BaseAlgorithm from stable_baselines3.common.buffers import ReplayBuffer from stable_baselines3.common.callbacks import BaseCallback @@ -9,13 +9,28 @@ from stable_baselines3.common.off_policy_algorithm import OffPolicyAlgorithm from stable_baselines3.common.policies import BasePolicy from stable_baselines3.common.type_aliases import GymEnv, MaybeCallback, RolloutReturn -from stable_baselines3.common.vec_env import VecEnv +from stable_baselines3.common.vec_env import VecEnv, VecEnvWrapper from stable_baselines3.her.goal_selection_strategy import KEY_TO_GOAL_STRATEGY, GoalSelectionStrategy from stable_baselines3.her.her_replay_buffer import HerReplayBuffer from stable_baselines3.her.obs_wrapper import ObsWrapper -class HER(OffPolicyAlgorithm): +def check_wrapped_env(env: VecEnv) -> VecEnv: + """ + Check if the environment is already wrapped by an ObsWrapper. + + :param env: (VecEnv) Environment to check. 
+ :return: (VecEnv) env + """ + env_tmp = env + while isinstance(env_tmp, VecEnvWrapper): + if isinstance(env_tmp, ObsWrapper): + return env + env_tmp = env_tmp.venv + return ObsWrapper(env) + + +class HER(BaseAlgorithm): """ Hindsight Experience Replay (HER) @@ -46,18 +61,17 @@ def __init__( **kwargs, ): - self.env = env # check if wrapper for dict support is needed - if isinstance(env.observation_space, gym.spaces.dict.Dict): - self.env = ObsWrapper(env) + self.env = check_wrapped_env(env) - super(HER, self).__init__( - policy=BasePolicy, env=self.env, policy_base=BasePolicy, learning_rate=learning_rate, sde_support=False - ) + super(HER, self).__init__(policy=BasePolicy, env=self.env, policy_base=BasePolicy, learning_rate=learning_rate) # model initialization self.model = model(policy=policy, env=self.env, learning_rate=learning_rate, *args, **kwargs) + self.verbose = self.model.verbose + self.tensorboard_log = self.model.tensorboard_log + # convert goal_strategy into GoalSelectionStrategy if string if isinstance(goal_strategy, str): self.goal_strategy = KEY_TO_GOAL_STRATEGY[goal_strategy.lower()] @@ -74,11 +88,11 @@ def __init__( if self.online_sampling: self.model.replay_buffer = HerReplayBuffer( self.env, - self.model.buffer_size, + self.buffer_size, self.goal_strategy, self.env.observation_space, self.env.action_space, - self.model.device, + self.device, self.n_envs, her_ratio, ) @@ -98,31 +112,37 @@ def learn( tb_log_name: str = "run", eval_log_path: Optional[str] = None, reset_num_timesteps: bool = True, - ) -> "OffPolicyAlgorithm": + ) -> BaseAlgorithm: - total_timesteps, callback = self.model._setup_learn( + total_timesteps, callback = self._setup_learn( total_timesteps, eval_env, callback, eval_freq, n_eval_episodes, eval_log_path, reset_num_timesteps, tb_log_name ) + self.model.start_time = self.start_time + self.model.ep_info_buffer = self.ep_info_buffer + self.model.ep_success_buffer = self.ep_success_buffer + self.model.num_timesteps = self.num_timesteps + self.model._episode_num = self._episode_num + self.model._last_obs = self._last_obs callback.on_training_start(locals(), globals()) - while self.model.num_timesteps < total_timesteps: + while self.num_timesteps < total_timesteps: rollout = self.collect_rollouts( self.env, - n_episodes=self.model.n_episodes_rollout, - n_steps=self.model.train_freq, - action_noise=self.model.action_noise, + n_episodes=self.n_episodes_rollout, + n_steps=self.train_freq, + action_noise=self.action_noise, callback=callback, - learning_starts=self.model.learning_starts, - replay_buffer=self.model.replay_buffer, + learning_starts=self.learning_starts, + replay_buffer=self.replay_buffer, log_interval=log_interval, ) if rollout.continue_training is False: break - if self.model.num_timesteps > 0 and self.model.num_timesteps > self.model.learning_starts: + if self.num_timesteps > 0 and self.num_timesteps > self.learning_starts: # If no `gradient_steps` is specified, # do as many gradients steps as steps performed during the rollout gradient_steps = self.gradient_steps if self.gradient_steps > 0 else rollout.episode_timesteps @@ -132,6 +152,15 @@ def learn( return self + def _setup_model(self) -> None: + self.model._setup_model() + + def __getattr__(self, item): + if hasattr(self.model, item): + return getattr(self.model, item) + else: + raise AttributeError + def collect_rollouts( self, env: VecEnv, @@ -169,7 +198,7 @@ def collect_rollouts( assert env.num_envs == 1, "OffPolicyAlgorithm only support single environment" if self.use_sde: - 
self.model.actor.reset_noise() + self.actor.reset_noise() callback.on_rollout_start() continue_training = True @@ -180,15 +209,16 @@ def collect_rollouts( while not done: # concatenate observation and (desired) goal - observation = self.model._last_obs - self.model._last_obs = np.concatenate([observation["observation"], observation["desired_goal"]], axis=1) + observation = self._last_obs + self._last_obs = np.concatenate([observation["observation"], observation["desired_goal"]], axis=1) if self.use_sde and self.sde_sample_freq > 0 and total_steps % self.sde_sample_freq == 0: # Sample a new noise matrix - self.model.actor.reset_noise() + self.actor.reset_noise() # Select action randomly or according to policy - action, buffer_action = self.model._sample_action(learning_starts, action_noise) + self.model._last_obs = self._last_obs + action, buffer_action = self._sample_action(learning_starts, action_noise) # Rescale and perform action new_obs, reward, done, infos = env.step(action) @@ -200,36 +230,42 @@ def collect_rollouts( episode_reward += reward # Retrieve reward and episode length if using Monitor wrapper - self.model._update_info_buffer(infos, done) + self._update_info_buffer(infos, done) + self.model.ep_info_buffer = self.ep_info_buffer + self.model.ep_success_buffer = self.ep_success_buffer # Store episode in episode storage if replay_buffer is not None: # Store only the unnormalized version - if self.model._vec_normalize_env is not None: - new_obs_ = self.model._vec_normalize_env.get_original_obs() - reward_ = self.model._vec_normalize_env.get_original_reward() + if self._vec_normalize_env is not None: + new_obs_ = self._vec_normalize_env.get_original_obs() + reward_ = self._vec_normalize_env.get_original_reward() else: # Avoid changing the original ones - self.model._last_original_obs, new_obs_, reward_ = observation, new_obs, reward + self._last_original_obs, new_obs_, reward_ = observation, new_obs, reward + self.model._last_original_obs = self._last_original_obs # add current transition to episode storage - self.__episode_storage.append((self.model._last_original_obs, buffer_action, reward_, new_obs_, done)) + self.__episode_storage.append((self._last_original_obs, buffer_action, reward_, new_obs_, done)) - self.model._last_obs = new_obs + self._last_obs = new_obs + self.model._last_obs = self._last_obs # Save the unnormalized observation - if self.model._vec_normalize_env is not None: - self.model._last_original_obs = new_obs_ + if self._vec_normalize_env is not None: + self._last_original_obs = new_obs_ + self.model._last_original_obs = self._last_original_obs - self.model.num_timesteps += 1 + self.num_timesteps += 1 + self.model.num_timesteps = self.num_timesteps episode_timesteps += 1 total_steps += 1 - self.model._update_current_progress_remaining(self.model.num_timesteps, self.model._total_timesteps) + self._update_current_progress_remaining(self.num_timesteps, self._total_timesteps) # For DQN, check if the target network should be updated # and update the exploration schedule # For SAC/TD3, the update is done as the same time as the gradient update # see https://github.com/hill-a/stable-baselines/issues/900 - self.model._on_step() + self._on_step() if 0 < n_steps <= total_steps: break @@ -237,8 +273,8 @@ def collect_rollouts( if done: if self.online_sampling: observations, actions, rewards, next_observations, done = zip(*self.__episode_storage) - self.model.replay_buffer.add(observations, next_observations, actions, rewards, done) - # 
self.model.replay_buffer.add(self.__episode_storage) + self.replay_buffer.add(observations, next_observations, actions, rewards, done) + # self.replay_buffer.add(self.__episode_storage) else: # store episode in replay buffer @@ -247,7 +283,8 @@ def collect_rollouts( self.__episode_storage = [] total_episodes += 1 - self.model._episode_num += 1 + self._episode_num += 1 + self.model._episode_num = self._episode_num episode_rewards.append(episode_reward) total_timesteps.append(episode_timesteps) @@ -255,8 +292,8 @@ def collect_rollouts( action_noise.reset() # Log training infos - if log_interval is not None and self.model._episode_num % log_interval == 0: - self.model._dump_logs() + if log_interval is not None and self._episode_num % log_interval == 0: + self._dump_logs() mean_reward = np.mean(episode_rewards) if total_episodes > 0 else 0.0 @@ -264,9 +301,6 @@ def collect_rollouts( return RolloutReturn(mean_reward, total_steps, total_episodes, continue_training) - def train(self, gradient_steps: int, batch_size: int) -> None: - self.model.train(gradient_steps=gradient_steps, batch_size=batch_size) - def sample_goals(self, sample_idx: int) -> Union[np.ndarray, None]: """ Sample a goal based on goal_strategy. @@ -290,8 +324,8 @@ def sample_goals(self, sample_idx: int) -> Union[np.ndarray, None]: return self.__episode_storage[index][0]["achieved_goal"] elif self.goal_strategy == GoalSelectionStrategy.RANDOM: # replay with random state from the entire replay buffer - index = np.random.choice(np.arange(self.model.replay_buffer.size())) - obs = self.model.replay_buffer.observations[index] + index = np.random.choice(np.arange(self.replay_buffer.size())) + obs = self.replay_buffer.observations[index] # get only the observation part # TODO obs_dim = self.env.observation_space.shape[0] // 2 @@ -315,7 +349,7 @@ def __store_transitions(self) -> None: new_obs = np.concatenate([new_observation["observation"], new_observation["desired_goal"]], axis=1) # store data in replay buffer - self.model.replay_buffer.add(obs, new_obs, action, reward, done) + self.replay_buffer.add(obs, new_obs, action, reward, done) # sample set of additional goals sampled_goals = [sample for sample in (self.sample_goals(idx) for i in range(self.n_goals)) if sample is not None] @@ -330,4 +364,4 @@ def __store_transitions(self) -> None: new_obs = np.concatenate([new_observation["observation"], goal], axis=1) # store data in replay buffer - self.model.replay_buffer.add(obs, new_obs, action, new_reward, done) + self.replay_buffer.add(obs, new_obs, action, new_reward, done) diff --git a/stable_baselines3/her/her_replay_buffer.py b/stable_baselines3/her/her_replay_buffer.py index 7282530ce7..62ae3df272 100644 --- a/stable_baselines3/her/her_replay_buffer.py +++ b/stable_baselines3/her/her_replay_buffer.py @@ -69,16 +69,15 @@ def _sample_transitions(self, batch_size: int) -> ReplayBufferSamples: :param batch_size: (int) Number of element to sample :return: (ReplayBufferSamples) """ - # Select which episodes and time steps to use. 
episode_idxs = np.random.randint(0, self.n_episodes_stored, batch_size) - buffer = np.array(self.buffer) + buffer = np.array(self.buffer, dtype=object) # get episode lengths for selecting timesteps episode_lengths = np.array([len(ep) for ep in buffer[episode_idxs]]) # select timesteps t_samples = np.array([np.random.choice(np.arange(ep_len)) for ep_len in episode_lengths]) # get selected timesteps - transitions = np.array([buffer[ep][trans] for ep, trans in zip(episode_idxs, t_samples)]) + transitions = np.array([buffer[ep][trans] for ep, trans in zip(episode_idxs, t_samples)], dtype=object) # get her samples indices with her_ratio her_idxs = np.where(np.random.uniform(size=batch_size) < self.her_ratio)[0] # her samples episode lengths @@ -87,7 +86,8 @@ def _sample_transitions(self, batch_size: int) -> ReplayBufferSamples: # get new goals with goal selection strategy if self.goal_strategy == GoalSelectionStrategy.FINAL: # replay with final state of current episode - last_transitions = buffer[episode_idxs[her_idxs], -1][:, 0] + last_transitions = [episode[-1][0] for episode in buffer[episode_idxs[her_idxs]]] + # last_transitions = buffer[episode_idxs[her_idxs], -1][:, 0] her_new_goals = [trans["achieved_goal"] for trans in last_transitions] elif self.goal_strategy == GoalSelectionStrategy.FUTURE: # replay with random state which comes from the same episode and was observed after current transition @@ -103,13 +103,15 @@ def _sample_transitions(self, batch_size: int) -> ReplayBufferSamples: elif self.goal_strategy == GoalSelectionStrategy.EPISODE: # replay with random state which comes from the same episode as current transition index = np.array([np.random.choice(np.arange(ep_len)) for ep_len in her_episode_lenghts]) - episode_transitions = buffer[episode_idxs[her_idxs], index][:, 0] + episode_transitions = [buffer[episode_idxs[her_idx]][idx][0] for idx, her_idx in zip(index, her_idxs)] + # episode_transitions = buffer[episode_idxs[her_idxs], index][:, 0] her_new_goals = [trans["achieved_goal"] for trans in episode_transitions] elif self.goal_strategy == GoalSelectionStrategy.RANDOM: # replay with random state from the entire replay buffer ep_idx = np.random.randint(0, self.n_episodes_stored, len(her_idxs)) - state_idx = [np.random.choice(np.arange(len(ep))) for ep in buffer[ep_idx]] - random_transitions = buffer[ep_idx, state_idx][:, 0] + state_idx = np.array([np.random.choice(np.arange(len(ep))) for ep in buffer[ep_idx]]) + random_transitions = [episode[state][0] for episode, state in zip(buffer[ep_idx], state_idx)] + # random_transitions = buffer[ep_idx, state_idx][:, 0] her_new_goals = [trans["achieved_goal"] for trans in random_transitions] else: raise ValueError("Strategy for sampling goals not supported!") @@ -161,7 +163,7 @@ def add(self, obs: np.ndarray, next_obs: np.ndarray, action: np.ndarray, reward: episode_length = len(episode) # check if replay buffer has enough space for all transitions of episode - if self.n_transitions_stored + episode_length <= self.size(): + if self.n_transitions_stored + episode_length <= self.buffer_size: self.buffer.append(episode) # update replay size self.n_episodes_stored += 1 @@ -174,12 +176,10 @@ def add(self, obs: np.ndarray, next_obs: np.ndarray, action: np.ndarray, reward: self.buffer[idx] = episode elif len(self.buffer[idx]) > episode_length: self.buffer[idx] = episode - self.n_transitions_stored -= self.buffer[idx] - episode_length + self.n_transitions_stored -= len(self.buffer[idx]) - episode_length - if self.n_transitions_stored == 
self.size(): + if self.n_transitions_stored == self.buffer_size: self.full = True - else: - self.full = False @property def n_episodes_stored(self): @@ -201,3 +201,9 @@ def clear_buffer(self): self.buffer = [] self.n_episodes_stored = 0 self.n_transitions_stored = 0 + + def size(self) -> int: + """ + :return: (int) The current size of the buffer in transitions. + """ + return self.n_transitions_stored From 5d096195ede97c3892c587d176e5549c1501a894 Mon Sep 17 00:00:00 2001 From: Megan Klaiber Date: Thu, 6 Aug 2020 02:03:38 +0200 Subject: [PATCH 12/81] Updated save/load funtions. Small changes. --- stable_baselines3/her/her.py | 213 +++++++++++++++++---- stable_baselines3/her/her_replay_buffer.py | 24 +-- stable_baselines3/her/obs_wrapper.py | 5 +- tests/test_her.py | 92 ++++++++- 4 files changed, 283 insertions(+), 51 deletions(-) diff --git a/stable_baselines3/her/her.py b/stable_baselines3/her/her.py index a655adb304..89f4ed312d 100644 --- a/stable_baselines3/her/her.py +++ b/stable_baselines3/her/her.py @@ -1,14 +1,18 @@ -from typing import Callable, Optional, Type, Union +import io +import pathlib +from typing import Callable, Iterable, List, Optional, Tuple, Type, Union import numpy as np -from stable_baselines3.common.base_class import BaseAlgorithm +from stable_baselines3.common.base_class import BaseAlgorithm from stable_baselines3.common.buffers import ReplayBuffer from stable_baselines3.common.callbacks import BaseCallback from stable_baselines3.common.noise import ActionNoise from stable_baselines3.common.off_policy_algorithm import OffPolicyAlgorithm from stable_baselines3.common.policies import BasePolicy +from stable_baselines3.common.save_util import load_from_zip_file, recursive_getattr, recursive_setattr, save_to_zip_file from stable_baselines3.common.type_aliases import GymEnv, MaybeCallback, RolloutReturn +from stable_baselines3.common.utils import check_for_correct_spaces from stable_baselines3.common.vec_env import VecEnv, VecEnvWrapper from stable_baselines3.her.goal_selection_strategy import KEY_TO_GOAL_STRATEGY, GoalSelectionStrategy from stable_baselines3.her.her_replay_buffer import HerReplayBuffer @@ -36,7 +40,7 @@ class HER(BaseAlgorithm): :param policy: (BasePolicy) The policy model to use. :param env: (VecEnv) The environment to learn from. - :param model: (OffPolicyAlgorithm) Off policy model which will be used with hindsight experience replay. (SAC, TD3) + :param model_class: (OffPolicyAlgorithm) Off policy model which will be used with hindsight experience replay. (SAC, TD3) :param n_goals: (int) Number of sampled goals for replay. :param goal_strategy: (GoalSelectionStrategy or str) Strategy for sampling goals for replay. 
One of ['episode', 'final', 'future', 'random'] @@ -51,7 +55,7 @@ def __init__( self, policy: Type[BasePolicy], env: VecEnv, - model: Type[OffPolicyAlgorithm], + model_class: Type[OffPolicyAlgorithm], n_goals: int = 5, goal_strategy: Union[GoalSelectionStrategy, str] = "future", online_sampling: bool = False, @@ -67,7 +71,10 @@ def __init__( super(HER, self).__init__(policy=BasePolicy, env=self.env, policy_base=BasePolicy, learning_rate=learning_rate) # model initialization - self.model = model(policy=policy, env=self.env, learning_rate=learning_rate, *args, **kwargs) + self.model_class = model_class + self.model = model_class( + policy=policy, env=self.env, learning_rate=learning_rate, *args, **kwargs, # pytype: disable=wrong-keyword-args + ) self.verbose = self.model.verbose self.tensorboard_log = self.model.tensorboard_log @@ -85,6 +92,7 @@ def __init__( # if we sample her transitions online use custom replay buffer self.online_sampling = online_sampling + self.her_ratio = her_ratio if self.online_sampling: self.model.replay_buffer = HerReplayBuffer( self.env, @@ -94,13 +102,16 @@ def __init__( self.env.action_space, self.device, self.n_envs, - her_ratio, + self.her_ratio, ) # storage for transitions of current episode - self.__episode_storage = [] + self._episode_storage = [] self.n_goals = n_goals + def _setup_model(self) -> None: + self.model._setup_model() + def learn( self, total_timesteps: int, @@ -152,15 +163,6 @@ def learn( return self - def _setup_model(self) -> None: - self.model._setup_model() - - def __getattr__(self, item): - if hasattr(self.model, item): - return getattr(self.model, item) - else: - raise AttributeError - def collect_rollouts( self, env: VecEnv, @@ -246,7 +248,7 @@ def collect_rollouts( self.model._last_original_obs = self._last_original_obs # add current transition to episode storage - self.__episode_storage.append((self._last_original_obs, buffer_action, reward_, new_obs_, done)) + self._episode_storage.append((self._last_original_obs, buffer_action, reward_, new_obs_, done)) self._last_obs = new_obs self.model._last_obs = self._last_obs @@ -272,15 +274,15 @@ def collect_rollouts( if done: if self.online_sampling: - observations, actions, rewards, next_observations, done = zip(*self.__episode_storage) + observations, actions, rewards, next_observations, done = zip(*self._episode_storage) self.replay_buffer.add(observations, next_observations, actions, rewards, done) - # self.replay_buffer.add(self.__episode_storage) + # self.replay_buffer.add(self._episode_storage) else: # store episode in replay buffer - self.__store_transitions() + self._store_transitions() # clear storage for current episode - self.__episode_storage = [] + self._episode_storage = [] total_episodes += 1 self._episode_num += 1 @@ -301,46 +303,45 @@ def collect_rollouts( return RolloutReturn(mean_reward, total_steps, total_episodes, continue_training) - def sample_goals(self, sample_idx: int) -> Union[np.ndarray, None]: + def sample_goals(self, sample_idx: int, obs_dim: int) -> Union[np.ndarray, None]: """ Sample a goal based on goal_strategy. :param sample_idx: (int) Index of current transition. + :param obs_dim: (int) Dimension of real observation without goal. It is needed for the random strategy. :return: (np.ndarray or None) Return sampled goal. 
""" if self.goal_strategy == GoalSelectionStrategy.FINAL: # replay with final state of current episode - return self.__episode_storage[-1][0]["achieved_goal"] + return self._episode_storage[-1][0]["achieved_goal"] elif self.goal_strategy == GoalSelectionStrategy.FUTURE: # replay with random state which comes from the same episode and was observed after current transition # we have no transition after last transition of episode - if (sample_idx + 1) < len(self.__episode_storage): - index = np.random.choice(np.arange(sample_idx + 1, len(self.__episode_storage))) - return self.__episode_storage[index][0]["achieved_goal"] + if (sample_idx + 1) < len(self._episode_storage): + index = np.random.choice(np.arange(sample_idx + 1, len(self._episode_storage))) + return self._episode_storage[index][0]["achieved_goal"] elif self.goal_strategy == GoalSelectionStrategy.EPISODE: # replay with random state which comes from the same episode as current transition - index = np.random.choice(np.arange(len(self.__episode_storage))) - return self.__episode_storage[index][0]["achieved_goal"] + index = np.random.choice(np.arange(len(self._episode_storage))) + return self._episode_storage[index][0]["achieved_goal"] elif self.goal_strategy == GoalSelectionStrategy.RANDOM: # replay with random state from the entire replay buffer index = np.random.choice(np.arange(self.replay_buffer.size())) obs = self.replay_buffer.observations[index] # get only the observation part - # TODO - obs_dim = self.env.observation_space.shape[0] // 2 obs_array = obs[:, :obs_dim] return obs_array else: raise ValueError("Strategy for sampling goals not supported!") - def __store_transitions(self) -> None: + def _store_transitions(self) -> None: """ Store current episode in replay buffer. Sample additional goals and store new transitions in replay buffer. """ # iterate over current episodes transitions - for idx, trans in enumerate(self.__episode_storage): + for idx, trans in enumerate(self._episode_storage): observation, action, reward, new_observation, done = trans @@ -352,7 +353,10 @@ def __store_transitions(self) -> None: self.replay_buffer.add(obs, new_obs, action, reward, done) # sample set of additional goals - sampled_goals = [sample for sample in (self.sample_goals(idx) for i in range(self.n_goals)) if sample is not None] + obs_dim = observation["observation"].shape[1] + sampled_goals = [ + sample for sample in (self.sample_goals(idx, obs_dim) for i in range(self.n_goals)) if sample is not None + ] # iterate over sampled goals and store new transitions in replay buffer for goal in sampled_goals: @@ -365,3 +369,146 @@ def __store_transitions(self) -> None: # store data in replay buffer self.replay_buffer.add(obs, new_obs, action, new_reward, done) + + def __getattr__(self, item): + """ + Find attribute from model class if this class does not have it. + """ + if hasattr(self.model, item): + return getattr(self.model, item) + else: + raise AttributeError + + def get_torch_variables(self) -> Tuple[List[str], List[str]]: + return self.model.get_torch_variables() + + def save( + self, + path: Union[str, pathlib.Path, io.BufferedIOBase], + exclude: Optional[Iterable[str]] = None, + include: Optional[Iterable[str]] = None, + ) -> None: + """ + Save all the attributes of the object and the model parameters in a zip-file. 
+ + :param path: (Union[str, pathlib.Path, io.BufferedIOBase]) path to the file where the rl agent should be saved + :param exclude: name of parameters that should be excluded in addition to the default one + :param include: name of parameters that might be excluded but should be included anyway + """ + # copy parameter list so we don't mutate the original dict + data = self.__dict__.copy() + # add model parameter + data["model_dict"] = self.model.__dict__.copy() + + # Exclude is union of specified parameters (if any) and standard exclusions + if exclude is None: + exclude = [] + exclude = set(exclude).union(self.excluded_save_params()) + exclude.add("model") + + # Do not exclude params if they are specifically included + if include is not None: + exclude = exclude.difference(include) + + state_dicts_names, tensors_names = self.get_torch_variables() + # any params that are in the save vars must not be saved by data + torch_variables = state_dicts_names + tensors_names + for torch_var in torch_variables: + # we need to get only the name of the top most module as we'll remove that + var_name = torch_var.split(".")[0] + exclude.add(var_name) + + # Remove parameter entries of parameters which are to be excluded + for param_name in exclude: + data.pop(param_name, None) + data["model_dict"].pop(param_name, None) + + # Build dict of tensor variables + tensors = None + if tensors_names is not None: + tensors = {} + for name in tensors_names: + attr = recursive_getattr(self, name) + tensors[name] = attr + + # Build dict of state_dicts + params_to_save = {} + for name in state_dicts_names: + # always take attribute from model class if possible + if hasattr(self.model, name): + attr = recursive_getattr(self.model, name) + else: + attr = recursive_getattr(self, name) + # Retrieve state dict + params_to_save[name] = attr.state_dict() + + save_to_zip_file(path, data=data, params=params_to_save, tensors=tensors) + + @classmethod + def load(cls, load_path: str, env: Optional[GymEnv] = None, **kwargs) -> "BaseAlgorithm": + """ + Load the model from a zip-file + + :param load_path: the location of the saved data + :param env: the new environment to run the loaded model on + (can be None if you only need prediction from a trained model) has priority over any saved environment + :param kwargs: extra arguments to change the model when loading + """ + data, params, tensors = load_from_zip_file(load_path) + + if "policy_kwargs" in data: + for arg_to_remove in ["device"]: + if arg_to_remove in data["policy_kwargs"]: + del data["policy_kwargs"][arg_to_remove] + + if "policy_kwargs" in kwargs and kwargs["policy_kwargs"] != data["policy_kwargs"]: + raise ValueError( + f"The specified policy kwargs do not equal the stored policy kwargs." 
+ f"Stored kwargs: {data['policy_kwargs']}, specified kwargs: {kwargs['policy_kwargs']}" + ) + + # check if observation space and action space are part of the saved parameters + if "observation_space" not in data or "action_space" not in data: + raise KeyError("The observation_space and action_space were not given, can't verify new environments") + # check if given env is valid + if env is not None: + env = check_wrapped_env(env) + check_for_correct_spaces(env, data["observation_space"], data["action_space"]) + # if no new env was given use stored env if possible + if env is None and "env" in data: + env = data["env"] + + # noinspection PyArgumentList + model = cls( + policy=data["model_dict"]["policy_class"], + env=env, + model_class=data["model_class"], + n_goals=data["n_goals"], + goal_strategy=data["goal_strategy"], + online_sampling=data["online_sampling"], + her_ratio=data["her_ratio"], + learning_rate=data["learning_rate"], + policy_kwargs=data["model_dict"]["policy_kwargs"], + _init_setup_model=True, # pytype: disable=not-instantiable,wrong-keyword-args + ) + + # load parameters + model.__dict__.update(data) + model.model.__dict__.update(data["model_dict"]) + model.__dict__.update(kwargs) + + # put state_dicts back in place + for name in params: + attr = recursive_getattr(model.model, name) + attr.load_state_dict(params[name]) + + # put tensors back in place + if tensors is not None: + for name in tensors: + recursive_setattr(model.model, name, tensors[name]) + + # Sample gSDE exploration matrix, so it uses the right device + # see issue #44 + if model.model.use_sde: + model.model.policy.reset_noise() # pytype: disable=attribute-error + return model diff --git a/stable_baselines3/her/her_replay_buffer.py b/stable_baselines3/her/her_replay_buffer.py index 62ae3df272..4fa3f0882b 100644 --- a/stable_baselines3/her/her_replay_buffer.py +++ b/stable_baselines3/her/her_replay_buffer.py @@ -46,14 +46,13 @@ def __init__( # buffer with episodes self.buffer = [] - # TODO just for typing reason , need another solution - self.observations = np.zeros((self.buffer_size, self.n_envs,) + self.obs_shape, dtype=observation_space.dtype) self.goal_strategy = goal_strategy - self.her_ratio = 1 - (1.0 / (1 + her_ratio)) + # probability for selecting her indices + self.her_prob = 1 - (1.0 / (1 + her_ratio)) # memory management - self.__n_episodes_stored = 0 - self.__n_transitions_stored = 0 + self._n_episodes_stored = 0 + self._n_transitions_stored = 0 def sample(self, batch_size: int, env: Optional[VecNormalize] = None) -> ReplayBufferSamples: """ @@ -78,8 +77,8 @@ def _sample_transitions(self, batch_size: int) -> ReplayBufferSamples: t_samples = np.array([np.random.choice(np.arange(ep_len)) for ep_len in episode_lengths]) # get selected timesteps transitions = np.array([buffer[ep][trans] for ep, trans in zip(episode_idxs, t_samples)], dtype=object) - # get her samples indices with her_ratio - her_idxs = np.where(np.random.uniform(size=batch_size) < self.her_ratio)[0] + # get her samples indices with her_prob + her_idxs = np.where(np.random.uniform(size=batch_size) < self.her_prob)[0] # her samples episode lengths her_episode_lenghts = episode_lengths[her_idxs] @@ -87,7 +86,6 @@ def _sample_transitions(self, batch_size: int) -> ReplayBufferSamples: if self.goal_strategy == GoalSelectionStrategy.FINAL: # replay with final state of current episode last_transitions = [episode[-1][0] for episode in buffer[episode_idxs[her_idxs]]] - # last_transitions = buffer[episode_idxs[her_idxs], -1][:, 0] her_new_goals 
= [trans["achieved_goal"] for trans in last_transitions] elif self.goal_strategy == GoalSelectionStrategy.FUTURE: # replay with random state which comes from the same episode and was observed after current transition @@ -104,14 +102,12 @@ def _sample_transitions(self, batch_size: int) -> ReplayBufferSamples: # replay with random state which comes from the same episode as current transition index = np.array([np.random.choice(np.arange(ep_len)) for ep_len in her_episode_lenghts]) episode_transitions = [buffer[episode_idxs[her_idx]][idx][0] for idx, her_idx in zip(index, her_idxs)] - # episode_transitions = buffer[episode_idxs[her_idxs], index][:, 0] her_new_goals = [trans["achieved_goal"] for trans in episode_transitions] elif self.goal_strategy == GoalSelectionStrategy.RANDOM: # replay with random state from the entire replay buffer ep_idx = np.random.randint(0, self.n_episodes_stored, len(her_idxs)) state_idx = np.array([np.random.choice(np.arange(len(ep))) for ep in buffer[ep_idx]]) random_transitions = [episode[state][0] for episode, state in zip(buffer[ep_idx], state_idx)] - # random_transitions = buffer[ep_idx, state_idx][:, 0] her_new_goals = [trans["achieved_goal"] for trans in random_transitions] else: raise ValueError("Strategy for sampling goals not supported!") @@ -183,19 +179,19 @@ def add(self, obs: np.ndarray, next_obs: np.ndarray, action: np.ndarray, reward: @property def n_episodes_stored(self): - return self.__n_episodes_stored + return self._n_episodes_stored @n_episodes_stored.setter def n_episodes_stored(self, n): - self.__n_episodes_stored = n + self._n_episodes_stored = n @property def n_transitions_stored(self): - return self.__n_transitions_stored + return self._n_transitions_stored @n_transitions_stored.setter def n_transitions_stored(self, n): - self.__n_transitions_stored = n + self._n_transitions_stored = n def clear_buffer(self): self.buffer = [] diff --git a/stable_baselines3/her/obs_wrapper.py b/stable_baselines3/her/obs_wrapper.py index 1a909968c0..8eb619e47c 100644 --- a/stable_baselines3/her/obs_wrapper.py +++ b/stable_baselines3/her/obs_wrapper.py @@ -1,4 +1,4 @@ -from typing import Union, Tuple +from typing import Tuple, Union import numpy as np from gym import spaces @@ -25,9 +25,8 @@ def __init__(self, venv: VecEnv): self.obs_dim = 1 self.goal_dim = 1 else: - goal_space_shape = venv.observation_space.spaces["achieved_goal"].shape self.obs_dim = venv.observation_space.spaces["observation"].shape[0] - self.goal_dim = goal_space_shape[0] + self.goal_dim = venv.observation_space.spaces["achieved_goal"].shape[0] # new observation space with concatenated observation and (desired) goal # for the different types of spaces diff --git a/tests/test_her.py b/tests/test_her.py index 4a4531bca7..37bad828ab 100644 --- a/tests/test_her.py +++ b/tests/test_her.py @@ -1,3 +1,6 @@ +import os +from copy import deepcopy + import numpy as np import pytest import torch as th @@ -33,7 +36,7 @@ def test_her(model_class, policy, online_sampling): goal_strategy="future", online_sampling=online_sampling, action_noise=action_noise, - verbose=1, + verbose=0, tau=0.05, batch_size=128, learning_rate=0.001, @@ -113,3 +116,90 @@ def test_goal_strategy(goal_strategy, online_sampling): n_episodes_rollout=-1, ) model.learn(total_timesteps=200, callback=None) + + +@pytest.mark.parametrize("model_class, policy", [(SAC, SACPolicy), (TD3, TD3Policy), (DDPG, MlpPolicy)]) +def test_save_load(tmp_path, model_class, policy): + """ + Test if 'save' and 'load' saves and loads model correctly + """ + 
env = BitFlippingEnv(n_bits=4, continuous=True) + env = DummyVecEnv([lambda: env]) + + # Create action noise + n_actions = env.action_space.shape[0] + action_noise = OrnsteinUhlenbeckActionNoise(np.zeros(n_actions,), 0.2 * np.ones((n_actions,))) + + # create model + model = HER( + policy, + env, + model_class, + n_goals=5, + goal_strategy="future", + online_sampling=True, + action_noise=action_noise, + verbose=0, + tau=0.05, + batch_size=128, + learning_rate=0.001, + policy_kwargs=dict(net_arch=[64]), + buffer_size=int(1e6), + gamma=0.98, + gradient_steps=1, + train_freq=1, + n_episodes_rollout=-1, + ) + + model.learn(total_timesteps=500, callback=None) + + env.reset() + + observations_list = [] + for _ in range(10): + obs = env.step([env.action_space.sample()])[0] + observation = np.concatenate([obs["observation"], obs["desired_goal"]], axis=1) + observations_list.append(observation) + + observations = np.concatenate(observations_list, axis=0) + + # Get dictionary of current parameters + params = deepcopy(model.model.policy.state_dict()) + + # Modify all parameters to be random values + random_params = dict((param_name, th.rand_like(param)) for param_name, param in params.items()) + + # Update model parameters with the new random values + model.model.policy.load_state_dict(random_params) + + new_params = model.model.policy.state_dict() + # Check that all params are different now + for k in params: + assert not th.allclose(params[k], new_params[k]), "Parameters did not change as expected." + + params = new_params + + # get selected actions + selected_actions, _ = model.model.predict(observations, deterministic=True) + + # Check + model.save(tmp_path / "test_save.zip") + del model + model = HER.load(str(tmp_path / "test_save.zip"), env=env) + + # check if params are still the same after load + new_params = model.model.policy.state_dict() + + # Check that all params are the same as before save load procedure now + for key in params: + assert th.allclose(params[key], new_params[key]), "Model parameters not the same after save and load." + + # check if model still selects the same actions + new_selected_actions, _ = model.model.predict(observations, deterministic=True) + assert np.allclose(selected_actions, new_selected_actions, 1e-4) + + # check if learn still works + model.learn(total_timesteps=1000, eval_freq=500) + + # clear file from os + os.remove(tmp_path / "test_save.zip") From cb9026fe8da8fcec3dc2a7ad584da04e0c24a02b Mon Sep 17 00:00:00 2001 From: Megan Klaiber Date: Thu, 6 Aug 2020 11:46:06 +0200 Subject: [PATCH 13/81] Added her to init. 
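With the package ``__init__`` updated below, ``HER`` becomes importable from the top level of ``stable_baselines3``. A minimal usage sketch, mirroring the settings exercised in ``tests/test_her.py`` (the import paths are assumptions based on the library layout at this point in the series):

    from stable_baselines3 import HER, SAC
    from stable_baselines3.common.bit_flipping_env import BitFlippingEnv
    from stable_baselines3.common.vec_env import DummyVecEnv
    from stable_baselines3.sac.policies import SACPolicy

    env = DummyVecEnv([lambda: BitFlippingEnv(n_bits=4, continuous=True)])

    model = HER(
        SACPolicy,
        env,
        SAC,  # off-policy model class wrapped by HER
        n_goals=5,
        goal_strategy="future",
        online_sampling=False,
        # remaining keyword arguments are forwarded to the wrapped SAC model
        gradient_steps=1,
        train_freq=1,
        n_episodes_rollout=-1,
    )
    model.learn(total_timesteps=200)
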
--- stable_baselines3/__init__.py | 1 + stable_baselines3/her/__init__.py | 1 + 2 files changed, 2 insertions(+) diff --git a/stable_baselines3/__init__.py b/stable_baselines3/__init__.py index b88ca5d4ca..bcac479de6 100644 --- a/stable_baselines3/__init__.py +++ b/stable_baselines3/__init__.py @@ -3,6 +3,7 @@ from stable_baselines3.a2c import A2C from stable_baselines3.ddpg import DDPG from stable_baselines3.dqn import DQN +from stable_baselines3.her import HER from stable_baselines3.ppo import PPO from stable_baselines3.sac import SAC from stable_baselines3.td3 import TD3 diff --git a/stable_baselines3/her/__init__.py b/stable_baselines3/her/__init__.py index e69de29bb2..4e29bce4a5 100644 --- a/stable_baselines3/her/__init__.py +++ b/stable_baselines3/her/__init__.py @@ -0,0 +1 @@ +from stable_baselines3.her.her import HER \ No newline at end of file From e30f730540202cbf426e4f7ec9cf46886c1e9b8f Mon Sep 17 00:00:00 2001 From: Megan Klaiber Date: Fri, 7 Aug 2020 10:02:17 +0200 Subject: [PATCH 14/81] Updated save method. --- stable_baselines3/her/__init__.py | 2 +- stable_baselines3/her/her.py | 87 ++++++++++--------------------- 2 files changed, 28 insertions(+), 61 deletions(-) diff --git a/stable_baselines3/her/__init__.py b/stable_baselines3/her/__init__.py index 4e29bce4a5..ce43bf04cf 100644 --- a/stable_baselines3/her/__init__.py +++ b/stable_baselines3/her/__init__.py @@ -1 +1 @@ -from stable_baselines3.her.her import HER \ No newline at end of file +from stable_baselines3.her.her import HER diff --git a/stable_baselines3/her/her.py b/stable_baselines3/her/her.py index 89f4ed312d..e0eb93bc4f 100644 --- a/stable_baselines3/her/her.py +++ b/stable_baselines3/her/her.py @@ -10,7 +10,7 @@ from stable_baselines3.common.noise import ActionNoise from stable_baselines3.common.off_policy_algorithm import OffPolicyAlgorithm from stable_baselines3.common.policies import BasePolicy -from stable_baselines3.common.save_util import load_from_zip_file, recursive_getattr, recursive_setattr, save_to_zip_file +from stable_baselines3.common.save_util import load_from_zip_file, recursive_getattr, recursive_setattr from stable_baselines3.common.type_aliases import GymEnv, MaybeCallback, RolloutReturn from stable_baselines3.common.utils import check_for_correct_spaces from stable_baselines3.common.vec_env import VecEnv, VecEnvWrapper @@ -125,6 +125,8 @@ def learn( reset_num_timesteps: bool = True, ) -> BaseAlgorithm: + eval_env = check_wrapped_env(eval_env) if eval_env is not None else eval_env + total_timesteps, callback = self._setup_learn( total_timesteps, eval_env, callback, eval_freq, n_eval_episodes, eval_log_path, reset_num_timesteps, tb_log_name ) @@ -134,6 +136,7 @@ def learn( self.model.num_timesteps = self.num_timesteps self.model._episode_num = self._episode_num self.model._last_obs = self._last_obs + self.model._total_timesteps = self._total_timesteps callback.on_training_start(locals(), globals()) @@ -395,54 +398,15 @@ def save( :param exclude: name of parameters that should be excluded in addition to the default one :param include: name of parameters that might be excluded but should be included anyway """ - # copy parameter list so we don't mutate the original dict - data = self.__dict__.copy() - # add model parameter - data["model_dict"] = self.model.__dict__.copy() - - # Exclude is union of specified parameters (if any) and standard exclusions - if exclude is None: - exclude = [] - exclude = set(exclude).union(self.excluded_save_params()) - exclude.add("model") - - # Do not exclude 
params if they are specifically included - if include is not None: - exclude = exclude.difference(include) - - state_dicts_names, tensors_names = self.get_torch_variables() - # any params that are in the save vars must not be saved by data - torch_variables = state_dicts_names + tensors_names - for torch_var in torch_variables: - # we need to get only the name of the top most module as we'll remove that - var_name = torch_var.split(".")[0] - exclude.add(var_name) - - # Remove parameter entries of parameters which are to be excluded - for param_name in exclude: - data.pop(param_name, None) - data["model_dict"].pop(param_name, None) - - # Build dict of tensor variables - tensors = None - if tensors_names is not None: - tensors = {} - for name in tensors_names: - attr = recursive_getattr(self, name) - tensors[name] = attr - - # Build dict of state_dicts - params_to_save = {} - for name in state_dicts_names: - # always take attribute from model class if possible - if hasattr(self.model, name): - attr = recursive_getattr(self.model, name) - else: - attr = recursive_getattr(self, name) - # Retrieve state dict - params_to_save[name] = attr.state_dict() - - save_to_zip_file(path, data=data, params=params_to_save, tensors=tensors) + + # add HER parameters to model + self.model.n_goals = self.n_goals + self.model.her_ratio = self.her_ratio + self.model.goal_strategy = self.goal_strategy + self.model.online_sampling = self.online_sampling + self.model.model_class = self.model_class + + self.model.save(path, exclude, include) @classmethod def load(cls, load_path: str, env: Optional[GymEnv] = None, **kwargs) -> "BaseAlgorithm": @@ -479,8 +443,8 @@ def load(cls, load_path: str, env: Optional[GymEnv] = None, **kwargs) -> "BaseAl env = data["env"] # noinspection PyArgumentList - model = cls( - policy=data["model_dict"]["policy_class"], + her_model = cls( + policy=data["policy_class"], env=env, model_class=data["model_class"], n_goals=data["n_goals"], @@ -488,27 +452,30 @@ def load(cls, load_path: str, env: Optional[GymEnv] = None, **kwargs) -> "BaseAl online_sampling=data["online_sampling"], her_ratio=data["her_ratio"], learning_rate=data["learning_rate"], - policy_kwargs=data["model_dict"]["policy_kwargs"], + policy_kwargs=data["policy_kwargs"], _init_setup_model=True, # pytype: disable=not-instantiable,wrong-keyword-args ) # load parameters - model.__dict__.update(data) - model.model.__dict__.update(data["model_dict"]) - model.__dict__.update(kwargs) + her_model.model.__dict__.update(data) + her_model.__dict__.update(kwargs) + + her_model._total_timesteps = her_model.model._total_timesteps + her_model.num_timesteps = her_model.model.num_timesteps + her_model._episode_num = her_model.model._episode_num # put state_dicts back in place for name in params: - attr = recursive_getattr(model.model, name) + attr = recursive_getattr(her_model.model, name) attr.load_state_dict(params[name]) # put tensors back in place if tensors is not None: for name in tensors: - recursive_setattr(model.model, name, tensors[name]) + recursive_setattr(her_model.model, name, tensors[name]) # Sample gSDE exploration matrix, so it uses the right device # see issue #44 - if model.model.use_sde: - model.model.policy.reset_noise() # pytype: disable=attribute-error - return model + if her_model.model.use_sde: + her_model.model.policy.reset_noise() # pytype: disable=attribute-error + return her_model From 7d1eb24f57a91b1b6e36844130f334809e32e1b0 Mon Sep 17 00:00:00 2001 From: Megan Klaiber Date: Fri, 7 Aug 2020 10:35:38 +0200 Subject: 
[PATCH 15/81] Updated her ratio. --- stable_baselines3/her/her.py | 7 +++---- stable_baselines3/her/her_replay_buffer.py | 14 +++++++------- 2 files changed, 10 insertions(+), 11 deletions(-) diff --git a/stable_baselines3/her/her.py b/stable_baselines3/her/her.py index e0eb93bc4f..7b0f49561f 100644 --- a/stable_baselines3/her/her.py +++ b/stable_baselines3/her/her.py @@ -41,12 +41,11 @@ class HER(BaseAlgorithm): :param policy: (BasePolicy) The policy model to use. :param env: (VecEnv) The environment to learn from. :param model_class: (OffPolicyAlgorithm) Off policy model which will be used with hindsight experience replay. (SAC, TD3) - :param n_goals: (int) Number of sampled goals for replay. + :param n_goals: (int) Number of sampled goals for replay. (offline sampling) :param goal_strategy: (GoalSelectionStrategy or str) Strategy for sampling goals for replay. One of ['episode', 'final', 'future', 'random'] :param online_sampling: (bool) Sample HER transitions online. - :her_ratio: (int) The ratio between HER replays and regular replays (e.g. k = 4 -> 4 times - as many HER replays as regular replays are used) + :her_ratio: (float) The ratio between HER replays and regular replays in percent (between 0 and 1, for online sampling) :param learning_rate: (float or callable) learning rate for the optimizer, it can be a function of the current progress remaining (from 1 to 0) """ @@ -59,7 +58,7 @@ def __init__( n_goals: int = 5, goal_strategy: Union[GoalSelectionStrategy, str] = "future", online_sampling: bool = False, - her_ratio: int = 2, + her_ratio: float = 0.6, learning_rate: Union[float, Callable] = 3e-4, *args, **kwargs, diff --git a/stable_baselines3/her/her_replay_buffer.py b/stable_baselines3/her/her_replay_buffer.py index 4fa3f0882b..89d6d75f62 100644 --- a/stable_baselines3/her/her_replay_buffer.py +++ b/stable_baselines3/her/her_replay_buffer.py @@ -23,8 +23,8 @@ class HerReplayBuffer(BaseBuffer): :param device: (Union[th.device, str]) PyTorch device to which the values will be converted :param n_envs: (int) Number of parallel environments - :param her_ratio: (int) The ratio between HER replays and regular replays (e.g. 
k = 4 -> 4 times - as many HER replays as regular replays are used) + :her_ratio: (float) The ratio between HER replays and regular replays in percent (between 0 and 1, for online sampling) + """ def __init__( @@ -36,7 +36,7 @@ def __init__( action_space: spaces.Space, device: Union[th.device, str] = "cpu", n_envs: int = 1, - her_ratio: int = 2, + her_ratio: float = 0.6, ): super(HerReplayBuffer, self).__init__(buffer_size, observation_space, action_space, device, n_envs) @@ -47,8 +47,8 @@ def __init__( # buffer with episodes self.buffer = [] self.goal_strategy = goal_strategy - # probability for selecting her indices - self.her_prob = 1 - (1.0 / (1 + her_ratio)) + # percentage of her indices + self.her_ratio = her_ratio # memory management self._n_episodes_stored = 0 @@ -77,8 +77,8 @@ def _sample_transitions(self, batch_size: int) -> ReplayBufferSamples: t_samples = np.array([np.random.choice(np.arange(ep_len)) for ep_len in episode_lengths]) # get selected timesteps transitions = np.array([buffer[ep][trans] for ep, trans in zip(episode_idxs, t_samples)], dtype=object) - # get her samples indices with her_prob - her_idxs = np.where(np.random.uniform(size=batch_size) < self.her_prob)[0] + # get her samples indices with her_ratio + her_idxs = np.random.choice(np.arange(batch_size), int(self.her_ratio * batch_size), replace=False) # her samples episode lengths her_episode_lenghts = episode_lengths[her_idxs] From 21bd1a4fcc39f5af3e1b2cf268be0e6144351f18 Mon Sep 17 00:00:00 2001 From: Megan Klaiber Date: Tue, 11 Aug 2020 16:11:04 +0200 Subject: [PATCH 16/81] Move obs_wrapper --- .../vec_env/dict_obs_wrapper.py} | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) rename stable_baselines3/{her/obs_wrapper.py => common/vec_env/dict_obs_wrapper.py} (85%) diff --git a/stable_baselines3/her/obs_wrapper.py b/stable_baselines3/common/vec_env/dict_obs_wrapper.py similarity index 85% rename from stable_baselines3/her/obs_wrapper.py rename to stable_baselines3/common/vec_env/dict_obs_wrapper.py index 8eb619e47c..35eb7908dd 100644 --- a/stable_baselines3/her/obs_wrapper.py +++ b/stable_baselines3/common/vec_env/dict_obs_wrapper.py @@ -1,5 +1,3 @@ -from typing import Tuple, Union - import numpy as np from gym import spaces @@ -47,8 +45,18 @@ def __init__(self, venv: VecEnv): else: raise NotImplementedError(f"{type(self.spaces[0])} space is not supported") - def reset(self) -> Union[int, float]: + def reset(self): return self.venv.reset() - def step_wait(self) -> Tuple[Union[int, float], float, bool, dict]: + def step_wait(self): return self.venv.step_wait() + + @staticmethod + def convert_dict(self, observation: dict) -> np.ndarray: + """ + Concatenate observation and desired goal of observation dict. + + :param observation: (dict) + :return: (np.ndarray) + """ + return np.concatenate([observation["observation"], observation["desired_goal"]]) From e647d3690c76c23877a7f687e17e11c5c923261e Mon Sep 17 00:00:00 2001 From: Megan Klaiber Date: Tue, 11 Aug 2020 17:03:29 +0200 Subject: [PATCH 17/81] Added DQN test. 
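
The change to BasePolicy.predict below lets goal-based dict observations be fed directly to the wrapped model. A rough sketch of what ObsWrapper.convert_dict amounts to for a single observation (the arrays are made-up stand-ins, not values from the tests):

    import numpy as np

    obs = {
        "observation": np.array([0.0, 1.0, 0.0, 1.0]),    # hypothetical 4-bit state
        "achieved_goal": np.array([0.0, 1.0, 0.0, 1.0]),
        "desired_goal": np.array([1.0, 1.0, 1.0, 1.0]),
    }
    # the wrapper concatenates the observation with the desired goal,
    # so the underlying off-policy model only ever sees a flat vector
    flat = np.concatenate([obs["observation"], obs["desired_goal"]])
    assert flat.shape == (8,)
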
--- stable_baselines3/common/policies.py | 6 +++- stable_baselines3/her/her.py | 31 ++++++++++++-------- tests/test_her.py | 42 +++++++++++++++++++++++++++- 3 files changed, 66 insertions(+), 13 deletions(-) diff --git a/stable_baselines3/common/policies.py b/stable_baselines3/common/policies.py index 01b788fea3..efb06a2f5b 100644 --- a/stable_baselines3/common/policies.py +++ b/stable_baselines3/common/policies.py @@ -23,6 +23,7 @@ from stable_baselines3.common.torch_layers import BaseFeaturesExtractor, FlattenExtractor, MlpExtractor, NatureCNN, create_mlp from stable_baselines3.common.utils import get_device, is_vectorized_observation from stable_baselines3.common.vec_env import VecTransposeImage +from stable_baselines3.common.vec_env.dict_obs_wrapper import ObsWrapper class BaseModel(nn.Module, ABC): @@ -227,7 +228,10 @@ def predict( # state = self.initial_state # if mask is None: # mask = [False for _ in range(self.n_envs)] - observation = np.array(observation) + if isinstance(observation, dict): + observation = ObsWrapper.convert_dict(observation) + else: + observation = np.array(observation) # Handle the different cases for images # as PyTorch use channel first format diff --git a/stable_baselines3/her/her.py b/stable_baselines3/her/her.py index 7b0f49561f..c2524fb6c3 100644 --- a/stable_baselines3/her/her.py +++ b/stable_baselines3/her/her.py @@ -14,9 +14,9 @@ from stable_baselines3.common.type_aliases import GymEnv, MaybeCallback, RolloutReturn from stable_baselines3.common.utils import check_for_correct_spaces from stable_baselines3.common.vec_env import VecEnv, VecEnvWrapper +from stable_baselines3.common.vec_env.dict_obs_wrapper import ObsWrapper from stable_baselines3.her.goal_selection_strategy import KEY_TO_GOAL_STRATEGY, GoalSelectionStrategy from stable_baselines3.her.her_replay_buffer import HerReplayBuffer -from stable_baselines3.her.obs_wrapper import ObsWrapper def check_wrapped_env(env: VecEnv) -> VecEnv: @@ -38,8 +38,8 @@ class HER(BaseAlgorithm): """ Hindsight Experience Replay (HER) - :param policy: (BasePolicy) The policy model to use. - :param env: (VecEnv) The environment to learn from. + :param policy: (BasePolicy or str) The policy model to use. + :param env: (GymEnv or str) The environment to learn from (if registered in Gym, can be str) :param model_class: (OffPolicyAlgorithm) Off policy model which will be used with hindsight experience replay. (SAC, TD3) :param n_goals: (int) Number of sampled goals for replay. (offline sampling) :param goal_strategy: (GoalSelectionStrategy or str) Strategy for sampling goals for replay. 
@@ -52,8 +52,8 @@ class HER(BaseAlgorithm): def __init__( self, - policy: Type[BasePolicy], - env: VecEnv, + policy: Union[str, Type[BasePolicy]], + env: Union[GymEnv, str], model_class: Type[OffPolicyAlgorithm], n_goals: int = 5, goal_strategy: Union[GoalSelectionStrategy, str] = "future", @@ -64,10 +64,10 @@ def __init__( **kwargs, ): - # check if wrapper for dict support is needed - self.env = check_wrapped_env(env) + super(HER, self).__init__(policy=BasePolicy, env=env, policy_base=BasePolicy, learning_rate=learning_rate) - super(HER, self).__init__(policy=BasePolicy, env=self.env, policy_base=BasePolicy, learning_rate=learning_rate) + # check if wrapper for dict support is needed + self.env = check_wrapped_env(self.env) # model initialization self.model_class = model_class @@ -111,6 +111,16 @@ def __init__( def _setup_model(self) -> None: self.model._setup_model() + def predict( + self, + observation: np.ndarray, + state: Optional[np.ndarray] = None, + mask: Optional[np.ndarray] = None, + deterministic: bool = False, + ) -> Tuple[np.ndarray, Optional[np.ndarray]]: + + return self.model.predict(observation, state, mask, deterministic) + def learn( self, total_timesteps: int, @@ -124,8 +134,6 @@ def learn( reset_num_timesteps: bool = True, ) -> BaseAlgorithm: - eval_env = check_wrapped_env(eval_env) if eval_env is not None else eval_env - total_timesteps, callback = self._setup_learn( total_timesteps, eval_env, callback, eval_freq, n_eval_episodes, eval_log_path, reset_num_timesteps, tb_log_name ) @@ -224,7 +232,7 @@ def collect_rollouts( self.model._last_obs = self._last_obs action, buffer_action = self._sample_action(learning_starts, action_noise) - # Rescale and perform action + # Perform action new_obs, reward, done, infos = env.step(action) # Only stop training if return value is False, not when it is None. @@ -264,6 +272,7 @@ def collect_rollouts( episode_timesteps += 1 total_steps += 1 self._update_current_progress_remaining(self.num_timesteps, self._total_timesteps) + self.model._current_progress_remaining = self._current_progress_remaining # For DQN, check if the target network should be updated # and update the exploration schedule diff --git a/tests/test_her.py b/tests/test_her.py index 37bad828ab..736d9c65c3 100644 --- a/tests/test_her.py +++ b/tests/test_her.py @@ -5,7 +5,7 @@ import pytest import torch as th -from stable_baselines3 import DDPG, SAC, TD3 +from stable_baselines3 import DDPG, DQN, SAC, TD3 from stable_baselines3.common.bit_flipping_env import BitFlippingEnv from stable_baselines3.common.noise import OrnsteinUhlenbeckActionNoise from stable_baselines3.common.vec_env import DummyVecEnv @@ -203,3 +203,43 @@ def test_save_load(tmp_path, model_class, policy): # clear file from os os.remove(tmp_path / "test_save.zip") + + +@pytest.mark.parametrize("online_sampling", [False]) +@pytest.mark.parametrize("n_bits", [15]) +def test_dqn_her(online_sampling, n_bits): + """ + Test HER with DQN for BitFlippingEnv. 
+ """ + env = BitFlippingEnv(n_bits=n_bits, continuous=False) + + # offline + model = HER( + "MlpPolicy", + env, + DQN, + n_goals=4, + goal_strategy="future", + online_sampling=online_sampling, + her_ratio=0.6, + verbose=1, + tau=1, + batch_size=32, + learning_rate=0.0005, + policy_kwargs=dict(net_arch=[64, 64]), + buffer_size=50000, + gamma=0.99, + gradient_steps=1, + train_freq=1, + n_episodes_rollout=-1, + tensorboard_log="tensorboard", + learning_starts=1000, + exploration_fraction=0.1, + exploration_final_eps=0.02, + exploration_initial_eps=1.0, + target_update_interval=500, + ) + + tb_log_name = "run_" + str(online_sampling) + "_" + str(n_bits) + + model.learn(total_timesteps=20000, callback=None, tb_log_name=tb_log_name) From fc2b18108a90dc736493c6d2adcd6b1d4e1c0f75 Mon Sep 17 00:00:00 2001 From: Antonin Raffin Date: Tue, 11 Aug 2020 17:15:58 +0200 Subject: [PATCH 18/81] Fix potential bug --- stable_baselines3/her/her.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/stable_baselines3/her/her.py b/stable_baselines3/her/her.py index c2524fb6c3..19da1e6604 100644 --- a/stable_baselines3/her/her.py +++ b/stable_baselines3/her/her.py @@ -271,14 +271,13 @@ def collect_rollouts( self.model.num_timesteps = self.num_timesteps episode_timesteps += 1 total_steps += 1 - self._update_current_progress_remaining(self.num_timesteps, self._total_timesteps) - self.model._current_progress_remaining = self._current_progress_remaining + self.model._update_current_progress_remaining(self.num_timesteps, self._total_timesteps) # For DQN, check if the target network should be updated # and update the exploration schedule # For SAC/TD3, the update is done as the same time as the gradient update # see https://github.com/hill-a/stable-baselines/issues/900 - self._on_step() + self.model._on_step() if 0 < n_steps <= total_steps: break From 3f3bd4914cec59834702f93c248a9e7152440e6c Mon Sep 17 00:00:00 2001 From: Megan Klaiber Date: Wed, 19 Aug 2020 03:50:09 +0200 Subject: [PATCH 19/81] Offline and online her share same sample_goal function. --- .../common/vec_env/dict_obs_wrapper.py | 4 +- stable_baselines3/her/her.py | 91 ++++++--------- stable_baselines3/her/her_replay_buffer.py | 105 +++++++++++------- tests/test_her.py | 22 ++-- 4 files changed, 110 insertions(+), 112 deletions(-) diff --git a/stable_baselines3/common/vec_env/dict_obs_wrapper.py b/stable_baselines3/common/vec_env/dict_obs_wrapper.py index 35eb7908dd..55e5283b06 100644 --- a/stable_baselines3/common/vec_env/dict_obs_wrapper.py +++ b/stable_baselines3/common/vec_env/dict_obs_wrapper.py @@ -52,11 +52,11 @@ def step_wait(self): return self.venv.step_wait() @staticmethod - def convert_dict(self, observation: dict) -> np.ndarray: + def convert_dict(observation: dict) -> np.ndarray: """ Concatenate observation and desired goal of observation dict. :param observation: (dict) :return: (np.ndarray) """ - return np.concatenate([observation["observation"], observation["desired_goal"]]) + return np.concatenate([observation["observation"], observation["desired_goal"]], axis=1) diff --git a/stable_baselines3/her/her.py b/stable_baselines3/her/her.py index 19da1e6604..caa2a4308c 100644 --- a/stable_baselines3/her/her.py +++ b/stable_baselines3/her/her.py @@ -41,11 +41,10 @@ class HER(BaseAlgorithm): :param policy: (BasePolicy or str) The policy model to use. 
:param env: (GymEnv or str) The environment to learn from (if registered in Gym, can be str) :param model_class: (OffPolicyAlgorithm) Off policy model which will be used with hindsight experience replay. (SAC, TD3) - :param n_goals: (int) Number of sampled goals for replay. (offline sampling) - :param goal_strategy: (GoalSelectionStrategy or str) Strategy for sampling goals for replay. + :param n_sampled_goal: (int) Number of sampled goals for replay. (offline sampling) + :param goal_selection_strategy: (GoalSelectionStrategy or str) Strategy for sampling goals for replay. One of ['episode', 'final', 'future', 'random'] :param online_sampling: (bool) Sample HER transitions online. - :her_ratio: (float) The ratio between HER replays and regular replays in percent (between 0 and 1, for online sampling) :param learning_rate: (float or callable) learning rate for the optimizer, it can be a function of the current progress remaining (from 1 to 0) """ @@ -55,10 +54,9 @@ def __init__( policy: Union[str, Type[BasePolicy]], env: Union[GymEnv, str], model_class: Type[OffPolicyAlgorithm], - n_goals: int = 5, - goal_strategy: Union[GoalSelectionStrategy, str] = "future", + n_sampled_goal: int = 5, + goal_selection_strategy: Union[GoalSelectionStrategy, str] = "future", online_sampling: bool = False, - her_ratio: float = 0.6, learning_rate: Union[float, Callable] = 3e-4, *args, **kwargs, @@ -78,25 +76,29 @@ def __init__( self.verbose = self.model.verbose self.tensorboard_log = self.model.tensorboard_log - # convert goal_strategy into GoalSelectionStrategy if string - if isinstance(goal_strategy, str): - self.goal_strategy = KEY_TO_GOAL_STRATEGY[goal_strategy.lower()] + # convert goal_selection_strategy into GoalSelectionStrategy if string + if isinstance(goal_selection_strategy, str): + self.goal_selection_strategy = KEY_TO_GOAL_STRATEGY[goal_selection_strategy.lower()] else: - self.goal_strategy = goal_strategy + self.goal_selection_strategy = goal_selection_strategy - # check if goal_strategy is valid + # check if goal_selection_strategy is valid assert isinstance( - self.goal_strategy, GoalSelectionStrategy + self.goal_selection_strategy, GoalSelectionStrategy ), f"Invalid goal selection strategy, please use one of {list(GoalSelectionStrategy)}" + # storage for transitions of current episode + self._episode_storage = [] + self.n_sampled_goal = n_sampled_goal + # if we sample her transitions online use custom replay buffer self.online_sampling = online_sampling - self.her_ratio = her_ratio + self.her_ratio = 1 - (1.0 / (self.n_sampled_goal + 1)) if self.online_sampling: self.model.replay_buffer = HerReplayBuffer( self.env, self.buffer_size, - self.goal_strategy, + self.goal_selection_strategy, self.env.observation_space, self.env.action_space, self.device, @@ -104,10 +106,6 @@ def __init__( self.her_ratio, ) - # storage for transitions of current episode - self._episode_storage = [] - self.n_goals = n_goals - def _setup_model(self) -> None: self.model._setup_model() @@ -222,7 +220,7 @@ def collect_rollouts( while not done: # concatenate observation and (desired) goal observation = self._last_obs - self._last_obs = np.concatenate([observation["observation"], observation["desired_goal"]], axis=1) + self._last_obs = ObsWrapper.convert_dict(observation) if self.use_sde and self.sde_sample_freq > 0 and total_steps % self.sde_sample_freq == 0: # Sample a new noise matrix @@ -313,38 +311,6 @@ def collect_rollouts( return RolloutReturn(mean_reward, total_steps, total_episodes, continue_training) - def 
sample_goals(self, sample_idx: int, obs_dim: int) -> Union[np.ndarray, None]: - """ - Sample a goal based on goal_strategy. - - :param sample_idx: (int) Index of current transition. - :param obs_dim: (int) Dimension of real observation without goal. It is needed for the random strategy. - :return: (np.ndarray or None) Return sampled goal. - """ - if self.goal_strategy == GoalSelectionStrategy.FINAL: - # replay with final state of current episode - return self._episode_storage[-1][0]["achieved_goal"] - elif self.goal_strategy == GoalSelectionStrategy.FUTURE: - # replay with random state which comes from the same episode and was observed after current transition - # we have no transition after last transition of episode - - if (sample_idx + 1) < len(self._episode_storage): - index = np.random.choice(np.arange(sample_idx + 1, len(self._episode_storage))) - return self._episode_storage[index][0]["achieved_goal"] - elif self.goal_strategy == GoalSelectionStrategy.EPISODE: - # replay with random state which comes from the same episode as current transition - index = np.random.choice(np.arange(len(self._episode_storage))) - return self._episode_storage[index][0]["achieved_goal"] - elif self.goal_strategy == GoalSelectionStrategy.RANDOM: - # replay with random state from the entire replay buffer - index = np.random.choice(np.arange(self.replay_buffer.size())) - obs = self.replay_buffer.observations[index] - # get only the observation part - obs_array = obs[:, :obs_dim] - return obs_array - else: - raise ValueError("Strategy for sampling goals not supported!") - def _store_transitions(self) -> None: """ Store current episode in replay buffer. Sample additional goals and store new transitions in replay buffer. @@ -356,8 +322,8 @@ def _store_transitions(self) -> None: observation, action, reward, new_observation, done = trans # concatenate observation with (desired) goal - obs = np.concatenate([observation["observation"], observation["desired_goal"]], axis=1) - new_obs = np.concatenate([new_observation["observation"], new_observation["desired_goal"]], axis=1) + obs = ObsWrapper.convert_dict(observation) + new_obs = ObsWrapper.convert_dict(new_observation) # store data in replay buffer self.replay_buffer.add(obs, new_obs, action, reward, done) @@ -365,7 +331,14 @@ def _store_transitions(self) -> None: # sample set of additional goals obs_dim = observation["observation"].shape[1] sampled_goals = [ - sample for sample in (self.sample_goals(idx, obs_dim) for i in range(self.n_goals)) if sample is not None + sample + for sample in ( + HerReplayBuffer.sample_goal( + self.goal_selection_strategy, idx, self._episode_storage, self.replay_buffer.observations, obs_dim + ) + for i in range(self.n_sampled_goal) + ) + if sample is not None ] # iterate over sampled goals and store new transitions in replay buffer @@ -407,9 +380,8 @@ def save( """ # add HER parameters to model - self.model.n_goals = self.n_goals - self.model.her_ratio = self.her_ratio - self.model.goal_strategy = self.goal_strategy + self.model.n_sampled_goal = self.n_sampled_goal + self.model.goal_selection_strategy = self.goal_selection_strategy self.model.online_sampling = self.online_sampling self.model.model_class = self.model_class @@ -454,10 +426,9 @@ def load(cls, load_path: str, env: Optional[GymEnv] = None, **kwargs) -> "BaseAl policy=data["policy_class"], env=env, model_class=data["model_class"], - n_goals=data["n_goals"], - goal_strategy=data["goal_strategy"], + n_sampled_goal=data["n_sampled_goal"], + 
goal_selection_strategy=data["goal_selection_strategy"], online_sampling=data["online_sampling"], - her_ratio=data["her_ratio"], learning_rate=data["learning_rate"], policy_kwargs=data["policy_kwargs"], _init_setup_model=True, # pytype: disable=not-instantiable,wrong-keyword-args diff --git a/stable_baselines3/her/her_replay_buffer.py b/stable_baselines3/her/her_replay_buffer.py index 89d6d75f62..a21dab9dec 100644 --- a/stable_baselines3/her/her_replay_buffer.py +++ b/stable_baselines3/her/her_replay_buffer.py @@ -7,6 +7,7 @@ from stable_baselines3.common.buffers import BaseBuffer from stable_baselines3.common.type_aliases import ReplayBufferSamples from stable_baselines3.common.vec_env import VecEnv, VecNormalize +from stable_baselines3.common.vec_env.dict_obs_wrapper import ObsWrapper from stable_baselines3.her.goal_selection_strategy import GoalSelectionStrategy @@ -16,7 +17,7 @@ class HerReplayBuffer(BaseBuffer): :param env: (VecEnv) The training environment :param buffer_size: (int) The size of the buffer measured in transitions. - :param goal_strategy: (GoalSelectionStrategy ) Strategy for sampling goals for replay. + :param goal_selection_strategy: (GoalSelectionStrategy ) Strategy for sampling goals for replay. One of ['episode', 'final', 'future', 'random'] :param observation_space: (spaces.Space) Observation space :param action_space: (spaces.Space) Action space @@ -31,7 +32,7 @@ def __init__( self, env: VecEnv, buffer_size: int, - goal_strategy: GoalSelectionStrategy, + goal_selection_strategy: GoalSelectionStrategy, observation_space: spaces.Space, action_space: spaces.Space, device: Union[th.device, str] = "cpu", @@ -46,7 +47,7 @@ def __init__( # buffer with episodes self.buffer = [] - self.goal_strategy = goal_strategy + self.goal_selection_strategy = goal_selection_strategy # percentage of her indices self.her_ratio = her_ratio @@ -73,44 +74,22 @@ def _sample_transitions(self, batch_size: int) -> ReplayBufferSamples: buffer = np.array(self.buffer, dtype=object) # get episode lengths for selecting timesteps episode_lengths = np.array([len(ep) for ep in buffer[episode_idxs]]) - # select timesteps + # select timesteps of episodes t_samples = np.array([np.random.choice(np.arange(ep_len)) for ep_len in episode_lengths]) # get selected timesteps transitions = np.array([buffer[ep][trans] for ep, trans in zip(episode_idxs, t_samples)], dtype=object) # get her samples indices with her_ratio her_idxs = np.random.choice(np.arange(batch_size), int(self.her_ratio * batch_size), replace=False) - # her samples episode lengths - her_episode_lenghts = episode_lengths[her_idxs] + + # if we sample goals from future delete indices from her_idxs where we have no transition after current one + if self.goal_selection_strategy == GoalSelectionStrategy.FUTURE: + her_idxs = her_idxs[t_samples[her_idxs] != episode_lengths[her_idxs] - 1] # get new goals with goal selection strategy - if self.goal_strategy == GoalSelectionStrategy.FINAL: - # replay with final state of current episode - last_transitions = [episode[-1][0] for episode in buffer[episode_idxs[her_idxs]]] - her_new_goals = [trans["achieved_goal"] for trans in last_transitions] - elif self.goal_strategy == GoalSelectionStrategy.FUTURE: - # replay with random state which comes from the same episode and was observed after current transition - her_new_goals = [] - for idx, length in zip(her_idxs, her_episode_lenghts): - # we have no transition after last transition of episode - if t_samples[idx] + 1 < length: - index = 
np.random.choice(np.arange(t_samples[idx] + 1, length)) - her_new_goals.append(buffer[episode_idxs[idx]][index][0]["achieved_goal"]) - else: - # delete index from her indices where we have no transition after current one - her_idxs = her_idxs[her_idxs != idx] - elif self.goal_strategy == GoalSelectionStrategy.EPISODE: - # replay with random state which comes from the same episode as current transition - index = np.array([np.random.choice(np.arange(ep_len)) for ep_len in her_episode_lenghts]) - episode_transitions = [buffer[episode_idxs[her_idx]][idx][0] for idx, her_idx in zip(index, her_idxs)] - her_new_goals = [trans["achieved_goal"] for trans in episode_transitions] - elif self.goal_strategy == GoalSelectionStrategy.RANDOM: - # replay with random state from the entire replay buffer - ep_idx = np.random.randint(0, self.n_episodes_stored, len(her_idxs)) - state_idx = np.array([np.random.choice(np.arange(len(ep))) for ep in buffer[ep_idx]]) - random_transitions = [episode[state][0] for episode, state in zip(buffer[ep_idx], state_idx)] - her_new_goals = [trans["achieved_goal"] for trans in random_transitions] - else: - raise ValueError("Strategy for sampling goals not supported!") + her_new_goals = [ + self.sample_goal(self.goal_selection_strategy, trans_idx, episode, self.buffer, online_sampling=True) + for episode, trans_idx in zip(buffer[episode_idxs[her_idxs]], t_samples[her_idxs]) + ] # assign new goals as desired_goals for idx, goal in enumerate(her_new_goals): @@ -122,15 +101,13 @@ def _sample_transitions(self, batch_size: int) -> ReplayBufferSamples: achieved_goals = [new_obs["achieved_goal"] for new_obs in np.array(next_observations)[her_idxs]] new_rewards = np.array(rewards) new_rewards[her_idxs] = [ - self.env.env_method("compute_reward", achieved_goal, her_new_goals, None) + self.env.env_method("compute_reward", achieved_goal, new_goal, None) for achieved_goal, new_goal in zip(achieved_goals, her_new_goals) ] # concatenate observation with (desired) goal - obs = [np.concatenate([obs_["observation"], obs_["desired_goal"]], axis=1) for obs_ in observations] - new_obs = [ - np.concatenate([new_obs_["observation"], new_obs_["desired_goal"]], axis=1) for new_obs_ in next_observations - ] + obs = [ObsWrapper.convert_dict(obs_) for obs_ in observations] + new_obs = [ObsWrapper.convert_dict(new_obs_) for new_obs_ in next_observations] data = ( np.array(obs)[:, 0, :], @@ -142,6 +119,56 @@ def _sample_transitions(self, batch_size: int) -> ReplayBufferSamples: return ReplayBufferSamples(*tuple(map(self.to_torch, data))) + @staticmethod + def sample_goal( + goal_selection_strategy: GoalSelectionStrategy, + sample_idx: int, + episode: list, + observations: Union[list, np.ndarray], + obs_dim: int = None, + online_sampling: bool = False, + ) -> Union[np.ndarray, None]: + """ + Sample a goal based on goal_selection_strategy. + + :param goal_selection_strategy: (GoalSelectionStrategy ) Strategy for sampling goals for replay. + One of ['episode', 'final', 'future', 'random'] + :param sample_idx: (int) Index of current transition. + :param episode: (list) Current episode. + :param observations: (list or np.ndarray) + :param obs_dim: (int) Dimension of real observation without goal. It is needed for the random strategy. + :param online_sampling: (bool) Sample HER transitions online. + :return: (np.ndarray or None) Return sampled goal. 
+ """ + if goal_selection_strategy == GoalSelectionStrategy.FINAL: + # replay with final state of current episode + return episode[-1][0]["achieved_goal"] + elif goal_selection_strategy == GoalSelectionStrategy.FUTURE: + # replay with random state which comes from the same episode and was observed after current transition + # we have no transition after last transition of episode + if (sample_idx + 1) < len(episode): + index = np.random.choice(np.arange(sample_idx + 1, len(episode))) + return episode[index][0]["achieved_goal"] + elif goal_selection_strategy == GoalSelectionStrategy.EPISODE: + # replay with random state which comes from the same episode as current transition + index = np.random.choice(np.arange(len(episode))) + return episode[index][0]["achieved_goal"] + elif goal_selection_strategy == GoalSelectionStrategy.RANDOM: + if online_sampling: + # replay with random state from the entire replay buffer + ep_idx = np.random.choice(np.arange(len(observations))) + trans_idx = np.random.choice(np.arange(len(observations[ep_idx]))) + return observations[ep_idx][trans_idx][0]["achieved_goal"] + else: + # replay with random state from the entire replay buffer + index = np.random.choice(np.arange(len(observations))) + obs = observations[index] + # get only the observation part + obs_array = obs[:, :obs_dim] + return obs_array + else: + raise ValueError("Strategy for sampling goals not supported!") + def add(self, obs: np.ndarray, next_obs: np.ndarray, action: np.ndarray, reward: np.ndarray, done: np.ndarray) -> None: """ Add episode to replay buffer diff --git a/tests/test_her.py b/tests/test_her.py index 736d9c65c3..80dec3a82b 100644 --- a/tests/test_her.py +++ b/tests/test_her.py @@ -9,6 +9,7 @@ from stable_baselines3.common.bit_flipping_env import BitFlippingEnv from stable_baselines3.common.noise import OrnsteinUhlenbeckActionNoise from stable_baselines3.common.vec_env import DummyVecEnv +from stable_baselines3.common.vec_env.dict_obs_wrapper import ObsWrapper from stable_baselines3.her.her import HER, GoalSelectionStrategy from stable_baselines3.sac.policies import SACPolicy from stable_baselines3.td3.policies import MlpPolicy, TD3Policy @@ -32,8 +33,8 @@ def test_her(model_class, policy, online_sampling): policy, env, model_class, - n_goals=5, - goal_strategy="future", + n_sampled_goal=5, + goal_selection_strategy="future", online_sampling=online_sampling, action_noise=action_noise, verbose=0, @@ -85,7 +86,7 @@ def test_her(model_class, policy, online_sampling): @pytest.mark.parametrize( - "goal_strategy", + "goal_selection_strategy", [ "final", "episode", @@ -98,7 +99,7 @@ def test_her(model_class, policy, online_sampling): ], ) @pytest.mark.parametrize("online_sampling", [True, False]) -def test_goal_strategy(goal_strategy, online_sampling): +def test_goal_selection_strategy(goal_selection_strategy, online_sampling): """ Test different goal strategies. 
""" @@ -109,7 +110,7 @@ def test_goal_strategy(goal_strategy, online_sampling): SACPolicy, env, SAC, - goal_strategy=goal_strategy, + goal_selection_strategy=goal_selection_strategy, online_sampling=online_sampling, gradient_steps=1, train_freq=1, @@ -135,8 +136,8 @@ def test_save_load(tmp_path, model_class, policy): policy, env, model_class, - n_goals=5, - goal_strategy="future", + n_sampled_goal=5, + goal_selection_strategy="future", online_sampling=True, action_noise=action_noise, verbose=0, @@ -158,7 +159,7 @@ def test_save_load(tmp_path, model_class, policy): observations_list = [] for _ in range(10): obs = env.step([env.action_space.sample()])[0] - observation = np.concatenate([obs["observation"], obs["desired_goal"]], axis=1) + observation = ObsWrapper.convert_dict(obs) observations_list.append(observation) observations = np.concatenate(observations_list, axis=0) @@ -218,10 +219,9 @@ def test_dqn_her(online_sampling, n_bits): "MlpPolicy", env, DQN, - n_goals=4, - goal_strategy="future", + n_sampled_goal=4, + goal_selection_strategy="future", online_sampling=online_sampling, - her_ratio=0.6, verbose=1, tau=1, batch_size=32, From cce063fc92052bd83f6c6ec94325a61ce8b05f40 Mon Sep 17 00:00:00 2001 From: Megan Klaiber Date: Mon, 24 Aug 2020 10:57:38 +0200 Subject: [PATCH 20/81] Changed lists into arrays. --- stable_baselines3/her/her.py | 22 ++- stable_baselines3/her/her_replay_buffer.py | 164 +++++++++++++++------ 2 files changed, 138 insertions(+), 48 deletions(-) diff --git a/stable_baselines3/her/her.py b/stable_baselines3/her/her.py index caa2a4308c..3902ed6ad9 100644 --- a/stable_baselines3/her/her.py +++ b/stable_baselines3/her/her.py @@ -47,6 +47,7 @@ class HER(BaseAlgorithm): :param online_sampling: (bool) Sample HER transitions online. :param learning_rate: (float or callable) learning rate for the optimizer, it can be a function of the current progress remaining (from 1 to 0) + :param max_episode_length: (int) The length of an episode. 
(time horizon) """ def __init__( @@ -58,6 +59,7 @@ def __init__( goal_selection_strategy: Union[GoalSelectionStrategy, str] = "future", online_sampling: bool = False, learning_rate: Union[float, Callable] = 3e-4, + max_episode_length: int = 10, *args, **kwargs, ): @@ -94,10 +96,14 @@ def __init__( # if we sample her transitions online use custom replay buffer self.online_sampling = online_sampling self.her_ratio = 1 - (1.0 / (self.n_sampled_goal + 1)) + self.max_episode_length = max_episode_length + # counter for steps in episode + self.episode_steps = 0 if self.online_sampling: self.model.replay_buffer = HerReplayBuffer( self.env, self.buffer_size, + self.max_episode_length, self.goal_selection_strategy, self.env.observation_space, self.env.action_space, @@ -161,7 +167,7 @@ def learn( if rollout.continue_training is False: break - if self.num_timesteps > 0 and self.num_timesteps > self.learning_starts: + if self.num_timesteps > 0 and self.num_timesteps > self.learning_starts and self.replay_buffer.size() > 0: # If no `gradient_steps` is specified, # do as many gradients steps as steps performed during the rollout gradient_steps = self.gradient_steps if self.gradient_steps > 0 else rollout.episode_timesteps @@ -277,15 +283,15 @@ def collect_rollouts( # see https://github.com/hill-a/stable-baselines/issues/900 self.model._on_step() + self.episode_steps += 1 + if 0 < n_steps <= total_steps: break - if done: + if done or self.episode_steps >= self.max_episode_length: if self.online_sampling: observations, actions, rewards, next_observations, done = zip(*self._episode_storage) self.replay_buffer.add(observations, next_observations, actions, rewards, done) - # self.replay_buffer.add(self._episode_storage) - else: # store episode in replay buffer self._store_transitions() @@ -305,6 +311,10 @@ def collect_rollouts( if log_interval is not None and self._episode_num % log_interval == 0: self._dump_logs() + # reset if done or episode length is reached + self.env.reset() + self.episode_steps = 0 + mean_reward = np.mean(episode_rewards) if total_episodes > 0 else 0.0 callback.on_rollout_end() @@ -341,7 +351,7 @@ def _store_transitions(self) -> None: if sample is not None ] - # iterate over sampled goals and store new transitions in replay buffer + # iterate over sampled new transitions in replay buffer for goal in sampled_goals: # compute new reward with new goal new_reward = self.env.env_method("compute_reward", new_observation["achieved_goal"], goal, None) @@ -384,6 +394,7 @@ def save( self.model.goal_selection_strategy = self.goal_selection_strategy self.model.online_sampling = self.online_sampling self.model.model_class = self.model_class + self.model.max_episode_length = self.max_episode_length self.model.save(path, exclude, include) @@ -430,6 +441,7 @@ def load(cls, load_path: str, env: Optional[GymEnv] = None, **kwargs) -> "BaseAl goal_selection_strategy=data["goal_selection_strategy"], online_sampling=data["online_sampling"], learning_rate=data["learning_rate"], + max_episode_length=data["max_episode_length"], policy_kwargs=data["policy_kwargs"], _init_setup_model=True, # pytype: disable=not-instantiable,wrong-keyword-args ) diff --git a/stable_baselines3/her/her_replay_buffer.py b/stable_baselines3/her/her_replay_buffer.py index a21dab9dec..5508c59c36 100644 --- a/stable_baselines3/her/her_replay_buffer.py +++ b/stable_baselines3/her/her_replay_buffer.py @@ -1,8 +1,9 @@ -from typing import Optional, Union +from typing import Optional, Type, Union import numpy as np import 
torch as th from gym import spaces +from gym.spaces import Discrete from stable_baselines3.common.buffers import BaseBuffer from stable_baselines3.common.type_aliases import ReplayBufferSamples @@ -17,6 +18,7 @@ class HerReplayBuffer(BaseBuffer): :param env: (VecEnv) The training environment :param buffer_size: (int) The size of the buffer measured in transitions. + :param max_episode_length: (int) The length of an episode. (time horizon) :param goal_selection_strategy: (GoalSelectionStrategy ) Strategy for sampling goals for replay. One of ['episode', 'final', 'future', 'random'] :param observation_space: (spaces.Space) Observation space @@ -25,13 +27,13 @@ class HerReplayBuffer(BaseBuffer): to which the values will be converted :param n_envs: (int) Number of parallel environments :her_ratio: (float) The ratio between HER replays and regular replays in percent (between 0 and 1, for online sampling) - """ def __init__( self, env: VecEnv, buffer_size: int, + max_episode_length: int, goal_selection_strategy: GoalSelectionStrategy, observation_space: spaces.Space, action_space: spaces.Space, @@ -44,9 +46,29 @@ def __init__( self.env = env self.buffer_size = buffer_size + self.max_episode_length = max_episode_length # buffer with episodes - self.buffer = [] + # number of episodes which can be stored until buffer size is reached + n_episodes = self.buffer_size // self.max_episode_length + # input dimensions for buffer initialization + input_shape = { + "observation": (self.env.num_envs, self.env.obs_dim), + "achieved_goal": (self.env.num_envs, self.env.goal_dim), + "desired_goal": (self.env.num_envs, self.env.goal_dim), + "action": (self.action_dim,), + "reward": (1,), + "next_obs": (self.env.num_envs, self.env.obs_dim), + "next_achieved_goal": (self.env.num_envs, self.env.goal_dim), + "next_desired_goal": (self.env.num_envs, self.env.goal_dim), + "done": (1,), + } + self.buffer = { + key: np.empty([n_episodes, self.max_episode_length, *dim], dtype=np.float32) for key, dim in input_shape.items() + } + # episode length storage, needed for episodes which has less steps than the maximum length + self.episode_lengths = np.empty(n_episodes) + self.goal_selection_strategy = goal_selection_strategy # percentage of her indices self.her_ratio = her_ratio @@ -62,59 +84,67 @@ def sample(self, batch_size: int, env: Optional[VecNormalize] = None) -> ReplayB to normalize the observations/rewards when sampling :return: (ReplayBufferSamples) """ - return self._sample_transitions(batch_size) + return self._sample_transitions(batch_size, env) - def _sample_transitions(self, batch_size: int) -> ReplayBufferSamples: + def _sample_transitions(self, batch_size: int, env: Optional[VecNormalize]) -> ReplayBufferSamples: """ :param batch_size: (int) Number of element to sample + :param env: (Optional[VecNormalize]) associated gym VecEnv + to normalize the observations/rewards when sampling :return: (ReplayBufferSamples) """ - # Select which episodes and time steps to use. 
+ # Select which episodes to use episode_idxs = np.random.randint(0, self.n_episodes_stored, batch_size) - buffer = np.array(self.buffer, dtype=object) - # get episode lengths for selecting timesteps - episode_lengths = np.array([len(ep) for ep in buffer[episode_idxs]]) # select timesteps of episodes - t_samples = np.array([np.random.choice(np.arange(ep_len)) for ep_len in episode_lengths]) + max_timestep_idx = self.episode_lengths[episode_idxs] + # transition_idxs = np.random.randint(self.max_episode_length, size=batch_size) + transition_idxs = np.random.randint(max_timestep_idx) # get selected timesteps - transitions = np.array([buffer[ep][trans] for ep, trans in zip(episode_idxs, t_samples)], dtype=object) + transitions = {key: self.buffer[key][episode_idxs, transition_idxs].copy() for key in self.buffer.keys()} # get her samples indices with her_ratio her_idxs = np.random.choice(np.arange(batch_size), int(self.her_ratio * batch_size), replace=False) # if we sample goals from future delete indices from her_idxs where we have no transition after current one if self.goal_selection_strategy == GoalSelectionStrategy.FUTURE: - her_idxs = her_idxs[t_samples[her_idxs] != episode_lengths[her_idxs] - 1] + her_idxs = her_idxs[transition_idxs[her_idxs] != max_timestep_idx[her_idxs] - 1] # get new goals with goal selection strategy her_new_goals = [ - self.sample_goal(self.goal_selection_strategy, trans_idx, episode, self.buffer, online_sampling=True) - for episode, trans_idx in zip(buffer[episode_idxs[her_idxs]], t_samples[her_idxs]) + self.sample_goal(self.goal_selection_strategy, trans, episode, self.buffer["achieved_goal"], online_sampling=True) + for episode, trans in zip(self.buffer["achieved_goal"][episode_idxs[her_idxs]], transition_idxs[her_idxs]) ] # assign new goals as desired_goals for idx, goal in enumerate(her_new_goals): - transitions[her_idxs][:, 0][idx]["desired_goal"] = goal - - observations, actions, rewards, next_observations, dones = list(zip(*transitions)) + # observation + transitions["desired_goal"][her_idxs][idx] = goal + # next observation + transitions["next_desired_goal"][her_idxs][idx] = goal # compute new rewards with new goal - achieved_goals = [new_obs["achieved_goal"] for new_obs in np.array(next_observations)[her_idxs]] - new_rewards = np.array(rewards) + achieved_goals = transitions["next_achieved_goal"][her_idxs] + new_rewards = transitions["reward"].copy() new_rewards[her_idxs] = [ self.env.env_method("compute_reward", achieved_goal, new_goal, None) for achieved_goal, new_goal in zip(achieved_goals, her_new_goals) ] # concatenate observation with (desired) goal - obs = [ObsWrapper.convert_dict(obs_) for obs_ in observations] - new_obs = [ObsWrapper.convert_dict(new_obs_) for new_obs_ in next_observations] + obs = [ + np.concatenate([obs, desired_goal], axis=1) + for obs, desired_goal in zip(transitions["observation"], transitions["desired_goal"]) + ] + next_obs = [ + np.concatenate([obs, desired_goal], axis=1) + for obs, desired_goal in zip(transitions["next_obs"], transitions["next_desired_goal"]) + ] data = ( - np.array(obs)[:, 0, :], - np.array(actions, dtype=self.action_space.dtype)[:, 0, :], - np.array(new_obs)[:, 0, :], - np.array(dones, dtype=np.int8), - new_rewards, + self._normalize_obs(np.asarray(obs, dtype=np.int8), env), + transitions["action"], + self._normalize_obs(np.asarray(next_obs, dtype=np.int8), env), + transitions["done"], + self._normalize_obs(new_rewards, env), ) return ReplayBufferSamples(*tuple(map(self.to_torch, data))) @@ -142,23 
+172,29 @@ def sample_goal( """ if goal_selection_strategy == GoalSelectionStrategy.FINAL: # replay with final state of current episode + if online_sampling: + return episode[-1] return episode[-1][0]["achieved_goal"] elif goal_selection_strategy == GoalSelectionStrategy.FUTURE: # replay with random state which comes from the same episode and was observed after current transition # we have no transition after last transition of episode if (sample_idx + 1) < len(episode): index = np.random.choice(np.arange(sample_idx + 1, len(episode))) + if online_sampling: + return episode[index] return episode[index][0]["achieved_goal"] elif goal_selection_strategy == GoalSelectionStrategy.EPISODE: # replay with random state which comes from the same episode as current transition index = np.random.choice(np.arange(len(episode))) + if online_sampling: + return episode[index] return episode[index][0]["achieved_goal"] elif goal_selection_strategy == GoalSelectionStrategy.RANDOM: if online_sampling: # replay with random state from the entire replay buffer ep_idx = np.random.choice(np.arange(len(observations))) trans_idx = np.random.choice(np.arange(len(observations[ep_idx]))) - return observations[ep_idx][trans_idx][0]["achieved_goal"] + return observations[ep_idx][trans_idx] else: # replay with random state from the entire replay buffer index = np.random.choice(np.arange(len(observations))) @@ -173,21 +209,21 @@ def add(self, obs: np.ndarray, next_obs: np.ndarray, action: np.ndarray, reward: """ Add episode to replay buffer - :param obs: - :param next_obs: - :param action: - :param reward: - :param done: - - :param episode: (list) Episode to store. + :param obs: (np.ndarray) Observation. + :param next_obs: (np.ndarray) Next observation. + :param action: (np.ndarray) Action. + :param reward: (np.ndarray) Reward. + :param done: (np.ndarray) Done. """ - episode = list(zip(obs, action, reward, next_obs, done)) - - episode_length = len(episode) + episode_length = len(action) + episode = self._get_episode_dict(obs, next_obs, action, reward, done) # check if replay buffer has enough space for all transitions of episode if self.n_transitions_stored + episode_length <= self.buffer_size: - self.buffer.append(episode) + for key in self.buffer.keys(): + self.buffer[key][self._n_episodes_stored][:episode_length] = episode[key] + # add episode length to length storage + self.episode_lengths[self._n_episodes_stored] = episode_length # update replay size self.n_episodes_stored += 1 self.n_transitions_stored += episode_length @@ -195,15 +231,57 @@ def add(self, obs: np.ndarray, next_obs: np.ndarray, action: np.ndarray, reward: # if replay buffer is full take random stored episode and replace it idx = np.random.randint(0, self.n_episodes_stored) - if len(self.buffer[idx]) == episode_length: - self.buffer[idx] = episode - elif len(self.buffer[idx]) > episode_length: - self.buffer[idx] = episode - self.n_transitions_stored -= len(self.buffer[idx]) - episode_length + for key in self.buffer.keys(): + self.buffer[key][idx][:episode_length] = episode[key] + # add episode length to length storage + self.episode_lengths[idx] = episode_length if self.n_transitions_stored == self.buffer_size: self.full = True + def _get_episode_dict(self, obs, next_obs, action, reward, done) -> dict: + """ + Convert episode to dictionary. + + :param obs: (np.ndarray) Observation. + :param next_obs: (np.ndarray) Next observation. + :param action: (np.ndarray) Action. + :param reward: (np.ndarray) Reward. + :param done: (np.ndarray) Done. 
+ """ + + observations = [] + achieved_goals = [] + desired_goals = [] + + for obs_ in obs: + observations.append(obs_["observation"]) + achieved_goals.append(obs_["achieved_goal"]) + desired_goals.append(obs_["desired_goal"]) + + next_observations = [] + next_achieved_goals = [] + next_desired_goals = [] + + for next_obs_ in next_obs: + next_observations.append(next_obs_["observation"]) + next_achieved_goals.append(next_obs_["achieved_goal"]) + next_desired_goals.append(next_obs_["desired_goal"]) + + episode = { + "observation": np.array(observations), + "achieved_goal": np.array(achieved_goals), + "desired_goal": np.array(desired_goals), + "action": action, + "reward": reward, + "next_obs": np.array(next_observations), + "next_achieved_goal": np.array(next_achieved_goals), + "next_desired_goal": np.array(next_desired_goals), + "done": done, + } + + return episode + @property def n_episodes_stored(self): return self._n_episodes_stored From 0c0d742f4836af9db6142ab6ef3b95e1a136834b Mon Sep 17 00:00:00 2001 From: Megan Klaiber Date: Mon, 24 Aug 2020 11:29:08 +0200 Subject: [PATCH 21/81] Updated her test. --- tests/test_her.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/test_her.py b/tests/test_her.py index 80dec3a82b..b39c54b241 100644 --- a/tests/test_her.py +++ b/tests/test_her.py @@ -206,7 +206,7 @@ def test_save_load(tmp_path, model_class, policy): os.remove(tmp_path / "test_save.zip") -@pytest.mark.parametrize("online_sampling", [False]) +@pytest.mark.parametrize("online_sampling", [False, True]) @pytest.mark.parametrize("n_bits", [15]) def test_dqn_her(online_sampling, n_bits): """ @@ -226,6 +226,7 @@ def test_dqn_her(online_sampling, n_bits): tau=1, batch_size=32, learning_rate=0.0005, + max_episode_length=n_bits, policy_kwargs=dict(net_arch=[64, 64]), buffer_size=50000, gamma=0.99, From bbf9d6dac3ebeebe67f0d196f78039468f7692f0 Mon Sep 17 00:00:00 2001 From: Antonin RAFFIN Date: Mon, 24 Aug 2020 13:17:36 +0200 Subject: [PATCH 22/81] Fix online sampling --- stable_baselines3/her/her.py | 6 +- stable_baselines3/her/her_replay_buffer.py | 161 +++++++++------------ 2 files changed, 71 insertions(+), 96 deletions(-) diff --git a/stable_baselines3/her/her.py b/stable_baselines3/her/her.py index 3902ed6ad9..a181ddad73 100644 --- a/stable_baselines3/her/her.py +++ b/stable_baselines3/her/her.py @@ -59,7 +59,7 @@ def __init__( goal_selection_strategy: Union[GoalSelectionStrategy, str] = "future", online_sampling: bool = False, learning_rate: Union[float, Callable] = 3e-4, - max_episode_length: int = 10, + max_episode_length: int = 1000, *args, **kwargs, ): @@ -109,7 +109,7 @@ def __init__( self.env.action_space, self.device, self.n_envs, - self.her_ratio, + self.her_ratio, # pytype: disable=wrong-arg-types ) def _setup_model(self) -> None: @@ -346,7 +346,7 @@ def _store_transitions(self) -> None: HerReplayBuffer.sample_goal( self.goal_selection_strategy, idx, self._episode_storage, self.replay_buffer.observations, obs_dim ) - for i in range(self.n_sampled_goal) + for _ in range(self.n_sampled_goal) ) if sample is not None ] diff --git a/stable_baselines3/her/her_replay_buffer.py b/stable_baselines3/her/her_replay_buffer.py index 5508c59c36..70ce5eee39 100644 --- a/stable_baselines3/her/her_replay_buffer.py +++ b/stable_baselines3/her/her_replay_buffer.py @@ -3,7 +3,6 @@ import numpy as np import torch as th from gym import spaces -from gym.spaces import Discrete from stable_baselines3.common.buffers import BaseBuffer from stable_baselines3.common.type_aliases 
import ReplayBufferSamples @@ -31,7 +30,7 @@ class HerReplayBuffer(BaseBuffer): def __init__( self, - env: VecEnv, + env: ObsWrapper, buffer_size: int, max_episode_length: int, goal_selection_strategy: GoalSelectionStrategy, @@ -51,6 +50,8 @@ def __init__( # buffer with episodes # number of episodes which can be stored until buffer size is reached n_episodes = self.buffer_size // self.max_episode_length + self.n_episodes = n_episodes + # input dimensions for buffer initialization input_shape = { "observation": (self.env.num_envs, self.env.obs_dim), @@ -64,19 +65,15 @@ def __init__( "done": (1,), } self.buffer = { - key: np.empty([n_episodes, self.max_episode_length, *dim], dtype=np.float32) for key, dim in input_shape.items() + key: np.empty((n_episodes, self.max_episode_length, *dim), dtype=np.float32) for key, dim in input_shape.items() } # episode length storage, needed for episodes which has less steps than the maximum length - self.episode_lengths = np.empty(n_episodes) + self.episode_lengths = np.empty(n_episodes, dtype=np.uint64) self.goal_selection_strategy = goal_selection_strategy # percentage of her indices self.her_ratio = her_ratio - # memory management - self._n_episodes_stored = 0 - self._n_transitions_stored = 0 - def sample(self, batch_size: int, env: Optional[VecNormalize] = None) -> ReplayBufferSamples: """ :param batch_size: (int) Number of element to sample @@ -94,57 +91,57 @@ def _sample_transitions(self, batch_size: int, env: Optional[VecNormalize]) -> R :return: (ReplayBufferSamples) """ # Select which episodes to use - episode_idxs = np.random.randint(0, self.n_episodes_stored, batch_size) - # select timesteps of episodes - max_timestep_idx = self.episode_lengths[episode_idxs] - # transition_idxs = np.random.randint(self.max_episode_length, size=batch_size) - transition_idxs = np.random.randint(max_timestep_idx) - # get selected timesteps - transitions = {key: self.buffer[key][episode_idxs, transition_idxs].copy() for key in self.buffer.keys()} - # get her samples indices with her_ratio - her_idxs = np.random.choice(np.arange(batch_size), int(self.her_ratio * batch_size), replace=False) - - # if we sample goals from future delete indices from her_idxs where we have no transition after current one - if self.goal_selection_strategy == GoalSelectionStrategy.FUTURE: - her_idxs = her_idxs[transition_idxs[her_idxs] != max_timestep_idx[her_idxs] - 1] - - # get new goals with goal selection strategy - her_new_goals = [ - self.sample_goal(self.goal_selection_strategy, trans, episode, self.buffer["achieved_goal"], online_sampling=True) - for episode, trans in zip(self.buffer["achieved_goal"][episode_idxs[her_idxs]], transition_idxs[her_idxs]) - ] - - # assign new goals as desired_goals - for idx, goal in enumerate(her_new_goals): - # observation - transitions["desired_goal"][her_idxs][idx] = goal - # next observation - transitions["next_desired_goal"][her_idxs][idx] = goal - - # compute new rewards with new goal - achieved_goals = transitions["next_achieved_goal"][her_idxs] - new_rewards = transitions["reward"].copy() - new_rewards[her_idxs] = [ - self.env.env_method("compute_reward", achieved_goal, new_goal, None) - for achieved_goal, new_goal in zip(achieved_goals, her_new_goals) - ] - - # concatenate observation with (desired) goal - obs = [ - np.concatenate([obs, desired_goal], axis=1) - for obs, desired_goal in zip(transitions["observation"], transitions["desired_goal"]) - ] - next_obs = [ - np.concatenate([obs, desired_goal], axis=1) - for obs, desired_goal in 
zip(transitions["next_obs"], transitions["next_desired_goal"]) - ] + episode_indices = np.random.randint(0, self.n_episodes_stored, batch_size) + her_episode_indices = episode_indices[: int(self.her_ratio * batch_size)] + + observations = np.zeros((batch_size, self.env.obs_dim + self.env.goal_dim), dtype=self.observation_space.dtype) + actions = np.zeros((batch_size, 1), dtype=self.action_space.dtype) + next_observations = np.zeros((batch_size, self.env.obs_dim + self.env.goal_dim), dtype=self.observation_space.dtype) + dones = np.zeros((batch_size, 1), dtype=np.float32) + rewards = np.zeros((batch_size, 1), dtype=np.float32) + + for idx, ep_length in enumerate(self.episode_lengths[episode_indices]): + skip_her_sampling = False + if episode_indices[idx] in her_episode_indices and self.goal_selection_strategy == GoalSelectionStrategy.FUTURE: + max_timestep = ep_length - 1 + # handle the case of 1 step episode: we must use a normal transition then + if max_timestep == 0: + max_timestep = ep_length + skip_her_sampling = True + else: + max_timestep = ep_length + + transition_idx = np.random.randint(max_timestep) + transition = {key: self.buffer[key][episode_indices[idx], transition_idx].copy() for key in self.buffer.keys()} + + if episode_indices[idx] in her_episode_indices and not skip_her_sampling: + episode = self.buffer["achieved_goal"][episode_indices[idx]] + new_goal = self.sample_goal( + self.goal_selection_strategy, transition_idx, episode, self.buffer["achieved_goal"], online_sampling=True + ) + # observation + transition["desired_goal"] = new_goal + # next observation + transition["next_desired_goal"] = new_goal + transition["reward"] = self.env.env_method("compute_reward", transition["next_achieved_goal"], new_goal, None) + # TODO: check that it does not change anything + # transition["done"] = False + + # concatenate observation with (desired) goal + obs = np.concatenate([transition["observation"], transition["desired_goal"]], axis=1) + next_obs = np.concatenate([transition["next_obs"], transition["desired_goal"]], axis=1) + observations[idx] = obs + next_observations[idx] = next_obs + actions[idx] = transition["action"] + dones[idx] = transition["done"] + rewards[idx] = transition["reward"] data = ( - self._normalize_obs(np.asarray(obs, dtype=np.int8), env), - transitions["action"], - self._normalize_obs(np.asarray(next_obs, dtype=np.int8), env), - transitions["done"], - self._normalize_obs(new_rewards, env), + self._normalize_obs(observations, env), + actions, + self._normalize_obs(next_observations, env), + dones, + self._normalize_reward(rewards, env), ) return ReplayBufferSamples(*tuple(map(self.to_torch, data))) @@ -218,26 +215,16 @@ def add(self, obs: np.ndarray, next_obs: np.ndarray, action: np.ndarray, reward: episode_length = len(action) episode = self._get_episode_dict(obs, next_obs, action, reward, done) - # check if replay buffer has enough space for all transitions of episode - if self.n_transitions_stored + episode_length <= self.buffer_size: - for key in self.buffer.keys(): - self.buffer[key][self._n_episodes_stored][:episode_length] = episode[key] - # add episode length to length storage - self.episode_lengths[self._n_episodes_stored] = episode_length - # update replay size - self.n_episodes_stored += 1 - self.n_transitions_stored += episode_length - elif self.full: - # if replay buffer is full take random stored episode and replace it - idx = np.random.randint(0, self.n_episodes_stored) - - for key in self.buffer.keys(): - self.buffer[key][idx][:episode_length] = 
episode[key] - # add episode length to length storage - self.episode_lengths[idx] = episode_length - - if self.n_transitions_stored == self.buffer_size: + for key in self.buffer.keys(): + self.buffer[key][self.pos][:episode_length] = episode[key] + # add episode length to length storage + self.episode_lengths[self.pos] = episode_length + + # update current pointer + self.pos += 1 + if self.pos == self.n_episodes: self.full = True + self.pos = 0 def _get_episode_dict(self, obs, next_obs, action, reward, done) -> dict: """ @@ -284,27 +271,15 @@ def _get_episode_dict(self, obs, next_obs, action, reward, done) -> dict: @property def n_episodes_stored(self): - return self._n_episodes_stored - - @n_episodes_stored.setter - def n_episodes_stored(self, n): - self._n_episodes_stored = n - - @property - def n_transitions_stored(self): - return self._n_transitions_stored - - @n_transitions_stored.setter - def n_transitions_stored(self, n): - self._n_transitions_stored = n + if self.full: + return self.n_episodes + return self.pos def clear_buffer(self): - self.buffer = [] - self.n_episodes_stored = 0 - self.n_transitions_stored = 0 + self.buffer = {} def size(self) -> int: """ :return: (int) The current size of the buffer in transitions. """ - return self.n_transitions_stored + return int(np.sum(self.episode_lengths)) From eefea130c53cfea38c3986fdefb71fcaefb26dfe Mon Sep 17 00:00:00 2001 From: Megan Klaiber Date: Mon, 24 Aug 2020 20:51:52 +0200 Subject: [PATCH 23/81] Fixed action bug. Updated time limit for episodes. --- stable_baselines3/her/her.py | 14 +++++++--- stable_baselines3/her/her_replay_buffer.py | 16 +++++------- tests/test_her.py | 30 +++++++++------------- 3 files changed, 30 insertions(+), 30 deletions(-) diff --git a/stable_baselines3/her/her.py b/stable_baselines3/her/her.py index a181ddad73..e6d3f23e7d 100644 --- a/stable_baselines3/her/her.py +++ b/stable_baselines3/her/her.py @@ -3,6 +3,7 @@ from typing import Callable, Iterable, List, Optional, Tuple, Type, Union import numpy as np +from gym.wrappers import TimeLimit from stable_baselines3.common.base_class import BaseAlgorithm from stable_baselines3.common.buffers import ReplayBuffer @@ -59,7 +60,7 @@ def __init__( goal_selection_strategy: Union[GoalSelectionStrategy, str] = "future", online_sampling: bool = False, learning_rate: Union[float, Callable] = 3e-4, - max_episode_length: int = 1000, + max_episode_length: int = -1, *args, **kwargs, ): @@ -100,6 +101,10 @@ def __init__( # counter for steps in episode self.episode_steps = 0 if self.online_sampling: + if isinstance(env, TimeLimit): + self.max_episode_length = env._max_episode_steps # pytype: disable=attribute-error + elif self.max_episode_length <= 0: + raise ValueError("The maximum episode length must be greater than zero.") self.model.replay_buffer = HerReplayBuffer( self.env, self.buffer_size, @@ -288,7 +293,7 @@ def collect_rollouts( if 0 < n_steps <= total_steps: break - if done or self.episode_steps >= self.max_episode_length: + if done or self.episode_steps == self.max_episode_length: if self.online_sampling: observations, actions, rewards, next_observations, done = zip(*self._episode_storage) self.replay_buffer.add(observations, next_observations, actions, rewards, done) @@ -338,6 +343,10 @@ def _store_transitions(self) -> None: # store data in replay buffer self.replay_buffer.add(obs, new_obs, action, reward, done) + # We cannot sample a goal from the future in the last step of an episode + if idx == len(self._episode_storage) - 1 and self.goal_selection_strategy 
== GoalSelectionStrategy.FUTURE: + break + # sample set of additional goals obs_dim = observation["observation"].shape[1] sampled_goals = [ @@ -348,7 +357,6 @@ def _store_transitions(self) -> None: ) for _ in range(self.n_sampled_goal) ) - if sample is not None ] # iterate over sampled new transitions in replay buffer diff --git a/stable_baselines3/her/her_replay_buffer.py b/stable_baselines3/her/her_replay_buffer.py index 70ce5eee39..b448a45c5d 100644 --- a/stable_baselines3/her/her_replay_buffer.py +++ b/stable_baselines3/her/her_replay_buffer.py @@ -1,4 +1,4 @@ -from typing import Optional, Type, Union +from typing import Optional, Union import numpy as np import torch as th @@ -6,7 +6,7 @@ from stable_baselines3.common.buffers import BaseBuffer from stable_baselines3.common.type_aliases import ReplayBufferSamples -from stable_baselines3.common.vec_env import VecEnv, VecNormalize +from stable_baselines3.common.vec_env import VecNormalize from stable_baselines3.common.vec_env.dict_obs_wrapper import ObsWrapper from stable_baselines3.her.goal_selection_strategy import GoalSelectionStrategy @@ -95,7 +95,7 @@ def _sample_transitions(self, batch_size: int, env: Optional[VecNormalize]) -> R her_episode_indices = episode_indices[: int(self.her_ratio * batch_size)] observations = np.zeros((batch_size, self.env.obs_dim + self.env.goal_dim), dtype=self.observation_space.dtype) - actions = np.zeros((batch_size, 1), dtype=self.action_space.dtype) + actions = np.zeros((batch_size, self.action_dim), dtype=self.action_space.dtype) next_observations = np.zeros((batch_size, self.env.obs_dim + self.env.goal_dim), dtype=self.observation_space.dtype) dones = np.zeros((batch_size, 1), dtype=np.float32) rewards = np.zeros((batch_size, 1), dtype=np.float32) @@ -174,12 +174,10 @@ def sample_goal( return episode[-1][0]["achieved_goal"] elif goal_selection_strategy == GoalSelectionStrategy.FUTURE: # replay with random state which comes from the same episode and was observed after current transition - # we have no transition after last transition of episode - if (sample_idx + 1) < len(episode): - index = np.random.choice(np.arange(sample_idx + 1, len(episode))) - if online_sampling: - return episode[index] - return episode[index][0]["achieved_goal"] + index = np.random.choice(np.arange(sample_idx + 1, len(episode))) + if online_sampling: + return episode[index] + return episode[index][0]["achieved_goal"] elif goal_selection_strategy == GoalSelectionStrategy.EPISODE: # replay with random state which comes from the same episode as current transition index = np.random.choice(np.arange(len(episode))) diff --git a/tests/test_her.py b/tests/test_her.py index b39c54b241..34254f6a4d 100644 --- a/tests/test_her.py +++ b/tests/test_her.py @@ -21,8 +21,8 @@ def test_her(model_class, policy, online_sampling): """ Test Hindsight Experience Replay. 
""" - - env = BitFlippingEnv(n_bits=4, continuous=True) + n_bits = 4 + env = BitFlippingEnv(n_bits=n_bits, continuous=True) env = DummyVecEnv([lambda: env]) # Create action noise @@ -47,6 +47,7 @@ def test_her(model_class, policy, online_sampling): gradient_steps=1, train_freq=1, n_episodes_rollout=-1, + max_episode_length=n_bits, ) model.learn(total_timesteps=500, callback=None) @@ -115,6 +116,7 @@ def test_goal_selection_strategy(goal_selection_strategy, online_sampling): gradient_steps=1, train_freq=1, n_episodes_rollout=-1, + max_episode_length=10, ) model.learn(total_timesteps=200, callback=None) @@ -124,7 +126,8 @@ def test_save_load(tmp_path, model_class, policy): """ Test if 'save' and 'load' saves and loads model correctly """ - env = BitFlippingEnv(n_bits=4, continuous=True) + n_bits = 4 + env = BitFlippingEnv(n_bits=n_bits, continuous=True) env = DummyVecEnv([lambda: env]) # Create action noise @@ -150,6 +153,7 @@ def test_save_load(tmp_path, model_class, policy): gradient_steps=1, train_freq=1, n_episodes_rollout=-1, + max_episode_length=n_bits, ) model.learn(total_timesteps=500, callback=None) @@ -219,28 +223,18 @@ def test_dqn_her(online_sampling, n_bits): "MlpPolicy", env, DQN, - n_sampled_goal=4, + n_sampled_goal=5, goal_selection_strategy="future", online_sampling=online_sampling, verbose=1, - tau=1, - batch_size=32, learning_rate=0.0005, max_episode_length=n_bits, - policy_kwargs=dict(net_arch=[64, 64]), - buffer_size=50000, - gamma=0.99, - gradient_steps=1, train_freq=1, - n_episodes_rollout=-1, - tensorboard_log="tensorboard", - learning_starts=1000, - exploration_fraction=0.1, + learning_starts=100, exploration_final_eps=0.02, - exploration_initial_eps=1.0, target_update_interval=500, + seed=0, + batch_size=32, ) - tb_log_name = "run_" + str(online_sampling) + "_" + str(n_bits) - - model.learn(total_timesteps=20000, callback=None, tb_log_name=tb_log_name) + model.learn(total_timesteps=20000) From b5b00db2fe6e2b775e34ac0abafd32141f40a20a Mon Sep 17 00:00:00 2001 From: Megan Klaiber Date: Mon, 24 Aug 2020 21:11:06 +0200 Subject: [PATCH 24/81] Updated convert_dict method to take keys as arguments. --- stable_baselines3/common/vec_env/dict_obs_wrapper.py | 12 ++++++++---- stable_baselines3/her/her_replay_buffer.py | 4 ++-- 2 files changed, 10 insertions(+), 6 deletions(-) diff --git a/stable_baselines3/common/vec_env/dict_obs_wrapper.py b/stable_baselines3/common/vec_env/dict_obs_wrapper.py index 55e5283b06..4d96664962 100644 --- a/stable_baselines3/common/vec_env/dict_obs_wrapper.py +++ b/stable_baselines3/common/vec_env/dict_obs_wrapper.py @@ -52,11 +52,15 @@ def step_wait(self): return self.venv.step_wait() @staticmethod - def convert_dict(observation: dict) -> np.ndarray: + def convert_dict( + observation_dict: dict, observation_key: str = "observation", goal_key: str = "desired_goal" + ) -> np.ndarray: """ - Concatenate observation and desired goal of observation dict. + Concatenate observation and (desired) goal of observation dict. - :param observation: (dict) + :param observation_dict: (dict) Dictionary with observation. + :param observation_key: (str) Key of observation in dicitonary. + :param goal_key: (str) Key of (desired) goal in dicitonary. 
:return: (np.ndarray) """ - return np.concatenate([observation["observation"], observation["desired_goal"]], axis=1) + return np.concatenate([observation_dict[observation_key], observation_dict[goal_key]], axis=1) diff --git a/stable_baselines3/her/her_replay_buffer.py b/stable_baselines3/her/her_replay_buffer.py index b448a45c5d..1e267860a1 100644 --- a/stable_baselines3/her/her_replay_buffer.py +++ b/stable_baselines3/her/her_replay_buffer.py @@ -128,8 +128,8 @@ def _sample_transitions(self, batch_size: int, env: Optional[VecNormalize]) -> R # transition["done"] = False # concatenate observation with (desired) goal - obs = np.concatenate([transition["observation"], transition["desired_goal"]], axis=1) - next_obs = np.concatenate([transition["next_obs"], transition["desired_goal"]], axis=1) + obs = ObsWrapper.convert_dict(transition) + next_obs = ObsWrapper.convert_dict(transition, observation_key="next_obs") observations[idx] = obs next_observations[idx] = next_obs actions[idx] = transition["action"] From fb229b7bb77fcf511c06d495b511005f3dd6a19f Mon Sep 17 00:00:00 2001 From: Megan Klaiber Date: Tue, 25 Aug 2020 14:13:15 +0200 Subject: [PATCH 25/81] Renamed obs dict wrapper. --- stable_baselines3/common/base_class.py | 6 ++++ stable_baselines3/common/policies.py | 4 +-- ...ict_obs_wrapper.py => obs_dict_wrapper.py} | 4 +-- stable_baselines3/her/her.py | 35 ++++++------------- stable_baselines3/her/her_replay_buffer.py | 8 ++--- tests/test_her.py | 4 +-- 6 files changed, 26 insertions(+), 35 deletions(-) rename stable_baselines3/common/vec_env/{dict_obs_wrapper.py => obs_dict_wrapper.py} (95%) diff --git a/stable_baselines3/common/base_class.py b/stable_baselines3/common/base_class.py index a3de8cef86..2d16655fc6 100644 --- a/stable_baselines3/common/base_class.py +++ b/stable_baselines3/common/base_class.py @@ -27,6 +27,7 @@ update_learning_rate, ) from stable_baselines3.common.vec_env import DummyVecEnv, VecEnv, VecNormalize, VecTransposeImage, unwrap_vec_normalize +from stable_baselines3.common.vec_env.obs_dict_wrapper import ObsDictWrapper def maybe_make_env(env: Union[GymEnv, str, None], monitor_wrapper: bool, verbose: int) -> Optional[GymEnv]: @@ -171,6 +172,11 @@ def _wrap_env(self, env: GymEnv) -> VecEnv: if self.verbose >= 1: print("Wrapping the env in a VecTransposeImage.") env = VecTransposeImage(env) + + # check if wrapper for dict support is needed + if isinstance(env.observation_space, gym.spaces.dict.Dict): + env = ObsDictWrapper(env) + return env @abstractmethod diff --git a/stable_baselines3/common/policies.py b/stable_baselines3/common/policies.py index be08e9c35e..babcc99464 100644 --- a/stable_baselines3/common/policies.py +++ b/stable_baselines3/common/policies.py @@ -23,7 +23,7 @@ from stable_baselines3.common.torch_layers import BaseFeaturesExtractor, FlattenExtractor, MlpExtractor, NatureCNN, create_mlp from stable_baselines3.common.utils import get_device, is_vectorized_observation from stable_baselines3.common.vec_env import VecTransposeImage -from stable_baselines3.common.vec_env.dict_obs_wrapper import ObsWrapper +from stable_baselines3.common.vec_env.obs_dict_wrapper import ObsDictWrapper class BaseModel(nn.Module, ABC): @@ -236,7 +236,7 @@ def predict( # if mask is None: # mask = [False for _ in range(self.n_envs)] if isinstance(observation, dict): - observation = ObsWrapper.convert_dict(observation) + observation = ObsDictWrapper.convert_dict(observation) else: observation = np.array(observation) diff --git 
a/stable_baselines3/common/vec_env/dict_obs_wrapper.py b/stable_baselines3/common/vec_env/obs_dict_wrapper.py similarity index 95% rename from stable_baselines3/common/vec_env/dict_obs_wrapper.py rename to stable_baselines3/common/vec_env/obs_dict_wrapper.py index 4d96664962..d524d5e6de 100644 --- a/stable_baselines3/common/vec_env/dict_obs_wrapper.py +++ b/stable_baselines3/common/vec_env/obs_dict_wrapper.py @@ -4,7 +4,7 @@ from stable_baselines3.common.vec_env import VecEnv, VecEnvWrapper -class ObsWrapper(VecEnvWrapper): +class ObsDictWrapper(VecEnvWrapper): """ Wrapper for a VecEnv which overrides the observation space for Hindsight Experience Replay to support dict observations. @@ -12,7 +12,7 @@ class ObsWrapper(VecEnvWrapper): """ def __init__(self, venv: VecEnv): - super(ObsWrapper, self).__init__(venv, venv.observation_space, venv.action_space) + super(ObsDictWrapper, self).__init__(venv, venv.observation_space, venv.action_space) self.venv = venv diff --git a/stable_baselines3/her/her.py b/stable_baselines3/her/her.py index e6d3f23e7d..c59cbaddd4 100644 --- a/stable_baselines3/her/her.py +++ b/stable_baselines3/her/her.py @@ -2,6 +2,7 @@ import pathlib from typing import Callable, Iterable, List, Optional, Tuple, Type, Union +import gym import numpy as np from gym.wrappers import TimeLimit @@ -14,27 +15,12 @@ from stable_baselines3.common.save_util import load_from_zip_file, recursive_getattr, recursive_setattr from stable_baselines3.common.type_aliases import GymEnv, MaybeCallback, RolloutReturn from stable_baselines3.common.utils import check_for_correct_spaces -from stable_baselines3.common.vec_env import VecEnv, VecEnvWrapper -from stable_baselines3.common.vec_env.dict_obs_wrapper import ObsWrapper +from stable_baselines3.common.vec_env import VecEnv +from stable_baselines3.common.vec_env.obs_dict_wrapper import ObsDictWrapper from stable_baselines3.her.goal_selection_strategy import KEY_TO_GOAL_STRATEGY, GoalSelectionStrategy from stable_baselines3.her.her_replay_buffer import HerReplayBuffer -def check_wrapped_env(env: VecEnv) -> VecEnv: - """ - Check if the environment is already wrapped by an ObsWrapper. - - :param env: (VecEnv) Environment to check. 
- :return: (VecEnv) env - """ - env_tmp = env - while isinstance(env_tmp, VecEnvWrapper): - if isinstance(env_tmp, ObsWrapper): - return env - env_tmp = env_tmp.venv - return ObsWrapper(env) - - class HER(BaseAlgorithm): """ Hindsight Experience Replay (HER) @@ -67,9 +53,6 @@ def __init__( super(HER, self).__init__(policy=BasePolicy, env=env, policy_base=BasePolicy, learning_rate=learning_rate) - # check if wrapper for dict support is needed - self.env = check_wrapped_env(self.env) - # model initialization self.model_class = model_class self.model = model_class( @@ -101,7 +84,7 @@ def __init__( # counter for steps in episode self.episode_steps = 0 if self.online_sampling: - if isinstance(env, TimeLimit): + if isinstance(self.env, TimeLimit): self.max_episode_length = env._max_episode_steps # pytype: disable=attribute-error elif self.max_episode_length <= 0: raise ValueError("The maximum episode length must be greater than zero.") @@ -231,7 +214,7 @@ def collect_rollouts( while not done: # concatenate observation and (desired) goal observation = self._last_obs - self._last_obs = ObsWrapper.convert_dict(observation) + self._last_obs = ObsDictWrapper.convert_dict(observation) if self.use_sde and self.sde_sample_freq > 0 and total_steps % self.sde_sample_freq == 0: # Sample a new noise matrix @@ -337,8 +320,8 @@ def _store_transitions(self) -> None: observation, action, reward, new_observation, done = trans # concatenate observation with (desired) goal - obs = ObsWrapper.convert_dict(observation) - new_obs = ObsWrapper.convert_dict(new_observation) + obs = ObsDictWrapper.convert_dict(observation) + new_obs = ObsDictWrapper.convert_dict(new_observation) # store data in replay buffer self.replay_buffer.add(obs, new_obs, action, reward, done) @@ -434,7 +417,9 @@ def load(cls, load_path: str, env: Optional[GymEnv] = None, **kwargs) -> "BaseAl raise KeyError("The observation_space and action_space were not given, can't verify new environments") # check if given env is valid if env is not None: - env = check_wrapped_env(env) + # check if wrapper for dict support is needed + if isinstance(env.observation_space, gym.spaces.dict.Dict): + env = ObsDictWrapper(env) check_for_correct_spaces(env, data["observation_space"], data["action_space"]) # if no new env was given use stored env if possible if env is None and "env" in data: diff --git a/stable_baselines3/her/her_replay_buffer.py b/stable_baselines3/her/her_replay_buffer.py index 1e267860a1..3fd98267b0 100644 --- a/stable_baselines3/her/her_replay_buffer.py +++ b/stable_baselines3/her/her_replay_buffer.py @@ -7,7 +7,7 @@ from stable_baselines3.common.buffers import BaseBuffer from stable_baselines3.common.type_aliases import ReplayBufferSamples from stable_baselines3.common.vec_env import VecNormalize -from stable_baselines3.common.vec_env.dict_obs_wrapper import ObsWrapper +from stable_baselines3.common.vec_env.obs_dict_wrapper import ObsDictWrapper from stable_baselines3.her.goal_selection_strategy import GoalSelectionStrategy @@ -30,7 +30,7 @@ class HerReplayBuffer(BaseBuffer): def __init__( self, - env: ObsWrapper, + env: ObsDictWrapper, buffer_size: int, max_episode_length: int, goal_selection_strategy: GoalSelectionStrategy, @@ -128,8 +128,8 @@ def _sample_transitions(self, batch_size: int, env: Optional[VecNormalize]) -> R # transition["done"] = False # concatenate observation with (desired) goal - obs = ObsWrapper.convert_dict(transition) - next_obs = ObsWrapper.convert_dict(transition, observation_key="next_obs") + obs = 
ObsDictWrapper.convert_dict(transition) + next_obs = ObsDictWrapper.convert_dict(transition, observation_key="next_obs") observations[idx] = obs next_observations[idx] = next_obs actions[idx] = transition["action"] diff --git a/tests/test_her.py b/tests/test_her.py index 34254f6a4d..2c7a90ddc5 100644 --- a/tests/test_her.py +++ b/tests/test_her.py @@ -9,7 +9,7 @@ from stable_baselines3.common.bit_flipping_env import BitFlippingEnv from stable_baselines3.common.noise import OrnsteinUhlenbeckActionNoise from stable_baselines3.common.vec_env import DummyVecEnv -from stable_baselines3.common.vec_env.dict_obs_wrapper import ObsWrapper +from stable_baselines3.common.vec_env.obs_dict_wrapper import ObsDictWrapper from stable_baselines3.her.her import HER, GoalSelectionStrategy from stable_baselines3.sac.policies import SACPolicy from stable_baselines3.td3.policies import MlpPolicy, TD3Policy @@ -163,7 +163,7 @@ def test_save_load(tmp_path, model_class, policy): observations_list = [] for _ in range(10): obs = env.step([env.action_space.sample()])[0] - observation = ObsWrapper.convert_dict(obs) + observation = ObsDictWrapper.convert_dict(obs) observations_list.append(observation) observations = np.concatenate(observations_list, axis=0) From 8a93ac9f020b1cf2a5b226a7ededd776305b8df0 Mon Sep 17 00:00:00 2001 From: Antonin Raffin Date: Tue, 25 Aug 2020 14:19:22 +0200 Subject: [PATCH 26/81] Seed bit flipping env --- stable_baselines3/common/bit_flipping_env.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/stable_baselines3/common/bit_flipping_env.py b/stable_baselines3/common/bit_flipping_env.py index b579fe1579..fd9998a6c1 100644 --- a/stable_baselines3/common/bit_flipping_env.py +++ b/stable_baselines3/common/bit_flipping_env.py @@ -61,7 +61,9 @@ def __init__( max_steps = n_bits self.max_steps = max_steps self.current_step = 0 - self.reset() + + def seed(self, seed: int) -> None: + self.obs_space.seed(seed) def convert_if_needed(self, state: np.ndarray) -> Union[int, np.ndarray]: """ From 66ab30ceb86fd824d9bded78b2af7541c12bddc1 Mon Sep 17 00:00:00 2001 From: Antonin Raffin Date: Tue, 25 Aug 2020 14:59:58 +0200 Subject: [PATCH 27/81] Remove get_episode_dict --- setup.cfg | 1 + stable_baselines3/her/her.py | 12 ++- stable_baselines3/her/her_replay_buffer.py | 112 ++++++++------------- 3 files changed, 49 insertions(+), 76 deletions(-) diff --git a/setup.cfg b/setup.cfg index 011c3d9b17..4b5d439182 100644 --- a/setup.cfg +++ b/setup.cfg @@ -29,6 +29,7 @@ per-file-ignores = ./stable_baselines3/a2c/__init__.py:F401 ./stable_baselines3/ddpg/__init__.py:F401 ./stable_baselines3/dqn/__init__.py:F401 + ./stable_baselines3/her/__init__.py:F401 ./stable_baselines3/ppo/__init__.py:F401 ./stable_baselines3/sac/__init__.py:F401 ./stable_baselines3/td3/__init__.py:F401 diff --git a/stable_baselines3/her/her.py b/stable_baselines3/her/her.py index e6d3f23e7d..df0c00cae8 100644 --- a/stable_baselines3/her/her.py +++ b/stable_baselines3/her/her.py @@ -266,8 +266,11 @@ def collect_rollouts( self._last_original_obs, new_obs_, reward_ = observation, new_obs, reward self.model._last_original_obs = self._last_original_obs - # add current transition to episode storage - self._episode_storage.append((self._last_original_obs, buffer_action, reward_, new_obs_, done)) + if self.online_sampling: + self.replay_buffer.add(self._last_original_obs, new_obs_, buffer_action, reward_, done) + else: + # add current transition to episode storage + self._episode_storage.append((self._last_original_obs, new_obs_, 
buffer_action, reward_, done)) self._last_obs = new_obs self.model._last_obs = self._last_obs @@ -295,8 +298,7 @@ def collect_rollouts( if done or self.episode_steps == self.max_episode_length: if self.online_sampling: - observations, actions, rewards, next_observations, done = zip(*self._episode_storage) - self.replay_buffer.add(observations, next_observations, actions, rewards, done) + self.replay_buffer.store_episode() else: # store episode in replay buffer self._store_transitions() @@ -334,7 +336,7 @@ def _store_transitions(self) -> None: # iterate over current episodes transitions for idx, trans in enumerate(self._episode_storage): - observation, action, reward, new_observation, done = trans + observation, new_observation, action, reward, done = trans # concatenate observation with (desired) goal obs = ObsWrapper.convert_dict(observation) diff --git a/stable_baselines3/her/her_replay_buffer.py b/stable_baselines3/her/her_replay_buffer.py index 1e267860a1..33e1a8aac8 100644 --- a/stable_baselines3/her/her_replay_buffer.py +++ b/stable_baselines3/her/her_replay_buffer.py @@ -1,4 +1,4 @@ -from typing import Optional, Union +from typing import Dict, Optional, Union import numpy as np import torch as th @@ -49,8 +49,8 @@ def __init__( # buffer with episodes # number of episodes which can be stored until buffer size is reached - n_episodes = self.buffer_size // self.max_episode_length - self.n_episodes = n_episodes + self.max_episode_stored = self.buffer_size // self.max_episode_length + self.current_idx = 0 # input dimensions for buffer initialization input_shape = { @@ -65,10 +65,11 @@ def __init__( "done": (1,), } self.buffer = { - key: np.empty((n_episodes, self.max_episode_length, *dim), dtype=np.float32) for key, dim in input_shape.items() + key: np.empty((self.max_episode_stored, self.max_episode_length, *dim), dtype=np.float32) + for key, dim in input_shape.items() } # episode length storage, needed for episodes which has less steps than the maximum length - self.episode_lengths = np.empty(n_episodes, dtype=np.uint64) + self.episode_lengths = np.empty(self.max_episode_stored, dtype=np.uint64) self.goal_selection_strategy = goal_selection_strategy # percentage of her indices @@ -92,7 +93,7 @@ def _sample_transitions(self, batch_size: int, env: Optional[VecNormalize]) -> R """ # Select which episodes to use episode_indices = np.random.randint(0, self.n_episodes_stored, batch_size) - her_episode_indices = episode_indices[: int(self.her_ratio * batch_size)] + her_episode_indices = set(episode_indices[: int(self.her_ratio * batch_size)]) observations = np.zeros((batch_size, self.env.obs_dim + self.env.goal_dim), dtype=self.observation_space.dtype) actions = np.zeros((batch_size, self.action_dim), dtype=self.action_space.dtype) @@ -101,21 +102,23 @@ def _sample_transitions(self, batch_size: int, env: Optional[VecNormalize]) -> R rewards = np.zeros((batch_size, 1), dtype=np.float32) for idx, ep_length in enumerate(self.episode_lengths[episode_indices]): - skip_her_sampling = False - if episode_indices[idx] in her_episode_indices and self.goal_selection_strategy == GoalSelectionStrategy.FUTURE: + her_sampling = episode_indices[idx] in her_episode_indices + + if her_sampling and self.goal_selection_strategy == GoalSelectionStrategy.FUTURE: max_timestep = ep_length - 1 # handle the case of 1 step episode: we must use a normal transition then if max_timestep == 0: max_timestep = ep_length - skip_her_sampling = True + her_sampling = False else: max_timestep = ep_length transition_idx = 
np.random.randint(max_timestep) transition = {key: self.buffer[key][episode_indices[idx], transition_idx].copy() for key in self.buffer.keys()} - if episode_indices[idx] in her_episode_indices and not skip_her_sampling: - episode = self.buffer["achieved_goal"][episode_indices[idx]] + if her_sampling: + episode = self.buffer["achieved_goal"][episode_indices[idx]][: self.episode_lengths[episode_indices[idx]]] + # TODO: check that episode lenght is taken into account for all sampling strategies new_goal = self.sample_goal( self.goal_selection_strategy, transition_idx, episode, self.buffer["achieved_goal"], online_sampling=True ) @@ -200,77 +203,44 @@ def sample_goal( else: raise ValueError("Strategy for sampling goals not supported!") - def add(self, obs: np.ndarray, next_obs: np.ndarray, action: np.ndarray, reward: np.ndarray, done: np.ndarray) -> None: - """ - Add episode to replay buffer + def add( + self, + obs: Dict[str, np.ndarray], + next_obs: Dict[str, np.ndarray], + action: np.ndarray, + reward: np.ndarray, + done: np.ndarray, + ) -> None: + + self.buffer["observation"][self.pos][self.current_idx] = obs["observation"] + self.buffer["achieved_goal"][self.pos][self.current_idx] = obs["achieved_goal"] + self.buffer["desired_goal"][self.pos][self.current_idx] = obs["desired_goal"] + self.buffer["action"][self.pos][self.current_idx] = action + self.buffer["done"][self.pos][self.current_idx] = done + self.buffer["reward"][self.pos][self.current_idx] = reward + self.buffer["next_obs"][self.pos][self.current_idx] = next_obs["observation"] + self.buffer["next_achieved_goal"][self.pos][self.current_idx] = next_obs["achieved_goal"] + self.buffer["next_desired_goal"][self.pos][self.current_idx] = next_obs["desired_goal"] - :param obs: (np.ndarray) Observation. - :param next_obs: (np.ndarray) Next observation. - :param action: (np.ndarray) Action. - :param reward: (np.ndarray) Reward. - :param done: (np.ndarray) Done. - """ - episode_length = len(action) - episode = self._get_episode_dict(obs, next_obs, action, reward, done) + # update current pointer + self.current_idx += 1 - for key in self.buffer.keys(): - self.buffer[key][self.pos][:episode_length] = episode[key] + def store_episode(self): # add episode length to length storage - self.episode_lengths[self.pos] = episode_length + self.episode_lengths[self.pos] = self.current_idx - # update current pointer + # update current episode pointer self.pos += 1 - if self.pos == self.n_episodes: + if self.pos == self.max_episode_stored: self.full = True self.pos = 0 - - def _get_episode_dict(self, obs, next_obs, action, reward, done) -> dict: - """ - Convert episode to dictionary. - - :param obs: (np.ndarray) Observation. - :param next_obs: (np.ndarray) Next observation. - :param action: (np.ndarray) Action. - :param reward: (np.ndarray) Reward. - :param done: (np.ndarray) Done. 
- """ - - observations = [] - achieved_goals = [] - desired_goals = [] - - for obs_ in obs: - observations.append(obs_["observation"]) - achieved_goals.append(obs_["achieved_goal"]) - desired_goals.append(obs_["desired_goal"]) - - next_observations = [] - next_achieved_goals = [] - next_desired_goals = [] - - for next_obs_ in next_obs: - next_observations.append(next_obs_["observation"]) - next_achieved_goals.append(next_obs_["achieved_goal"]) - next_desired_goals.append(next_obs_["desired_goal"]) - - episode = { - "observation": np.array(observations), - "achieved_goal": np.array(achieved_goals), - "desired_goal": np.array(desired_goals), - "action": action, - "reward": reward, - "next_obs": np.array(next_observations), - "next_achieved_goal": np.array(next_achieved_goals), - "next_desired_goal": np.array(next_desired_goals), - "done": done, - } - - return episode + # reset transition pointer + self.current_idx = 0 @property def n_episodes_stored(self): if self.full: - return self.n_episodes + return self.max_episode_stored return self.pos def clear_buffer(self): From d6a5524ab25cdab5836274dbbbd86f53f266aeb8 Mon Sep 17 00:00:00 2001 From: Antonin Raffin Date: Tue, 25 Aug 2020 16:28:27 +0200 Subject: [PATCH 28/81] Add fast online sampling version --- stable_baselines3/her/her_replay_buffer.py | 46 +++++++++++++++++++++- 1 file changed, 45 insertions(+), 1 deletion(-) diff --git a/stable_baselines3/her/her_replay_buffer.py b/stable_baselines3/her/her_replay_buffer.py index 5c91272125..709c3229af 100644 --- a/stable_baselines3/her/her_replay_buffer.py +++ b/stable_baselines3/her/her_replay_buffer.py @@ -82,7 +82,7 @@ def sample(self, batch_size: int, env: Optional[VecNormalize] = None) -> ReplayB to normalize the observations/rewards when sampling :return: (ReplayBufferSamples) """ - return self._sample_transitions(batch_size, env) + return self._sample_transitions_2(batch_size, env) def _sample_transitions(self, batch_size: int, env: Optional[VecNormalize]) -> ReplayBufferSamples: """ @@ -126,6 +126,7 @@ def _sample_transitions(self, batch_size: int, env: Optional[VecNormalize]) -> R transition["desired_goal"] = new_goal # next observation transition["next_desired_goal"] = new_goal + # TODO: vectorized computation of reward transition["reward"] = self.env.env_method("compute_reward", transition["next_achieved_goal"], new_goal, None) # TODO: check that it does not change anything # transition["done"] = False @@ -203,6 +204,46 @@ def sample_goal( else: raise ValueError("Strategy for sampling goals not supported!") + def _sample_transitions_2(self, batch_size: int, env: Optional[VecNormalize]) -> ReplayBufferSamples: + """ + :param batch_size: (int) Number of element to sample + :param env: (Optional[VecNormalize]) associated gym VecEnv + to normalize the observations/rewards when sampling + :return: (ReplayBufferSamples) + """ + # Select which episodes to use + episode_indices = np.random.randint(0, self.n_episodes_stored, batch_size) + transitions_indices = np.random.randint(self.episode_lengths[episode_indices], size=batch_size) + transitions = {key: self.buffer[key][episode_indices, transitions_indices].copy() for key in self.buffer.keys()} + + her_indices = np.random.permutation(batch_size)[: int(self.her_ratio * batch_size)] + future_offset = np.random.uniform(size=batch_size) * (self.episode_lengths[episode_indices] - transitions_indices) + future_offset = future_offset.astype(int) + future_indices = (transitions_indices + future_offset)[her_indices] + # future_indices = 
(transitions_indices + 1 + future_offset)[her_indices] + + future_achieved_goals = self.buffer["achieved_goal"][episode_indices[her_indices], future_indices] + transitions["desired_goal"][her_indices] = future_achieved_goals + + for idx in her_indices: + transitions["reward"][idx] = self.env.env_method( + "compute_reward", transitions["next_achieved_goal"][idx], transitions["desired_goal"][idx], None + ) + + # concatenate observation with (desired) goal + observations = ObsDictWrapper.convert_dict(transitions) + next_observations = ObsDictWrapper.convert_dict(transitions, observation_key="next_obs") + + data = ( + self._normalize_obs(observations, env), + transitions["action"], + self._normalize_obs(next_observations, env), + transitions["done"], + self._normalize_reward(transitions["reward"], env), + ) + + return ReplayBufferSamples(*tuple(map(self.to_torch, data))) + def add( self, obs: Dict[str, np.ndarray], @@ -230,6 +271,9 @@ def store_episode(self): self.episode_lengths[self.pos] = self.current_idx # update current episode pointer + # Note: in the OpenAI implementation + # when the buffer is full, the episode replaced + # is randomly chosen self.pos += 1 if self.pos == self.max_episode_stored: self.full = True From a3c08de8d59a1c73a16d30037fca06d5dfc32993 Mon Sep 17 00:00:00 2001 From: Megan Klaiber Date: Tue, 25 Aug 2020 16:58:15 +0200 Subject: [PATCH 29/81] Added documentation. --- docs/index.rst | 1 + docs/modules/her.rst | 110 +++++++++++++++++++++ stable_baselines3/her/__init__.py | 3 + stable_baselines3/her/her_replay_buffer.py | 3 +- tests/test_her.py | 3 +- 5 files changed, 118 insertions(+), 2 deletions(-) create mode 100644 docs/modules/her.rst diff --git a/docs/index.rst b/docs/index.rst index 939655a1c8..4bc2fbcc99 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -57,6 +57,7 @@ Main Features modules/a2c modules/ddpg modules/dqn + modules/her modules/ppo modules/sac modules/td3 diff --git a/docs/modules/her.rst b/docs/modules/her.rst new file mode 100644 index 0000000000..32532ba0f0 --- /dev/null +++ b/docs/modules/her.rst @@ -0,0 +1,110 @@ +.. _her: + +.. automodule:: stable_baselines3.her + + +HER +==== + +`Hindsight Experience Replay (HER) `_ + +HER is an algorithm that works with Off policy methods (DQN, SAC, TD3 and DDPG for example). + +.. warning:: + + HER requires the environment to inherits from `gym.GoalEnv `_ + + +Notes +----- + +- Original paper: https://arxiv.org/abs/1707.01495 +- OpenAI paper: `Plappert et al. (2018)`_ +- OpenAI blog post: https://openai.com/blog/ingredients-for-robotics-research/ + + +.. _Plappert et al. (2018): https://arxiv.org/abs/1802.09464 + +Can I use? +---------- + +Please refer to the wrapped model (DQN, SAC, TD3 or DDPG) for that section. + +Example +------- + +.. 
code-block:: python + + from stable_baselines3 import DDPG, DQN, SAC, TD3 + from stable_baselines3.her.her import HER + from stable_baselines3.her.goal_selection_strategy import GoalSelectionStrategy + from stable_baselines3.common.bit_flipping_env import BitFlippingEnv + from stable_baselines3.common.vec_env import DummyVecEnv + from stable_baselines3.common.vec_env.obs_dict_wrapper import ObsDictWrapper + + model_class = DQN # works also with SAC, DDPG and TD3 + N_BITS = 15 + + env = BitFlippingEnv(n_bits=N_BITS, continuous=model_class in [DDPG, SAC, TD3], max_steps=N_BITS) + + # Available strategies (cf paper): future, final, episode, random + goal_selection_strategy = 'future' # equivalent to GoalSelectionStrategy.FUTURE + + # If True the HER transitions will get sampled online + online_sampling = True + # Time limit for the episodes in online sampling (to deactivate for offline use the default value -1) + max_episode_length = N_BITS + + # Initialize the model + model = HER('MlpPolicy', env, model_class, n_sampled_goal=4, goal_selection_strategy=goal_selection_strategy, online_sampling=online_sampling, + verbose=1, max_episode_length=max_episode_length) + # Train the model + model.learn(1000) + + model.save("./her_bit_env") + + # WARNING: you must pass an VecEnv + env = DummyVecEnv([lambda: env]) + model = HER.load('./her_bit_env', env=env) + + obs = env.reset() + for _ in range(100): + # we need to convert the observation dict + obs = ObsDictWrapper.convert_dict(obs) + action, _ = model.model.predict(obs) + obs, reward, done, _ = env.step(action) + + if done: + obs = env.reset() + + +Parameters +---------- + +.. autoclass:: HER + :members: + +Goal Selection Strategies +------------------------- + +.. autoclass:: GoalSelectionStrategy + :members: + :inherited-members: + :undoc-members: + + +Obs Dict Wrapper +---------------- + +.. autoclass:: ObsDictWrapper + :members: + :inherited-members: + :undoc-members: + + +HER Replay Buffer +----------------- + +.. autoclass:: HerReplayBuffer + :members: + :inherited-members: diff --git a/stable_baselines3/her/__init__.py b/stable_baselines3/her/__init__.py index ce43bf04cf..24f347305a 100644 --- a/stable_baselines3/her/__init__.py +++ b/stable_baselines3/her/__init__.py @@ -1 +1,4 @@ +from stable_baselines3.common.vec_env.obs_dict_wrapper import ObsDictWrapper +from stable_baselines3.her.goal_selection_strategy import GoalSelectionStrategy from stable_baselines3.her.her import HER +from stable_baselines3.her.her_replay_buffer import HerReplayBuffer diff --git a/stable_baselines3/her/her_replay_buffer.py b/stable_baselines3/her/her_replay_buffer.py index 5c91272125..862e6a4b12 100644 --- a/stable_baselines3/her/her_replay_buffer.py +++ b/stable_baselines3/her/her_replay_buffer.py @@ -13,7 +13,8 @@ class HerReplayBuffer(BaseBuffer): """ - Replay Buffer for online Hindsight Experience Replay (HER) + Replay Buffer for sampling HER (Hindsight Experience Replay) transitions online. + These transitions will not be saved in the Buffer. :param env: (VecEnv) The training environment :param buffer_size: (int) The size of the buffer measured in transitions. 
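A note on the online buffer documented above: with ``online_sampling=True`` the buffer does not hold flat transitions but whole episodes, stored in ``buffer_size // max_episode_length`` fixed-size slots, and goal relabelling only happens when a batch is sampled (that part is illustrated further below, next to the vectorized goal-sampling patch). The sketch that follows is a deliberately simplified, self-contained illustration of that storage scheme; the class name, fields and shapes are invented for the example and are not the ``HerReplayBuffer`` API.

import numpy as np


class ToyEpisodeBuffer:
    """Toy episode-slot storage, illustrating the idea only (not the SB3 class)."""

    def __init__(self, buffer_size: int, max_episode_length: int, obs_dim: int, goal_dim: int):
        self.max_episode_length = max_episode_length
        # how many whole episodes fit into ``buffer_size`` transitions
        self.max_episode_stored = buffer_size // max_episode_length
        base_shape = (self.max_episode_stored, max_episode_length)
        self.observation = np.zeros(base_shape + (obs_dim,), dtype=np.float32)
        self.achieved_goal = np.zeros(base_shape + (goal_dim,), dtype=np.float32)
        self.desired_goal = np.zeros(base_shape + (goal_dim,), dtype=np.float32)
        self.episode_lengths = np.zeros(self.max_episode_stored, dtype=np.int64)
        self.pos = 0          # episode slot currently being written
        self.current_idx = 0  # next free transition inside that slot
        self.full = False

    def add(self, obs, achieved_goal, desired_goal):
        # write one transition into the current episode slot
        self.observation[self.pos, self.current_idx] = obs
        self.achieved_goal[self.pos, self.current_idx] = achieved_goal
        self.desired_goal[self.pos, self.current_idx] = desired_goal
        self.current_idx += 1

    def store_episode(self):
        # remember the true episode length, then move on to the next slot (ring buffer)
        self.episode_lengths[self.pos] = self.current_idx
        self.pos += 1
        if self.pos == self.max_episode_stored:
            self.full = True
            self.pos = 0
        self.current_idx = 0

    @property
    def n_episodes_stored(self) -> int:
        return self.max_episode_stored if self.full else self.pos


buffer = ToyEpisodeBuffer(buffer_size=100, max_episode_length=10, obs_dim=3, goal_dim=2)
for _ in range(7):
    buffer.add(np.ones(3), np.zeros(2), np.ones(2))
buffer.store_episode()
assert buffer.n_episodes_stored == 1 and buffer.episode_lengths[0] == 7

Because an episode slot is only recycled as a whole, the maximum episode length has to be known in advance, which is why ``max_episode_length`` is passed explicitly in the documentation example above (or inferred from a ``gym.wrappers.TimeLimit`` wrapper, as handled in an earlier patch).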
diff --git a/tests/test_her.py b/tests/test_her.py index 2c7a90ddc5..7bb62dc830 100644 --- a/tests/test_her.py +++ b/tests/test_her.py @@ -10,7 +10,8 @@ from stable_baselines3.common.noise import OrnsteinUhlenbeckActionNoise from stable_baselines3.common.vec_env import DummyVecEnv from stable_baselines3.common.vec_env.obs_dict_wrapper import ObsDictWrapper -from stable_baselines3.her.her import HER, GoalSelectionStrategy +from stable_baselines3.her.goal_selection_strategy import GoalSelectionStrategy +from stable_baselines3.her.her import HER from stable_baselines3.sac.policies import SACPolicy from stable_baselines3.td3.policies import MlpPolicy, TD3Policy From bbf5a9363e851e8d5ca0132c288a3ef67e8cc69d Mon Sep 17 00:00:00 2001 From: Antonin Raffin Date: Tue, 25 Aug 2020 17:39:46 +0200 Subject: [PATCH 30/81] Vectorized reward computation --- stable_baselines3/common/bit_flipping_env.py | 10 +- stable_baselines3/her/her_replay_buffer.py | 113 ++++++------------- 2 files changed, 39 insertions(+), 84 deletions(-) diff --git a/stable_baselines3/common/bit_flipping_env.py b/stable_baselines3/common/bit_flipping_env.py index fd9998a6c1..527eab3ee0 100644 --- a/stable_baselines3/common/bit_flipping_env.py +++ b/stable_baselines3/common/bit_flipping_env.py @@ -103,7 +103,7 @@ def step(self, action: Union[np.ndarray, int]) -> GymStepReturn: else: self.state[action] = 1 - self.state[action] obs = self._get_obs() - reward = self.compute_reward(obs["achieved_goal"], obs["desired_goal"], None) + reward = float(self.compute_reward(obs["achieved_goal"], obs["desired_goal"], None)) done = reward == 0 self.current_step += 1 # Episode terminate when we reached the goal or the max number of steps @@ -111,11 +111,11 @@ def step(self, action: Union[np.ndarray, int]) -> GymStepReturn: done = done or self.current_step >= self.max_steps return obs, reward, done, info - def compute_reward(self, achieved_goal: np.ndarray, desired_goal: np.ndarray, _info) -> float: + def compute_reward(self, achieved_goal: np.ndarray, desired_goal: np.ndarray, _info) -> np.float32: # Deceptive reward: it is positive only when the goal is achieved - if self.discrete_obs_space: - return 0.0 if achieved_goal == desired_goal else -1.0 - return 0.0 if (achieved_goal == desired_goal).all() else -1.0 + # vectorized version + distance = np.linalg.norm(achieved_goal - desired_goal, axis=-1) + return -(distance > 0).astype(np.float32) def render(self, mode: str = "human") -> Optional[np.ndarray]: if mode == "rgb_array": diff --git a/stable_baselines3/her/her_replay_buffer.py b/stable_baselines3/her/her_replay_buffer.py index 709c3229af..8a29221cf4 100644 --- a/stable_baselines3/her/her_replay_buffer.py +++ b/stable_baselines3/her/her_replay_buffer.py @@ -82,73 +82,7 @@ def sample(self, batch_size: int, env: Optional[VecNormalize] = None) -> ReplayB to normalize the observations/rewards when sampling :return: (ReplayBufferSamples) """ - return self._sample_transitions_2(batch_size, env) - - def _sample_transitions(self, batch_size: int, env: Optional[VecNormalize]) -> ReplayBufferSamples: - """ - :param batch_size: (int) Number of element to sample - :param env: (Optional[VecNormalize]) associated gym VecEnv - to normalize the observations/rewards when sampling - :return: (ReplayBufferSamples) - """ - # Select which episodes to use - episode_indices = np.random.randint(0, self.n_episodes_stored, batch_size) - her_episode_indices = set(episode_indices[: int(self.her_ratio * batch_size)]) - - observations = np.zeros((batch_size, self.env.obs_dim 
+ self.env.goal_dim), dtype=self.observation_space.dtype) - actions = np.zeros((batch_size, self.action_dim), dtype=self.action_space.dtype) - next_observations = np.zeros((batch_size, self.env.obs_dim + self.env.goal_dim), dtype=self.observation_space.dtype) - dones = np.zeros((batch_size, 1), dtype=np.float32) - rewards = np.zeros((batch_size, 1), dtype=np.float32) - - for idx, ep_length in enumerate(self.episode_lengths[episode_indices]): - her_sampling = episode_indices[idx] in her_episode_indices - - if her_sampling and self.goal_selection_strategy == GoalSelectionStrategy.FUTURE: - max_timestep = ep_length - 1 - # handle the case of 1 step episode: we must use a normal transition then - if max_timestep == 0: - max_timestep = ep_length - her_sampling = False - else: - max_timestep = ep_length - - transition_idx = np.random.randint(max_timestep) - transition = {key: self.buffer[key][episode_indices[idx], transition_idx].copy() for key in self.buffer.keys()} - - if her_sampling: - episode = self.buffer["achieved_goal"][episode_indices[idx]][: self.episode_lengths[episode_indices[idx]]] - # TODO: check that episode lenght is taken into account for all sampling strategies - new_goal = self.sample_goal( - self.goal_selection_strategy, transition_idx, episode, self.buffer["achieved_goal"], online_sampling=True - ) - # observation - transition["desired_goal"] = new_goal - # next observation - transition["next_desired_goal"] = new_goal - # TODO: vectorized computation of reward - transition["reward"] = self.env.env_method("compute_reward", transition["next_achieved_goal"], new_goal, None) - # TODO: check that it does not change anything - # transition["done"] = False - - # concatenate observation with (desired) goal - obs = ObsDictWrapper.convert_dict(transition) - next_obs = ObsDictWrapper.convert_dict(transition, observation_key="next_obs") - observations[idx] = obs - next_observations[idx] = next_obs - actions[idx] = transition["action"] - dones[idx] = transition["done"] - rewards[idx] = transition["reward"] - - data = ( - self._normalize_obs(observations, env), - actions, - self._normalize_obs(next_observations, env), - dones, - self._normalize_reward(rewards, env), - ) - - return ReplayBufferSamples(*tuple(map(self.to_torch, data))) + return self._sample_transitions(batch_size, env) @staticmethod def sample_goal( @@ -204,7 +138,7 @@ def sample_goal( else: raise ValueError("Strategy for sampling goals not supported!") - def _sample_transitions_2(self, batch_size: int, env: Optional[VecNormalize]) -> ReplayBufferSamples: + def _sample_transitions(self, batch_size: int, env: Optional[VecNormalize]) -> ReplayBufferSamples: """ :param batch_size: (int) Number of element to sample :param env: (Optional[VecNormalize]) associated gym VecEnv @@ -213,22 +147,43 @@ def _sample_transitions_2(self, batch_size: int, env: Optional[VecNormalize]) -> """ # Select which episodes to use episode_indices = np.random.randint(0, self.n_episodes_stored, batch_size) - transitions_indices = np.random.randint(self.episode_lengths[episode_indices], size=batch_size) - transitions = {key: self.buffer[key][episode_indices, transitions_indices].copy() for key in self.buffer.keys()} + her_indices = np.arange(batch_size)[: int(self.her_ratio * batch_size)] + # her_indices = np.random.permutation(batch_size)[: int(self.her_ratio * batch_size)] + ep_length = self.episode_lengths[episode_indices] - her_indices = np.random.permutation(batch_size)[: int(self.her_ratio * batch_size)] - future_offset = 
np.random.uniform(size=batch_size) * (self.episode_lengths[episode_indices] - transitions_indices) - future_offset = future_offset.astype(int) - future_indices = (transitions_indices + future_offset)[her_indices] - # future_indices = (transitions_indices + 1 + future_offset)[her_indices] + if self.goal_selection_strategy == GoalSelectionStrategy.FUTURE: + # restrict the sampling domain when ep_length > 1 + # otherwise filter out the indices + her_indices = her_indices[ep_length[her_indices] > 1] + ep_length[her_indices] -= 1 - future_achieved_goals = self.buffer["achieved_goal"][episode_indices[her_indices], future_indices] - transitions["desired_goal"][her_indices] = future_achieved_goals + transitions_indices = np.random.randint(ep_length, size=batch_size) + transitions = {key: self.buffer[key][episode_indices, transitions_indices].copy() for key in self.buffer.keys()} + + # vectorized version of future sampling (fast) + # future_offset = np.random.uniform(size=batch_size) * (self.episode_lengths[episode_indices] - transitions_indices) + # future_offset = future_offset.astype(int) + # future_indices = (transitions_indices + future_offset)[her_indices] + # # future_indices = (transitions_indices + 1 + future_offset)[her_indices] + # future_achieved_goals = self.buffer["achieved_goal"][episode_indices[her_indices], future_indices] + # transitions["desired_goal"][her_indices] = future_achieved_goals for idx in her_indices: - transitions["reward"][idx] = self.env.env_method( - "compute_reward", transitions["next_achieved_goal"][idx], transitions["desired_goal"][idx], None + episode = self.buffer["achieved_goal"][episode_indices[idx]][: self.episode_lengths[episode_indices[idx]]] + # TODO: check that episode length is taken into account for all sampling strategies + new_goal = self.sample_goal( + self.goal_selection_strategy, + transitions_indices[idx], + episode, + self.buffer["achieved_goal"], + online_sampling=True, ) + transitions["desired_goal"][idx] = new_goal + + # Vectorized computation + transitions["reward"][her_indices] = self.env.env_method( + "compute_reward", transitions["next_achieved_goal"][her_indices], transitions["desired_goal"][her_indices], None + ) # concatenate observation with (desired) goal observations = ObsDictWrapper.convert_dict(transitions) From 2525eb020df0effabd4e970f7d2cdbb0a4900bc4 Mon Sep 17 00:00:00 2001 From: Antonin Raffin Date: Tue, 25 Aug 2020 18:19:01 +0200 Subject: [PATCH 31/81] Vectorized goal sampling --- stable_baselines3/her/her_replay_buffer.py | 97 ++++++++++++---------- tests/test_her.py | 8 +- 2 files changed, 57 insertions(+), 48 deletions(-) diff --git a/stable_baselines3/her/her_replay_buffer.py b/stable_baselines3/her/her_replay_buffer.py index 6223765680..756e731ae1 100644 --- a/stable_baselines3/her/her_replay_buffer.py +++ b/stable_baselines3/her/her_replay_buffer.py @@ -70,7 +70,7 @@ def __init__( for key, dim in input_shape.items() } # episode length storage, needed for episodes which has less steps than the maximum length - self.episode_lengths = np.empty(self.max_episode_stored, dtype=np.uint64) + self.episode_lengths = np.zeros(self.max_episode_stored, dtype=np.int64) self.goal_selection_strategy = goal_selection_strategy # percentage of her indices @@ -92,8 +92,7 @@ def sample_goal( episode: list, observations: Union[list, np.ndarray], obs_dim: int = None, - online_sampling: bool = False, - ) -> Union[np.ndarray, None]: + ) -> np.ndarray: """ Sample a goal based on goal_selection_strategy. 
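The hunk below replaces the per-sample Python loop with vectorized goal selection. As a quick illustration of the indexing trick behind the "future" strategy: for every HER-selected sample a random index strictly after the current transition is drawn inside the same stored episode, the achieved goal found there becomes the new desired goal, and all rewards are recomputed in one batched call (using the vectorized bit-flipping reward from the previous patch). The standalone toy sketch below uses invented array names and shapes and does not mirror the library's internals exactly.

import numpy as np


def relabel_future_goals(achieved_goals, ep_lengths, episode_indices, transition_indices, her_mask):
    """Toy vectorized 'future' relabelling (illustrative only).

    achieved_goals: (n_episodes, max_episode_length, goal_dim)
    ep_lengths: (n_episodes,) true length of each stored episode
    episode_indices, transition_indices: (batch_size,) indices of the sampled transitions
    her_mask: (batch_size,) boolean mask of the samples that get a relabelled goal
    """
    her_episodes = episode_indices[her_mask]
    her_transitions = transition_indices[her_mask]
    # one random index per sample, strictly after the current transition, inside the episode
    future_indices = np.random.randint(her_transitions + 1, ep_lengths[her_episodes])
    return achieved_goals[her_episodes, future_indices]


# toy data: 4 stored episodes of length 5 with 1-d goals
achieved_goals = np.arange(4 * 5, dtype=np.float32).reshape(4, 5, 1)
ep_lengths = np.full(4, 5)

batch_size = 6
her_ratio = 0.8
episode_indices = np.random.randint(0, 4, size=batch_size)
# sample from [0, ep_length - 1) so that a strictly later ("future") index always exists
transition_indices = np.random.randint(ep_lengths[episode_indices] - 1)
her_mask = np.arange(batch_size) < int(her_ratio * batch_size)

new_goals = relabel_future_goals(achieved_goals, ep_lengths, episode_indices, transition_indices, her_mask)

# rewards are recomputed in a single vectorized call; for the bit-flipping reward:
next_achieved = achieved_goals[episode_indices[her_mask], transition_indices[her_mask] + 1]
rewards = -(np.linalg.norm(next_achieved - new_goals, axis=-1) > 0).astype(np.float32)
print(new_goals.shape, rewards.shape)  # (4, 1) (4,)

Restricting the HER samples to transition indices in [0, ep_length - 1) guarantees that a strictly later index always exists, which is the same reason the code in this patch shrinks the sampling domain (and drops one-step episodes) for the FUTURE strategy.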
@@ -103,42 +102,71 @@ def sample_goal( :param episode: (list) Current episode. :param observations: (list or np.ndarray) :param obs_dim: (int) Dimension of real observation without goal. It is needed for the random strategy. - :param online_sampling: (bool) Sample HER transitions online. - :return: (np.ndarray or None) Return sampled goal. + :return: (np.ndarray) Return sampled goal. """ if goal_selection_strategy == GoalSelectionStrategy.FINAL: # replay with final state of current episode - if online_sampling: - return episode[-1] return episode[-1][0]["achieved_goal"] elif goal_selection_strategy == GoalSelectionStrategy.FUTURE: # replay with random state which comes from the same episode and was observed after current transition index = np.random.choice(np.arange(sample_idx + 1, len(episode))) - if online_sampling: - return episode[index] return episode[index][0]["achieved_goal"] elif goal_selection_strategy == GoalSelectionStrategy.EPISODE: # replay with random state which comes from the same episode as current transition index = np.random.choice(np.arange(len(episode))) - if online_sampling: - return episode[index] return episode[index][0]["achieved_goal"] elif goal_selection_strategy == GoalSelectionStrategy.RANDOM: - if online_sampling: - # replay with random state from the entire replay buffer - ep_idx = np.random.choice(np.arange(len(observations))) - trans_idx = np.random.choice(np.arange(len(observations[ep_idx]))) - return observations[ep_idx][trans_idx] - else: - # replay with random state from the entire replay buffer - index = np.random.choice(np.arange(len(observations))) - obs = observations[index] - # get only the observation part - obs_array = obs[:, :obs_dim] - return obs_array + # replay with random state from the entire replay buffer + index = np.random.choice(np.arange(len(observations))) + obs = observations[index] + # get only the observation part + # TODO: check that line (or the comment at least) + obs_array = obs[:, :obs_dim] + return obs_array + else: + raise ValueError("Strategy for sampling goals not supported!") + + def vectorized_sample_goal( + self, episode_indices: np.ndarray, her_indices: np.ndarray, transitions_indices: np.ndarray + ) -> np.ndarray: + """ + Sample goals based on goal_selection_strategy. + This is the vectorized (faster) version of ``sample_goal()`` + + :param goal_selection_strategy: (GoalSelectionStrategy ) Strategy for sampling goals for replay. + One of ['episode', 'final', 'future', 'random'] + :param sample_idx: (int) Index of current transition. + :param episode: (list) Current episode. + :param observations: (list or np.ndarray) + :param obs_dim: (int) Dimension of real observation without goal. It is needed for the random strategy. + :param online_sampling: (bool) Sample HER transitions online. + :return: (np.ndarray) Return sampled goals. 
+ """ + her_episode_indices = episode_indices[her_indices] + + if self.goal_selection_strategy == GoalSelectionStrategy.FINAL: + # replay with final state of current episode + transitions_indices = self.episode_lengths[her_indices] - 1 + + elif self.goal_selection_strategy == GoalSelectionStrategy.FUTURE: + # replay with random state which comes from the same episode and was observed after current transition + transitions_indices = np.random.randint( + transitions_indices[her_indices] + 1, self.episode_lengths[her_episode_indices] + ) + + elif self.goal_selection_strategy == GoalSelectionStrategy.EPISODE: + # replay with random state which comes from the same episode as current transition + transitions_indices = np.random.randint(self.episode_lengths[her_episode_indices]) + + elif self.goal_selection_strategy == GoalSelectionStrategy.RANDOM: + # replay with random state from the entire replay buffer + her_episode_indices = np.random.randint(self.n_episodes_stored, size=len(her_indices)) + transitions_indices = np.random.randint(self.episode_lengths[her_episode_indices]) else: raise ValueError("Strategy for sampling goals not supported!") + return self.buffer["achieved_goal"][her_episode_indices, transitions_indices] + def _sample_transitions(self, batch_size: int, env: Optional[VecNormalize]) -> ReplayBufferSamples: """ :param batch_size: (int) Number of element to sample @@ -158,28 +186,11 @@ def _sample_transitions(self, batch_size: int, env: Optional[VecNormalize]) -> R her_indices = her_indices[ep_length[her_indices] > 1] ep_length[her_indices] -= 1 - transitions_indices = np.random.randint(ep_length, size=batch_size) + transitions_indices = np.random.randint(ep_length) transitions = {key: self.buffer[key][episode_indices, transitions_indices].copy() for key in self.buffer.keys()} - # vectorized version of future sampling (fast) - # future_offset = np.random.uniform(size=batch_size) * (self.episode_lengths[episode_indices] - transitions_indices) - # future_offset = future_offset.astype(int) - # future_indices = (transitions_indices + future_offset)[her_indices] - # # future_indices = (transitions_indices + 1 + future_offset)[her_indices] - # future_achieved_goals = self.buffer["achieved_goal"][episode_indices[her_indices], future_indices] - # transitions["desired_goal"][her_indices] = future_achieved_goals - - for idx in her_indices: - episode = self.buffer["achieved_goal"][episode_indices[idx]][: self.episode_lengths[episode_indices[idx]]] - # TODO: check that episode length is taken into account for all sampling strategies - new_goal = self.sample_goal( - self.goal_selection_strategy, - transitions_indices[idx], - episode, - self.buffer["achieved_goal"], - online_sampling=True, - ) - transitions["desired_goal"][idx] = new_goal + new_goals = self.vectorized_sample_goal(episode_indices, her_indices, transitions_indices) + transitions["desired_goal"][her_indices] = new_goals # Vectorized computation transitions["reward"][her_indices] = self.env.env_method( diff --git a/tests/test_her.py b/tests/test_her.py index 7bb62dc830..ed7b7ee175 100644 --- a/tests/test_her.py +++ b/tests/test_her.py @@ -12,11 +12,9 @@ from stable_baselines3.common.vec_env.obs_dict_wrapper import ObsDictWrapper from stable_baselines3.her.goal_selection_strategy import GoalSelectionStrategy from stable_baselines3.her.her import HER -from stable_baselines3.sac.policies import SACPolicy -from stable_baselines3.td3.policies import MlpPolicy, TD3Policy -@pytest.mark.parametrize("model_class, policy", [(SAC, SACPolicy), 
(TD3, TD3Policy), (DDPG, MlpPolicy)]) +@pytest.mark.parametrize("model_class, policy", [(SAC, "MlpPolicy"), (TD3, "MlpPolicy"), (DDPG, "MlpPolicy")]) @pytest.mark.parametrize("online_sampling", [True, False]) def test_her(model_class, policy, online_sampling): """ @@ -109,7 +107,7 @@ def test_goal_selection_strategy(goal_selection_strategy, online_sampling): env = DummyVecEnv([lambda: env]) model = HER( - SACPolicy, + "MlpPolicy", env, SAC, goal_selection_strategy=goal_selection_strategy, @@ -122,7 +120,7 @@ def test_goal_selection_strategy(goal_selection_strategy, online_sampling): model.learn(total_timesteps=200, callback=None) -@pytest.mark.parametrize("model_class, policy", [(SAC, SACPolicy), (TD3, TD3Policy), (DDPG, MlpPolicy)]) +@pytest.mark.parametrize("model_class, policy", [(SAC, "MlpPolicy"), (TD3, "MlpPolicy"), (DDPG, "MlpPolicy")]) def test_save_load(tmp_path, model_class, policy): """ Test if 'save' and 'load' saves and loads model correctly From c57c6ef16f4c859ced1e05b8525806105fcfd956 Mon Sep 17 00:00:00 2001 From: Megan Klaiber Date: Wed, 26 Aug 2020 01:13:03 +0200 Subject: [PATCH 32/81] Update time limit for episodes in online her sampling. --- stable_baselines3/her/her.py | 35 +++++++++++++++++++++++++++++------ 1 file changed, 29 insertions(+), 6 deletions(-) diff --git a/stable_baselines3/her/her.py b/stable_baselines3/her/her.py index 8007fa16bb..27bcfa84cf 100644 --- a/stable_baselines3/her/her.py +++ b/stable_baselines3/her/her.py @@ -4,7 +4,6 @@ import gym import numpy as np -from gym.wrappers import TimeLimit from stable_baselines3.common.base_class import BaseAlgorithm from stable_baselines3.common.buffers import ReplayBuffer @@ -15,12 +14,39 @@ from stable_baselines3.common.save_util import load_from_zip_file, recursive_getattr, recursive_setattr from stable_baselines3.common.type_aliases import GymEnv, MaybeCallback, RolloutReturn from stable_baselines3.common.utils import check_for_correct_spaces -from stable_baselines3.common.vec_env import VecEnv +from stable_baselines3.common.vec_env import VecEnv, VecEnvWrapper from stable_baselines3.common.vec_env.obs_dict_wrapper import ObsDictWrapper from stable_baselines3.her.goal_selection_strategy import KEY_TO_GOAL_STRATEGY, GoalSelectionStrategy from stable_baselines3.her.her_replay_buffer import HerReplayBuffer +def get_time_limit(env: VecEnv, current_max_episode_length: int) -> int: + """ + Get time limit from environment. + + :param env: (VecEnv) Environment from which we want to get the time limit. + :param current_max_episode_length: (int) Current value for max_episode_length. 
+ :return: (int) max episode length + """ + # unwrap environment first + env_tmp = env + while isinstance(env_tmp, VecEnvWrapper): + env_tmp = env_tmp.venv + # try to get the attribute from environment + try: + current_max_episode_length = env_tmp.get_attr("_max_episode_steps")[0] + # if not available check if a valid value was passed as an argument + except AttributeError: + # throw an error when we have no valid value passed + if current_max_episode_length <= 0: + raise ValueError("The maximum episode length must be greater than zero.") + else: + # if valid value was passed take this as time limit + current_max_episode_length = current_max_episode_length + + return current_max_episode_length + + class HER(BaseAlgorithm): """ Hindsight Experience Replay (HER) @@ -84,10 +110,7 @@ def __init__( # counter for steps in episode self.episode_steps = 0 if self.online_sampling: - if isinstance(self.env, TimeLimit): - self.max_episode_length = env._max_episode_steps # pytype: disable=attribute-error - elif self.max_episode_length <= 0: - raise ValueError("The maximum episode length must be greater than zero.") + self.max_episode_length = get_time_limit(self.env, self.max_episode_length) self.model.replay_buffer = HerReplayBuffer( self.env, self.buffer_size, From 902267c1c2bd9fce0aa103759cdeca335c19472d Mon Sep 17 00:00:00 2001 From: Antonin RAFFIN Date: Wed, 26 Aug 2020 12:10:45 +0200 Subject: [PATCH 33/81] Fix max episode length inference --- stable_baselines3/her/her.py | 37 ++++++++++++++++++------------------ 1 file changed, 18 insertions(+), 19 deletions(-) diff --git a/stable_baselines3/her/her.py b/stable_baselines3/her/her.py index 27bcfa84cf..fe26cd2cf4 100644 --- a/stable_baselines3/her/her.py +++ b/stable_baselines3/her/her.py @@ -14,13 +14,13 @@ from stable_baselines3.common.save_util import load_from_zip_file, recursive_getattr, recursive_setattr from stable_baselines3.common.type_aliases import GymEnv, MaybeCallback, RolloutReturn from stable_baselines3.common.utils import check_for_correct_spaces -from stable_baselines3.common.vec_env import VecEnv, VecEnvWrapper +from stable_baselines3.common.vec_env import VecEnv from stable_baselines3.common.vec_env.obs_dict_wrapper import ObsDictWrapper from stable_baselines3.her.goal_selection_strategy import KEY_TO_GOAL_STRATEGY, GoalSelectionStrategy from stable_baselines3.her.her_replay_buffer import HerReplayBuffer -def get_time_limit(env: VecEnv, current_max_episode_length: int) -> int: +def get_time_limit(env: VecEnv, current_max_episode_length: Optional[int]) -> int: """ Get time limit from environment. @@ -28,22 +28,18 @@ def get_time_limit(env: VecEnv, current_max_episode_length: int) -> int: :param current_max_episode_length: (int) Current value for max_episode_length. 
:return: (int) max episode length """ - # unwrap environment first - env_tmp = env - while isinstance(env_tmp, VecEnvWrapper): - env_tmp = env_tmp.venv # try to get the attribute from environment - try: - current_max_episode_length = env_tmp.get_attr("_max_episode_steps")[0] - # if not available check if a valid value was passed as an argument - except AttributeError: - # throw an error when we have no valid value passed - if current_max_episode_length <= 0: - raise ValueError("The maximum episode length must be greater than zero.") - else: - # if valid value was passed take this as time limit - current_max_episode_length = current_max_episode_length - + if current_max_episode_length is None: + try: + current_max_episode_length = env.get_attr("spec")[0].max_episode_steps + # if not available check if a valid value was passed as an argument + except AttributeError: + raise ValueError( + "The max episode length could not be inferred." + "You must specify a `max_episode_steps` when registering the environment, " + "use a `gym.wrappers.TimeLimit` wrapper " + "or pass `max_episode_length` to the model constructor" + ) return current_max_episode_length @@ -51,6 +47,8 @@ class HER(BaseAlgorithm): """ Hindsight Experience Replay (HER) + Paper: https://arxiv.org/abs/1707.01495 + :param policy: (BasePolicy or str) The policy model to use. :param env: (GymEnv or str) The environment to learn from (if registered in Gym, can be str) :param model_class: (OffPolicyAlgorithm) Off policy model which will be used with hindsight experience replay. (SAC, TD3) @@ -60,7 +58,8 @@ class HER(BaseAlgorithm): :param online_sampling: (bool) Sample HER transitions online. :param learning_rate: (float or callable) learning rate for the optimizer, it can be a function of the current progress remaining (from 1 to 0) - :param max_episode_length: (int) The length of an episode. (time horizon) + :param max_episode_length: (int) The maximum length of an episode. If not specified, + it will be automatically inferred if the environment uses a ``gym.wrappers.TimeLimit`` wrapper """ def __init__( @@ -72,7 +71,7 @@ def __init__( goal_selection_strategy: Union[GoalSelectionStrategy, str] = "future", online_sampling: bool = False, learning_rate: Union[float, Callable] = 3e-4, - max_episode_length: int = -1, + max_episode_length: Optional[int] = None, *args, **kwargs, ): From fc7f647f4b51f1da7328751d9095191aa9bef836 Mon Sep 17 00:00:00 2001 From: Antonin RAFFIN Date: Wed, 26 Aug 2020 12:46:44 +0200 Subject: [PATCH 34/81] Bug fix for Fetch envs --- stable_baselines3/common/vec_env/obs_dict_wrapper.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/stable_baselines3/common/vec_env/obs_dict_wrapper.py b/stable_baselines3/common/vec_env/obs_dict_wrapper.py index d524d5e6de..22fbae4060 100644 --- a/stable_baselines3/common/vec_env/obs_dict_wrapper.py +++ b/stable_baselines3/common/vec_env/obs_dict_wrapper.py @@ -63,4 +63,4 @@ def convert_dict( :param goal_key: (str) Key of (desired) goal in dicitonary. 
:return: (np.ndarray) """ - return np.concatenate([observation_dict[observation_key], observation_dict[goal_key]], axis=1) + return np.concatenate([observation_dict[observation_key], observation_dict[goal_key]], axis=-1) From 0757a73fb5a06e1a00ddd50bb71c2c21718eb296 Mon Sep 17 00:00:00 2001 From: Antonin Raffin Date: Thu, 27 Aug 2020 11:20:38 +0200 Subject: [PATCH 35/81] Fix for HER + gSDE --- stable_baselines3/her/her.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/stable_baselines3/her/her.py b/stable_baselines3/her/her.py index fe26cd2cf4..a1615dcb46 100644 --- a/stable_baselines3/her/her.py +++ b/stable_baselines3/her/her.py @@ -223,7 +223,7 @@ def collect_rollouts( assert isinstance(env, VecEnv), "You must pass a VecEnv" assert env.num_envs == 1, "OffPolicyAlgorithm only support single environment" - if self.use_sde: + if self.model.use_sde: self.actor.reset_noise() callback.on_rollout_start() @@ -238,7 +238,7 @@ def collect_rollouts( observation = self._last_obs self._last_obs = ObsDictWrapper.convert_dict(observation) - if self.use_sde and self.sde_sample_freq > 0 and total_steps % self.sde_sample_freq == 0: + if self.model.use_sde and self.model.sde_sample_freq > 0 and total_steps % self.model.sde_sample_freq == 0: # Sample a new noise matrix self.actor.reset_noise() From eb89099bddff8af4efe7412cf532ebaba2049abe Mon Sep 17 00:00:00 2001 From: Antonin Raffin Date: Thu, 27 Aug 2020 12:55:36 +0200 Subject: [PATCH 36/81] Reformat (new black version) --- stable_baselines3/common/buffers.py | 6 +++--- stable_baselines3/common/policies.py | 5 ++++- stable_baselines3/common/save_util.py | 4 +++- stable_baselines3/dqn/policies.py | 5 ++++- stable_baselines3/her/her.py | 6 +++++- tests/test_custom_policy.py | 2 +- tests/test_distributions.py | 6 +++++- tests/test_her.py | 14 ++++++++++++-- tests/test_vec_envs.py | 2 +- 9 files changed, 38 insertions(+), 12 deletions(-) diff --git a/stable_baselines3/common/buffers.py b/stable_baselines3/common/buffers.py index 4534063a28..6c58953845 100644 --- a/stable_baselines3/common/buffers.py +++ b/stable_baselines3/common/buffers.py @@ -171,12 +171,12 @@ def __init__( mem_available = psutil.virtual_memory().available self.optimize_memory_usage = optimize_memory_usage - self.observations = np.zeros((self.buffer_size, self.n_envs,) + self.obs_shape, dtype=observation_space.dtype) + self.observations = np.zeros((self.buffer_size, self.n_envs) + self.obs_shape, dtype=observation_space.dtype) if optimize_memory_usage: # `observations` contains also the next observation self.next_observations = None else: - self.next_observations = np.zeros((self.buffer_size, self.n_envs,) + self.obs_shape, dtype=observation_space.dtype) + self.next_observations = np.zeros((self.buffer_size, self.n_envs) + self.obs_shape, dtype=observation_space.dtype) self.actions = np.zeros((self.buffer_size, self.n_envs, self.action_dim), dtype=action_space.dtype) self.rewards = np.zeros((self.buffer_size, self.n_envs), dtype=np.float32) self.dones = np.zeros((self.buffer_size, self.n_envs), dtype=np.float32) @@ -284,7 +284,7 @@ def __init__( self.reset() def reset(self) -> None: - self.observations = np.zeros((self.buffer_size, self.n_envs,) + self.obs_shape, dtype=np.float32) + self.observations = np.zeros((self.buffer_size, self.n_envs) + self.obs_shape, dtype=np.float32) self.actions = np.zeros((self.buffer_size, self.n_envs, self.action_dim), dtype=np.float32) self.rewards = np.zeros((self.buffer_size, self.n_envs), dtype=np.float32) self.returns = 
np.zeros((self.buffer_size, self.n_envs), dtype=np.float32) diff --git a/stable_baselines3/common/policies.py b/stable_baselines3/common/policies.py index babcc99464..3450d82920 100644 --- a/stable_baselines3/common/policies.py +++ b/stable_baselines3/common/policies.py @@ -703,7 +703,10 @@ def __init__( n_critics: int = 2, ): super().__init__( - observation_space, action_space, features_extractor=features_extractor, normalize_images=normalize_images, + observation_space, + action_space, + features_extractor=features_extractor, + normalize_images=normalize_images, ) action_dim = get_action_dim(self.action_space) diff --git a/stable_baselines3/common/save_util.py b/stable_baselines3/common/save_util.py index 51fa8cd175..326db1e8a6 100644 --- a/stable_baselines3/common/save_util.py +++ b/stable_baselines3/common/save_util.py @@ -350,7 +350,9 @@ def load_from_pkl(path: Union[str, pathlib.Path, io.BufferedIOBase], verbose=0) def load_from_zip_file( - load_path: Union[str, pathlib.Path, io.BufferedIOBase], load_data: bool = True, verbose=0, + load_path: Union[str, pathlib.Path, io.BufferedIOBase], + load_data: bool = True, + verbose=0, ) -> (Tuple[Optional[Dict[str, Any]], Optional[TensorDict], Optional[TensorDict]]): """ Load model data from a .zip archive diff --git a/stable_baselines3/dqn/policies.py b/stable_baselines3/dqn/policies.py index f5001c7548..ebbcd34bff 100644 --- a/stable_baselines3/dqn/policies.py +++ b/stable_baselines3/dqn/policies.py @@ -31,7 +31,10 @@ def __init__( normalize_images: bool = True, ): super(QNetwork, self).__init__( - observation_space, action_space, features_extractor=features_extractor, normalize_images=normalize_images, + observation_space, + action_space, + features_extractor=features_extractor, + normalize_images=normalize_images, ) if net_arch is None: diff --git a/stable_baselines3/her/her.py b/stable_baselines3/her/her.py index a1615dcb46..86c0a8a060 100644 --- a/stable_baselines3/her/her.py +++ b/stable_baselines3/her/her.py @@ -81,7 +81,11 @@ def __init__( # model initialization self.model_class = model_class self.model = model_class( - policy=policy, env=self.env, learning_rate=learning_rate, *args, **kwargs, # pytype: disable=wrong-keyword-args + policy=policy, + env=self.env, + learning_rate=learning_rate, + *args, + **kwargs, # pytype: disable=wrong-keyword-args ) self.verbose = self.model.verbose diff --git a/tests/test_custom_policy.py b/tests/test_custom_policy.py index c1e08dfacf..95f4a7c9ad 100644 --- a/tests/test_custom_policy.py +++ b/tests/test_custom_policy.py @@ -22,7 +22,7 @@ def test_flexible_mlp(model_class, net_arch): _ = model_class("MlpPolicy", "CartPole-v1", policy_kwargs=dict(net_arch=net_arch), n_steps=100).learn(1000) -@pytest.mark.parametrize("net_arch", [[4], [4, 4],]) +@pytest.mark.parametrize("net_arch", [[4], [4, 4]]) @pytest.mark.parametrize("model_class", [SAC, TD3]) def test_custom_offpolicy(model_class, net_arch): _ = model_class("MlpPolicy", "Pendulum-v0", policy_kwargs=dict(net_arch=net_arch)).learn(1000) diff --git a/tests/test_distributions.py b/tests/test_distributions.py index a73b81eded..490f80eb3b 100644 --- a/tests/test_distributions.py +++ b/tests/test_distributions.py @@ -67,7 +67,11 @@ def test_sde_distribution(): # TODO: analytical form for squashed Gaussian? 
@pytest.mark.parametrize( - "dist", [DiagGaussianDistribution(N_ACTIONS), StateDependentNoiseDistribution(N_ACTIONS, squash_output=False),] + "dist", + [ + DiagGaussianDistribution(N_ACTIONS), + StateDependentNoiseDistribution(N_ACTIONS, squash_output=False), + ], ) def test_entropy(dist): # The entropy can be approximated by averaging the negative log likelihood diff --git a/tests/test_her.py b/tests/test_her.py index ed7b7ee175..34c197b224 100644 --- a/tests/test_her.py +++ b/tests/test_her.py @@ -26,7 +26,12 @@ def test_her(model_class, policy, online_sampling): # Create action noise n_actions = env.action_space.shape[0] - action_noise = OrnsteinUhlenbeckActionNoise(np.zeros(n_actions,), 0.2 * np.ones((n_actions,))) + action_noise = OrnsteinUhlenbeckActionNoise( + np.zeros( + n_actions, + ), + 0.2 * np.ones((n_actions,)), + ) model = HER( policy, @@ -131,7 +136,12 @@ def test_save_load(tmp_path, model_class, policy): # Create action noise n_actions = env.action_space.shape[0] - action_noise = OrnsteinUhlenbeckActionNoise(np.zeros(n_actions,), 0.2 * np.ones((n_actions,))) + action_noise = OrnsteinUhlenbeckActionNoise( + np.zeros( + n_actions, + ), + 0.2 * np.ones((n_actions,)), + ) # create model model = HER( diff --git a/tests/test_vec_envs.py b/tests/test_vec_envs.py index 8c33341c57..141ca6a65f 100644 --- a/tests/test_vec_envs.py +++ b/tests/test_vec_envs.py @@ -225,7 +225,7 @@ def make_env(): def check_vecenv_obs(obs, space): """Helper method to check observations from multiple environments each belong to - the appropriate observation space.""" + the appropriate observation space.""" assert obs.shape[0] == N_ENVS for value in obs: assert space.contains(value) From d1adff61fd8aff89db797dbb0450a74d36f10fb4 Mon Sep 17 00:00:00 2001 From: Megan Klaiber Date: Thu, 27 Aug 2020 14:21:33 +0200 Subject: [PATCH 37/81] Added info dict to compute new reward. Check her_replay_buffer again. 
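The patch below starts storing the environment info dicts and passing them to compute_reward when goals are relabelled. For reference, this leans on the gym.GoalEnv contract, where compute_reward(achieved_goal, desired_goal, info) must be recomputable for arbitrary goals. The toy class here is only an illustration (not SB3's BitFlippingEnv): a sparse reward written to also accept batched goals, which is what the vectorized env_method("compute_reward", ...) calls rely on.

    import numpy as np

    class SparseGoalReward:
        """Toy stand-in for a gym.GoalEnv reward: 0 when the goal is reached, -1 otherwise."""

        def compute_reward(self, achieved_goal, desired_goal, info):
            # `info` is unused in this toy example; some environments need it,
            # which is why the patch starts threading it through the buffer.
            achieved_goal = np.asarray(achieved_goal)
            desired_goal = np.asarray(desired_goal)
            missed = np.abs(achieved_goal - desired_goal).sum(axis=-1) > 0
            return -missed.astype(np.float32)

    # Works for a single pair or for a whole relabelled batch at once:
    rewards = SparseGoalReward().compute_reward(
        np.zeros((4, 3)), np.ones((4, 3)), [{} for _ in range(4)]
    )  # -> array([-1., -1., -1., -1.], dtype=float32)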
--- stable_baselines3/her/her.py | 8 ++++---- stable_baselines3/her/her_replay_buffer.py | 8 +++++++- 2 files changed, 11 insertions(+), 5 deletions(-) diff --git a/stable_baselines3/her/her.py b/stable_baselines3/her/her.py index 86c0a8a060..87505282d4 100644 --- a/stable_baselines3/her/her.py +++ b/stable_baselines3/her/her.py @@ -276,10 +276,10 @@ def collect_rollouts( self.model._last_original_obs = self._last_original_obs if self.online_sampling: - self.replay_buffer.add(self._last_original_obs, new_obs_, buffer_action, reward_, done) + self.replay_buffer.add(self._last_original_obs, new_obs_, buffer_action, reward_, done, infos) else: # add current transition to episode storage - self._episode_storage.append((self._last_original_obs, new_obs_, buffer_action, reward_, done)) + self._episode_storage.append((self._last_original_obs, new_obs_, buffer_action, reward_, done, infos)) self._last_obs = new_obs self.model._last_obs = self._last_obs @@ -345,7 +345,7 @@ def _store_transitions(self) -> None: # iterate over current episodes transitions for idx, trans in enumerate(self._episode_storage): - observation, new_observation, action, reward, done = trans + observation, new_observation, action, reward, done, infos = trans # concatenate observation with (desired) goal obs = ObsDictWrapper.convert_dict(observation) @@ -373,7 +373,7 @@ def _store_transitions(self) -> None: # iterate over sampled new transitions in replay buffer for goal in sampled_goals: # compute new reward with new goal - new_reward = self.env.env_method("compute_reward", new_observation["achieved_goal"], goal, None) + new_reward = self.env.env_method("compute_reward", new_observation["achieved_goal"], goal, infos) # concatenate observation with (desired) goal obs = np.concatenate([observation["observation"], goal], axis=1) diff --git a/stable_baselines3/her/her_replay_buffer.py b/stable_baselines3/her/her_replay_buffer.py index 756e731ae1..d1a2f475fb 100644 --- a/stable_baselines3/her/her_replay_buffer.py +++ b/stable_baselines3/her/her_replay_buffer.py @@ -64,6 +64,7 @@ def __init__( "next_achieved_goal": (self.env.num_envs, self.env.goal_dim), "next_desired_goal": (self.env.num_envs, self.env.goal_dim), "done": (1,), + "infos": (1,), } self.buffer = { key: np.empty((self.max_episode_stored, self.max_episode_length, *dim), dtype=np.float32) @@ -194,7 +195,10 @@ def _sample_transitions(self, batch_size: int, env: Optional[VecNormalize]) -> R # Vectorized computation transitions["reward"][her_indices] = self.env.env_method( - "compute_reward", transitions["next_achieved_goal"][her_indices], transitions["desired_goal"][her_indices], None + "compute_reward", + transitions["next_achieved_goal"][her_indices], + transitions["desired_goal"][her_indices], + transitions["infos"][her_indices], ) # concatenate observation with (desired) goal @@ -218,6 +222,7 @@ def add( action: np.ndarray, reward: np.ndarray, done: np.ndarray, + infos: Dict[str, np.ndarray], ) -> None: self.buffer["observation"][self.pos][self.current_idx] = obs["observation"] @@ -229,6 +234,7 @@ def add( self.buffer["next_obs"][self.pos][self.current_idx] = next_obs["observation"] self.buffer["next_achieved_goal"][self.pos][self.current_idx] = next_obs["achieved_goal"] self.buffer["next_desired_goal"][self.pos][self.current_idx] = next_obs["desired_goal"] + self.buffer["infos"][self.pos][self.current_idx] = infos # update current pointer self.current_idx += 1 From 01162df0ae9a0ff7198f608bc6f851b1ebae414b Mon Sep 17 00:00:00 2001 From: Antonin Raffin Date: Thu, 27 
Aug 2020 15:06:18 +0200 Subject: [PATCH 38/81] Fix info buffer --- stable_baselines3/her/her_replay_buffer.py | 21 ++++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) diff --git a/stable_baselines3/her/her_replay_buffer.py b/stable_baselines3/her/her_replay_buffer.py index d1a2f475fb..8e06cd647f 100644 --- a/stable_baselines3/her/her_replay_buffer.py +++ b/stable_baselines3/her/her_replay_buffer.py @@ -1,3 +1,4 @@ +from collections import deque from typing import Dict, Optional, Union import numpy as np @@ -64,12 +65,12 @@ def __init__( "next_achieved_goal": (self.env.num_envs, self.env.goal_dim), "next_desired_goal": (self.env.num_envs, self.env.goal_dim), "done": (1,), - "infos": (1,), } self.buffer = { key: np.empty((self.max_episode_stored, self.max_episode_length, *dim), dtype=np.float32) for key, dim in input_shape.items() } + self.info_buffer = [deque(maxlen=self.max_episode_length) for _ in range(self.max_episode_stored)] # episode length storage, needed for episodes which has less steps than the maximum length self.episode_lengths = np.zeros(self.max_episode_stored, dtype=np.int64) @@ -193,12 +194,21 @@ def _sample_transitions(self, batch_size: int, env: Optional[VecNormalize]) -> R new_goals = self.vectorized_sample_goal(episode_indices, her_indices, transitions_indices) transitions["desired_goal"][her_indices] = new_goals + # Convert to numpy array + # TODO: disable if not needed for faster computation + transitions["info"] = np.array( + [ + self.info_buffer[episode_idx][transition_idx] + for episode_idx, transition_idx in zip(episode_indices, transitions_indices) + ] + ) + # Vectorized computation transitions["reward"][her_indices] = self.env.env_method( "compute_reward", transitions["next_achieved_goal"][her_indices], transitions["desired_goal"][her_indices], - transitions["infos"][her_indices], + transitions["info"][her_indices], ) # concatenate observation with (desired) goal @@ -225,6 +235,10 @@ def add( infos: Dict[str, np.ndarray], ) -> None: + if self.current_idx == 0 and self.full: + # Clear info buffer + self.info_buffer[self.pos] = deque(maxlen=self.max_episode_length) + self.buffer["observation"][self.pos][self.current_idx] = obs["observation"] self.buffer["achieved_goal"][self.pos][self.current_idx] = obs["achieved_goal"] self.buffer["desired_goal"][self.pos][self.current_idx] = obs["desired_goal"] @@ -234,7 +248,8 @@ def add( self.buffer["next_obs"][self.pos][self.current_idx] = next_obs["observation"] self.buffer["next_achieved_goal"][self.pos][self.current_idx] = next_obs["achieved_goal"] self.buffer["next_desired_goal"][self.pos][self.current_idx] = next_obs["desired_goal"] - self.buffer["infos"][self.pos][self.current_idx] = infos + + self.info_buffer[self.pos].append(infos) # update current pointer self.current_idx += 1 From 656a1a61599f551124a5512a23bf289bbfc6e3bf Mon Sep 17 00:00:00 2001 From: Megan Klaiber Date: Fri, 28 Aug 2020 17:28:25 +0200 Subject: [PATCH 39/81] Updated done flag. 
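The change below ("Updated done flag") stops marking a transition as terminal when the episode only ended because max_episode_length was reached. The point is bootstrapping: a timeout is not a real terminal state, so the value of the next state should still be taken into account. A minimal, generic sketch of the effect on a one-step TD target (not SB3's actual update code):

    # done=True drops the bootstrap term, done=False keeps it.
    def td_target(reward, next_q, done, gamma=0.99):
        return reward + (1.0 - float(done)) * gamma * next_q

    print(td_target(-1.0, 5.0, done=True))   # -1.0  (treated as a true terminal state)
    print(td_target(-1.0, 5.0, done=False))  #  3.95 (timeout: keep bootstrapping on next_q)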
--- stable_baselines3/her/her.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/stable_baselines3/her/her.py b/stable_baselines3/her/her.py index 87505282d4..39d71e5d40 100644 --- a/stable_baselines3/her/her.py +++ b/stable_baselines3/her/her.py @@ -110,10 +110,10 @@ def __init__( self.online_sampling = online_sampling self.her_ratio = 1 - (1.0 / (self.n_sampled_goal + 1)) self.max_episode_length = max_episode_length + self.max_episode_length = get_time_limit(self.env, self.max_episode_length) # counter for steps in episode self.episode_steps = 0 if self.online_sampling: - self.max_episode_length = get_time_limit(self.env, self.max_episode_length) self.model.replay_buffer = HerReplayBuffer( self.env, self.buffer_size, @@ -253,6 +253,8 @@ def collect_rollouts( # Perform action new_obs, reward, done, infos = env.step(action) + done = done if episode_timesteps < self.max_episode_length else False + # Only stop training if return value is False, not when it is None. if callback.on_step() is False: return RolloutReturn(0.0, total_steps, total_episodes, continue_training=False) @@ -380,7 +382,7 @@ def _store_transitions(self) -> None: new_obs = np.concatenate([new_observation["observation"], goal], axis=1) # store data in replay buffer - self.replay_buffer.add(obs, new_obs, action, new_reward, done) + self.replay_buffer.add(obs, new_obs, action, new_reward, np.array([False])) def __getattr__(self, item): """ From 59bbe804cd2b0d9c3ba2184dadc5322929b939e2 Mon Sep 17 00:00:00 2001 From: Antonin Raffin Date: Fri, 28 Aug 2020 19:26:44 +0200 Subject: [PATCH 40/81] Fixes for gSDE --- stable_baselines3/common/base_class.py | 2 +- stable_baselines3/her/her.py | 5 +++++ tests/test_her.py | 9 ++++++++- 3 files changed, 14 insertions(+), 2 deletions(-) diff --git a/stable_baselines3/common/base_class.py b/stable_baselines3/common/base_class.py index 2d16655fc6..13728d7257 100644 --- a/stable_baselines3/common/base_class.py +++ b/stable_baselines3/common/base_class.py @@ -159,7 +159,7 @@ def __init__( "Error: the model does not support multiple envs; it requires " "a single vectorized environment." 
) - if self.use_sde and not isinstance(self.observation_space, gym.spaces.Box): + if self.use_sde and not isinstance(self.action_space, gym.spaces.Box): raise ValueError("generalized State-Dependent Exploration (gSDE) can only be used with continuous actions.") def _wrap_env(self, env: GymEnv) -> VecEnv: diff --git a/stable_baselines3/her/her.py b/stable_baselines3/her/her.py index 39d71e5d40..8de34194c8 100644 --- a/stable_baselines3/her/her.py +++ b/stable_baselines3/her/her.py @@ -455,6 +455,10 @@ def load(cls, load_path: str, env: Optional[GymEnv] = None, **kwargs) -> "BaseAl if env is None and "env" in data: env = data["env"] + kwargs = {} + if "use_sde" in data and data["use_sde"]: + kwargs["use_sde"] = True + # noinspection PyArgumentList her_model = cls( policy=data["policy_class"], @@ -467,6 +471,7 @@ def load(cls, load_path: str, env: Optional[GymEnv] = None, **kwargs) -> "BaseAl max_episode_length=data["max_episode_length"], policy_kwargs=data["policy_kwargs"], _init_setup_model=True, # pytype: disable=not-instantiable,wrong-keyword-args + **kwargs, ) # load parameters diff --git a/tests/test_her.py b/tests/test_her.py index 34c197b224..615f306e42 100644 --- a/tests/test_her.py +++ b/tests/test_her.py @@ -126,10 +126,14 @@ def test_goal_selection_strategy(goal_selection_strategy, online_sampling): @pytest.mark.parametrize("model_class, policy", [(SAC, "MlpPolicy"), (TD3, "MlpPolicy"), (DDPG, "MlpPolicy")]) -def test_save_load(tmp_path, model_class, policy): +@pytest.mark.parametrize("use_sde", [False, True]) +def test_save_load(tmp_path, model_class, policy, use_sde): """ Test if 'save' and 'load' saves and loads model correctly """ + if use_sde and model_class != SAC: + pytest.skip("Only SAC has gSDE support") + n_bits = 4 env = BitFlippingEnv(n_bits=n_bits, continuous=True) env = DummyVecEnv([lambda: env]) @@ -143,6 +147,8 @@ def test_save_load(tmp_path, model_class, policy): 0.2 * np.ones((n_actions,)), ) + kwargs = dict(use_sde=True) if use_sde else {} + # create model model = HER( policy, @@ -163,6 +169,7 @@ def test_save_load(tmp_path, model_class, policy): train_freq=1, n_episodes_rollout=-1, max_episode_length=n_bits, + **kwargs ) model.learn(total_timesteps=500, callback=None) From 90dafc4a15db2141147866782aa4cc3fac33c259 Mon Sep 17 00:00:00 2001 From: Megan Klaiber Date: Wed, 16 Sep 2020 13:21:24 +0200 Subject: [PATCH 41/81] Offline her version uses now HerReplayBuffer as episode storage. 
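The patch below reuses HerReplayBuffer as a temporary per-episode storage for the offline setting: the episode is collected first, and only at episode end are the real transitions plus the relabelled ones written into the wrapped model's replay buffer. A compact sketch of that offline relabelling idea with the "future" strategy, using a simplified list-of-dicts episode rather than the array-per-key layout the buffer actually uses:

    import numpy as np

    def relabel_episode(episode, n_sampled_goal, compute_reward, rng=np.random):
        """Return the original transitions plus n_sampled_goal relabelled copies each.

        `episode` is assumed to be a list of dicts with keys: obs, action, reward, done,
        next_obs, achieved_goal, next_achieved_goal, desired_goal.
        """
        out = []
        for t, transition in enumerate(episode):
            out.append(transition)  # real transition, original desired goal
            if t == len(episode) - 1:
                continue  # last step: no future achieved goal to sample from
            for _ in range(n_sampled_goal):
                future = rng.randint(t + 1, len(episode))
                new_goal = episode[future]["achieved_goal"]
                virtual = dict(transition)
                virtual["desired_goal"] = new_goal
                # the reward must be recomputed for the substituted goal
                virtual["reward"] = compute_reward(transition["next_achieved_goal"], new_goal, {})
                virtual["done"] = False
                out.append(virtual)
        return out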
--- stable_baselines3/her/her.py | 75 +++++++++++++--------- stable_baselines3/her/her_replay_buffer.py | 21 +++--- tests/test_her.py | 1 - 3 files changed, 57 insertions(+), 40 deletions(-) diff --git a/stable_baselines3/her/her.py b/stable_baselines3/her/her.py index 8de34194c8..d5b21192d1 100644 --- a/stable_baselines3/her/her.py +++ b/stable_baselines3/her/her.py @@ -102,15 +102,25 @@ def __init__( self.goal_selection_strategy, GoalSelectionStrategy ), f"Invalid goal selection strategy, please use one of {list(GoalSelectionStrategy)}" + # maximum steps in episode + self.max_episode_length = get_time_limit(self.env, max_episode_length) # storage for transitions of current episode - self._episode_storage = [] + self._episode_storage = HerReplayBuffer( + self.env, + self.max_episode_length, + self.max_episode_length, + self.goal_selection_strategy, + self.env.observation_space, + self.env.action_space, + self.device, + self.n_envs, + 0.0, # pytype: disable=wrong-arg-types + ) self.n_sampled_goal = n_sampled_goal # if we sample her transitions online use custom replay buffer self.online_sampling = online_sampling self.her_ratio = 1 - (1.0 / (self.n_sampled_goal + 1)) - self.max_episode_length = max_episode_length - self.max_episode_length = get_time_limit(self.env, self.max_episode_length) # counter for steps in episode self.episode_steps = 0 if self.online_sampling: @@ -281,7 +291,7 @@ def collect_rollouts( self.replay_buffer.add(self._last_original_obs, new_obs_, buffer_action, reward_, done, infos) else: # add current transition to episode storage - self._episode_storage.append((self._last_original_obs, new_obs_, buffer_action, reward_, done, infos)) + self._episode_storage.add(self._last_original_obs, new_obs_, buffer_action, reward_, done, infos) self._last_obs = new_obs self.model._last_obs = self._last_obs @@ -311,10 +321,11 @@ def collect_rollouts( if self.online_sampling: self.replay_buffer.store_episode() else: + self._episode_storage.store_episode() # store episode in replay buffer self._store_transitions() # clear storage for current episode - self._episode_storage = [] + self._episode_storage.reset() total_episodes += 1 self._episode_num += 1 @@ -345,44 +356,50 @@ def _store_transitions(self) -> None: """ # iterate over current episodes transitions - for idx, trans in enumerate(self._episode_storage): - - observation, new_observation, action, reward, done, infos = trans + for idx in range(self._episode_storage.size()): + # get data of episode index + observation = self._episode_storage.buffer["observation"][0][idx] + desired_goal = self._episode_storage.buffer["desired_goal"][0][idx] + next_observation = self._episode_storage.buffer["next_obs"][0][idx] + next_achieved_goal = self._episode_storage.buffer["next_achieved_goal"][0][idx] + next_desired_goal = self._episode_storage.buffer["next_desired_goal"][0][idx] + action = self._episode_storage.buffer["action"][0][idx] + reward = self._episode_storage.buffer["reward"][0][idx] + done = self._episode_storage.buffer["done"][0][idx] + infos = self._episode_storage.info_buffer[0][idx] # concatenate observation with (desired) goal - obs = ObsDictWrapper.convert_dict(observation) - new_obs = ObsDictWrapper.convert_dict(new_observation) - + obs = np.concatenate([observation, desired_goal], axis=-1) + next_obs = np.concatenate([next_observation, next_desired_goal], axis=-1) # store data in replay buffer - self.replay_buffer.add(obs, new_obs, action, reward, done) + self.replay_buffer.add(obs, next_obs, action, reward, done) # We cannot 
sample a goal from the future in the last step of an episode - if idx == len(self._episode_storage) - 1 and self.goal_selection_strategy == GoalSelectionStrategy.FUTURE: + if idx == self._episode_storage.size() - 1 and self.goal_selection_strategy == GoalSelectionStrategy.FUTURE: break - # sample set of additional goals - obs_dim = observation["observation"].shape[1] - sampled_goals = [ - sample - for sample in ( - HerReplayBuffer.sample_goal( - self.goal_selection_strategy, idx, self._episode_storage, self.replay_buffer.observations, obs_dim - ) - for _ in range(self.n_sampled_goal) + # dimsension of observation + obs_dim = observation.shape[1] + + for _ in range(self.n_sampled_goal): + # sample goal + goal = self._episode_storage.sample_goal( + self.goal_selection_strategy, + idx, + self._episode_storage.buffer["achieved_goal"][0], + self.replay_buffer.observations, + obs_dim, ) - ] - # iterate over sampled new transitions in replay buffer - for goal in sampled_goals: # compute new reward with new goal - new_reward = self.env.env_method("compute_reward", new_observation["achieved_goal"], goal, infos) + new_reward = self.env.env_method("compute_reward", next_achieved_goal, goal, infos) # concatenate observation with (desired) goal - obs = np.concatenate([observation["observation"], goal], axis=1) - new_obs = np.concatenate([new_observation["observation"], goal], axis=1) + obs = np.concatenate([observation, goal], axis=1) + next_obs = np.concatenate([next_observation, goal], axis=1) # store data in replay buffer - self.replay_buffer.add(obs, new_obs, action, new_reward, np.array([False])) + self.replay_buffer.add(obs, next_obs, action, new_reward, np.array([False])) def __getattr__(self, item): """ diff --git a/stable_baselines3/her/her_replay_buffer.py b/stable_baselines3/her/her_replay_buffer.py index 8e06cd647f..3e33516021 100644 --- a/stable_baselines3/her/her_replay_buffer.py +++ b/stable_baselines3/her/her_replay_buffer.py @@ -1,5 +1,5 @@ from collections import deque -from typing import Dict, Optional, Union +from typing import Dict, List, Optional, Union import numpy as np import torch as th @@ -87,11 +87,11 @@ def sample(self, batch_size: int, env: Optional[VecNormalize] = None) -> ReplayB """ return self._sample_transitions(batch_size, env) - @staticmethod def sample_goal( + self, goal_selection_strategy: GoalSelectionStrategy, sample_idx: int, - episode: list, + achieved_goals: list, observations: Union[list, np.ndarray], obs_dim: int = None, ) -> np.ndarray: @@ -101,22 +101,22 @@ def sample_goal( :param goal_selection_strategy: (GoalSelectionStrategy ) Strategy for sampling goals for replay. One of ['episode', 'final', 'future', 'random'] :param sample_idx: (int) Index of current transition. - :param episode: (list) Current episode. + :param achieved_goals: (list) Achieved goals of Current episode. :param observations: (list or np.ndarray) :param obs_dim: (int) Dimension of real observation without goal. It is needed for the random strategy. :return: (np.ndarray) Return sampled goal. 
""" if goal_selection_strategy == GoalSelectionStrategy.FINAL: # replay with final state of current episode - return episode[-1][0]["achieved_goal"] + return achieved_goals[-1] elif goal_selection_strategy == GoalSelectionStrategy.FUTURE: # replay with random state which comes from the same episode and was observed after current transition - index = np.random.choice(np.arange(sample_idx + 1, len(episode))) - return episode[index][0]["achieved_goal"] + index = np.random.choice(np.arange(sample_idx + 1, len(achieved_goals))) + return achieved_goals[index] elif goal_selection_strategy == GoalSelectionStrategy.EPISODE: # replay with random state which comes from the same episode as current transition - index = np.random.choice(np.arange(len(episode))) - return episode[index][0]["achieved_goal"] + index = np.random.choice(np.arange(len(achieved_goals))) + return achieved_goals[index] elif goal_selection_strategy == GoalSelectionStrategy.RANDOM: # replay with random state from the entire replay buffer index = np.random.choice(np.arange(len(observations))) @@ -232,7 +232,8 @@ def add( action: np.ndarray, reward: np.ndarray, done: np.ndarray, - infos: Dict[str, np.ndarray], + # infos: Dict[str, np.ndarray], + infos: List[dict], ) -> None: if self.current_idx == 0 and self.full: diff --git a/tests/test_her.py b/tests/test_her.py index 615f306e42..5fa4980d37 100644 --- a/tests/test_her.py +++ b/tests/test_her.py @@ -234,7 +234,6 @@ def test_dqn_her(online_sampling, n_bits): """ env = BitFlippingEnv(n_bits=n_bits, continuous=False) - # offline model = HER( "MlpPolicy", env, From 655e4c33b6c1db8591b010ebbbd8bd68f4e2ae91 Mon Sep 17 00:00:00 2001 From: Antonin Raffin Date: Thu, 17 Sep 2020 16:28:14 +0200 Subject: [PATCH 42/81] Fix num_timesteps computation --- stable_baselines3/her/her.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/stable_baselines3/her/her.py b/stable_baselines3/her/her.py index d5b21192d1..c9b4701d21 100644 --- a/stable_baselines3/her/her.py +++ b/stable_baselines3/her/her.py @@ -265,6 +265,11 @@ def collect_rollouts( done = done if episode_timesteps < self.max_episode_length else False + self.num_timesteps += 1 + self.model.num_timesteps = self.num_timesteps + episode_timesteps += 1 + total_steps += 1 + # Only stop training if return value is False, not when it is None. 
if callback.on_step() is False: return RolloutReturn(0.0, total_steps, total_episodes, continue_training=False) @@ -300,10 +305,6 @@ def collect_rollouts( self._last_original_obs = new_obs_ self.model._last_original_obs = self._last_original_obs - self.num_timesteps += 1 - self.model.num_timesteps = self.num_timesteps - episode_timesteps += 1 - total_steps += 1 self.model._update_current_progress_remaining(self.num_timesteps, self._total_timesteps) # For DQN, check if the target network should be updated From 046088b3958b02fd6759196b1ad76d34264ef146 Mon Sep 17 00:00:00 2001 From: Antonin Raffin Date: Thu, 24 Sep 2020 18:24:33 +0200 Subject: [PATCH 43/81] Fix get torch params --- stable_baselines3/her/her.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/stable_baselines3/her/her.py b/stable_baselines3/her/her.py index c9b4701d21..aea1ccca85 100644 --- a/stable_baselines3/her/her.py +++ b/stable_baselines3/her/her.py @@ -411,8 +411,8 @@ def __getattr__(self, item): else: raise AttributeError - def get_torch_variables(self) -> Tuple[List[str], List[str]]: - return self.model.get_torch_variables() + def _get_torch_save_params(self) -> Tuple[List[str], List[str]]: + return self.model._get_torch_save_params() def save( self, From a68cc323ca6031d647a47c30e23ec52800b50edd Mon Sep 17 00:00:00 2001 From: Megan Klaiber Date: Tue, 6 Oct 2020 00:38:08 +0200 Subject: [PATCH 44/81] Vectorized version for offline sampling. --- stable_baselines3/her/her.py | 104 ++++++++++++++++++++--------------- 1 file changed, 60 insertions(+), 44 deletions(-) diff --git a/stable_baselines3/her/her.py b/stable_baselines3/her/her.py index aea1ccca85..426a3a07bc 100644 --- a/stable_baselines3/her/her.py +++ b/stable_baselines3/her/her.py @@ -295,6 +295,11 @@ def collect_rollouts( if self.online_sampling: self.replay_buffer.add(self._last_original_obs, new_obs_, buffer_action, reward_, done, infos) else: + # concatenate observation with (desired) goal + obs = ObsDictWrapper.convert_dict(self._last_original_obs) + next_obs = ObsDictWrapper.convert_dict(new_obs_) + # add to replay bufffer + self.replay_buffer.add(obs, next_obs, buffer_action, reward_, done) # add current transition to episode storage self._episode_storage.add(self._last_original_obs, new_obs_, buffer_action, reward_, done, infos) @@ -318,6 +323,7 @@ def collect_rollouts( if 0 < n_steps <= total_steps: break + # TODO check again if done or self.episode_steps == self.max_episode_length: if self.online_sampling: self.replay_buffer.store_episode() @@ -355,52 +361,62 @@ def _store_transitions(self) -> None: """ Store current episode in replay buffer. Sample additional goals and store new transitions in replay buffer. 
""" + # use vectorized sample goal function fom her_replay_buffer + episode_length = self._episode_storage.episode_lengths[0] + episode_indices = np.array(list(range(self._episode_storage.n_episodes_stored)) * episode_length * self.n_sampled_goal) + her_indices = np.arange(len(episode_indices)) + # repeat every transition index n_sampled_goals times + transitions_indices = np.array(list(range(episode_length)) * self.n_sampled_goal) + + if self._episode_storage.goal_selection_strategy == GoalSelectionStrategy.FUTURE: + # restrict the sampling domain when ep_length > 1 + # otherwise filter out the indices + # only consider transitions which are not the last one in the episode + her_indices = her_indices[episode_length > 1 and transitions_indices < episode_length - 1] + + # transitions + transitions = { + key: self._episode_storage.buffer[key][episode_indices, transitions_indices].copy() + for key in self._episode_storage.buffer.keys() + } + + # get sampled goals + new_goals = self._episode_storage.vectorized_sample_goal(episode_indices, her_indices, transitions_indices) + # assign new goals as desired goals + transitions["desired_goal"][her_indices] = new_goals + + # Convert to numpy array + # TODO: disable if not needed for faster computation + transitions["info"] = np.array( + [ + self._episode_storage.info_buffer[episode_idx][transition_idx] + for episode_idx, transition_idx in zip(episode_indices, transitions_indices) + ] + ) - # iterate over current episodes transitions - for idx in range(self._episode_storage.size()): - # get data of episode index - observation = self._episode_storage.buffer["observation"][0][idx] - desired_goal = self._episode_storage.buffer["desired_goal"][0][idx] - next_observation = self._episode_storage.buffer["next_obs"][0][idx] - next_achieved_goal = self._episode_storage.buffer["next_achieved_goal"][0][idx] - next_desired_goal = self._episode_storage.buffer["next_desired_goal"][0][idx] - action = self._episode_storage.buffer["action"][0][idx] - reward = self._episode_storage.buffer["reward"][0][idx] - done = self._episode_storage.buffer["done"][0][idx] - infos = self._episode_storage.info_buffer[0][idx] - - # concatenate observation with (desired) goal - obs = np.concatenate([observation, desired_goal], axis=-1) - next_obs = np.concatenate([next_observation, next_desired_goal], axis=-1) - # store data in replay buffer - self.replay_buffer.add(obs, next_obs, action, reward, done) - - # We cannot sample a goal from the future in the last step of an episode - if idx == self._episode_storage.size() - 1 and self.goal_selection_strategy == GoalSelectionStrategy.FUTURE: - break - - # dimsension of observation - obs_dim = observation.shape[1] - - for _ in range(self.n_sampled_goal): - # sample goal - goal = self._episode_storage.sample_goal( - self.goal_selection_strategy, - idx, - self._episode_storage.buffer["achieved_goal"][0], - self.replay_buffer.observations, - obs_dim, - ) - - # compute new reward with new goal - new_reward = self.env.env_method("compute_reward", next_achieved_goal, goal, infos) - - # concatenate observation with (desired) goal - obs = np.concatenate([observation, goal], axis=1) - next_obs = np.concatenate([next_observation, goal], axis=1) + # Vectorized computation + transitions["reward"][her_indices] = self.env.env_method( + "compute_reward", + transitions["next_achieved_goal"][her_indices], + transitions["desired_goal"][her_indices], + transitions["info"][her_indices], + ) - # store data in replay buffer - self.replay_buffer.add(obs, 
next_obs, action, new_reward, np.array([False])) + # concatenate observation with (desired) goal + observations = ObsDictWrapper.convert_dict(transitions) + next_observations = ObsDictWrapper.convert_dict(transitions, observation_key="next_obs") + + # TODO check random strategy -> with online_sampling flag? + # TODO done = False? or recompute -> compare desired and achieved goal + + # store data in replay buffer + for i in her_indices: + obs = observations[i] + next_obs = next_observations[i] + buffer_action = transitions["action"][i] + reward = transitions["reward"][i] + done = np.array([False]) + self.replay_buffer.add(obs, next_obs, buffer_action, reward, done) def __getattr__(self, item): """ From 8a25457c1cd184119140e27bc8a4a458f94ce092 Mon Sep 17 00:00:00 2001 From: Megan Klaiber Date: Tue, 6 Oct 2020 02:51:28 +0200 Subject: [PATCH 45/81] Modified offline her sampling to use sample method of her_replay_buffer --- stable_baselines3/her/her.py | 63 ++------ stable_baselines3/her/her_replay_buffer.py | 165 +++++++++++---------- 2 files changed, 99 insertions(+), 129 deletions(-) diff --git a/stable_baselines3/her/her.py b/stable_baselines3/her/her.py index 426a3a07bc..aa74bdfc30 100644 --- a/stable_baselines3/her/her.py +++ b/stable_baselines3/her/her.py @@ -102,6 +102,10 @@ def __init__( self.goal_selection_strategy, GoalSelectionStrategy ), f"Invalid goal selection strategy, please use one of {list(GoalSelectionStrategy)}" + self.n_sampled_goal = n_sampled_goal + # if we sample her transitions online use custom replay buffer + self.online_sampling = online_sampling + self.her_ratio = 1 - (1.0 / (self.n_sampled_goal + 1)) # maximum steps in episode self.max_episode_length = get_time_limit(self.env, max_episode_length) # storage for transitions of current episode @@ -114,13 +118,9 @@ def __init__( self.env.action_space, self.device, self.n_envs, - 0.0, # pytype: disable=wrong-arg-types + self.her_ratio, # pytype: disable=wrong-arg-types ) - self.n_sampled_goal = n_sampled_goal - # if we sample her transitions online use custom replay buffer - self.online_sampling = online_sampling - self.her_ratio = 1 - (1.0 / (self.n_sampled_goal + 1)) # counter for steps in episode self.episode_steps = 0 if self.online_sampling: @@ -361,54 +361,17 @@ def _store_transitions(self) -> None: """ Store current episode in replay buffer. Sample additional goals and store new transitions in replay buffer. 
""" - # use vectorized sample goal function fom her_replay_buffer - episode_length = self._episode_storage.episode_lengths[0] - episode_indices = np.array(list(range(self._episode_storage.n_episodes_stored)) * episode_length * self.n_sampled_goal) - her_indices = np.arange(len(episode_indices)) - # repeat every transition index n_sampled_goals times - transitions_indices = np.array(list(range(episode_length)) * self.n_sampled_goal) - - if self._episode_storage.goal_selection_strategy == GoalSelectionStrategy.FUTURE: - # restrict the sampling domain when ep_length > 1 - # otherwise filter out the indices - # only consider transitions which are not the last one in the episode - her_indices = her_indices[episode_length > 1 and transitions_indices < episode_length - 1] - - # transitions - transitions = { - key: self._episode_storage.buffer[key][episode_indices, transitions_indices].copy() - for key in self._episode_storage.buffer.keys() - } - - # get sampled goals - new_goals = self._episode_storage.vectorized_sample_goal(episode_indices, her_indices, transitions_indices) - # assign new goals as desired goals - transitions["desired_goal"][her_indices] = new_goals - - # Convert to numpy array - # TODO: disable if not needed for faster computation - transitions["info"] = np.array( - [ - self._episode_storage.info_buffer[episode_idx][transition_idx] - for episode_idx, transition_idx in zip(episode_indices, transitions_indices) - ] - ) - # Vectorized computation - transitions["reward"][her_indices] = self.env.env_method( - "compute_reward", - transitions["next_achieved_goal"][her_indices], - transitions["desired_goal"][her_indices], - transitions["info"][her_indices], + # sample goals and get new observations + observations, next_observations, transitions, her_indices = self._episode_storage.sample( + self.batch_size, + self.env, + self.online_sampling, + self.n_sampled_goal, + self.replay_buffer.observations, ) - # concatenate observation with (desired) goal - observations = ObsDictWrapper.convert_dict(transitions) - next_observations = ObsDictWrapper.convert_dict(transitions, observation_key="next_obs") - - # TODO check random strategy -> with online_sampling flag? - # TODO done = False? or recompute -> compare desired and achieved goal - + # TODO done = False? 
# store data in replay buffer for i in her_indices: obs = observations[i] diff --git a/stable_baselines3/her/her_replay_buffer.py b/stable_baselines3/her/her_replay_buffer.py index 3e33516021..86d26f8f7b 100644 --- a/stable_baselines3/her/her_replay_buffer.py +++ b/stable_baselines3/her/her_replay_buffer.py @@ -1,5 +1,5 @@ from collections import deque -from typing import Dict, List, Optional, Union +from typing import Dict, List, Optional, Tuple, Union import numpy as np import torch as th @@ -78,77 +78,49 @@ def __init__( # percentage of her indices self.her_ratio = her_ratio - def sample(self, batch_size: int, env: Optional[VecNormalize] = None) -> ReplayBufferSamples: + def sample( + self, + batch_size: int, + env: Optional[VecNormalize] = None, + online_sampling: bool = True, + n_sampled_goal: int = None, + replay_observations: np.ndarray = None, + ) -> Union[ReplayBufferSamples, Tuple]: """ :param batch_size: (int) Number of element to sample :param env: (Optional[VecNormalize]) associated gym VecEnv to normalize the observations/rewards when sampling - :return: (ReplayBufferSamples) - """ - return self._sample_transitions(batch_size, env) - - def sample_goal( - self, - goal_selection_strategy: GoalSelectionStrategy, - sample_idx: int, - achieved_goals: list, - observations: Union[list, np.ndarray], - obs_dim: int = None, - ) -> np.ndarray: + :param online_sampling: (bool) Using online_sampling for HER or not. + :param n_sampled_goal: (int) Number of sampled goals for replay. (offline sampling) + :param replay_observations: (np.ndarray) Observations of the offline replay buffer. Needed for 'RANDOM' goal strategy. + :return: (ReplayBufferSamples or Tuple) """ - Sample a goal based on goal_selection_strategy. - - :param goal_selection_strategy: (GoalSelectionStrategy ) Strategy for sampling goals for replay. - One of ['episode', 'final', 'future', 'random'] - :param sample_idx: (int) Index of current transition. - :param achieved_goals: (list) Achieved goals of Current episode. - :param observations: (list or np.ndarray) - :param obs_dim: (int) Dimension of real observation without goal. It is needed for the random strategy. - :return: (np.ndarray) Return sampled goal. 
- """ - if goal_selection_strategy == GoalSelectionStrategy.FINAL: - # replay with final state of current episode - return achieved_goals[-1] - elif goal_selection_strategy == GoalSelectionStrategy.FUTURE: - # replay with random state which comes from the same episode and was observed after current transition - index = np.random.choice(np.arange(sample_idx + 1, len(achieved_goals))) - return achieved_goals[index] - elif goal_selection_strategy == GoalSelectionStrategy.EPISODE: - # replay with random state which comes from the same episode as current transition - index = np.random.choice(np.arange(len(achieved_goals))) - return achieved_goals[index] - elif goal_selection_strategy == GoalSelectionStrategy.RANDOM: - # replay with random state from the entire replay buffer - index = np.random.choice(np.arange(len(observations))) - obs = observations[index] - # get only the observation part - # TODO: check that line (or the comment at least) - obs_array = obs[:, :obs_dim] - return obs_array - else: - raise ValueError("Strategy for sampling goals not supported!") + return self._sample_transitions(batch_size, env, online_sampling, n_sampled_goal, replay_observations) def vectorized_sample_goal( - self, episode_indices: np.ndarray, her_indices: np.ndarray, transitions_indices: np.ndarray + self, + episode_indices: np.ndarray, + her_indices: np.ndarray, + transitions_indices: np.ndarray, + online_sampling: bool = True, + replay_observations: np.ndarray = None, ) -> np.ndarray: """ Sample goals based on goal_selection_strategy. - This is the vectorized (faster) version of ``sample_goal()`` - - :param goal_selection_strategy: (GoalSelectionStrategy ) Strategy for sampling goals for replay. - One of ['episode', 'final', 'future', 'random'] - :param sample_idx: (int) Index of current transition. - :param episode: (list) Current episode. - :param observations: (list or np.ndarray) - :param obs_dim: (int) Dimension of real observation without goal. It is needed for the random strategy. - :param online_sampling: (bool) Sample HER transitions online. + This is a vectorized (fast) version. + + :param episode_indices: (np.ndarray) Episode indices to use. + :param her_indices: (np.ndarray) HER indices. + :param transitions_indices: (np.ndarray) Transition indices to use. + :param online_sampling: (bool) Using online_sampling for HER or not. + :param replay_observations: (np.ndarray) Observations of the offline replay buffer. Needed for 'RANDOM' goal strategy. :return: (np.ndarray) Return sampled goals. 
""" her_episode_indices = episode_indices[her_indices] if self.goal_selection_strategy == GoalSelectionStrategy.FINAL: # replay with final state of current episode - transitions_indices = self.episode_lengths[her_indices] - 1 + transitions_indices = self.episode_lengths[her_episode_indices] - 1 elif self.goal_selection_strategy == GoalSelectionStrategy.FUTURE: # replay with random state which comes from the same episode and was observed after current transition @@ -161,37 +133,70 @@ def vectorized_sample_goal( transitions_indices = np.random.randint(self.episode_lengths[her_episode_indices]) elif self.goal_selection_strategy == GoalSelectionStrategy.RANDOM: - # replay with random state from the entire replay buffer - her_episode_indices = np.random.randint(self.n_episodes_stored, size=len(her_indices)) - transitions_indices = np.random.randint(self.episode_lengths[her_episode_indices]) + if online_sampling: + # replay with random state from the entire replay buffer + her_episode_indices = np.random.randint(self.n_episodes_stored, size=len(her_indices)) + transitions_indices = np.random.randint(self.episode_lengths[her_episode_indices]) + else: + # replay with random state from the entire replay buffer + index = np.random.choice(np.arange(len(replay_observations)), len(her_indices)) + obs = replay_observations[index] + # get only the observation part of the state + obs_dim = self.env.obs_dim + obs_array = obs[:, :, :obs_dim] + return obs_array else: raise ValueError("Strategy for sampling goals not supported!") return self.buffer["achieved_goal"][her_episode_indices, transitions_indices] - def _sample_transitions(self, batch_size: int, env: Optional[VecNormalize]) -> ReplayBufferSamples: + def _sample_transitions( + self, + batch_size: int, + env: Optional[VecNormalize], + online_sampling: bool = True, + n_sampled_goal: int = None, + replay_observations: np.ndarray = None, + ) -> Union[ReplayBufferSamples, Tuple]: """ :param batch_size: (int) Number of element to sample :param env: (Optional[VecNormalize]) associated gym VecEnv to normalize the observations/rewards when sampling - :return: (ReplayBufferSamples) + :param online_sampling: (bool) Using online_sampling for HER or not. + :param n_sampled_goal: (int) Number of sampled goals for replay. (offline sampling) + :param replay_observations: (np.ndarray) Observations of the offline replay buffer. Needed for 'RANDOM' goal strategy. 
+ :return: (ReplayBufferSamples or Tuple) """ # Select which episodes to use - episode_indices = np.random.randint(0, self.n_episodes_stored, batch_size) - her_indices = np.arange(batch_size)[: int(self.her_ratio * batch_size)] - # her_indices = np.random.permutation(batch_size)[: int(self.her_ratio * batch_size)] - ep_length = self.episode_lengths[episode_indices] + if online_sampling: + episode_indices = np.random.randint(0, self.n_episodes_stored, batch_size) + her_indices = np.arange(batch_size)[: int(self.her_ratio * batch_size)] + ep_length = self.episode_lengths[episode_indices] + else: + episode_length = self.episode_lengths[0] + episode_indices = np.array(list(range(self.n_episodes_stored)) * episode_length * n_sampled_goal) + her_indices = np.arange(len(episode_indices)) + # repeat every transition index n_sampled_goals times + transitions_indices = np.array(list(range(episode_length)) * n_sampled_goal) if self.goal_selection_strategy == GoalSelectionStrategy.FUTURE: # restrict the sampling domain when ep_length > 1 # otherwise filter out the indices - her_indices = her_indices[ep_length[her_indices] > 1] - ep_length[her_indices] -= 1 - - transitions_indices = np.random.randint(ep_length) + if online_sampling: + her_indices = her_indices[ep_length[her_indices] > 1] + ep_length[her_indices] -= 1 + else: + her_indices = her_indices[episode_length > 1 and transitions_indices < episode_length - 1] + + if online_sampling: + # Select which transitions to use + transitions_indices = np.random.randint(ep_length) + # get selected transitions transitions = {key: self.buffer[key][episode_indices, transitions_indices].copy() for key in self.buffer.keys()} - new_goals = self.vectorized_sample_goal(episode_indices, her_indices, transitions_indices) + new_goals = self.vectorized_sample_goal( + episode_indices, her_indices, transitions_indices, online_sampling, replay_observations + ) transitions["desired_goal"][her_indices] = new_goals # Convert to numpy array @@ -215,15 +220,18 @@ def _sample_transitions(self, batch_size: int, env: Optional[VecNormalize]) -> R observations = ObsDictWrapper.convert_dict(transitions) next_observations = ObsDictWrapper.convert_dict(transitions, observation_key="next_obs") - data = ( - self._normalize_obs(observations, env), - transitions["action"], - self._normalize_obs(next_observations, env), - transitions["done"], - self._normalize_reward(transitions["reward"], env), - ) + if online_sampling: + data = ( + self._normalize_obs(observations, env), + transitions["action"], + self._normalize_obs(next_observations, env), + transitions["done"], + self._normalize_reward(transitions["reward"], env), + ) - return ReplayBufferSamples(*tuple(map(self.to_torch, data))) + return ReplayBufferSamples(*tuple(map(self.to_torch, data))) + else: + return observations, next_observations, transitions, her_indices def add( self, @@ -232,7 +240,6 @@ def add( action: np.ndarray, reward: np.ndarray, done: np.ndarray, - # infos: Dict[str, np.ndarray], infos: List[dict], ) -> None: From c125d0890770db880ab61e90ebf4fcdbd2300851 Mon Sep 17 00:00:00 2001 From: Megan Klaiber Date: Tue, 6 Oct 2020 03:55:30 +0200 Subject: [PATCH 46/81] Updated HER tests. 
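The batch construction in the buffer hunk above splits the sampled indices into a HER part (which gets relabelled with a new goal) and a regular part, and the FUTURE strategy additionally forbids relabelling the last transition of an episode. The following standalone sketch mirrors that index bookkeeping; n_episodes_stored, episode_lengths and her_ratio are stand-in names with made-up sizes, not the real HerReplayBuffer attributes.

import numpy as np

# Stand-in buffer state (illustrative only, not the real HerReplayBuffer attributes)
n_episodes_stored = 10
episode_lengths = np.random.randint(2, 8, size=n_episodes_stored)
her_ratio = 1 - 1.0 / (4 + 1)  # with n_sampled_goal=4, 80% of a batch is relabelled

batch_size = 32
episode_indices = np.random.randint(0, n_episodes_stored, batch_size)
# the first her_ratio * batch_size samples of the batch get a hindsight goal
her_indices = np.arange(batch_size)[: int(her_ratio * batch_size)]

ep_length = episode_lengths[episode_indices]
# FUTURE strategy: the new goal must be achieved *after* the transition,
# so the last transition of an episode cannot be relabelled
her_indices = her_indices[ep_length[her_indices] > 1]
ep_length[her_indices] -= 1

# one transition per sampled episode
transitions_indices = np.random.randint(ep_length)
# for the relabelled part, draw a goal index strictly after the transition
future_goal_indices = np.random.randint(
    transitions_indices[her_indices] + 1, episode_lengths[episode_indices[her_indices]]
)
print(her_indices.shape, transitions_indices.shape, future_goal_indices.shape)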
--- stable_baselines3/her/her.py | 1 - tests/test_her.py | 55 ++---------------------------------- 2 files changed, 3 insertions(+), 53 deletions(-) diff --git a/stable_baselines3/her/her.py b/stable_baselines3/her/her.py index aa74bdfc30..49539c8f38 100644 --- a/stable_baselines3/her/her.py +++ b/stable_baselines3/her/her.py @@ -323,7 +323,6 @@ def collect_rollouts( if 0 < n_steps <= total_steps: break - # TODO check again if done or self.episode_steps == self.max_episode_length: if self.online_sampling: self.replay_buffer.store_episode() diff --git a/tests/test_her.py b/tests/test_her.py index 5fa4980d37..b99fdea87b 100644 --- a/tests/test_her.py +++ b/tests/test_her.py @@ -24,30 +24,12 @@ def test_her(model_class, policy, online_sampling): env = BitFlippingEnv(n_bits=n_bits, continuous=True) env = DummyVecEnv([lambda: env]) - # Create action noise - n_actions = env.action_space.shape[0] - action_noise = OrnsteinUhlenbeckActionNoise( - np.zeros( - n_actions, - ), - 0.2 * np.ones((n_actions,)), - ) - model = HER( policy, env, model_class, - n_sampled_goal=5, goal_selection_strategy="future", online_sampling=online_sampling, - action_noise=action_noise, - verbose=0, - tau=0.05, - batch_size=128, - learning_rate=0.001, - policy_kwargs=dict(net_arch=[64]), - buffer_size=int(1e6), - gamma=0.98, gradient_steps=1, train_freq=1, n_episodes_rollout=-1, @@ -56,39 +38,6 @@ def test_her(model_class, policy, online_sampling): model.learn(total_timesteps=500, callback=None) - # Evaluate the agent - n_eval_episodes = 5 - n_episodes = 0 - episode_rewards = [] - episode_reward = 0.0 - - eval_env = BitFlippingEnv(n_bits=4, continuous=True) - - observation = eval_env.reset() - - while n_episodes < n_eval_episodes: - - obs = np.concatenate([observation["observation"], observation["desired_goal"]]) - - with th.no_grad(): - obs_ = th.FloatTensor(np.array(obs).reshape(1, -1)).to(model.model.device) - action = model.model.policy.predict(obs_)[0][0] - - observation, reward, done, _ = eval_env.step(action) - - # Render the env - # eval_env.render() - - episode_reward += reward - - if done: - n_episodes += 1 - observation = eval_env.reset() - episode_rewards.append(episode_reward) - episode_reward = 0.0 - - eval_env.close() - @pytest.mark.parametrize( "goal_selection_strategy", @@ -252,4 +201,6 @@ def test_dqn_her(online_sampling, n_bits): batch_size=32, ) - model.learn(total_timesteps=20000) + model.learn(total_timesteps=10000) + + assert np.mean(model.ep_success_buffer) > 0.0 From a70b47b6c5e8f3ecaa2975ab8cb3f00f618cad41 Mon Sep 17 00:00:00 2001 From: Megan Klaiber Date: Tue, 6 Oct 2020 09:08:06 +0200 Subject: [PATCH 47/81] Updated documentation --- docs/modules/her.rst | 72 ++++++++++++++++++++++---------------------- 1 file changed, 36 insertions(+), 36 deletions(-) diff --git a/docs/modules/her.rst b/docs/modules/her.rst index 32532ba0f0..11f01fc453 100644 --- a/docs/modules/her.rst +++ b/docs/modules/her.rst @@ -12,7 +12,7 @@ HER is an algorithm that works with Off policy methods (DQN, SAC, TD3 and DDPG f .. warning:: - HER requires the environment to inherits from `gym.GoalEnv `_ + HER requires the environment to inherits from `gym.GoalEnv `_ Notes @@ -28,54 +28,54 @@ Notes Can I use? ---------- -Please refer to the wrapped model (DQN, SAC, TD3 or DDPG) for that section. +Please refer to the used model (DQN, SAC, TD3 or DDPG) for that section. Example ------- .. 
code-block:: python - from stable_baselines3 import DDPG, DQN, SAC, TD3 - from stable_baselines3.her.her import HER - from stable_baselines3.her.goal_selection_strategy import GoalSelectionStrategy - from stable_baselines3.common.bit_flipping_env import BitFlippingEnv - from stable_baselines3.common.vec_env import DummyVecEnv - from stable_baselines3.common.vec_env.obs_dict_wrapper import ObsDictWrapper + from stable_baselines3 import DDPG, DQN, SAC, TD3 + from stable_baselines3.her.her import HER + from stable_baselines3.her.goal_selection_strategy import GoalSelectionStrategy + from stable_baselines3.common.bit_flipping_env import BitFlippingEnv + from stable_baselines3.common.vec_env import DummyVecEnv + from stable_baselines3.common.vec_env.obs_dict_wrapper import ObsDictWrapper - model_class = DQN # works also with SAC, DDPG and TD3 - N_BITS = 15 + model_class = DQN # works also with SAC, DDPG and TD3 + N_BITS = 15 - env = BitFlippingEnv(n_bits=N_BITS, continuous=model_class in [DDPG, SAC, TD3], max_steps=N_BITS) + env = BitFlippingEnv(n_bits=N_BITS, continuous=model_class in [DDPG, SAC, TD3], max_steps=N_BITS) - # Available strategies (cf paper): future, final, episode, random - goal_selection_strategy = 'future' # equivalent to GoalSelectionStrategy.FUTURE + # Available strategies (cf paper): future, final, episode, random + goal_selection_strategy = 'future' # equivalent to GoalSelectionStrategy.FUTURE - # If True the HER transitions will get sampled online - online_sampling = True - # Time limit for the episodes in online sampling (to deactivate for offline use the default value -1) - max_episode_length = N_BITS + # If True the HER transitions will get sampled online + online_sampling = True + # Time limit for the episodes + max_episode_length = N_BITS - # Initialize the model - model = HER('MlpPolicy', env, model_class, n_sampled_goal=4, goal_selection_strategy=goal_selection_strategy, online_sampling=online_sampling, - verbose=1, max_episode_length=max_episode_length) - # Train the model - model.learn(1000) + # Initialize the model + model = HER('MlpPolicy', env, model_class, n_sampled_goal=4, goal_selection_strategy=goal_selection_strategy, online_sampling=online_sampling, + verbose=1, max_episode_length=max_episode_length) + # Train the model + model.learn(1000) - model.save("./her_bit_env") + model.save("./her_bit_env") - # WARNING: you must pass an VecEnv - env = DummyVecEnv([lambda: env]) - model = HER.load('./her_bit_env', env=env) + # WARNING: you must pass an VecEnv + env = DummyVecEnv([lambda: env]) + model = HER.load('./her_bit_env', env=env) - obs = env.reset() - for _ in range(100): - # we need to convert the observation dict - obs = ObsDictWrapper.convert_dict(obs) - action, _ = model.model.predict(obs) - obs, reward, done, _ = env.step(action) + obs = env.reset() + for _ in range(100): + # we need to convert the observation dict + obs = ObsDictWrapper.convert_dict(obs) + action, _ = model.model.predict(obs) + obs, reward, done, _ = env.step(action) - if done: - obs = env.reset() + if done: + obs = env.reset() Parameters @@ -90,7 +90,7 @@ Goal Selection Strategies .. autoclass:: GoalSelectionStrategy :members: :inherited-members: - :undoc-members: + :undoc-members: Obs Dict Wrapper @@ -99,7 +99,7 @@ Obs Dict Wrapper .. 
autoclass:: ObsDictWrapper :members: :inherited-members: - :undoc-members: + :undoc-members: HER Replay Buffer From aaa80c820cee9564c5ca43e8739d7f236ac9455e Mon Sep 17 00:00:00 2001 From: Antonin RAFFIN Date: Wed, 7 Oct 2020 11:10:19 +0200 Subject: [PATCH 48/81] Cleanup docstrings --- docs/misc/changelog.rst | 4 +- .../common/vec_env/obs_dict_wrapper.py | 10 ++-- stable_baselines3/her/her.py | 42 +++++++-------- stable_baselines3/her/her_replay_buffer.py | 54 +++++++++---------- 4 files changed, 55 insertions(+), 55 deletions(-) diff --git a/docs/misc/changelog.rst b/docs/misc/changelog.rst index c10a50d15e..c922e9291b 100644 --- a/docs/misc/changelog.rst +++ b/docs/misc/changelog.rst @@ -12,6 +12,7 @@ Breaking Changes: New Features: ^^^^^^^^^^^^^ +- Added Hindsight Experience Replay ``HER``. (@megan-klaiber) Bug Fixes: ^^^^^^^^^^ @@ -49,7 +50,6 @@ Breaking Changes: New Features: ^^^^^^^^^^^^^ -- Added Hindsight Experience Replay ``HER``. (@megan-klaiber) - Added ``unwrap_vec_wrapper()`` to ``common.vec_env`` to extract ``VecEnvWrapper`` if needed - Added ``StopTrainingOnMaxEpisodes`` to callback collection (@xicocaio) - Added ``device`` keyword argument to ``BaseAlgorithm.load()`` (@liorcohen5) @@ -452,4 +452,4 @@ And all the contributors: @MarvineGothic @jdossgollin @SyllogismRXS @rusu24edward @jbulow @Antymon @seheevic @justinkterry @edbeeching @flodorner @KuKuXia @NeoExtended @PartiallyTyped @mmcenta @richardwu @kinalmehta @rolandgvc @tkelestemur @mloo3 @tirafesi @blurLake @koulakis @joeljosephjin @shwang @rk37 @andyshih12 @RaphaelWag @xicocaio -@diditforlulz273 @liorcohen5 @ManifoldFR @mloo3 @megan-klaiber \ No newline at end of file +@diditforlulz273 @liorcohen5 @ManifoldFR @mloo3 @megan-klaiber diff --git a/stable_baselines3/common/vec_env/obs_dict_wrapper.py b/stable_baselines3/common/vec_env/obs_dict_wrapper.py index 22fbae4060..e05b30b875 100644 --- a/stable_baselines3/common/vec_env/obs_dict_wrapper.py +++ b/stable_baselines3/common/vec_env/obs_dict_wrapper.py @@ -8,7 +8,7 @@ class ObsDictWrapper(VecEnvWrapper): """ Wrapper for a VecEnv which overrides the observation space for Hindsight Experience Replay to support dict observations. - :param env: (VecEnv) The vectorized environment to wrap. + :param env: The vectorized environment to wrap. """ def __init__(self, venv: VecEnv): @@ -58,9 +58,9 @@ def convert_dict( """ Concatenate observation and (desired) goal of observation dict. - :param observation_dict: (dict) Dictionary with observation. - :param observation_key: (str) Key of observation in dicitonary. - :param goal_key: (str) Key of (desired) goal in dicitonary. - :return: (np.ndarray) + :param observation_dict: Dictionary with observation. + :param observation_key: Key of observation in dicitonary. + :param goal_key: Key of (desired) goal in dicitonary. + :return: """ return np.concatenate([observation_dict[observation_key], observation_dict[goal_key]], axis=-1) diff --git a/stable_baselines3/her/her.py b/stable_baselines3/her/her.py index 49539c8f38..590106d9b7 100644 --- a/stable_baselines3/her/her.py +++ b/stable_baselines3/her/her.py @@ -24,9 +24,9 @@ def get_time_limit(env: VecEnv, current_max_episode_length: Optional[int]) -> in """ Get time limit from environment. - :param env: (VecEnv) Environment from which we want to get the time limit. - :param current_max_episode_length: (int) Current value for max_episode_length. - :return: (int) max episode length + :param env: Environment from which we want to get the time limit. 
+ :param current_max_episode_length: Current value for max_episode_length. + :return: max episode length """ # try to get the attribute from environment if current_max_episode_length is None: @@ -49,16 +49,16 @@ class HER(BaseAlgorithm): Paper: https://arxiv.org/abs/1707.01495 - :param policy: (BasePolicy or str) The policy model to use. - :param env: (GymEnv or str) The environment to learn from (if registered in Gym, can be str) - :param model_class: (OffPolicyAlgorithm) Off policy model which will be used with hindsight experience replay. (SAC, TD3) - :param n_sampled_goal: (int) Number of sampled goals for replay. (offline sampling) - :param goal_selection_strategy: (GoalSelectionStrategy or str) Strategy for sampling goals for replay. + :param policy: The policy model to use. + :param env: The environment to learn from (if registered in Gym, can be str) + :param model_class: Off policy model which will be used with hindsight experience replay. (SAC, TD3) + :param n_sampled_goal: Number of sampled goals for replay. (offline sampling) + :param goal_selection_strategy: Strategy for sampling goals for replay. One of ['episode', 'final', 'future', 'random'] - :param online_sampling: (bool) Sample HER transitions online. - :param learning_rate: (float or callable) learning rate for the optimizer, + :param online_sampling: Sample HER transitions online. + :param learning_rate: learning rate for the optimizer, it can be a function of the current progress remaining (from 1 to 0) - :param max_episode_length: (int) The maximum length of an episode. If not specified, + :param max_episode_length: The maximum length of an episode. If not specified, it will be automatically inferred if the environment uses a ``gym.wrappers.TimeLimit`` wrapper """ @@ -215,20 +215,20 @@ def collect_rollouts( """ Collect experiences and store them into a ReplayBuffer. - :param env: (VecEnv) The training environment - :param callback: (BaseCallback) Callback that will be called at each step + :param env: The training environment + :param callback: Callback that will be called at each step (and at the beginning and end of the rollout) - :param n_episodes: (int) Number of episodes to use to collect rollout data + :param n_episodes: Number of episodes to use to collect rollout data You can also specify a ``n_steps`` instead - :param n_steps: (int) Number of steps to use to collect rollout data + :param n_steps: Number of steps to use to collect rollout data You can also specify a ``n_episodes`` instead. - :param action_noise: (Optional[ActionNoise]) Action noise that will be used for exploration + :param action_noise: Action noise that will be used for exploration Required for deterministic policy (e.g. TD3). This can also be used in addition to the stochastic policy for SAC. - :param learning_starts: (int) Number of steps before learning for the warm-up phase. - :param replay_buffer: (ReplayBuffer or HerReplayBuffer) - :param log_interval: (int) Log data every ``log_interval`` episodes - :return: (RolloutReturn) + :param learning_starts: Number of steps before learning for the warm-up phase. + :param replay_buffer: + :param log_interval: Log data every ``log_interval`` episodes + :return: """ episode_rewards, total_timesteps = [], [] @@ -401,7 +401,7 @@ def save( """ Save all the attributes of the object and the model parameters in a zip-file. 
- :param path: (Union[str, pathlib.Path, io.BufferedIOBase]) path to the file where the rl agent should be saved + :param path: path to the file where the rl agent should be saved :param exclude: name of parameters that should be excluded in addition to the default one :param include: name of parameters that might be excluded but should be included anyway """ diff --git a/stable_baselines3/her/her_replay_buffer.py b/stable_baselines3/her/her_replay_buffer.py index 86d26f8f7b..381f574e89 100644 --- a/stable_baselines3/her/her_replay_buffer.py +++ b/stable_baselines3/her/her_replay_buffer.py @@ -17,16 +17,16 @@ class HerReplayBuffer(BaseBuffer): Replay Buffer for sampling HER (Hindsight Experience Replay) transitions online. These transitions will not be saved in the Buffer. - :param env: (VecEnv) The training environment - :param buffer_size: (int) The size of the buffer measured in transitions. - :param max_episode_length: (int) The length of an episode. (time horizon) - :param goal_selection_strategy: (GoalSelectionStrategy ) Strategy for sampling goals for replay. + :param env: The training environment + :param buffer_size: The size of the buffer measured in transitions. + :param max_episode_length: The length of an episode. (time horizon) + :param goal_selection_strategy: Strategy for sampling goals for replay. One of ['episode', 'final', 'future', 'random'] - :param observation_space: (spaces.Space) Observation space - :param action_space: (spaces.Space) Action space - :param device: (Union[th.device, str]) PyTorch device + :param observation_space: Observation space + :param action_space: Action space + :param device: PyTorch device to which the values will be converted - :param n_envs: (int) Number of parallel environments + :param n_envs: Number of parallel environments :her_ratio: (float) The ratio between HER replays and regular replays in percent (between 0 and 1, for online sampling) """ @@ -87,13 +87,13 @@ def sample( replay_observations: np.ndarray = None, ) -> Union[ReplayBufferSamples, Tuple]: """ - :param batch_size: (int) Number of element to sample - :param env: (Optional[VecNormalize]) associated gym VecEnv + :param batch_size: Number of element to sample + :param env: associated gym VecEnv to normalize the observations/rewards when sampling - :param online_sampling: (bool) Using online_sampling for HER or not. - :param n_sampled_goal: (int) Number of sampled goals for replay. (offline sampling) - :param replay_observations: (np.ndarray) Observations of the offline replay buffer. Needed for 'RANDOM' goal strategy. - :return: (ReplayBufferSamples or Tuple) + :param online_sampling: Using online_sampling for HER or not. + :param n_sampled_goal: Number of sampled goals for replay. (offline sampling) + :param replay_observations: Observations of the offline replay buffer. Needed for 'RANDOM' goal strategy. + :return: """ return self._sample_transitions(batch_size, env, online_sampling, n_sampled_goal, replay_observations) @@ -109,12 +109,12 @@ def vectorized_sample_goal( Sample goals based on goal_selection_strategy. This is a vectorized (fast) version. - :param episode_indices: (np.ndarray) Episode indices to use. - :param her_indices: (np.ndarray) HER indices. - :param transitions_indices: (np.ndarray) Transition indices to use. - :param online_sampling: (bool) Using online_sampling for HER or not. - :param replay_observations: (np.ndarray) Observations of the offline replay buffer. Needed for 'RANDOM' goal strategy. - :return: (np.ndarray) Return sampled goals. 
+ :param episode_indices: Episode indices to use. + :param her_indices: HER indices. + :param transitions_indices: Transition indices to use. + :param online_sampling: Using online_sampling for HER or not. + :param replay_observations: Observations of the offline replay buffer. Needed for 'RANDOM' goal strategy. + :return: Return sampled goals. """ her_episode_indices = episode_indices[her_indices] @@ -159,13 +159,13 @@ def _sample_transitions( replay_observations: np.ndarray = None, ) -> Union[ReplayBufferSamples, Tuple]: """ - :param batch_size: (int) Number of element to sample - :param env: (Optional[VecNormalize]) associated gym VecEnv + :param batch_size: Number of element to sample + :param env: associated gym VecEnv to normalize the observations/rewards when sampling - :param online_sampling: (bool) Using online_sampling for HER or not. - :param n_sampled_goal: (int) Number of sampled goals for replay. (offline sampling) - :param replay_observations: (np.ndarray) Observations of the offline replay buffer. Needed for 'RANDOM' goal strategy. - :return: (ReplayBufferSamples or Tuple) + :param online_sampling: Using online_sampling for HER or not. + :param n_sampled_goal: Number of sampled goals for replay. (offline sampling) + :param replay_observations: Observations of the offline replay buffer. Needed for 'RANDOM' goal strategy. + :return: """ # Select which episodes to use if online_sampling: @@ -288,6 +288,6 @@ def clear_buffer(self): def size(self) -> int: """ - :return: (int) The current size of the buffer in transitions. + :return: The current size of the buffer in transitions. """ return int(np.sum(self.episode_lengths)) From 362ea5c627f6af960ca346e644926966b3768d9f Mon Sep 17 00:00:00 2001 From: Megan Klaiber Date: Thu, 8 Oct 2020 10:46:20 +0200 Subject: [PATCH 49/81] Updated to review comments --- docs/modules/her.rst | 10 +--- stable_baselines3/common/base_class.py | 2 +- stable_baselines3/common/buffers.py | 2 + .../common/off_policy_algorithm.py | 7 +++ stable_baselines3/her/her.py | 57 ++++++++++++++----- stable_baselines3/her/her_replay_buffer.py | 44 ++++++++------ tests/test_her.py | 47 ++++++--------- 7 files changed, 96 insertions(+), 73 deletions(-) diff --git a/docs/modules/her.rst b/docs/modules/her.rst index 11f01fc453..8f8c8f36c6 100644 --- a/docs/modules/her.rst +++ b/docs/modules/her.rst @@ -35,8 +35,7 @@ Example .. 
code-block:: python - from stable_baselines3 import DDPG, DQN, SAC, TD3 - from stable_baselines3.her.her import HER + from stable_baselines3 import HER, DDPG, DQN, SAC, TD3 from stable_baselines3.her.goal_selection_strategy import GoalSelectionStrategy from stable_baselines3.common.bit_flipping_env import BitFlippingEnv from stable_baselines3.common.vec_env import DummyVecEnv @@ -62,16 +61,11 @@ Example model.learn(1000) model.save("./her_bit_env") - - # WARNING: you must pass an VecEnv - env = DummyVecEnv([lambda: env]) model = HER.load('./her_bit_env', env=env) obs = env.reset() for _ in range(100): - # we need to convert the observation dict - obs = ObsDictWrapper.convert_dict(obs) - action, _ = model.model.predict(obs) + action, _ = model.model.predict(obs, deterministic=True) obs, reward, done, _ = env.step(action) if done: diff --git a/stable_baselines3/common/base_class.py b/stable_baselines3/common/base_class.py index 667a1c2155..97d0035599 100644 --- a/stable_baselines3/common/base_class.py +++ b/stable_baselines3/common/base_class.py @@ -173,7 +173,7 @@ def _wrap_env(self, env: GymEnv) -> VecEnv: print("Wrapping the env in a VecTransposeImage.") env = VecTransposeImage(env) - # check if wrapper for dict support is needed + # check if wrapper for dict support is needed when using HER if isinstance(env.observation_space, gym.spaces.dict.Dict): env = ObsDictWrapper(env) diff --git a/stable_baselines3/common/buffers.py b/stable_baselines3/common/buffers.py index 2063d6c302..bd07069854 100644 --- a/stable_baselines3/common/buffers.py +++ b/stable_baselines3/common/buffers.py @@ -83,6 +83,8 @@ def extend(self, *args, **kwargs) -> None: """ # Do a for loop along the batch axis for data in zip(*args): + # import ipdb + # ipdb.set_trace() self.add(*data) def reset(self) -> None: diff --git a/stable_baselines3/common/off_policy_algorithm.py b/stable_baselines3/common/off_policy_algorithm.py index cf08b4444e..e3ffeb61f0 100644 --- a/stable_baselines3/common/off_policy_algorithm.py +++ b/stable_baselines3/common/off_policy_algorithm.py @@ -67,6 +67,8 @@ class OffPolicyAlgorithm(BaseAlgorithm): :param use_sde_at_warmup: Whether to use gSDE instead of uniform sampling during the warm up phase (before learning starts) :param sde_support: Whether the model support gSDE or not + :param remove_time_limit_termination: Remove terminations (dones) that are due to time limit. + See https://github.com/hill-a/stable-baselines/issues/863 """ def __init__( @@ -97,6 +99,7 @@ def __init__( sde_sample_freq: int = -1, use_sde_at_warmup: bool = False, sde_support: bool = True, + remove_time_limit_termination: bool = False, ): super(OffPolicyAlgorithm, self).__init__( @@ -126,6 +129,10 @@ def __init__( self.action_noise = action_noise self.optimize_memory_usage = optimize_memory_usage + # Remove terminations (dones) that are due to time limit + # see https://github.com/hill-a/stable-baselines/issues/863 + self.remove_time_limit_termination = remove_time_limit_termination + if train_freq > 0 and n_episodes_rollout > 0: warnings.warn( "You passed a positive value for `train_freq` and `n_episodes_rollout`." 
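The new ``remove_time_limit_termination`` flag relies on the ``TimeLimit.truncated`` key that gym's ``TimeLimit`` wrapper writes into ``info`` when an episode ends only because the step limit was reached. A minimal, standalone illustration of that signal (the environment id and the 50-step limit are arbitrary examples, not part of the patch):

import gym

# Any environment works; Pendulum-v0 and max_episode_steps=50 are just examples.
env = gym.wrappers.TimeLimit(gym.make("Pendulum-v0").unwrapped, max_episode_steps=50)

obs = env.reset()
done = False
while not done:
    obs, reward, done, info = env.step(env.action_space.sample())

truncated = info.get("TimeLimit.truncated", False)
# A truncated episode is not a "real" termination: the value of the next state
# should still be bootstrapped, which is what the flag is meant to preserve.
true_done = done and not truncated
print(done, truncated, true_done)

The HER rollout collection in the her.py hunk that follows applies the same check to decide which done signal and next observation to store.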
diff --git a/stable_baselines3/her/her.py b/stable_baselines3/her/her.py index 590106d9b7..c1565137b8 100644 --- a/stable_baselines3/her/her.py +++ b/stable_baselines3/her/her.py @@ -1,6 +1,6 @@ import io import pathlib -from typing import Callable, Iterable, List, Optional, Tuple, Type, Union +from typing import Any, Callable, Iterable, List, Optional, Tuple, Type, Union import gym import numpy as np @@ -11,10 +11,11 @@ from stable_baselines3.common.noise import ActionNoise from stable_baselines3.common.off_policy_algorithm import OffPolicyAlgorithm from stable_baselines3.common.policies import BasePolicy +from stable_baselines3.common.preprocessing import is_image_space from stable_baselines3.common.save_util import load_from_zip_file, recursive_getattr, recursive_setattr from stable_baselines3.common.type_aliases import GymEnv, MaybeCallback, RolloutReturn from stable_baselines3.common.utils import check_for_correct_spaces -from stable_baselines3.common.vec_env import VecEnv +from stable_baselines3.common.vec_env import DummyVecEnv, VecEnv, VecTransposeImage from stable_baselines3.common.vec_env.obs_dict_wrapper import ObsDictWrapper from stable_baselines3.her.goal_selection_strategy import KEY_TO_GOAL_STRATEGY, GoalSelectionStrategy from stable_baselines3.her.her_replay_buffer import HerReplayBuffer @@ -43,6 +44,7 @@ def get_time_limit(env: VecEnv, current_max_episode_length: Optional[int]) -> in return current_max_episode_length +# TODO: rewrite HER class as soon as dict obs are supported class HER(BaseAlgorithm): """ Hindsight Experience Replay (HER) @@ -51,7 +53,7 @@ class HER(BaseAlgorithm): :param policy: The policy model to use. :param env: The environment to learn from (if registered in Gym, can be str) - :param model_class: Off policy model which will be used with hindsight experience replay. (SAC, TD3) + :param model_class: Off policy model which will be used with hindsight experience replay. (SAC, TD3, DDPG, DQN) :param n_sampled_goal: Number of sampled goals for replay. (offline sampling) :param goal_selection_strategy: Strategy for sampling goals for replay. 
One of ['episode', 'final', 'future', 'random'] @@ -67,7 +69,7 @@ def __init__( policy: Union[str, Type[BasePolicy]], env: Union[GymEnv, str], model_class: Type[OffPolicyAlgorithm], - n_sampled_goal: int = 5, + n_sampled_goal: int = 4, goal_selection_strategy: Union[GoalSelectionStrategy, str] = "future", online_sampling: bool = False, learning_rate: Union[float, Callable] = 3e-4, @@ -105,6 +107,7 @@ def __init__( self.n_sampled_goal = n_sampled_goal # if we sample her transitions online use custom replay buffer self.online_sampling = online_sampling + # compute ratio between HER replays and regular replays in percent for online HER sampling self.her_ratio = 1 - (1.0 / (self.n_sampled_goal + 1)) # maximum steps in episode self.max_episode_length = get_time_limit(self.env, max_episode_length) @@ -263,8 +266,6 @@ def collect_rollouts( # Perform action new_obs, reward, done, infos = env.step(action) - done = done if episode_timesteps < self.max_episode_length else False - self.num_timesteps += 1 self.model.num_timesteps = self.num_timesteps episode_timesteps += 1 @@ -292,16 +293,30 @@ def collect_rollouts( self._last_original_obs, new_obs_, reward_ = observation, new_obs, reward self.model._last_original_obs = self._last_original_obs + # Remove termination signal due to timelimit if needed + # NOTE: this may cause issue when using memory optimized replay + # or n-step replay + if self.remove_time_limit_termination and infos[0].get("TimeLimit.truncated", False): + done_ = np.array([False]) + # As the VecEnv resets automatically, new_obs is already the + # first observation of the next episode + next_obs = infos[0]["terminal_observation"] + if self._vec_normalize_env is not None: + next_obs = self._vec_normalize_env.unnormalize_obs(next_obs) + else: + done_ = done + next_obs = new_obs_ + if self.online_sampling: - self.replay_buffer.add(self._last_original_obs, new_obs_, buffer_action, reward_, done, infos) + self.replay_buffer.add(self._last_original_obs, next_obs, buffer_action, reward_, done_, infos) else: # concatenate observation with (desired) goal obs = ObsDictWrapper.convert_dict(self._last_original_obs) - next_obs = ObsDictWrapper.convert_dict(new_obs_) - # add to replay bufffer - self.replay_buffer.add(obs, next_obs, buffer_action, reward_, done) + next_obs_ = ObsDictWrapper.convert_dict(next_obs) + # add to replay buffer + self.replay_buffer.add(obs, next_obs_, buffer_action, reward_, done_) # add current transition to episode storage - self._episode_storage.add(self._last_original_obs, new_obs_, buffer_action, reward_, done, infos) + self._episode_storage.add(self._last_original_obs, next_obs, buffer_action, reward_, done_, infos) self._last_obs = new_obs self.model._last_obs = self._last_obs @@ -370,8 +385,9 @@ def _store_transitions(self) -> None: self.replay_buffer.observations, ) - # TODO done = False? # store data in replay buffer + # self.replay_buffer.extend(observations, next_observations, transitions["action"], transitions["reward"], np.array([False])) + for i in her_indices: obs = observations[i] next_obs = next_observations[i] @@ -380,14 +396,14 @@ def _store_transitions(self) -> None: done = np.array([False]) self.replay_buffer.add(obs, next_obs, buffer_action, reward, done) - def __getattr__(self, item): + def __getattr__(self, item: str) -> Any: """ Find attribute from model class if this class does not have it. 
""" if hasattr(self.model, item): return getattr(self.model, item) else: - raise AttributeError + raise AttributeError(f"{self} has no attribute {item}") def _get_torch_save_params(self) -> Tuple[List[str], List[str]]: return self.model._get_torch_save_params() @@ -444,11 +460,22 @@ def load(cls, load_path: str, env: Optional[GymEnv] = None, **kwargs) -> "BaseAl # check if given env is valid if env is not None: # check if wrapper for dict support is needed + # if isinstance(env.observation_space, gym.spaces.dict.Dict): + # env = ObsDictWrapper(env) + + if not isinstance(env, VecEnv): + env = DummyVecEnv([lambda: env]) + + if is_image_space(env.observation_space) and not isinstance(env, VecTransposeImage): + env = VecTransposeImage(env) + + # check if wrapper for dict support when using HER is needed if isinstance(env.observation_space, gym.spaces.dict.Dict): env = ObsDictWrapper(env) + check_for_correct_spaces(env, data["observation_space"], data["action_space"]) # if no new env was given use stored env if possible - if env is None and "env" in data: + if env is None: env = data["env"] kwargs = {} diff --git a/stable_baselines3/her/her_replay_buffer.py b/stable_baselines3/her/her_replay_buffer.py index 381f574e89..4c9f017c75 100644 --- a/stable_baselines3/her/her_replay_buffer.py +++ b/stable_baselines3/her/her_replay_buffer.py @@ -6,7 +6,7 @@ from gym import spaces from stable_baselines3.common.buffers import BaseBuffer -from stable_baselines3.common.type_aliases import ReplayBufferSamples +from stable_baselines3.common.type_aliases import ReplayBufferSamples, RolloutBufferSamples from stable_baselines3.common.vec_env import VecNormalize from stable_baselines3.common.vec_env.obs_dict_wrapper import ObsDictWrapper from stable_baselines3.her.goal_selection_strategy import GoalSelectionStrategy @@ -27,7 +27,7 @@ class HerReplayBuffer(BaseBuffer): :param device: PyTorch device to which the values will be converted :param n_envs: Number of parallel environments - :her_ratio: (float) The ratio between HER replays and regular replays in percent (between 0 and 1, for online sampling) + :her_ratio: The ratio between HER replays and regular replays in percent (between 0 and 1, for online sampling) """ def __init__( @@ -40,7 +40,7 @@ def __init__( action_space: spaces.Space, device: Union[th.device, str] = "cpu", n_envs: int = 1, - her_ratio: float = 0.6, + her_ratio: float = 0.8, ): super(HerReplayBuffer, self).__init__(buffer_size, observation_space, action_space, device, n_envs) @@ -78,6 +78,11 @@ def __init__( # percentage of her indices self.her_ratio = her_ratio + def _get_samples( + self, batch_inds: np.ndarray, env: Optional[VecNormalize] = None + ) -> Union[ReplayBufferSamples, RolloutBufferSamples]: + pass + def sample( self, batch_size: int, @@ -88,16 +93,16 @@ def sample( ) -> Union[ReplayBufferSamples, Tuple]: """ :param batch_size: Number of element to sample - :param env: associated gym VecEnv + :param env: Associated gym VecEnv to normalize the observations/rewards when sampling :param online_sampling: Using online_sampling for HER or not. :param n_sampled_goal: Number of sampled goals for replay. (offline sampling) :param replay_observations: Observations of the offline replay buffer. Needed for 'RANDOM' goal strategy. - :return: + :return: Samples. 
""" return self._sample_transitions(batch_size, env, online_sampling, n_sampled_goal, replay_observations) - def vectorized_sample_goal( + def sample_goal( self, episode_indices: np.ndarray, her_indices: np.ndarray, @@ -165,7 +170,7 @@ def _sample_transitions( :param online_sampling: Using online_sampling for HER or not. :param n_sampled_goal: Number of sampled goals for replay. (offline sampling) :param replay_observations: Observations of the offline replay buffer. Needed for 'RANDOM' goal strategy. - :return: + :return: Samples. """ # Select which episodes to use if online_sampling: @@ -176,17 +181,23 @@ def _sample_transitions( episode_length = self.episode_lengths[0] episode_indices = np.array(list(range(self.n_episodes_stored)) * episode_length * n_sampled_goal) her_indices = np.arange(len(episode_indices)) + ep_length = self.episode_lengths[episode_indices] # repeat every transition index n_sampled_goals times transitions_indices = np.array(list(range(episode_length)) * n_sampled_goal) if self.goal_selection_strategy == GoalSelectionStrategy.FUTURE: # restrict the sampling domain when ep_length > 1 # otherwise filter out the indices + if online_sampling: her_indices = her_indices[ep_length[her_indices] > 1] ep_length[her_indices] -= 1 else: her_indices = her_indices[episode_length > 1 and transitions_indices < episode_length - 1] + """ + her_indices = her_indices[ep_length[her_indices] > 1] + ep_length[her_indices] -= 1 + """ if online_sampling: # Select which transitions to use @@ -194,13 +205,10 @@ def _sample_transitions( # get selected transitions transitions = {key: self.buffer[key][episode_indices, transitions_indices].copy() for key in self.buffer.keys()} - new_goals = self.vectorized_sample_goal( - episode_indices, her_indices, transitions_indices, online_sampling, replay_observations - ) + new_goals = self.sample_goal(episode_indices, her_indices, transitions_indices, online_sampling, replay_observations) transitions["desired_goal"][her_indices] = new_goals - # Convert to numpy array - # TODO: disable if not needed for faster computation + # Convert info buffer to numpy array transitions["info"] = np.array( [ self.info_buffer[episode_idx][transition_idx] @@ -209,11 +217,11 @@ def _sample_transitions( ) # Vectorized computation - transitions["reward"][her_indices] = self.env.env_method( + transitions["reward"][her_indices, 0] = self.env.env_method( "compute_reward", - transitions["next_achieved_goal"][her_indices], - transitions["desired_goal"][her_indices], - transitions["info"][her_indices], + transitions["next_achieved_goal"][her_indices, 0], + transitions["desired_goal"][her_indices, 0], + transitions["info"][her_indices, 0], ) # concatenate observation with (desired) goal @@ -222,9 +230,9 @@ def _sample_transitions( if online_sampling: data = ( - self._normalize_obs(observations, env), + self._normalize_obs(observations[:, 0], env), transitions["action"], - self._normalize_obs(next_observations, env), + self._normalize_obs(next_observations[:, 0], env), transitions["done"], self._normalize_reward(transitions["reward"], env), ) diff --git a/tests/test_her.py b/tests/test_her.py index b99fdea87b..5ffc38288b 100644 --- a/tests/test_her.py +++ b/tests/test_her.py @@ -5,27 +5,23 @@ import pytest import torch as th -from stable_baselines3 import DDPG, DQN, SAC, TD3 +from stable_baselines3 import DDPG, DQN, HER, SAC, TD3 from stable_baselines3.common.bit_flipping_env import BitFlippingEnv -from stable_baselines3.common.noise import OrnsteinUhlenbeckActionNoise -from 
stable_baselines3.common.vec_env import DummyVecEnv from stable_baselines3.common.vec_env.obs_dict_wrapper import ObsDictWrapper from stable_baselines3.her.goal_selection_strategy import GoalSelectionStrategy -from stable_baselines3.her.her import HER -@pytest.mark.parametrize("model_class, policy", [(SAC, "MlpPolicy"), (TD3, "MlpPolicy"), (DDPG, "MlpPolicy")]) +@pytest.mark.parametrize("model_class", [SAC, TD3, DDPG, DQN]) @pytest.mark.parametrize("online_sampling", [True, False]) -def test_her(model_class, policy, online_sampling): +def test_her(model_class, online_sampling): """ Test Hindsight Experience Replay. """ n_bits = 4 - env = BitFlippingEnv(n_bits=n_bits, continuous=True) - env = DummyVecEnv([lambda: env]) + env = BitFlippingEnv(n_bits=n_bits, continuous=not (model_class == DQN)) model = HER( - policy, + "MlpPolicy", env, model_class, goal_selection_strategy="future", @@ -34,9 +30,10 @@ def test_her(model_class, policy, online_sampling): train_freq=1, n_episodes_rollout=-1, max_episode_length=n_bits, + policy_kwargs=dict(net_arch=[64]), ) - model.learn(total_timesteps=500, callback=None) + model.learn(total_timesteps=500) @pytest.mark.parametrize( @@ -58,7 +55,6 @@ def test_goal_selection_strategy(goal_selection_strategy, online_sampling): Test different goal strategies. """ env = BitFlippingEnv(continuous=True) - env = DummyVecEnv([lambda: env]) model = HER( "MlpPolicy", @@ -70,13 +66,14 @@ def test_goal_selection_strategy(goal_selection_strategy, online_sampling): train_freq=1, n_episodes_rollout=-1, max_episode_length=10, + policy_kwargs=dict(net_arch=[64]), ) - model.learn(total_timesteps=200, callback=None) + model.learn(total_timesteps=200) -@pytest.mark.parametrize("model_class, policy", [(SAC, "MlpPolicy"), (TD3, "MlpPolicy"), (DDPG, "MlpPolicy")]) +@pytest.mark.parametrize("model_class", [SAC, TD3, DDPG, DQN]) @pytest.mark.parametrize("use_sde", [False, True]) -def test_save_load(tmp_path, model_class, policy, use_sde): +def test_save_load(tmp_path, model_class, use_sde): """ Test if 'save' and 'load' saves and loads model correctly """ @@ -84,29 +81,18 @@ def test_save_load(tmp_path, model_class, policy, use_sde): pytest.skip("Only SAC has gSDE support") n_bits = 4 - env = BitFlippingEnv(n_bits=n_bits, continuous=True) - env = DummyVecEnv([lambda: env]) - - # Create action noise - n_actions = env.action_space.shape[0] - action_noise = OrnsteinUhlenbeckActionNoise( - np.zeros( - n_actions, - ), - 0.2 * np.ones((n_actions,)), - ) + env = BitFlippingEnv(n_bits=n_bits, continuous=not (model_class == DQN)) kwargs = dict(use_sde=True) if use_sde else {} # create model model = HER( - policy, + "MlpPolicy", env, model_class, n_sampled_goal=5, goal_selection_strategy="future", online_sampling=True, - action_noise=action_noise, verbose=0, tau=0.05, batch_size=128, @@ -121,17 +107,16 @@ def test_save_load(tmp_path, model_class, policy, use_sde): **kwargs ) - model.learn(total_timesteps=500, callback=None) + model.learn(total_timesteps=500) env.reset() observations_list = [] for _ in range(10): - obs = env.step([env.action_space.sample()])[0] + obs = env.step(env.action_space.sample())[0] observation = ObsDictWrapper.convert_dict(obs) observations_list.append(observation) - - observations = np.concatenate(observations_list, axis=0) + observations = np.array(observations_list) # Get dictionary of current parameters params = deepcopy(model.model.policy.state_dict()) From 7f8b63617e97a921d1abbc38ce7dfdc2a1f08c0b Mon Sep 17 00:00:00 2001 From: Antonin RAFFIN Date: Mon, 12 Oct 2020 
10:07:39 +0200 Subject: [PATCH 50/81] Fix pytype --- stable_baselines3/her/her_replay_buffer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/stable_baselines3/her/her_replay_buffer.py b/stable_baselines3/her/her_replay_buffer.py index 4c9f017c75..f5a2f3e339 100644 --- a/stable_baselines3/her/her_replay_buffer.py +++ b/stable_baselines3/her/her_replay_buffer.py @@ -81,7 +81,7 @@ def __init__( def _get_samples( self, batch_inds: np.ndarray, env: Optional[VecNormalize] = None ) -> Union[ReplayBufferSamples, RolloutBufferSamples]: - pass + raise NotImplementedError() def sample( self, From 39a63b8a6428d8b6624ddd5331efb5fdaf034a9a Mon Sep 17 00:00:00 2001 From: Megan Klaiber Date: Wed, 14 Oct 2020 00:00:30 +0200 Subject: [PATCH 51/81] Update according to review comments. --- stable_baselines3/common/buffers.py | 2 - .../common/vec_env/obs_dict_wrapper.py | 6 +- stable_baselines3/her/her.py | 101 +++++++++--------- stable_baselines3/her/her_replay_buffer.py | 16 ++- 4 files changed, 58 insertions(+), 67 deletions(-) diff --git a/stable_baselines3/common/buffers.py b/stable_baselines3/common/buffers.py index bd07069854..2063d6c302 100644 --- a/stable_baselines3/common/buffers.py +++ b/stable_baselines3/common/buffers.py @@ -83,8 +83,6 @@ def extend(self, *args, **kwargs) -> None: """ # Do a for loop along the batch axis for data in zip(*args): - # import ipdb - # ipdb.set_trace() self.add(*data) def reset(self) -> None: diff --git a/stable_baselines3/common/vec_env/obs_dict_wrapper.py b/stable_baselines3/common/vec_env/obs_dict_wrapper.py index e05b30b875..5b1dd1a106 100644 --- a/stable_baselines3/common/vec_env/obs_dict_wrapper.py +++ b/stable_baselines3/common/vec_env/obs_dict_wrapper.py @@ -1,3 +1,5 @@ +from typing import Dict + import numpy as np from gym import spaces @@ -53,7 +55,7 @@ def step_wait(self): @staticmethod def convert_dict( - observation_dict: dict, observation_key: str = "observation", goal_key: str = "desired_goal" + observation_dict: Dict[str, np.ndarray], observation_key: str = "observation", goal_key: str = "desired_goal" ) -> np.ndarray: """ Concatenate observation and (desired) goal of observation dict. @@ -61,6 +63,6 @@ def convert_dict( :param observation_dict: Dictionary with observation. :param observation_key: Key of observation in dicitonary. :param goal_key: Key of (desired) goal in dicitonary. - :return: + :return: Concatenated observation. """ return np.concatenate([observation_dict[observation_key], observation_dict[goal_key]], axis=-1) diff --git a/stable_baselines3/her/her.py b/stable_baselines3/her/her.py index c1565137b8..e495415b03 100644 --- a/stable_baselines3/her/her.py +++ b/stable_baselines3/her/her.py @@ -4,6 +4,7 @@ import gym import numpy as np +import torch as th from stable_baselines3.common.base_class import BaseAlgorithm from stable_baselines3.common.buffers import ReplayBuffer @@ -48,9 +49,12 @@ def get_time_limit(env: VecEnv, current_max_episode_length: Optional[int]) -> in class HER(BaseAlgorithm): """ Hindsight Experience Replay (HER) - Paper: https://arxiv.org/abs/1707.01495 + WARNING: Requires maximum episode length provided either by the environment or by the user! + + For additional offline algorithm specific arguments please have a look at the corresponding documentation. + :param policy: The policy model to use. :param env: The environment to learn from (if registered in Gym, can be str) :param model_class: Off policy model which will be used with hindsight experience replay. 
(SAC, TD3, DDPG, DQN) @@ -61,7 +65,7 @@ class HER(BaseAlgorithm): :param learning_rate: learning rate for the optimizer, it can be a function of the current progress remaining (from 1 to 0) :param max_episode_length: The maximum length of an episode. If not specified, - it will be automatically inferred if the environment uses a ``gym.wrappers.TimeLimit`` wrapper + it will be automatically inferred if the environment uses a ``gym.wrappers.TimeLimit`` wrapper. """ def __init__( @@ -72,20 +76,18 @@ def __init__( n_sampled_goal: int = 4, goal_selection_strategy: Union[GoalSelectionStrategy, str] = "future", online_sampling: bool = False, - learning_rate: Union[float, Callable] = 3e-4, max_episode_length: Optional[int] = None, *args, **kwargs, ): - super(HER, self).__init__(policy=BasePolicy, env=env, policy_base=BasePolicy, learning_rate=learning_rate) + super(HER, self).__init__(policy=BasePolicy, env=env, policy_base=BasePolicy, learning_rate=3e-4) # model initialization self.model_class = model_class self.model = model_class( policy=policy, env=self.env, - learning_rate=learning_rate, *args, **kwargs, # pytype: disable=wrong-keyword-args ) @@ -114,7 +116,7 @@ def __init__( # storage for transitions of current episode self._episode_storage = HerReplayBuffer( self.env, - self.max_episode_length, + self.buffer_size, self.max_episode_length, self.goal_selection_strategy, self.env.observation_space, @@ -124,20 +126,12 @@ def __init__( self.her_ratio, # pytype: disable=wrong-arg-types ) + # assign episode storage to replay buffer when using online HER sampling + if self.online_sampling: + self.model.replay_buffer = self._episode_storage + # counter for steps in episode self.episode_steps = 0 - if self.online_sampling: - self.model.replay_buffer = HerReplayBuffer( - self.env, - self.buffer_size, - self.max_episode_length, - self.goal_selection_strategy, - self.env.observation_space, - self.env.action_space, - self.device, - self.n_envs, - self.her_ratio, # pytype: disable=wrong-arg-types - ) def _setup_model(self) -> None: self.model._setup_model() @@ -338,15 +332,15 @@ def collect_rollouts( if 0 < n_steps <= total_steps: break - if done or self.episode_steps == self.max_episode_length: + if done or self.episode_steps >= self.max_episode_length: if self.online_sampling: self.replay_buffer.store_episode() else: self._episode_storage.store_episode() # store episode in replay buffer self._store_transitions() - # clear storage for current episode - self._episode_storage.reset() + # clear storage for current episode + self._episode_storage.reset() total_episodes += 1 self._episode_num += 1 @@ -361,8 +355,6 @@ def collect_rollouts( if log_interval is not None and self._episode_num % log_interval == 0: self._dump_logs() - # reset if done or episode length is reached - self.env.reset() self.episode_steps = 0 mean_reward = np.mean(episode_rewards) if total_episodes > 0 else 0.0 @@ -377,7 +369,7 @@ def _store_transitions(self) -> None: """ # sample goals and get new observations - observations, next_observations, transitions, her_indices = self._episode_storage.sample( + observations, next_observations, transitions = self._episode_storage.sample( self.batch_size, self.env, self.online_sampling, @@ -386,15 +378,8 @@ def _store_transitions(self) -> None: ) # store data in replay buffer - # self.replay_buffer.extend(observations, next_observations, transitions["action"], transitions["reward"], np.array([False])) - - for i in her_indices: - obs = observations[i] - next_obs = next_observations[i] - 
buffer_action = transitions["action"][i] - reward = transitions["reward"][i] - done = np.array([False]) - self.replay_buffer.add(obs, next_obs, buffer_action, reward, done) + dones = np.zeros((len(observations)), dtype=bool) + self.replay_buffer.extend(observations, next_observations, transitions["action"], transitions["reward"], dones) def __getattr__(self, item: str) -> Any: """ @@ -429,24 +414,37 @@ def save( self.model.model_class = self.model_class self.model.max_episode_length = self.max_episode_length + # exclude episode storage + if exclude is None: + exclude = [] + exclude = ["_episode_storage"].extend(exclude) + self.model.save(path, exclude, include) @classmethod - def load(cls, load_path: str, env: Optional[GymEnv] = None, **kwargs) -> "BaseAlgorithm": + def load( + cls, + path: Union[str, pathlib.Path, io.BufferedIOBase], + env: Optional[GymEnv] = None, + device: Union[th.device, str] = "auto", + **kwargs, + ) -> "BaseAlgorithm": """ Load the model from a zip-file - :param load_path: the location of the saved data + :param path: path to the file (or a file-like) where to + load the agent from :param env: the new environment to run the loaded model on (can be None if you only need prediction from a trained model) has priority over any saved environment + :param device: Device on which the code should run. :param kwargs: extra arguments to change the model when loading """ - data, params, tensors = load_from_zip_file(load_path) + data, params, pytorch_variables = load_from_zip_file(path, device=device) + # Remove stored device information and replace with ours if "policy_kwargs" in data: - for arg_to_remove in ["device"]: - if arg_to_remove in data["policy_kwargs"]: - del data["policy_kwargs"][arg_to_remove] + if "device" in data["policy_kwargs"]: + del data["policy_kwargs"]["device"] if "policy_kwargs" in kwargs and kwargs["policy_kwargs"] != data["policy_kwargs"]: raise ValueError( @@ -457,12 +455,10 @@ def load(cls, load_path: str, env: Optional[GymEnv] = None, **kwargs) -> "BaseAl # check if observation space and action space are part of the saved parameters if "observation_space" not in data or "action_space" not in data: raise KeyError("The observation_space and action_space were not given, can't verify new environments") + # check if given env is valid if env is not None: # check if wrapper for dict support is needed - # if isinstance(env.observation_space, gym.spaces.dict.Dict): - # env = ObsDictWrapper(env) - if not isinstance(env, VecEnv): env = DummyVecEnv([lambda: env]) @@ -475,8 +471,10 @@ def load(cls, load_path: str, env: Optional[GymEnv] = None, **kwargs) -> "BaseAl check_for_correct_spaces(env, data["observation_space"], data["action_space"]) # if no new env was given use stored env if possible - if env is None: - env = data["env"] + else: + # Use stored env, if one exists. 
If not, continue as is (can be used for predict) + if "env" in data: + env = data["env"] kwargs = {} if "use_sde" in data and data["use_sde"]: @@ -490,7 +488,6 @@ def load(cls, load_path: str, env: Optional[GymEnv] = None, **kwargs) -> "BaseAl n_sampled_goal=data["n_sampled_goal"], goal_selection_strategy=data["goal_selection_strategy"], online_sampling=data["online_sampling"], - learning_rate=data["learning_rate"], max_episode_length=data["max_episode_length"], policy_kwargs=data["policy_kwargs"], _init_setup_model=True, # pytype: disable=not-instantiable,wrong-keyword-args @@ -506,14 +503,12 @@ def load(cls, load_path: str, env: Optional[GymEnv] = None, **kwargs) -> "BaseAl her_model._episode_num = her_model.model._episode_num # put state_dicts back in place - for name in params: - attr = recursive_getattr(her_model.model, name) - attr.load_state_dict(params[name]) - - # put tensors back in place - if tensors is not None: - for name in tensors: - recursive_setattr(her_model.model, name, tensors[name]) + her_model.model.set_parameters(params, exact_match=True, device=device) + + # put other pytorch variables back in place + if pytorch_variables is not None: + for name in pytorch_variables: + recursive_setattr(her_model.model, name, pytorch_variables[name]) # Sample gSDE exploration matrix, so it uses the right device # see issue #44 diff --git a/stable_baselines3/her/her_replay_buffer.py b/stable_baselines3/her/her_replay_buffer.py index f5a2f3e339..b4db3b2609 100644 --- a/stable_baselines3/her/her_replay_buffer.py +++ b/stable_baselines3/her/her_replay_buffer.py @@ -14,8 +14,8 @@ class HerReplayBuffer(BaseBuffer): """ - Replay Buffer for sampling HER (Hindsight Experience Replay) transitions online. - These transitions will not be saved in the Buffer. + Replay Buffer for sampling HER (Hindsight Experience Replay) transitions. + In the online sampling case these new transitions will not be saved in the Buffer. :param env: The training environment :param buffer_size: The size of the buffer measured in transitions. 
@@ -148,6 +148,7 @@ def sample_goal( obs = replay_observations[index] # get only the observation part of the state obs_dim = self.env.obs_dim + # get from every observation from first env the observation part (without concatenated desired goal) obs_array = obs[:, :, :obs_dim] return obs_array else: @@ -179,29 +180,24 @@ def _sample_transitions( ep_length = self.episode_lengths[episode_indices] else: episode_length = self.episode_lengths[0] - episode_indices = np.array(list(range(self.n_episodes_stored)) * episode_length * n_sampled_goal) + episode_indices = np.array(list(range(1)) * episode_length * n_sampled_goal) her_indices = np.arange(len(episode_indices)) - ep_length = self.episode_lengths[episode_indices] # repeat every transition index n_sampled_goals times transitions_indices = np.array(list(range(episode_length)) * n_sampled_goal) if self.goal_selection_strategy == GoalSelectionStrategy.FUTURE: # restrict the sampling domain when ep_length > 1 # otherwise filter out the indices - if online_sampling: her_indices = her_indices[ep_length[her_indices] > 1] ep_length[her_indices] -= 1 else: her_indices = her_indices[episode_length > 1 and transitions_indices < episode_length - 1] - """ - her_indices = her_indices[ep_length[her_indices] > 1] - ep_length[her_indices] -= 1 - """ if online_sampling: # Select which transitions to use transitions_indices = np.random.randint(ep_length) + # get selected transitions transitions = {key: self.buffer[key][episode_indices, transitions_indices].copy() for key in self.buffer.keys()} @@ -239,7 +235,7 @@ def _sample_transitions( return ReplayBufferSamples(*tuple(map(self.to_torch, data))) else: - return observations, next_observations, transitions, her_indices + return observations, next_observations, transitions def add( self, From 258deff02019be5eab39d23e5e6dc53e4b4f9e7b Mon Sep 17 00:00:00 2001 From: Megan Klaiber Date: Wed, 14 Oct 2020 15:03:58 +0200 Subject: [PATCH 52/81] Removed random goal strategy. Updated sample transitions. 
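With the 'random' strategy removed, three goal selection strategies remain, and in the offline case every stored transition is duplicated n_sampled_goal times before relabelling. A toy sketch of both ideas on a single stored episode (array contents and sizes are made up for illustration, not taken from the buffer):

import numpy as np

# One stored episode of achieved goals, shape (episode_length, goal_dim); values are made up.
achieved_goals = np.arange(10).reshape(5, 2)
episode_length = len(achieved_goals)
transition_idx = 2  # transition being relabelled

final_goal = achieved_goals[-1]                                                       # FINAL
future_goal = achieved_goals[np.random.randint(transition_idx + 1, episode_length)]   # FUTURE
episode_goal = achieved_goals[np.random.randint(episode_length)]                      # EPISODE

# Offline sampling: relabel every transition of the episode n_sampled_goal times.
n_sampled_goal = 4
transitions_indices = np.tile(np.arange(episode_length), n_sampled_goal)
print(final_goal, future_goal, episode_goal, transitions_indices)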
--- docs/modules/her.rst | 2 +- .../her/goal_selection_strategy.py | 5 -- stable_baselines3/her/her.py | 5 +- stable_baselines3/her/her_replay_buffer.py | 56 +++++++------------ tests/test_her.py | 6 +- 5 files changed, 26 insertions(+), 48 deletions(-) diff --git a/docs/modules/her.rst b/docs/modules/her.rst index 8f8c8f36c6..167e6b6ab0 100644 --- a/docs/modules/her.rst +++ b/docs/modules/her.rst @@ -46,7 +46,7 @@ Example env = BitFlippingEnv(n_bits=N_BITS, continuous=model_class in [DDPG, SAC, TD3], max_steps=N_BITS) - # Available strategies (cf paper): future, final, episode, random + # Available strategies (cf paper): future, final, episode goal_selection_strategy = 'future' # equivalent to GoalSelectionStrategy.FUTURE # If True the HER transitions will get sampled online diff --git a/stable_baselines3/her/goal_selection_strategy.py b/stable_baselines3/her/goal_selection_strategy.py index 5f434be277..d4c6a93e4f 100644 --- a/stable_baselines3/her/goal_selection_strategy.py +++ b/stable_baselines3/her/goal_selection_strategy.py @@ -15,10 +15,6 @@ class GoalSelectionStrategy(Enum): FINAL = 1 # Select a goal that was achieved in the episode EPISODE = 2 - # Select a goal that was achieved - # at some point in the training procedure - # (and that is present in the replay buffer) - RANDOM = 3 # For convenience @@ -27,5 +23,4 @@ class GoalSelectionStrategy(Enum): "future": GoalSelectionStrategy.FUTURE, "final": GoalSelectionStrategy.FINAL, "episode": GoalSelectionStrategy.EPISODE, - "random": GoalSelectionStrategy.RANDOM, } diff --git a/stable_baselines3/her/her.py b/stable_baselines3/her/her.py index e495415b03..43ef1a499e 100644 --- a/stable_baselines3/her/her.py +++ b/stable_baselines3/her/her.py @@ -369,17 +369,16 @@ def _store_transitions(self) -> None: """ # sample goals and get new observations - observations, next_observations, transitions = self._episode_storage.sample( + observations, next_observations, actions, rewards = self._episode_storage.sample( self.batch_size, self.env, self.online_sampling, self.n_sampled_goal, - self.replay_buffer.observations, ) # store data in replay buffer dones = np.zeros((len(observations)), dtype=bool) - self.replay_buffer.extend(observations, next_observations, transitions["action"], transitions["reward"], dones) + self.replay_buffer.extend(observations, next_observations, actions, rewards, dones) def __getattr__(self, item: str) -> Any: """ diff --git a/stable_baselines3/her/her_replay_buffer.py b/stable_baselines3/her/her_replay_buffer.py index b4db3b2609..bbb98ad286 100644 --- a/stable_baselines3/her/her_replay_buffer.py +++ b/stable_baselines3/her/her_replay_buffer.py @@ -81,6 +81,9 @@ def __init__( def _get_samples( self, batch_inds: np.ndarray, env: Optional[VecNormalize] = None ) -> Union[ReplayBufferSamples, RolloutBufferSamples]: + """ + Abstract method from base class. + """ raise NotImplementedError() def sample( @@ -89,7 +92,6 @@ def sample( env: Optional[VecNormalize] = None, online_sampling: bool = True, n_sampled_goal: int = None, - replay_observations: np.ndarray = None, ) -> Union[ReplayBufferSamples, Tuple]: """ :param batch_size: Number of element to sample @@ -97,18 +99,15 @@ def sample( to normalize the observations/rewards when sampling :param online_sampling: Using online_sampling for HER or not. :param n_sampled_goal: Number of sampled goals for replay. (offline sampling) - :param replay_observations: Observations of the offline replay buffer. Needed for 'RANDOM' goal strategy. :return: Samples. 
""" - return self._sample_transitions(batch_size, env, online_sampling, n_sampled_goal, replay_observations) + return self._sample_transitions(batch_size, env, online_sampling, n_sampled_goal) - def sample_goal( + def sample_goals( self, episode_indices: np.ndarray, her_indices: np.ndarray, transitions_indices: np.ndarray, - online_sampling: bool = True, - replay_observations: np.ndarray = None, ) -> np.ndarray: """ Sample goals based on goal_selection_strategy. @@ -117,8 +116,6 @@ def sample_goal( :param episode_indices: Episode indices to use. :param her_indices: HER indices. :param transitions_indices: Transition indices to use. - :param online_sampling: Using online_sampling for HER or not. - :param replay_observations: Observations of the offline replay buffer. Needed for 'RANDOM' goal strategy. :return: Return sampled goals. """ her_episode_indices = episode_indices[her_indices] @@ -137,20 +134,6 @@ def sample_goal( # replay with random state which comes from the same episode as current transition transitions_indices = np.random.randint(self.episode_lengths[her_episode_indices]) - elif self.goal_selection_strategy == GoalSelectionStrategy.RANDOM: - if online_sampling: - # replay with random state from the entire replay buffer - her_episode_indices = np.random.randint(self.n_episodes_stored, size=len(her_indices)) - transitions_indices = np.random.randint(self.episode_lengths[her_episode_indices]) - else: - # replay with random state from the entire replay buffer - index = np.random.choice(np.arange(len(replay_observations)), len(her_indices)) - obs = replay_observations[index] - # get only the observation part of the state - obs_dim = self.env.obs_dim - # get from every observation from first env the observation part (without concatenated desired goal) - obs_array = obs[:, :, :obs_dim] - return obs_array else: raise ValueError("Strategy for sampling goals not supported!") @@ -162,7 +145,6 @@ def _sample_transitions( env: Optional[VecNormalize], online_sampling: bool = True, n_sampled_goal: int = None, - replay_observations: np.ndarray = None, ) -> Union[ReplayBufferSamples, Tuple]: """ :param batch_size: Number of element to sample @@ -170,38 +152,42 @@ def _sample_transitions( to normalize the observations/rewards when sampling :param online_sampling: Using online_sampling for HER or not. :param n_sampled_goal: Number of sampled goals for replay. (offline sampling) - :param replay_observations: Observations of the offline replay buffer. Needed for 'RANDOM' goal strategy. :return: Samples. 
""" # Select which episodes to use if online_sampling: episode_indices = np.random.randint(0, self.n_episodes_stored, batch_size) her_indices = np.arange(batch_size)[: int(self.her_ratio * batch_size)] - ep_length = self.episode_lengths[episode_indices] else: episode_length = self.episode_lengths[0] - episode_indices = np.array(list(range(1)) * episode_length * n_sampled_goal) + episode_indices = np.tile(0, (episode_length * n_sampled_goal)) + # episode_indices = np.array(list(range(1)) * episode_length * n_sampled_goal) her_indices = np.arange(len(episode_indices)) - # repeat every transition index n_sampled_goals times - transitions_indices = np.array(list(range(episode_length)) * n_sampled_goal) + + ep_length = self.episode_lengths[episode_indices] if self.goal_selection_strategy == GoalSelectionStrategy.FUTURE: # restrict the sampling domain when ep_length > 1 # otherwise filter out the indices - if online_sampling: - her_indices = her_indices[ep_length[her_indices] > 1] - ep_length[her_indices] -= 1 - else: - her_indices = her_indices[episode_length > 1 and transitions_indices < episode_length - 1] + her_indices = her_indices[ep_length[her_indices] > 1] + ep_length[her_indices] -= 1 if online_sampling: # Select which transitions to use transitions_indices = np.random.randint(ep_length) + else: + if her_indices.size == 0: + return np.empty(0), np.empty(0), np.empty(0), np.empty(0) + else: + # repeat every transition index n_sampled_goals times + transitions_indices = np.tile(np.arange(ep_length[0]), n_sampled_goal) + episode_indices = episode_indices[transitions_indices] + her_indices = np.arange(len(episode_indices)) # get selected transitions transitions = {key: self.buffer[key][episode_indices, transitions_indices].copy() for key in self.buffer.keys()} - new_goals = self.sample_goal(episode_indices, her_indices, transitions_indices, online_sampling, replay_observations) + new_goals = self.sample_goals(episode_indices, her_indices, transitions_indices) transitions["desired_goal"][her_indices] = new_goals # Convert info buffer to numpy array @@ -235,7 +221,7 @@ def _sample_transitions( return ReplayBufferSamples(*tuple(map(self.to_torch, data))) else: - return observations, next_observations, transitions + return observations, next_observations, transitions["action"], transitions["reward"] def add( self, diff --git a/tests/test_her.py b/tests/test_her.py index 5ffc38288b..bb045fb29f 100644 --- a/tests/test_her.py +++ b/tests/test_her.py @@ -42,11 +42,9 @@ def test_her(model_class, online_sampling): "final", "episode", "future", - "random", - GoalSelectionStrategy.FUTURE, - GoalSelectionStrategy.RANDOM, - GoalSelectionStrategy.EPISODE, GoalSelectionStrategy.FINAL, + GoalSelectionStrategy.EPISODE, + GoalSelectionStrategy.FUTURE, ], ) @pytest.mark.parametrize("online_sampling", [True, False]) From 381d927da93568add74004d1261a628ee960f2d8 Mon Sep 17 00:00:00 2001 From: Megan Klaiber Date: Wed, 14 Oct 2020 15:48:27 +0200 Subject: [PATCH 53/81] Updated migration. Removed time signal removal. --- docs/guide/migration.rst | 7 +++++++ stable_baselines3/her/her.py | 22 ++++------------------ 2 files changed, 11 insertions(+), 18 deletions(-) diff --git a/docs/guide/migration.rst b/docs/guide/migration.rst index 82fbc69bdf..89a1df2928 100644 --- a/docs/guide/migration.rst +++ b/docs/guide/migration.rst @@ -163,6 +163,13 @@ Despite this change, no change in performance should be expected. 
To match SB2 behavior, you need to explicitly pass ``deterministic=True`` +HER +^^^ + +The HER implementation now also supports online sampling of the new goals. This is done in a vectorized version. +The goal selection strategy ``RANDOM`` is no longer supported. + + New logger API -------------- diff --git a/stable_baselines3/her/her.py b/stable_baselines3/her/her.py index 43ef1a499e..ff502598a5 100644 --- a/stable_baselines3/her/her.py +++ b/stable_baselines3/her/her.py @@ -287,30 +287,16 @@ def collect_rollouts( self._last_original_obs, new_obs_, reward_ = observation, new_obs, reward self.model._last_original_obs = self._last_original_obs - # Remove termination signal due to timelimit if needed - # NOTE: this may cause issue when using memory optimized replay - # or n-step replay - if self.remove_time_limit_termination and infos[0].get("TimeLimit.truncated", False): - done_ = np.array([False]) - # As the VecEnv resets automatically, new_obs is already the - # first observation of the next episode - next_obs = infos[0]["terminal_observation"] - if self._vec_normalize_env is not None: - next_obs = self._vec_normalize_env.unnormalize_obs(next_obs) - else: - done_ = done - next_obs = new_obs_ - if self.online_sampling: - self.replay_buffer.add(self._last_original_obs, next_obs, buffer_action, reward_, done_, infos) + self.replay_buffer.add(self._last_original_obs, new_obs_, buffer_action, reward_, done, infos) else: # concatenate observation with (desired) goal obs = ObsDictWrapper.convert_dict(self._last_original_obs) - next_obs_ = ObsDictWrapper.convert_dict(next_obs) + next_obs = ObsDictWrapper.convert_dict(new_obs_) # add to replay buffer - self.replay_buffer.add(obs, next_obs_, buffer_action, reward_, done_) + self.replay_buffer.add(obs, next_obs, buffer_action, reward_, done) # add current transition to episode storage - self._episode_storage.add(self._last_original_obs, next_obs, buffer_action, reward_, done_, infos) + self._episode_storage.add(self._last_original_obs, new_obs_, buffer_action, reward_, done, infos) self._last_obs = new_obs self.model._last_obs = self._last_obs From c10b26aec466f9366d2c279a699ab6317ca97397 Mon Sep 17 00:00:00 2001 From: Antonin Raffin Date: Wed, 14 Oct 2020 18:10:15 +0200 Subject: [PATCH 54/81] Update doc --- docs/guide/examples.rst | 75 +++++++++++++++++++++++++++++++++++++++-- 1 file changed, 73 insertions(+), 2 deletions(-) diff --git a/docs/guide/examples.rst b/docs/guide/examples.rst index a6b804071b..49b2b3c2fa 100644 --- a/docs/guide/examples.rst +++ b/docs/guide/examples.rst @@ -18,8 +18,7 @@ notebooks: - `Atari Games`_ - `RL Baselines zoo`_ - `PyBullet`_ - -.. - `Hindsight Experience Replay`_ +- `Hindsight Experience Replay`_ .. _Getting Started: https://colab.research.google.com/github/Stable-Baselines-Team/rl-colab-notebooks/blob/sb3/stable_baselines_getting_started.ipynb .. _Training, Saving, Loading: https://colab.research.google.com/github/Stable-Baselines-Team/rl-colab-notebooks/blob/sb3/saving_loading_dqn.ipynb @@ -343,6 +342,78 @@ will compute a running average and standard deviation of input features (it can env.norm_reward = False +Hindsight Experience Replay (HER) +--------------------------------- + +For this example, we are using `Highway-Env `_ by `@eleurent `_. + + +.. image:: ../_static/img/colab-badge.svg + :target: https://colab.research.google.com/github/Stable-Baselines-Team/rl-colab-notebooks/blob/sb3/stable_baselines_her.ipynb + + +.. 
figure:: https://raw.githubusercontent.com/eleurent/highway-env/gh-media/docs/media/parking-env.gif + + The highway-parking-v0 environment. + +The parking env is a goal-conditioned continuous control task, in which the vehicle must park in a given space with the appropriate heading. + +.. note:: + + The hyperparameters in the following example were optimized for that environment. + + +.. code-block:: python + + import gym + import highway_env + import numpy as np + + from stable_baselines3 import HER, SAC, DDPG, TD3 + from stable_baselines3.common.noise import NormalActionNoise + + env = gym.make("parking-v0") + + # Create 4 artificial transitions per real transition + n_sampled_goal = 4 + + # SAC hyperparams: + model = HER( + "MlpPolicy", + env, + SAC, + n_sampled_goal=n_sampled_goal, + goal_selection_strategy="future", + verbose=1, + buffer_size=int(1e6), + learning_rate=1e-3, + gamma=0.95, + batch_size=256, + online_sampling=True, + policy_kwargs=dict(net_arch=[256, 256, 256]), + ) + + model.learn(int(2e5)) + model.save("her_sac_highway") + + # Load saved model + model = HER.load("her_sac_highway", env=env) + + obs = env.reset() + + # Evaluate the agent + episode_reward = 0 + for _ in range(100): + action, _ = model.predict(obs, deterministic=True) + obs, reward, done, info = env.step(action) + env.render() + episode_reward += reward + if done or info.get("is_success", False): + print("Reward:", episode_reward, "Success?", info.get("is_success", False)) + episode_reward = 0.0 + obs = env.reset() + + Record a Video -------------- From 9d5c83ebe315fddf9407dd621a7bfad1891c4dc1 Mon Sep 17 00:00:00 2001 From: Antonin Raffin Date: Wed, 14 Oct 2020 18:10:36 +0200 Subject: [PATCH 55/81] Fix potential load issue --- docs/misc/changelog.rst | 1 + stable_baselines3/common/base_class.py | 31 +++++++++++++------- stable_baselines3/common/vec_env/__init__.py | 11 +++++++ stable_baselines3/her/her.py | 23 ++++----------- 4 files changed, 39 insertions(+), 27 deletions(-) diff --git a/docs/misc/changelog.rst b/docs/misc/changelog.rst index fc198a6fab..6671d68233 100644 --- a/docs/misc/changelog.rst +++ b/docs/misc/changelog.rst @@ -18,6 +18,7 @@ New Features: Bug Fixes: ^^^^^^^^^^ - Fix GAE computation for on-policy algorithms (off-by one for the last value) (thanks @Wovchena) +- Fixed potential issue when loading a different environment Deprecations: ^^^^^^^^^^^^^ diff --git a/stable_baselines3/common/base_class.py b/stable_baselines3/common/base_class.py index 97d0035599..f270c6032f 100644 --- a/stable_baselines3/common/base_class.py +++ b/stable_baselines3/common/base_class.py @@ -26,7 +26,14 @@ set_random_seed, update_learning_rate, ) -from stable_baselines3.common.vec_env import DummyVecEnv, VecEnv, VecNormalize, VecTransposeImage, unwrap_vec_normalize +from stable_baselines3.common.vec_env import ( + DummyVecEnv, + VecEnv, + VecNormalize, + VecTransposeImage, + is_wrapped, + unwrap_vec_normalize, +) from stable_baselines3.common.vec_env.obs_dict_wrapper import ObsDictWrapper @@ -147,7 +154,7 @@ def __init__( self.eval_env = maybe_make_env(env, monitor_wrapper, self.verbose) env = maybe_make_env(env, monitor_wrapper, self.verbose) - env = self._wrap_env(env) + env = self._wrap_env(env, self.verbose) self.observation_space = env.observation_space self.action_space = env.action_space @@ -162,14 +169,15 @@ def __init__( if self.use_sde and not isinstance(self.action_space, gym.spaces.Box): raise ValueError("generalized State-Dependent Exploration (gSDE) can only be used with continuous actions.") - 
def _wrap_env(self, env: GymEnv) -> VecEnv: + @staticmethod + def _wrap_env(env: GymEnv, verbose: int = 0) -> VecEnv: if not isinstance(env, VecEnv): - if self.verbose >= 1: + if verbose >= 1: print("Wrapping the env in a DummyVecEnv.") env = DummyVecEnv([lambda: env]) - if is_image_space(env.observation_space) and not isinstance(env, VecTransposeImage): - if self.verbose >= 1: + if is_image_space(env.observation_space) and not is_wrapped(env, VecTransposeImage): + if verbose >= 1: print("Wrapping the env in a VecTransposeImage.") env = VecTransposeImage(env) @@ -194,7 +202,7 @@ def _get_eval_env(self, eval_env: Optional[GymEnv]) -> Optional[GymEnv]: eval_env = self.eval_env if eval_env is not None: - eval_env = self._wrap_env(eval_env) + eval_env = self._wrap_env(eval_env, self.verbose) assert eval_env.num_envs == 1 return eval_env @@ -408,10 +416,11 @@ def set_env(self, env: GymEnv) -> None: :param env: The environment for learning a policy """ - check_for_correct_spaces(env, self.observation_space, self.action_space) - # it must be coherent now # if it is not a VecEnv, make it a VecEnv - env = self._wrap_env(env) + # and do other transformations (dict obs, image transpose) if needed + env = self._wrap_env(env, self.verbose) + # Check that the observation spaces match + check_for_correct_spaces(env, self.observation_space, self.action_space) self.n_envs = env.num_envs self.env = env @@ -582,6 +591,8 @@ def load( raise KeyError("The observation_space and action_space were not given, can't verify new environments") if env is not None: + # Wrap first if needed + cls._wrap_env(env, data["verbose"]) # Check if given env is valid check_for_correct_spaces(env, data["observation_space"], data["action_space"]) else: diff --git a/stable_baselines3/common/vec_env/__init__.py b/stable_baselines3/common/vec_env/__init__.py index 1940f20c04..0002788895 100644 --- a/stable_baselines3/common/vec_env/__init__.py +++ b/stable_baselines3/common/vec_env/__init__.py @@ -41,6 +41,17 @@ def unwrap_vec_normalize(env: Union["GymEnv", VecEnv]) -> Optional[VecNormalize] return unwrap_vec_wrapper(env, VecNormalize) # pytype:disable=bad-return-type +def is_wrapped(env: Union["GymEnv", VecEnv], vec_wrapper_class: Type[VecEnvWrapper]) -> bool: + """ + Check if an environment is already wrapped by a given ``VecEnvWrapper``. 
+ + :param env: + :param vec_wrapper_class: + :return: + """ + return unwrap_vec_wrapper(env, vec_wrapper_class) is not None + + # Define here to avoid circular import def sync_envs_normalization(env: "GymEnv", eval_env: "GymEnv") -> None: """ diff --git a/stable_baselines3/her/her.py b/stable_baselines3/her/her.py index ff502598a5..eb4a13c516 100644 --- a/stable_baselines3/her/her.py +++ b/stable_baselines3/her/her.py @@ -1,8 +1,7 @@ import io import pathlib -from typing import Any, Callable, Iterable, List, Optional, Tuple, Type, Union +from typing import Any, Iterable, List, Optional, Tuple, Type, Union -import gym import numpy as np import torch as th @@ -12,11 +11,10 @@ from stable_baselines3.common.noise import ActionNoise from stable_baselines3.common.off_policy_algorithm import OffPolicyAlgorithm from stable_baselines3.common.policies import BasePolicy -from stable_baselines3.common.preprocessing import is_image_space -from stable_baselines3.common.save_util import load_from_zip_file, recursive_getattr, recursive_setattr +from stable_baselines3.common.save_util import load_from_zip_file, recursive_setattr from stable_baselines3.common.type_aliases import GymEnv, MaybeCallback, RolloutReturn from stable_baselines3.common.utils import check_for_correct_spaces -from stable_baselines3.common.vec_env import DummyVecEnv, VecEnv, VecTransposeImage +from stable_baselines3.common.vec_env import VecEnv from stable_baselines3.common.vec_env.obs_dict_wrapper import ObsDictWrapper from stable_baselines3.her.goal_selection_strategy import KEY_TO_GOAL_STRATEGY, GoalSelectionStrategy from stable_baselines3.her.her_replay_buffer import HerReplayBuffer @@ -443,19 +441,10 @@ def load( # check if given env is valid if env is not None: - # check if wrapper for dict support is needed - if not isinstance(env, VecEnv): - env = DummyVecEnv([lambda: env]) - - if is_image_space(env.observation_space) and not isinstance(env, VecTransposeImage): - env = VecTransposeImage(env) - - # check if wrapper for dict support when using HER is needed - if isinstance(env.observation_space, gym.spaces.dict.Dict): - env = ObsDictWrapper(env) - + # Wrap first if needed + env = cls._wrap_env(env, data["verbose"]) + # Check if given env is valid check_for_correct_spaces(env, data["observation_space"], data["action_space"]) - # if no new env was given use stored env if possible else: # Use stored env, if one exists. 
If not, continue as is (can be used for predict) if "env" in data: From 46c6d29a5643ac68adcdfe17730ed14c5bd06159 Mon Sep 17 00:00:00 2001 From: Antonin RAFFIN Date: Fri, 16 Oct 2020 15:36:39 +0200 Subject: [PATCH 56/81] Add VecNormalize support for dict obs --- Makefile | 2 +- docs/guide/migration.rst | 4 +- docs/misc/changelog.rst | 1 + docs/modules/her.rst | 5 + stable_baselines3/common/bit_flipping_env.py | 6 +- stable_baselines3/common/buffers.py | 8 +- stable_baselines3/common/utils.py | 11 +- .../common/vec_env/vec_normalize.py | 80 ++++++++++--- stable_baselines3/her/her.py | 11 +- stable_baselines3/her/her_replay_buffer.py | 10 +- tests/test_her.py | 25 ++-- tests/test_vec_normalize.py | 109 +++++++++++++++--- 12 files changed, 202 insertions(+), 70 deletions(-) diff --git a/Makefile b/Makefile index 749bc026b2..9954c7d7b1 100644 --- a/Makefile +++ b/Makefile @@ -5,7 +5,7 @@ pytest: ./scripts/run_tests.sh type: - pytype + pytype -j auto lint: # stop the build if there are Python syntax errors or undefined names diff --git a/docs/guide/migration.rst b/docs/guide/migration.rst index 89a1df2928..f1e0225a01 100644 --- a/docs/guide/migration.rst +++ b/docs/guide/migration.rst @@ -166,9 +166,9 @@ Despite this change, no change in performance should be expected. HER ^^^ -The HER implementation now also supports online sampling of the new goals. This is done in a vectorized version. +The ``HER`` implementation now also supports online sampling of the new goals. This is done in a vectorized version. The goal selection strategy ``RANDOM`` is no longer supported. - +``HER`` now supports ``VecNormalize`` wrapper but only when ``online_sampling=True`` New logger API diff --git a/docs/misc/changelog.rst b/docs/misc/changelog.rst index ba54366f60..03cacf45a1 100644 --- a/docs/misc/changelog.rst +++ b/docs/misc/changelog.rst @@ -14,6 +14,7 @@ New Features: ^^^^^^^^^^^^^ - Allow custom actor/critic network architectures using ``net_arch=dict(qf=[400, 300], pi=[64, 64])`` for off-policy algorithms (SAC, TD3, DDPG) - Added Hindsight Experience Replay ``HER``. (@megan-klaiber) +- ``VecNormalize`` now supports ``gym.spaces.Dict`` observation spaces Bug Fixes: ^^^^^^^^^^ diff --git a/docs/modules/her.rst b/docs/modules/her.rst index 167e6b6ab0..6befbc1731 100644 --- a/docs/modules/her.rst +++ b/docs/modules/her.rst @@ -15,6 +15,11 @@ HER is an algorithm that works with Off policy methods (DQN, SAC, TD3 and DDPG f HER requires the environment to inherits from `gym.GoalEnv `_ +.. 
warning:: + + ``HER`` supports ``VecNormalize`` wrapper but only when ``online_sampling=True`` + + Notes ----- diff --git a/stable_baselines3/common/bit_flipping_env.py b/stable_baselines3/common/bit_flipping_env.py index 999ada32e2..d38ff73cc7 100644 --- a/stable_baselines3/common/bit_flipping_env.py +++ b/stable_baselines3/common/bit_flipping_env.py @@ -1,5 +1,5 @@ from collections import OrderedDict -from typing import Dict, Optional, Union +from typing import Any, Dict, Optional, Union import numpy as np from gym import GoalEnv, spaces @@ -111,7 +111,9 @@ def step(self, action: Union[np.ndarray, int]) -> GymStepReturn: done = done or self.current_step >= self.max_steps return obs, reward, done, info - def compute_reward(self, achieved_goal: np.ndarray, desired_goal: np.ndarray, _info) -> np.float32: + def compute_reward( + self, achieved_goal: Union[int, np.ndarray], desired_goal: Union[int, np.ndarray], _info: Optional[Dict[str, Any]] + ) -> np.float32: # Deceptive reward: it is positive only when the goal is achieved # vectorized version distance = np.linalg.norm(achieved_goal - desired_goal, axis=-1) diff --git a/stable_baselines3/common/buffers.py b/stable_baselines3/common/buffers.py index 563ddea755..83e6554898 100644 --- a/stable_baselines3/common/buffers.py +++ b/stable_baselines3/common/buffers.py @@ -1,6 +1,6 @@ import warnings from abc import ABC, abstractmethod -from typing import Generator, Optional, Union +from typing import Dict, Generator, Optional, Union import numpy as np import torch as th @@ -129,9 +129,11 @@ def to_torch(self, array: np.ndarray, copy: bool = True) -> th.Tensor: return th.as_tensor(array).to(self.device) @staticmethod - def _normalize_obs(obs: np.ndarray, env: Optional[VecNormalize] = None) -> np.ndarray: + def _normalize_obs( + obs: Union[np.ndarray, Dict[str, np.ndarray]], env: Optional[VecNormalize] = None + ) -> Union[np.ndarray, Dict[str, np.ndarray]]: if env is not None: - return env.normalize_obs(obs).astype(np.float32) + return env.normalize_obs(obs) return obs @staticmethod diff --git a/stable_baselines3/common/utils.py b/stable_baselines3/common/utils.py index 57872657b8..2f02ed74db 100644 --- a/stable_baselines3/common/utils.py +++ b/stable_baselines3/common/utils.py @@ -16,9 +16,7 @@ SummaryWriter = None from stable_baselines3.common import logger -from stable_baselines3.common.preprocessing import is_image_space from stable_baselines3.common.type_aliases import GymEnv -from stable_baselines3.common.vec_env import VecTransposeImage def set_random_seed(seed: int, using_cuda: bool = False) -> None: @@ -204,14 +202,7 @@ def check_for_correct_spaces(env: GymEnv, observation_space: gym.spaces.Space, a :param observation_space: Observation space to check against :param action_space: Action space to check against """ - if ( - observation_space != env.observation_space - # Special cases for images that need to be transposed - and not ( - is_image_space(env.observation_space) - and observation_space == VecTransposeImage.transpose_space(env.observation_space) - ) - ): + if observation_space != env.observation_space: raise ValueError(f"Observation spaces do not match: {observation_space} != {env.observation_space}") if action_space != env.action_space: raise ValueError(f"Action spaces do not match: {action_space} != {env.action_space}") diff --git a/stable_baselines3/common/vec_env/vec_normalize.py b/stable_baselines3/common/vec_env/vec_normalize.py index 39a5d1128a..fcdefd8edd 100644 --- a/stable_baselines3/common/vec_env/vec_normalize.py +++ 
b/stable_baselines3/common/vec_env/vec_normalize.py @@ -1,8 +1,11 @@ import pickle -from typing import Any, Dict +from copy import deepcopy +from typing import Any, Dict, Union +import gym import numpy as np +from stable_baselines3.common import utils from stable_baselines3.common.running_mean_std import RunningMeanStd from stable_baselines3.common.vec_env.base_vec_env import VecEnv, VecEnvStepReturn, VecEnvWrapper @@ -34,7 +37,19 @@ def __init__( epsilon: float = 1e-8, ): VecEnvWrapper.__init__(self, venv) - self.obs_rms = RunningMeanStd(shape=self.observation_space.shape) + + assert isinstance( + self.observation_space, (gym.spaces.Box, gym.spaces.Dict) + ), "VecNormalize only support `gym.spaces.Box` and `gym.spaces.Dict` observation spaces" + + if isinstance(self.observation_space, gym.spaces.Dict): + self.obs_keys = set(self.observation_space.spaces.keys()) + self.obs_spaces = self.observation_space.spaces + self.obs_rms = {key: RunningMeanStd(shape=space.shape) for key, space in self.obs_spaces.items()} + else: + self.obs_keys, self.obs_spaces = None, None + self.obs_rms = RunningMeanStd(shape=self.observation_space.shape) + self.ret_rms = RunningMeanStd(shape=()) self.clip_obs = clip_obs self.clip_reward = clip_reward @@ -83,8 +98,9 @@ def set_venv(self, venv: VecEnv) -> None: if self.venv is not None: raise ValueError("Trying to set venv of already initialized VecNormalize wrapper.") VecEnvWrapper.__init__(self, venv) - if self.obs_rms.mean.shape != self.observation_space.shape: - raise ValueError("venv is incompatible with current statistics.") + + # Check only that the observation_space match + utils.check_for_correct_spaces(venv, self.observation_space, venv.action_space) self.ret = np.zeros(self.num_envs) def step_wait(self) -> VecEnvStepReturn: @@ -99,7 +115,12 @@ def step_wait(self) -> VecEnvStepReturn: self.old_reward = rews if self.training: - self.obs_rms.update(obs) + if isinstance(obs, dict) and isinstance(self.obs_rms, dict): + for key in self.obs_rms.keys(): + self.obs_rms[key].update(obs[key]) + else: + self.obs_rms.update(obs) + obs = self.normalize_obs(obs) if self.training: @@ -114,14 +135,38 @@ def _update_reward(self, reward: np.ndarray) -> None: self.ret = self.ret * self.gamma + reward self.ret_rms.update(self.ret) - def normalize_obs(self, obs: np.ndarray) -> np.ndarray: + def _normalize_obs(self, obs: np.ndarray, obs_rms: RunningMeanStd) -> np.ndarray: + """ + Helper to normalize observation. + :param obs: + :param obs_rms: associated statistics + :return: normalized observation + """ + return np.clip((obs - obs_rms.mean) / np.sqrt(obs_rms.var + self.epsilon), -self.clip_obs, self.clip_obs) + + def _unnormalize_obs(self, obs: np.ndarray, obs_rms: RunningMeanStd) -> np.ndarray: + """ + Helper to unnormalize observation. + :param obs: + :param obs_rms: associated statistics + :return: unnormalized observation + """ + return (obs * np.sqrt(obs_rms.var + self.epsilon)) + obs_rms.mean + + def normalize_obs(self, obs: Union[np.ndarray, Dict[str, np.ndarray]]) -> Union[np.ndarray, Dict[str, np.ndarray]]: """ Normalize observations using this VecNormalize's observations statistics. Calling this method does not update statistics. 
""" + # Avoid modifying by reference the original object + obs_ = deepcopy(obs) if self.norm_obs: - obs = np.clip((obs - self.obs_rms.mean) / np.sqrt(self.obs_rms.var + self.epsilon), -self.clip_obs, self.clip_obs) - return obs + if isinstance(obs, dict) and isinstance(self.obs_rms, dict): + for key in self.obs_rms.keys(): + obs_[key] = self._normalize_obs(obs[key], self.obs_rms[key]).astype(np.float32) + else: + obs_ = self._normalize_obs(obs, self.obs_rms).astype(np.float32) + return obs_ def normalize_reward(self, reward: np.ndarray) -> np.ndarray: """ @@ -132,22 +177,28 @@ def normalize_reward(self, reward: np.ndarray) -> np.ndarray: reward = np.clip(reward / np.sqrt(self.ret_rms.var + self.epsilon), -self.clip_reward, self.clip_reward) return reward - def unnormalize_obs(self, obs: np.ndarray) -> np.ndarray: + def unnormalize_obs(self, obs: Union[np.ndarray, Dict[str, np.ndarray]]) -> Union[np.ndarray, Dict[str, np.ndarray]]: + # Avoid modifying by reference the original object + obs_ = deepcopy(obs) if self.norm_obs: - return (obs * np.sqrt(self.obs_rms.var + self.epsilon)) + self.obs_rms.mean - return obs + if isinstance(obs, dict) and isinstance(self.obs_rms, dict): + for key in self.obs_rms.keys(): + obs_[key] = self._unnormalize_obs(obs[key], self.obs_rms[key]) + else: + obs_ = self._unnormalize_obs(obs, self.obs_rms) + return obs_ def unnormalize_reward(self, reward: np.ndarray) -> np.ndarray: if self.norm_reward: return reward * np.sqrt(self.ret_rms.var + self.epsilon) return reward - def get_original_obs(self) -> np.ndarray: + def get_original_obs(self) -> Union[np.ndarray, Dict[str, np.ndarray]]: """ Returns an unnormalized version of the observations from the most recent step or reset. """ - return self.old_obs.copy() + return deepcopy(self.old_obs) def get_original_reward(self) -> np.ndarray: """ @@ -155,9 +206,10 @@ def get_original_reward(self) -> np.ndarray: """ return self.old_reward.copy() - def reset(self) -> np.ndarray: + def reset(self) -> Union[np.ndarray, Dict[str, np.ndarray]]: """ Reset all environments + :return: first observation of the episode """ obs = self.venv.reset() self.old_obs = obs diff --git a/stable_baselines3/her/her.py b/stable_baselines3/her/her.py index eb4a13c516..bd71b3e07f 100644 --- a/stable_baselines3/her/her.py +++ b/stable_baselines3/her/her.py @@ -6,7 +6,6 @@ import torch as th from stable_baselines3.common.base_class import BaseAlgorithm -from stable_baselines3.common.buffers import ReplayBuffer from stable_baselines3.common.callbacks import BaseCallback from stable_baselines3.common.noise import ActionNoise from stable_baselines3.common.off_policy_algorithm import OffPolicyAlgorithm @@ -81,6 +80,9 @@ def __init__( super(HER, self).__init__(policy=BasePolicy, env=env, policy_base=BasePolicy, learning_rate=3e-4) + if self.get_vec_normalize_env() is not None: + assert online_sampling, "You must pass `online_sampling=True` if you want to use `VecNormalize` with `HER`" + # model initialization self.model_class = model_class self.model = model_class( @@ -179,7 +181,6 @@ def learn( action_noise=self.action_noise, callback=callback, learning_starts=self.learning_starts, - replay_buffer=self.replay_buffer, log_interval=log_interval, ) @@ -204,7 +205,6 @@ def collect_rollouts( n_steps: int = -1, action_noise: Optional[ActionNoise] = None, learning_starts: int = 0, - replay_buffer: Union[ReplayBuffer, HerReplayBuffer] = None, log_interval: Optional[int] = None, ) -> RolloutReturn: """ @@ -221,7 +221,6 @@ def collect_rollouts( Required for 
deterministic policy (e.g. TD3). This can also be used in addition to the stochastic policy for SAC. :param learning_starts: Number of steps before learning for the warm-up phase. - :param replay_buffer: :param log_interval: Log data every ``log_interval`` episodes :return: """ @@ -275,7 +274,7 @@ def collect_rollouts( self.model.ep_success_buffer = self.ep_success_buffer # Store episode in episode storage - if replay_buffer is not None: + if self.replay_buffer is not None: # Store only the unnormalized version if self._vec_normalize_env is not None: new_obs_ = self._vec_normalize_env.get_original_obs() @@ -355,7 +354,7 @@ def _store_transitions(self) -> None: # sample goals and get new observations observations, next_observations, actions, rewards = self._episode_storage.sample( self.batch_size, - self.env, + self.get_vec_normalize_env(), self.online_sampling, self.n_sampled_goal, ) diff --git a/stable_baselines3/her/her_replay_buffer.py b/stable_baselines3/her/her_replay_buffer.py index bbb98ad286..f2d33af6bd 100644 --- a/stable_baselines3/her/her_replay_buffer.py +++ b/stable_baselines3/her/her_replay_buffer.py @@ -207,14 +207,16 @@ def _sample_transitions( ) # concatenate observation with (desired) goal - observations = ObsDictWrapper.convert_dict(transitions) - next_observations = ObsDictWrapper.convert_dict(transitions, observation_key="next_obs") + observations = ObsDictWrapper.convert_dict(self._normalize_obs(transitions, env)) + # HACK to make normalize obs work with the next observation + transitions["observation"] = transitions["next_obs"] + next_observations = ObsDictWrapper.convert_dict(self._normalize_obs(transitions, env)) if online_sampling: data = ( - self._normalize_obs(observations[:, 0], env), + observations[:, 0], transitions["action"], - self._normalize_obs(next_observations[:, 0], env), + next_observations[:, 0], transitions["done"], self._normalize_reward(transitions["reward"], env), ) diff --git a/tests/test_her.py b/tests/test_her.py index bb045fb29f..9989663f0a 100644 --- a/tests/test_her.py +++ b/tests/test_her.py @@ -31,9 +31,10 @@ def test_her(model_class, online_sampling): n_episodes_rollout=-1, max_episode_length=n_bits, policy_kwargs=dict(net_arch=[64]), + learning_starts=100, ) - model.learn(total_timesteps=500) + model.learn(total_timesteps=300) @pytest.mark.parametrize( @@ -65,8 +66,9 @@ def test_goal_selection_strategy(goal_selection_strategy, online_sampling): n_episodes_rollout=-1, max_episode_length=10, policy_kwargs=dict(net_arch=[64]), + learning_starts=100, ) - model.learn(total_timesteps=200) + model.learn(total_timesteps=300) @pytest.mark.parametrize("model_class", [SAC, TD3, DDPG, DQN]) @@ -99,7 +101,8 @@ def test_save_load(tmp_path, model_class, use_sde): buffer_size=int(1e6), gamma=0.98, gradient_steps=1, - train_freq=1, + train_freq=4, + learning_starts=100, n_episodes_rollout=-1, max_episode_length=n_bits, **kwargs @@ -152,17 +155,18 @@ def test_save_load(tmp_path, model_class, use_sde): assert np.allclose(selected_actions, new_selected_actions, 1e-4) # check if learn still works - model.learn(total_timesteps=1000, eval_freq=500) + model.learn(total_timesteps=300) # clear file from os os.remove(tmp_path / "test_save.zip") @pytest.mark.parametrize("online_sampling", [False, True]) -@pytest.mark.parametrize("n_bits", [15]) -def test_dqn_her(online_sampling, n_bits): +@pytest.mark.parametrize("n_bits", [10]) +def test_performance_her(online_sampling, n_bits): """ - Test HER with DQN for BitFlippingEnv. 
+ That that DQN+HER can solve BitFlippingEnv. + It should not work when n_sampled_goal=0 (DQN alone). """ env = BitFlippingEnv(n_bits=n_bits, continuous=False) @@ -174,7 +178,7 @@ def test_dqn_her(online_sampling, n_bits): goal_selection_strategy="future", online_sampling=online_sampling, verbose=1, - learning_rate=0.0005, + learning_rate=5e-4, max_episode_length=n_bits, train_freq=1, learning_starts=100, @@ -184,6 +188,7 @@ def test_dqn_her(online_sampling, n_bits): batch_size=32, ) - model.learn(total_timesteps=10000) + model.learn(total_timesteps=5000, log_interval=50) - assert np.mean(model.ep_success_buffer) > 0.0 + # 90% training success + assert np.mean(model.ep_success_buffer) > 0.90 diff --git a/tests/test_vec_normalize.py b/tests/test_vec_normalize.py index 311e3c92e1..75b017c782 100644 --- a/tests/test_vec_normalize.py +++ b/tests/test_vec_normalize.py @@ -1,8 +1,9 @@ import gym import numpy as np import pytest +from gym import spaces -from stable_baselines3 import SAC, TD3 +from stable_baselines3 import HER, SAC, TD3 from stable_baselines3.common.running_mean_std import RunningMeanStd from stable_baselines3.common.vec_env import ( DummyVecEnv, @@ -15,14 +16,68 @@ ENV_ID = "Pendulum-v0" +class DummyDictEnv(gym.GoalEnv): + """ + Dummy gym goal env for testing purposes + """ + + def __init__(self): + super(DummyDictEnv, self).__init__() + self.observation_space = spaces.Dict( + { + "observation": spaces.Box(low=-20.0, high=20.0, shape=(4,), dtype=np.float32), + "achieved_goal": spaces.Box(low=-20.0, high=20.0, shape=(4,), dtype=np.float32), + "desired_goal": spaces.Box(low=-20.0, high=20.0, shape=(4,), dtype=np.float32), + } + ) + self.action_space = spaces.Box(low=-1, high=1, shape=(3,), dtype=np.float32) + + def reset(self): + return self.observation_space.sample() + + def step(self, action): + obs = self.observation_space.sample() + reward = self.compute_reward(obs["achieved_goal"], obs["desired_goal"], {}) + done = np.random.rand() > 0.8 + return obs, reward, done, {} + + def compute_reward(self, achieved_goal: np.ndarray, desired_goal: np.ndarray, _info) -> np.float32: + distance = np.linalg.norm(achieved_goal - desired_goal, axis=-1) + return -(distance > 0).astype(np.float32) + + +def allclose(obs_1, obs_2): + """ + Generalized np.allclose() to work with dict spaces. 
+ """ + if isinstance(obs_1, dict): + all_close = True + for key in obs_1.keys(): + if not np.allclose(obs_1[key], obs_2[key]): + all_close = False + break + return all_close + return np.allclose(obs_1, obs_2) + + def make_env(): return gym.make(ENV_ID) +def make_dict_env(): + return DummyDictEnv() + + def check_rms_equal(rmsa, rmsb): - assert np.all(rmsa.mean == rmsb.mean) - assert np.all(rmsa.var == rmsb.var) - assert np.all(rmsa.count == rmsb.count) + if isinstance(rmsa, dict): + for key in rmsa.keys(): + assert np.all(rmsa[key].mean == rmsb[key].mean) + assert np.all(rmsa[key].var == rmsb[key].var) + assert np.all(rmsa[key].count == rmsb[key].count) + else: + assert np.all(rmsa.mean == rmsb.mean) + assert np.all(rmsa.var == rmsb.var) + assert np.all(rmsa.count == rmsb.count) def check_vec_norm_equal(norma, normb): @@ -56,6 +111,19 @@ def _make_warmstart_cartpole(): return venv +def _make_warmstart_dict_env(): + """Warm-start VecNormalize by stepping through BitFlippingEnv""" + venv = DummyVecEnv([make_dict_env]) + venv = VecNormalize(venv) + venv.reset() + venv.get_original_obs() + + for _ in range(100): + actions = [venv.action_space.sample()] + venv.step(actions) + return venv + + def test_runningmeanstd(): """Test RunningMeanStd object""" for (x_1, x_2, x_3) in [ @@ -123,21 +191,24 @@ def test_normalize_external(): assert np.all(norm_rewards < 1) -@pytest.mark.parametrize("model_class", [SAC, TD3]) +@pytest.mark.parametrize("model_class", [SAC, TD3, HER]) def test_offpolicy_normalization(model_class): - env = DummyVecEnv([make_env]) + make_env_ = make_dict_env if model_class == HER else make_env + env = DummyVecEnv([make_env_]) env = VecNormalize(env, norm_obs=True, norm_reward=True, clip_obs=10.0, clip_reward=10.0) - eval_env = DummyVecEnv([make_env]) + eval_env = DummyVecEnv([make_env_]) eval_env = VecNormalize(eval_env, training=False, norm_obs=True, norm_reward=False, clip_obs=10.0, clip_reward=10.0) - model = model_class("MlpPolicy", env, verbose=1, policy_kwargs=dict(net_arch=[64])) - model.learn(total_timesteps=1000, eval_env=eval_env, eval_freq=500) + kwargs = dict(model_class=SAC, max_episode_length=200, online_sampling=True) if model_class == HER else {} + model = model_class("MlpPolicy", env, verbose=1, learning_starts=100, policy_kwargs=dict(net_arch=[64]), **kwargs) + model.learn(total_timesteps=500, eval_env=eval_env, eval_freq=250) # Check getter assert isinstance(model.get_vec_normalize_env(), VecNormalize) -def test_sync_vec_normalize(): +@pytest.mark.parametrize("make_env", [make_env, make_dict_env]) +def test_sync_vec_normalize(make_env): env = DummyVecEnv([make_env]) assert unwrap_vec_normalize(env) is None @@ -146,13 +217,15 @@ def test_sync_vec_normalize(): assert isinstance(unwrap_vec_normalize(env), VecNormalize) - env = VecFrameStack(env, 1) - - assert isinstance(unwrap_vec_normalize(env), VecNormalize) + if not isinstance(env.observation_space, spaces.Dict): + env = VecFrameStack(env, 1) + assert isinstance(unwrap_vec_normalize(env), VecNormalize) eval_env = DummyVecEnv([make_env]) eval_env = VecNormalize(eval_env, training=False, norm_obs=True, norm_reward=True, clip_obs=100.0, clip_reward=100.0) - eval_env = VecFrameStack(eval_env, 1) + + if not isinstance(env.observation_space, spaces.Dict): + eval_env = VecFrameStack(eval_env, 1) env.seed(0) env.action_space.seed(0) @@ -171,12 +244,12 @@ def test_sync_vec_normalize(): dummy_rewards = np.random.rand(10) original_obs = env.get_original_obs() # Check that unnormalization works - assert 
np.allclose(original_obs, env.unnormalize_obs(obs)) + assert allclose(original_obs, env.unnormalize_obs(obs)) # Normalization must be different (between different environments) - assert not np.allclose(obs, eval_env.normalize_obs(original_obs)) + assert not allclose(obs, eval_env.normalize_obs(original_obs)) # Test syncing of parameters sync_envs_normalization(env, eval_env) # Now they must be synced - assert np.allclose(obs, eval_env.normalize_obs(original_obs)) - assert np.allclose(env.normalize_reward(dummy_rewards), eval_env.normalize_reward(dummy_rewards)) + assert allclose(obs, eval_env.normalize_obs(original_obs)) + assert allclose(env.normalize_reward(dummy_rewards), eval_env.normalize_reward(dummy_rewards)) From 1cfc790b041a79a4e16de728681afcd16265d182 Mon Sep 17 00:00:00 2001 From: Megan Klaiber Date: Fri, 16 Oct 2020 15:57:30 +0200 Subject: [PATCH 57/81] Updated saving/loading replay buffer for HER. --- stable_baselines3/her/her_replay_buffer.py | 38 ++++++++++++++++++++-- tests/test_her.py | 33 +++++++++++++++++++ 2 files changed, 68 insertions(+), 3 deletions(-) diff --git a/stable_baselines3/her/her_replay_buffer.py b/stable_baselines3/her/her_replay_buffer.py index f2d33af6bd..4ade7ec047 100644 --- a/stable_baselines3/her/her_replay_buffer.py +++ b/stable_baselines3/her/her_replay_buffer.py @@ -1,18 +1,18 @@ from collections import deque -from typing import Dict, List, Optional, Tuple, Union +from typing import Any, Dict, List, Optional, Tuple, Union import numpy as np import torch as th from gym import spaces -from stable_baselines3.common.buffers import BaseBuffer +from stable_baselines3.common.buffers import ReplayBuffer from stable_baselines3.common.type_aliases import ReplayBufferSamples, RolloutBufferSamples from stable_baselines3.common.vec_env import VecNormalize from stable_baselines3.common.vec_env.obs_dict_wrapper import ObsDictWrapper from stable_baselines3.her.goal_selection_strategy import GoalSelectionStrategy -class HerReplayBuffer(BaseBuffer): +class HerReplayBuffer(ReplayBuffer): """ Replay Buffer for sampling HER (Hindsight Experience Replay) transitions. In the online sampling case these new transitions will not be saved in the Buffer. @@ -78,6 +78,38 @@ def __init__( # percentage of her indices self.her_ratio = her_ratio + def __getstate__(self) -> Dict[str, Any]: + """ + Gets state for pickling. + + Excludes self.env, as in general Env's may not be pickleable.""" + state = self.__dict__.copy() + # these attributes are not pickleable + del state["env"] + return state + + def __setstate__(self, state: Dict[str, Any]) -> None: + """ + Restores pickled state. + + User must call set_env() after unpickling before using. + + :param state: + """ + self.__dict__.update(state) + assert "env" not in state + self.env = None + + def set_env(self, env: ObsDictWrapper) -> None: + """ + Sets the environment. 
+ :param env: + """ + if self.env is not None: + raise ValueError("Trying to set env of already initialized environment.") + + self.env = env + def _get_samples( self, batch_inds: np.ndarray, env: Optional[VecNormalize] = None ) -> Union[ReplayBufferSamples, RolloutBufferSamples]: diff --git a/tests/test_her.py b/tests/test_her.py index 9989663f0a..b5a2382b30 100644 --- a/tests/test_her.py +++ b/tests/test_her.py @@ -1,4 +1,5 @@ import os +import pathlib from copy import deepcopy import numpy as np @@ -161,6 +162,38 @@ def test_save_load(tmp_path, model_class, use_sde): os.remove(tmp_path / "test_save.zip") +@pytest.mark.parametrize("model_class", [HER]) +def test_save_load_replay_buffer(tmp_path, model_class): + path = pathlib.Path(tmp_path / "logs/replay_buffer.pkl") + path.parent.mkdir(exist_ok=True, parents=True) # to not raise a warning + env = BitFlippingEnv(n_bits=4, continuous=True) + model = HER( + "MlpPolicy", + env, + SAC, + goal_selection_strategy="future", + online_sampling=True, + gradient_steps=1, + train_freq=1, + n_episodes_rollout=-1, + max_episode_length=4, + policy_kwargs=dict(net_arch=[64]), + ) + model.learn(300) + old_replay_buffer = deepcopy(model.replay_buffer) + model.save_replay_buffer(path) + model.model.replay_buffer = None + model.load_replay_buffer(path) + # set environment + model.replay_buffer.set_env(env) + + assert np.allclose(old_replay_buffer.buffer["observation"], model.replay_buffer.buffer["observation"]) + assert np.allclose(old_replay_buffer.buffer["next_obs"], model.replay_buffer.buffer["next_obs"]) + assert np.allclose(old_replay_buffer.buffer["action"], model.replay_buffer.buffer["action"]) + assert np.allclose(old_replay_buffer.buffer["reward"], model.replay_buffer.buffer["reward"]) + assert np.allclose(old_replay_buffer.buffer["done"], model.replay_buffer.buffer["done"]) + + @pytest.mark.parametrize("online_sampling", [False, True]) @pytest.mark.parametrize("n_bits", [10]) def test_performance_her(online_sampling, n_bits): From f738f3227cfa115b21b130dbff865f363f0fb309 Mon Sep 17 00:00:00 2001 From: Antonin RAFFIN Date: Fri, 16 Oct 2020 16:20:10 +0200 Subject: [PATCH 58/81] Fix test memory usage --- tests/test_save_load.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_save_load.py b/tests/test_save_load.py index 18fba37a2e..77ec75e4eb 100644 --- a/tests/test_save_load.py +++ b/tests/test_save_load.py @@ -305,7 +305,7 @@ def test_save_load_policy(tmp_path, model_class, policy_str): if policy_str == "MlpPolicy": env = select_env(model_class) else: - if model_class in [SAC, TD3, DQN]: + if model_class in [SAC, TD3, DQN, DDPG]: # Avoid memory error when using replay buffer # Reduce the size of the features kwargs = dict(buffer_size=250) From d7a787f5f0832d3debc04467e2f1c3abc57eb5b3 Mon Sep 17 00:00:00 2001 From: Megan Klaiber Date: Mon, 19 Oct 2020 14:03:48 +0200 Subject: [PATCH 59/81] Fixed save/load replay buffer. 
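
With online sampling, the replay buffer drops its environment reference when it
is pickled, so the env has to be re-attached after loading. A rough usage sketch
(the 4-bit env, the buffer path and the step counts are placeholders, not a
benchmark):

.. code-block:: python

    from stable_baselines3 import HER, SAC
    from stable_baselines3.common.bit_flipping_env import BitFlippingEnv

    env = BitFlippingEnv(n_bits=4, continuous=True)
    model = HER("MlpPolicy", env, SAC, online_sampling=True, max_episode_length=4)
    model.learn(300)

    # the env is excluded from the pickle (see HerReplayBuffer.__getstate__)
    model.save_replay_buffer("her_replay_buffer.pkl")

    # later: load the buffer back; with online sampling, HER.load_replay_buffer
    # re-attaches the current env to the loaded buffer
    model.load_replay_buffer("her_replay_buffer.pkl")
    model.learn(300)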
--- stable_baselines3/her/her.py | 17 ++++++++++++----- tests/test_her.py | 34 +++++++++++++++++++++------------- 2 files changed, 33 insertions(+), 18 deletions(-) diff --git a/stable_baselines3/her/her.py b/stable_baselines3/her/her.py index bd71b3e07f..aa1e161a7a 100644 --- a/stable_baselines3/her/her.py +++ b/stable_baselines3/her/her.py @@ -396,11 +396,6 @@ def save( self.model.model_class = self.model_class self.model.max_episode_length = self.max_episode_length - # exclude episode storage - if exclude is None: - exclude = [] - exclude = ["_episode_storage"].extend(exclude) - self.model.save(path, exclude, include) @classmethod @@ -488,3 +483,15 @@ def load( if her_model.model.use_sde: her_model.model.policy.reset_noise() # pytype: disable=attribute-error return her_model + + def load_replay_buffer(self, path: Union[str, pathlib.Path, io.BufferedIOBase]) -> None: + """ + Load a replay buffer from a pickle file and set environment for replay buffer (only online sampling). + + :param path: Path to the pickled replay buffer. + """ + self.model.load_replay_buffer(path=path) + + if self.online_sampling: + # set environment + self.replay_buffer.set_env(self.env) diff --git a/tests/test_her.py b/tests/test_her.py index b5a2382b30..d3bade3619 100644 --- a/tests/test_her.py +++ b/tests/test_her.py @@ -74,7 +74,8 @@ def test_goal_selection_strategy(goal_selection_strategy, online_sampling): @pytest.mark.parametrize("model_class", [SAC, TD3, DDPG, DQN]) @pytest.mark.parametrize("use_sde", [False, True]) -def test_save_load(tmp_path, model_class, use_sde): +@pytest.mark.parametrize("online_sampling", [False, True]) +def test_save_load(tmp_path, model_class, use_sde, online_sampling): """ Test if 'save' and 'load' saves and loads model correctly """ @@ -93,7 +94,7 @@ def test_save_load(tmp_path, model_class, use_sde): model_class, n_sampled_goal=5, goal_selection_strategy="future", - online_sampling=True, + online_sampling=online_sampling, verbose=0, tau=0.05, batch_size=128, @@ -162,8 +163,11 @@ def test_save_load(tmp_path, model_class, use_sde): os.remove(tmp_path / "test_save.zip") -@pytest.mark.parametrize("model_class", [HER]) -def test_save_load_replay_buffer(tmp_path, model_class): +@pytest.mark.parametrize("online_sampling", [False, True]) +def test_save_load_replay_buffer(tmp_path, online_sampling): + """ + Test if 'save_replay_buffer' and 'load_replay_buffer' works correctly + """ path = pathlib.Path(tmp_path / "logs/replay_buffer.pkl") path.parent.mkdir(exist_ok=True, parents=True) # to not raise a warning env = BitFlippingEnv(n_bits=4, continuous=True) @@ -172,7 +176,7 @@ def test_save_load_replay_buffer(tmp_path, model_class): env, SAC, goal_selection_strategy="future", - online_sampling=True, + online_sampling=online_sampling, gradient_steps=1, train_freq=1, n_episodes_rollout=-1, @@ -184,21 +188,25 @@ def test_save_load_replay_buffer(tmp_path, model_class): model.save_replay_buffer(path) model.model.replay_buffer = None model.load_replay_buffer(path) - # set environment - model.replay_buffer.set_env(env) - assert np.allclose(old_replay_buffer.buffer["observation"], model.replay_buffer.buffer["observation"]) - assert np.allclose(old_replay_buffer.buffer["next_obs"], model.replay_buffer.buffer["next_obs"]) - assert np.allclose(old_replay_buffer.buffer["action"], model.replay_buffer.buffer["action"]) - assert np.allclose(old_replay_buffer.buffer["reward"], model.replay_buffer.buffer["reward"]) - assert np.allclose(old_replay_buffer.buffer["done"], model.replay_buffer.buffer["done"]) + 
if online_sampling: + assert np.allclose(old_replay_buffer.buffer["observation"], model.replay_buffer.buffer["observation"], equal_nan=True) + assert np.allclose(old_replay_buffer.buffer["next_obs"], model.replay_buffer.buffer["next_obs"], equal_nan=True) + assert np.allclose(old_replay_buffer.buffer["action"], model.replay_buffer.buffer["action"], equal_nan=True) + assert np.allclose(old_replay_buffer.buffer["reward"], model.replay_buffer.buffer["reward"], equal_nan=True) + assert np.allclose(old_replay_buffer.buffer["done"], model.replay_buffer.buffer["done"], equal_nan=True) + else: + assert np.allclose(old_replay_buffer.observations, model.replay_buffer.observations) + assert np.allclose(old_replay_buffer.actions, model.replay_buffer.actions) + assert np.allclose(old_replay_buffer.rewards, model.replay_buffer.rewards) + assert np.allclose(old_replay_buffer.dones, model.replay_buffer.dones) @pytest.mark.parametrize("online_sampling", [False, True]) @pytest.mark.parametrize("n_bits", [10]) def test_performance_her(online_sampling, n_bits): """ - That that DQN+HER can solve BitFlippingEnv. + That DQN+HER can solve BitFlippingEnv. It should not work when n_sampled_goal=0 (DQN alone). """ env = BitFlippingEnv(n_bits=n_bits, continuous=False) From c8ebaa93f5b0decde267baa193a09f3230f0aa7c Mon Sep 17 00:00:00 2001 From: Megan Klaiber Date: Mon, 19 Oct 2020 16:26:52 +0200 Subject: [PATCH 60/81] Fixed save/load replay buffer --- tests/test_her.py | 23 ++++++++++++++++++----- 1 file changed, 18 insertions(+), 5 deletions(-) diff --git a/tests/test_her.py b/tests/test_her.py index d3bade3619..331fe5269d 100644 --- a/tests/test_her.py +++ b/tests/test_her.py @@ -190,11 +190,24 @@ def test_save_load_replay_buffer(tmp_path, online_sampling): model.load_replay_buffer(path) if online_sampling: - assert np.allclose(old_replay_buffer.buffer["observation"], model.replay_buffer.buffer["observation"], equal_nan=True) - assert np.allclose(old_replay_buffer.buffer["next_obs"], model.replay_buffer.buffer["next_obs"], equal_nan=True) - assert np.allclose(old_replay_buffer.buffer["action"], model.replay_buffer.buffer["action"], equal_nan=True) - assert np.allclose(old_replay_buffer.buffer["reward"], model.replay_buffer.buffer["reward"], equal_nan=True) - assert np.allclose(old_replay_buffer.buffer["done"], model.replay_buffer.buffer["done"], equal_nan=True) + n_episodes_stored = old_replay_buffer.n_episodes_stored + assert np.allclose( + old_replay_buffer.buffer["observation"][:n_episodes_stored], + model.replay_buffer.buffer["observation"][:n_episodes_stored], + ) + assert np.allclose( + old_replay_buffer.buffer["next_obs"][:n_episodes_stored], + model.replay_buffer.buffer["next_obs"][:n_episodes_stored], + ) + assert np.allclose( + old_replay_buffer.buffer["action"][:n_episodes_stored], model.replay_buffer.buffer["action"][:n_episodes_stored] + ) + assert np.allclose( + old_replay_buffer.buffer["reward"][:n_episodes_stored], model.replay_buffer.buffer["reward"][:n_episodes_stored] + ) + assert np.allclose( + old_replay_buffer.buffer["done"][:n_episodes_stored], model.replay_buffer.buffer["done"][:n_episodes_stored] + ) else: assert np.allclose(old_replay_buffer.observations, model.replay_buffer.observations) assert np.allclose(old_replay_buffer.actions, model.replay_buffer.actions) From 11f0fa2734c472d2a6d7de1a3a43a075d84b84dd Mon Sep 17 00:00:00 2001 From: Megan Klaiber Date: Tue, 20 Oct 2020 00:38:46 +0200 Subject: [PATCH 61/81] Fixed transition index after loading replay buffer in online sampling --- 
stable_baselines3/her/her.py | 1 + 1 file changed, 1 insertion(+) diff --git a/stable_baselines3/her/her.py b/stable_baselines3/her/her.py index aa1e161a7a..0157b46e8f 100644 --- a/stable_baselines3/her/her.py +++ b/stable_baselines3/her/her.py @@ -495,3 +495,4 @@ def load_replay_buffer(self, path: Union[str, pathlib.Path, io.BufferedIOBase]) if self.online_sampling: # set environment self.replay_buffer.set_env(self.env) + self.replay_buffer.current_idx = 0 From ee39e38997835673250ad1fc4cb637a37dc81382 Mon Sep 17 00:00:00 2001 From: Antonin RAFFIN Date: Tue, 20 Oct 2020 12:59:21 +0200 Subject: [PATCH 62/81] Better error handling --- stable_baselines3/her/her.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/stable_baselines3/her/her.py b/stable_baselines3/her/her.py index 0157b46e8f..537978ba5c 100644 --- a/stable_baselines3/her/her.py +++ b/stable_baselines3/her/her.py @@ -31,6 +31,9 @@ def get_time_limit(env: VecEnv, current_max_episode_length: Optional[int]) -> in if current_max_episode_length is None: try: current_max_episode_length = env.get_attr("spec")[0].max_episode_steps + # Raise the error because the attribute is present but is None + if current_max_episode_length is None: + raise AttributeError # if not available check if a valid value was passed as an argument except AttributeError: raise ValueError( From 3821e4dc61ea0795a0702a9dc451ff872c947d3f Mon Sep 17 00:00:00 2001 From: Antonin RAFFIN Date: Tue, 20 Oct 2020 16:17:40 +0200 Subject: [PATCH 63/81] Add tests for get_time_limit --- stable_baselines3/common/bit_flipping_env.py | 3 ++ stable_baselines3/her/her.py | 4 +- tests/test_her.py | 40 +++++++++++++++++++- 3 files changed, 44 insertions(+), 3 deletions(-) diff --git a/stable_baselines3/common/bit_flipping_env.py b/stable_baselines3/common/bit_flipping_env.py index d38ff73cc7..62f07100fd 100644 --- a/stable_baselines3/common/bit_flipping_env.py +++ b/stable_baselines3/common/bit_flipping_env.py @@ -3,6 +3,7 @@ import numpy as np from gym import GoalEnv, spaces +from gym.envs.registration import EnvSpec from stable_baselines3.common.type_aliases import GymStepReturn @@ -22,6 +23,8 @@ class BitFlippingEnv(GoalEnv): version or not, by default, it uses the MultiBinary one """ + spec = EnvSpec("BitFlippingEnv-v0") + def __init__( self, n_bits: int = 10, continuous: bool = False, max_steps: Optional[int] = None, discrete_obs_space: bool = False ): diff --git a/stable_baselines3/her/her.py b/stable_baselines3/her/her.py index 537978ba5c..0727175a91 100644 --- a/stable_baselines3/her/her.py +++ b/stable_baselines3/her/her.py @@ -37,8 +37,8 @@ def get_time_limit(env: VecEnv, current_max_episode_length: Optional[int]) -> in # if not available check if a valid value was passed as an argument except AttributeError: raise ValueError( - "The max episode length could not be inferred." 
- "You must specify a `max_episode_steps` when registering the environment, " + "The max episode length could not be inferred.\n" + "You must specify a `max_episode_steps` when registering the environment,\n" "use a `gym.wrappers.TimeLimit` wrapper " "or pass `max_episode_length` to the model constructor" ) diff --git a/tests/test_her.py b/tests/test_her.py index 331fe5269d..b3e2e9a119 100644 --- a/tests/test_her.py +++ b/tests/test_her.py @@ -2,14 +2,17 @@ import pathlib from copy import deepcopy +import gym import numpy as np import pytest import torch as th from stable_baselines3 import DDPG, DQN, HER, SAC, TD3 from stable_baselines3.common.bit_flipping_env import BitFlippingEnv +from stable_baselines3.common.vec_env import DummyVecEnv from stable_baselines3.common.vec_env.obs_dict_wrapper import ObsDictWrapper from stable_baselines3.her.goal_selection_strategy import GoalSelectionStrategy +from stable_baselines3.her.her import get_time_limit @pytest.mark.parametrize("model_class", [SAC, TD3, DDPG, DQN]) @@ -110,7 +113,7 @@ def test_save_load(tmp_path, model_class, use_sde, online_sampling): **kwargs ) - model.learn(total_timesteps=500) + model.learn(total_timesteps=300) env.reset() @@ -215,6 +218,41 @@ def test_save_load_replay_buffer(tmp_path, online_sampling): assert np.allclose(old_replay_buffer.dones, model.replay_buffer.dones) +def test_get_max_episode_length(): + dict_env = DummyVecEnv([lambda: BitFlippingEnv()]) + + # Cannot infer max epsiode length + with pytest.raises(ValueError): + get_time_limit(dict_env, current_max_episode_length=None) + + default_length = 10 + assert get_time_limit(dict_env, current_max_episode_length=default_length) == default_length + + env = gym.make("CartPole-v1") + vec_env = DummyVecEnv([lambda: env]) + + assert get_time_limit(vec_env, current_max_episode_length=None) == 500 + # Overwrite max_episode_steps + assert get_time_limit(vec_env, current_max_episode_length=default_length) == default_length + + # Set max_episode_steps to None + env.spec.max_episode_steps = None + vec_env = DummyVecEnv([lambda: env]) + with pytest.raises(ValueError): + get_time_limit(vec_env, current_max_episode_length=None) + + # Initialize HER and specify max_episode_length, should not raise an issue + HER("MlpPolicy", dict_env, DQN, max_episode_length=5) + + with pytest.raises(ValueError): + HER("MlpPolicy", dict_env, DQN) + + # Wrapped in a timelimit, should be fine + # Note: it requires env.spec to be defined + env = DummyVecEnv([lambda: gym.wrappers.TimeLimit(BitFlippingEnv(), 10)]) + HER("MlpPolicy", env, DQN) + + @pytest.mark.parametrize("online_sampling", [False, True]) @pytest.mark.parametrize("n_bits", [10]) def test_performance_her(online_sampling, n_bits): From dca958259fcccb7e09a36a92a35a7aa0dd2e5f07 Mon Sep 17 00:00:00 2001 From: Antonin RAFFIN Date: Tue, 20 Oct 2020 16:34:16 +0200 Subject: [PATCH 64/81] More tests for VecNormalize with dict obs --- tests/test_vec_normalize.py | 29 +++++++++++++++++++++++++++-- 1 file changed, 27 insertions(+), 2 deletions(-) diff --git a/tests/test_vec_normalize.py b/tests/test_vec_normalize.py index 75b017c782..a68e1b2fcd 100644 --- a/tests/test_vec_normalize.py +++ b/tests/test_vec_normalize.py @@ -142,7 +142,8 @@ def test_runningmeanstd(): assert np.allclose(moments_1, moments_2) -def test_vec_env(tmp_path): +@pytest.mark.parametrize("make_env", [make_env, make_dict_env]) +def test_vec_env(tmp_path, make_env): """Test VecNormalize Object""" clip_obs = 0.5 clip_reward = 5.0 @@ -153,7 +154,11 @@ def test_vec_env(tmp_path): 
while not done[0]: actions = [norm_venv.action_space.sample()] obs, rew, done, _ = norm_venv.step(actions) - assert np.max(np.abs(obs)) <= clip_obs + if isinstance(obs, dict): + for key in obs.keys(): + assert np.max(np.abs(obs[key])) <= clip_obs + else: + assert np.max(np.abs(obs)) <= clip_obs assert np.max(np.abs(rew)) <= clip_reward path = tmp_path / "vec_normalize" @@ -181,6 +186,26 @@ def test_get_original(): np.testing.assert_allclose(venv.normalize_reward(orig_rewards), rewards) +def test_get_original_dict(): + venv = _make_warmstart_dict_env() + for _ in range(3): + actions = [venv.action_space.sample()] + obs, rewards, _, _ = venv.step(actions) + # obs = obs[0] + orig_obs = venv.get_original_obs() + rewards = rewards[0] + orig_rewards = venv.get_original_reward()[0] + + for key in orig_obs.keys(): + assert orig_obs[key].shape == obs[key].shape + assert orig_rewards.dtype == rewards.dtype + + assert not allclose(orig_obs, obs) + assert not np.array_equal(orig_rewards, rewards) + assert allclose(venv.normalize_obs(orig_obs), obs) + np.testing.assert_allclose(venv.normalize_reward(orig_rewards), rewards) + + def test_normalize_external(): venv = _make_warmstart_cartpole() From 631cc9c6a0ac9cbe7edcfdd3fb6171f4cef64c55 Mon Sep 17 00:00:00 2001 From: Antonin RAFFIN Date: Tue, 20 Oct 2020 18:03:29 +0200 Subject: [PATCH 65/81] Update doc --- docs/guide/examples.rst | 3 +++ docs/guide/migration.rst | 3 ++- docs/modules/her.rst | 8 ++++++++ 3 files changed, 13 insertions(+), 1 deletion(-) diff --git a/docs/guide/examples.rst b/docs/guide/examples.rst index 49b2b3c2fa..3b9029d33a 100644 --- a/docs/guide/examples.rst +++ b/docs/guide/examples.rst @@ -384,6 +384,9 @@ The parking env is a goal-conditioned continuous control task, in which the vehi SAC, n_sampled_goal=n_sampled_goal, goal_selection_strategy="future", + # IMPORTANT: because the env is not wrapped with a TimeLimit wrapper + # we have to manually specify the max number of steps per episode + max_episode_length=100, verbose=1, buffer_size=int(1e6), learning_rate=1e-3, diff --git a/docs/guide/migration.rst b/docs/guide/migration.rst index f1e0225a01..0899242438 100644 --- a/docs/guide/migration.rst +++ b/docs/guide/migration.rst @@ -168,7 +168,8 @@ HER The ``HER`` implementation now also supports online sampling of the new goals. This is done in a vectorized version. The goal selection strategy ``RANDOM`` is no longer supported. -``HER`` now supports ``VecNormalize`` wrapper but only when ``online_sampling=True`` +``HER`` now supports ``VecNormalize`` wrapper but only when ``online_sampling=True``. +For performance reasons, the maximum number of steps per episodes must be specified (see :ref:`HER ` documentation). New logger API diff --git a/docs/modules/her.rst b/docs/modules/her.rst index 6befbc1731..1fac42add1 100644 --- a/docs/modules/her.rst +++ b/docs/modules/her.rst @@ -15,6 +15,14 @@ HER is an algorithm that works with Off policy methods (DQN, SAC, TD3 and DDPG f HER requires the environment to inherits from `gym.GoalEnv `_ +.. warning:: + + For performance reasons, the maximum number of steps per episodes must be specified. + In most cases, it will be inferred if you specify ```max_episode_steps`` when registering the environment + or if you use a ``gym.wrappers.TimeLimit`` (and ``env.spec`` is not None). + Otherwise, you can directly pass ``max_episode_length`` to the model constructor + + .. 
warning:: ``HER`` supports ``VecNormalize`` wrapper but only when ``online_sampling=True`` From ba0a7e4f81da4a61b5877e0e6c0cb64f15dce1f8 Mon Sep 17 00:00:00 2001 From: Antonin RAFFIN Date: Tue, 20 Oct 2020 18:10:28 +0200 Subject: [PATCH 66/81] Improve HER description --- docs/modules/her.rst | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/docs/modules/her.rst b/docs/modules/her.rst index 1fac42add1..31d1fac3a1 100644 --- a/docs/modules/her.rst +++ b/docs/modules/her.rst @@ -8,7 +8,11 @@ HER `Hindsight Experience Replay (HER) `_ -HER is an algorithm that works with Off policy methods (DQN, SAC, TD3 and DDPG for example). +HER is an algorithm that works with off-policy methods (DQN, SAC, TD3 and DDPG for example). +HER uses the fact that even if a desired goal was not achieved, other goal may have been achieved during a rollout. +It creates "virtual" transitions by relabeling transitions (changing the desired goal) from past episodes. + + .. warning:: From 907bcffe16a689e53c43f653637c8069332c2f33 Mon Sep 17 00:00:00 2001 From: Antonin RAFFIN Date: Tue, 20 Oct 2020 18:21:20 +0200 Subject: [PATCH 67/81] Add test for sde support --- tests/test_sde.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/tests/test_sde.py b/tests/test_sde.py index 3b010d36d8..74853a0f99 100644 --- a/tests/test_sde.py +++ b/tests/test_sde.py @@ -54,6 +54,11 @@ def test_state_dependent_exploration_grad(): assert sigma_hat.grad.allclose(grad) +def test_sde_check(): + with pytest.raises(ValueError): + PPO("MlpPolicy", "CartPole-v1", use_sde=True) + + @pytest.mark.parametrize("model_class", [SAC, A2C, PPO]) @pytest.mark.parametrize("sde_net_arch", [None, [32, 16], []]) @pytest.mark.parametrize("use_expln", [False, True]) @@ -65,9 +70,9 @@ def test_state_dependent_offpolicy_noise(model_class, sde_net_arch, use_expln): seed=None, create_eval_env=True, verbose=1, - policy_kwargs=dict(log_std_init=-2, sde_net_arch=sde_net_arch, use_expln=use_expln), + policy_kwargs=dict(log_std_init=-2, sde_net_arch=sde_net_arch, use_expln=use_expln, net_arch=[64]), ) - model.learn(total_timesteps=int(500), eval_freq=250) + model.learn(total_timesteps=int(300), eval_freq=250) model.policy.reset_noise() if model_class == SAC: model.policy.actor.get_std() From f650934c69111dd0cb9c154692deeb4e3f8980c1 Mon Sep 17 00:00:00 2001 From: Antonin RAFFIN Date: Tue, 20 Oct 2020 18:50:33 +0200 Subject: [PATCH 68/81] Add comments --- stable_baselines3/her/her.py | 5 +- stable_baselines3/her/her_replay_buffer.py | 55 ++++++++++++++-------- 2 files changed, 38 insertions(+), 22 deletions(-) diff --git a/stable_baselines3/her/her.py b/stable_baselines3/her/her.py index 0727175a91..44b6377813 100644 --- a/stable_baselines3/her/her.py +++ b/stable_baselines3/her/her.py @@ -351,13 +351,14 @@ def collect_rollouts( def _store_transitions(self) -> None: """ - Store current episode in replay buffer. Sample additional goals and store new transitions in replay buffer. + Store current episode in replay buffer when using offline sampling. + Sample additional goals and store new transitions in replay buffer. 
""" # sample goals and get new observations observations, next_observations, actions, rewards = self._episode_storage.sample( self.batch_size, - self.get_vec_normalize_env(), + None, # we should store unnormalized transitions, they will be normalized at sampling time self.online_sampling, self.n_sampled_goal, ) diff --git a/stable_baselines3/her/her_replay_buffer.py b/stable_baselines3/her/her_replay_buffer.py index 4ade7ec047..929415ad75 100644 --- a/stable_baselines3/her/her_replay_buffer.py +++ b/stable_baselines3/her/her_replay_buffer.py @@ -14,20 +14,23 @@ class HerReplayBuffer(ReplayBuffer): """ - Replay Buffer for sampling HER (Hindsight Experience Replay) transitions. - In the online sampling case these new transitions will not be saved in the Buffer. + Replay buffer for sampling HER (Hindsight Experience Replay) transitions. + In the online sampling case, these new transitions will not be saved in the replay buffer + and will only be created at sampling time. :param env: The training environment :param buffer_size: The size of the buffer measured in transitions. :param max_episode_length: The length of an episode. (time horizon) :param goal_selection_strategy: Strategy for sampling goals for replay. - One of ['episode', 'final', 'future', 'random'] + One of ['episode', 'final', 'future'] :param observation_space: Observation space :param action_space: Action space :param device: PyTorch device - to which the values will be converted :param n_envs: Number of parallel environments - :her_ratio: The ratio between HER replays and regular replays in percent (between 0 and 1, for online sampling) + :her_ratio: The ratio between HER transitions and regular transitions in percent + (between 0 and 1, for online sampling) + The default value ``her_ratio=0.8`` corresponds to 4 virtual transitions + for one real transition (4 / (4 + 1) = 0.8) """ def __init__( @@ -70,6 +73,7 @@ def __init__( key: np.empty((self.max_episode_stored, self.max_episode_length, *dim), dtype=np.float32) for key, dim in input_shape.items() } + # Store info dicts are it can be used to compute the reward (e.g. continuity cost) self.info_buffer = [deque(maxlen=self.max_episode_length) for _ in range(self.max_episode_stored)] # episode length storage, needed for episodes which has less steps than the maximum length self.episode_lengths = np.zeros(self.max_episode_stored, dtype=np.int64) @@ -92,7 +96,7 @@ def __setstate__(self, state: Dict[str, Any]) -> None: """ Restores pickled state. - User must call set_env() after unpickling before using. + User must call ``set_env()`` after unpickling before using. :param state: """ @@ -103,6 +107,7 @@ def __setstate__(self, state: Dict[str, Any]) -> None: def set_env(self, env: ObsDictWrapper) -> None: """ Sets the environment. 
+ :param env: """ if self.env is not None: @@ -167,21 +172,21 @@ def sample_goals( transitions_indices = np.random.randint(self.episode_lengths[her_episode_indices]) else: - raise ValueError("Strategy for sampling goals not supported!") + raise ValueError(f"Strategy {self.goal_selection_strategy} for sampling goals not supported!") return self.buffer["achieved_goal"][her_episode_indices, transitions_indices] def _sample_transitions( self, batch_size: int, - env: Optional[VecNormalize], + maybe_vec_env: Optional[VecNormalize], online_sampling: bool = True, n_sampled_goal: int = None, ) -> Union[ReplayBufferSamples, Tuple]: """ :param batch_size: Number of element to sample - :param env: associated gym VecEnv - to normalize the observations/rewards when sampling + :param env: associated gym VecEnv to normalize the observations/rewards + Only valid when using online sampling :param online_sampling: Using online_sampling for HER or not. :param n_sampled_goal: Number of sampled goals for replay. (offline sampling) :return: Samples. @@ -191,9 +196,13 @@ def _sample_transitions( episode_indices = np.random.randint(0, self.n_episodes_stored, batch_size) her_indices = np.arange(batch_size)[: int(self.her_ratio * batch_size)] else: + assert maybe_vec_env is None, "Transitions must be stored unnormalized in the replay buffer" + # Offline sampling: there is only one episode stored episode_length = self.episode_lengths[0] + # we sample n_sampled_goal per timestep in the episode (only one is stored). episode_indices = np.tile(0, (episode_length * n_sampled_goal)) - # episode_indices = np.array(list(range(1)) * episode_length * n_sampled_goal) + # we only sample virtual transitions + # as real transitions are already stored in the replay buffer her_indices = np.arange(len(episode_indices)) ep_length = self.episode_lengths[episode_indices] @@ -209,9 +218,13 @@ def _sample_transitions( transitions_indices = np.random.randint(ep_length) else: if her_indices.size == 0: + # Episode of one timestep, not enough for using the "future" strategy + # no virtual transitions are created in that case return np.empty(0), np.empty(0), np.empty(0), np.empty(0) else: - # repeat every transition index n_sampled_goals times + # Repeat every transition index n_sampled_goals times + # to sample n_sampled_goal per timestep in the episode (only one is stored). 
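To make the offline index layout described in the comments above concrete, a toy NumPy sketch (the numbers are made up; only the tiling pattern matters):

import numpy as np

ep_length, n_sampled_goal = 3, 2
transitions_indices = np.tile(np.arange(ep_length), n_sampled_goal)
# array([0, 1, 2, 0, 1, 2]): every stored timestep of the single episode is
# revisited n_sampled_goal times, and each copy receives a different relabeled goal.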
+ # Now with the corrected episode length when using "future" strategy transitions_indices = np.tile(np.arange(ep_length[0]), n_sampled_goal) episode_indices = episode_indices[transitions_indices] her_indices = np.arange(len(episode_indices)) @@ -219,6 +232,7 @@ def _sample_transitions( # get selected transitions transitions = {key: self.buffer[key][episode_indices, transitions_indices].copy() for key in self.buffer.keys()} + # sample new desired goals and relabel the transitions new_goals = self.sample_goals(episode_indices, her_indices, transitions_indices) transitions["desired_goal"][her_indices] = new_goals @@ -239,10 +253,10 @@ def _sample_transitions( ) # concatenate observation with (desired) goal - observations = ObsDictWrapper.convert_dict(self._normalize_obs(transitions, env)) + observations = ObsDictWrapper.convert_dict(self._normalize_obs(transitions, maybe_vec_env)) # HACK to make normalize obs work with the next observation transitions["observation"] = transitions["next_obs"] - next_observations = ObsDictWrapper.convert_dict(self._normalize_obs(transitions, env)) + next_observations = ObsDictWrapper.convert_dict(self._normalize_obs(transitions, maybe_vec_env)) if online_sampling: data = ( @@ -250,7 +264,7 @@ def _sample_transitions( transitions["action"], next_observations[:, 0], transitions["done"], - self._normalize_reward(transitions["reward"], env), + self._normalize_reward(transitions["reward"], maybe_vec_env), ) return ReplayBufferSamples(*tuple(map(self.to_torch, data))) @@ -286,7 +300,11 @@ def add( # update current pointer self.current_idx += 1 - def store_episode(self): + def store_episode(self) -> None: + """ + Increment episode counter + and reset transition pointer. + """ # add episode length to length storage self.episode_lengths[self.pos] = self.current_idx @@ -302,14 +320,11 @@ def store_episode(self): self.current_idx = 0 @property - def n_episodes_stored(self): + def n_episodes_stored(self) -> int: if self.full: return self.max_episode_stored return self.pos - def clear_buffer(self): - self.buffer = {} - def size(self) -> int: """ :return: The current size of the buffer in transitions. From 03c41041e2a469fc7aeca1965060eb0a2b83126f Mon Sep 17 00:00:00 2001 From: Antonin RAFFIN Date: Tue, 20 Oct 2020 19:16:48 +0200 Subject: [PATCH 69/81] Add comments --- stable_baselines3/her/her.py | 12 ++++++------ stable_baselines3/her/her_replay_buffer.py | 8 +++++++- 2 files changed, 13 insertions(+), 7 deletions(-) diff --git a/stable_baselines3/her/her.py b/stable_baselines3/her/her.py index 44b6377813..24aa674fd7 100644 --- a/stable_baselines3/her/her.py +++ b/stable_baselines3/her/her.py @@ -323,8 +323,8 @@ def collect_rollouts( self.replay_buffer.store_episode() else: self._episode_storage.store_episode() - # store episode in replay buffer - self._store_transitions() + # sample virtual transitions and store them in replay buffer + self._sample_her_transitions() # clear storage for current episode self._episode_storage.reset() @@ -349,16 +349,16 @@ def collect_rollouts( return RolloutReturn(mean_reward, total_steps, total_episodes, continue_training) - def _store_transitions(self) -> None: + def _sample_her_transitions(self) -> None: """ - Store current episode in replay buffer when using offline sampling. - Sample additional goals and store new transitions in replay buffer. 
+ Sample additional goals and store new transitions in replay buffer + when using offline sampling """ # sample goals and get new observations observations, next_observations, actions, rewards = self._episode_storage.sample( self.batch_size, - None, # we should store unnormalized transitions, they will be normalized at sampling time + None, # we should store unnormalized transitions, they will be normalized at sampling time self.online_sampling, self.n_sampled_goal, ) diff --git a/stable_baselines3/her/her_replay_buffer.py b/stable_baselines3/her/her_replay_buffer.py index 929415ad75..ca40d84b9b 100644 --- a/stable_baselines3/her/her_replay_buffer.py +++ b/stable_baselines3/her/her_replay_buffer.py @@ -244,9 +244,15 @@ def _sample_transitions( ] ) - # Vectorized computation + # Vectorized computation of the new reward transitions["reward"][her_indices, 0] = self.env.env_method( "compute_reward", + # the new state depends on the previous state and action + # s_{t+1} = f(s_t, a_t) + # so the next_achieved_goal depends also on the previous state and action + # because we are in a GoalEnv: + # r_t = reward(s_t, a_t) = reward(next_achieved_goal, desired_goal) + # therefore we have to use "next_achieved_goal" and not "achieved_goal" transitions["next_achieved_goal"][her_indices, 0], transitions["desired_goal"][her_indices, 0], transitions["info"][her_indices, 0], From 6c18e4cde8cb7a03cc986f232f1c187dbba9c9ab Mon Sep 17 00:00:00 2001 From: Antonin RAFFIN Date: Tue, 20 Oct 2020 19:21:21 +0200 Subject: [PATCH 70/81] Remove check that was always valid --- stable_baselines3/her/her.py | 43 ++++++++++++++++++------------------ 1 file changed, 22 insertions(+), 21 deletions(-) diff --git a/stable_baselines3/her/her.py b/stable_baselines3/her/her.py index 24aa674fd7..e74cf7245d 100644 --- a/stable_baselines3/her/her.py +++ b/stable_baselines3/her/her.py @@ -276,31 +276,32 @@ def collect_rollouts( self.model.ep_info_buffer = self.ep_info_buffer self.model.ep_success_buffer = self.ep_success_buffer - # Store episode in episode storage - if self.replay_buffer is not None: + # == Store transition in the replay buffer and/or in the episode storage == + + if self._vec_normalize_env is not None: # Store only the unnormalized version - if self._vec_normalize_env is not None: - new_obs_ = self._vec_normalize_env.get_original_obs() - reward_ = self._vec_normalize_env.get_original_reward() - else: - # Avoid changing the original ones - self._last_original_obs, new_obs_, reward_ = observation, new_obs, reward - self.model._last_original_obs = self._last_original_obs - - if self.online_sampling: - self.replay_buffer.add(self._last_original_obs, new_obs_, buffer_action, reward_, done, infos) - else: - # concatenate observation with (desired) goal - obs = ObsDictWrapper.convert_dict(self._last_original_obs) - next_obs = ObsDictWrapper.convert_dict(new_obs_) - # add to replay buffer - self.replay_buffer.add(obs, next_obs, buffer_action, reward_, done) - # add current transition to episode storage - self._episode_storage.add(self._last_original_obs, new_obs_, buffer_action, reward_, done, infos) + new_obs_ = self._vec_normalize_env.get_original_obs() + reward_ = self._vec_normalize_env.get_original_reward() + else: + # Avoid changing the original ones + self._last_original_obs, new_obs_, reward_ = observation, new_obs, reward + self.model._last_original_obs = self._last_original_obs + + if self.online_sampling: + self.replay_buffer.add(self._last_original_obs, new_obs_, buffer_action, reward_, done, infos) + else: + # 
concatenate observation with (desired) goal + obs = ObsDictWrapper.convert_dict(self._last_original_obs) + next_obs = ObsDictWrapper.convert_dict(new_obs_) + # add to replay buffer + self.replay_buffer.add(obs, next_obs, buffer_action, reward_, done) + # add current transition to episode storage + self._episode_storage.add(self._last_original_obs, new_obs_, buffer_action, reward_, done, infos) self._last_obs = new_obs self.model._last_obs = self._last_obs - # Save the unnormalized observation + + # Save the unnormalized new observation if self._vec_normalize_env is not None: self._last_original_obs = new_obs_ self.model._last_original_obs = self._last_original_obs From 28b281df908d513084ab84c5279bc04826ee1f52 Mon Sep 17 00:00:00 2001 From: Antonin RAFFIN Date: Wed, 21 Oct 2020 11:09:43 +0200 Subject: [PATCH 71/81] Fix for terminal observation --- stable_baselines3/her/her.py | 20 +++++++++++++++----- tests/test_her.py | 6 ++++-- tests/test_save_load.py | 6 ++++-- 3 files changed, 23 insertions(+), 9 deletions(-) diff --git a/stable_baselines3/her/her.py b/stable_baselines3/her/her.py index e74cf7245d..15733093e9 100644 --- a/stable_baselines3/her/her.py +++ b/stable_baselines3/her/her.py @@ -287,16 +287,26 @@ def collect_rollouts( self._last_original_obs, new_obs_, reward_ = observation, new_obs, reward self.model._last_original_obs = self._last_original_obs + # As the VecEnv resets automatically, new_obs is already the + # first observation of the next episode + if done and infos[0].get("terminal_observation") is not None: + # The saved terminal_observation is not passed through other + # VecEnvWrapper, so no need to unnormalize + # NOTE: this may be an issue when using other wrappers + next_obs = infos[0]["terminal_observation"] + else: + next_obs = new_obs_ + if self.online_sampling: - self.replay_buffer.add(self._last_original_obs, new_obs_, buffer_action, reward_, done, infos) + self.replay_buffer.add(self._last_original_obs, next_obs, buffer_action, reward_, done, infos) else: # concatenate observation with (desired) goal - obs = ObsDictWrapper.convert_dict(self._last_original_obs) - next_obs = ObsDictWrapper.convert_dict(new_obs_) + flattened_obs = ObsDictWrapper.convert_dict(self._last_original_obs) + flattened_next_obs = ObsDictWrapper.convert_dict(next_obs) # add to replay buffer - self.replay_buffer.add(obs, next_obs, buffer_action, reward_, done) + self.replay_buffer.add(flattened_obs, flattened_next_obs, buffer_action, reward_, done) # add current transition to episode storage - self._episode_storage.add(self._last_original_obs, new_obs_, buffer_action, reward_, done, infos) + self._episode_storage.add(self._last_original_obs, next_obs, buffer_action, reward_, done, infos) self._last_obs = new_obs self.model._last_obs = self._last_obs diff --git a/tests/test_her.py b/tests/test_her.py index b3e2e9a119..a11eb0c7b8 100644 --- a/tests/test_her.py +++ b/tests/test_her.py @@ -184,16 +184,18 @@ def test_save_load_replay_buffer(tmp_path, online_sampling): train_freq=1, n_episodes_rollout=-1, max_episode_length=4, + buffer_size=int(2e4), + seed=0, policy_kwargs=dict(net_arch=[64]), ) - model.learn(300) + model.learn(200) old_replay_buffer = deepcopy(model.replay_buffer) model.save_replay_buffer(path) model.model.replay_buffer = None model.load_replay_buffer(path) if online_sampling: - n_episodes_stored = old_replay_buffer.n_episodes_stored + n_episodes_stored = model.replay_buffer.n_episodes_stored assert np.allclose( old_replay_buffer.buffer["observation"][:n_episodes_stored], 
model.replay_buffer.buffer["observation"][:n_episodes_stored], diff --git a/tests/test_save_load.py b/tests/test_save_load.py index 77ec75e4eb..e6230ebdd2 100644 --- a/tests/test_save_load.py +++ b/tests/test_save_load.py @@ -231,8 +231,10 @@ def test_exclude_include_saved_params(tmp_path, model_class): def test_save_load_replay_buffer(tmp_path, model_class): path = pathlib.Path(tmp_path / "logs/replay_buffer.pkl") path.parent.mkdir(exist_ok=True, parents=True) # to not raise a warning - model = model_class("MlpPolicy", select_env(model_class), buffer_size=1000) - model.learn(500) + model = model_class( + "MlpPolicy", select_env(model_class), buffer_size=1000, policy_kwargs=dict(net_arch=[64]), learning_starts=200 + ) + model.learn(300) old_replay_buffer = deepcopy(model.replay_buffer) model.save_replay_buffer(path) model.replay_buffer = None From d196aa26e13d32c67b3dae071174fb7b8290612c Mon Sep 17 00:00:00 2001 From: Megan Klaiber Date: Wed, 21 Oct 2020 11:23:35 +0200 Subject: [PATCH 72/81] Updated buffer size in offline version and reset of HER buffer --- stable_baselines3/her/her.py | 3 ++- stable_baselines3/her/her_replay_buffer.py | 10 ++++++++++ 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/stable_baselines3/her/her.py b/stable_baselines3/her/her.py index 15733093e9..272e5b56c1 100644 --- a/stable_baselines3/her/her.py +++ b/stable_baselines3/her/her.py @@ -117,9 +117,10 @@ def __init__( # maximum steps in episode self.max_episode_length = get_time_limit(self.env, max_episode_length) # storage for transitions of current episode + her_buffer_size = self.buffer_size if online_sampling else self.max_episode_length self._episode_storage = HerReplayBuffer( self.env, - self.buffer_size, + her_buffer_size, self.max_episode_length, self.goal_selection_strategy, self.env.observation_space, diff --git a/stable_baselines3/her/her_replay_buffer.py b/stable_baselines3/her/her_replay_buffer.py index ca40d84b9b..57572871be 100644 --- a/stable_baselines3/her/her_replay_buffer.py +++ b/stable_baselines3/her/her_replay_buffer.py @@ -336,3 +336,13 @@ def size(self) -> int: :return: The current size of the buffer in transitions. """ return int(np.sum(self.episode_lengths)) + + def reset(self) -> None: + """ + Reset the buffer. 
+ """ + self.pos = 0 + self.current_idx = 0 + self.full = False + self.episode_lengths = np.zeros(self.max_episode_stored, dtype=np.int64) + From 1f7ab9f2fb38616bb524151063086093471a6e08 Mon Sep 17 00:00:00 2001 From: Antonin RAFFIN Date: Wed, 21 Oct 2020 12:21:32 +0200 Subject: [PATCH 73/81] Reformat --- stable_baselines3/her/her_replay_buffer.py | 1 - 1 file changed, 1 deletion(-) diff --git a/stable_baselines3/her/her_replay_buffer.py b/stable_baselines3/her/her_replay_buffer.py index 57572871be..e727f2b73a 100644 --- a/stable_baselines3/her/her_replay_buffer.py +++ b/stable_baselines3/her/her_replay_buffer.py @@ -345,4 +345,3 @@ def reset(self) -> None: self.current_idx = 0 self.full = False self.episode_lengths = np.zeros(self.max_episode_stored, dtype=np.int64) - From 7da274fe9ea873dd96364ef99cfa0dc7e187f1d8 Mon Sep 17 00:00:00 2001 From: Antonin RAFFIN Date: Wed, 21 Oct 2020 14:13:33 +0200 Subject: [PATCH 74/81] Update doc --- README.md | 13 +------------ docs/misc/changelog.rst | 3 ++- docs/modules/a2c.rst | 21 ++++++++++++++++++++- docs/modules/ddpg.rst | 8 +++++--- docs/modules/dqn.rst | 4 ++++ docs/modules/her.rst | 2 +- docs/modules/ppo.rst | 19 +++++++++++++++++++ docs/modules/sac.rst | 11 +++++++---- docs/modules/td3.rst | 8 +++++--- stable_baselines3/her/her.py | 8 +++++++- 10 files changed, 71 insertions(+), 26 deletions(-) diff --git a/README.md b/README.md index 94c005d628..3415e4d091 100644 --- a/README.md +++ b/README.md @@ -35,20 +35,9 @@ These algorithms will make it easier for the research community and industry to | Type hints | :heavy_check_mark: | -### Roadmap to V1.0 - -Please look at the issue for more details. -Planned features: - -- [ ] HER - ### Planned features (v1.1+) -- [ ] DQN extensions (prioritized replay, double q-learning, ...) -- [ ] Support for `Tuple` and `Dict` observation spaces -- [ ] Recurrent Policies -- [ ] TRPO - +Please take a look at the [Roadmap](https://github.com/DLR-RM/stable-baselines3/issues/1) and [Milestones](https://github.com/DLR-RM/stable-baselines3/milestones). ## Migration guide: from Stable-Baselines (SB2) to Stable-Baselines3 (SB3) diff --git a/docs/misc/changelog.rst b/docs/misc/changelog.rst index c2db988b0b..cf9ef5960d 100644 --- a/docs/misc/changelog.rst +++ b/docs/misc/changelog.rst @@ -38,6 +38,7 @@ Others: Documentation: ^^^^^^^^^^^^^^ - Added first draft of migration guide +- Enabled doc for ``CnnPolicies`` Pre-Release 0.9.0 (2020-10-03) @@ -462,4 +463,4 @@ And all the contributors: @MarvineGothic @jdossgollin @SyllogismRXS @rusu24edward @jbulow @Antymon @seheevic @justinkterry @edbeeching @flodorner @KuKuXia @NeoExtended @PartiallyTyped @mmcenta @richardwu @kinalmehta @rolandgvc @tkelestemur @mloo3 @tirafesi @blurLake @koulakis @joeljosephjin @shwang @rk37 @andyshih12 @RaphaelWag @xicocaio -@diditforlulz273 @liorcohen5 @ManifoldFR @mloo3 @SwamyDev @wmmc88 @megan-klaiber \ No newline at end of file +@diditforlulz273 @liorcohen5 @ManifoldFR @mloo3 @SwamyDev @wmmc88 @megan-klaiber diff --git a/docs/modules/a2c.rst b/docs/modules/a2c.rst index 460d1a6e3b..9cd227ce9d 100644 --- a/docs/modules/a2c.rst +++ b/docs/modules/a2c.rst @@ -11,7 +11,7 @@ It uses multiple workers to avoid the use of a replay buffer. .. warning:: - + If you find training unstable or want to match performance of stable-baselines A2C, consider using ``RMSpropTFLike`` optimizer from ``stable_baselines3.common.sb2_compat.rmsprop_tf_like``. 
You can change optimizer with ``A2C(policy_kwargs=dict(optimizer_class=RMSpropTFLike))``. @@ -79,3 +79,22 @@ Parameters .. autoclass:: A2C :members: :inherited-members: + + +A2C Policies +------------- + +.. autoclass:: MlpPolicy + :members: + :inherited-members: + +.. autoclass:: stable_baselines3.common.policies.ActorCriticPolicy + :members: + :noindex: + +.. autoclass:: CnnPolicy + :members: + +.. autoclass:: stable_baselines3.common.policies.ActorCriticCnnPolicy + :members: + :noindex: diff --git a/docs/modules/ddpg.rst b/docs/modules/ddpg.rst index dd74f3a7d5..8add6982a5 100644 --- a/docs/modules/ddpg.rst +++ b/docs/modules/ddpg.rst @@ -98,7 +98,9 @@ DDPG Policies :members: :inherited-members: +.. autoclass:: stable_baselines3.td3.policies.TD3Policy + :members: + :noindex: -.. .. autoclass:: CnnPolicy -.. :members: -.. :inherited-members: +.. autoclass:: CnnPolicy + :members: diff --git a/docs/modules/dqn.rst b/docs/modules/dqn.rst index 034e5b81da..ca9ccca322 100644 --- a/docs/modules/dqn.rst +++ b/docs/modules/dqn.rst @@ -90,5 +90,9 @@ DQN Policies :members: :inherited-members: +.. autoclass:: stable_baselines3.dqn.policies.DQNPolicy + :members: + :noindex: + .. autoclass:: CnnPolicy :members: diff --git a/docs/modules/her.rst b/docs/modules/her.rst index 31d1fac3a1..355b36d496 100644 --- a/docs/modules/her.rst +++ b/docs/modules/her.rst @@ -22,7 +22,7 @@ It creates "virtual" transitions by relabeling transitions (changing the desired .. warning:: For performance reasons, the maximum number of steps per episodes must be specified. - In most cases, it will be inferred if you specify ```max_episode_steps`` when registering the environment + In most cases, it will be inferred if you specify ``max_episode_steps`` when registering the environment or if you use a ``gym.wrappers.TimeLimit`` (and ``env.spec`` is not None). Otherwise, you can directly pass ``max_episode_length`` to the model constructor diff --git a/docs/modules/ppo.rst b/docs/modules/ppo.rst index 038149d950..eca3e1b699 100644 --- a/docs/modules/ppo.rst +++ b/docs/modules/ppo.rst @@ -80,3 +80,22 @@ Parameters .. autoclass:: PPO :members: :inherited-members: + + +PPO Policies +------------- + +.. autoclass:: MlpPolicy + :members: + :inherited-members: + +.. autoclass:: stable_baselines3.common.policies.ActorCriticPolicy + :members: + :noindex: + +.. autoclass:: CnnPolicy + :members: + +.. autoclass:: stable_baselines3.common.policies.ActorCriticCnnPolicy + :members: + :noindex: diff --git a/docs/modules/sac.rst b/docs/modules/sac.rst index 6d559d4183..7b37974c93 100644 --- a/docs/modules/sac.rst +++ b/docs/modules/sac.rst @@ -82,7 +82,7 @@ Example obs = env.reset() while True: - action, _states = model.predict(obs) + action, _states = model.predict(obs, deterministic=True) obs, reward, done, info = env.step(action) env.render() if done: @@ -104,6 +104,9 @@ SAC Policies :members: :inherited-members: -.. .. autoclass:: CnnPolicy -.. :members: -.. :inherited-members: +.. autoclass:: stable_baselines3.sac.policies.SACPolicy + :members: + :noindex: + +.. autoclass:: CnnPolicy + :members: diff --git a/docs/modules/td3.rst b/docs/modules/td3.rst index 912fc1b97c..fbe6aabd50 100644 --- a/docs/modules/td3.rst +++ b/docs/modules/td3.rst @@ -101,7 +101,9 @@ TD3 Policies :members: :inherited-members: +.. autoclass:: stable_baselines3.td3.policies.TD3Policy + :members: + :noindex: -.. .. autoclass:: CnnPolicy -.. :members: -.. :inherited-members: +.. 
autoclass:: CnnPolicy + :members: diff --git a/stable_baselines3/her/her.py b/stable_baselines3/her/her.py index 272e5b56c1..bf7c486356 100644 --- a/stable_baselines3/her/her.py +++ b/stable_baselines3/her/her.py @@ -51,7 +51,13 @@ class HER(BaseAlgorithm): Hindsight Experience Replay (HER) Paper: https://arxiv.org/abs/1707.01495 - WARNING: Requires maximum episode length provided either by the environment or by the user! + .. warning:: + + For performance reasons, the maximum number of steps per episodes must be specified. + In most cases, it will be inferred if you specify ``max_episode_steps`` when registering the environment + or if you use a ``gym.wrappers.TimeLimit`` (and ``env.spec`` is not None). + Otherwise, you can directly pass ``max_episode_length`` to the model constructor + For additional offline algorithm specific arguments please have a look at the corresponding documentation. From 8bb5c7c670b13014402446d1ccea1660aadebd29 Mon Sep 17 00:00:00 2001 From: Antonin RAFFIN Date: Wed, 21 Oct 2020 17:23:49 +0200 Subject: [PATCH 75/81] Remove np.empty + add doc --- stable_baselines3/common/atari_wrappers.py | 2 +- stable_baselines3/her/her_replay_buffer.py | 26 +++++++++++++--------- tests/test_her.py | 19 +++++++++------- 3 files changed, 27 insertions(+), 20 deletions(-) diff --git a/stable_baselines3/common/atari_wrappers.py b/stable_baselines3/common/atari_wrappers.py index 7cc6836aaa..b0c52959bf 100644 --- a/stable_baselines3/common/atari_wrappers.py +++ b/stable_baselines3/common/atari_wrappers.py @@ -34,7 +34,7 @@ def reset(self, **kwargs) -> np.ndarray: else: noops = self.unwrapped.np_random.randint(1, self.noop_max + 1) assert noops > 0 - obs = np.empty(0) + obs = np.zeros(0) for _ in range(noops): obs, _, done, _ = self.env.step(self.noop_action) if done: diff --git a/stable_baselines3/her/her_replay_buffer.py b/stable_baselines3/her/her_replay_buffer.py index e727f2b73a..5ca20eb71b 100644 --- a/stable_baselines3/her/her_replay_buffer.py +++ b/stable_baselines3/her/her_replay_buffer.py @@ -70,7 +70,7 @@ def __init__( "done": (1,), } self.buffer = { - key: np.empty((self.max_episode_stored, self.max_episode_length, *dim), dtype=np.float32) + key: np.zeros((self.max_episode_stored, self.max_episode_length, *dim), dtype=np.float32) for key, dim in input_shape.items() } # Store info dicts are it can be used to compute the reward (e.g. 
continuity cost) @@ -129,7 +129,7 @@ def sample( env: Optional[VecNormalize] = None, online_sampling: bool = True, n_sampled_goal: int = None, - ) -> Union[ReplayBufferSamples, Tuple]: + ) -> Union[ReplayBufferSamples, Tuple[np.ndarray, ...]]: """ :param batch_size: Number of element to sample :param env: Associated gym VecEnv @@ -182,7 +182,7 @@ def _sample_transitions( maybe_vec_env: Optional[VecNormalize], online_sampling: bool = True, n_sampled_goal: int = None, - ) -> Union[ReplayBufferSamples, Tuple]: + ) -> Union[ReplayBufferSamples, Tuple[np.ndarray, ...]]: """ :param batch_size: Number of element to sample :param env: associated gym VecEnv to normalize the observations/rewards @@ -194,6 +194,7 @@ def _sample_transitions( # Select which episodes to use if online_sampling: episode_indices = np.random.randint(0, self.n_episodes_stored, batch_size) + # A subset of the transitions will be relabeled using HER algorithm her_indices = np.arange(batch_size)[: int(self.her_ratio * batch_size)] else: assert maybe_vec_env is None, "Transitions must be stored unnormalized in the replay buffer" @@ -205,27 +206,29 @@ def _sample_transitions( # as real transitions are already stored in the replay buffer her_indices = np.arange(len(episode_indices)) - ep_length = self.episode_lengths[episode_indices] + ep_lengths = self.episode_lengths[episode_indices] + # Special case when using the "future" goal sampling strategy + # we cannot sample all transitions, we have to remove the last timestep if self.goal_selection_strategy == GoalSelectionStrategy.FUTURE: - # restrict the sampling domain when ep_length > 1 + # restrict the sampling domain when ep_lengths > 1 # otherwise filter out the indices - her_indices = her_indices[ep_length[her_indices] > 1] - ep_length[her_indices] -= 1 + her_indices = her_indices[ep_lengths[her_indices] > 1] + ep_lengths[her_indices] -= 1 if online_sampling: # Select which transitions to use - transitions_indices = np.random.randint(ep_length) + transitions_indices = np.random.randint(ep_lengths) else: if her_indices.size == 0: # Episode of one timestep, not enough for using the "future" strategy # no virtual transitions are created in that case - return np.empty(0), np.empty(0), np.empty(0), np.empty(0) + return np.zeros(0), np.zeros(0), np.zeros(0), np.zeros(0) else: # Repeat every transition index n_sampled_goals times # to sample n_sampled_goal per timestep in the episode (only one is stored). # Now with the corrected episode length when using "future" strategy - transitions_indices = np.tile(np.arange(ep_length[0]), n_sampled_goal) + transitions_indices = np.tile(np.arange(ep_lengths[0]), n_sampled_goal) episode_indices = episode_indices[transitions_indices] her_indices = np.arange(len(episode_indices)) @@ -254,6 +257,7 @@ def _sample_transitions( # r_t = reward(s_t, a_t) = reward(next_achieved_goal, desired_goal) # therefore we have to use "next_achieved_goal" and not "achieved_goal" transitions["next_achieved_goal"][her_indices, 0], + # here we use the new desired goal transitions["desired_goal"][her_indices, 0], transitions["info"][her_indices, 0], ) @@ -333,7 +337,7 @@ def n_episodes_stored(self) -> int: def size(self) -> int: """ - :return: The current size of the buffer in transitions. + :return: The current number of transitions in the buffer. 
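The relabeling these comments describe can be sketched outside the buffer with the ``BitFlippingEnv`` used in the tests (a simplified single-transition illustration assuming the standard ``gym.GoalEnv`` interface; the buffer itself does this in a vectorized way via ``env_method``):

from stable_baselines3.common.bit_flipping_env import BitFlippingEnv

env = BitFlippingEnv(n_bits=4)
obs = env.reset()
next_obs, reward, done, info = env.step(env.action_space.sample())
# HER relabeling: pretend the desired goal was what was actually achieved next
new_goal = next_obs["achieved_goal"]
# recompute the reward for the new goal, using next_achieved_goal as explained above
new_reward = env.compute_reward(next_obs["achieved_goal"], new_goal, info)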
""" return int(np.sum(self.episode_lengths)) diff --git a/tests/test_her.py b/tests/test_her.py index a11eb0c7b8..17be564f90 100644 --- a/tests/test_her.py +++ b/tests/test_her.py @@ -125,15 +125,15 @@ def test_save_load(tmp_path, model_class, use_sde, online_sampling): observations = np.array(observations_list) # Get dictionary of current parameters - params = deepcopy(model.model.policy.state_dict()) + params = deepcopy(model.policy.state_dict()) # Modify all parameters to be random values random_params = dict((param_name, th.rand_like(param)) for param_name, param in params.items()) # Update model parameters with the new random values - model.model.policy.load_state_dict(random_params) + model.policy.load_state_dict(random_params) - new_params = model.model.policy.state_dict() + new_params = model.policy.state_dict() # Check that all params are different now for k in params: assert not th.allclose(params[k], new_params[k]), "Parameters did not change as expected." @@ -141,7 +141,7 @@ def test_save_load(tmp_path, model_class, use_sde, online_sampling): params = new_params # get selected actions - selected_actions, _ = model.model.predict(observations, deterministic=True) + selected_actions, _ = model.predict(observations, deterministic=True) # Check model.save(tmp_path / "test_save.zip") @@ -149,14 +149,14 @@ def test_save_load(tmp_path, model_class, use_sde, online_sampling): model = HER.load(str(tmp_path / "test_save.zip"), env=env) # check if params are still the same after load - new_params = model.model.policy.state_dict() + new_params = model.policy.state_dict() # Check that all params are the same as before save load procedure now for key in params: assert th.allclose(params[key], new_params[key]), "Model parameters not the same after save and load." 
# check if model still selects the same actions - new_selected_actions, _ = model.model.predict(observations, deterministic=True) + new_selected_actions, _ = model.predict(observations, deterministic=True) assert np.allclose(selected_actions, new_selected_actions, 1e-4) # check if learn still works @@ -185,13 +185,16 @@ def test_save_load_replay_buffer(tmp_path, online_sampling): n_episodes_rollout=-1, max_episode_length=4, buffer_size=int(2e4), - seed=0, policy_kwargs=dict(net_arch=[64]), ) model.learn(200) old_replay_buffer = deepcopy(model.replay_buffer) model.save_replay_buffer(path) - model.model.replay_buffer = None + del model.model.replay_buffer + + with pytest.raises(AttributeError): + model.replay_buffer + model.load_replay_buffer(path) if online_sampling: From d884f9c591ec622874645bc0667d809c8a69bfa6 Mon Sep 17 00:00:00 2001 From: Antonin RAFFIN Date: Wed, 21 Oct 2020 18:36:57 +0200 Subject: [PATCH 76/81] Fix loading --- stable_baselines3/her/her.py | 38 +++++++++++++++++++++++++++--------- tests/test_her.py | 5 +++++ 2 files changed, 34 insertions(+), 9 deletions(-) diff --git a/stable_baselines3/her/her.py b/stable_baselines3/her/her.py index bf7c486356..fd4dd4692d 100644 --- a/stable_baselines3/her/her.py +++ b/stable_baselines3/her/her.py @@ -87,16 +87,22 @@ def __init__( **kwargs, ): - super(HER, self).__init__(policy=BasePolicy, env=env, policy_base=BasePolicy, learning_rate=3e-4) + # we will use the policy and learning rate from the model + super(HER, self).__init__(policy=BasePolicy, env=env, policy_base=BasePolicy, learning_rate=0.0) + del self.policy, self.learning_rate if self.get_vec_normalize_env() is not None: assert online_sampling, "You must pass `online_sampling=True` if you want to use `VecNormalize` with `HER`" + _init_setup_model = kwargs.get("_init_setup_model", True) + if "_init_setup_model" in kwargs: + del kwargs["_init_setup_model"] # model initialization self.model_class = model_class self.model = model_class( policy=policy, env=self.env, + _init_setup_model=False, # pytype: disable=wrong-keyword-args *args, **kwargs, # pytype: disable=wrong-keyword-args ) @@ -122,7 +128,8 @@ def __init__( self.her_ratio = 1 - (1.0 / (self.n_sampled_goal + 1)) # maximum steps in episode self.max_episode_length = get_time_limit(self.env, max_episode_length) - # storage for transitions of current episode + # storage for transitions of current episode for offline sampling + # for online sampling, it replaces the "classic" replay buffer completely her_buffer_size = self.buffer_size if online_sampling else self.max_episode_length self._episode_storage = HerReplayBuffer( self.env, @@ -136,15 +143,17 @@ def __init__( self.her_ratio, # pytype: disable=wrong-arg-types ) - # assign episode storage to replay buffer when using online HER sampling - if self.online_sampling: - self.model.replay_buffer = self._episode_storage - # counter for steps in episode self.episode_steps = 0 + if _init_setup_model: + self._setup_model() + def _setup_model(self) -> None: self.model._setup_model() + # assign episode storage to replay buffer when using online HER sampling + if self.online_sampling: + self.model.replay_buffer = self._episode_storage def predict( self, @@ -466,10 +475,20 @@ def load( if "env" in data: env = data["env"] - kwargs = {} if "use_sde" in data and data["use_sde"]: kwargs["use_sde"] = True + # Keys that cannot be changed + for key in {"model_class", "online_sampling", "max_episode_length"}: + if key in kwargs: + del kwargs[key] + + # Keys that can be changed + for key in 
{"n_sampled_goal", "goal_selection_strategy"}: + if key in kwargs: + data[key] = kwargs[key] # pytype: disable=unsupported-operands + del kwargs[key] + # noinspection PyArgumentList her_model = cls( policy=data["policy_class"], @@ -480,13 +499,14 @@ def load( online_sampling=data["online_sampling"], max_episode_length=data["max_episode_length"], policy_kwargs=data["policy_kwargs"], - _init_setup_model=True, # pytype: disable=not-instantiable,wrong-keyword-args + _init_setup_model=False, # pytype: disable=not-instantiable,wrong-keyword-args **kwargs, ) # load parameters her_model.model.__dict__.update(data) - her_model.__dict__.update(kwargs) + her_model.model.__dict__.update(kwargs) + her_model._setup_model() her_model._total_timesteps = her_model.model._total_timesteps her_model.num_timesteps = her_model.model.num_timesteps diff --git a/tests/test_her.py b/tests/test_her.py index 17be564f90..6c4fe0ef16 100644 --- a/tests/test_her.py +++ b/tests/test_her.py @@ -162,6 +162,11 @@ def test_save_load(tmp_path, model_class, use_sde, online_sampling): # check if learn still works model.learn(total_timesteps=300) + # Test that the change of parameters works + model = HER.load(str(tmp_path / "test_save.zip"), env=env, verbose=3, learning_rate=2.0) + assert model.model.learning_rate == 2.0 + assert model.verbose == 3 + # clear file from os os.remove(tmp_path / "test_save.zip") From 0ba127270f3170e5ad60eac8027b4ac189d95207 Mon Sep 17 00:00:00 2001 From: Megan Klaiber Date: Wed, 21 Oct 2020 22:46:58 +0200 Subject: [PATCH 77/81] Updated loading replay buffer --- stable_baselines3/her/her.py | 27 +++++++++++++++++++++++++-- tests/test_her.py | 13 +++++++++---- 2 files changed, 34 insertions(+), 6 deletions(-) diff --git a/stable_baselines3/her/her.py b/stable_baselines3/her/her.py index fd4dd4692d..c15453bd3c 100644 --- a/stable_baselines3/her/her.py +++ b/stable_baselines3/her/her.py @@ -1,5 +1,6 @@ import io import pathlib +import warnings from typing import Any, Iterable, List, Optional, Tuple, Type, Union import numpy as np @@ -526,15 +527,37 @@ def load( her_model.model.policy.reset_noise() # pytype: disable=attribute-error return her_model - def load_replay_buffer(self, path: Union[str, pathlib.Path, io.BufferedIOBase]) -> None: + def load_replay_buffer( + self, path: Union[str, pathlib.Path, io.BufferedIOBase], truncate_last_trajectory: bool = True + ) -> None: """ Load a replay buffer from a pickle file and set environment for replay buffer (only online sampling). :param path: Path to the pickled replay buffer. + :param truncate_last_trajectory: + If set to ``True`` we assume that the last trajectory in the replay buffer was finished. + If it is set to ``False`` we assume it is the same trajectory where we continue. """ self.model.load_replay_buffer(path=path) if self.online_sampling: # set environment self.replay_buffer.set_env(self.env) - self.replay_buffer.current_idx = 0 + + # truncate interrupted episode + if truncate_last_trajectory: + warnings.warn( + "The last trajectory in the replay buffer will be truncated, " + "You should use `truncate_last_trajectory=False` to avoid that issue." 
+ ) + # get current episode and transition index + pos = self.replay_buffer.pos + current_idx = self.replay_buffer.current_idx + # set episode length for current episode + self.replay_buffer.episode_lengths[pos] = current_idx + # set done = True for current episode + self.replay_buffer.buffer["done"][pos][current_idx] = np.array([True], dtype=np.float32) + # reset current transition index + self.replay_buffer.current_idx = 0 + # increment episode counter + self.replay_buffer.pos += 1 diff --git a/tests/test_her.py b/tests/test_her.py index 6c4fe0ef16..c21a3d78d4 100644 --- a/tests/test_her.py +++ b/tests/test_her.py @@ -171,8 +171,8 @@ def test_save_load(tmp_path, model_class, use_sde, online_sampling): os.remove(tmp_path / "test_save.zip") -@pytest.mark.parametrize("online_sampling", [False, True]) -def test_save_load_replay_buffer(tmp_path, online_sampling): +@pytest.mark.parametrize("online_sampling, truncate_last_trajectory", [(False, None), (True, True), (True, False)]) +def test_save_load_replay_buffer(tmp_path, online_sampling, truncate_last_trajectory): """ Test if 'save_replay_buffer' and 'load_replay_buffer' works correctly """ @@ -200,7 +200,7 @@ def test_save_load_replay_buffer(tmp_path, online_sampling): with pytest.raises(AttributeError): model.replay_buffer - model.load_replay_buffer(path) + model.load_replay_buffer(path, truncate_last_trajectory) if online_sampling: n_episodes_stored = model.replay_buffer.n_episodes_stored @@ -218,8 +218,10 @@ def test_save_load_replay_buffer(tmp_path, online_sampling): assert np.allclose( old_replay_buffer.buffer["reward"][:n_episodes_stored], model.replay_buffer.buffer["reward"][:n_episodes_stored] ) + # we might change the last done of the last trajectory so we don't compare it assert np.allclose( - old_replay_buffer.buffer["done"][:n_episodes_stored], model.replay_buffer.buffer["done"][:n_episodes_stored] + old_replay_buffer.buffer["done"][: n_episodes_stored - 1], + model.replay_buffer.buffer["done"][: n_episodes_stored - 1], ) else: assert np.allclose(old_replay_buffer.observations, model.replay_buffer.observations) @@ -227,6 +229,9 @@ def test_save_load_replay_buffer(tmp_path, online_sampling): assert np.allclose(old_replay_buffer.rewards, model.replay_buffer.rewards) assert np.allclose(old_replay_buffer.dones, model.replay_buffer.dones) + # test if continuing training works properly + model.learn(200) + def test_get_max_episode_length(): dict_env = DummyVecEnv([lambda: BitFlippingEnv()]) From 403421784e1dcf8ddaa931c53180ea1f64ee0ac0 Mon Sep 17 00:00:00 2001 From: Antonin RAFFIN Date: Thu, 22 Oct 2020 10:56:47 +0200 Subject: [PATCH 78/81] Separate online and offline sampling + bug fixes --- stable_baselines3/her/her.py | 29 ++++++++-------- stable_baselines3/her/her_replay_buffer.py | 39 ++++++++++++++++------ tests/test_her.py | 20 +++++++++-- 3 files changed, 63 insertions(+), 25 deletions(-) diff --git a/stable_baselines3/her/her.py b/stable_baselines3/her/her.py index c15453bd3c..83b1be4563 100644 --- a/stable_baselines3/her/her.py +++ b/stable_baselines3/her/her.py @@ -380,15 +380,14 @@ def collect_rollouts( def _sample_her_transitions(self) -> None: """ Sample additional goals and store new transitions in replay buffer - when using offline sampling + when using offline sampling. 
""" - # sample goals and get new observations - observations, next_observations, actions, rewards = self._episode_storage.sample( - self.batch_size, - None, # we should store unnormalized transitions, they will be normalized at sampling time - self.online_sampling, - self.n_sampled_goal, + # Sample goals and get new observations + # maybe_vec_env=None as we should store unnormalized transitions, + # they will be normalized at sampling time + observations, next_observations, actions, rewards = self._episode_storage.sample_offline( + n_sampled_goal=self.n_sampled_goal ) # store data in replay buffer @@ -534,9 +533,9 @@ def load_replay_buffer( Load a replay buffer from a pickle file and set environment for replay buffer (only online sampling). :param path: Path to the pickled replay buffer. - :param truncate_last_trajectory: + :param truncate_last_trajectory: Only for online sampling. If set to ``True`` we assume that the last trajectory in the replay buffer was finished. - If it is set to ``False`` we assume it is the same trajectory where we continue. + If it is set to ``False`` we assume that we continue the same trajectory (same episode). """ self.model.load_replay_buffer(path=path) @@ -547,8 +546,9 @@ def load_replay_buffer( # truncate interrupted episode if truncate_last_trajectory: warnings.warn( - "The last trajectory in the replay buffer will be truncated, " - "You should use `truncate_last_trajectory=False` to avoid that issue." + "The last trajectory in the replay buffer will be truncated.\n" + "If you are in the same episode as when the replay buffer was saved,\n" + "you should use `truncate_last_trajectory=False` to avoid that issue." ) # get current episode and transition index pos = self.replay_buffer.pos @@ -556,8 +556,11 @@ def load_replay_buffer( # set episode length for current episode self.replay_buffer.episode_lengths[pos] = current_idx # set done = True for current episode - self.replay_buffer.buffer["done"][pos][current_idx] = np.array([True], dtype=np.float32) + # current_idx was already incremented + self.replay_buffer.buffer["done"][pos][current_idx - 1] = np.array([True], dtype=np.float32) # reset current transition index self.replay_buffer.current_idx = 0 # increment episode counter - self.replay_buffer.pos += 1 + self.replay_buffer.pos = (self.replay_buffer.pos + 1) % self.replay_buffer.max_episode_stored + # update "full" indicator + self.replay_buffer.full = self.replay_buffer.full or self.replay_buffer.pos == 0 diff --git a/stable_baselines3/her/her_replay_buffer.py b/stable_baselines3/her/her_replay_buffer.py index 5ca20eb71b..5bbf1b9774 100644 --- a/stable_baselines3/her/her_replay_buffer.py +++ b/stable_baselines3/her/her_replay_buffer.py @@ -126,19 +126,36 @@ def _get_samples( def sample( self, batch_size: int, - env: Optional[VecNormalize] = None, - online_sampling: bool = True, - n_sampled_goal: int = None, + env: Optional[VecNormalize], ) -> Union[ReplayBufferSamples, Tuple[np.ndarray, ...]]: """ + Sample function for online sampling of HER transition, + this replaces the "regular" replay buffer ``sample()`` + method in the ``train()`` function. + :param batch_size: Number of element to sample :param env: Associated gym VecEnv to normalize the observations/rewards when sampling - :param online_sampling: Using online_sampling for HER or not. - :param n_sampled_goal: Number of sampled goals for replay. (offline sampling) :return: Samples. 
""" - return self._sample_transitions(batch_size, env, online_sampling, n_sampled_goal) + return self._sample_transitions(batch_size, maybe_vec_env=env, online_sampling=True) + + def sample_offline( + self, + n_sampled_goal: Optional[int] = None, + ) -> Union[ReplayBufferSamples, Tuple[np.ndarray, ...]]: + """ + Sample function for offline sampling of HER transition, + in that case, only one episode is used and transitions + are added to the regular replay buffer. + + :param n_sampled_goal: Number of sampled goals for replay + :return: at most(n_sampled_goal * episode_length) HER transitions. + """ + # env=None as we should store unnormalized transitions, they will be normalized at sampling time + return self._sample_transitions( + batch_size=None, maybe_vec_env=None, online_sampling=False, n_sampled_goal=n_sampled_goal + ) def sample_goals( self, @@ -178,13 +195,13 @@ def sample_goals( def _sample_transitions( self, - batch_size: int, + batch_size: Optional[int], maybe_vec_env: Optional[VecNormalize], - online_sampling: bool = True, - n_sampled_goal: int = None, + online_sampling: bool, + n_sampled_goal: Optional[int] = None, ) -> Union[ReplayBufferSamples, Tuple[np.ndarray, ...]]: """ - :param batch_size: Number of element to sample + :param batch_size: Number of element to sample (only used for online sampling) :param env: associated gym VecEnv to normalize the observations/rewards Only valid when using online sampling :param online_sampling: Using online_sampling for HER or not. @@ -193,11 +210,13 @@ def _sample_transitions( """ # Select which episodes to use if online_sampling: + assert batch_size is not None, "No batch_size specified for online sampling of HER transitions" episode_indices = np.random.randint(0, self.n_episodes_stored, batch_size) # A subset of the transitions will be relabeled using HER algorithm her_indices = np.arange(batch_size)[: int(self.her_ratio * batch_size)] else: assert maybe_vec_env is None, "Transitions must be stored unnormalized in the replay buffer" + assert n_sampled_goal is not None, "No n_sampled_goal specified for offline sampling of HER transitions" # Offline sampling: there is only one episode stored episode_length = self.episode_lengths[0] # we sample n_sampled_goal per timestep in the episode (only one is stored). 
diff --git a/tests/test_her.py b/tests/test_her.py index c21a3d78d4..bd2e36cfd1 100644 --- a/tests/test_her.py +++ b/tests/test_her.py @@ -1,5 +1,6 @@ import os import pathlib +import warnings from copy import deepcopy import gym @@ -172,10 +173,14 @@ def test_save_load(tmp_path, model_class, use_sde, online_sampling): @pytest.mark.parametrize("online_sampling, truncate_last_trajectory", [(False, None), (True, True), (True, False)]) -def test_save_load_replay_buffer(tmp_path, online_sampling, truncate_last_trajectory): +def test_save_load_replay_buffer(tmp_path, recwarn, online_sampling, truncate_last_trajectory): """ Test if 'save_replay_buffer' and 'load_replay_buffer' works correctly """ + # remove gym warnings + warnings.filterwarnings(action="ignore", category=DeprecationWarning) + warnings.filterwarnings(action="ignore", category=UserWarning, module="gym") + path = pathlib.Path(tmp_path / "logs/replay_buffer.pkl") path.parent.mkdir(exist_ok=True, parents=True) # to not raise a warning env = BitFlippingEnv(n_bits=4, continuous=True) @@ -200,8 +205,18 @@ def test_save_load_replay_buffer(tmp_path, online_sampling, truncate_last_trajec with pytest.raises(AttributeError): model.replay_buffer + # Check that there is no warning + assert len(recwarn) == 0 + model.load_replay_buffer(path, truncate_last_trajectory) + if truncate_last_trajectory: + assert len(recwarn) == 1 + warning = recwarn.pop(UserWarning) + assert "The last trajectory in the replay buffer will be truncated" in str(warning.message) + else: + assert len(recwarn) == 0 + if online_sampling: n_episodes_stored = model.replay_buffer.n_episodes_stored assert np.allclose( @@ -230,7 +245,8 @@ def test_save_load_replay_buffer(tmp_path, online_sampling, truncate_last_trajec assert np.allclose(old_replay_buffer.dones, model.replay_buffer.dones) # test if continuing training works properly - model.learn(200) + reset_num_timesteps = False if truncate_last_trajectory is False else True + model.learn(200, reset_num_timesteps=reset_num_timesteps) def test_get_max_episode_length(): From aacd9363b61480eb3140760a8366a536b38bcad6 Mon Sep 17 00:00:00 2001 From: Antonin RAFFIN Date: Thu, 22 Oct 2020 11:02:23 +0200 Subject: [PATCH 79/81] Update tensorboard log name --- stable_baselines3/her/her.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/stable_baselines3/her/her.py b/stable_baselines3/her/her.py index 83b1be4563..e80035e6e9 100644 --- a/stable_baselines3/her/her.py +++ b/stable_baselines3/her/her.py @@ -174,7 +174,7 @@ def learn( eval_env: Optional[GymEnv] = None, eval_freq: int = -1, n_eval_episodes: int = 5, - tb_log_name: str = "run", + tb_log_name: str = "HER", eval_log_path: Optional[str] = None, reset_num_timesteps: bool = True, ) -> BaseAlgorithm: From 940ee2c0c9bef10689dbf06b66b997046d814814 Mon Sep 17 00:00:00 2001 From: Antonin RAFFIN Date: Thu, 22 Oct 2020 11:08:48 +0200 Subject: [PATCH 80/81] Version bump --- docs/misc/changelog.rst | 2 +- stable_baselines3/version.txt | 2 +- tests/test_her.py | 1 + 3 files changed, 3 insertions(+), 2 deletions(-) diff --git a/docs/misc/changelog.rst b/docs/misc/changelog.rst index cf9ef5960d..1d6b2a4723 100644 --- a/docs/misc/changelog.rst +++ b/docs/misc/changelog.rst @@ -4,7 +4,7 @@ Changelog ========== -Pre-Release 0.10.0a0 (WIP) +Pre-Release 0.10.0a1 (WIP) ------------------------------ Breaking Changes: diff --git a/stable_baselines3/version.txt b/stable_baselines3/version.txt index 37f1777fc3..8dabd1f602 100644 --- a/stable_baselines3/version.txt +++ 
b/stable_baselines3/version.txt @@ -1 +1 @@ -0.10.0a0 +0.10.0a1 diff --git a/tests/test_her.py b/tests/test_her.py index bd2e36cfd1..09d1a78580 100644 --- a/tests/test_her.py +++ b/tests/test_her.py @@ -196,6 +196,7 @@ def test_save_load_replay_buffer(tmp_path, recwarn, online_sampling, truncate_la max_episode_length=4, buffer_size=int(2e4), policy_kwargs=dict(net_arch=[64]), + seed=0, ) model.learn(200) old_replay_buffer = deepcopy(model.replay_buffer) From 3bb19a7618180ddf0838da0886f6f61792b6f9fe Mon Sep 17 00:00:00 2001 From: Antonin RAFFIN Date: Thu, 22 Oct 2020 11:33:33 +0200 Subject: [PATCH 81/81] Bug fix for special case --- stable_baselines3/her/her.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/stable_baselines3/her/her.py b/stable_baselines3/her/her.py index e80035e6e9..658abc6fe7 100644 --- a/stable_baselines3/her/her.py +++ b/stable_baselines3/her/her.py @@ -542,9 +542,11 @@ def load_replay_buffer( if self.online_sampling: # set environment self.replay_buffer.set_env(self.env) + # If we are at the start of an episode, no need to truncate + current_idx = self.replay_buffer.current_idx # truncate interrupted episode - if truncate_last_trajectory: + if truncate_last_trajectory and current_idx > 0: warnings.warn( "The last trajectory in the replay buffer will be truncated.\n" "If you are in the same episode as when the replay buffer was saved,\n" @@ -552,7 +554,6 @@ def load_replay_buffer( ) # get current episode and transition index pos = self.replay_buffer.pos - current_idx = self.replay_buffer.current_idx # set episode length for current episode self.replay_buffer.episode_lengths[pos] = current_idx # set done = True for current episode
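Taken together, the patches above converge on the following usage pattern; a small end-to-end sketch mirroring the tests (parameter values and file names are illustrative, not recommended settings):

from stable_baselines3 import HER, SAC
from stable_baselines3.common.bit_flipping_env import BitFlippingEnv

env = BitFlippingEnv(n_bits=4, continuous=True)

model = HER(
    "MlpPolicy",
    env,
    SAC,
    n_sampled_goal=4,
    goal_selection_strategy="future",
    online_sampling=True,
    # the env is not wrapped in a gym.wrappers.TimeLimit, so the episode length
    # cannot be inferred and must be passed explicitly
    max_episode_length=4,
    buffer_size=int(2e4),
    verbose=0,
)
model.learn(200)

# save the agent and its (online) HER replay buffer
model.save("her_bit_flipping")
model.save_replay_buffer("her_replay_buffer.pkl")

# restore and continue training
model = HER.load("her_bit_flipping", env=env)
# truncate_last_trajectory=True assumes the last stored episode was interrupted
model.load_replay_buffer("her_replay_buffer.pkl", truncate_last_trajectory=True)
model.learn(200, reset_num_timesteps=False)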