From f0e03de6d4a1ba4660953ad5611b3902e39741fb Mon Sep 17 00:00:00 2001 From: Megan Klaiber Date: Tue, 21 Jul 2020 00:44:42 +0200 Subject: [PATCH 01/81] Added working her version, Online sampling is missing. --- stable_baselines3/her/her.py | 419 +++++++++++++++++++++++++++ stable_baselines3/her/obs_wrapper.py | 78 +++++ tests/test_her.py | 106 +++++++ 3 files changed, 603 insertions(+) create mode 100644 stable_baselines3/her/her.py create mode 100644 stable_baselines3/her/obs_wrapper.py create mode 100644 tests/test_her.py diff --git a/stable_baselines3/her/her.py b/stable_baselines3/her/her.py new file mode 100644 index 0000000000..ae3683b404 --- /dev/null +++ b/stable_baselines3/her/her.py @@ -0,0 +1,419 @@ +from enum import Enum +from inspect import signature +from typing import Any, Callable, Dict, Optional, Type, Union + +import numpy as np +import torch as th + +from stable_baselines3.common.buffers import ReplayBuffer +from stable_baselines3.common.callbacks import BaseCallback +from stable_baselines3.common.noise import ActionNoise +from stable_baselines3.common.off_policy_algorithm import OffPolicyAlgorithm +from stable_baselines3.common.policies import BasePolicy +from stable_baselines3.common.type_aliases import GymEnv, MaybeCallback, RolloutReturn +from stable_baselines3.common.vec_env import VecEnv +from stable_baselines3.her.obs_wrapper import ObsWrapper + + +class GoalSelectionStrategy(Enum): + """ + The strategies for selecting new goals when + creating artificial transitions. + """ + + # Select a goal that was achieved + # after the current step, in the same episode + FUTURE = 0 + # Select the goal that was achieved + # at the end of the episode + FINAL = 1 + # Select a goal that was achieved in the episode + EPISODE = 2 + # Select a goal that was achieved + # at some point in the training procedure + # (and that is present in the replay buffer) + RANDOM = 3 + + +# For convenience +# that way, we can use string to select a strategy +KEY_TO_GOAL_STRATEGY = { + "future": GoalSelectionStrategy.FUTURE, + "final": GoalSelectionStrategy.FINAL, + "episode": GoalSelectionStrategy.EPISODE, + "random": GoalSelectionStrategy.RANDOM, +} + + +class HER(OffPolicyAlgorithm): + """ + Hindsight Experience Replay (HER) + + :param policy: (BasePolicy) The policy model to use. + :param env: (VecEnv) The environment to learn from. + :param model: (OffPolicyAlgorithm) Off policy model which will be used with hindsight experience replay. (SAC, TD3) + :param n_goals: (int) Number of sampled goals for replay. + :param goal_strategy: (GoalSelectionStrategy or str) Strategy for sampling goals for replay. + One of ['episode', 'final', 'future', 'random'] + :param learning_rate: (float or callable) learning rate for the optimizer, + it can be a function of the current progress remaining (from 1 to 0) + :param buffer_size: (int) size of the replay buffer + :param learning_starts: (int) how many steps of the model to collect transitions for before learning starts + :param batch_size: (int) Minibatch size for each gradient update + :param tau: (float) the soft update coefficient ("Polyak update", between 0 and 1) + :param gamma: (float) the discount factor + :param train_freq: (int) Update the model every ``train_freq`` steps. + :param gradient_steps: (int) How many gradient update after each step + :param n_episodes_rollout: (int) Update the model every ``n_episodes_rollout`` episodes. 
+ Note that this cannot be used at the same time as ``train_freq`` + :param action_noise: (ActionNoise) the action noise type (None by default), this can help + for hard exploration problem. Cf common.noise for the different action noise type. + :param optimize_memory_usage: (bool) Enable a memory efficient variant of the replay buffer + at a cost of more complexity. + See https://github.com/DLR-RM/stable-baselines3/issues/37#issuecomment-637501195 + :param policy_kwargs: Additional arguments to be passed to the policy on creation + :param tensorboard_log: (str) the log location for tensorboard (if None, no logging) + :param verbose: The verbosity level: 0 none, 1 training information, 2 debug + :param device: Device on which the code should run. + By default, it will try to use a Cuda compatible device and fallback to cpu + if it is not possible. + :param support_multi_env: Whether the algorithm supports training + with multiple environments (as in A2C) + :param create_eval_env: Whether to create a second environment that will be + used for evaluating the agent periodically. (Only available when passing string for the environment) + :param monitor_wrapper: When creating an environment, whether to wrap it + or not in a Monitor wrapper. + :param seed: Seed for the pseudo random generators + :param use_sde: Whether to use State Dependent Exploration (SDE) + instead of action noise exploration (default: False) + :param sde_sample_freq: Sample a new noise matrix every n steps when using gSDE + Default: -1 (only sample at the beginning of the rollout) + :param use_sde_at_warmup: (bool) Whether to use gSDE instead of uniform sampling + during the warm up phase (before learning starts) + :param sde_support: (bool) Whether the model support gSDE or not + """ + + def __init__( + self, + policy: Type[BasePolicy], + env: VecEnv, + model: Type[OffPolicyAlgorithm], + n_goals: int = 5, + goal_strategy: Union[GoalSelectionStrategy, str] = "final", + learning_rate: Union[float, Callable] = 3e-4, + buffer_size: int = int(1e6), + learning_starts: int = 100, + batch_size: int = 256, + tau: float = 0.005, + gamma: float = 0.99, + train_freq: int = 1, + gradient_steps: int = 1, + n_episodes_rollout: int = -1, + action_noise: Optional[ActionNoise] = None, + optimize_memory_usage: bool = False, + policy_kwargs: Dict[str, Any] = None, + tensorboard_log: Optional[str] = None, + verbose: int = 0, + device: Union[th.device, str] = "auto", + support_multi_env: bool = False, + create_eval_env: bool = False, + monitor_wrapper: bool = True, + seed: Optional[int] = None, + use_sde: bool = False, + sde_sample_freq: int = -1, + use_sde_at_warmup: bool = False, + sde_support: bool = True, + *args, + **kwargs + ): + + if isinstance(goal_strategy, str): + self.goal_strategy = KEY_TO_GOAL_STRATEGY[goal_strategy.lower()] + else: + self.goal_strategy = goal_strategy + + assert isinstance( + self.goal_strategy, GoalSelectionStrategy + ), "Invalid goal selection strategy," "please use one of {}".format(list(GoalSelectionStrategy)) + + self.env = ObsWrapper(env) + + # get arguments for the model initialization + model_signature = signature(model.__init__) + arguments = locals() + model_init_dict = { + key: arguments[key] + for key in model_signature.parameters.keys() + if key in arguments and key != "self" and key != "env" + } + + super(HER, self).__init__( + policy, + self.env, + BasePolicy, + learning_rate, + buffer_size, + learning_starts, + batch_size, + tau, + gamma, + train_freq, + gradient_steps, + 
n_episodes_rollout, + action_noise, + optimize_memory_usage, + policy_kwargs, + tensorboard_log, + verbose, + device, + support_multi_env, + create_eval_env, + monitor_wrapper, + seed, + use_sde, + sde_sample_freq, + use_sde_at_warmup, + sde_support, + ) + + # model initialization + self.model = model(env=self.env, **model_init_dict, **kwargs) + + # storage for transitions of current episode + self.episode_storage = [] + self.n_goals = n_goals + + def learn( + self, + total_timesteps: int, + callback: MaybeCallback = None, + log_interval: int = 4, + eval_env: Optional[GymEnv] = None, + eval_freq: int = -1, + n_eval_episodes: int = 5, + tb_log_name: str = "run", + eval_log_path: Optional[str] = None, + reset_num_timesteps: bool = True, + ) -> "OffPolicyAlgorithm": + + total_timesteps, callback = self.model._setup_learn( + total_timesteps, eval_env, callback, eval_freq, n_eval_episodes, eval_log_path, reset_num_timesteps, tb_log_name + ) + + callback.on_training_start(locals(), globals()) + + while self.model.num_timesteps < total_timesteps: + + rollout = self.collect_rollouts( + self.env, + n_episodes=self.model.n_episodes_rollout, + n_steps=self.model.train_freq, + action_noise=self.model.action_noise, + callback=callback, + learning_starts=self.model.learning_starts, + replay_buffer=self.model.replay_buffer, + log_interval=log_interval, + ) + + if rollout.continue_training is False: + break + + if self.model.num_timesteps > 0 and self.model.num_timesteps > self.model.learning_starts: + # If no `gradient_steps` is specified, + # do as many gradients steps as steps performed during the rollout + gradient_steps = self.gradient_steps if self.gradient_steps > 0 else rollout.episode_timesteps + self.train(batch_size=self.batch_size, gradient_steps=gradient_steps) + + callback.on_training_end() + + return self + + def collect_rollouts( + self, + env: VecEnv, + callback: BaseCallback, + n_episodes: int = 1, + n_steps: int = -1, + action_noise: Optional[ActionNoise] = None, + learning_starts: int = 0, + replay_buffer: Optional[ReplayBuffer] = None, + log_interval: Optional[int] = None, + ) -> RolloutReturn: + """ + Collect experiences and store them into a ReplayBuffer. + + :param env: (VecEnv) The training environment + :param callback: (BaseCallback) Callback that will be called at each step + (and at the beginning and end of the rollout) + :param n_episodes: (int) Number of episodes to use to collect rollout data + You can also specify a ``n_steps`` instead + :param n_steps: (int) Number of steps to use to collect rollout data + You can also specify a ``n_episodes`` instead. + :param action_noise: (Optional[ActionNoise]) Action noise that will be used for exploration + Required for deterministic policy (e.g. TD3). This can also be used + in addition to the stochastic policy for SAC. + :param learning_starts: (int) Number of steps before learning for the warm-up phase. 
+ :param replay_buffer: (ReplayBuffer) + :param log_interval: (int) Log data every ``log_interval`` episodes + :return: (RolloutReturn) + """ + episode_rewards, total_timesteps = [], [] + total_steps, total_episodes = 0, 0 + + assert isinstance(env, VecEnv), "You must pass a VecEnv" + assert env.num_envs == 1, "OffPolicyAlgorithm only support single environment" + + if self.use_sde: + self.model.actor.reset_noise() + + callback.on_rollout_start() + continue_training = True + + while total_steps < n_steps or total_episodes < n_episodes: + done = False + episode_reward, episode_timesteps = 0.0, 0 + + while not done: + # concatenate observation and (desired) goal + observation = self.model._last_obs + self.model._last_obs = np.concatenate([observation["observation"], observation["desired_goal"]], axis=1) + + if self.use_sde and self.sde_sample_freq > 0 and total_steps % self.sde_sample_freq == 0: + # Sample a new noise matrix + self.model.actor.reset_noise() + + # Select action randomly or according to policy + action, buffer_action = self.model._sample_action(learning_starts, action_noise) + + # Rescale and perform action + new_obs, reward, done, infos = env.step(action) + + # Only stop training if return value is False, not when it is None. + if callback.on_step() is False: + return RolloutReturn(0.0, total_steps, total_episodes, continue_training=False) + + episode_reward += reward + + # Retrieve reward and episode length if using Monitor wrapper + self.model._update_info_buffer(infos, done) + + # Store episode in episode storage + if replay_buffer is not None: + # Store only the unnormalized version + if self.model._vec_normalize_env is not None: + new_obs_ = self.model._vec_normalize_env.get_original_obs() + reward_ = self.model._vec_normalize_env.get_original_reward() + else: + # Avoid changing the original ones + self.model._last_original_obs, new_obs_, reward_ = observation, new_obs, reward + + # add current transition to episode storage + self.episode_storage.append((self.model._last_original_obs, buffer_action, reward_, new_obs_, done)) + + self.model._last_obs = new_obs + # Save the unnormalized observation + if self.model._vec_normalize_env is not None: + self.model._last_original_obs = new_obs_ + + self.model.num_timesteps += 1 + episode_timesteps += 1 + total_steps += 1 + self.model._update_current_progress_remaining(self.model.num_timesteps, self.model._total_timesteps) + + # For DQN, check if the target network should be updated + # and update the exploration schedule + # For SAC/TD3, the update is done as the same time as the gradient update + # see https://github.com/hill-a/stable-baselines/issues/900 + self.model._on_step() + + if 0 < n_steps <= total_steps: + break + + if done: + # store episode in replay buffer + self.store_transitions() + # clear storage for current episode + self.episode_storage = [] + + total_episodes += 1 + self.model._episode_num += 1 + episode_rewards.append(episode_reward) + total_timesteps.append(episode_timesteps) + + if action_noise is not None: + action_noise.reset() + + # Log training infos + if log_interval is not None and self.model._episode_num % log_interval == 0: + self.model._dump_logs() + + mean_reward = np.mean(episode_rewards) if total_episodes > 0 else 0.0 + + callback.on_rollout_end() + + return RolloutReturn(mean_reward, total_steps, total_episodes, continue_training) + + def train(self, gradient_steps: int, batch_size: int) -> None: + self.model.train(gradient_steps=gradient_steps, batch_size=batch_size) + + def 
sample_goals(self, sample_idx: int) -> Union[np.ndarray, None]: + """ + Sample a goal based on goal_strategy. + + :param sample_idx: (int) Index of current transition. + :return: (np.ndarray or None) Return sampled goal. + """ + if self.goal_strategy == GoalSelectionStrategy.FINAL: + # replay with final state of current episode + return self.episode_storage[-1][0]["achieved_goal"] + elif self.goal_strategy == GoalSelectionStrategy.FUTURE: + # replay with random state which comes from the same episode and was observed after current transition + # we have no transition after last transition of episode + if (sample_idx + 1) < len(self.episode_storage): + index = np.random.choice(np.arange(sample_idx + 1, len(self.episode_storage))) + return self.episode_storage[index][0]["achieved_goal"] + elif self.goal_strategy == GoalSelectionStrategy.EPISODE: + # replay with random state which comes from the same episode as current transition + index = np.random.choice(np.arange(len(self.episode_storage))) + return self.episode_storage[index][0]["achieved_goal"] + elif self.goal_strategy == GoalSelectionStrategy.RANDOM: + # replay with random state from the entire replay buffer + index = np.random.choice(np.arange(self.model.replay_buffer.size())) + obs = self.model.replay_buffer.observations[index] + # get only the observation part + obs_array = obs[:, : self.env.obs_dim] + return obs_array + else: + raise ValueError("Strategy for sampling goals not supported!") + + def store_transitions(self) -> None: + """ + Store current episode in replay buffer. Sample additional goals and store new transitions in replay buffer. + """ + + # iterate over current episodes transitions + for idx, trans in enumerate(self.episode_storage): + + observation, action, reward, new_observation, done = trans + + # concatenate observation with (desired) goal + obs = np.concatenate([observation["observation"], observation["desired_goal"]], axis=1) + new_obs = np.concatenate([new_observation["observation"], new_observation["desired_goal"]], axis=1) + + # store data in replay buffer + self.model.replay_buffer.add(obs, new_obs, action, reward, done) + + # sample set of additional goals + sampled_goals = [sample for sample in (self.sample_goals(idx) for i in range(self.n_goals)) if sample is not None] + + # iterate over sampled goals and store new transitions in replay buffer + for goal in sampled_goals: + # compute new reward with new goal + new_reward = self.env.env_method("compute_reward", new_observation["achieved_goal"], goal, None) + + # concatenate observation with (desired) goal + obs = np.concatenate([observation["observation"], goal], axis=1) + new_obs = np.concatenate([new_observation["observation"], goal], axis=1) + + # store data in replay buffer + self.model.replay_buffer.add(obs, new_obs, action, new_reward, done) diff --git a/stable_baselines3/her/obs_wrapper.py b/stable_baselines3/her/obs_wrapper.py new file mode 100644 index 0000000000..e59f40f939 --- /dev/null +++ b/stable_baselines3/her/obs_wrapper.py @@ -0,0 +1,78 @@ +from typing import List, Optional, Sequence, Union + +import numpy as np +from gym import spaces + +from stable_baselines3.common.vec_env import VecEnv + + +class ObsWrapper(VecEnv): + """ + Wrapper for a VecEnv which overrides the observation space for Hindsight Experience Replay to support dict observations. + + :param env: (VecEnv) The vectorized environment to wrap. 
+ """ + + def __init__(self, venv: VecEnv): + super(ObsWrapper, self).__init__( + num_envs=venv.num_envs, observation_space=venv.observation_space, action_space=venv.action_space + ) + + self.venv = venv + + self.spaces = list(venv.observation_space.spaces.values()) + + # get dimensions of observation and goal + if isinstance(self.spaces[0], spaces.Discrete): + self.obs_dim = 1 + self.goal_dim = 1 + else: + goal_space_shape = venv.observation_space.spaces["achieved_goal"].shape + self.obs_dim = venv.observation_space.spaces["observation"].shape[0] + self.goal_dim = goal_space_shape[0] + + # new observation space with concatenated observation and (desired) goal + # for the different types of spaces + if isinstance(self.spaces[0], spaces.Box): + low_values = np.concatenate( + [venv.observation_space["observation"].low, venv.observation_space["desired_goal"].low] + ) + high_values = np.concatenate( + [venv.observation_space["observation"].high, venv.observation_space["desired_goal"].high] + ) + self.observation_space = spaces.Box(low_values, high_values, dtype=np.float32) + elif isinstance(self.spaces[0], spaces.MultiBinary): + total_dim = self.obs_dim + self.goal_dim + self.observation_space = spaces.MultiBinary(total_dim) + elif isinstance(self.spaces[0], spaces.Discrete): + dimensions = [venv.observation_space.spaces["observation"].n, venv.observation_space.spaces["desired_goal"].n] + self.observation_space = spaces.MultiDiscrete(dimensions) + else: + raise NotImplementedError("{} space is not supported".format(type(self.spaces[0]))) + + def reset(self): + return self.venv.reset() + + def step_async(self, actions): + self.venv.step_async(actions) + + def step_wait(self): + return self.venv.step_wait() + + def close(self): + return self.venv.close() + + def get_attr(self, attr_name, indices=None): + return self.venv.get_attr(attr_name, indices) + + def set_attr(self, attr_name, value, indices=None): + return self.venv.set_attr(attr_name, value, indices) + + def env_method(self, method_name, *method_args, indices=None, **method_kwargs): + return self.venv.env_method(method_name, *method_args, indices=indices, **method_kwargs) + + def get_images(self) -> Sequence[np.ndarray]: + return self.venv.get_images() + + def seed(self, seed: Optional[int] = None) -> List[Union[None, int]]: + return self.venv.seed(seed) diff --git a/tests/test_her.py b/tests/test_her.py new file mode 100644 index 0000000000..96e36c654e --- /dev/null +++ b/tests/test_her.py @@ -0,0 +1,106 @@ +import numpy as np +import pytest +import torch as th + +from stable_baselines3 import TD3, SAC, DDPG +from stable_baselines3.common.bit_flipping_env import BitFlippingEnv +from stable_baselines3.common.noise import OrnsteinUhlenbeckActionNoise +from stable_baselines3.common.vec_env import DummyVecEnv +from stable_baselines3.her.her import HER, GoalSelectionStrategy +from stable_baselines3.sac.policies import SACPolicy +from stable_baselines3.td3.policies import TD3Policy +from stable_baselines3.td3.policies import CnnPolicy, MlpPolicy + + +@pytest.mark.parametrize("model_class, policy, sde_support", [(SAC, SACPolicy, True), (TD3, TD3Policy, False), (DDPG, MlpPolicy, False)]) +def test_her(model_class, policy, sde_support): + """ + Test Hindsight Experience Replay. 
+ """ + + env = BitFlippingEnv(continuous=True) + env = DummyVecEnv([lambda: env]) + + # Create action noise + n_actions = env.action_space.shape[0] + action_noise = OrnsteinUhlenbeckActionNoise(np.zeros(n_actions,), 0.2 * np.ones((n_actions,))) + + model = HER( + policy, + env, + model_class, + n_goals=5, + goal_strategy="random", + action_noise=action_noise, + verbose=1, + tau=0.05, + batch_size=128, + learning_rate=0.001, + policy_kwargs=dict(net_arch=[256]), + buffer_size=int(1e6), + gamma=0.98, + gradient_steps=40, + sde_support=sde_support + ) + + model.learn(total_timesteps=1, callback=None) + + # Evaluate the agent + n_eval_episodes = 5 + n_episodes = 0 + episode_rewards = [] + episode_reward = 0.0 + + eval_env = BitFlippingEnv(continuous=True) + + observation = eval_env.reset() + + while n_episodes < n_eval_episodes: + + obs = np.concatenate([observation["observation"], observation["desired_goal"]]) + + with th.no_grad(): + obs_ = th.FloatTensor(np.array(obs).reshape(1, -1)).to(model.model.device) + action = model.model.policy.predict(obs_)[0][0] + + observation, reward, done, _ = eval_env.step(action) + + # Render the env + #eval_env.render() + + episode_reward += reward + + if done: + n_episodes += 1 + observation = eval_env.reset() + episode_rewards.append(episode_reward) + episode_reward = 0.0 + + eval_env.close() + print(f"Mean reward: {np.mean(episode_rewards)} +/- {np.std(episode_rewards)}") + + #assert np.mean(episode_rewards) > -50, "The environment is not solved" + + +@pytest.mark.parametrize( + "goal_strategy", + [ + "final", + "episode", + "future", + "random", + GoalSelectionStrategy.FUTURE, + GoalSelectionStrategy.RANDOM, + GoalSelectionStrategy.EPISODE, + GoalSelectionStrategy.FINAL, + ], +) +def test_goal_strategy(goal_strategy): + """ + Test different goal strategies. + """ + env = BitFlippingEnv(continuous=True) + env = DummyVecEnv([lambda: env]) + + model = HER(SACPolicy, env, SAC, goal_strategy=goal_strategy) + model.learn(total_timesteps=50, callback=None) From f2b06450737ff1c5c37ef2d05b2bcc717400e13e Mon Sep 17 00:00:00 2001 From: Megan Klaiber Date: Tue, 21 Jul 2020 12:36:22 +0200 Subject: [PATCH 02/81] Updated test_her. --- tests/test_her.py | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/tests/test_her.py b/tests/test_her.py index 96e36c654e..a75eee9484 100644 --- a/tests/test_her.py +++ b/tests/test_her.py @@ -9,28 +9,29 @@ from stable_baselines3.her.her import HER, GoalSelectionStrategy from stable_baselines3.sac.policies import SACPolicy from stable_baselines3.td3.policies import TD3Policy -from stable_baselines3.td3.policies import CnnPolicy, MlpPolicy +from stable_baselines3.td3.policies import MlpPolicy -@pytest.mark.parametrize("model_class, policy, sde_support", [(SAC, SACPolicy, True), (TD3, TD3Policy, False), (DDPG, MlpPolicy, False)]) +@pytest.mark.parametrize("model_class, policy, sde_support", + [(SAC, SACPolicy, True), (TD3, TD3Policy, False), (DDPG, MlpPolicy, False)]) def test_her(model_class, policy, sde_support): """ Test Hindsight Experience Replay. 
""" - env = BitFlippingEnv(continuous=True) + env = BitFlippingEnv(n_bits=4, continuous=True) env = DummyVecEnv([lambda: env]) # Create action noise n_actions = env.action_space.shape[0] - action_noise = OrnsteinUhlenbeckActionNoise(np.zeros(n_actions,), 0.2 * np.ones((n_actions,))) + action_noise = OrnsteinUhlenbeckActionNoise(np.zeros(n_actions, ), 0.2 * np.ones((n_actions,))) model = HER( policy, env, model_class, n_goals=5, - goal_strategy="random", + goal_strategy="future", action_noise=action_noise, verbose=1, tau=0.05, @@ -43,7 +44,7 @@ def test_her(model_class, policy, sde_support): sde_support=sde_support ) - model.learn(total_timesteps=1, callback=None) + model.learn(total_timesteps=500, callback=None) # Evaluate the agent n_eval_episodes = 5 @@ -51,7 +52,7 @@ def test_her(model_class, policy, sde_support): episode_rewards = [] episode_reward = 0.0 - eval_env = BitFlippingEnv(continuous=True) + eval_env = BitFlippingEnv(n_bits=4, continuous=True) observation = eval_env.reset() @@ -66,7 +67,7 @@ def test_her(model_class, policy, sde_support): observation, reward, done, _ = eval_env.step(action) # Render the env - #eval_env.render() + # eval_env.render() episode_reward += reward @@ -77,9 +78,6 @@ def test_her(model_class, policy, sde_support): episode_reward = 0.0 eval_env.close() - print(f"Mean reward: {np.mean(episode_rewards)} +/- {np.std(episode_rewards)}") - - #assert np.mean(episode_rewards) > -50, "The environment is not solved" @pytest.mark.parametrize( From f7d5f88228128a4f9f4e56bdbe4d5dadefaabf95 Mon Sep 17 00:00:00 2001 From: Megan Klaiber Date: Thu, 23 Jul 2020 08:43:16 +0200 Subject: [PATCH 03/81] Added first version of online her sampling. Still problems with tensor dimensions. --- .../her/goal_selection_strategy.py | 31 ++++ stable_baselines3/her/her.py | 52 +++--- stable_baselines3/her/her_replay_buffer.py | 152 ++++++++++++++++++ tests/test_her.py | 13 +- 4 files changed, 210 insertions(+), 38 deletions(-) create mode 100644 stable_baselines3/her/goal_selection_strategy.py create mode 100644 stable_baselines3/her/her_replay_buffer.py diff --git a/stable_baselines3/her/goal_selection_strategy.py b/stable_baselines3/her/goal_selection_strategy.py new file mode 100644 index 0000000000..09f3bfda6c --- /dev/null +++ b/stable_baselines3/her/goal_selection_strategy.py @@ -0,0 +1,31 @@ +from enum import Enum + + +class GoalSelectionStrategy(Enum): + """ + The strategies for selecting new goals when + creating artificial transitions. 
+ """ + + # Select a goal that was achieved + # after the current step, in the same episode + FUTURE = 0 + # Select the goal that was achieved + # at the end of the episode + FINAL = 1 + # Select a goal that was achieved in the episode + EPISODE = 2 + # Select a goal that was achieved + # at some point in the training procedure + # (and that is present in the replay buffer) + RANDOM = 3 + + +# For convenience +# that way, we can use string to select a strategy +KEY_TO_GOAL_STRATEGY = { + "future": GoalSelectionStrategy.FUTURE, + "final": GoalSelectionStrategy.FINAL, + "episode": GoalSelectionStrategy.EPISODE, + "random": GoalSelectionStrategy.RANDOM, +} \ No newline at end of file diff --git a/stable_baselines3/her/her.py b/stable_baselines3/her/her.py index ae3683b404..c8c54e83b3 100644 --- a/stable_baselines3/her/her.py +++ b/stable_baselines3/her/her.py @@ -12,39 +12,11 @@ from stable_baselines3.common.policies import BasePolicy from stable_baselines3.common.type_aliases import GymEnv, MaybeCallback, RolloutReturn from stable_baselines3.common.vec_env import VecEnv +from stable_baselines3.her.goal_selection_strategy import GoalSelectionStrategy, KEY_TO_GOAL_STRATEGY +from stable_baselines3.her.her_replay_buffer import HerReplayBuffer from stable_baselines3.her.obs_wrapper import ObsWrapper -class GoalSelectionStrategy(Enum): - """ - The strategies for selecting new goals when - creating artificial transitions. - """ - - # Select a goal that was achieved - # after the current step, in the same episode - FUTURE = 0 - # Select the goal that was achieved - # at the end of the episode - FINAL = 1 - # Select a goal that was achieved in the episode - EPISODE = 2 - # Select a goal that was achieved - # at some point in the training procedure - # (and that is present in the replay buffer) - RANDOM = 3 - - -# For convenience -# that way, we can use string to select a strategy -KEY_TO_GOAL_STRATEGY = { - "future": GoalSelectionStrategy.FUTURE, - "final": GoalSelectionStrategy.FINAL, - "episode": GoalSelectionStrategy.EPISODE, - "random": GoalSelectionStrategy.RANDOM, -} - - class HER(OffPolicyAlgorithm): """ Hindsight Experience Replay (HER) @@ -55,6 +27,9 @@ class HER(OffPolicyAlgorithm): :param n_goals: (int) Number of sampled goals for replay. :param goal_strategy: (GoalSelectionStrategy or str) Strategy for sampling goals for replay. One of ['episode', 'final', 'future', 'random'] + :param online_sampling: (bool) Sample HER transitions online. + :her_ratio: (int) The ratio between HER replays and regular replays (e.g. 
k = 4 -> 4 times + as many HER replays as regular replays are used) :param learning_rate: (float or callable) learning rate for the optimizer, it can be a function of the current progress remaining (from 1 to 0) :param buffer_size: (int) size of the replay buffer @@ -100,6 +75,8 @@ def __init__( model: Type[OffPolicyAlgorithm], n_goals: int = 5, goal_strategy: Union[GoalSelectionStrategy, str] = "final", + online_sampling: bool = False, + her_ratio: int = 2, learning_rate: Union[float, Callable] = 3e-4, buffer_size: int = int(1e6), learning_starts: int = 100, @@ -114,7 +91,7 @@ def __init__( policy_kwargs: Dict[str, Any] = None, tensorboard_log: Optional[str] = None, verbose: int = 0, - device: Union[th.device, str] = "auto", + device: Union[th.device, str] = "cpu", support_multi_env: bool = False, create_eval_env: bool = False, monitor_wrapper: bool = True, @@ -179,6 +156,10 @@ def __init__( # model initialization self.model = model(env=self.env, **model_init_dict, **kwargs) + self.online_sampling = online_sampling + if self.online_sampling: + self.model.replay_buffer = HerReplayBuffer(self.env, buffer_size, self.goal_strategy, self.env.observation_space, self.env.action_space, device, self.n_envs, her_ratio) + # storage for transitions of current episode self.episode_storage = [] self.n_goals = n_goals @@ -330,8 +311,12 @@ def collect_rollouts( break if done: - # store episode in replay buffer - self.store_transitions() + + if self.online_sampling: + self.model.replay_buffer.add(self.episode_storage) + else: + # store episode in replay buffer + self.store_transitions() # clear storage for current episode self.episode_storage = [] @@ -369,6 +354,7 @@ def sample_goals(self, sample_idx: int) -> Union[np.ndarray, None]: elif self.goal_strategy == GoalSelectionStrategy.FUTURE: # replay with random state which comes from the same episode and was observed after current transition # we have no transition after last transition of episode + if (sample_idx + 1) < len(self.episode_storage): index = np.random.choice(np.arange(sample_idx + 1, len(self.episode_storage))) return self.episode_storage[index][0]["achieved_goal"] diff --git a/stable_baselines3/her/her_replay_buffer.py b/stable_baselines3/her/her_replay_buffer.py new file mode 100644 index 0000000000..208355340f --- /dev/null +++ b/stable_baselines3/her/her_replay_buffer.py @@ -0,0 +1,152 @@ +from typing import Union, Optional + +import numpy as np +import torch as th +from gym import spaces + +from stable_baselines3.common.buffers import BaseBuffer + +from stable_baselines3.common.type_aliases import ReplayBufferSamples +from stable_baselines3.common.vec_env import VecNormalize, VecEnv +from stable_baselines3.her.goal_selection_strategy import GoalSelectionStrategy + + +class HerReplayBuffer(BaseBuffer): + """ + Replay Buffer for online Hindsight Experience Replay (HER) + + :param env: (VecEnv) The training environment + :param buffer_size: (int) The size of the buffer measured in transitions. + :param goal_strategy: (GoalSelectionStrategy ) Strategy for sampling goals for replay. + One of ['episode', 'final', 'future', 'random'] + :param observation_space: (spaces.Space) Observation space + :param action_space: (spaces.Space) Action space + :param device: (Union[th.device, str]) PyTorch device + to which the values will be converted + :param n_envs: (int) Number of parallel environments + :param her_ratio: (int) The ratio between HER replays and regular replays (e.g. 
k = 4 -> 4 times + as many HER replays as regular replays are used) + """ + + def __init__(self, env: VecEnv, buffer_size: int, goal_strategy: GoalSelectionStrategy, + observation_space: spaces.Space, + action_space: spaces.Space, + device: Union[th.device, str] = "cpu", + n_envs: int = 1, her_ratio: int = 2): + + super(HerReplayBuffer, self).__init__(buffer_size, observation_space, action_space, device, n_envs) + + self.env = env + self.size = buffer_size + + # buffer with episodes + self.buffer = [] + self.goal_strategy = goal_strategy + self.her_ratio = 1 - (1. / (1 + her_ratio)) + + # memory management + # current size in episodes + self.current_size = 0 + self.n_transitions_stored = 0 + + def sample(self, batch_size: int, env: Optional[VecNormalize] = None) -> ReplayBufferSamples: + """Returns a dict {key: array(batch_size x shapes[key])} + """ + return self._sample_transitions(batch_size) + + def _sample_transitions(self, batch_size: int): + # batch size in transitions + + # Select which episodes and time steps to use. + episode_idxs = np.random.randint(0, self.current_size, batch_size) + buffer = np.array(self.buffer) + episode_lengths = np.array([len(ep) for ep in buffer[episode_idxs]]) + t_samples = np.array([np.random.choice(np.arange(ep_len)) for ep_len in episode_lengths]) + + transitions = np.array([buffer[ep][trans] for ep, trans in zip(episode_idxs, t_samples)]) + + her_idxs = np.where(np.random.uniform(size=batch_size) < self.her_ratio)[0] + her_episode_lenghts = episode_lengths[her_idxs] + + # get new goals with goal selection strategy + if self.goal_strategy == GoalSelectionStrategy.FINAL: + # replay with final state of current episode + last_transitions = buffer[episode_idxs[her_idxs]][:, -1][:, 0] + her_new_goals = [trans['achieved_goal'] for trans in last_transitions] + elif self.goal_strategy == GoalSelectionStrategy.FUTURE: + # replay with random state which comes from the same episode and was observed after current transition + # we have no transition after last transition of episode + her_new_goals = [] + for idx, length in zip(her_idxs, her_episode_lenghts): + if t_samples[idx] + 1 < length: + index = np.random.choice(np.arange(t_samples[idx] + 1, length)) + her_new_goals.append(buffer[episode_idxs[idx]][index][0]["achieved_goal"]) + else: + # delete index from her indices where we have no transition after current one + her_idxs = her_idxs[her_idxs != idx] + elif self.goal_strategy == GoalSelectionStrategy.EPISODE: + # replay with random state which comes from the same episode as current transition + index = np.array([np.random.choice(np.arange(ep_len)) for ep_len in her_episode_lenghts]) + episode_transitions = buffer[episode_idxs[her_idxs], index][:, 0] + her_new_goals = [trans['achieved_goal'] for trans in episode_transitions] + elif self.goal_strategy == GoalSelectionStrategy.RANDOM: + # replay with random state from the entire replay buffer + ep_idx = np.random.randint(0, self.current_size, len(her_idxs)) + state_idx = [np.random.choice(np.arange(len(ep))) for ep in buffer[ep_idx]] + random_transitions = buffer[ep_idx][state_idx][:, 0][:, 0] + her_new_goals = [trans['achieved_goal'] for trans in random_transitions] + else: + raise ValueError("Strategy for sampling goals not supported!") + + # assign new goals as desired_goals + for idx, goal in enumerate(her_new_goals): + transitions[her_idxs][:, 0][idx]["desired_goal"] = goal + + observations, actions, rewards, new_observations, dones = list(zip(*transitions)) + + # compute new reward with new goal + 
achieved_goals = [new_obs['achieved_goal'] for new_obs in np.array(new_observations)[her_idxs]] + new_rewards = np.array(rewards) + new_rewards[her_idxs] = [self.env.env_method("compute_reward", ag, her_new_goals, None) for ag, new_goal in zip(achieved_goals, her_new_goals)] + + # concatenate observation with (desired) goal + obs = [np.concatenate([o["observation"], o["desired_goal"]], axis=1) for o in observations] + new_obs = [np.concatenate([new_o["observation"], new_o["desired_goal"]], axis=1) for new_o in new_observations] + + data = (np.array(obs)[:,0,:], np.array(actions), np.array(new_obs)[:,0,:], np.array(dones, dtype=int), rewards) + + return ReplayBufferSamples(*tuple(map(self.to_torch, data))) + + def add(self, episode): + episode_length = len(episode) + + if self.n_transitions_stored + episode_length <= self.size: + self.buffer.append(episode) + # update replay size + self.current_size += 1 + self.n_transitions_stored += episode_length + elif self.full: + idx = np.random.randint(0, self.size) + + if len(self.buffer[idx]) == episode_length: + self.buffer[idx] = episode + elif len(self.buffer[idx]) > episode_length: + self.buffer[idx] = episode + self.n_transitions_stored -= (self.buffer[idx] - episode_length) + + if self.n_transitions_stored == self.size: + self.full = True + else: + self.full = False + + def get_current_episode_size(self): + return self.current_size + + def get_current_size(self): + return self.n_transitions_stored + + def get_transitions_stored(self): + return self.n_transitions_stored + + def clear_buffer(self): + self.buffer = [] diff --git a/tests/test_her.py b/tests/test_her.py index a75eee9484..311bd2595b 100644 --- a/tests/test_her.py +++ b/tests/test_her.py @@ -14,7 +14,8 @@ @pytest.mark.parametrize("model_class, policy, sde_support", [(SAC, SACPolicy, True), (TD3, TD3Policy, False), (DDPG, MlpPolicy, False)]) -def test_her(model_class, policy, sde_support): +@pytest.mark.parametrize("online_sampling", [True, False]) +def test_her(model_class, policy, sde_support, online_sampling): """ Test Hindsight Experience Replay. """ @@ -32,6 +33,7 @@ def test_her(model_class, policy, sde_support): model_class, n_goals=5, goal_strategy="future", + online_sampling=online_sampling, action_noise=action_noise, verbose=1, tau=0.05, @@ -91,14 +93,15 @@ def test_her(model_class, policy, sde_support): GoalSelectionStrategy.RANDOM, GoalSelectionStrategy.EPISODE, GoalSelectionStrategy.FINAL, - ], + ] ) -def test_goal_strategy(goal_strategy): +@pytest.mark.parametrize("online_sampling", [True, False]) +def test_goal_strategy(goal_strategy, online_sampling): """ Test different goal strategies. 
""" env = BitFlippingEnv(continuous=True) env = DummyVecEnv([lambda: env]) - model = HER(SACPolicy, env, SAC, goal_strategy=goal_strategy) - model.learn(total_timesteps=50, callback=None) + model = HER(SACPolicy, env, SAC, goal_strategy=goal_strategy, online_sampling=online_sampling) + model.learn(total_timesteps=200, callback=None) From 88771b8ec5765028d61c0781b870c5cdb7483e04 Mon Sep 17 00:00:00 2001 From: Antonin Raffin Date: Thu, 23 Jul 2020 11:05:35 +0200 Subject: [PATCH 04/81] Reformat --- .../her/goal_selection_strategy.py | 2 +- stable_baselines3/her/her.py | 13 +++++- stable_baselines3/her/her_replay_buffer.py | 46 ++++++++++++------- tests/test_her.py | 16 +++---- 4 files changed, 50 insertions(+), 27 deletions(-) diff --git a/stable_baselines3/her/goal_selection_strategy.py b/stable_baselines3/her/goal_selection_strategy.py index 09f3bfda6c..5f434be277 100644 --- a/stable_baselines3/her/goal_selection_strategy.py +++ b/stable_baselines3/her/goal_selection_strategy.py @@ -28,4 +28,4 @@ class GoalSelectionStrategy(Enum): "final": GoalSelectionStrategy.FINAL, "episode": GoalSelectionStrategy.EPISODE, "random": GoalSelectionStrategy.RANDOM, -} \ No newline at end of file +} diff --git a/stable_baselines3/her/her.py b/stable_baselines3/her/her.py index c8c54e83b3..01ae37dfa1 100644 --- a/stable_baselines3/her/her.py +++ b/stable_baselines3/her/her.py @@ -12,7 +12,7 @@ from stable_baselines3.common.policies import BasePolicy from stable_baselines3.common.type_aliases import GymEnv, MaybeCallback, RolloutReturn from stable_baselines3.common.vec_env import VecEnv -from stable_baselines3.her.goal_selection_strategy import GoalSelectionStrategy, KEY_TO_GOAL_STRATEGY +from stable_baselines3.her.goal_selection_strategy import KEY_TO_GOAL_STRATEGY, GoalSelectionStrategy from stable_baselines3.her.her_replay_buffer import HerReplayBuffer from stable_baselines3.her.obs_wrapper import ObsWrapper @@ -158,7 +158,16 @@ def __init__( self.online_sampling = online_sampling if self.online_sampling: - self.model.replay_buffer = HerReplayBuffer(self.env, buffer_size, self.goal_strategy, self.env.observation_space, self.env.action_space, device, self.n_envs, her_ratio) + self.model.replay_buffer = HerReplayBuffer( + self.env, + buffer_size, + self.goal_strategy, + self.env.observation_space, + self.env.action_space, + device, + self.n_envs, + her_ratio, + ) # storage for transitions of current episode self.episode_storage = [] diff --git a/stable_baselines3/her/her_replay_buffer.py b/stable_baselines3/her/her_replay_buffer.py index 208355340f..c6bd566104 100644 --- a/stable_baselines3/her/her_replay_buffer.py +++ b/stable_baselines3/her/her_replay_buffer.py @@ -1,13 +1,12 @@ -from typing import Union, Optional +from typing import Optional, Union import numpy as np import torch as th from gym import spaces from stable_baselines3.common.buffers import BaseBuffer - from stable_baselines3.common.type_aliases import ReplayBufferSamples -from stable_baselines3.common.vec_env import VecNormalize, VecEnv +from stable_baselines3.common.vec_env import VecEnv, VecNormalize from stable_baselines3.her.goal_selection_strategy import GoalSelectionStrategy @@ -28,11 +27,17 @@ class HerReplayBuffer(BaseBuffer): as many HER replays as regular replays are used) """ - def __init__(self, env: VecEnv, buffer_size: int, goal_strategy: GoalSelectionStrategy, - observation_space: spaces.Space, - action_space: spaces.Space, - device: Union[th.device, str] = "cpu", - n_envs: int = 1, her_ratio: int = 2): + def __init__( + self, + 
env: VecEnv, + buffer_size: int, + goal_strategy: GoalSelectionStrategy, + observation_space: spaces.Space, + action_space: spaces.Space, + device: Union[th.device, str] = "cpu", + n_envs: int = 1, + her_ratio: int = 2, + ): super(HerReplayBuffer, self).__init__(buffer_size, observation_space, action_space, device, n_envs) @@ -42,7 +47,7 @@ def __init__(self, env: VecEnv, buffer_size: int, goal_strategy: GoalSelectionSt # buffer with episodes self.buffer = [] self.goal_strategy = goal_strategy - self.her_ratio = 1 - (1. / (1 + her_ratio)) + self.her_ratio = 1 - (1.0 / (1 + her_ratio)) # memory management # current size in episodes @@ -72,7 +77,7 @@ def _sample_transitions(self, batch_size: int): if self.goal_strategy == GoalSelectionStrategy.FINAL: # replay with final state of current episode last_transitions = buffer[episode_idxs[her_idxs]][:, -1][:, 0] - her_new_goals = [trans['achieved_goal'] for trans in last_transitions] + her_new_goals = [trans["achieved_goal"] for trans in last_transitions] elif self.goal_strategy == GoalSelectionStrategy.FUTURE: # replay with random state which comes from the same episode and was observed after current transition # we have no transition after last transition of episode @@ -88,13 +93,13 @@ def _sample_transitions(self, batch_size: int): # replay with random state which comes from the same episode as current transition index = np.array([np.random.choice(np.arange(ep_len)) for ep_len in her_episode_lenghts]) episode_transitions = buffer[episode_idxs[her_idxs], index][:, 0] - her_new_goals = [trans['achieved_goal'] for trans in episode_transitions] + her_new_goals = [trans["achieved_goal"] for trans in episode_transitions] elif self.goal_strategy == GoalSelectionStrategy.RANDOM: # replay with random state from the entire replay buffer ep_idx = np.random.randint(0, self.current_size, len(her_idxs)) state_idx = [np.random.choice(np.arange(len(ep))) for ep in buffer[ep_idx]] random_transitions = buffer[ep_idx][state_idx][:, 0][:, 0] - her_new_goals = [trans['achieved_goal'] for trans in random_transitions] + her_new_goals = [trans["achieved_goal"] for trans in random_transitions] else: raise ValueError("Strategy for sampling goals not supported!") @@ -105,15 +110,24 @@ def _sample_transitions(self, batch_size: int): observations, actions, rewards, new_observations, dones = list(zip(*transitions)) # compute new reward with new goal - achieved_goals = [new_obs['achieved_goal'] for new_obs in np.array(new_observations)[her_idxs]] + achieved_goals = [new_obs["achieved_goal"] for new_obs in np.array(new_observations)[her_idxs]] new_rewards = np.array(rewards) - new_rewards[her_idxs] = [self.env.env_method("compute_reward", ag, her_new_goals, None) for ag, new_goal in zip(achieved_goals, her_new_goals)] + new_rewards[her_idxs] = [ + self.env.env_method("compute_reward", ag, her_new_goals, None) + for ag, new_goal in zip(achieved_goals, her_new_goals) + ] # concatenate observation with (desired) goal obs = [np.concatenate([o["observation"], o["desired_goal"]], axis=1) for o in observations] new_obs = [np.concatenate([new_o["observation"], new_o["desired_goal"]], axis=1) for new_o in new_observations] - data = (np.array(obs)[:,0,:], np.array(actions), np.array(new_obs)[:,0,:], np.array(dones, dtype=int), rewards) + data = ( + np.array(obs)[:, 0, :], + np.array(actions, dtype=np.float32), + np.array(new_obs)[:, 0, :], + np.array(dones, dtype=np.bool), + rewards, + ) return ReplayBufferSamples(*tuple(map(self.to_torch, data))) @@ -132,7 +146,7 @@ def add(self, 
episode): self.buffer[idx] = episode elif len(self.buffer[idx]) > episode_length: self.buffer[idx] = episode - self.n_transitions_stored -= (self.buffer[idx] - episode_length) + self.n_transitions_stored -= self.buffer[idx] - episode_length if self.n_transitions_stored == self.size: self.full = True diff --git a/tests/test_her.py b/tests/test_her.py index 311bd2595b..6430d348aa 100644 --- a/tests/test_her.py +++ b/tests/test_her.py @@ -2,18 +2,18 @@ import pytest import torch as th -from stable_baselines3 import TD3, SAC, DDPG +from stable_baselines3 import DDPG, SAC, TD3 from stable_baselines3.common.bit_flipping_env import BitFlippingEnv from stable_baselines3.common.noise import OrnsteinUhlenbeckActionNoise from stable_baselines3.common.vec_env import DummyVecEnv from stable_baselines3.her.her import HER, GoalSelectionStrategy from stable_baselines3.sac.policies import SACPolicy -from stable_baselines3.td3.policies import TD3Policy -from stable_baselines3.td3.policies import MlpPolicy +from stable_baselines3.td3.policies import MlpPolicy, TD3Policy -@pytest.mark.parametrize("model_class, policy, sde_support", - [(SAC, SACPolicy, True), (TD3, TD3Policy, False), (DDPG, MlpPolicy, False)]) +@pytest.mark.parametrize( + "model_class, policy, sde_support", [(SAC, SACPolicy, True), (TD3, TD3Policy, False), (DDPG, MlpPolicy, False)] +) @pytest.mark.parametrize("online_sampling", [True, False]) def test_her(model_class, policy, sde_support, online_sampling): """ @@ -25,7 +25,7 @@ def test_her(model_class, policy, sde_support, online_sampling): # Create action noise n_actions = env.action_space.shape[0] - action_noise = OrnsteinUhlenbeckActionNoise(np.zeros(n_actions, ), 0.2 * np.ones((n_actions,))) + action_noise = OrnsteinUhlenbeckActionNoise(np.zeros(n_actions,), 0.2 * np.ones((n_actions,))) model = HER( policy, @@ -43,7 +43,7 @@ def test_her(model_class, policy, sde_support, online_sampling): buffer_size=int(1e6), gamma=0.98, gradient_steps=40, - sde_support=sde_support + sde_support=sde_support, ) model.learn(total_timesteps=500, callback=None) @@ -93,7 +93,7 @@ def test_her(model_class, policy, sde_support, online_sampling): GoalSelectionStrategy.RANDOM, GoalSelectionStrategy.EPISODE, GoalSelectionStrategy.FINAL, - ] + ], ) @pytest.mark.parametrize("online_sampling", [True, False]) def test_goal_strategy(goal_strategy, online_sampling): From 2e436a29cbb6ef6b1ba76b98b8b22ef044f0ac77 Mon Sep 17 00:00:00 2001 From: Antonin Raffin Date: Thu, 23 Jul 2020 11:27:48 +0200 Subject: [PATCH 05/81] Fixed tests --- stable_baselines3/her/her_replay_buffer.py | 4 ++-- tests/test_her.py | 6 ++++-- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/stable_baselines3/her/her_replay_buffer.py b/stable_baselines3/her/her_replay_buffer.py index c6bd566104..0b3d64b080 100644 --- a/stable_baselines3/her/her_replay_buffer.py +++ b/stable_baselines3/her/her_replay_buffer.py @@ -123,9 +123,9 @@ def _sample_transitions(self, batch_size: int): data = ( np.array(obs)[:, 0, :], - np.array(actions, dtype=np.float32), + np.array(actions, dtype=self.action_space.dtype)[:, 0, :], np.array(new_obs)[:, 0, :], - np.array(dones, dtype=np.bool), + np.array(dones, dtype=np.int8), rewards, ) diff --git a/tests/test_her.py b/tests/test_her.py index 6430d348aa..fa14904068 100644 --- a/tests/test_her.py +++ b/tests/test_her.py @@ -39,10 +39,12 @@ def test_her(model_class, policy, sde_support, online_sampling): tau=0.05, batch_size=128, learning_rate=0.001, - policy_kwargs=dict(net_arch=[256]), + 
policy_kwargs=dict(net_arch=[64]), buffer_size=int(1e6), gamma=0.98, - gradient_steps=40, + gradient_steps=1, + train_freq=1, + n_episodes_rollout=-1, sde_support=sde_support, ) From c0a82fc142ab4feb069dddd39f027083a83732a1 Mon Sep 17 00:00:00 2001 From: Megan Klaiber Date: Thu, 23 Jul 2020 14:53:46 +0200 Subject: [PATCH 06/81] Added some comments. --- stable_baselines3/her/her.py | 6 ++-- stable_baselines3/her/her_replay_buffer.py | 39 ++++++++++++++++------ tests/test_her.py | 11 +++++- 3 files changed, 41 insertions(+), 15 deletions(-) diff --git a/stable_baselines3/her/her.py b/stable_baselines3/her/her.py index 01ae37dfa1..a486a8a7a4 100644 --- a/stable_baselines3/her/her.py +++ b/stable_baselines3/her/her.py @@ -1,4 +1,3 @@ -from enum import Enum from inspect import signature from typing import Any, Callable, Dict, Optional, Type, Union @@ -156,6 +155,7 @@ def __init__( # model initialization self.model = model(env=self.env, **model_init_dict, **kwargs) + # if we sample her transitions online use custom replay buffer self.online_sampling = online_sampling if self.online_sampling: self.model.replay_buffer = HerReplayBuffer( @@ -226,7 +226,7 @@ def collect_rollouts( n_steps: int = -1, action_noise: Optional[ActionNoise] = None, learning_starts: int = 0, - replay_buffer: Optional[ReplayBuffer] = None, + replay_buffer: Union[ReplayBuffer, HerReplayBuffer] = None, log_interval: Optional[int] = None, ) -> RolloutReturn: """ @@ -243,7 +243,7 @@ def collect_rollouts( Required for deterministic policy (e.g. TD3). This can also be used in addition to the stochastic policy for SAC. :param learning_starts: (int) Number of steps before learning for the warm-up phase. - :param replay_buffer: (ReplayBuffer) + :param replay_buffer: (ReplayBuffer or HerReplayBuffer) :param log_interval: (int) Log data every ``log_interval`` episodes :return: (RolloutReturn) """ diff --git a/stable_baselines3/her/her_replay_buffer.py b/stable_baselines3/her/her_replay_buffer.py index 0b3d64b080..2e611ae829 100644 --- a/stable_baselines3/her/her_replay_buffer.py +++ b/stable_baselines3/her/her_replay_buffer.py @@ -55,34 +55,44 @@ def __init__( self.n_transitions_stored = 0 def sample(self, batch_size: int, env: Optional[VecNormalize] = None) -> ReplayBufferSamples: - """Returns a dict {key: array(batch_size x shapes[key])} + """ + :param batch_size: (int) Number of element to sample + :param env: (Optional[VecNormalize]) associated gym VecEnv + to normalize the observations/rewards when sampling + :return: (ReplayBufferSamples) """ return self._sample_transitions(batch_size) - def _sample_transitions(self, batch_size: int): - # batch size in transitions + def _sample_transitions(self, batch_size: int) -> ReplayBufferSamples: + """ + :param batch_size: (int) Number of element to sample + :return: (ReplayBufferSamples) + """ # Select which episodes and time steps to use. 
episode_idxs = np.random.randint(0, self.current_size, batch_size) buffer = np.array(self.buffer) + # get episode lengths for selecting timesteps episode_lengths = np.array([len(ep) for ep in buffer[episode_idxs]]) + # select timesteps t_samples = np.array([np.random.choice(np.arange(ep_len)) for ep_len in episode_lengths]) - + # get selected timesteps transitions = np.array([buffer[ep][trans] for ep, trans in zip(episode_idxs, t_samples)]) - + # get her samples indices with her_ratio her_idxs = np.where(np.random.uniform(size=batch_size) < self.her_ratio)[0] + # her samples episode lengths her_episode_lenghts = episode_lengths[her_idxs] # get new goals with goal selection strategy if self.goal_strategy == GoalSelectionStrategy.FINAL: # replay with final state of current episode - last_transitions = buffer[episode_idxs[her_idxs]][:, -1][:, 0] + last_transitions = buffer[episode_idxs[her_idxs], -1][:, 0] her_new_goals = [trans["achieved_goal"] for trans in last_transitions] elif self.goal_strategy == GoalSelectionStrategy.FUTURE: # replay with random state which comes from the same episode and was observed after current transition - # we have no transition after last transition of episode her_new_goals = [] for idx, length in zip(her_idxs, her_episode_lenghts): + # we have no transition after last transition of episode if t_samples[idx] + 1 < length: index = np.random.choice(np.arange(t_samples[idx] + 1, length)) her_new_goals.append(buffer[episode_idxs[idx]][index][0]["achieved_goal"]) @@ -98,7 +108,7 @@ def _sample_transitions(self, batch_size: int): # replay with random state from the entire replay buffer ep_idx = np.random.randint(0, self.current_size, len(her_idxs)) state_idx = [np.random.choice(np.arange(len(ep))) for ep in buffer[ep_idx]] - random_transitions = buffer[ep_idx][state_idx][:, 0][:, 0] + random_transitions = buffer[ep_idx, state_idx][:, 0] her_new_goals = [trans["achieved_goal"] for trans in random_transitions] else: raise ValueError("Strategy for sampling goals not supported!") @@ -109,7 +119,7 @@ def _sample_transitions(self, batch_size: int): observations, actions, rewards, new_observations, dones = list(zip(*transitions)) - # compute new reward with new goal + # compute new rewards with new goal achieved_goals = [new_obs["achieved_goal"] for new_obs in np.array(new_observations)[her_idxs]] new_rewards = np.array(rewards) new_rewards[her_idxs] = [ @@ -126,21 +136,28 @@ def _sample_transitions(self, batch_size: int): np.array(actions, dtype=self.action_space.dtype)[:, 0, :], np.array(new_obs)[:, 0, :], np.array(dones, dtype=np.int8), - rewards, + new_rewards, ) return ReplayBufferSamples(*tuple(map(self.to_torch, data))) def add(self, episode): + """ + Add episode to replay buffer + + :param episode: (list) Episode to store. 
+ """ episode_length = len(episode) + # check if replay buffer has enough space for all transitions of episode if self.n_transitions_stored + episode_length <= self.size: self.buffer.append(episode) # update replay size self.current_size += 1 self.n_transitions_stored += episode_length elif self.full: - idx = np.random.randint(0, self.size) + # if replay buffer is full take random stored episode and replace it + idx = np.random.randint(0, self.current_size) if len(self.buffer[idx]) == episode_length: self.buffer[idx] = episode diff --git a/tests/test_her.py b/tests/test_her.py index fa14904068..2aae177154 100644 --- a/tests/test_her.py +++ b/tests/test_her.py @@ -105,5 +105,14 @@ def test_goal_strategy(goal_strategy, online_sampling): env = BitFlippingEnv(continuous=True) env = DummyVecEnv([lambda: env]) - model = HER(SACPolicy, env, SAC, goal_strategy=goal_strategy, online_sampling=online_sampling) + model = HER( + SACPolicy, + env, + SAC, + goal_strategy=goal_strategy, + online_sampling=online_sampling, + gradient_steps=1, + train_freq=1, + n_episodes_rollout=-1, + ) model.learn(total_timesteps=200, callback=None) From e6263b2dd0cd2d9f369d2061305037dc91a0c9ad Mon Sep 17 00:00:00 2001 From: Megan Klaiber Date: Thu, 23 Jul 2020 15:04:01 +0200 Subject: [PATCH 07/81] Updated changelog. --- docs/misc/changelog.rst | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/docs/misc/changelog.rst b/docs/misc/changelog.rst index 722e9e71e9..bfa0329eba 100644 --- a/docs/misc/changelog.rst +++ b/docs/misc/changelog.rst @@ -25,6 +25,7 @@ New Features: - Refactored opening paths for saving and loading to use strings, pathlib or io.BufferedIOBase (@PartiallyTyped) - Added ``DDPG`` algorithm as a special case of ``TD3``. - Introduced ``BaseModel`` abstract parent for ``BasePolicy``, which critics inherit from. +- Added Hindsight Experience Replay ``HER``. (@megan-klaiber) Bug Fixes: ^^^^^^^^^^ @@ -355,4 +356,4 @@ And all the contributors: @Miffyli @dwiel @miguelrass @qxcv @jaberkow @eavelardev @ruifeng96150 @pedrohbtp @srivatsankrishnan @evilsocket @MarvineGothic @jdossgollin @SyllogismRXS @rusu24edward @jbulow @Antymon @seheevic @justinkterry @edbeeching @flodorner @KuKuXia @NeoExtended @PartiallyTyped @mmcenta @richardwu @kinalmehta @rolandgvc @tkelestemur @mloo3 -@tirafesi @blurLake @koulakis @joeljosephjin +@tirafesi @blurLake @koulakis @joeljosephjin @megan-klaiber From 257b8fcebe3e6024fc94e530477e83f1c659c438 Mon Sep 17 00:00:00 2001 From: Antonin Raffin Date: Thu, 23 Jul 2020 15:13:53 +0200 Subject: [PATCH 08/81] Add missing init file --- stable_baselines3/her/__init__.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 stable_baselines3/her/__init__.py diff --git a/stable_baselines3/her/__init__.py b/stable_baselines3/her/__init__.py new file mode 100644 index 0000000000..e69de29bb2 From 90f6e2c071565dc50d384edbbf9bfe6e393399ac Mon Sep 17 00:00:00 2001 From: Megan Klaiber Date: Thu, 23 Jul 2020 16:32:48 +0200 Subject: [PATCH 09/81] Fixed some small bugs. 
--- stable_baselines3/her/her.py | 6 ++++-- stable_baselines3/her/her_replay_buffer.py | 24 +++++++++++++++------- 2 files changed, 21 insertions(+), 9 deletions(-) diff --git a/stable_baselines3/her/her.py b/stable_baselines3/her/her.py index a486a8a7a4..d8500b7ce9 100644 --- a/stable_baselines3/her/her.py +++ b/stable_baselines3/her/her.py @@ -320,9 +320,11 @@ def collect_rollouts( break if done: - if self.online_sampling: - self.model.replay_buffer.add(self.episode_storage) + observations, actions, rewards, next_observations, done = zip(*self.episode_storage) + self.model.replay_buffer.add(observations, next_observations, actions, rewards, done) + # self.model.replay_buffer.add(self.episode_storage) + else: # store episode in replay buffer self.store_transitions() diff --git a/stable_baselines3/her/her_replay_buffer.py b/stable_baselines3/her/her_replay_buffer.py index 2e611ae829..8c60286f76 100644 --- a/stable_baselines3/her/her_replay_buffer.py +++ b/stable_baselines3/her/her_replay_buffer.py @@ -42,10 +42,12 @@ def __init__( super(HerReplayBuffer, self).__init__(buffer_size, observation_space, action_space, device, n_envs) self.env = env - self.size = buffer_size + self.buffer_size = buffer_size # buffer with episodes self.buffer = [] + # TODO just for typing reason , need another solution + self.observations = np.zeros((self.buffer_size, self.n_envs,) + self.obs_shape, dtype=observation_space.dtype) self.goal_strategy = goal_strategy self.her_ratio = 1 - (1.0 / (1 + her_ratio)) @@ -117,10 +119,10 @@ def _sample_transitions(self, batch_size: int) -> ReplayBufferSamples: for idx, goal in enumerate(her_new_goals): transitions[her_idxs][:, 0][idx]["desired_goal"] = goal - observations, actions, rewards, new_observations, dones = list(zip(*transitions)) + observations, actions, rewards, next_observations, dones = list(zip(*transitions)) # compute new rewards with new goal - achieved_goals = [new_obs["achieved_goal"] for new_obs in np.array(new_observations)[her_idxs]] + achieved_goals = [new_obs["achieved_goal"] for new_obs in np.array(next_observations)[her_idxs]] new_rewards = np.array(rewards) new_rewards[her_idxs] = [ self.env.env_method("compute_reward", ag, her_new_goals, None) @@ -129,7 +131,7 @@ def _sample_transitions(self, batch_size: int) -> ReplayBufferSamples: # concatenate observation with (desired) goal obs = [np.concatenate([o["observation"], o["desired_goal"]], axis=1) for o in observations] - new_obs = [np.concatenate([new_o["observation"], new_o["desired_goal"]], axis=1) for new_o in new_observations] + new_obs = [np.concatenate([new_o["observation"], new_o["desired_goal"]], axis=1) for new_o in next_observations] data = ( np.array(obs)[:, 0, :], @@ -141,16 +143,24 @@ def _sample_transitions(self, batch_size: int) -> ReplayBufferSamples: return ReplayBufferSamples(*tuple(map(self.to_torch, data))) - def add(self, episode): + def add(self, obs: np.ndarray, next_obs: np.ndarray, action: np.ndarray, reward: np.ndarray, done: np.ndarray) -> None: """ Add episode to replay buffer + :param obs: + :param next_obs: + :param action: + :param reward: + :param done: + :param episode: (list) Episode to store. 
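Besides trimming the constructor down to the HER-specific arguments, this patch reworks ``ObsWrapper`` into a ``VecEnvWrapper`` that flattens the goal-based ``Dict`` observation space into a single ``Box`` by concatenating the ``observation`` and ``desired_goal`` sub-spaces. A minimal standalone illustration of that flattening (plain ``gym`` spaces with hypothetical shapes, no VecEnv involved):

    import numpy as np
    from gym import spaces

    # Hypothetical goal-based observation space, as a GoalEnv would expose it.
    dict_space = spaces.Dict(
        {
            "observation": spaces.Box(-1.0, 1.0, shape=(3,), dtype=np.float32),
            "achieved_goal": spaces.Box(-1.0, 1.0, shape=(2,), dtype=np.float32),
            "desired_goal": spaces.Box(-1.0, 1.0, shape=(2,), dtype=np.float32),
        }
    )

    # Flattened space: bounds are the concatenated bounds of the two sub-spaces.
    low = np.concatenate([dict_space.spaces["observation"].low, dict_space.spaces["desired_goal"].low])
    high = np.concatenate([dict_space.spaces["observation"].high, dict_space.spaces["desired_goal"].high])
    flat_space = spaces.Box(low, high, dtype=np.float32)

    # The same concatenation is applied to every observation before it reaches
    # the wrapped off-policy model (in the real code along axis=1, since VecEnv
    # observations carry a leading batch dimension).
    obs = dict_space.sample()
    flat_obs = np.concatenate([obs["observation"], obs["desired_goal"]])
    assert flat_space.contains(flat_obs)

The wrapper in this patch applies the same idea to ``MultiBinary`` and ``MultiDiscrete`` sub-spaces as well; only the ``Box`` case is sketched here.
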
""" + episode = list(zip(obs, action, reward, next_obs, done)) + episode_length = len(episode) # check if replay buffer has enough space for all transitions of episode - if self.n_transitions_stored + episode_length <= self.size: + if self.n_transitions_stored + episode_length <= self.size(): self.buffer.append(episode) # update replay size self.current_size += 1 @@ -165,7 +175,7 @@ def add(self, episode): self.buffer[idx] = episode self.n_transitions_stored -= self.buffer[idx] - episode_length - if self.n_transitions_stored == self.size: + if self.n_transitions_stored == self.size(): self.full = True else: self.full = False From 7b22e68936db76eb1b1741c0470545ead836fde1 Mon Sep 17 00:00:00 2001 From: Megan Klaiber Date: Wed, 29 Jul 2020 12:54:50 +0200 Subject: [PATCH 10/81] Reduced arguments for HER, small changes. --- stable_baselines3/her/her.py | 161 +++++---------------- stable_baselines3/her/her_replay_buffer.py | 44 +++--- stable_baselines3/her/obs_wrapper.py | 41 ++---- tests/test_her.py | 7 +- 4 files changed, 77 insertions(+), 176 deletions(-) diff --git a/stable_baselines3/her/her.py b/stable_baselines3/her/her.py index d8500b7ce9..89f586d09e 100644 --- a/stable_baselines3/her/her.py +++ b/stable_baselines3/her/her.py @@ -1,8 +1,7 @@ -from inspect import signature -from typing import Any, Callable, Dict, Optional, Type, Union +from typing import Callable, Optional, Type, Union +import gym import numpy as np -import torch as th from stable_baselines3.common.buffers import ReplayBuffer from stable_baselines3.common.callbacks import BaseCallback @@ -31,40 +30,6 @@ class HER(OffPolicyAlgorithm): as many HER replays as regular replays are used) :param learning_rate: (float or callable) learning rate for the optimizer, it can be a function of the current progress remaining (from 1 to 0) - :param buffer_size: (int) size of the replay buffer - :param learning_starts: (int) how many steps of the model to collect transitions for before learning starts - :param batch_size: (int) Minibatch size for each gradient update - :param tau: (float) the soft update coefficient ("Polyak update", between 0 and 1) - :param gamma: (float) the discount factor - :param train_freq: (int) Update the model every ``train_freq`` steps. - :param gradient_steps: (int) How many gradient update after each step - :param n_episodes_rollout: (int) Update the model every ``n_episodes_rollout`` episodes. - Note that this cannot be used at the same time as ``train_freq`` - :param action_noise: (ActionNoise) the action noise type (None by default), this can help - for hard exploration problem. Cf common.noise for the different action noise type. - :param optimize_memory_usage: (bool) Enable a memory efficient variant of the replay buffer - at a cost of more complexity. - See https://github.com/DLR-RM/stable-baselines3/issues/37#issuecomment-637501195 - :param policy_kwargs: Additional arguments to be passed to the policy on creation - :param tensorboard_log: (str) the log location for tensorboard (if None, no logging) - :param verbose: The verbosity level: 0 none, 1 training information, 2 debug - :param device: Device on which the code should run. - By default, it will try to use a Cuda compatible device and fallback to cpu - if it is not possible. - :param support_multi_env: Whether the algorithm supports training - with multiple environments (as in A2C) - :param create_eval_env: Whether to create a second environment that will be - used for evaluating the agent periodically. 
(Only available when passing string for the environment) - :param monitor_wrapper: When creating an environment, whether to wrap it - or not in a Monitor wrapper. - :param seed: Seed for the pseudo random generators - :param use_sde: Whether to use State Dependent Exploration (SDE) - instead of action noise exploration (default: False) - :param sde_sample_freq: Sample a new noise matrix every n steps when using gSDE - Default: -1 (only sample at the beginning of the rollout) - :param use_sde_at_warmup: (bool) Whether to use gSDE instead of uniform sampling - during the warm up phase (before learning starts) - :param sde_support: (bool) Whether the model support gSDE or not """ def __init__( @@ -73,104 +38,53 @@ def __init__( env: VecEnv, model: Type[OffPolicyAlgorithm], n_goals: int = 5, - goal_strategy: Union[GoalSelectionStrategy, str] = "final", + goal_strategy: Union[GoalSelectionStrategy, str] = "future", online_sampling: bool = False, her_ratio: int = 2, learning_rate: Union[float, Callable] = 3e-4, - buffer_size: int = int(1e6), - learning_starts: int = 100, - batch_size: int = 256, - tau: float = 0.005, - gamma: float = 0.99, - train_freq: int = 1, - gradient_steps: int = 1, - n_episodes_rollout: int = -1, - action_noise: Optional[ActionNoise] = None, - optimize_memory_usage: bool = False, - policy_kwargs: Dict[str, Any] = None, - tensorboard_log: Optional[str] = None, - verbose: int = 0, - device: Union[th.device, str] = "cpu", - support_multi_env: bool = False, - create_eval_env: bool = False, - monitor_wrapper: bool = True, - seed: Optional[int] = None, - use_sde: bool = False, - sde_sample_freq: int = -1, - use_sde_at_warmup: bool = False, - sde_support: bool = True, *args, - **kwargs + **kwargs, ): + self.env = env + # check if wrapper for dict support is needed + if isinstance(env.observation_space, gym.spaces.dict.Dict): + self.env = ObsWrapper(env) + + super(HER, self).__init__( + policy=BasePolicy, env=self.env, policy_base=BasePolicy, learning_rate=learning_rate, sde_support=False + ) + + # model initialization + self.model = model(policy=policy, env=self.env, learning_rate=learning_rate, *args, **kwargs) + + # convert goal_strategy into GoalSelectionStrategy if string if isinstance(goal_strategy, str): self.goal_strategy = KEY_TO_GOAL_STRATEGY[goal_strategy.lower()] else: self.goal_strategy = goal_strategy + # check if goal_strategy is valid assert isinstance( self.goal_strategy, GoalSelectionStrategy - ), "Invalid goal selection strategy," "please use one of {}".format(list(GoalSelectionStrategy)) - - self.env = ObsWrapper(env) - - # get arguments for the model initialization - model_signature = signature(model.__init__) - arguments = locals() - model_init_dict = { - key: arguments[key] - for key in model_signature.parameters.keys() - if key in arguments and key != "self" and key != "env" - } - - super(HER, self).__init__( - policy, - self.env, - BasePolicy, - learning_rate, - buffer_size, - learning_starts, - batch_size, - tau, - gamma, - train_freq, - gradient_steps, - n_episodes_rollout, - action_noise, - optimize_memory_usage, - policy_kwargs, - tensorboard_log, - verbose, - device, - support_multi_env, - create_eval_env, - monitor_wrapper, - seed, - use_sde, - sde_sample_freq, - use_sde_at_warmup, - sde_support, - ) - - # model initialization - self.model = model(env=self.env, **model_init_dict, **kwargs) + ), f"Invalid goal selection strategy, please use one of {list(GoalSelectionStrategy)}" # if we sample her transitions online use custom replay buffer 
self.online_sampling = online_sampling if self.online_sampling: self.model.replay_buffer = HerReplayBuffer( self.env, - buffer_size, + self.model.buffer_size, self.goal_strategy, self.env.observation_space, self.env.action_space, - device, + self.model.device, self.n_envs, her_ratio, ) # storage for transitions of current episode - self.episode_storage = [] + self.__episode_storage = [] self.n_goals = n_goals def learn( @@ -247,6 +161,7 @@ def collect_rollouts( :param log_interval: (int) Log data every ``log_interval`` episodes :return: (RolloutReturn) """ + episode_rewards, total_timesteps = [], [] total_steps, total_episodes = 0, 0 @@ -298,7 +213,7 @@ def collect_rollouts( self.model._last_original_obs, new_obs_, reward_ = observation, new_obs, reward # add current transition to episode storage - self.episode_storage.append((self.model._last_original_obs, buffer_action, reward_, new_obs_, done)) + self.__episode_storage.append((self.model._last_original_obs, buffer_action, reward_, new_obs_, done)) self.model._last_obs = new_obs # Save the unnormalized observation @@ -321,15 +236,15 @@ def collect_rollouts( if done: if self.online_sampling: - observations, actions, rewards, next_observations, done = zip(*self.episode_storage) + observations, actions, rewards, next_observations, done = zip(*self.__episode_storage) self.model.replay_buffer.add(observations, next_observations, actions, rewards, done) - # self.model.replay_buffer.add(self.episode_storage) + # self.model.replay_buffer.add(self.__episode_storage) else: # store episode in replay buffer - self.store_transitions() + self.__store_transitions() # clear storage for current episode - self.episode_storage = [] + self.__episode_storage = [] total_episodes += 1 self.model._episode_num += 1 @@ -361,35 +276,37 @@ def sample_goals(self, sample_idx: int) -> Union[np.ndarray, None]: """ if self.goal_strategy == GoalSelectionStrategy.FINAL: # replay with final state of current episode - return self.episode_storage[-1][0]["achieved_goal"] + return self.__episode_storage[-1][0]["achieved_goal"] elif self.goal_strategy == GoalSelectionStrategy.FUTURE: # replay with random state which comes from the same episode and was observed after current transition # we have no transition after last transition of episode - if (sample_idx + 1) < len(self.episode_storage): - index = np.random.choice(np.arange(sample_idx + 1, len(self.episode_storage))) - return self.episode_storage[index][0]["achieved_goal"] + if (sample_idx + 1) < len(self.__episode_storage): + index = np.random.choice(np.arange(sample_idx + 1, len(self.__episode_storage))) + return self.__episode_storage[index][0]["achieved_goal"] elif self.goal_strategy == GoalSelectionStrategy.EPISODE: # replay with random state which comes from the same episode as current transition - index = np.random.choice(np.arange(len(self.episode_storage))) - return self.episode_storage[index][0]["achieved_goal"] + index = np.random.choice(np.arange(len(self.__episode_storage))) + return self.__episode_storage[index][0]["achieved_goal"] elif self.goal_strategy == GoalSelectionStrategy.RANDOM: # replay with random state from the entire replay buffer index = np.random.choice(np.arange(self.model.replay_buffer.size())) obs = self.model.replay_buffer.observations[index] # get only the observation part - obs_array = obs[:, : self.env.obs_dim] + # TODO + obs_dim = self.env.observation_space.shape[0] // 2 + obs_array = obs[:, :obs_dim] return obs_array else: raise ValueError("Strategy for sampling goals not supported!") - 
def store_transitions(self) -> None: + def __store_transitions(self) -> None: """ Store current episode in replay buffer. Sample additional goals and store new transitions in replay buffer. """ # iterate over current episodes transitions - for idx, trans in enumerate(self.episode_storage): + for idx, trans in enumerate(self.__episode_storage): observation, action, reward, new_observation, done = trans diff --git a/stable_baselines3/her/her_replay_buffer.py b/stable_baselines3/her/her_replay_buffer.py index 8c60286f76..7282530ce7 100644 --- a/stable_baselines3/her/her_replay_buffer.py +++ b/stable_baselines3/her/her_replay_buffer.py @@ -52,9 +52,8 @@ def __init__( self.her_ratio = 1 - (1.0 / (1 + her_ratio)) # memory management - # current size in episodes - self.current_size = 0 - self.n_transitions_stored = 0 + self.__n_episodes_stored = 0 + self.__n_transitions_stored = 0 def sample(self, batch_size: int, env: Optional[VecNormalize] = None) -> ReplayBufferSamples: """ @@ -72,7 +71,7 @@ def _sample_transitions(self, batch_size: int) -> ReplayBufferSamples: """ # Select which episodes and time steps to use. - episode_idxs = np.random.randint(0, self.current_size, batch_size) + episode_idxs = np.random.randint(0, self.n_episodes_stored, batch_size) buffer = np.array(self.buffer) # get episode lengths for selecting timesteps episode_lengths = np.array([len(ep) for ep in buffer[episode_idxs]]) @@ -108,7 +107,7 @@ def _sample_transitions(self, batch_size: int) -> ReplayBufferSamples: her_new_goals = [trans["achieved_goal"] for trans in episode_transitions] elif self.goal_strategy == GoalSelectionStrategy.RANDOM: # replay with random state from the entire replay buffer - ep_idx = np.random.randint(0, self.current_size, len(her_idxs)) + ep_idx = np.random.randint(0, self.n_episodes_stored, len(her_idxs)) state_idx = [np.random.choice(np.arange(len(ep))) for ep in buffer[ep_idx]] random_transitions = buffer[ep_idx, state_idx][:, 0] her_new_goals = [trans["achieved_goal"] for trans in random_transitions] @@ -125,13 +124,15 @@ def _sample_transitions(self, batch_size: int) -> ReplayBufferSamples: achieved_goals = [new_obs["achieved_goal"] for new_obs in np.array(next_observations)[her_idxs]] new_rewards = np.array(rewards) new_rewards[her_idxs] = [ - self.env.env_method("compute_reward", ag, her_new_goals, None) - for ag, new_goal in zip(achieved_goals, her_new_goals) + self.env.env_method("compute_reward", achieved_goal, her_new_goals, None) + for achieved_goal, new_goal in zip(achieved_goals, her_new_goals) ] # concatenate observation with (desired) goal - obs = [np.concatenate([o["observation"], o["desired_goal"]], axis=1) for o in observations] - new_obs = [np.concatenate([new_o["observation"], new_o["desired_goal"]], axis=1) for new_o in next_observations] + obs = [np.concatenate([obs_["observation"], obs_["desired_goal"]], axis=1) for obs_ in observations] + new_obs = [ + np.concatenate([new_obs_["observation"], new_obs_["desired_goal"]], axis=1) for new_obs_ in next_observations + ] data = ( np.array(obs)[:, 0, :], @@ -163,11 +164,11 @@ def add(self, obs: np.ndarray, next_obs: np.ndarray, action: np.ndarray, reward: if self.n_transitions_stored + episode_length <= self.size(): self.buffer.append(episode) # update replay size - self.current_size += 1 + self.n_episodes_stored += 1 self.n_transitions_stored += episode_length elif self.full: # if replay buffer is full take random stored episode and replace it - idx = np.random.randint(0, self.current_size) + idx = np.random.randint(0, 
self.n_episodes_stored) if len(self.buffer[idx]) == episode_length: self.buffer[idx] = episode @@ -180,14 +181,23 @@ def add(self, obs: np.ndarray, next_obs: np.ndarray, action: np.ndarray, reward: else: self.full = False - def get_current_episode_size(self): - return self.current_size + @property + def n_episodes_stored(self): + return self.__n_episodes_stored - def get_current_size(self): - return self.n_transitions_stored + @n_episodes_stored.setter + def n_episodes_stored(self, n): + self.__n_episodes_stored = n - def get_transitions_stored(self): - return self.n_transitions_stored + @property + def n_transitions_stored(self): + return self.__n_transitions_stored + + @n_transitions_stored.setter + def n_transitions_stored(self, n): + self.__n_transitions_stored = n def clear_buffer(self): self.buffer = [] + self.n_episodes_stored = 0 + self.n_transitions_stored = 0 diff --git a/stable_baselines3/her/obs_wrapper.py b/stable_baselines3/her/obs_wrapper.py index e59f40f939..1a909968c0 100644 --- a/stable_baselines3/her/obs_wrapper.py +++ b/stable_baselines3/her/obs_wrapper.py @@ -1,12 +1,12 @@ -from typing import List, Optional, Sequence, Union +from typing import Union, Tuple import numpy as np from gym import spaces -from stable_baselines3.common.vec_env import VecEnv +from stable_baselines3.common.vec_env import VecEnv, VecEnvWrapper -class ObsWrapper(VecEnv): +class ObsWrapper(VecEnvWrapper): """ Wrapper for a VecEnv which overrides the observation space for Hindsight Experience Replay to support dict observations. @@ -14,9 +14,7 @@ class ObsWrapper(VecEnv): """ def __init__(self, venv: VecEnv): - super(ObsWrapper, self).__init__( - num_envs=venv.num_envs, observation_space=venv.observation_space, action_space=venv.action_space - ) + super(ObsWrapper, self).__init__(venv, venv.observation_space, venv.action_space) self.venv = venv @@ -35,10 +33,10 @@ def __init__(self, venv: VecEnv): # for the different types of spaces if isinstance(self.spaces[0], spaces.Box): low_values = np.concatenate( - [venv.observation_space["observation"].low, venv.observation_space["desired_goal"].low] + [venv.observation_space.spaces["observation"].low, venv.observation_space.spaces["desired_goal"].low] ) high_values = np.concatenate( - [venv.observation_space["observation"].high, venv.observation_space["desired_goal"].high] + [venv.observation_space.spaces["observation"].high, venv.observation_space.spaces["desired_goal"].high] ) self.observation_space = spaces.Box(low_values, high_values, dtype=np.float32) elif isinstance(self.spaces[0], spaces.MultiBinary): @@ -48,31 +46,10 @@ def __init__(self, venv: VecEnv): dimensions = [venv.observation_space.spaces["observation"].n, venv.observation_space.spaces["desired_goal"].n] self.observation_space = spaces.MultiDiscrete(dimensions) else: - raise NotImplementedError("{} space is not supported".format(type(self.spaces[0]))) + raise NotImplementedError(f"{type(self.spaces[0])} space is not supported") - def reset(self): + def reset(self) -> Union[int, float]: return self.venv.reset() - def step_async(self, actions): - self.venv.step_async(actions) - - def step_wait(self): + def step_wait(self) -> Tuple[Union[int, float], float, bool, dict]: return self.venv.step_wait() - - def close(self): - return self.venv.close() - - def get_attr(self, attr_name, indices=None): - return self.venv.get_attr(attr_name, indices) - - def set_attr(self, attr_name, value, indices=None): - return self.venv.set_attr(attr_name, value, indices) - - def env_method(self, method_name, 
*method_args, indices=None, **method_kwargs): - return self.venv.env_method(method_name, *method_args, indices=indices, **method_kwargs) - - def get_images(self) -> Sequence[np.ndarray]: - return self.venv.get_images() - - def seed(self, seed: Optional[int] = None) -> List[Union[None, int]]: - return self.venv.seed(seed) diff --git a/tests/test_her.py b/tests/test_her.py index 2aae177154..4a4531bca7 100644 --- a/tests/test_her.py +++ b/tests/test_her.py @@ -11,11 +11,9 @@ from stable_baselines3.td3.policies import MlpPolicy, TD3Policy -@pytest.mark.parametrize( - "model_class, policy, sde_support", [(SAC, SACPolicy, True), (TD3, TD3Policy, False), (DDPG, MlpPolicy, False)] -) +@pytest.mark.parametrize("model_class, policy", [(SAC, SACPolicy), (TD3, TD3Policy), (DDPG, MlpPolicy)]) @pytest.mark.parametrize("online_sampling", [True, False]) -def test_her(model_class, policy, sde_support, online_sampling): +def test_her(model_class, policy, online_sampling): """ Test Hindsight Experience Replay. """ @@ -45,7 +43,6 @@ def test_her(model_class, policy, sde_support, online_sampling): gradient_steps=1, train_freq=1, n_episodes_rollout=-1, - sde_support=sde_support, ) model.learn(total_timesteps=500, callback=None) From 501b1c47cd02a0eef96c721ad8b4f5b95f5d8933 Mon Sep 17 00:00:00 2001 From: Megan Klaiber Date: Mon, 3 Aug 2020 16:09:51 +0200 Subject: [PATCH 11/81] Added getattr. Fixed bug for online sampling. --- stable_baselines3/her/her.py | 132 +++++++++++++-------- stable_baselines3/her/her_replay_buffer.py | 30 +++-- 2 files changed, 101 insertions(+), 61 deletions(-) diff --git a/stable_baselines3/her/her.py b/stable_baselines3/her/her.py index 89f586d09e..a655adb304 100644 --- a/stable_baselines3/her/her.py +++ b/stable_baselines3/her/her.py @@ -1,7 +1,7 @@ from typing import Callable, Optional, Type, Union -import gym import numpy as np +from stable_baselines3.common.base_class import BaseAlgorithm from stable_baselines3.common.buffers import ReplayBuffer from stable_baselines3.common.callbacks import BaseCallback @@ -9,13 +9,28 @@ from stable_baselines3.common.off_policy_algorithm import OffPolicyAlgorithm from stable_baselines3.common.policies import BasePolicy from stable_baselines3.common.type_aliases import GymEnv, MaybeCallback, RolloutReturn -from stable_baselines3.common.vec_env import VecEnv +from stable_baselines3.common.vec_env import VecEnv, VecEnvWrapper from stable_baselines3.her.goal_selection_strategy import KEY_TO_GOAL_STRATEGY, GoalSelectionStrategy from stable_baselines3.her.her_replay_buffer import HerReplayBuffer from stable_baselines3.her.obs_wrapper import ObsWrapper -class HER(OffPolicyAlgorithm): +def check_wrapped_env(env: VecEnv) -> VecEnv: + """ + Check if the environment is already wrapped by an ObsWrapper. + + :param env: (VecEnv) Environment to check. 
+ :return: (VecEnv) env + """ + env_tmp = env + while isinstance(env_tmp, VecEnvWrapper): + if isinstance(env_tmp, ObsWrapper): + return env + env_tmp = env_tmp.venv + return ObsWrapper(env) + + +class HER(BaseAlgorithm): """ Hindsight Experience Replay (HER) @@ -46,18 +61,17 @@ def __init__( **kwargs, ): - self.env = env # check if wrapper for dict support is needed - if isinstance(env.observation_space, gym.spaces.dict.Dict): - self.env = ObsWrapper(env) + self.env = check_wrapped_env(env) - super(HER, self).__init__( - policy=BasePolicy, env=self.env, policy_base=BasePolicy, learning_rate=learning_rate, sde_support=False - ) + super(HER, self).__init__(policy=BasePolicy, env=self.env, policy_base=BasePolicy, learning_rate=learning_rate) # model initialization self.model = model(policy=policy, env=self.env, learning_rate=learning_rate, *args, **kwargs) + self.verbose = self.model.verbose + self.tensorboard_log = self.model.tensorboard_log + # convert goal_strategy into GoalSelectionStrategy if string if isinstance(goal_strategy, str): self.goal_strategy = KEY_TO_GOAL_STRATEGY[goal_strategy.lower()] @@ -74,11 +88,11 @@ def __init__( if self.online_sampling: self.model.replay_buffer = HerReplayBuffer( self.env, - self.model.buffer_size, + self.buffer_size, self.goal_strategy, self.env.observation_space, self.env.action_space, - self.model.device, + self.device, self.n_envs, her_ratio, ) @@ -98,31 +112,37 @@ def learn( tb_log_name: str = "run", eval_log_path: Optional[str] = None, reset_num_timesteps: bool = True, - ) -> "OffPolicyAlgorithm": + ) -> BaseAlgorithm: - total_timesteps, callback = self.model._setup_learn( + total_timesteps, callback = self._setup_learn( total_timesteps, eval_env, callback, eval_freq, n_eval_episodes, eval_log_path, reset_num_timesteps, tb_log_name ) + self.model.start_time = self.start_time + self.model.ep_info_buffer = self.ep_info_buffer + self.model.ep_success_buffer = self.ep_success_buffer + self.model.num_timesteps = self.num_timesteps + self.model._episode_num = self._episode_num + self.model._last_obs = self._last_obs callback.on_training_start(locals(), globals()) - while self.model.num_timesteps < total_timesteps: + while self.num_timesteps < total_timesteps: rollout = self.collect_rollouts( self.env, - n_episodes=self.model.n_episodes_rollout, - n_steps=self.model.train_freq, - action_noise=self.model.action_noise, + n_episodes=self.n_episodes_rollout, + n_steps=self.train_freq, + action_noise=self.action_noise, callback=callback, - learning_starts=self.model.learning_starts, - replay_buffer=self.model.replay_buffer, + learning_starts=self.learning_starts, + replay_buffer=self.replay_buffer, log_interval=log_interval, ) if rollout.continue_training is False: break - if self.model.num_timesteps > 0 and self.model.num_timesteps > self.model.learning_starts: + if self.num_timesteps > 0 and self.num_timesteps > self.learning_starts: # If no `gradient_steps` is specified, # do as many gradients steps as steps performed during the rollout gradient_steps = self.gradient_steps if self.gradient_steps > 0 else rollout.episode_timesteps @@ -132,6 +152,15 @@ def learn( return self + def _setup_model(self) -> None: + self.model._setup_model() + + def __getattr__(self, item): + if hasattr(self.model, item): + return getattr(self.model, item) + else: + raise AttributeError + def collect_rollouts( self, env: VecEnv, @@ -169,7 +198,7 @@ def collect_rollouts( assert env.num_envs == 1, "OffPolicyAlgorithm only support single environment" if self.use_sde: - 
self.model.actor.reset_noise() + self.actor.reset_noise() callback.on_rollout_start() continue_training = True @@ -180,15 +209,16 @@ def collect_rollouts( while not done: # concatenate observation and (desired) goal - observation = self.model._last_obs - self.model._last_obs = np.concatenate([observation["observation"], observation["desired_goal"]], axis=1) + observation = self._last_obs + self._last_obs = np.concatenate([observation["observation"], observation["desired_goal"]], axis=1) if self.use_sde and self.sde_sample_freq > 0 and total_steps % self.sde_sample_freq == 0: # Sample a new noise matrix - self.model.actor.reset_noise() + self.actor.reset_noise() # Select action randomly or according to policy - action, buffer_action = self.model._sample_action(learning_starts, action_noise) + self.model._last_obs = self._last_obs + action, buffer_action = self._sample_action(learning_starts, action_noise) # Rescale and perform action new_obs, reward, done, infos = env.step(action) @@ -200,36 +230,42 @@ def collect_rollouts( episode_reward += reward # Retrieve reward and episode length if using Monitor wrapper - self.model._update_info_buffer(infos, done) + self._update_info_buffer(infos, done) + self.model.ep_info_buffer = self.ep_info_buffer + self.model.ep_success_buffer = self.ep_success_buffer # Store episode in episode storage if replay_buffer is not None: # Store only the unnormalized version - if self.model._vec_normalize_env is not None: - new_obs_ = self.model._vec_normalize_env.get_original_obs() - reward_ = self.model._vec_normalize_env.get_original_reward() + if self._vec_normalize_env is not None: + new_obs_ = self._vec_normalize_env.get_original_obs() + reward_ = self._vec_normalize_env.get_original_reward() else: # Avoid changing the original ones - self.model._last_original_obs, new_obs_, reward_ = observation, new_obs, reward + self._last_original_obs, new_obs_, reward_ = observation, new_obs, reward + self.model._last_original_obs = self._last_original_obs # add current transition to episode storage - self.__episode_storage.append((self.model._last_original_obs, buffer_action, reward_, new_obs_, done)) + self.__episode_storage.append((self._last_original_obs, buffer_action, reward_, new_obs_, done)) - self.model._last_obs = new_obs + self._last_obs = new_obs + self.model._last_obs = self._last_obs # Save the unnormalized observation - if self.model._vec_normalize_env is not None: - self.model._last_original_obs = new_obs_ + if self._vec_normalize_env is not None: + self._last_original_obs = new_obs_ + self.model._last_original_obs = self._last_original_obs - self.model.num_timesteps += 1 + self.num_timesteps += 1 + self.model.num_timesteps = self.num_timesteps episode_timesteps += 1 total_steps += 1 - self.model._update_current_progress_remaining(self.model.num_timesteps, self.model._total_timesteps) + self._update_current_progress_remaining(self.num_timesteps, self._total_timesteps) # For DQN, check if the target network should be updated # and update the exploration schedule # For SAC/TD3, the update is done as the same time as the gradient update # see https://github.com/hill-a/stable-baselines/issues/900 - self.model._on_step() + self._on_step() if 0 < n_steps <= total_steps: break @@ -237,8 +273,8 @@ def collect_rollouts( if done: if self.online_sampling: observations, actions, rewards, next_observations, done = zip(*self.__episode_storage) - self.model.replay_buffer.add(observations, next_observations, actions, rewards, done) - # 
self.model.replay_buffer.add(self.__episode_storage) + self.replay_buffer.add(observations, next_observations, actions, rewards, done) + # self.replay_buffer.add(self.__episode_storage) else: # store episode in replay buffer @@ -247,7 +283,8 @@ def collect_rollouts( self.__episode_storage = [] total_episodes += 1 - self.model._episode_num += 1 + self._episode_num += 1 + self.model._episode_num = self._episode_num episode_rewards.append(episode_reward) total_timesteps.append(episode_timesteps) @@ -255,8 +292,8 @@ def collect_rollouts( action_noise.reset() # Log training infos - if log_interval is not None and self.model._episode_num % log_interval == 0: - self.model._dump_logs() + if log_interval is not None and self._episode_num % log_interval == 0: + self._dump_logs() mean_reward = np.mean(episode_rewards) if total_episodes > 0 else 0.0 @@ -264,9 +301,6 @@ def collect_rollouts( return RolloutReturn(mean_reward, total_steps, total_episodes, continue_training) - def train(self, gradient_steps: int, batch_size: int) -> None: - self.model.train(gradient_steps=gradient_steps, batch_size=batch_size) - def sample_goals(self, sample_idx: int) -> Union[np.ndarray, None]: """ Sample a goal based on goal_strategy. @@ -290,8 +324,8 @@ def sample_goals(self, sample_idx: int) -> Union[np.ndarray, None]: return self.__episode_storage[index][0]["achieved_goal"] elif self.goal_strategy == GoalSelectionStrategy.RANDOM: # replay with random state from the entire replay buffer - index = np.random.choice(np.arange(self.model.replay_buffer.size())) - obs = self.model.replay_buffer.observations[index] + index = np.random.choice(np.arange(self.replay_buffer.size())) + obs = self.replay_buffer.observations[index] # get only the observation part # TODO obs_dim = self.env.observation_space.shape[0] // 2 @@ -315,7 +349,7 @@ def __store_transitions(self) -> None: new_obs = np.concatenate([new_observation["observation"], new_observation["desired_goal"]], axis=1) # store data in replay buffer - self.model.replay_buffer.add(obs, new_obs, action, reward, done) + self.replay_buffer.add(obs, new_obs, action, reward, done) # sample set of additional goals sampled_goals = [sample for sample in (self.sample_goals(idx) for i in range(self.n_goals)) if sample is not None] @@ -330,4 +364,4 @@ def __store_transitions(self) -> None: new_obs = np.concatenate([new_observation["observation"], goal], axis=1) # store data in replay buffer - self.model.replay_buffer.add(obs, new_obs, action, new_reward, done) + self.replay_buffer.add(obs, new_obs, action, new_reward, done) diff --git a/stable_baselines3/her/her_replay_buffer.py b/stable_baselines3/her/her_replay_buffer.py index 7282530ce7..62ae3df272 100644 --- a/stable_baselines3/her/her_replay_buffer.py +++ b/stable_baselines3/her/her_replay_buffer.py @@ -69,16 +69,15 @@ def _sample_transitions(self, batch_size: int) -> ReplayBufferSamples: :param batch_size: (int) Number of element to sample :return: (ReplayBufferSamples) """ - # Select which episodes and time steps to use. 
episode_idxs = np.random.randint(0, self.n_episodes_stored, batch_size) - buffer = np.array(self.buffer) + buffer = np.array(self.buffer, dtype=object) # get episode lengths for selecting timesteps episode_lengths = np.array([len(ep) for ep in buffer[episode_idxs]]) # select timesteps t_samples = np.array([np.random.choice(np.arange(ep_len)) for ep_len in episode_lengths]) # get selected timesteps - transitions = np.array([buffer[ep][trans] for ep, trans in zip(episode_idxs, t_samples)]) + transitions = np.array([buffer[ep][trans] for ep, trans in zip(episode_idxs, t_samples)], dtype=object) # get her samples indices with her_ratio her_idxs = np.where(np.random.uniform(size=batch_size) < self.her_ratio)[0] # her samples episode lengths @@ -87,7 +86,8 @@ def _sample_transitions(self, batch_size: int) -> ReplayBufferSamples: # get new goals with goal selection strategy if self.goal_strategy == GoalSelectionStrategy.FINAL: # replay with final state of current episode - last_transitions = buffer[episode_idxs[her_idxs], -1][:, 0] + last_transitions = [episode[-1][0] for episode in buffer[episode_idxs[her_idxs]]] + # last_transitions = buffer[episode_idxs[her_idxs], -1][:, 0] her_new_goals = [trans["achieved_goal"] for trans in last_transitions] elif self.goal_strategy == GoalSelectionStrategy.FUTURE: # replay with random state which comes from the same episode and was observed after current transition @@ -103,13 +103,15 @@ def _sample_transitions(self, batch_size: int) -> ReplayBufferSamples: elif self.goal_strategy == GoalSelectionStrategy.EPISODE: # replay with random state which comes from the same episode as current transition index = np.array([np.random.choice(np.arange(ep_len)) for ep_len in her_episode_lenghts]) - episode_transitions = buffer[episode_idxs[her_idxs], index][:, 0] + episode_transitions = [buffer[episode_idxs[her_idx]][idx][0] for idx, her_idx in zip(index, her_idxs)] + # episode_transitions = buffer[episode_idxs[her_idxs], index][:, 0] her_new_goals = [trans["achieved_goal"] for trans in episode_transitions] elif self.goal_strategy == GoalSelectionStrategy.RANDOM: # replay with random state from the entire replay buffer ep_idx = np.random.randint(0, self.n_episodes_stored, len(her_idxs)) - state_idx = [np.random.choice(np.arange(len(ep))) for ep in buffer[ep_idx]] - random_transitions = buffer[ep_idx, state_idx][:, 0] + state_idx = np.array([np.random.choice(np.arange(len(ep))) for ep in buffer[ep_idx]]) + random_transitions = [episode[state][0] for episode, state in zip(buffer[ep_idx], state_idx)] + # random_transitions = buffer[ep_idx, state_idx][:, 0] her_new_goals = [trans["achieved_goal"] for trans in random_transitions] else: raise ValueError("Strategy for sampling goals not supported!") @@ -161,7 +163,7 @@ def add(self, obs: np.ndarray, next_obs: np.ndarray, action: np.ndarray, reward: episode_length = len(episode) # check if replay buffer has enough space for all transitions of episode - if self.n_transitions_stored + episode_length <= self.size(): + if self.n_transitions_stored + episode_length <= self.buffer_size: self.buffer.append(episode) # update replay size self.n_episodes_stored += 1 @@ -174,12 +176,10 @@ def add(self, obs: np.ndarray, next_obs: np.ndarray, action: np.ndarray, reward: self.buffer[idx] = episode elif len(self.buffer[idx]) > episode_length: self.buffer[idx] = episode - self.n_transitions_stored -= self.buffer[idx] - episode_length + self.n_transitions_stored -= len(self.buffer[idx]) - episode_length - if self.n_transitions_stored == 
self.size(): + if self.n_transitions_stored == self.buffer_size: self.full = True - else: - self.full = False @property def n_episodes_stored(self): @@ -201,3 +201,9 @@ def clear_buffer(self): self.buffer = [] self.n_episodes_stored = 0 self.n_transitions_stored = 0 + + def size(self) -> int: + """ + :return: (int) The current size of the buffer in transitions. + """ + return self.n_transitions_stored From 5d096195ede97c3892c587d176e5549c1501a894 Mon Sep 17 00:00:00 2001 From: Megan Klaiber Date: Thu, 6 Aug 2020 02:03:38 +0200 Subject: [PATCH 12/81] Updated save/load funtions. Small changes. --- stable_baselines3/her/her.py | 213 +++++++++++++++++---- stable_baselines3/her/her_replay_buffer.py | 24 +-- stable_baselines3/her/obs_wrapper.py | 5 +- tests/test_her.py | 92 ++++++++- 4 files changed, 283 insertions(+), 51 deletions(-) diff --git a/stable_baselines3/her/her.py b/stable_baselines3/her/her.py index a655adb304..89f4ed312d 100644 --- a/stable_baselines3/her/her.py +++ b/stable_baselines3/her/her.py @@ -1,14 +1,18 @@ -from typing import Callable, Optional, Type, Union +import io +import pathlib +from typing import Callable, Iterable, List, Optional, Tuple, Type, Union import numpy as np -from stable_baselines3.common.base_class import BaseAlgorithm +from stable_baselines3.common.base_class import BaseAlgorithm from stable_baselines3.common.buffers import ReplayBuffer from stable_baselines3.common.callbacks import BaseCallback from stable_baselines3.common.noise import ActionNoise from stable_baselines3.common.off_policy_algorithm import OffPolicyAlgorithm from stable_baselines3.common.policies import BasePolicy +from stable_baselines3.common.save_util import load_from_zip_file, recursive_getattr, recursive_setattr, save_to_zip_file from stable_baselines3.common.type_aliases import GymEnv, MaybeCallback, RolloutReturn +from stable_baselines3.common.utils import check_for_correct_spaces from stable_baselines3.common.vec_env import VecEnv, VecEnvWrapper from stable_baselines3.her.goal_selection_strategy import KEY_TO_GOAL_STRATEGY, GoalSelectionStrategy from stable_baselines3.her.her_replay_buffer import HerReplayBuffer @@ -36,7 +40,7 @@ class HER(BaseAlgorithm): :param policy: (BasePolicy) The policy model to use. :param env: (VecEnv) The environment to learn from. - :param model: (OffPolicyAlgorithm) Off policy model which will be used with hindsight experience replay. (SAC, TD3) + :param model_class: (OffPolicyAlgorithm) Off policy model which will be used with hindsight experience replay. (SAC, TD3) :param n_goals: (int) Number of sampled goals for replay. :param goal_strategy: (GoalSelectionStrategy or str) Strategy for sampling goals for replay. 
One of ['episode', 'final', 'future', 'random'] @@ -51,7 +55,7 @@ def __init__( self, policy: Type[BasePolicy], env: VecEnv, - model: Type[OffPolicyAlgorithm], + model_class: Type[OffPolicyAlgorithm], n_goals: int = 5, goal_strategy: Union[GoalSelectionStrategy, str] = "future", online_sampling: bool = False, @@ -67,7 +71,10 @@ def __init__( super(HER, self).__init__(policy=BasePolicy, env=self.env, policy_base=BasePolicy, learning_rate=learning_rate) # model initialization - self.model = model(policy=policy, env=self.env, learning_rate=learning_rate, *args, **kwargs) + self.model_class = model_class + self.model = model_class( + policy=policy, env=self.env, learning_rate=learning_rate, *args, **kwargs, # pytype: disable=wrong-keyword-args + ) self.verbose = self.model.verbose self.tensorboard_log = self.model.tensorboard_log @@ -85,6 +92,7 @@ def __init__( # if we sample her transitions online use custom replay buffer self.online_sampling = online_sampling + self.her_ratio = her_ratio if self.online_sampling: self.model.replay_buffer = HerReplayBuffer( self.env, @@ -94,13 +102,16 @@ def __init__( self.env.action_space, self.device, self.n_envs, - her_ratio, + self.her_ratio, ) # storage for transitions of current episode - self.__episode_storage = [] + self._episode_storage = [] self.n_goals = n_goals + def _setup_model(self) -> None: + self.model._setup_model() + def learn( self, total_timesteps: int, @@ -152,15 +163,6 @@ def learn( return self - def _setup_model(self) -> None: - self.model._setup_model() - - def __getattr__(self, item): - if hasattr(self.model, item): - return getattr(self.model, item) - else: - raise AttributeError - def collect_rollouts( self, env: VecEnv, @@ -246,7 +248,7 @@ def collect_rollouts( self.model._last_original_obs = self._last_original_obs # add current transition to episode storage - self.__episode_storage.append((self._last_original_obs, buffer_action, reward_, new_obs_, done)) + self._episode_storage.append((self._last_original_obs, buffer_action, reward_, new_obs_, done)) self._last_obs = new_obs self.model._last_obs = self._last_obs @@ -272,15 +274,15 @@ def collect_rollouts( if done: if self.online_sampling: - observations, actions, rewards, next_observations, done = zip(*self.__episode_storage) + observations, actions, rewards, next_observations, done = zip(*self._episode_storage) self.replay_buffer.add(observations, next_observations, actions, rewards, done) - # self.replay_buffer.add(self.__episode_storage) + # self.replay_buffer.add(self._episode_storage) else: # store episode in replay buffer - self.__store_transitions() + self._store_transitions() # clear storage for current episode - self.__episode_storage = [] + self._episode_storage = [] total_episodes += 1 self._episode_num += 1 @@ -301,46 +303,45 @@ def collect_rollouts( return RolloutReturn(mean_reward, total_steps, total_episodes, continue_training) - def sample_goals(self, sample_idx: int) -> Union[np.ndarray, None]: + def sample_goals(self, sample_idx: int, obs_dim: int) -> Union[np.ndarray, None]: """ Sample a goal based on goal_strategy. :param sample_idx: (int) Index of current transition. + :param obs_dim: (int) Dimension of real observation without goal. It is needed for the random strategy. :return: (np.ndarray or None) Return sampled goal. 
""" if self.goal_strategy == GoalSelectionStrategy.FINAL: # replay with final state of current episode - return self.__episode_storage[-1][0]["achieved_goal"] + return self._episode_storage[-1][0]["achieved_goal"] elif self.goal_strategy == GoalSelectionStrategy.FUTURE: # replay with random state which comes from the same episode and was observed after current transition # we have no transition after last transition of episode - if (sample_idx + 1) < len(self.__episode_storage): - index = np.random.choice(np.arange(sample_idx + 1, len(self.__episode_storage))) - return self.__episode_storage[index][0]["achieved_goal"] + if (sample_idx + 1) < len(self._episode_storage): + index = np.random.choice(np.arange(sample_idx + 1, len(self._episode_storage))) + return self._episode_storage[index][0]["achieved_goal"] elif self.goal_strategy == GoalSelectionStrategy.EPISODE: # replay with random state which comes from the same episode as current transition - index = np.random.choice(np.arange(len(self.__episode_storage))) - return self.__episode_storage[index][0]["achieved_goal"] + index = np.random.choice(np.arange(len(self._episode_storage))) + return self._episode_storage[index][0]["achieved_goal"] elif self.goal_strategy == GoalSelectionStrategy.RANDOM: # replay with random state from the entire replay buffer index = np.random.choice(np.arange(self.replay_buffer.size())) obs = self.replay_buffer.observations[index] # get only the observation part - # TODO - obs_dim = self.env.observation_space.shape[0] // 2 obs_array = obs[:, :obs_dim] return obs_array else: raise ValueError("Strategy for sampling goals not supported!") - def __store_transitions(self) -> None: + def _store_transitions(self) -> None: """ Store current episode in replay buffer. Sample additional goals and store new transitions in replay buffer. """ # iterate over current episodes transitions - for idx, trans in enumerate(self.__episode_storage): + for idx, trans in enumerate(self._episode_storage): observation, action, reward, new_observation, done = trans @@ -352,7 +353,10 @@ def __store_transitions(self) -> None: self.replay_buffer.add(obs, new_obs, action, reward, done) # sample set of additional goals - sampled_goals = [sample for sample in (self.sample_goals(idx) for i in range(self.n_goals)) if sample is not None] + obs_dim = observation["observation"].shape[1] + sampled_goals = [ + sample for sample in (self.sample_goals(idx, obs_dim) for i in range(self.n_goals)) if sample is not None + ] # iterate over sampled goals and store new transitions in replay buffer for goal in sampled_goals: @@ -365,3 +369,146 @@ def __store_transitions(self) -> None: # store data in replay buffer self.replay_buffer.add(obs, new_obs, action, new_reward, done) + + def __getattr__(self, item): + """ + Find attribute from model class if this class does not have it. + """ + if hasattr(self.model, item): + return getattr(self.model, item) + else: + raise AttributeError + + def get_torch_variables(self) -> Tuple[List[str], List[str]]: + return self.model.get_torch_variables() + + def save( + self, + path: Union[str, pathlib.Path, io.BufferedIOBase], + exclude: Optional[Iterable[str]] = None, + include: Optional[Iterable[str]] = None, + ) -> None: + """ + Save all the attributes of the object and the model parameters in a zip-file. 
+ + :param path: (Union[str, pathlib.Path, io.BufferedIOBase]) path to the file where the rl agent should be saved + :param exclude: name of parameters that should be excluded in addition to the default one + :param include: name of parameters that might be excluded but should be included anyway + """ + # copy parameter list so we don't mutate the original dict + data = self.__dict__.copy() + # add model parameter + data["model_dict"] = self.model.__dict__.copy() + + # Exclude is union of specified parameters (if any) and standard exclusions + if exclude is None: + exclude = [] + exclude = set(exclude).union(self.excluded_save_params()) + exclude.add("model") + + # Do not exclude params if they are specifically included + if include is not None: + exclude = exclude.difference(include) + + state_dicts_names, tensors_names = self.get_torch_variables() + # any params that are in the save vars must not be saved by data + torch_variables = state_dicts_names + tensors_names + for torch_var in torch_variables: + # we need to get only the name of the top most module as we'll remove that + var_name = torch_var.split(".")[0] + exclude.add(var_name) + + # Remove parameter entries of parameters which are to be excluded + for param_name in exclude: + data.pop(param_name, None) + data["model_dict"].pop(param_name, None) + + # Build dict of tensor variables + tensors = None + if tensors_names is not None: + tensors = {} + for name in tensors_names: + attr = recursive_getattr(self, name) + tensors[name] = attr + + # Build dict of state_dicts + params_to_save = {} + for name in state_dicts_names: + # always take attribute from model class if possible + if hasattr(self.model, name): + attr = recursive_getattr(self.model, name) + else: + attr = recursive_getattr(self, name) + # Retrieve state dict + params_to_save[name] = attr.state_dict() + + save_to_zip_file(path, data=data, params=params_to_save, tensors=tensors) + + @classmethod + def load(cls, load_path: str, env: Optional[GymEnv] = None, **kwargs) -> "BaseAlgorithm": + """ + Load the model from a zip-file + + :param load_path: the location of the saved data + :param env: the new environment to run the loaded model on + (can be None if you only need prediction from a trained model) has priority over any saved environment + :param kwargs: extra arguments to change the model when loading + """ + data, params, tensors = load_from_zip_file(load_path) + + if "policy_kwargs" in data: + for arg_to_remove in ["device"]: + if arg_to_remove in data["policy_kwargs"]: + del data["policy_kwargs"][arg_to_remove] + + if "policy_kwargs" in kwargs and kwargs["policy_kwargs"] != data["policy_kwargs"]: + raise ValueError( + f"The specified policy kwargs do not equal the stored policy kwargs." 
+ f"Stored kwargs: {data['policy_kwargs']}, specified kwargs: {kwargs['policy_kwargs']}" + ) + + # check if observation space and action space are part of the saved parameters + if "observation_space" not in data or "action_space" not in data: + raise KeyError("The observation_space and action_space were not given, can't verify new environments") + # check if given env is valid + if env is not None: + env = check_wrapped_env(env) + check_for_correct_spaces(env, data["observation_space"], data["action_space"]) + # if no new env was given use stored env if possible + if env is None and "env" in data: + env = data["env"] + + # noinspection PyArgumentList + model = cls( + policy=data["model_dict"]["policy_class"], + env=env, + model_class=data["model_class"], + n_goals=data["n_goals"], + goal_strategy=data["goal_strategy"], + online_sampling=data["online_sampling"], + her_ratio=data["her_ratio"], + learning_rate=data["learning_rate"], + policy_kwargs=data["model_dict"]["policy_kwargs"], + _init_setup_model=True, # pytype: disable=not-instantiable,wrong-keyword-args + ) + + # load parameters + model.__dict__.update(data) + model.model.__dict__.update(data["model_dict"]) + model.__dict__.update(kwargs) + + # put state_dicts back in place + for name in params: + attr = recursive_getattr(model.model, name) + attr.load_state_dict(params[name]) + + # put tensors back in place + if tensors is not None: + for name in tensors: + recursive_setattr(model.model, name, tensors[name]) + + # Sample gSDE exploration matrix, so it uses the right device + # see issue #44 + if model.model.use_sde: + model.model.policy.reset_noise() # pytype: disable=attribute-error + return model diff --git a/stable_baselines3/her/her_replay_buffer.py b/stable_baselines3/her/her_replay_buffer.py index 62ae3df272..4fa3f0882b 100644 --- a/stable_baselines3/her/her_replay_buffer.py +++ b/stable_baselines3/her/her_replay_buffer.py @@ -46,14 +46,13 @@ def __init__( # buffer with episodes self.buffer = [] - # TODO just for typing reason , need another solution - self.observations = np.zeros((self.buffer_size, self.n_envs,) + self.obs_shape, dtype=observation_space.dtype) self.goal_strategy = goal_strategy - self.her_ratio = 1 - (1.0 / (1 + her_ratio)) + # probability for selecting her indices + self.her_prob = 1 - (1.0 / (1 + her_ratio)) # memory management - self.__n_episodes_stored = 0 - self.__n_transitions_stored = 0 + self._n_episodes_stored = 0 + self._n_transitions_stored = 0 def sample(self, batch_size: int, env: Optional[VecNormalize] = None) -> ReplayBufferSamples: """ @@ -78,8 +77,8 @@ def _sample_transitions(self, batch_size: int) -> ReplayBufferSamples: t_samples = np.array([np.random.choice(np.arange(ep_len)) for ep_len in episode_lengths]) # get selected timesteps transitions = np.array([buffer[ep][trans] for ep, trans in zip(episode_idxs, t_samples)], dtype=object) - # get her samples indices with her_ratio - her_idxs = np.where(np.random.uniform(size=batch_size) < self.her_ratio)[0] + # get her samples indices with her_prob + her_idxs = np.where(np.random.uniform(size=batch_size) < self.her_prob)[0] # her samples episode lengths her_episode_lenghts = episode_lengths[her_idxs] @@ -87,7 +86,6 @@ def _sample_transitions(self, batch_size: int) -> ReplayBufferSamples: if self.goal_strategy == GoalSelectionStrategy.FINAL: # replay with final state of current episode last_transitions = [episode[-1][0] for episode in buffer[episode_idxs[her_idxs]]] - # last_transitions = buffer[episode_idxs[her_idxs], -1][:, 0] her_new_goals 
= [trans["achieved_goal"] for trans in last_transitions] elif self.goal_strategy == GoalSelectionStrategy.FUTURE: # replay with random state which comes from the same episode and was observed after current transition @@ -104,14 +102,12 @@ def _sample_transitions(self, batch_size: int) -> ReplayBufferSamples: # replay with random state which comes from the same episode as current transition index = np.array([np.random.choice(np.arange(ep_len)) for ep_len in her_episode_lenghts]) episode_transitions = [buffer[episode_idxs[her_idx]][idx][0] for idx, her_idx in zip(index, her_idxs)] - # episode_transitions = buffer[episode_idxs[her_idxs], index][:, 0] her_new_goals = [trans["achieved_goal"] for trans in episode_transitions] elif self.goal_strategy == GoalSelectionStrategy.RANDOM: # replay with random state from the entire replay buffer ep_idx = np.random.randint(0, self.n_episodes_stored, len(her_idxs)) state_idx = np.array([np.random.choice(np.arange(len(ep))) for ep in buffer[ep_idx]]) random_transitions = [episode[state][0] for episode, state in zip(buffer[ep_idx], state_idx)] - # random_transitions = buffer[ep_idx, state_idx][:, 0] her_new_goals = [trans["achieved_goal"] for trans in random_transitions] else: raise ValueError("Strategy for sampling goals not supported!") @@ -183,19 +179,19 @@ def add(self, obs: np.ndarray, next_obs: np.ndarray, action: np.ndarray, reward: @property def n_episodes_stored(self): - return self.__n_episodes_stored + return self._n_episodes_stored @n_episodes_stored.setter def n_episodes_stored(self, n): - self.__n_episodes_stored = n + self._n_episodes_stored = n @property def n_transitions_stored(self): - return self.__n_transitions_stored + return self._n_transitions_stored @n_transitions_stored.setter def n_transitions_stored(self, n): - self.__n_transitions_stored = n + self._n_transitions_stored = n def clear_buffer(self): self.buffer = [] diff --git a/stable_baselines3/her/obs_wrapper.py b/stable_baselines3/her/obs_wrapper.py index 1a909968c0..8eb619e47c 100644 --- a/stable_baselines3/her/obs_wrapper.py +++ b/stable_baselines3/her/obs_wrapper.py @@ -1,4 +1,4 @@ -from typing import Union, Tuple +from typing import Tuple, Union import numpy as np from gym import spaces @@ -25,9 +25,8 @@ def __init__(self, venv: VecEnv): self.obs_dim = 1 self.goal_dim = 1 else: - goal_space_shape = venv.observation_space.spaces["achieved_goal"].shape self.obs_dim = venv.observation_space.spaces["observation"].shape[0] - self.goal_dim = goal_space_shape[0] + self.goal_dim = venv.observation_space.spaces["achieved_goal"].shape[0] # new observation space with concatenated observation and (desired) goal # for the different types of spaces diff --git a/tests/test_her.py b/tests/test_her.py index 4a4531bca7..37bad828ab 100644 --- a/tests/test_her.py +++ b/tests/test_her.py @@ -1,3 +1,6 @@ +import os +from copy import deepcopy + import numpy as np import pytest import torch as th @@ -33,7 +36,7 @@ def test_her(model_class, policy, online_sampling): goal_strategy="future", online_sampling=online_sampling, action_noise=action_noise, - verbose=1, + verbose=0, tau=0.05, batch_size=128, learning_rate=0.001, @@ -113,3 +116,90 @@ def test_goal_strategy(goal_strategy, online_sampling): n_episodes_rollout=-1, ) model.learn(total_timesteps=200, callback=None) + + +@pytest.mark.parametrize("model_class, policy", [(SAC, SACPolicy), (TD3, TD3Policy), (DDPG, MlpPolicy)]) +def test_save_load(tmp_path, model_class, policy): + """ + Test if 'save' and 'load' saves and loads model correctly + """ + 
env = BitFlippingEnv(n_bits=4, continuous=True) + env = DummyVecEnv([lambda: env]) + + # Create action noise + n_actions = env.action_space.shape[0] + action_noise = OrnsteinUhlenbeckActionNoise(np.zeros(n_actions,), 0.2 * np.ones((n_actions,))) + + # create model + model = HER( + policy, + env, + model_class, + n_goals=5, + goal_strategy="future", + online_sampling=True, + action_noise=action_noise, + verbose=0, + tau=0.05, + batch_size=128, + learning_rate=0.001, + policy_kwargs=dict(net_arch=[64]), + buffer_size=int(1e6), + gamma=0.98, + gradient_steps=1, + train_freq=1, + n_episodes_rollout=-1, + ) + + model.learn(total_timesteps=500, callback=None) + + env.reset() + + observations_list = [] + for _ in range(10): + obs = env.step([env.action_space.sample()])[0] + observation = np.concatenate([obs["observation"], obs["desired_goal"]], axis=1) + observations_list.append(observation) + + observations = np.concatenate(observations_list, axis=0) + + # Get dictionary of current parameters + params = deepcopy(model.model.policy.state_dict()) + + # Modify all parameters to be random values + random_params = dict((param_name, th.rand_like(param)) for param_name, param in params.items()) + + # Update model parameters with the new random values + model.model.policy.load_state_dict(random_params) + + new_params = model.model.policy.state_dict() + # Check that all params are different now + for k in params: + assert not th.allclose(params[k], new_params[k]), "Parameters did not change as expected." + + params = new_params + + # get selected actions + selected_actions, _ = model.model.predict(observations, deterministic=True) + + # Check + model.save(tmp_path / "test_save.zip") + del model + model = HER.load(str(tmp_path / "test_save.zip"), env=env) + + # check if params are still the same after load + new_params = model.model.policy.state_dict() + + # Check that all params are the same as before save load procedure now + for key in params: + assert th.allclose(params[key], new_params[key]), "Model parameters not the same after save and load." + + # check if model still selects the same actions + new_selected_actions, _ = model.model.predict(observations, deterministic=True) + assert np.allclose(selected_actions, new_selected_actions, 1e-4) + + # check if learn still works + model.learn(total_timesteps=1000, eval_freq=500) + + # clear file from os + os.remove(tmp_path / "test_save.zip") From cb9026fe8da8fcec3dc2a7ad584da04e0c24a02b Mon Sep 17 00:00:00 2001 From: Megan Klaiber Date: Thu, 6 Aug 2020 11:46:06 +0200 Subject: [PATCH 13/81] Added her to init. 
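With the package ``__init__`` updated below, ``HER`` becomes importable from the top level of ``stable_baselines3``. A minimal usage sketch, mirroring the settings exercised in ``tests/test_her.py`` (the import paths are assumptions based on the library layout at this point in the series):

    from stable_baselines3 import HER, SAC
    from stable_baselines3.common.bit_flipping_env import BitFlippingEnv
    from stable_baselines3.common.vec_env import DummyVecEnv
    from stable_baselines3.sac.policies import SACPolicy

    env = DummyVecEnv([lambda: BitFlippingEnv(n_bits=4, continuous=True)])

    model = HER(
        SACPolicy,
        env,
        SAC,  # off-policy model class wrapped by HER
        n_goals=5,
        goal_strategy="future",
        online_sampling=False,
        # remaining keyword arguments are forwarded to the wrapped SAC model
        gradient_steps=1,
        train_freq=1,
        n_episodes_rollout=-1,
    )
    model.learn(total_timesteps=200)
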
--- stable_baselines3/__init__.py | 1 + stable_baselines3/her/__init__.py | 1 + 2 files changed, 2 insertions(+) diff --git a/stable_baselines3/__init__.py b/stable_baselines3/__init__.py index b88ca5d4ca..bcac479de6 100644 --- a/stable_baselines3/__init__.py +++ b/stable_baselines3/__init__.py @@ -3,6 +3,7 @@ from stable_baselines3.a2c import A2C from stable_baselines3.ddpg import DDPG from stable_baselines3.dqn import DQN +from stable_baselines3.her import HER from stable_baselines3.ppo import PPO from stable_baselines3.sac import SAC from stable_baselines3.td3 import TD3 diff --git a/stable_baselines3/her/__init__.py b/stable_baselines3/her/__init__.py index e69de29bb2..4e29bce4a5 100644 --- a/stable_baselines3/her/__init__.py +++ b/stable_baselines3/her/__init__.py @@ -0,0 +1 @@ +from stable_baselines3.her.her import HER \ No newline at end of file From e30f730540202cbf426e4f7ec9cf46886c1e9b8f Mon Sep 17 00:00:00 2001 From: Megan Klaiber Date: Fri, 7 Aug 2020 10:02:17 +0200 Subject: [PATCH 14/81] Updated save method. --- stable_baselines3/her/__init__.py | 2 +- stable_baselines3/her/her.py | 87 ++++++++++--------------------- 2 files changed, 28 insertions(+), 61 deletions(-) diff --git a/stable_baselines3/her/__init__.py b/stable_baselines3/her/__init__.py index 4e29bce4a5..ce43bf04cf 100644 --- a/stable_baselines3/her/__init__.py +++ b/stable_baselines3/her/__init__.py @@ -1 +1 @@ -from stable_baselines3.her.her import HER \ No newline at end of file +from stable_baselines3.her.her import HER diff --git a/stable_baselines3/her/her.py b/stable_baselines3/her/her.py index 89f4ed312d..e0eb93bc4f 100644 --- a/stable_baselines3/her/her.py +++ b/stable_baselines3/her/her.py @@ -10,7 +10,7 @@ from stable_baselines3.common.noise import ActionNoise from stable_baselines3.common.off_policy_algorithm import OffPolicyAlgorithm from stable_baselines3.common.policies import BasePolicy -from stable_baselines3.common.save_util import load_from_zip_file, recursive_getattr, recursive_setattr, save_to_zip_file +from stable_baselines3.common.save_util import load_from_zip_file, recursive_getattr, recursive_setattr from stable_baselines3.common.type_aliases import GymEnv, MaybeCallback, RolloutReturn from stable_baselines3.common.utils import check_for_correct_spaces from stable_baselines3.common.vec_env import VecEnv, VecEnvWrapper @@ -125,6 +125,8 @@ def learn( reset_num_timesteps: bool = True, ) -> BaseAlgorithm: + eval_env = check_wrapped_env(eval_env) if eval_env is not None else eval_env + total_timesteps, callback = self._setup_learn( total_timesteps, eval_env, callback, eval_freq, n_eval_episodes, eval_log_path, reset_num_timesteps, tb_log_name ) @@ -134,6 +136,7 @@ def learn( self.model.num_timesteps = self.num_timesteps self.model._episode_num = self._episode_num self.model._last_obs = self._last_obs + self.model._total_timesteps = self._total_timesteps callback.on_training_start(locals(), globals()) @@ -395,54 +398,15 @@ def save( :param exclude: name of parameters that should be excluded in addition to the default one :param include: name of parameters that might be excluded but should be included anyway """ - # copy parameter list so we don't mutate the original dict - data = self.__dict__.copy() - # add model parameter - data["model_dict"] = self.model.__dict__.copy() - - # Exclude is union of specified parameters (if any) and standard exclusions - if exclude is None: - exclude = [] - exclude = set(exclude).union(self.excluded_save_params()) - exclude.add("model") - - # Do not exclude 
params if they are specifically included - if include is not None: - exclude = exclude.difference(include) - - state_dicts_names, tensors_names = self.get_torch_variables() - # any params that are in the save vars must not be saved by data - torch_variables = state_dicts_names + tensors_names - for torch_var in torch_variables: - # we need to get only the name of the top most module as we'll remove that - var_name = torch_var.split(".")[0] - exclude.add(var_name) - - # Remove parameter entries of parameters which are to be excluded - for param_name in exclude: - data.pop(param_name, None) - data["model_dict"].pop(param_name, None) - - # Build dict of tensor variables - tensors = None - if tensors_names is not None: - tensors = {} - for name in tensors_names: - attr = recursive_getattr(self, name) - tensors[name] = attr - - # Build dict of state_dicts - params_to_save = {} - for name in state_dicts_names: - # always take attribute from model class if possible - if hasattr(self.model, name): - attr = recursive_getattr(self.model, name) - else: - attr = recursive_getattr(self, name) - # Retrieve state dict - params_to_save[name] = attr.state_dict() - - save_to_zip_file(path, data=data, params=params_to_save, tensors=tensors) + + # add HER parameters to model + self.model.n_goals = self.n_goals + self.model.her_ratio = self.her_ratio + self.model.goal_strategy = self.goal_strategy + self.model.online_sampling = self.online_sampling + self.model.model_class = self.model_class + + self.model.save(path, exclude, include) @classmethod def load(cls, load_path: str, env: Optional[GymEnv] = None, **kwargs) -> "BaseAlgorithm": @@ -479,8 +443,8 @@ def load(cls, load_path: str, env: Optional[GymEnv] = None, **kwargs) -> "BaseAl env = data["env"] # noinspection PyArgumentList - model = cls( - policy=data["model_dict"]["policy_class"], + her_model = cls( + policy=data["policy_class"], env=env, model_class=data["model_class"], n_goals=data["n_goals"], @@ -488,27 +452,30 @@ def load(cls, load_path: str, env: Optional[GymEnv] = None, **kwargs) -> "BaseAl online_sampling=data["online_sampling"], her_ratio=data["her_ratio"], learning_rate=data["learning_rate"], - policy_kwargs=data["model_dict"]["policy_kwargs"], + policy_kwargs=data["policy_kwargs"], _init_setup_model=True, # pytype: disable=not-instantiable,wrong-keyword-args ) # load parameters - model.__dict__.update(data) - model.model.__dict__.update(data["model_dict"]) - model.__dict__.update(kwargs) + her_model.model.__dict__.update(data) + her_model.__dict__.update(kwargs) + + her_model._total_timesteps = her_model.model._total_timesteps + her_model.num_timesteps = her_model.model.num_timesteps + her_model._episode_num = her_model.model._episode_num # put state_dicts back in place for name in params: - attr = recursive_getattr(model.model, name) + attr = recursive_getattr(her_model.model, name) attr.load_state_dict(params[name]) # put tensors back in place if tensors is not None: for name in tensors: - recursive_setattr(model.model, name, tensors[name]) + recursive_setattr(her_model.model, name, tensors[name]) # Sample gSDE exploration matrix, so it uses the right device # see issue #44 - if model.model.use_sde: - model.model.policy.reset_noise() # pytype: disable=attribute-error - return model + if her_model.model.use_sde: + her_model.model.policy.reset_noise() # pytype: disable=attribute-error + return her_model From 7d1eb24f57a91b1b6e36844130f334809e32e1b0 Mon Sep 17 00:00:00 2001 From: Megan Klaiber Date: Fri, 7 Aug 2020 10:35:38 +0200 Subject: 
[PATCH 15/81] Updated her ratio. --- stable_baselines3/her/her.py | 7 +++---- stable_baselines3/her/her_replay_buffer.py | 14 +++++++------- 2 files changed, 10 insertions(+), 11 deletions(-) diff --git a/stable_baselines3/her/her.py b/stable_baselines3/her/her.py index e0eb93bc4f..7b0f49561f 100644 --- a/stable_baselines3/her/her.py +++ b/stable_baselines3/her/her.py @@ -41,12 +41,11 @@ class HER(BaseAlgorithm): :param policy: (BasePolicy) The policy model to use. :param env: (VecEnv) The environment to learn from. :param model_class: (OffPolicyAlgorithm) Off policy model which will be used with hindsight experience replay. (SAC, TD3) - :param n_goals: (int) Number of sampled goals for replay. + :param n_goals: (int) Number of sampled goals for replay. (offline sampling) :param goal_strategy: (GoalSelectionStrategy or str) Strategy for sampling goals for replay. One of ['episode', 'final', 'future', 'random'] :param online_sampling: (bool) Sample HER transitions online. - :her_ratio: (int) The ratio between HER replays and regular replays (e.g. k = 4 -> 4 times - as many HER replays as regular replays are used) + :her_ratio: (float) The ratio between HER replays and regular replays in percent (between 0 and 1, for online sampling) :param learning_rate: (float or callable) learning rate for the optimizer, it can be a function of the current progress remaining (from 1 to 0) """ @@ -59,7 +58,7 @@ def __init__( n_goals: int = 5, goal_strategy: Union[GoalSelectionStrategy, str] = "future", online_sampling: bool = False, - her_ratio: int = 2, + her_ratio: float = 0.6, learning_rate: Union[float, Callable] = 3e-4, *args, **kwargs, diff --git a/stable_baselines3/her/her_replay_buffer.py b/stable_baselines3/her/her_replay_buffer.py index 4fa3f0882b..89d6d75f62 100644 --- a/stable_baselines3/her/her_replay_buffer.py +++ b/stable_baselines3/her/her_replay_buffer.py @@ -23,8 +23,8 @@ class HerReplayBuffer(BaseBuffer): :param device: (Union[th.device, str]) PyTorch device to which the values will be converted :param n_envs: (int) Number of parallel environments - :param her_ratio: (int) The ratio between HER replays and regular replays (e.g. 
k = 4 -> 4 times - as many HER replays as regular replays are used) + :her_ratio: (float) The ratio between HER replays and regular replays in percent (between 0 and 1, for online sampling) + """ def __init__( @@ -36,7 +36,7 @@ def __init__( action_space: spaces.Space, device: Union[th.device, str] = "cpu", n_envs: int = 1, - her_ratio: int = 2, + her_ratio: float = 0.6, ): super(HerReplayBuffer, self).__init__(buffer_size, observation_space, action_space, device, n_envs) @@ -47,8 +47,8 @@ def __init__( # buffer with episodes self.buffer = [] self.goal_strategy = goal_strategy - # probability for selecting her indices - self.her_prob = 1 - (1.0 / (1 + her_ratio)) + # percentage of her indices + self.her_ratio = her_ratio # memory management self._n_episodes_stored = 0 @@ -77,8 +77,8 @@ def _sample_transitions(self, batch_size: int) -> ReplayBufferSamples: t_samples = np.array([np.random.choice(np.arange(ep_len)) for ep_len in episode_lengths]) # get selected timesteps transitions = np.array([buffer[ep][trans] for ep, trans in zip(episode_idxs, t_samples)], dtype=object) - # get her samples indices with her_prob - her_idxs = np.where(np.random.uniform(size=batch_size) < self.her_prob)[0] + # get her samples indices with her_ratio + her_idxs = np.random.choice(np.arange(batch_size), int(self.her_ratio * batch_size), replace=False) # her samples episode lengths her_episode_lenghts = episode_lengths[her_idxs] From 21bd1a4fcc39f5af3e1b2cf268be0e6144351f18 Mon Sep 17 00:00:00 2001 From: Megan Klaiber Date: Tue, 11 Aug 2020 16:11:04 +0200 Subject: [PATCH 16/81] Move obs_wrapper --- .../vec_env/dict_obs_wrapper.py} | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) rename stable_baselines3/{her/obs_wrapper.py => common/vec_env/dict_obs_wrapper.py} (85%) diff --git a/stable_baselines3/her/obs_wrapper.py b/stable_baselines3/common/vec_env/dict_obs_wrapper.py similarity index 85% rename from stable_baselines3/her/obs_wrapper.py rename to stable_baselines3/common/vec_env/dict_obs_wrapper.py index 8eb619e47c..35eb7908dd 100644 --- a/stable_baselines3/her/obs_wrapper.py +++ b/stable_baselines3/common/vec_env/dict_obs_wrapper.py @@ -1,5 +1,3 @@ -from typing import Tuple, Union - import numpy as np from gym import spaces @@ -47,8 +45,18 @@ def __init__(self, venv: VecEnv): else: raise NotImplementedError(f"{type(self.spaces[0])} space is not supported") - def reset(self) -> Union[int, float]: + def reset(self): return self.venv.reset() - def step_wait(self) -> Tuple[Union[int, float], float, bool, dict]: + def step_wait(self): return self.venv.step_wait() + + @staticmethod + def convert_dict(self, observation: dict) -> np.ndarray: + """ + Concatenate observation and desired goal of observation dict. + + :param observation: (dict) + :return: (np.ndarray) + """ + return np.concatenate([observation["observation"], observation["desired_goal"]]) From e647d3690c76c23877a7f687e17e11c5c923261e Mon Sep 17 00:00:00 2001 From: Megan Klaiber Date: Tue, 11 Aug 2020 17:03:29 +0200 Subject: [PATCH 17/81] Added DQN test. 
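
The change to BasePolicy.predict below lets goal-based dict observations be fed directly to the wrapped model. A rough sketch of what ObsWrapper.convert_dict amounts to for a single observation (the arrays are made-up stand-ins, not values from the tests):

    import numpy as np

    obs = {
        "observation": np.array([0.0, 1.0, 0.0, 1.0]),    # hypothetical 4-bit state
        "achieved_goal": np.array([0.0, 1.0, 0.0, 1.0]),
        "desired_goal": np.array([1.0, 1.0, 1.0, 1.0]),
    }
    # the wrapper concatenates the observation with the desired goal,
    # so the underlying off-policy model only ever sees a flat vector
    flat = np.concatenate([obs["observation"], obs["desired_goal"]])
    assert flat.shape == (8,)
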
--- stable_baselines3/common/policies.py | 6 +++- stable_baselines3/her/her.py | 31 ++++++++++++-------- tests/test_her.py | 42 +++++++++++++++++++++++++++- 3 files changed, 66 insertions(+), 13 deletions(-) diff --git a/stable_baselines3/common/policies.py b/stable_baselines3/common/policies.py index 01b788fea3..efb06a2f5b 100644 --- a/stable_baselines3/common/policies.py +++ b/stable_baselines3/common/policies.py @@ -23,6 +23,7 @@ from stable_baselines3.common.torch_layers import BaseFeaturesExtractor, FlattenExtractor, MlpExtractor, NatureCNN, create_mlp from stable_baselines3.common.utils import get_device, is_vectorized_observation from stable_baselines3.common.vec_env import VecTransposeImage +from stable_baselines3.common.vec_env.dict_obs_wrapper import ObsWrapper class BaseModel(nn.Module, ABC): @@ -227,7 +228,10 @@ def predict( # state = self.initial_state # if mask is None: # mask = [False for _ in range(self.n_envs)] - observation = np.array(observation) + if isinstance(observation, dict): + observation = ObsWrapper.convert_dict(observation) + else: + observation = np.array(observation) # Handle the different cases for images # as PyTorch use channel first format diff --git a/stable_baselines3/her/her.py b/stable_baselines3/her/her.py index 7b0f49561f..c2524fb6c3 100644 --- a/stable_baselines3/her/her.py +++ b/stable_baselines3/her/her.py @@ -14,9 +14,9 @@ from stable_baselines3.common.type_aliases import GymEnv, MaybeCallback, RolloutReturn from stable_baselines3.common.utils import check_for_correct_spaces from stable_baselines3.common.vec_env import VecEnv, VecEnvWrapper +from stable_baselines3.common.vec_env.dict_obs_wrapper import ObsWrapper from stable_baselines3.her.goal_selection_strategy import KEY_TO_GOAL_STRATEGY, GoalSelectionStrategy from stable_baselines3.her.her_replay_buffer import HerReplayBuffer -from stable_baselines3.her.obs_wrapper import ObsWrapper def check_wrapped_env(env: VecEnv) -> VecEnv: @@ -38,8 +38,8 @@ class HER(BaseAlgorithm): """ Hindsight Experience Replay (HER) - :param policy: (BasePolicy) The policy model to use. - :param env: (VecEnv) The environment to learn from. + :param policy: (BasePolicy or str) The policy model to use. + :param env: (GymEnv or str) The environment to learn from (if registered in Gym, can be str) :param model_class: (OffPolicyAlgorithm) Off policy model which will be used with hindsight experience replay. (SAC, TD3) :param n_goals: (int) Number of sampled goals for replay. (offline sampling) :param goal_strategy: (GoalSelectionStrategy or str) Strategy for sampling goals for replay. 
@@ -52,8 +52,8 @@ class HER(BaseAlgorithm): def __init__( self, - policy: Type[BasePolicy], - env: VecEnv, + policy: Union[str, Type[BasePolicy]], + env: Union[GymEnv, str], model_class: Type[OffPolicyAlgorithm], n_goals: int = 5, goal_strategy: Union[GoalSelectionStrategy, str] = "future", @@ -64,10 +64,10 @@ def __init__( **kwargs, ): - # check if wrapper for dict support is needed - self.env = check_wrapped_env(env) + super(HER, self).__init__(policy=BasePolicy, env=env, policy_base=BasePolicy, learning_rate=learning_rate) - super(HER, self).__init__(policy=BasePolicy, env=self.env, policy_base=BasePolicy, learning_rate=learning_rate) + # check if wrapper for dict support is needed + self.env = check_wrapped_env(self.env) # model initialization self.model_class = model_class @@ -111,6 +111,16 @@ def __init__( def _setup_model(self) -> None: self.model._setup_model() + def predict( + self, + observation: np.ndarray, + state: Optional[np.ndarray] = None, + mask: Optional[np.ndarray] = None, + deterministic: bool = False, + ) -> Tuple[np.ndarray, Optional[np.ndarray]]: + + return self.model.predict(observation, state, mask, deterministic) + def learn( self, total_timesteps: int, @@ -124,8 +134,6 @@ def learn( reset_num_timesteps: bool = True, ) -> BaseAlgorithm: - eval_env = check_wrapped_env(eval_env) if eval_env is not None else eval_env - total_timesteps, callback = self._setup_learn( total_timesteps, eval_env, callback, eval_freq, n_eval_episodes, eval_log_path, reset_num_timesteps, tb_log_name ) @@ -224,7 +232,7 @@ def collect_rollouts( self.model._last_obs = self._last_obs action, buffer_action = self._sample_action(learning_starts, action_noise) - # Rescale and perform action + # Perform action new_obs, reward, done, infos = env.step(action) # Only stop training if return value is False, not when it is None. @@ -264,6 +272,7 @@ def collect_rollouts( episode_timesteps += 1 total_steps += 1 self._update_current_progress_remaining(self.num_timesteps, self._total_timesteps) + self.model._current_progress_remaining = self._current_progress_remaining # For DQN, check if the target network should be updated # and update the exploration schedule diff --git a/tests/test_her.py b/tests/test_her.py index 37bad828ab..736d9c65c3 100644 --- a/tests/test_her.py +++ b/tests/test_her.py @@ -5,7 +5,7 @@ import pytest import torch as th -from stable_baselines3 import DDPG, SAC, TD3 +from stable_baselines3 import DDPG, DQN, SAC, TD3 from stable_baselines3.common.bit_flipping_env import BitFlippingEnv from stable_baselines3.common.noise import OrnsteinUhlenbeckActionNoise from stable_baselines3.common.vec_env import DummyVecEnv @@ -203,3 +203,43 @@ def test_save_load(tmp_path, model_class, policy): # clear file from os os.remove(tmp_path / "test_save.zip") + + +@pytest.mark.parametrize("online_sampling", [False]) +@pytest.mark.parametrize("n_bits", [15]) +def test_dqn_her(online_sampling, n_bits): + """ + Test HER with DQN for BitFlippingEnv. 
+ """ + env = BitFlippingEnv(n_bits=n_bits, continuous=False) + + # offline + model = HER( + "MlpPolicy", + env, + DQN, + n_goals=4, + goal_strategy="future", + online_sampling=online_sampling, + her_ratio=0.6, + verbose=1, + tau=1, + batch_size=32, + learning_rate=0.0005, + policy_kwargs=dict(net_arch=[64, 64]), + buffer_size=50000, + gamma=0.99, + gradient_steps=1, + train_freq=1, + n_episodes_rollout=-1, + tensorboard_log="tensorboard", + learning_starts=1000, + exploration_fraction=0.1, + exploration_final_eps=0.02, + exploration_initial_eps=1.0, + target_update_interval=500, + ) + + tb_log_name = "run_" + str(online_sampling) + "_" + str(n_bits) + + model.learn(total_timesteps=20000, callback=None, tb_log_name=tb_log_name) From fc2b18108a90dc736493c6d2adcd6b1d4e1c0f75 Mon Sep 17 00:00:00 2001 From: Antonin Raffin Date: Tue, 11 Aug 2020 17:15:58 +0200 Subject: [PATCH 18/81] Fix potential bug --- stable_baselines3/her/her.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/stable_baselines3/her/her.py b/stable_baselines3/her/her.py index c2524fb6c3..19da1e6604 100644 --- a/stable_baselines3/her/her.py +++ b/stable_baselines3/her/her.py @@ -271,14 +271,13 @@ def collect_rollouts( self.model.num_timesteps = self.num_timesteps episode_timesteps += 1 total_steps += 1 - self._update_current_progress_remaining(self.num_timesteps, self._total_timesteps) - self.model._current_progress_remaining = self._current_progress_remaining + self.model._update_current_progress_remaining(self.num_timesteps, self._total_timesteps) # For DQN, check if the target network should be updated # and update the exploration schedule # For SAC/TD3, the update is done as the same time as the gradient update # see https://github.com/hill-a/stable-baselines/issues/900 - self._on_step() + self.model._on_step() if 0 < n_steps <= total_steps: break From 3f3bd4914cec59834702f93c248a9e7152440e6c Mon Sep 17 00:00:00 2001 From: Megan Klaiber Date: Wed, 19 Aug 2020 03:50:09 +0200 Subject: [PATCH 19/81] Offline and online her share same sample_goal function. --- .../common/vec_env/dict_obs_wrapper.py | 4 +- stable_baselines3/her/her.py | 91 ++++++--------- stable_baselines3/her/her_replay_buffer.py | 105 +++++++++++------- tests/test_her.py | 22 ++-- 4 files changed, 110 insertions(+), 112 deletions(-) diff --git a/stable_baselines3/common/vec_env/dict_obs_wrapper.py b/stable_baselines3/common/vec_env/dict_obs_wrapper.py index 35eb7908dd..55e5283b06 100644 --- a/stable_baselines3/common/vec_env/dict_obs_wrapper.py +++ b/stable_baselines3/common/vec_env/dict_obs_wrapper.py @@ -52,11 +52,11 @@ def step_wait(self): return self.venv.step_wait() @staticmethod - def convert_dict(self, observation: dict) -> np.ndarray: + def convert_dict(observation: dict) -> np.ndarray: """ Concatenate observation and desired goal of observation dict. :param observation: (dict) :return: (np.ndarray) """ - return np.concatenate([observation["observation"], observation["desired_goal"]]) + return np.concatenate([observation["observation"], observation["desired_goal"]], axis=1) diff --git a/stable_baselines3/her/her.py b/stable_baselines3/her/her.py index 19da1e6604..caa2a4308c 100644 --- a/stable_baselines3/her/her.py +++ b/stable_baselines3/her/her.py @@ -41,11 +41,10 @@ class HER(BaseAlgorithm): :param policy: (BasePolicy or str) The policy model to use. 
:param env: (GymEnv or str) The environment to learn from (if registered in Gym, can be str) :param model_class: (OffPolicyAlgorithm) Off policy model which will be used with hindsight experience replay. (SAC, TD3) - :param n_goals: (int) Number of sampled goals for replay. (offline sampling) - :param goal_strategy: (GoalSelectionStrategy or str) Strategy for sampling goals for replay. + :param n_sampled_goal: (int) Number of sampled goals for replay. (offline sampling) + :param goal_selection_strategy: (GoalSelectionStrategy or str) Strategy for sampling goals for replay. One of ['episode', 'final', 'future', 'random'] :param online_sampling: (bool) Sample HER transitions online. - :her_ratio: (float) The ratio between HER replays and regular replays in percent (between 0 and 1, for online sampling) :param learning_rate: (float or callable) learning rate for the optimizer, it can be a function of the current progress remaining (from 1 to 0) """ @@ -55,10 +54,9 @@ def __init__( policy: Union[str, Type[BasePolicy]], env: Union[GymEnv, str], model_class: Type[OffPolicyAlgorithm], - n_goals: int = 5, - goal_strategy: Union[GoalSelectionStrategy, str] = "future", + n_sampled_goal: int = 5, + goal_selection_strategy: Union[GoalSelectionStrategy, str] = "future", online_sampling: bool = False, - her_ratio: float = 0.6, learning_rate: Union[float, Callable] = 3e-4, *args, **kwargs, @@ -78,25 +76,29 @@ def __init__( self.verbose = self.model.verbose self.tensorboard_log = self.model.tensorboard_log - # convert goal_strategy into GoalSelectionStrategy if string - if isinstance(goal_strategy, str): - self.goal_strategy = KEY_TO_GOAL_STRATEGY[goal_strategy.lower()] + # convert goal_selection_strategy into GoalSelectionStrategy if string + if isinstance(goal_selection_strategy, str): + self.goal_selection_strategy = KEY_TO_GOAL_STRATEGY[goal_selection_strategy.lower()] else: - self.goal_strategy = goal_strategy + self.goal_selection_strategy = goal_selection_strategy - # check if goal_strategy is valid + # check if goal_selection_strategy is valid assert isinstance( - self.goal_strategy, GoalSelectionStrategy + self.goal_selection_strategy, GoalSelectionStrategy ), f"Invalid goal selection strategy, please use one of {list(GoalSelectionStrategy)}" + # storage for transitions of current episode + self._episode_storage = [] + self.n_sampled_goal = n_sampled_goal + # if we sample her transitions online use custom replay buffer self.online_sampling = online_sampling - self.her_ratio = her_ratio + self.her_ratio = 1 - (1.0 / (self.n_sampled_goal + 1)) if self.online_sampling: self.model.replay_buffer = HerReplayBuffer( self.env, self.buffer_size, - self.goal_strategy, + self.goal_selection_strategy, self.env.observation_space, self.env.action_space, self.device, @@ -104,10 +106,6 @@ def __init__( self.her_ratio, ) - # storage for transitions of current episode - self._episode_storage = [] - self.n_goals = n_goals - def _setup_model(self) -> None: self.model._setup_model() @@ -222,7 +220,7 @@ def collect_rollouts( while not done: # concatenate observation and (desired) goal observation = self._last_obs - self._last_obs = np.concatenate([observation["observation"], observation["desired_goal"]], axis=1) + self._last_obs = ObsWrapper.convert_dict(observation) if self.use_sde and self.sde_sample_freq > 0 and total_steps % self.sde_sample_freq == 0: # Sample a new noise matrix @@ -313,38 +311,6 @@ def collect_rollouts( return RolloutReturn(mean_reward, total_steps, total_episodes, continue_training) - def 
sample_goals(self, sample_idx: int, obs_dim: int) -> Union[np.ndarray, None]: - """ - Sample a goal based on goal_strategy. - - :param sample_idx: (int) Index of current transition. - :param obs_dim: (int) Dimension of real observation without goal. It is needed for the random strategy. - :return: (np.ndarray or None) Return sampled goal. - """ - if self.goal_strategy == GoalSelectionStrategy.FINAL: - # replay with final state of current episode - return self._episode_storage[-1][0]["achieved_goal"] - elif self.goal_strategy == GoalSelectionStrategy.FUTURE: - # replay with random state which comes from the same episode and was observed after current transition - # we have no transition after last transition of episode - - if (sample_idx + 1) < len(self._episode_storage): - index = np.random.choice(np.arange(sample_idx + 1, len(self._episode_storage))) - return self._episode_storage[index][0]["achieved_goal"] - elif self.goal_strategy == GoalSelectionStrategy.EPISODE: - # replay with random state which comes from the same episode as current transition - index = np.random.choice(np.arange(len(self._episode_storage))) - return self._episode_storage[index][0]["achieved_goal"] - elif self.goal_strategy == GoalSelectionStrategy.RANDOM: - # replay with random state from the entire replay buffer - index = np.random.choice(np.arange(self.replay_buffer.size())) - obs = self.replay_buffer.observations[index] - # get only the observation part - obs_array = obs[:, :obs_dim] - return obs_array - else: - raise ValueError("Strategy for sampling goals not supported!") - def _store_transitions(self) -> None: """ Store current episode in replay buffer. Sample additional goals and store new transitions in replay buffer. @@ -356,8 +322,8 @@ def _store_transitions(self) -> None: observation, action, reward, new_observation, done = trans # concatenate observation with (desired) goal - obs = np.concatenate([observation["observation"], observation["desired_goal"]], axis=1) - new_obs = np.concatenate([new_observation["observation"], new_observation["desired_goal"]], axis=1) + obs = ObsWrapper.convert_dict(observation) + new_obs = ObsWrapper.convert_dict(new_observation) # store data in replay buffer self.replay_buffer.add(obs, new_obs, action, reward, done) @@ -365,7 +331,14 @@ def _store_transitions(self) -> None: # sample set of additional goals obs_dim = observation["observation"].shape[1] sampled_goals = [ - sample for sample in (self.sample_goals(idx, obs_dim) for i in range(self.n_goals)) if sample is not None + sample + for sample in ( + HerReplayBuffer.sample_goal( + self.goal_selection_strategy, idx, self._episode_storage, self.replay_buffer.observations, obs_dim + ) + for i in range(self.n_sampled_goal) + ) + if sample is not None ] # iterate over sampled goals and store new transitions in replay buffer @@ -407,9 +380,8 @@ def save( """ # add HER parameters to model - self.model.n_goals = self.n_goals - self.model.her_ratio = self.her_ratio - self.model.goal_strategy = self.goal_strategy + self.model.n_sampled_goal = self.n_sampled_goal + self.model.goal_selection_strategy = self.goal_selection_strategy self.model.online_sampling = self.online_sampling self.model.model_class = self.model_class @@ -454,10 +426,9 @@ def load(cls, load_path: str, env: Optional[GymEnv] = None, **kwargs) -> "BaseAl policy=data["policy_class"], env=env, model_class=data["model_class"], - n_goals=data["n_goals"], - goal_strategy=data["goal_strategy"], + n_sampled_goal=data["n_sampled_goal"], + 
goal_selection_strategy=data["goal_selection_strategy"], online_sampling=data["online_sampling"], - her_ratio=data["her_ratio"], learning_rate=data["learning_rate"], policy_kwargs=data["policy_kwargs"], _init_setup_model=True, # pytype: disable=not-instantiable,wrong-keyword-args diff --git a/stable_baselines3/her/her_replay_buffer.py b/stable_baselines3/her/her_replay_buffer.py index 89d6d75f62..a21dab9dec 100644 --- a/stable_baselines3/her/her_replay_buffer.py +++ b/stable_baselines3/her/her_replay_buffer.py @@ -7,6 +7,7 @@ from stable_baselines3.common.buffers import BaseBuffer from stable_baselines3.common.type_aliases import ReplayBufferSamples from stable_baselines3.common.vec_env import VecEnv, VecNormalize +from stable_baselines3.common.vec_env.dict_obs_wrapper import ObsWrapper from stable_baselines3.her.goal_selection_strategy import GoalSelectionStrategy @@ -16,7 +17,7 @@ class HerReplayBuffer(BaseBuffer): :param env: (VecEnv) The training environment :param buffer_size: (int) The size of the buffer measured in transitions. - :param goal_strategy: (GoalSelectionStrategy ) Strategy for sampling goals for replay. + :param goal_selection_strategy: (GoalSelectionStrategy ) Strategy for sampling goals for replay. One of ['episode', 'final', 'future', 'random'] :param observation_space: (spaces.Space) Observation space :param action_space: (spaces.Space) Action space @@ -31,7 +32,7 @@ def __init__( self, env: VecEnv, buffer_size: int, - goal_strategy: GoalSelectionStrategy, + goal_selection_strategy: GoalSelectionStrategy, observation_space: spaces.Space, action_space: spaces.Space, device: Union[th.device, str] = "cpu", @@ -46,7 +47,7 @@ def __init__( # buffer with episodes self.buffer = [] - self.goal_strategy = goal_strategy + self.goal_selection_strategy = goal_selection_strategy # percentage of her indices self.her_ratio = her_ratio @@ -73,44 +74,22 @@ def _sample_transitions(self, batch_size: int) -> ReplayBufferSamples: buffer = np.array(self.buffer, dtype=object) # get episode lengths for selecting timesteps episode_lengths = np.array([len(ep) for ep in buffer[episode_idxs]]) - # select timesteps + # select timesteps of episodes t_samples = np.array([np.random.choice(np.arange(ep_len)) for ep_len in episode_lengths]) # get selected timesteps transitions = np.array([buffer[ep][trans] for ep, trans in zip(episode_idxs, t_samples)], dtype=object) # get her samples indices with her_ratio her_idxs = np.random.choice(np.arange(batch_size), int(self.her_ratio * batch_size), replace=False) - # her samples episode lengths - her_episode_lenghts = episode_lengths[her_idxs] + + # if we sample goals from future delete indices from her_idxs where we have no transition after current one + if self.goal_selection_strategy == GoalSelectionStrategy.FUTURE: + her_idxs = her_idxs[t_samples[her_idxs] != episode_lengths[her_idxs] - 1] # get new goals with goal selection strategy - if self.goal_strategy == GoalSelectionStrategy.FINAL: - # replay with final state of current episode - last_transitions = [episode[-1][0] for episode in buffer[episode_idxs[her_idxs]]] - her_new_goals = [trans["achieved_goal"] for trans in last_transitions] - elif self.goal_strategy == GoalSelectionStrategy.FUTURE: - # replay with random state which comes from the same episode and was observed after current transition - her_new_goals = [] - for idx, length in zip(her_idxs, her_episode_lenghts): - # we have no transition after last transition of episode - if t_samples[idx] + 1 < length: - index = 
np.random.choice(np.arange(t_samples[idx] + 1, length)) - her_new_goals.append(buffer[episode_idxs[idx]][index][0]["achieved_goal"]) - else: - # delete index from her indices where we have no transition after current one - her_idxs = her_idxs[her_idxs != idx] - elif self.goal_strategy == GoalSelectionStrategy.EPISODE: - # replay with random state which comes from the same episode as current transition - index = np.array([np.random.choice(np.arange(ep_len)) for ep_len in her_episode_lenghts]) - episode_transitions = [buffer[episode_idxs[her_idx]][idx][0] for idx, her_idx in zip(index, her_idxs)] - her_new_goals = [trans["achieved_goal"] for trans in episode_transitions] - elif self.goal_strategy == GoalSelectionStrategy.RANDOM: - # replay with random state from the entire replay buffer - ep_idx = np.random.randint(0, self.n_episodes_stored, len(her_idxs)) - state_idx = np.array([np.random.choice(np.arange(len(ep))) for ep in buffer[ep_idx]]) - random_transitions = [episode[state][0] for episode, state in zip(buffer[ep_idx], state_idx)] - her_new_goals = [trans["achieved_goal"] for trans in random_transitions] - else: - raise ValueError("Strategy for sampling goals not supported!") + her_new_goals = [ + self.sample_goal(self.goal_selection_strategy, trans_idx, episode, self.buffer, online_sampling=True) + for episode, trans_idx in zip(buffer[episode_idxs[her_idxs]], t_samples[her_idxs]) + ] # assign new goals as desired_goals for idx, goal in enumerate(her_new_goals): @@ -122,15 +101,13 @@ def _sample_transitions(self, batch_size: int) -> ReplayBufferSamples: achieved_goals = [new_obs["achieved_goal"] for new_obs in np.array(next_observations)[her_idxs]] new_rewards = np.array(rewards) new_rewards[her_idxs] = [ - self.env.env_method("compute_reward", achieved_goal, her_new_goals, None) + self.env.env_method("compute_reward", achieved_goal, new_goal, None) for achieved_goal, new_goal in zip(achieved_goals, her_new_goals) ] # concatenate observation with (desired) goal - obs = [np.concatenate([obs_["observation"], obs_["desired_goal"]], axis=1) for obs_ in observations] - new_obs = [ - np.concatenate([new_obs_["observation"], new_obs_["desired_goal"]], axis=1) for new_obs_ in next_observations - ] + obs = [ObsWrapper.convert_dict(obs_) for obs_ in observations] + new_obs = [ObsWrapper.convert_dict(new_obs_) for new_obs_ in next_observations] data = ( np.array(obs)[:, 0, :], @@ -142,6 +119,56 @@ def _sample_transitions(self, batch_size: int) -> ReplayBufferSamples: return ReplayBufferSamples(*tuple(map(self.to_torch, data))) + @staticmethod + def sample_goal( + goal_selection_strategy: GoalSelectionStrategy, + sample_idx: int, + episode: list, + observations: Union[list, np.ndarray], + obs_dim: int = None, + online_sampling: bool = False, + ) -> Union[np.ndarray, None]: + """ + Sample a goal based on goal_selection_strategy. + + :param goal_selection_strategy: (GoalSelectionStrategy ) Strategy for sampling goals for replay. + One of ['episode', 'final', 'future', 'random'] + :param sample_idx: (int) Index of current transition. + :param episode: (list) Current episode. + :param observations: (list or np.ndarray) + :param obs_dim: (int) Dimension of real observation without goal. It is needed for the random strategy. + :param online_sampling: (bool) Sample HER transitions online. + :return: (np.ndarray or None) Return sampled goal. 
+ """ + if goal_selection_strategy == GoalSelectionStrategy.FINAL: + # replay with final state of current episode + return episode[-1][0]["achieved_goal"] + elif goal_selection_strategy == GoalSelectionStrategy.FUTURE: + # replay with random state which comes from the same episode and was observed after current transition + # we have no transition after last transition of episode + if (sample_idx + 1) < len(episode): + index = np.random.choice(np.arange(sample_idx + 1, len(episode))) + return episode[index][0]["achieved_goal"] + elif goal_selection_strategy == GoalSelectionStrategy.EPISODE: + # replay with random state which comes from the same episode as current transition + index = np.random.choice(np.arange(len(episode))) + return episode[index][0]["achieved_goal"] + elif goal_selection_strategy == GoalSelectionStrategy.RANDOM: + if online_sampling: + # replay with random state from the entire replay buffer + ep_idx = np.random.choice(np.arange(len(observations))) + trans_idx = np.random.choice(np.arange(len(observations[ep_idx]))) + return observations[ep_idx][trans_idx][0]["achieved_goal"] + else: + # replay with random state from the entire replay buffer + index = np.random.choice(np.arange(len(observations))) + obs = observations[index] + # get only the observation part + obs_array = obs[:, :obs_dim] + return obs_array + else: + raise ValueError("Strategy for sampling goals not supported!") + def add(self, obs: np.ndarray, next_obs: np.ndarray, action: np.ndarray, reward: np.ndarray, done: np.ndarray) -> None: """ Add episode to replay buffer diff --git a/tests/test_her.py b/tests/test_her.py index 736d9c65c3..80dec3a82b 100644 --- a/tests/test_her.py +++ b/tests/test_her.py @@ -9,6 +9,7 @@ from stable_baselines3.common.bit_flipping_env import BitFlippingEnv from stable_baselines3.common.noise import OrnsteinUhlenbeckActionNoise from stable_baselines3.common.vec_env import DummyVecEnv +from stable_baselines3.common.vec_env.dict_obs_wrapper import ObsWrapper from stable_baselines3.her.her import HER, GoalSelectionStrategy from stable_baselines3.sac.policies import SACPolicy from stable_baselines3.td3.policies import MlpPolicy, TD3Policy @@ -32,8 +33,8 @@ def test_her(model_class, policy, online_sampling): policy, env, model_class, - n_goals=5, - goal_strategy="future", + n_sampled_goal=5, + goal_selection_strategy="future", online_sampling=online_sampling, action_noise=action_noise, verbose=0, @@ -85,7 +86,7 @@ def test_her(model_class, policy, online_sampling): @pytest.mark.parametrize( - "goal_strategy", + "goal_selection_strategy", [ "final", "episode", @@ -98,7 +99,7 @@ def test_her(model_class, policy, online_sampling): ], ) @pytest.mark.parametrize("online_sampling", [True, False]) -def test_goal_strategy(goal_strategy, online_sampling): +def test_goal_selection_strategy(goal_selection_strategy, online_sampling): """ Test different goal strategies. 
""" @@ -109,7 +110,7 @@ def test_goal_strategy(goal_strategy, online_sampling): SACPolicy, env, SAC, - goal_strategy=goal_strategy, + goal_selection_strategy=goal_selection_strategy, online_sampling=online_sampling, gradient_steps=1, train_freq=1, @@ -135,8 +136,8 @@ def test_save_load(tmp_path, model_class, policy): policy, env, model_class, - n_goals=5, - goal_strategy="future", + n_sampled_goal=5, + goal_selection_strategy="future", online_sampling=True, action_noise=action_noise, verbose=0, @@ -158,7 +159,7 @@ def test_save_load(tmp_path, model_class, policy): observations_list = [] for _ in range(10): obs = env.step([env.action_space.sample()])[0] - observation = np.concatenate([obs["observation"], obs["desired_goal"]], axis=1) + observation = ObsWrapper.convert_dict(obs) observations_list.append(observation) observations = np.concatenate(observations_list, axis=0) @@ -218,10 +219,9 @@ def test_dqn_her(online_sampling, n_bits): "MlpPolicy", env, DQN, - n_goals=4, - goal_strategy="future", + n_sampled_goal=4, + goal_selection_strategy="future", online_sampling=online_sampling, - her_ratio=0.6, verbose=1, tau=1, batch_size=32, From cce063fc92052bd83f6c6ec94325a61ce8b05f40 Mon Sep 17 00:00:00 2001 From: Megan Klaiber Date: Mon, 24 Aug 2020 10:57:38 +0200 Subject: [PATCH 20/81] Changed lists into arrays. --- stable_baselines3/her/her.py | 22 ++- stable_baselines3/her/her_replay_buffer.py | 164 +++++++++++++++------ 2 files changed, 138 insertions(+), 48 deletions(-) diff --git a/stable_baselines3/her/her.py b/stable_baselines3/her/her.py index caa2a4308c..3902ed6ad9 100644 --- a/stable_baselines3/her/her.py +++ b/stable_baselines3/her/her.py @@ -47,6 +47,7 @@ class HER(BaseAlgorithm): :param online_sampling: (bool) Sample HER transitions online. :param learning_rate: (float or callable) learning rate for the optimizer, it can be a function of the current progress remaining (from 1 to 0) + :param max_episode_length: (int) The length of an episode. 
(time horizon) """ def __init__( @@ -58,6 +59,7 @@ def __init__( goal_selection_strategy: Union[GoalSelectionStrategy, str] = "future", online_sampling: bool = False, learning_rate: Union[float, Callable] = 3e-4, + max_episode_length: int = 10, *args, **kwargs, ): @@ -94,10 +96,14 @@ def __init__( # if we sample her transitions online use custom replay buffer self.online_sampling = online_sampling self.her_ratio = 1 - (1.0 / (self.n_sampled_goal + 1)) + self.max_episode_length = max_episode_length + # counter for steps in episode + self.episode_steps = 0 if self.online_sampling: self.model.replay_buffer = HerReplayBuffer( self.env, self.buffer_size, + self.max_episode_length, self.goal_selection_strategy, self.env.observation_space, self.env.action_space, @@ -161,7 +167,7 @@ def learn( if rollout.continue_training is False: break - if self.num_timesteps > 0 and self.num_timesteps > self.learning_starts: + if self.num_timesteps > 0 and self.num_timesteps > self.learning_starts and self.replay_buffer.size() > 0: # If no `gradient_steps` is specified, # do as many gradients steps as steps performed during the rollout gradient_steps = self.gradient_steps if self.gradient_steps > 0 else rollout.episode_timesteps @@ -277,15 +283,15 @@ def collect_rollouts( # see https://github.com/hill-a/stable-baselines/issues/900 self.model._on_step() + self.episode_steps += 1 + if 0 < n_steps <= total_steps: break - if done: + if done or self.episode_steps >= self.max_episode_length: if self.online_sampling: observations, actions, rewards, next_observations, done = zip(*self._episode_storage) self.replay_buffer.add(observations, next_observations, actions, rewards, done) - # self.replay_buffer.add(self._episode_storage) - else: # store episode in replay buffer self._store_transitions() @@ -305,6 +311,10 @@ def collect_rollouts( if log_interval is not None and self._episode_num % log_interval == 0: self._dump_logs() + # reset if done or episode length is reached + self.env.reset() + self.episode_steps = 0 + mean_reward = np.mean(episode_rewards) if total_episodes > 0 else 0.0 callback.on_rollout_end() @@ -341,7 +351,7 @@ def _store_transitions(self) -> None: if sample is not None ] - # iterate over sampled goals and store new transitions in replay buffer + # iterate over sampled new transitions in replay buffer for goal in sampled_goals: # compute new reward with new goal new_reward = self.env.env_method("compute_reward", new_observation["achieved_goal"], goal, None) @@ -384,6 +394,7 @@ def save( self.model.goal_selection_strategy = self.goal_selection_strategy self.model.online_sampling = self.online_sampling self.model.model_class = self.model_class + self.model.max_episode_length = self.max_episode_length self.model.save(path, exclude, include) @@ -430,6 +441,7 @@ def load(cls, load_path: str, env: Optional[GymEnv] = None, **kwargs) -> "BaseAl goal_selection_strategy=data["goal_selection_strategy"], online_sampling=data["online_sampling"], learning_rate=data["learning_rate"], + max_episode_length=data["max_episode_length"], policy_kwargs=data["policy_kwargs"], _init_setup_model=True, # pytype: disable=not-instantiable,wrong-keyword-args ) diff --git a/stable_baselines3/her/her_replay_buffer.py b/stable_baselines3/her/her_replay_buffer.py index a21dab9dec..5508c59c36 100644 --- a/stable_baselines3/her/her_replay_buffer.py +++ b/stable_baselines3/her/her_replay_buffer.py @@ -1,8 +1,9 @@ -from typing import Optional, Union +from typing import Optional, Type, Union import numpy as np import 
torch as th from gym import spaces +from gym.spaces import Discrete from stable_baselines3.common.buffers import BaseBuffer from stable_baselines3.common.type_aliases import ReplayBufferSamples @@ -17,6 +18,7 @@ class HerReplayBuffer(BaseBuffer): :param env: (VecEnv) The training environment :param buffer_size: (int) The size of the buffer measured in transitions. + :param max_episode_length: (int) The length of an episode. (time horizon) :param goal_selection_strategy: (GoalSelectionStrategy ) Strategy for sampling goals for replay. One of ['episode', 'final', 'future', 'random'] :param observation_space: (spaces.Space) Observation space @@ -25,13 +27,13 @@ class HerReplayBuffer(BaseBuffer): to which the values will be converted :param n_envs: (int) Number of parallel environments :her_ratio: (float) The ratio between HER replays and regular replays in percent (between 0 and 1, for online sampling) - """ def __init__( self, env: VecEnv, buffer_size: int, + max_episode_length: int, goal_selection_strategy: GoalSelectionStrategy, observation_space: spaces.Space, action_space: spaces.Space, @@ -44,9 +46,29 @@ def __init__( self.env = env self.buffer_size = buffer_size + self.max_episode_length = max_episode_length # buffer with episodes - self.buffer = [] + # number of episodes which can be stored until buffer size is reached + n_episodes = self.buffer_size // self.max_episode_length + # input dimensions for buffer initialization + input_shape = { + "observation": (self.env.num_envs, self.env.obs_dim), + "achieved_goal": (self.env.num_envs, self.env.goal_dim), + "desired_goal": (self.env.num_envs, self.env.goal_dim), + "action": (self.action_dim,), + "reward": (1,), + "next_obs": (self.env.num_envs, self.env.obs_dim), + "next_achieved_goal": (self.env.num_envs, self.env.goal_dim), + "next_desired_goal": (self.env.num_envs, self.env.goal_dim), + "done": (1,), + } + self.buffer = { + key: np.empty([n_episodes, self.max_episode_length, *dim], dtype=np.float32) for key, dim in input_shape.items() + } + # episode length storage, needed for episodes which has less steps than the maximum length + self.episode_lengths = np.empty(n_episodes) + self.goal_selection_strategy = goal_selection_strategy # percentage of her indices self.her_ratio = her_ratio @@ -62,59 +84,67 @@ def sample(self, batch_size: int, env: Optional[VecNormalize] = None) -> ReplayB to normalize the observations/rewards when sampling :return: (ReplayBufferSamples) """ - return self._sample_transitions(batch_size) + return self._sample_transitions(batch_size, env) - def _sample_transitions(self, batch_size: int) -> ReplayBufferSamples: + def _sample_transitions(self, batch_size: int, env: Optional[VecNormalize]) -> ReplayBufferSamples: """ :param batch_size: (int) Number of element to sample + :param env: (Optional[VecNormalize]) associated gym VecEnv + to normalize the observations/rewards when sampling :return: (ReplayBufferSamples) """ - # Select which episodes and time steps to use. 
+ # Select which episodes to use episode_idxs = np.random.randint(0, self.n_episodes_stored, batch_size) - buffer = np.array(self.buffer, dtype=object) - # get episode lengths for selecting timesteps - episode_lengths = np.array([len(ep) for ep in buffer[episode_idxs]]) # select timesteps of episodes - t_samples = np.array([np.random.choice(np.arange(ep_len)) for ep_len in episode_lengths]) + max_timestep_idx = self.episode_lengths[episode_idxs] + # transition_idxs = np.random.randint(self.max_episode_length, size=batch_size) + transition_idxs = np.random.randint(max_timestep_idx) # get selected timesteps - transitions = np.array([buffer[ep][trans] for ep, trans in zip(episode_idxs, t_samples)], dtype=object) + transitions = {key: self.buffer[key][episode_idxs, transition_idxs].copy() for key in self.buffer.keys()} # get her samples indices with her_ratio her_idxs = np.random.choice(np.arange(batch_size), int(self.her_ratio * batch_size), replace=False) # if we sample goals from future delete indices from her_idxs where we have no transition after current one if self.goal_selection_strategy == GoalSelectionStrategy.FUTURE: - her_idxs = her_idxs[t_samples[her_idxs] != episode_lengths[her_idxs] - 1] + her_idxs = her_idxs[transition_idxs[her_idxs] != max_timestep_idx[her_idxs] - 1] # get new goals with goal selection strategy her_new_goals = [ - self.sample_goal(self.goal_selection_strategy, trans_idx, episode, self.buffer, online_sampling=True) - for episode, trans_idx in zip(buffer[episode_idxs[her_idxs]], t_samples[her_idxs]) + self.sample_goal(self.goal_selection_strategy, trans, episode, self.buffer["achieved_goal"], online_sampling=True) + for episode, trans in zip(self.buffer["achieved_goal"][episode_idxs[her_idxs]], transition_idxs[her_idxs]) ] # assign new goals as desired_goals for idx, goal in enumerate(her_new_goals): - transitions[her_idxs][:, 0][idx]["desired_goal"] = goal - - observations, actions, rewards, next_observations, dones = list(zip(*transitions)) + # observation + transitions["desired_goal"][her_idxs][idx] = goal + # next observation + transitions["next_desired_goal"][her_idxs][idx] = goal # compute new rewards with new goal - achieved_goals = [new_obs["achieved_goal"] for new_obs in np.array(next_observations)[her_idxs]] - new_rewards = np.array(rewards) + achieved_goals = transitions["next_achieved_goal"][her_idxs] + new_rewards = transitions["reward"].copy() new_rewards[her_idxs] = [ self.env.env_method("compute_reward", achieved_goal, new_goal, None) for achieved_goal, new_goal in zip(achieved_goals, her_new_goals) ] # concatenate observation with (desired) goal - obs = [ObsWrapper.convert_dict(obs_) for obs_ in observations] - new_obs = [ObsWrapper.convert_dict(new_obs_) for new_obs_ in next_observations] + obs = [ + np.concatenate([obs, desired_goal], axis=1) + for obs, desired_goal in zip(transitions["observation"], transitions["desired_goal"]) + ] + next_obs = [ + np.concatenate([obs, desired_goal], axis=1) + for obs, desired_goal in zip(transitions["next_obs"], transitions["next_desired_goal"]) + ] data = ( - np.array(obs)[:, 0, :], - np.array(actions, dtype=self.action_space.dtype)[:, 0, :], - np.array(new_obs)[:, 0, :], - np.array(dones, dtype=np.int8), - new_rewards, + self._normalize_obs(np.asarray(obs, dtype=np.int8), env), + transitions["action"], + self._normalize_obs(np.asarray(next_obs, dtype=np.int8), env), + transitions["done"], + self._normalize_obs(new_rewards, env), ) return ReplayBufferSamples(*tuple(map(self.to_torch, data))) @@ -142,23 
+172,29 @@ def sample_goal( """ if goal_selection_strategy == GoalSelectionStrategy.FINAL: # replay with final state of current episode + if online_sampling: + return episode[-1] return episode[-1][0]["achieved_goal"] elif goal_selection_strategy == GoalSelectionStrategy.FUTURE: # replay with random state which comes from the same episode and was observed after current transition # we have no transition after last transition of episode if (sample_idx + 1) < len(episode): index = np.random.choice(np.arange(sample_idx + 1, len(episode))) + if online_sampling: + return episode[index] return episode[index][0]["achieved_goal"] elif goal_selection_strategy == GoalSelectionStrategy.EPISODE: # replay with random state which comes from the same episode as current transition index = np.random.choice(np.arange(len(episode))) + if online_sampling: + return episode[index] return episode[index][0]["achieved_goal"] elif goal_selection_strategy == GoalSelectionStrategy.RANDOM: if online_sampling: # replay with random state from the entire replay buffer ep_idx = np.random.choice(np.arange(len(observations))) trans_idx = np.random.choice(np.arange(len(observations[ep_idx]))) - return observations[ep_idx][trans_idx][0]["achieved_goal"] + return observations[ep_idx][trans_idx] else: # replay with random state from the entire replay buffer index = np.random.choice(np.arange(len(observations))) @@ -173,21 +209,21 @@ def add(self, obs: np.ndarray, next_obs: np.ndarray, action: np.ndarray, reward: """ Add episode to replay buffer - :param obs: - :param next_obs: - :param action: - :param reward: - :param done: - - :param episode: (list) Episode to store. + :param obs: (np.ndarray) Observation. + :param next_obs: (np.ndarray) Next observation. + :param action: (np.ndarray) Action. + :param reward: (np.ndarray) Reward. + :param done: (np.ndarray) Done. """ - episode = list(zip(obs, action, reward, next_obs, done)) - - episode_length = len(episode) + episode_length = len(action) + episode = self._get_episode_dict(obs, next_obs, action, reward, done) # check if replay buffer has enough space for all transitions of episode if self.n_transitions_stored + episode_length <= self.buffer_size: - self.buffer.append(episode) + for key in self.buffer.keys(): + self.buffer[key][self._n_episodes_stored][:episode_length] = episode[key] + # add episode length to length storage + self.episode_lengths[self._n_episodes_stored] = episode_length # update replay size self.n_episodes_stored += 1 self.n_transitions_stored += episode_length @@ -195,15 +231,57 @@ def add(self, obs: np.ndarray, next_obs: np.ndarray, action: np.ndarray, reward: # if replay buffer is full take random stored episode and replace it idx = np.random.randint(0, self.n_episodes_stored) - if len(self.buffer[idx]) == episode_length: - self.buffer[idx] = episode - elif len(self.buffer[idx]) > episode_length: - self.buffer[idx] = episode - self.n_transitions_stored -= len(self.buffer[idx]) - episode_length + for key in self.buffer.keys(): + self.buffer[key][idx][:episode_length] = episode[key] + # add episode length to length storage + self.episode_lengths[idx] = episode_length if self.n_transitions_stored == self.buffer_size: self.full = True + def _get_episode_dict(self, obs, next_obs, action, reward, done) -> dict: + """ + Convert episode to dictionary. + + :param obs: (np.ndarray) Observation. + :param next_obs: (np.ndarray) Next observation. + :param action: (np.ndarray) Action. + :param reward: (np.ndarray) Reward. + :param done: (np.ndarray) Done. 
+ """ + + observations = [] + achieved_goals = [] + desired_goals = [] + + for obs_ in obs: + observations.append(obs_["observation"]) + achieved_goals.append(obs_["achieved_goal"]) + desired_goals.append(obs_["desired_goal"]) + + next_observations = [] + next_achieved_goals = [] + next_desired_goals = [] + + for next_obs_ in next_obs: + next_observations.append(next_obs_["observation"]) + next_achieved_goals.append(next_obs_["achieved_goal"]) + next_desired_goals.append(next_obs_["desired_goal"]) + + episode = { + "observation": np.array(observations), + "achieved_goal": np.array(achieved_goals), + "desired_goal": np.array(desired_goals), + "action": action, + "reward": reward, + "next_obs": np.array(next_observations), + "next_achieved_goal": np.array(next_achieved_goals), + "next_desired_goal": np.array(next_desired_goals), + "done": done, + } + + return episode + @property def n_episodes_stored(self): return self._n_episodes_stored From 0c0d742f4836af9db6142ab6ef3b95e1a136834b Mon Sep 17 00:00:00 2001 From: Megan Klaiber Date: Mon, 24 Aug 2020 11:29:08 +0200 Subject: [PATCH 21/81] Updated her test. --- tests/test_her.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/test_her.py b/tests/test_her.py index 80dec3a82b..b39c54b241 100644 --- a/tests/test_her.py +++ b/tests/test_her.py @@ -206,7 +206,7 @@ def test_save_load(tmp_path, model_class, policy): os.remove(tmp_path / "test_save.zip") -@pytest.mark.parametrize("online_sampling", [False]) +@pytest.mark.parametrize("online_sampling", [False, True]) @pytest.mark.parametrize("n_bits", [15]) def test_dqn_her(online_sampling, n_bits): """ @@ -226,6 +226,7 @@ def test_dqn_her(online_sampling, n_bits): tau=1, batch_size=32, learning_rate=0.0005, + max_episode_length=n_bits, policy_kwargs=dict(net_arch=[64, 64]), buffer_size=50000, gamma=0.99, From bbf9d6dac3ebeebe67f0d196f78039468f7692f0 Mon Sep 17 00:00:00 2001 From: Antonin RAFFIN Date: Mon, 24 Aug 2020 13:17:36 +0200 Subject: [PATCH 22/81] Fix online sampling --- stable_baselines3/her/her.py | 6 +- stable_baselines3/her/her_replay_buffer.py | 161 +++++++++------------ 2 files changed, 71 insertions(+), 96 deletions(-) diff --git a/stable_baselines3/her/her.py b/stable_baselines3/her/her.py index 3902ed6ad9..a181ddad73 100644 --- a/stable_baselines3/her/her.py +++ b/stable_baselines3/her/her.py @@ -59,7 +59,7 @@ def __init__( goal_selection_strategy: Union[GoalSelectionStrategy, str] = "future", online_sampling: bool = False, learning_rate: Union[float, Callable] = 3e-4, - max_episode_length: int = 10, + max_episode_length: int = 1000, *args, **kwargs, ): @@ -109,7 +109,7 @@ def __init__( self.env.action_space, self.device, self.n_envs, - self.her_ratio, + self.her_ratio, # pytype: disable=wrong-arg-types ) def _setup_model(self) -> None: @@ -346,7 +346,7 @@ def _store_transitions(self) -> None: HerReplayBuffer.sample_goal( self.goal_selection_strategy, idx, self._episode_storage, self.replay_buffer.observations, obs_dim ) - for i in range(self.n_sampled_goal) + for _ in range(self.n_sampled_goal) ) if sample is not None ] diff --git a/stable_baselines3/her/her_replay_buffer.py b/stable_baselines3/her/her_replay_buffer.py index 5508c59c36..70ce5eee39 100644 --- a/stable_baselines3/her/her_replay_buffer.py +++ b/stable_baselines3/her/her_replay_buffer.py @@ -3,7 +3,6 @@ import numpy as np import torch as th from gym import spaces -from gym.spaces import Discrete from stable_baselines3.common.buffers import BaseBuffer from stable_baselines3.common.type_aliases 
import ReplayBufferSamples @@ -31,7 +30,7 @@ class HerReplayBuffer(BaseBuffer): def __init__( self, - env: VecEnv, + env: ObsWrapper, buffer_size: int, max_episode_length: int, goal_selection_strategy: GoalSelectionStrategy, @@ -51,6 +50,8 @@ def __init__( # buffer with episodes # number of episodes which can be stored until buffer size is reached n_episodes = self.buffer_size // self.max_episode_length + self.n_episodes = n_episodes + # input dimensions for buffer initialization input_shape = { "observation": (self.env.num_envs, self.env.obs_dim), @@ -64,19 +65,15 @@ def __init__( "done": (1,), } self.buffer = { - key: np.empty([n_episodes, self.max_episode_length, *dim], dtype=np.float32) for key, dim in input_shape.items() + key: np.empty((n_episodes, self.max_episode_length, *dim), dtype=np.float32) for key, dim in input_shape.items() } # episode length storage, needed for episodes which has less steps than the maximum length - self.episode_lengths = np.empty(n_episodes) + self.episode_lengths = np.empty(n_episodes, dtype=np.uint64) self.goal_selection_strategy = goal_selection_strategy # percentage of her indices self.her_ratio = her_ratio - # memory management - self._n_episodes_stored = 0 - self._n_transitions_stored = 0 - def sample(self, batch_size: int, env: Optional[VecNormalize] = None) -> ReplayBufferSamples: """ :param batch_size: (int) Number of element to sample @@ -94,57 +91,57 @@ def _sample_transitions(self, batch_size: int, env: Optional[VecNormalize]) -> R :return: (ReplayBufferSamples) """ # Select which episodes to use - episode_idxs = np.random.randint(0, self.n_episodes_stored, batch_size) - # select timesteps of episodes - max_timestep_idx = self.episode_lengths[episode_idxs] - # transition_idxs = np.random.randint(self.max_episode_length, size=batch_size) - transition_idxs = np.random.randint(max_timestep_idx) - # get selected timesteps - transitions = {key: self.buffer[key][episode_idxs, transition_idxs].copy() for key in self.buffer.keys()} - # get her samples indices with her_ratio - her_idxs = np.random.choice(np.arange(batch_size), int(self.her_ratio * batch_size), replace=False) - - # if we sample goals from future delete indices from her_idxs where we have no transition after current one - if self.goal_selection_strategy == GoalSelectionStrategy.FUTURE: - her_idxs = her_idxs[transition_idxs[her_idxs] != max_timestep_idx[her_idxs] - 1] - - # get new goals with goal selection strategy - her_new_goals = [ - self.sample_goal(self.goal_selection_strategy, trans, episode, self.buffer["achieved_goal"], online_sampling=True) - for episode, trans in zip(self.buffer["achieved_goal"][episode_idxs[her_idxs]], transition_idxs[her_idxs]) - ] - - # assign new goals as desired_goals - for idx, goal in enumerate(her_new_goals): - # observation - transitions["desired_goal"][her_idxs][idx] = goal - # next observation - transitions["next_desired_goal"][her_idxs][idx] = goal - - # compute new rewards with new goal - achieved_goals = transitions["next_achieved_goal"][her_idxs] - new_rewards = transitions["reward"].copy() - new_rewards[her_idxs] = [ - self.env.env_method("compute_reward", achieved_goal, new_goal, None) - for achieved_goal, new_goal in zip(achieved_goals, her_new_goals) - ] - - # concatenate observation with (desired) goal - obs = [ - np.concatenate([obs, desired_goal], axis=1) - for obs, desired_goal in zip(transitions["observation"], transitions["desired_goal"]) - ] - next_obs = [ - np.concatenate([obs, desired_goal], axis=1) - for obs, desired_goal in 
zip(transitions["next_obs"], transitions["next_desired_goal"]) - ] + episode_indices = np.random.randint(0, self.n_episodes_stored, batch_size) + her_episode_indices = episode_indices[: int(self.her_ratio * batch_size)] + + observations = np.zeros((batch_size, self.env.obs_dim + self.env.goal_dim), dtype=self.observation_space.dtype) + actions = np.zeros((batch_size, 1), dtype=self.action_space.dtype) + next_observations = np.zeros((batch_size, self.env.obs_dim + self.env.goal_dim), dtype=self.observation_space.dtype) + dones = np.zeros((batch_size, 1), dtype=np.float32) + rewards = np.zeros((batch_size, 1), dtype=np.float32) + + for idx, ep_length in enumerate(self.episode_lengths[episode_indices]): + skip_her_sampling = False + if episode_indices[idx] in her_episode_indices and self.goal_selection_strategy == GoalSelectionStrategy.FUTURE: + max_timestep = ep_length - 1 + # handle the case of 1 step episode: we must use a normal transition then + if max_timestep == 0: + max_timestep = ep_length + skip_her_sampling = True + else: + max_timestep = ep_length + + transition_idx = np.random.randint(max_timestep) + transition = {key: self.buffer[key][episode_indices[idx], transition_idx].copy() for key in self.buffer.keys()} + + if episode_indices[idx] in her_episode_indices and not skip_her_sampling: + episode = self.buffer["achieved_goal"][episode_indices[idx]] + new_goal = self.sample_goal( + self.goal_selection_strategy, transition_idx, episode, self.buffer["achieved_goal"], online_sampling=True + ) + # observation + transition["desired_goal"] = new_goal + # next observation + transition["next_desired_goal"] = new_goal + transition["reward"] = self.env.env_method("compute_reward", transition["next_achieved_goal"], new_goal, None) + # TODO: check that it does not change anything + # transition["done"] = False + + # concatenate observation with (desired) goal + obs = np.concatenate([transition["observation"], transition["desired_goal"]], axis=1) + next_obs = np.concatenate([transition["next_obs"], transition["desired_goal"]], axis=1) + observations[idx] = obs + next_observations[idx] = next_obs + actions[idx] = transition["action"] + dones[idx] = transition["done"] + rewards[idx] = transition["reward"] data = ( - self._normalize_obs(np.asarray(obs, dtype=np.int8), env), - transitions["action"], - self._normalize_obs(np.asarray(next_obs, dtype=np.int8), env), - transitions["done"], - self._normalize_obs(new_rewards, env), + self._normalize_obs(observations, env), + actions, + self._normalize_obs(next_observations, env), + dones, + self._normalize_reward(rewards, env), ) return ReplayBufferSamples(*tuple(map(self.to_torch, data))) @@ -218,26 +215,16 @@ def add(self, obs: np.ndarray, next_obs: np.ndarray, action: np.ndarray, reward: episode_length = len(action) episode = self._get_episode_dict(obs, next_obs, action, reward, done) - # check if replay buffer has enough space for all transitions of episode - if self.n_transitions_stored + episode_length <= self.buffer_size: - for key in self.buffer.keys(): - self.buffer[key][self._n_episodes_stored][:episode_length] = episode[key] - # add episode length to length storage - self.episode_lengths[self._n_episodes_stored] = episode_length - # update replay size - self.n_episodes_stored += 1 - self.n_transitions_stored += episode_length - elif self.full: - # if replay buffer is full take random stored episode and replace it - idx = np.random.randint(0, self.n_episodes_stored) - - for key in self.buffer.keys(): - self.buffer[key][idx][:episode_length] = 
episode[key] - # add episode length to length storage - self.episode_lengths[idx] = episode_length - - if self.n_transitions_stored == self.buffer_size: + for key in self.buffer.keys(): + self.buffer[key][self.pos][:episode_length] = episode[key] + # add episode length to length storage + self.episode_lengths[self.pos] = episode_length + + # update current pointer + self.pos += 1 + if self.pos == self.n_episodes: self.full = True + self.pos = 0 def _get_episode_dict(self, obs, next_obs, action, reward, done) -> dict: """ @@ -284,27 +271,15 @@ def _get_episode_dict(self, obs, next_obs, action, reward, done) -> dict: @property def n_episodes_stored(self): - return self._n_episodes_stored - - @n_episodes_stored.setter - def n_episodes_stored(self, n): - self._n_episodes_stored = n - - @property - def n_transitions_stored(self): - return self._n_transitions_stored - - @n_transitions_stored.setter - def n_transitions_stored(self, n): - self._n_transitions_stored = n + if self.full: + return self.n_episodes + return self.pos def clear_buffer(self): - self.buffer = [] - self.n_episodes_stored = 0 - self.n_transitions_stored = 0 + self.buffer = {} def size(self) -> int: """ :return: (int) The current size of the buffer in transitions. """ - return self.n_transitions_stored + return int(np.sum(self.episode_lengths)) From eefea130c53cfea38c3986fdefb71fcaefb26dfe Mon Sep 17 00:00:00 2001 From: Megan Klaiber Date: Mon, 24 Aug 2020 20:51:52 +0200 Subject: [PATCH 23/81] Fixed action bug. Updated time limit for episodes. --- stable_baselines3/her/her.py | 14 +++++++--- stable_baselines3/her/her_replay_buffer.py | 16 +++++------- tests/test_her.py | 30 +++++++++------------- 3 files changed, 30 insertions(+), 30 deletions(-) diff --git a/stable_baselines3/her/her.py b/stable_baselines3/her/her.py index a181ddad73..e6d3f23e7d 100644 --- a/stable_baselines3/her/her.py +++ b/stable_baselines3/her/her.py @@ -3,6 +3,7 @@ from typing import Callable, Iterable, List, Optional, Tuple, Type, Union import numpy as np +from gym.wrappers import TimeLimit from stable_baselines3.common.base_class import BaseAlgorithm from stable_baselines3.common.buffers import ReplayBuffer @@ -59,7 +60,7 @@ def __init__( goal_selection_strategy: Union[GoalSelectionStrategy, str] = "future", online_sampling: bool = False, learning_rate: Union[float, Callable] = 3e-4, - max_episode_length: int = 1000, + max_episode_length: int = -1, *args, **kwargs, ): @@ -100,6 +101,10 @@ def __init__( # counter for steps in episode self.episode_steps = 0 if self.online_sampling: + if isinstance(env, TimeLimit): + self.max_episode_length = env._max_episode_steps # pytype: disable=attribute-error + elif self.max_episode_length <= 0: + raise ValueError("The maximum episode length must be greater than zero.") self.model.replay_buffer = HerReplayBuffer( self.env, self.buffer_size, @@ -288,7 +293,7 @@ def collect_rollouts( if 0 < n_steps <= total_steps: break - if done or self.episode_steps >= self.max_episode_length: + if done or self.episode_steps == self.max_episode_length: if self.online_sampling: observations, actions, rewards, next_observations, done = zip(*self._episode_storage) self.replay_buffer.add(observations, next_observations, actions, rewards, done) @@ -338,6 +343,10 @@ def _store_transitions(self) -> None: # store data in replay buffer self.replay_buffer.add(obs, new_obs, action, reward, done) + # We cannot sample a goal from the future in the last step of an episode + if idx == len(self._episode_storage) - 1 and self.goal_selection_strategy 
== GoalSelectionStrategy.FUTURE: + break + # sample set of additional goals obs_dim = observation["observation"].shape[1] sampled_goals = [ @@ -348,7 +357,6 @@ def _store_transitions(self) -> None: ) for _ in range(self.n_sampled_goal) ) - if sample is not None ] # iterate over sampled new transitions in replay buffer diff --git a/stable_baselines3/her/her_replay_buffer.py b/stable_baselines3/her/her_replay_buffer.py index 70ce5eee39..b448a45c5d 100644 --- a/stable_baselines3/her/her_replay_buffer.py +++ b/stable_baselines3/her/her_replay_buffer.py @@ -1,4 +1,4 @@ -from typing import Optional, Type, Union +from typing import Optional, Union import numpy as np import torch as th @@ -6,7 +6,7 @@ from stable_baselines3.common.buffers import BaseBuffer from stable_baselines3.common.type_aliases import ReplayBufferSamples -from stable_baselines3.common.vec_env import VecEnv, VecNormalize +from stable_baselines3.common.vec_env import VecNormalize from stable_baselines3.common.vec_env.dict_obs_wrapper import ObsWrapper from stable_baselines3.her.goal_selection_strategy import GoalSelectionStrategy @@ -95,7 +95,7 @@ def _sample_transitions(self, batch_size: int, env: Optional[VecNormalize]) -> R her_episode_indices = episode_indices[: int(self.her_ratio * batch_size)] observations = np.zeros((batch_size, self.env.obs_dim + self.env.goal_dim), dtype=self.observation_space.dtype) - actions = np.zeros((batch_size, 1), dtype=self.action_space.dtype) + actions = np.zeros((batch_size, self.action_dim), dtype=self.action_space.dtype) next_observations = np.zeros((batch_size, self.env.obs_dim + self.env.goal_dim), dtype=self.observation_space.dtype) dones = np.zeros((batch_size, 1), dtype=np.float32) rewards = np.zeros((batch_size, 1), dtype=np.float32) @@ -174,12 +174,10 @@ def sample_goal( return episode[-1][0]["achieved_goal"] elif goal_selection_strategy == GoalSelectionStrategy.FUTURE: # replay with random state which comes from the same episode and was observed after current transition - # we have no transition after last transition of episode - if (sample_idx + 1) < len(episode): - index = np.random.choice(np.arange(sample_idx + 1, len(episode))) - if online_sampling: - return episode[index] - return episode[index][0]["achieved_goal"] + index = np.random.choice(np.arange(sample_idx + 1, len(episode))) + if online_sampling: + return episode[index] + return episode[index][0]["achieved_goal"] elif goal_selection_strategy == GoalSelectionStrategy.EPISODE: # replay with random state which comes from the same episode as current transition index = np.random.choice(np.arange(len(episode))) diff --git a/tests/test_her.py b/tests/test_her.py index b39c54b241..34254f6a4d 100644 --- a/tests/test_her.py +++ b/tests/test_her.py @@ -21,8 +21,8 @@ def test_her(model_class, policy, online_sampling): """ Test Hindsight Experience Replay. 
""" - - env = BitFlippingEnv(n_bits=4, continuous=True) + n_bits = 4 + env = BitFlippingEnv(n_bits=n_bits, continuous=True) env = DummyVecEnv([lambda: env]) # Create action noise @@ -47,6 +47,7 @@ def test_her(model_class, policy, online_sampling): gradient_steps=1, train_freq=1, n_episodes_rollout=-1, + max_episode_length=n_bits, ) model.learn(total_timesteps=500, callback=None) @@ -115,6 +116,7 @@ def test_goal_selection_strategy(goal_selection_strategy, online_sampling): gradient_steps=1, train_freq=1, n_episodes_rollout=-1, + max_episode_length=10, ) model.learn(total_timesteps=200, callback=None) @@ -124,7 +126,8 @@ def test_save_load(tmp_path, model_class, policy): """ Test if 'save' and 'load' saves and loads model correctly """ - env = BitFlippingEnv(n_bits=4, continuous=True) + n_bits = 4 + env = BitFlippingEnv(n_bits=n_bits, continuous=True) env = DummyVecEnv([lambda: env]) # Create action noise @@ -150,6 +153,7 @@ def test_save_load(tmp_path, model_class, policy): gradient_steps=1, train_freq=1, n_episodes_rollout=-1, + max_episode_length=n_bits, ) model.learn(total_timesteps=500, callback=None) @@ -219,28 +223,18 @@ def test_dqn_her(online_sampling, n_bits): "MlpPolicy", env, DQN, - n_sampled_goal=4, + n_sampled_goal=5, goal_selection_strategy="future", online_sampling=online_sampling, verbose=1, - tau=1, - batch_size=32, learning_rate=0.0005, max_episode_length=n_bits, - policy_kwargs=dict(net_arch=[64, 64]), - buffer_size=50000, - gamma=0.99, - gradient_steps=1, train_freq=1, - n_episodes_rollout=-1, - tensorboard_log="tensorboard", - learning_starts=1000, - exploration_fraction=0.1, + learning_starts=100, exploration_final_eps=0.02, - exploration_initial_eps=1.0, target_update_interval=500, + seed=0, + batch_size=32, ) - tb_log_name = "run_" + str(online_sampling) + "_" + str(n_bits) - - model.learn(total_timesteps=20000, callback=None, tb_log_name=tb_log_name) + model.learn(total_timesteps=20000) From b5b00db2fe6e2b775e34ac0abafd32141f40a20a Mon Sep 17 00:00:00 2001 From: Megan Klaiber Date: Mon, 24 Aug 2020 21:11:06 +0200 Subject: [PATCH 24/81] Updated convert_dict method to take keys as arguments. --- stable_baselines3/common/vec_env/dict_obs_wrapper.py | 12 ++++++++---- stable_baselines3/her/her_replay_buffer.py | 4 ++-- 2 files changed, 10 insertions(+), 6 deletions(-) diff --git a/stable_baselines3/common/vec_env/dict_obs_wrapper.py b/stable_baselines3/common/vec_env/dict_obs_wrapper.py index 55e5283b06..4d96664962 100644 --- a/stable_baselines3/common/vec_env/dict_obs_wrapper.py +++ b/stable_baselines3/common/vec_env/dict_obs_wrapper.py @@ -52,11 +52,15 @@ def step_wait(self): return self.venv.step_wait() @staticmethod - def convert_dict(observation: dict) -> np.ndarray: + def convert_dict( + observation_dict: dict, observation_key: str = "observation", goal_key: str = "desired_goal" + ) -> np.ndarray: """ - Concatenate observation and desired goal of observation dict. + Concatenate observation and (desired) goal of observation dict. - :param observation: (dict) + :param observation_dict: (dict) Dictionary with observation. + :param observation_key: (str) Key of observation in dicitonary. + :param goal_key: (str) Key of (desired) goal in dicitonary. 
:return: (np.ndarray) """ - return np.concatenate([observation["observation"], observation["desired_goal"]], axis=1) + return np.concatenate([observation_dict[observation_key], observation_dict[goal_key]], axis=1) diff --git a/stable_baselines3/her/her_replay_buffer.py b/stable_baselines3/her/her_replay_buffer.py index b448a45c5d..1e267860a1 100644 --- a/stable_baselines3/her/her_replay_buffer.py +++ b/stable_baselines3/her/her_replay_buffer.py @@ -128,8 +128,8 @@ def _sample_transitions(self, batch_size: int, env: Optional[VecNormalize]) -> R # transition["done"] = False # concatenate observation with (desired) goal - obs = np.concatenate([transition["observation"], transition["desired_goal"]], axis=1) - next_obs = np.concatenate([transition["next_obs"], transition["desired_goal"]], axis=1) + obs = ObsWrapper.convert_dict(transition) + next_obs = ObsWrapper.convert_dict(transition, observation_key="next_obs") observations[idx] = obs next_observations[idx] = next_obs actions[idx] = transition["action"] From fb229b7bb77fcf511c06d495b511005f3dd6a19f Mon Sep 17 00:00:00 2001 From: Megan Klaiber Date: Tue, 25 Aug 2020 14:13:15 +0200 Subject: [PATCH 25/81] Renamed obs dict wrapper. --- stable_baselines3/common/base_class.py | 6 ++++ stable_baselines3/common/policies.py | 4 +-- ...ict_obs_wrapper.py => obs_dict_wrapper.py} | 4 +-- stable_baselines3/her/her.py | 35 ++++++------------- stable_baselines3/her/her_replay_buffer.py | 8 ++--- tests/test_her.py | 4 +-- 6 files changed, 26 insertions(+), 35 deletions(-) rename stable_baselines3/common/vec_env/{dict_obs_wrapper.py => obs_dict_wrapper.py} (95%) diff --git a/stable_baselines3/common/base_class.py b/stable_baselines3/common/base_class.py index a3de8cef86..2d16655fc6 100644 --- a/stable_baselines3/common/base_class.py +++ b/stable_baselines3/common/base_class.py @@ -27,6 +27,7 @@ update_learning_rate, ) from stable_baselines3.common.vec_env import DummyVecEnv, VecEnv, VecNormalize, VecTransposeImage, unwrap_vec_normalize +from stable_baselines3.common.vec_env.obs_dict_wrapper import ObsDictWrapper def maybe_make_env(env: Union[GymEnv, str, None], monitor_wrapper: bool, verbose: int) -> Optional[GymEnv]: @@ -171,6 +172,11 @@ def _wrap_env(self, env: GymEnv) -> VecEnv: if self.verbose >= 1: print("Wrapping the env in a VecTransposeImage.") env = VecTransposeImage(env) + + # check if wrapper for dict support is needed + if isinstance(env.observation_space, gym.spaces.dict.Dict): + env = ObsDictWrapper(env) + return env @abstractmethod diff --git a/stable_baselines3/common/policies.py b/stable_baselines3/common/policies.py index be08e9c35e..babcc99464 100644 --- a/stable_baselines3/common/policies.py +++ b/stable_baselines3/common/policies.py @@ -23,7 +23,7 @@ from stable_baselines3.common.torch_layers import BaseFeaturesExtractor, FlattenExtractor, MlpExtractor, NatureCNN, create_mlp from stable_baselines3.common.utils import get_device, is_vectorized_observation from stable_baselines3.common.vec_env import VecTransposeImage -from stable_baselines3.common.vec_env.dict_obs_wrapper import ObsWrapper +from stable_baselines3.common.vec_env.obs_dict_wrapper import ObsDictWrapper class BaseModel(nn.Module, ABC): @@ -236,7 +236,7 @@ def predict( # if mask is None: # mask = [False for _ in range(self.n_envs)] if isinstance(observation, dict): - observation = ObsWrapper.convert_dict(observation) + observation = ObsDictWrapper.convert_dict(observation) else: observation = np.array(observation) diff --git 
a/stable_baselines3/common/vec_env/dict_obs_wrapper.py b/stable_baselines3/common/vec_env/obs_dict_wrapper.py similarity index 95% rename from stable_baselines3/common/vec_env/dict_obs_wrapper.py rename to stable_baselines3/common/vec_env/obs_dict_wrapper.py index 4d96664962..d524d5e6de 100644 --- a/stable_baselines3/common/vec_env/dict_obs_wrapper.py +++ b/stable_baselines3/common/vec_env/obs_dict_wrapper.py @@ -4,7 +4,7 @@ from stable_baselines3.common.vec_env import VecEnv, VecEnvWrapper -class ObsWrapper(VecEnvWrapper): +class ObsDictWrapper(VecEnvWrapper): """ Wrapper for a VecEnv which overrides the observation space for Hindsight Experience Replay to support dict observations. @@ -12,7 +12,7 @@ class ObsWrapper(VecEnvWrapper): """ def __init__(self, venv: VecEnv): - super(ObsWrapper, self).__init__(venv, venv.observation_space, venv.action_space) + super(ObsDictWrapper, self).__init__(venv, venv.observation_space, venv.action_space) self.venv = venv diff --git a/stable_baselines3/her/her.py b/stable_baselines3/her/her.py index e6d3f23e7d..c59cbaddd4 100644 --- a/stable_baselines3/her/her.py +++ b/stable_baselines3/her/her.py @@ -2,6 +2,7 @@ import pathlib from typing import Callable, Iterable, List, Optional, Tuple, Type, Union +import gym import numpy as np from gym.wrappers import TimeLimit @@ -14,27 +15,12 @@ from stable_baselines3.common.save_util import load_from_zip_file, recursive_getattr, recursive_setattr from stable_baselines3.common.type_aliases import GymEnv, MaybeCallback, RolloutReturn from stable_baselines3.common.utils import check_for_correct_spaces -from stable_baselines3.common.vec_env import VecEnv, VecEnvWrapper -from stable_baselines3.common.vec_env.dict_obs_wrapper import ObsWrapper +from stable_baselines3.common.vec_env import VecEnv +from stable_baselines3.common.vec_env.obs_dict_wrapper import ObsDictWrapper from stable_baselines3.her.goal_selection_strategy import KEY_TO_GOAL_STRATEGY, GoalSelectionStrategy from stable_baselines3.her.her_replay_buffer import HerReplayBuffer -def check_wrapped_env(env: VecEnv) -> VecEnv: - """ - Check if the environment is already wrapped by an ObsWrapper. - - :param env: (VecEnv) Environment to check. 
- :return: (VecEnv) env - """ - env_tmp = env - while isinstance(env_tmp, VecEnvWrapper): - if isinstance(env_tmp, ObsWrapper): - return env - env_tmp = env_tmp.venv - return ObsWrapper(env) - - class HER(BaseAlgorithm): """ Hindsight Experience Replay (HER) @@ -67,9 +53,6 @@ def __init__( super(HER, self).__init__(policy=BasePolicy, env=env, policy_base=BasePolicy, learning_rate=learning_rate) - # check if wrapper for dict support is needed - self.env = check_wrapped_env(self.env) - # model initialization self.model_class = model_class self.model = model_class( @@ -101,7 +84,7 @@ def __init__( # counter for steps in episode self.episode_steps = 0 if self.online_sampling: - if isinstance(env, TimeLimit): + if isinstance(self.env, TimeLimit): self.max_episode_length = env._max_episode_steps # pytype: disable=attribute-error elif self.max_episode_length <= 0: raise ValueError("The maximum episode length must be greater than zero.") @@ -231,7 +214,7 @@ def collect_rollouts( while not done: # concatenate observation and (desired) goal observation = self._last_obs - self._last_obs = ObsWrapper.convert_dict(observation) + self._last_obs = ObsDictWrapper.convert_dict(observation) if self.use_sde and self.sde_sample_freq > 0 and total_steps % self.sde_sample_freq == 0: # Sample a new noise matrix @@ -337,8 +320,8 @@ def _store_transitions(self) -> None: observation, action, reward, new_observation, done = trans # concatenate observation with (desired) goal - obs = ObsWrapper.convert_dict(observation) - new_obs = ObsWrapper.convert_dict(new_observation) + obs = ObsDictWrapper.convert_dict(observation) + new_obs = ObsDictWrapper.convert_dict(new_observation) # store data in replay buffer self.replay_buffer.add(obs, new_obs, action, reward, done) @@ -434,7 +417,9 @@ def load(cls, load_path: str, env: Optional[GymEnv] = None, **kwargs) -> "BaseAl raise KeyError("The observation_space and action_space were not given, can't verify new environments") # check if given env is valid if env is not None: - env = check_wrapped_env(env) + # check if wrapper for dict support is needed + if isinstance(env.observation_space, gym.spaces.dict.Dict): + env = ObsDictWrapper(env) check_for_correct_spaces(env, data["observation_space"], data["action_space"]) # if no new env was given use stored env if possible if env is None and "env" in data: diff --git a/stable_baselines3/her/her_replay_buffer.py b/stable_baselines3/her/her_replay_buffer.py index 1e267860a1..3fd98267b0 100644 --- a/stable_baselines3/her/her_replay_buffer.py +++ b/stable_baselines3/her/her_replay_buffer.py @@ -7,7 +7,7 @@ from stable_baselines3.common.buffers import BaseBuffer from stable_baselines3.common.type_aliases import ReplayBufferSamples from stable_baselines3.common.vec_env import VecNormalize -from stable_baselines3.common.vec_env.dict_obs_wrapper import ObsWrapper +from stable_baselines3.common.vec_env.obs_dict_wrapper import ObsDictWrapper from stable_baselines3.her.goal_selection_strategy import GoalSelectionStrategy @@ -30,7 +30,7 @@ class HerReplayBuffer(BaseBuffer): def __init__( self, - env: ObsWrapper, + env: ObsDictWrapper, buffer_size: int, max_episode_length: int, goal_selection_strategy: GoalSelectionStrategy, @@ -128,8 +128,8 @@ def _sample_transitions(self, batch_size: int, env: Optional[VecNormalize]) -> R # transition["done"] = False # concatenate observation with (desired) goal - obs = ObsWrapper.convert_dict(transition) - next_obs = ObsWrapper.convert_dict(transition, observation_key="next_obs") + obs = 
ObsDictWrapper.convert_dict(transition) + next_obs = ObsDictWrapper.convert_dict(transition, observation_key="next_obs") observations[idx] = obs next_observations[idx] = next_obs actions[idx] = transition["action"] diff --git a/tests/test_her.py b/tests/test_her.py index 34254f6a4d..2c7a90ddc5 100644 --- a/tests/test_her.py +++ b/tests/test_her.py @@ -9,7 +9,7 @@ from stable_baselines3.common.bit_flipping_env import BitFlippingEnv from stable_baselines3.common.noise import OrnsteinUhlenbeckActionNoise from stable_baselines3.common.vec_env import DummyVecEnv -from stable_baselines3.common.vec_env.dict_obs_wrapper import ObsWrapper +from stable_baselines3.common.vec_env.obs_dict_wrapper import ObsDictWrapper from stable_baselines3.her.her import HER, GoalSelectionStrategy from stable_baselines3.sac.policies import SACPolicy from stable_baselines3.td3.policies import MlpPolicy, TD3Policy @@ -163,7 +163,7 @@ def test_save_load(tmp_path, model_class, policy): observations_list = [] for _ in range(10): obs = env.step([env.action_space.sample()])[0] - observation = ObsWrapper.convert_dict(obs) + observation = ObsDictWrapper.convert_dict(obs) observations_list.append(observation) observations = np.concatenate(observations_list, axis=0) From 8a93ac9f020b1cf2a5b226a7ededd776305b8df0 Mon Sep 17 00:00:00 2001 From: Antonin Raffin Date: Tue, 25 Aug 2020 14:19:22 +0200 Subject: [PATCH 26/81] Seed bit flipping env --- stable_baselines3/common/bit_flipping_env.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/stable_baselines3/common/bit_flipping_env.py b/stable_baselines3/common/bit_flipping_env.py index b579fe1579..fd9998a6c1 100644 --- a/stable_baselines3/common/bit_flipping_env.py +++ b/stable_baselines3/common/bit_flipping_env.py @@ -61,7 +61,9 @@ def __init__( max_steps = n_bits self.max_steps = max_steps self.current_step = 0 - self.reset() + + def seed(self, seed: int) -> None: + self.obs_space.seed(seed) def convert_if_needed(self, state: np.ndarray) -> Union[int, np.ndarray]: """ From 66ab30ceb86fd824d9bded78b2af7541c12bddc1 Mon Sep 17 00:00:00 2001 From: Antonin Raffin Date: Tue, 25 Aug 2020 14:59:58 +0200 Subject: [PATCH 27/81] Remove get_episode_dict --- setup.cfg | 1 + stable_baselines3/her/her.py | 12 ++- stable_baselines3/her/her_replay_buffer.py | 112 ++++++++------------- 3 files changed, 49 insertions(+), 76 deletions(-) diff --git a/setup.cfg b/setup.cfg index 011c3d9b17..4b5d439182 100644 --- a/setup.cfg +++ b/setup.cfg @@ -29,6 +29,7 @@ per-file-ignores = ./stable_baselines3/a2c/__init__.py:F401 ./stable_baselines3/ddpg/__init__.py:F401 ./stable_baselines3/dqn/__init__.py:F401 + ./stable_baselines3/her/__init__.py:F401 ./stable_baselines3/ppo/__init__.py:F401 ./stable_baselines3/sac/__init__.py:F401 ./stable_baselines3/td3/__init__.py:F401 diff --git a/stable_baselines3/her/her.py b/stable_baselines3/her/her.py index e6d3f23e7d..df0c00cae8 100644 --- a/stable_baselines3/her/her.py +++ b/stable_baselines3/her/her.py @@ -266,8 +266,11 @@ def collect_rollouts( self._last_original_obs, new_obs_, reward_ = observation, new_obs, reward self.model._last_original_obs = self._last_original_obs - # add current transition to episode storage - self._episode_storage.append((self._last_original_obs, buffer_action, reward_, new_obs_, done)) + if self.online_sampling: + self.replay_buffer.add(self._last_original_obs, new_obs_, buffer_action, reward_, done) + else: + # add current transition to episode storage + self._episode_storage.append((self._last_original_obs, new_obs_, 
buffer_action, reward_, done)) self._last_obs = new_obs self.model._last_obs = self._last_obs @@ -295,8 +298,7 @@ def collect_rollouts( if done or self.episode_steps == self.max_episode_length: if self.online_sampling: - observations, actions, rewards, next_observations, done = zip(*self._episode_storage) - self.replay_buffer.add(observations, next_observations, actions, rewards, done) + self.replay_buffer.store_episode() else: # store episode in replay buffer self._store_transitions() @@ -334,7 +336,7 @@ def _store_transitions(self) -> None: # iterate over current episodes transitions for idx, trans in enumerate(self._episode_storage): - observation, action, reward, new_observation, done = trans + observation, new_observation, action, reward, done = trans # concatenate observation with (desired) goal obs = ObsWrapper.convert_dict(observation) diff --git a/stable_baselines3/her/her_replay_buffer.py b/stable_baselines3/her/her_replay_buffer.py index 1e267860a1..33e1a8aac8 100644 --- a/stable_baselines3/her/her_replay_buffer.py +++ b/stable_baselines3/her/her_replay_buffer.py @@ -1,4 +1,4 @@ -from typing import Optional, Union +from typing import Dict, Optional, Union import numpy as np import torch as th @@ -49,8 +49,8 @@ def __init__( # buffer with episodes # number of episodes which can be stored until buffer size is reached - n_episodes = self.buffer_size // self.max_episode_length - self.n_episodes = n_episodes + self.max_episode_stored = self.buffer_size // self.max_episode_length + self.current_idx = 0 # input dimensions for buffer initialization input_shape = { @@ -65,10 +65,11 @@ def __init__( "done": (1,), } self.buffer = { - key: np.empty((n_episodes, self.max_episode_length, *dim), dtype=np.float32) for key, dim in input_shape.items() + key: np.empty((self.max_episode_stored, self.max_episode_length, *dim), dtype=np.float32) + for key, dim in input_shape.items() } # episode length storage, needed for episodes which has less steps than the maximum length - self.episode_lengths = np.empty(n_episodes, dtype=np.uint64) + self.episode_lengths = np.empty(self.max_episode_stored, dtype=np.uint64) self.goal_selection_strategy = goal_selection_strategy # percentage of her indices @@ -92,7 +93,7 @@ def _sample_transitions(self, batch_size: int, env: Optional[VecNormalize]) -> R """ # Select which episodes to use episode_indices = np.random.randint(0, self.n_episodes_stored, batch_size) - her_episode_indices = episode_indices[: int(self.her_ratio * batch_size)] + her_episode_indices = set(episode_indices[: int(self.her_ratio * batch_size)]) observations = np.zeros((batch_size, self.env.obs_dim + self.env.goal_dim), dtype=self.observation_space.dtype) actions = np.zeros((batch_size, self.action_dim), dtype=self.action_space.dtype) @@ -101,21 +102,23 @@ def _sample_transitions(self, batch_size: int, env: Optional[VecNormalize]) -> R rewards = np.zeros((batch_size, 1), dtype=np.float32) for idx, ep_length in enumerate(self.episode_lengths[episode_indices]): - skip_her_sampling = False - if episode_indices[idx] in her_episode_indices and self.goal_selection_strategy == GoalSelectionStrategy.FUTURE: + her_sampling = episode_indices[idx] in her_episode_indices + + if her_sampling and self.goal_selection_strategy == GoalSelectionStrategy.FUTURE: max_timestep = ep_length - 1 # handle the case of 1 step episode: we must use a normal transition then if max_timestep == 0: max_timestep = ep_length - skip_her_sampling = True + her_sampling = False else: max_timestep = ep_length transition_idx = 
np.random.randint(max_timestep) transition = {key: self.buffer[key][episode_indices[idx], transition_idx].copy() for key in self.buffer.keys()} - if episode_indices[idx] in her_episode_indices and not skip_her_sampling: - episode = self.buffer["achieved_goal"][episode_indices[idx]] + if her_sampling: + episode = self.buffer["achieved_goal"][episode_indices[idx]][: self.episode_lengths[episode_indices[idx]]] + # TODO: check that episode lenght is taken into account for all sampling strategies new_goal = self.sample_goal( self.goal_selection_strategy, transition_idx, episode, self.buffer["achieved_goal"], online_sampling=True ) @@ -200,77 +203,44 @@ def sample_goal( else: raise ValueError("Strategy for sampling goals not supported!") - def add(self, obs: np.ndarray, next_obs: np.ndarray, action: np.ndarray, reward: np.ndarray, done: np.ndarray) -> None: - """ - Add episode to replay buffer + def add( + self, + obs: Dict[str, np.ndarray], + next_obs: Dict[str, np.ndarray], + action: np.ndarray, + reward: np.ndarray, + done: np.ndarray, + ) -> None: + + self.buffer["observation"][self.pos][self.current_idx] = obs["observation"] + self.buffer["achieved_goal"][self.pos][self.current_idx] = obs["achieved_goal"] + self.buffer["desired_goal"][self.pos][self.current_idx] = obs["desired_goal"] + self.buffer["action"][self.pos][self.current_idx] = action + self.buffer["done"][self.pos][self.current_idx] = done + self.buffer["reward"][self.pos][self.current_idx] = reward + self.buffer["next_obs"][self.pos][self.current_idx] = next_obs["observation"] + self.buffer["next_achieved_goal"][self.pos][self.current_idx] = next_obs["achieved_goal"] + self.buffer["next_desired_goal"][self.pos][self.current_idx] = next_obs["desired_goal"] - :param obs: (np.ndarray) Observation. - :param next_obs: (np.ndarray) Next observation. - :param action: (np.ndarray) Action. - :param reward: (np.ndarray) Reward. - :param done: (np.ndarray) Done. - """ - episode_length = len(action) - episode = self._get_episode_dict(obs, next_obs, action, reward, done) + # update current pointer + self.current_idx += 1 - for key in self.buffer.keys(): - self.buffer[key][self.pos][:episode_length] = episode[key] + def store_episode(self): # add episode length to length storage - self.episode_lengths[self.pos] = episode_length + self.episode_lengths[self.pos] = self.current_idx - # update current pointer + # update current episode pointer self.pos += 1 - if self.pos == self.n_episodes: + if self.pos == self.max_episode_stored: self.full = True self.pos = 0 - - def _get_episode_dict(self, obs, next_obs, action, reward, done) -> dict: - """ - Convert episode to dictionary. - - :param obs: (np.ndarray) Observation. - :param next_obs: (np.ndarray) Next observation. - :param action: (np.ndarray) Action. - :param reward: (np.ndarray) Reward. - :param done: (np.ndarray) Done. 
- """ - - observations = [] - achieved_goals = [] - desired_goals = [] - - for obs_ in obs: - observations.append(obs_["observation"]) - achieved_goals.append(obs_["achieved_goal"]) - desired_goals.append(obs_["desired_goal"]) - - next_observations = [] - next_achieved_goals = [] - next_desired_goals = [] - - for next_obs_ in next_obs: - next_observations.append(next_obs_["observation"]) - next_achieved_goals.append(next_obs_["achieved_goal"]) - next_desired_goals.append(next_obs_["desired_goal"]) - - episode = { - "observation": np.array(observations), - "achieved_goal": np.array(achieved_goals), - "desired_goal": np.array(desired_goals), - "action": action, - "reward": reward, - "next_obs": np.array(next_observations), - "next_achieved_goal": np.array(next_achieved_goals), - "next_desired_goal": np.array(next_desired_goals), - "done": done, - } - - return episode + # reset transition pointer + self.current_idx = 0 @property def n_episodes_stored(self): if self.full: - return self.n_episodes + return self.max_episode_stored return self.pos def clear_buffer(self): From d6a5524ab25cdab5836274dbbbd86f53f266aeb8 Mon Sep 17 00:00:00 2001 From: Antonin Raffin Date: Tue, 25 Aug 2020 16:28:27 +0200 Subject: [PATCH 28/81] Add fast online sampling version --- stable_baselines3/her/her_replay_buffer.py | 46 +++++++++++++++++++++- 1 file changed, 45 insertions(+), 1 deletion(-) diff --git a/stable_baselines3/her/her_replay_buffer.py b/stable_baselines3/her/her_replay_buffer.py index 5c91272125..709c3229af 100644 --- a/stable_baselines3/her/her_replay_buffer.py +++ b/stable_baselines3/her/her_replay_buffer.py @@ -82,7 +82,7 @@ def sample(self, batch_size: int, env: Optional[VecNormalize] = None) -> ReplayB to normalize the observations/rewards when sampling :return: (ReplayBufferSamples) """ - return self._sample_transitions(batch_size, env) + return self._sample_transitions_2(batch_size, env) def _sample_transitions(self, batch_size: int, env: Optional[VecNormalize]) -> ReplayBufferSamples: """ @@ -126,6 +126,7 @@ def _sample_transitions(self, batch_size: int, env: Optional[VecNormalize]) -> R transition["desired_goal"] = new_goal # next observation transition["next_desired_goal"] = new_goal + # TODO: vectorized computation of reward transition["reward"] = self.env.env_method("compute_reward", transition["next_achieved_goal"], new_goal, None) # TODO: check that it does not change anything # transition["done"] = False @@ -203,6 +204,46 @@ def sample_goal( else: raise ValueError("Strategy for sampling goals not supported!") + def _sample_transitions_2(self, batch_size: int, env: Optional[VecNormalize]) -> ReplayBufferSamples: + """ + :param batch_size: (int) Number of element to sample + :param env: (Optional[VecNormalize]) associated gym VecEnv + to normalize the observations/rewards when sampling + :return: (ReplayBufferSamples) + """ + # Select which episodes to use + episode_indices = np.random.randint(0, self.n_episodes_stored, batch_size) + transitions_indices = np.random.randint(self.episode_lengths[episode_indices], size=batch_size) + transitions = {key: self.buffer[key][episode_indices, transitions_indices].copy() for key in self.buffer.keys()} + + her_indices = np.random.permutation(batch_size)[: int(self.her_ratio * batch_size)] + future_offset = np.random.uniform(size=batch_size) * (self.episode_lengths[episode_indices] - transitions_indices) + future_offset = future_offset.astype(int) + future_indices = (transitions_indices + future_offset)[her_indices] + # future_indices = 
(transitions_indices + 1 + future_offset)[her_indices] + + future_achieved_goals = self.buffer["achieved_goal"][episode_indices[her_indices], future_indices] + transitions["desired_goal"][her_indices] = future_achieved_goals + + for idx in her_indices: + transitions["reward"][idx] = self.env.env_method( + "compute_reward", transitions["next_achieved_goal"][idx], transitions["desired_goal"][idx], None + ) + + # concatenate observation with (desired) goal + observations = ObsDictWrapper.convert_dict(transitions) + next_observations = ObsDictWrapper.convert_dict(transitions, observation_key="next_obs") + + data = ( + self._normalize_obs(observations, env), + transitions["action"], + self._normalize_obs(next_observations, env), + transitions["done"], + self._normalize_reward(transitions["reward"], env), + ) + + return ReplayBufferSamples(*tuple(map(self.to_torch, data))) + def add( self, obs: Dict[str, np.ndarray], @@ -230,6 +271,9 @@ def store_episode(self): self.episode_lengths[self.pos] = self.current_idx # update current episode pointer + # Note: in the OpenAI implementation + # when the buffer is full, the episode replaced + # is randomly chosen self.pos += 1 if self.pos == self.max_episode_stored: self.full = True From a3c08de8d59a1c73a16d30037fca06d5dfc32993 Mon Sep 17 00:00:00 2001 From: Megan Klaiber Date: Tue, 25 Aug 2020 16:58:15 +0200 Subject: [PATCH 29/81] Added documentation. --- docs/index.rst | 1 + docs/modules/her.rst | 110 +++++++++++++++++++++ stable_baselines3/her/__init__.py | 3 + stable_baselines3/her/her_replay_buffer.py | 3 +- tests/test_her.py | 3 +- 5 files changed, 118 insertions(+), 2 deletions(-) create mode 100644 docs/modules/her.rst diff --git a/docs/index.rst b/docs/index.rst index 939655a1c8..4bc2fbcc99 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -57,6 +57,7 @@ Main Features modules/a2c modules/ddpg modules/dqn + modules/her modules/ppo modules/sac modules/td3 diff --git a/docs/modules/her.rst b/docs/modules/her.rst new file mode 100644 index 0000000000..32532ba0f0 --- /dev/null +++ b/docs/modules/her.rst @@ -0,0 +1,110 @@ +.. _her: + +.. automodule:: stable_baselines3.her + + +HER +==== + +`Hindsight Experience Replay (HER) `_ + +HER is an algorithm that works with Off policy methods (DQN, SAC, TD3 and DDPG for example). + +.. warning:: + + HER requires the environment to inherits from `gym.GoalEnv `_ + + +Notes +----- + +- Original paper: https://arxiv.org/abs/1707.01495 +- OpenAI paper: `Plappert et al. (2018)`_ +- OpenAI blog post: https://openai.com/blog/ingredients-for-robotics-research/ + + +.. _Plappert et al. (2018): https://arxiv.org/abs/1802.09464 + +Can I use? +---------- + +Please refer to the wrapped model (DQN, SAC, TD3 or DDPG) for that section. + +Example +------- + +.. 
code-block:: python + + from stable_baselines3 import DDPG, DQN, SAC, TD3 + from stable_baselines3.her.her import HER + from stable_baselines3.her.goal_selection_strategy import GoalSelectionStrategy + from stable_baselines3.common.bit_flipping_env import BitFlippingEnv + from stable_baselines3.common.vec_env import DummyVecEnv + from stable_baselines3.common.vec_env.obs_dict_wrapper import ObsDictWrapper + + model_class = DQN # works also with SAC, DDPG and TD3 + N_BITS = 15 + + env = BitFlippingEnv(n_bits=N_BITS, continuous=model_class in [DDPG, SAC, TD3], max_steps=N_BITS) + + # Available strategies (cf paper): future, final, episode, random + goal_selection_strategy = 'future' # equivalent to GoalSelectionStrategy.FUTURE + + # If True the HER transitions will get sampled online + online_sampling = True + # Time limit for the episodes in online sampling (to deactivate for offline use the default value -1) + max_episode_length = N_BITS + + # Initialize the model + model = HER('MlpPolicy', env, model_class, n_sampled_goal=4, goal_selection_strategy=goal_selection_strategy, online_sampling=online_sampling, + verbose=1, max_episode_length=max_episode_length) + # Train the model + model.learn(1000) + + model.save("./her_bit_env") + + # WARNING: you must pass an VecEnv + env = DummyVecEnv([lambda: env]) + model = HER.load('./her_bit_env', env=env) + + obs = env.reset() + for _ in range(100): + # we need to convert the observation dict + obs = ObsDictWrapper.convert_dict(obs) + action, _ = model.model.predict(obs) + obs, reward, done, _ = env.step(action) + + if done: + obs = env.reset() + + +Parameters +---------- + +.. autoclass:: HER + :members: + +Goal Selection Strategies +------------------------- + +.. autoclass:: GoalSelectionStrategy + :members: + :inherited-members: + :undoc-members: + + +Obs Dict Wrapper +---------------- + +.. autoclass:: ObsDictWrapper + :members: + :inherited-members: + :undoc-members: + + +HER Replay Buffer +----------------- + +.. autoclass:: HerReplayBuffer + :members: + :inherited-members: diff --git a/stable_baselines3/her/__init__.py b/stable_baselines3/her/__init__.py index ce43bf04cf..24f347305a 100644 --- a/stable_baselines3/her/__init__.py +++ b/stable_baselines3/her/__init__.py @@ -1 +1,4 @@ +from stable_baselines3.common.vec_env.obs_dict_wrapper import ObsDictWrapper +from stable_baselines3.her.goal_selection_strategy import GoalSelectionStrategy from stable_baselines3.her.her import HER +from stable_baselines3.her.her_replay_buffer import HerReplayBuffer diff --git a/stable_baselines3/her/her_replay_buffer.py b/stable_baselines3/her/her_replay_buffer.py index 5c91272125..862e6a4b12 100644 --- a/stable_baselines3/her/her_replay_buffer.py +++ b/stable_baselines3/her/her_replay_buffer.py @@ -13,7 +13,8 @@ class HerReplayBuffer(BaseBuffer): """ - Replay Buffer for online Hindsight Experience Replay (HER) + Replay Buffer for sampling HER (Hindsight Experience Replay) transitions online. + These transitions will not be saved in the Buffer. :param env: (VecEnv) The training environment :param buffer_size: (int) The size of the buffer measured in transitions. 
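A note on the online buffer documented above: with ``online_sampling=True`` the buffer does not hold flat transitions but whole episodes, stored in ``buffer_size // max_episode_length`` fixed-size slots, and goal relabelling only happens when a batch is sampled (that part is illustrated further below, next to the vectorized goal-sampling patch). The sketch that follows is a deliberately simplified, self-contained illustration of that storage scheme; the class name, fields and shapes are invented for the example and are not the ``HerReplayBuffer`` API.

import numpy as np


class ToyEpisodeBuffer:
    """Toy episode-slot storage, illustrating the idea only (not the SB3 class)."""

    def __init__(self, buffer_size: int, max_episode_length: int, obs_dim: int, goal_dim: int):
        self.max_episode_length = max_episode_length
        # how many whole episodes fit into ``buffer_size`` transitions
        self.max_episode_stored = buffer_size // max_episode_length
        base_shape = (self.max_episode_stored, max_episode_length)
        self.observation = np.zeros(base_shape + (obs_dim,), dtype=np.float32)
        self.achieved_goal = np.zeros(base_shape + (goal_dim,), dtype=np.float32)
        self.desired_goal = np.zeros(base_shape + (goal_dim,), dtype=np.float32)
        self.episode_lengths = np.zeros(self.max_episode_stored, dtype=np.int64)
        self.pos = 0          # episode slot currently being written
        self.current_idx = 0  # next free transition inside that slot
        self.full = False

    def add(self, obs, achieved_goal, desired_goal):
        # write one transition into the current episode slot
        self.observation[self.pos, self.current_idx] = obs
        self.achieved_goal[self.pos, self.current_idx] = achieved_goal
        self.desired_goal[self.pos, self.current_idx] = desired_goal
        self.current_idx += 1

    def store_episode(self):
        # remember the true episode length, then move on to the next slot (ring buffer)
        self.episode_lengths[self.pos] = self.current_idx
        self.pos += 1
        if self.pos == self.max_episode_stored:
            self.full = True
            self.pos = 0
        self.current_idx = 0

    @property
    def n_episodes_stored(self) -> int:
        return self.max_episode_stored if self.full else self.pos


buffer = ToyEpisodeBuffer(buffer_size=100, max_episode_length=10, obs_dim=3, goal_dim=2)
for _ in range(7):
    buffer.add(np.ones(3), np.zeros(2), np.ones(2))
buffer.store_episode()
assert buffer.n_episodes_stored == 1 and buffer.episode_lengths[0] == 7

Because an episode slot is only recycled as a whole, the maximum episode length has to be known in advance, which is why ``max_episode_length`` is passed explicitly in the documentation example above (or inferred from a ``gym.wrappers.TimeLimit`` wrapper, as handled in an earlier patch).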
diff --git a/tests/test_her.py b/tests/test_her.py index 2c7a90ddc5..7bb62dc830 100644 --- a/tests/test_her.py +++ b/tests/test_her.py @@ -10,7 +10,8 @@ from stable_baselines3.common.noise import OrnsteinUhlenbeckActionNoise from stable_baselines3.common.vec_env import DummyVecEnv from stable_baselines3.common.vec_env.obs_dict_wrapper import ObsDictWrapper -from stable_baselines3.her.her import HER, GoalSelectionStrategy +from stable_baselines3.her.goal_selection_strategy import GoalSelectionStrategy +from stable_baselines3.her.her import HER from stable_baselines3.sac.policies import SACPolicy from stable_baselines3.td3.policies import MlpPolicy, TD3Policy From bbf5a9363e851e8d5ca0132c288a3ef67e8cc69d Mon Sep 17 00:00:00 2001 From: Antonin Raffin Date: Tue, 25 Aug 2020 17:39:46 +0200 Subject: [PATCH 30/81] Vectorized reward computation --- stable_baselines3/common/bit_flipping_env.py | 10 +- stable_baselines3/her/her_replay_buffer.py | 113 ++++++------------- 2 files changed, 39 insertions(+), 84 deletions(-) diff --git a/stable_baselines3/common/bit_flipping_env.py b/stable_baselines3/common/bit_flipping_env.py index fd9998a6c1..527eab3ee0 100644 --- a/stable_baselines3/common/bit_flipping_env.py +++ b/stable_baselines3/common/bit_flipping_env.py @@ -103,7 +103,7 @@ def step(self, action: Union[np.ndarray, int]) -> GymStepReturn: else: self.state[action] = 1 - self.state[action] obs = self._get_obs() - reward = self.compute_reward(obs["achieved_goal"], obs["desired_goal"], None) + reward = float(self.compute_reward(obs["achieved_goal"], obs["desired_goal"], None)) done = reward == 0 self.current_step += 1 # Episode terminate when we reached the goal or the max number of steps @@ -111,11 +111,11 @@ def step(self, action: Union[np.ndarray, int]) -> GymStepReturn: done = done or self.current_step >= self.max_steps return obs, reward, done, info - def compute_reward(self, achieved_goal: np.ndarray, desired_goal: np.ndarray, _info) -> float: + def compute_reward(self, achieved_goal: np.ndarray, desired_goal: np.ndarray, _info) -> np.float32: # Deceptive reward: it is positive only when the goal is achieved - if self.discrete_obs_space: - return 0.0 if achieved_goal == desired_goal else -1.0 - return 0.0 if (achieved_goal == desired_goal).all() else -1.0 + # vectorized version + distance = np.linalg.norm(achieved_goal - desired_goal, axis=-1) + return -(distance > 0).astype(np.float32) def render(self, mode: str = "human") -> Optional[np.ndarray]: if mode == "rgb_array": diff --git a/stable_baselines3/her/her_replay_buffer.py b/stable_baselines3/her/her_replay_buffer.py index 709c3229af..8a29221cf4 100644 --- a/stable_baselines3/her/her_replay_buffer.py +++ b/stable_baselines3/her/her_replay_buffer.py @@ -82,73 +82,7 @@ def sample(self, batch_size: int, env: Optional[VecNormalize] = None) -> ReplayB to normalize the observations/rewards when sampling :return: (ReplayBufferSamples) """ - return self._sample_transitions_2(batch_size, env) - - def _sample_transitions(self, batch_size: int, env: Optional[VecNormalize]) -> ReplayBufferSamples: - """ - :param batch_size: (int) Number of element to sample - :param env: (Optional[VecNormalize]) associated gym VecEnv - to normalize the observations/rewards when sampling - :return: (ReplayBufferSamples) - """ - # Select which episodes to use - episode_indices = np.random.randint(0, self.n_episodes_stored, batch_size) - her_episode_indices = set(episode_indices[: int(self.her_ratio * batch_size)]) - - observations = np.zeros((batch_size, self.env.obs_dim 
+ self.env.goal_dim), dtype=self.observation_space.dtype) - actions = np.zeros((batch_size, self.action_dim), dtype=self.action_space.dtype) - next_observations = np.zeros((batch_size, self.env.obs_dim + self.env.goal_dim), dtype=self.observation_space.dtype) - dones = np.zeros((batch_size, 1), dtype=np.float32) - rewards = np.zeros((batch_size, 1), dtype=np.float32) - - for idx, ep_length in enumerate(self.episode_lengths[episode_indices]): - her_sampling = episode_indices[idx] in her_episode_indices - - if her_sampling and self.goal_selection_strategy == GoalSelectionStrategy.FUTURE: - max_timestep = ep_length - 1 - # handle the case of 1 step episode: we must use a normal transition then - if max_timestep == 0: - max_timestep = ep_length - her_sampling = False - else: - max_timestep = ep_length - - transition_idx = np.random.randint(max_timestep) - transition = {key: self.buffer[key][episode_indices[idx], transition_idx].copy() for key in self.buffer.keys()} - - if her_sampling: - episode = self.buffer["achieved_goal"][episode_indices[idx]][: self.episode_lengths[episode_indices[idx]]] - # TODO: check that episode lenght is taken into account for all sampling strategies - new_goal = self.sample_goal( - self.goal_selection_strategy, transition_idx, episode, self.buffer["achieved_goal"], online_sampling=True - ) - # observation - transition["desired_goal"] = new_goal - # next observation - transition["next_desired_goal"] = new_goal - # TODO: vectorized computation of reward - transition["reward"] = self.env.env_method("compute_reward", transition["next_achieved_goal"], new_goal, None) - # TODO: check that it does not change anything - # transition["done"] = False - - # concatenate observation with (desired) goal - obs = ObsDictWrapper.convert_dict(transition) - next_obs = ObsDictWrapper.convert_dict(transition, observation_key="next_obs") - observations[idx] = obs - next_observations[idx] = next_obs - actions[idx] = transition["action"] - dones[idx] = transition["done"] - rewards[idx] = transition["reward"] - - data = ( - self._normalize_obs(observations, env), - actions, - self._normalize_obs(next_observations, env), - dones, - self._normalize_reward(rewards, env), - ) - - return ReplayBufferSamples(*tuple(map(self.to_torch, data))) + return self._sample_transitions(batch_size, env) @staticmethod def sample_goal( @@ -204,7 +138,7 @@ def sample_goal( else: raise ValueError("Strategy for sampling goals not supported!") - def _sample_transitions_2(self, batch_size: int, env: Optional[VecNormalize]) -> ReplayBufferSamples: + def _sample_transitions(self, batch_size: int, env: Optional[VecNormalize]) -> ReplayBufferSamples: """ :param batch_size: (int) Number of element to sample :param env: (Optional[VecNormalize]) associated gym VecEnv @@ -213,22 +147,43 @@ def _sample_transitions_2(self, batch_size: int, env: Optional[VecNormalize]) -> """ # Select which episodes to use episode_indices = np.random.randint(0, self.n_episodes_stored, batch_size) - transitions_indices = np.random.randint(self.episode_lengths[episode_indices], size=batch_size) - transitions = {key: self.buffer[key][episode_indices, transitions_indices].copy() for key in self.buffer.keys()} + her_indices = np.arange(batch_size)[: int(self.her_ratio * batch_size)] + # her_indices = np.random.permutation(batch_size)[: int(self.her_ratio * batch_size)] + ep_length = self.episode_lengths[episode_indices] - her_indices = np.random.permutation(batch_size)[: int(self.her_ratio * batch_size)] - future_offset = 
np.random.uniform(size=batch_size) * (self.episode_lengths[episode_indices] - transitions_indices) - future_offset = future_offset.astype(int) - future_indices = (transitions_indices + future_offset)[her_indices] - # future_indices = (transitions_indices + 1 + future_offset)[her_indices] + if self.goal_selection_strategy == GoalSelectionStrategy.FUTURE: + # restrict the sampling domain when ep_length > 1 + # otherwise filter out the indices + her_indices = her_indices[ep_length[her_indices] > 1] + ep_length[her_indices] -= 1 - future_achieved_goals = self.buffer["achieved_goal"][episode_indices[her_indices], future_indices] - transitions["desired_goal"][her_indices] = future_achieved_goals + transitions_indices = np.random.randint(ep_length, size=batch_size) + transitions = {key: self.buffer[key][episode_indices, transitions_indices].copy() for key in self.buffer.keys()} + + # vectorized version of future sampling (fast) + # future_offset = np.random.uniform(size=batch_size) * (self.episode_lengths[episode_indices] - transitions_indices) + # future_offset = future_offset.astype(int) + # future_indices = (transitions_indices + future_offset)[her_indices] + # # future_indices = (transitions_indices + 1 + future_offset)[her_indices] + # future_achieved_goals = self.buffer["achieved_goal"][episode_indices[her_indices], future_indices] + # transitions["desired_goal"][her_indices] = future_achieved_goals for idx in her_indices: - transitions["reward"][idx] = self.env.env_method( - "compute_reward", transitions["next_achieved_goal"][idx], transitions["desired_goal"][idx], None + episode = self.buffer["achieved_goal"][episode_indices[idx]][: self.episode_lengths[episode_indices[idx]]] + # TODO: check that episode length is taken into account for all sampling strategies + new_goal = self.sample_goal( + self.goal_selection_strategy, + transitions_indices[idx], + episode, + self.buffer["achieved_goal"], + online_sampling=True, ) + transitions["desired_goal"][idx] = new_goal + + # Vectorized computation + transitions["reward"][her_indices] = self.env.env_method( + "compute_reward", transitions["next_achieved_goal"][her_indices], transitions["desired_goal"][her_indices], None + ) # concatenate observation with (desired) goal observations = ObsDictWrapper.convert_dict(transitions) From 2525eb020df0effabd4e970f7d2cdbb0a4900bc4 Mon Sep 17 00:00:00 2001 From: Antonin Raffin Date: Tue, 25 Aug 2020 18:19:01 +0200 Subject: [PATCH 31/81] Vectorized goal sampling --- stable_baselines3/her/her_replay_buffer.py | 97 ++++++++++++---------- tests/test_her.py | 8 +- 2 files changed, 57 insertions(+), 48 deletions(-) diff --git a/stable_baselines3/her/her_replay_buffer.py b/stable_baselines3/her/her_replay_buffer.py index 6223765680..756e731ae1 100644 --- a/stable_baselines3/her/her_replay_buffer.py +++ b/stable_baselines3/her/her_replay_buffer.py @@ -70,7 +70,7 @@ def __init__( for key, dim in input_shape.items() } # episode length storage, needed for episodes which has less steps than the maximum length - self.episode_lengths = np.empty(self.max_episode_stored, dtype=np.uint64) + self.episode_lengths = np.zeros(self.max_episode_stored, dtype=np.int64) self.goal_selection_strategy = goal_selection_strategy # percentage of her indices @@ -92,8 +92,7 @@ def sample_goal( episode: list, observations: Union[list, np.ndarray], obs_dim: int = None, - online_sampling: bool = False, - ) -> Union[np.ndarray, None]: + ) -> np.ndarray: """ Sample a goal based on goal_selection_strategy. 
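The hunk below replaces the per-sample Python loop with vectorized goal selection. As a quick illustration of the indexing trick behind the "future" strategy: for every HER-selected sample a random index strictly after the current transition is drawn inside the same stored episode, the achieved goal found there becomes the new desired goal, and all rewards are recomputed in one batched call (using the vectorized bit-flipping reward from the previous patch). The standalone toy sketch below uses invented array names and shapes and does not mirror the library's internals exactly.

import numpy as np


def relabel_future_goals(achieved_goals, ep_lengths, episode_indices, transition_indices, her_mask):
    """Toy vectorized 'future' relabelling (illustrative only).

    achieved_goals: (n_episodes, max_episode_length, goal_dim)
    ep_lengths: (n_episodes,) true length of each stored episode
    episode_indices, transition_indices: (batch_size,) indices of the sampled transitions
    her_mask: (batch_size,) boolean mask of the samples that get a relabelled goal
    """
    her_episodes = episode_indices[her_mask]
    her_transitions = transition_indices[her_mask]
    # one random index per sample, strictly after the current transition, inside the episode
    future_indices = np.random.randint(her_transitions + 1, ep_lengths[her_episodes])
    return achieved_goals[her_episodes, future_indices]


# toy data: 4 stored episodes of length 5 with 1-d goals
achieved_goals = np.arange(4 * 5, dtype=np.float32).reshape(4, 5, 1)
ep_lengths = np.full(4, 5)

batch_size = 6
her_ratio = 0.8
episode_indices = np.random.randint(0, 4, size=batch_size)
# sample from [0, ep_length - 1) so that a strictly later ("future") index always exists
transition_indices = np.random.randint(ep_lengths[episode_indices] - 1)
her_mask = np.arange(batch_size) < int(her_ratio * batch_size)

new_goals = relabel_future_goals(achieved_goals, ep_lengths, episode_indices, transition_indices, her_mask)

# rewards are recomputed in a single vectorized call; for the bit-flipping reward:
next_achieved = achieved_goals[episode_indices[her_mask], transition_indices[her_mask] + 1]
rewards = -(np.linalg.norm(next_achieved - new_goals, axis=-1) > 0).astype(np.float32)
print(new_goals.shape, rewards.shape)  # (4, 1) (4,)

Restricting the HER samples to transition indices in [0, ep_length - 1) guarantees that a strictly later index always exists, which is the same reason the code in this patch shrinks the sampling domain (and drops one-step episodes) for the FUTURE strategy.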
@@ -103,42 +102,71 @@ def sample_goal( :param episode: (list) Current episode. :param observations: (list or np.ndarray) :param obs_dim: (int) Dimension of real observation without goal. It is needed for the random strategy. - :param online_sampling: (bool) Sample HER transitions online. - :return: (np.ndarray or None) Return sampled goal. + :return: (np.ndarray) Return sampled goal. """ if goal_selection_strategy == GoalSelectionStrategy.FINAL: # replay with final state of current episode - if online_sampling: - return episode[-1] return episode[-1][0]["achieved_goal"] elif goal_selection_strategy == GoalSelectionStrategy.FUTURE: # replay with random state which comes from the same episode and was observed after current transition index = np.random.choice(np.arange(sample_idx + 1, len(episode))) - if online_sampling: - return episode[index] return episode[index][0]["achieved_goal"] elif goal_selection_strategy == GoalSelectionStrategy.EPISODE: # replay with random state which comes from the same episode as current transition index = np.random.choice(np.arange(len(episode))) - if online_sampling: - return episode[index] return episode[index][0]["achieved_goal"] elif goal_selection_strategy == GoalSelectionStrategy.RANDOM: - if online_sampling: - # replay with random state from the entire replay buffer - ep_idx = np.random.choice(np.arange(len(observations))) - trans_idx = np.random.choice(np.arange(len(observations[ep_idx]))) - return observations[ep_idx][trans_idx] - else: - # replay with random state from the entire replay buffer - index = np.random.choice(np.arange(len(observations))) - obs = observations[index] - # get only the observation part - obs_array = obs[:, :obs_dim] - return obs_array + # replay with random state from the entire replay buffer + index = np.random.choice(np.arange(len(observations))) + obs = observations[index] + # get only the observation part + # TODO: check that line (or the comment at least) + obs_array = obs[:, :obs_dim] + return obs_array + else: + raise ValueError("Strategy for sampling goals not supported!") + + def vectorized_sample_goal( + self, episode_indices: np.ndarray, her_indices: np.ndarray, transitions_indices: np.ndarray + ) -> np.ndarray: + """ + Sample goals based on goal_selection_strategy. + This is the vectorized (faster) version of ``sample_goal()`` + + :param goal_selection_strategy: (GoalSelectionStrategy ) Strategy for sampling goals for replay. + One of ['episode', 'final', 'future', 'random'] + :param sample_idx: (int) Index of current transition. + :param episode: (list) Current episode. + :param observations: (list or np.ndarray) + :param obs_dim: (int) Dimension of real observation without goal. It is needed for the random strategy. + :param online_sampling: (bool) Sample HER transitions online. + :return: (np.ndarray) Return sampled goals. 
+ """ + her_episode_indices = episode_indices[her_indices] + + if self.goal_selection_strategy == GoalSelectionStrategy.FINAL: + # replay with final state of current episode + transitions_indices = self.episode_lengths[her_indices] - 1 + + elif self.goal_selection_strategy == GoalSelectionStrategy.FUTURE: + # replay with random state which comes from the same episode and was observed after current transition + transitions_indices = np.random.randint( + transitions_indices[her_indices] + 1, self.episode_lengths[her_episode_indices] + ) + + elif self.goal_selection_strategy == GoalSelectionStrategy.EPISODE: + # replay with random state which comes from the same episode as current transition + transitions_indices = np.random.randint(self.episode_lengths[her_episode_indices]) + + elif self.goal_selection_strategy == GoalSelectionStrategy.RANDOM: + # replay with random state from the entire replay buffer + her_episode_indices = np.random.randint(self.n_episodes_stored, size=len(her_indices)) + transitions_indices = np.random.randint(self.episode_lengths[her_episode_indices]) else: raise ValueError("Strategy for sampling goals not supported!") + return self.buffer["achieved_goal"][her_episode_indices, transitions_indices] + def _sample_transitions(self, batch_size: int, env: Optional[VecNormalize]) -> ReplayBufferSamples: """ :param batch_size: (int) Number of element to sample @@ -158,28 +186,11 @@ def _sample_transitions(self, batch_size: int, env: Optional[VecNormalize]) -> R her_indices = her_indices[ep_length[her_indices] > 1] ep_length[her_indices] -= 1 - transitions_indices = np.random.randint(ep_length, size=batch_size) + transitions_indices = np.random.randint(ep_length) transitions = {key: self.buffer[key][episode_indices, transitions_indices].copy() for key in self.buffer.keys()} - # vectorized version of future sampling (fast) - # future_offset = np.random.uniform(size=batch_size) * (self.episode_lengths[episode_indices] - transitions_indices) - # future_offset = future_offset.astype(int) - # future_indices = (transitions_indices + future_offset)[her_indices] - # # future_indices = (transitions_indices + 1 + future_offset)[her_indices] - # future_achieved_goals = self.buffer["achieved_goal"][episode_indices[her_indices], future_indices] - # transitions["desired_goal"][her_indices] = future_achieved_goals - - for idx in her_indices: - episode = self.buffer["achieved_goal"][episode_indices[idx]][: self.episode_lengths[episode_indices[idx]]] - # TODO: check that episode length is taken into account for all sampling strategies - new_goal = self.sample_goal( - self.goal_selection_strategy, - transitions_indices[idx], - episode, - self.buffer["achieved_goal"], - online_sampling=True, - ) - transitions["desired_goal"][idx] = new_goal + new_goals = self.vectorized_sample_goal(episode_indices, her_indices, transitions_indices) + transitions["desired_goal"][her_indices] = new_goals # Vectorized computation transitions["reward"][her_indices] = self.env.env_method( diff --git a/tests/test_her.py b/tests/test_her.py index 7bb62dc830..ed7b7ee175 100644 --- a/tests/test_her.py +++ b/tests/test_her.py @@ -12,11 +12,9 @@ from stable_baselines3.common.vec_env.obs_dict_wrapper import ObsDictWrapper from stable_baselines3.her.goal_selection_strategy import GoalSelectionStrategy from stable_baselines3.her.her import HER -from stable_baselines3.sac.policies import SACPolicy -from stable_baselines3.td3.policies import MlpPolicy, TD3Policy -@pytest.mark.parametrize("model_class, policy", [(SAC, SACPolicy), 
(TD3, TD3Policy), (DDPG, MlpPolicy)]) +@pytest.mark.parametrize("model_class, policy", [(SAC, "MlpPolicy"), (TD3, "MlpPolicy"), (DDPG, "MlpPolicy")]) @pytest.mark.parametrize("online_sampling", [True, False]) def test_her(model_class, policy, online_sampling): """ @@ -109,7 +107,7 @@ def test_goal_selection_strategy(goal_selection_strategy, online_sampling): env = DummyVecEnv([lambda: env]) model = HER( - SACPolicy, + "MlpPolicy", env, SAC, goal_selection_strategy=goal_selection_strategy, @@ -122,7 +120,7 @@ def test_goal_selection_strategy(goal_selection_strategy, online_sampling): model.learn(total_timesteps=200, callback=None) -@pytest.mark.parametrize("model_class, policy", [(SAC, SACPolicy), (TD3, TD3Policy), (DDPG, MlpPolicy)]) +@pytest.mark.parametrize("model_class, policy", [(SAC, "MlpPolicy"), (TD3, "MlpPolicy"), (DDPG, "MlpPolicy")]) def test_save_load(tmp_path, model_class, policy): """ Test if 'save' and 'load' saves and loads model correctly From c57c6ef16f4c859ced1e05b8525806105fcfd956 Mon Sep 17 00:00:00 2001 From: Megan Klaiber Date: Wed, 26 Aug 2020 01:13:03 +0200 Subject: [PATCH 32/81] Update time limit for episodes in online her sampling. --- stable_baselines3/her/her.py | 35 +++++++++++++++++++++++++++++------ 1 file changed, 29 insertions(+), 6 deletions(-) diff --git a/stable_baselines3/her/her.py b/stable_baselines3/her/her.py index 8007fa16bb..27bcfa84cf 100644 --- a/stable_baselines3/her/her.py +++ b/stable_baselines3/her/her.py @@ -4,7 +4,6 @@ import gym import numpy as np -from gym.wrappers import TimeLimit from stable_baselines3.common.base_class import BaseAlgorithm from stable_baselines3.common.buffers import ReplayBuffer @@ -15,12 +14,39 @@ from stable_baselines3.common.save_util import load_from_zip_file, recursive_getattr, recursive_setattr from stable_baselines3.common.type_aliases import GymEnv, MaybeCallback, RolloutReturn from stable_baselines3.common.utils import check_for_correct_spaces -from stable_baselines3.common.vec_env import VecEnv +from stable_baselines3.common.vec_env import VecEnv, VecEnvWrapper from stable_baselines3.common.vec_env.obs_dict_wrapper import ObsDictWrapper from stable_baselines3.her.goal_selection_strategy import KEY_TO_GOAL_STRATEGY, GoalSelectionStrategy from stable_baselines3.her.her_replay_buffer import HerReplayBuffer +def get_time_limit(env: VecEnv, current_max_episode_length: int) -> int: + """ + Get time limit from environment. + + :param env: (VecEnv) Environment from which we want to get the time limit. + :param current_max_episode_length: (int) Current value for max_episode_length. 
+ :return: (int) max episode length + """ + # unwrap environment first + env_tmp = env + while isinstance(env_tmp, VecEnvWrapper): + env_tmp = env_tmp.venv + # try to get the attribute from environment + try: + current_max_episode_length = env_tmp.get_attr("_max_episode_steps")[0] + # if not available check if a valid value was passed as an argument + except AttributeError: + # throw an error when we have no valid value passed + if current_max_episode_length <= 0: + raise ValueError("The maximum episode length must be greater than zero.") + else: + # if valid value was passed take this as time limit + current_max_episode_length = current_max_episode_length + + return current_max_episode_length + + class HER(BaseAlgorithm): """ Hindsight Experience Replay (HER) @@ -84,10 +110,7 @@ def __init__( # counter for steps in episode self.episode_steps = 0 if self.online_sampling: - if isinstance(self.env, TimeLimit): - self.max_episode_length = env._max_episode_steps # pytype: disable=attribute-error - elif self.max_episode_length <= 0: - raise ValueError("The maximum episode length must be greater than zero.") + self.max_episode_length = get_time_limit(self.env, self.max_episode_length) self.model.replay_buffer = HerReplayBuffer( self.env, self.buffer_size, From 902267c1c2bd9fce0aa103759cdeca335c19472d Mon Sep 17 00:00:00 2001 From: Antonin RAFFIN Date: Wed, 26 Aug 2020 12:10:45 +0200 Subject: [PATCH 33/81] Fix max episode length inference --- stable_baselines3/her/her.py | 37 ++++++++++++++++++------------------ 1 file changed, 18 insertions(+), 19 deletions(-) diff --git a/stable_baselines3/her/her.py b/stable_baselines3/her/her.py index 27bcfa84cf..fe26cd2cf4 100644 --- a/stable_baselines3/her/her.py +++ b/stable_baselines3/her/her.py @@ -14,13 +14,13 @@ from stable_baselines3.common.save_util import load_from_zip_file, recursive_getattr, recursive_setattr from stable_baselines3.common.type_aliases import GymEnv, MaybeCallback, RolloutReturn from stable_baselines3.common.utils import check_for_correct_spaces -from stable_baselines3.common.vec_env import VecEnv, VecEnvWrapper +from stable_baselines3.common.vec_env import VecEnv from stable_baselines3.common.vec_env.obs_dict_wrapper import ObsDictWrapper from stable_baselines3.her.goal_selection_strategy import KEY_TO_GOAL_STRATEGY, GoalSelectionStrategy from stable_baselines3.her.her_replay_buffer import HerReplayBuffer -def get_time_limit(env: VecEnv, current_max_episode_length: int) -> int: +def get_time_limit(env: VecEnv, current_max_episode_length: Optional[int]) -> int: """ Get time limit from environment. @@ -28,22 +28,18 @@ def get_time_limit(env: VecEnv, current_max_episode_length: int) -> int: :param current_max_episode_length: (int) Current value for max_episode_length. 
:return: (int) max episode length """ - # unwrap environment first - env_tmp = env - while isinstance(env_tmp, VecEnvWrapper): - env_tmp = env_tmp.venv # try to get the attribute from environment - try: - current_max_episode_length = env_tmp.get_attr("_max_episode_steps")[0] - # if not available check if a valid value was passed as an argument - except AttributeError: - # throw an error when we have no valid value passed - if current_max_episode_length <= 0: - raise ValueError("The maximum episode length must be greater than zero.") - else: - # if valid value was passed take this as time limit - current_max_episode_length = current_max_episode_length - + if current_max_episode_length is None: + try: + current_max_episode_length = env.get_attr("spec")[0].max_episode_steps + # if not available check if a valid value was passed as an argument + except AttributeError: + raise ValueError( + "The max episode length could not be inferred." + "You must specify a `max_episode_steps` when registering the environment, " + "use a `gym.wrappers.TimeLimit` wrapper " + "or pass `max_episode_length` to the model constructor" + ) return current_max_episode_length @@ -51,6 +47,8 @@ class HER(BaseAlgorithm): """ Hindsight Experience Replay (HER) + Paper: https://arxiv.org/abs/1707.01495 + :param policy: (BasePolicy or str) The policy model to use. :param env: (GymEnv or str) The environment to learn from (if registered in Gym, can be str) :param model_class: (OffPolicyAlgorithm) Off policy model which will be used with hindsight experience replay. (SAC, TD3) @@ -60,7 +58,8 @@ class HER(BaseAlgorithm): :param online_sampling: (bool) Sample HER transitions online. :param learning_rate: (float or callable) learning rate for the optimizer, it can be a function of the current progress remaining (from 1 to 0) - :param max_episode_length: (int) The length of an episode. (time horizon) + :param max_episode_length: (int) The maximum length of an episode. If not specified, + it will be automatically inferred if the environment uses a ``gym.wrappers.TimeLimit`` wrapper """ def __init__( @@ -72,7 +71,7 @@ def __init__( goal_selection_strategy: Union[GoalSelectionStrategy, str] = "future", online_sampling: bool = False, learning_rate: Union[float, Callable] = 3e-4, - max_episode_length: int = -1, + max_episode_length: Optional[int] = None, *args, **kwargs, ): From fc7f647f4b51f1da7328751d9095191aa9bef836 Mon Sep 17 00:00:00 2001 From: Antonin RAFFIN Date: Wed, 26 Aug 2020 12:46:44 +0200 Subject: [PATCH 34/81] Bug fix for Fetch envs --- stable_baselines3/common/vec_env/obs_dict_wrapper.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/stable_baselines3/common/vec_env/obs_dict_wrapper.py b/stable_baselines3/common/vec_env/obs_dict_wrapper.py index d524d5e6de..22fbae4060 100644 --- a/stable_baselines3/common/vec_env/obs_dict_wrapper.py +++ b/stable_baselines3/common/vec_env/obs_dict_wrapper.py @@ -63,4 +63,4 @@ def convert_dict( :param goal_key: (str) Key of (desired) goal in dicitonary. 
:return: (np.ndarray) """ - return np.concatenate([observation_dict[observation_key], observation_dict[goal_key]], axis=1) + return np.concatenate([observation_dict[observation_key], observation_dict[goal_key]], axis=-1) From 0757a73fb5a06e1a00ddd50bb71c2c21718eb296 Mon Sep 17 00:00:00 2001 From: Antonin Raffin Date: Thu, 27 Aug 2020 11:20:38 +0200 Subject: [PATCH 35/81] Fix for HER + gSDE --- stable_baselines3/her/her.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/stable_baselines3/her/her.py b/stable_baselines3/her/her.py index fe26cd2cf4..a1615dcb46 100644 --- a/stable_baselines3/her/her.py +++ b/stable_baselines3/her/her.py @@ -223,7 +223,7 @@ def collect_rollouts( assert isinstance(env, VecEnv), "You must pass a VecEnv" assert env.num_envs == 1, "OffPolicyAlgorithm only support single environment" - if self.use_sde: + if self.model.use_sde: self.actor.reset_noise() callback.on_rollout_start() @@ -238,7 +238,7 @@ def collect_rollouts( observation = self._last_obs self._last_obs = ObsDictWrapper.convert_dict(observation) - if self.use_sde and self.sde_sample_freq > 0 and total_steps % self.sde_sample_freq == 0: + if self.model.use_sde and self.model.sde_sample_freq > 0 and total_steps % self.model.sde_sample_freq == 0: # Sample a new noise matrix self.actor.reset_noise() From eb89099bddff8af4efe7412cf532ebaba2049abe Mon Sep 17 00:00:00 2001 From: Antonin Raffin Date: Thu, 27 Aug 2020 12:55:36 +0200 Subject: [PATCH 36/81] Reformat (new black version) --- stable_baselines3/common/buffers.py | 6 +++--- stable_baselines3/common/policies.py | 5 ++++- stable_baselines3/common/save_util.py | 4 +++- stable_baselines3/dqn/policies.py | 5 ++++- stable_baselines3/her/her.py | 6 +++++- tests/test_custom_policy.py | 2 +- tests/test_distributions.py | 6 +++++- tests/test_her.py | 14 ++++++++++++-- tests/test_vec_envs.py | 2 +- 9 files changed, 38 insertions(+), 12 deletions(-) diff --git a/stable_baselines3/common/buffers.py b/stable_baselines3/common/buffers.py index 4534063a28..6c58953845 100644 --- a/stable_baselines3/common/buffers.py +++ b/stable_baselines3/common/buffers.py @@ -171,12 +171,12 @@ def __init__( mem_available = psutil.virtual_memory().available self.optimize_memory_usage = optimize_memory_usage - self.observations = np.zeros((self.buffer_size, self.n_envs,) + self.obs_shape, dtype=observation_space.dtype) + self.observations = np.zeros((self.buffer_size, self.n_envs) + self.obs_shape, dtype=observation_space.dtype) if optimize_memory_usage: # `observations` contains also the next observation self.next_observations = None else: - self.next_observations = np.zeros((self.buffer_size, self.n_envs,) + self.obs_shape, dtype=observation_space.dtype) + self.next_observations = np.zeros((self.buffer_size, self.n_envs) + self.obs_shape, dtype=observation_space.dtype) self.actions = np.zeros((self.buffer_size, self.n_envs, self.action_dim), dtype=action_space.dtype) self.rewards = np.zeros((self.buffer_size, self.n_envs), dtype=np.float32) self.dones = np.zeros((self.buffer_size, self.n_envs), dtype=np.float32) @@ -284,7 +284,7 @@ def __init__( self.reset() def reset(self) -> None: - self.observations = np.zeros((self.buffer_size, self.n_envs,) + self.obs_shape, dtype=np.float32) + self.observations = np.zeros((self.buffer_size, self.n_envs) + self.obs_shape, dtype=np.float32) self.actions = np.zeros((self.buffer_size, self.n_envs, self.action_dim), dtype=np.float32) self.rewards = np.zeros((self.buffer_size, self.n_envs), dtype=np.float32) self.returns = 
np.zeros((self.buffer_size, self.n_envs), dtype=np.float32) diff --git a/stable_baselines3/common/policies.py b/stable_baselines3/common/policies.py index babcc99464..3450d82920 100644 --- a/stable_baselines3/common/policies.py +++ b/stable_baselines3/common/policies.py @@ -703,7 +703,10 @@ def __init__( n_critics: int = 2, ): super().__init__( - observation_space, action_space, features_extractor=features_extractor, normalize_images=normalize_images, + observation_space, + action_space, + features_extractor=features_extractor, + normalize_images=normalize_images, ) action_dim = get_action_dim(self.action_space) diff --git a/stable_baselines3/common/save_util.py b/stable_baselines3/common/save_util.py index 51fa8cd175..326db1e8a6 100644 --- a/stable_baselines3/common/save_util.py +++ b/stable_baselines3/common/save_util.py @@ -350,7 +350,9 @@ def load_from_pkl(path: Union[str, pathlib.Path, io.BufferedIOBase], verbose=0) def load_from_zip_file( - load_path: Union[str, pathlib.Path, io.BufferedIOBase], load_data: bool = True, verbose=0, + load_path: Union[str, pathlib.Path, io.BufferedIOBase], + load_data: bool = True, + verbose=0, ) -> (Tuple[Optional[Dict[str, Any]], Optional[TensorDict], Optional[TensorDict]]): """ Load model data from a .zip archive diff --git a/stable_baselines3/dqn/policies.py b/stable_baselines3/dqn/policies.py index f5001c7548..ebbcd34bff 100644 --- a/stable_baselines3/dqn/policies.py +++ b/stable_baselines3/dqn/policies.py @@ -31,7 +31,10 @@ def __init__( normalize_images: bool = True, ): super(QNetwork, self).__init__( - observation_space, action_space, features_extractor=features_extractor, normalize_images=normalize_images, + observation_space, + action_space, + features_extractor=features_extractor, + normalize_images=normalize_images, ) if net_arch is None: diff --git a/stable_baselines3/her/her.py b/stable_baselines3/her/her.py index a1615dcb46..86c0a8a060 100644 --- a/stable_baselines3/her/her.py +++ b/stable_baselines3/her/her.py @@ -81,7 +81,11 @@ def __init__( # model initialization self.model_class = model_class self.model = model_class( - policy=policy, env=self.env, learning_rate=learning_rate, *args, **kwargs, # pytype: disable=wrong-keyword-args + policy=policy, + env=self.env, + learning_rate=learning_rate, + *args, + **kwargs, # pytype: disable=wrong-keyword-args ) self.verbose = self.model.verbose diff --git a/tests/test_custom_policy.py b/tests/test_custom_policy.py index c1e08dfacf..95f4a7c9ad 100644 --- a/tests/test_custom_policy.py +++ b/tests/test_custom_policy.py @@ -22,7 +22,7 @@ def test_flexible_mlp(model_class, net_arch): _ = model_class("MlpPolicy", "CartPole-v1", policy_kwargs=dict(net_arch=net_arch), n_steps=100).learn(1000) -@pytest.mark.parametrize("net_arch", [[4], [4, 4],]) +@pytest.mark.parametrize("net_arch", [[4], [4, 4]]) @pytest.mark.parametrize("model_class", [SAC, TD3]) def test_custom_offpolicy(model_class, net_arch): _ = model_class("MlpPolicy", "Pendulum-v0", policy_kwargs=dict(net_arch=net_arch)).learn(1000) diff --git a/tests/test_distributions.py b/tests/test_distributions.py index a73b81eded..490f80eb3b 100644 --- a/tests/test_distributions.py +++ b/tests/test_distributions.py @@ -67,7 +67,11 @@ def test_sde_distribution(): # TODO: analytical form for squashed Gaussian? 
@pytest.mark.parametrize( - "dist", [DiagGaussianDistribution(N_ACTIONS), StateDependentNoiseDistribution(N_ACTIONS, squash_output=False),] + "dist", + [ + DiagGaussianDistribution(N_ACTIONS), + StateDependentNoiseDistribution(N_ACTIONS, squash_output=False), + ], ) def test_entropy(dist): # The entropy can be approximated by averaging the negative log likelihood diff --git a/tests/test_her.py b/tests/test_her.py index ed7b7ee175..34c197b224 100644 --- a/tests/test_her.py +++ b/tests/test_her.py @@ -26,7 +26,12 @@ def test_her(model_class, policy, online_sampling): # Create action noise n_actions = env.action_space.shape[0] - action_noise = OrnsteinUhlenbeckActionNoise(np.zeros(n_actions,), 0.2 * np.ones((n_actions,))) + action_noise = OrnsteinUhlenbeckActionNoise( + np.zeros( + n_actions, + ), + 0.2 * np.ones((n_actions,)), + ) model = HER( policy, @@ -131,7 +136,12 @@ def test_save_load(tmp_path, model_class, policy): # Create action noise n_actions = env.action_space.shape[0] - action_noise = OrnsteinUhlenbeckActionNoise(np.zeros(n_actions,), 0.2 * np.ones((n_actions,))) + action_noise = OrnsteinUhlenbeckActionNoise( + np.zeros( + n_actions, + ), + 0.2 * np.ones((n_actions,)), + ) # create model model = HER( diff --git a/tests/test_vec_envs.py b/tests/test_vec_envs.py index 8c33341c57..141ca6a65f 100644 --- a/tests/test_vec_envs.py +++ b/tests/test_vec_envs.py @@ -225,7 +225,7 @@ def make_env(): def check_vecenv_obs(obs, space): """Helper method to check observations from multiple environments each belong to - the appropriate observation space.""" + the appropriate observation space.""" assert obs.shape[0] == N_ENVS for value in obs: assert space.contains(value) From d1adff61fd8aff89db797dbb0450a74d36f10fb4 Mon Sep 17 00:00:00 2001 From: Megan Klaiber Date: Thu, 27 Aug 2020 14:21:33 +0200 Subject: [PATCH 37/81] Added info dict to compute new reward. Check her_replay_buffer again. 
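The patch below starts storing the environment info dicts and passing them to compute_reward when goals are relabelled. For reference, this leans on the gym.GoalEnv contract, where compute_reward(achieved_goal, desired_goal, info) must be recomputable for arbitrary goals. The toy class here is only an illustration (not SB3's BitFlippingEnv): a sparse reward written to also accept batched goals, which is what the vectorized env_method("compute_reward", ...) calls rely on.

    import numpy as np

    class SparseGoalReward:
        """Toy stand-in for a gym.GoalEnv reward: 0 when the goal is reached, -1 otherwise."""

        def compute_reward(self, achieved_goal, desired_goal, info):
            # `info` is unused in this toy example; some environments need it,
            # which is why the patch starts threading it through the buffer.
            achieved_goal = np.asarray(achieved_goal)
            desired_goal = np.asarray(desired_goal)
            missed = np.abs(achieved_goal - desired_goal).sum(axis=-1) > 0
            return -missed.astype(np.float32)

    # Works for a single pair or for a whole relabelled batch at once:
    rewards = SparseGoalReward().compute_reward(
        np.zeros((4, 3)), np.ones((4, 3)), [{} for _ in range(4)]
    )  # -> array([-1., -1., -1., -1.], dtype=float32)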
--- stable_baselines3/her/her.py | 8 ++++---- stable_baselines3/her/her_replay_buffer.py | 8 +++++++- 2 files changed, 11 insertions(+), 5 deletions(-) diff --git a/stable_baselines3/her/her.py b/stable_baselines3/her/her.py index 86c0a8a060..87505282d4 100644 --- a/stable_baselines3/her/her.py +++ b/stable_baselines3/her/her.py @@ -276,10 +276,10 @@ def collect_rollouts( self.model._last_original_obs = self._last_original_obs if self.online_sampling: - self.replay_buffer.add(self._last_original_obs, new_obs_, buffer_action, reward_, done) + self.replay_buffer.add(self._last_original_obs, new_obs_, buffer_action, reward_, done, infos) else: # add current transition to episode storage - self._episode_storage.append((self._last_original_obs, new_obs_, buffer_action, reward_, done)) + self._episode_storage.append((self._last_original_obs, new_obs_, buffer_action, reward_, done, infos)) self._last_obs = new_obs self.model._last_obs = self._last_obs @@ -345,7 +345,7 @@ def _store_transitions(self) -> None: # iterate over current episodes transitions for idx, trans in enumerate(self._episode_storage): - observation, new_observation, action, reward, done = trans + observation, new_observation, action, reward, done, infos = trans # concatenate observation with (desired) goal obs = ObsDictWrapper.convert_dict(observation) @@ -373,7 +373,7 @@ def _store_transitions(self) -> None: # iterate over sampled new transitions in replay buffer for goal in sampled_goals: # compute new reward with new goal - new_reward = self.env.env_method("compute_reward", new_observation["achieved_goal"], goal, None) + new_reward = self.env.env_method("compute_reward", new_observation["achieved_goal"], goal, infos) # concatenate observation with (desired) goal obs = np.concatenate([observation["observation"], goal], axis=1) diff --git a/stable_baselines3/her/her_replay_buffer.py b/stable_baselines3/her/her_replay_buffer.py index 756e731ae1..d1a2f475fb 100644 --- a/stable_baselines3/her/her_replay_buffer.py +++ b/stable_baselines3/her/her_replay_buffer.py @@ -64,6 +64,7 @@ def __init__( "next_achieved_goal": (self.env.num_envs, self.env.goal_dim), "next_desired_goal": (self.env.num_envs, self.env.goal_dim), "done": (1,), + "infos": (1,), } self.buffer = { key: np.empty((self.max_episode_stored, self.max_episode_length, *dim), dtype=np.float32) @@ -194,7 +195,10 @@ def _sample_transitions(self, batch_size: int, env: Optional[VecNormalize]) -> R # Vectorized computation transitions["reward"][her_indices] = self.env.env_method( - "compute_reward", transitions["next_achieved_goal"][her_indices], transitions["desired_goal"][her_indices], None + "compute_reward", + transitions["next_achieved_goal"][her_indices], + transitions["desired_goal"][her_indices], + transitions["infos"][her_indices], ) # concatenate observation with (desired) goal @@ -218,6 +222,7 @@ def add( action: np.ndarray, reward: np.ndarray, done: np.ndarray, + infos: Dict[str, np.ndarray], ) -> None: self.buffer["observation"][self.pos][self.current_idx] = obs["observation"] @@ -229,6 +234,7 @@ def add( self.buffer["next_obs"][self.pos][self.current_idx] = next_obs["observation"] self.buffer["next_achieved_goal"][self.pos][self.current_idx] = next_obs["achieved_goal"] self.buffer["next_desired_goal"][self.pos][self.current_idx] = next_obs["desired_goal"] + self.buffer["infos"][self.pos][self.current_idx] = infos # update current pointer self.current_idx += 1 From 01162df0ae9a0ff7198f608bc6f851b1ebae414b Mon Sep 17 00:00:00 2001 From: Antonin Raffin Date: Thu, 27 
Aug 2020 15:06:18 +0200 Subject: [PATCH 38/81] Fix info buffer --- stable_baselines3/her/her_replay_buffer.py | 21 ++++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) diff --git a/stable_baselines3/her/her_replay_buffer.py b/stable_baselines3/her/her_replay_buffer.py index d1a2f475fb..8e06cd647f 100644 --- a/stable_baselines3/her/her_replay_buffer.py +++ b/stable_baselines3/her/her_replay_buffer.py @@ -1,3 +1,4 @@ +from collections import deque from typing import Dict, Optional, Union import numpy as np @@ -64,12 +65,12 @@ def __init__( "next_achieved_goal": (self.env.num_envs, self.env.goal_dim), "next_desired_goal": (self.env.num_envs, self.env.goal_dim), "done": (1,), - "infos": (1,), } self.buffer = { key: np.empty((self.max_episode_stored, self.max_episode_length, *dim), dtype=np.float32) for key, dim in input_shape.items() } + self.info_buffer = [deque(maxlen=self.max_episode_length) for _ in range(self.max_episode_stored)] # episode length storage, needed for episodes which has less steps than the maximum length self.episode_lengths = np.zeros(self.max_episode_stored, dtype=np.int64) @@ -193,12 +194,21 @@ def _sample_transitions(self, batch_size: int, env: Optional[VecNormalize]) -> R new_goals = self.vectorized_sample_goal(episode_indices, her_indices, transitions_indices) transitions["desired_goal"][her_indices] = new_goals + # Convert to numpy array + # TODO: disable if not needed for faster computation + transitions["info"] = np.array( + [ + self.info_buffer[episode_idx][transition_idx] + for episode_idx, transition_idx in zip(episode_indices, transitions_indices) + ] + ) + # Vectorized computation transitions["reward"][her_indices] = self.env.env_method( "compute_reward", transitions["next_achieved_goal"][her_indices], transitions["desired_goal"][her_indices], - transitions["infos"][her_indices], + transitions["info"][her_indices], ) # concatenate observation with (desired) goal @@ -225,6 +235,10 @@ def add( infos: Dict[str, np.ndarray], ) -> None: + if self.current_idx == 0 and self.full: + # Clear info buffer + self.info_buffer[self.pos] = deque(maxlen=self.max_episode_length) + self.buffer["observation"][self.pos][self.current_idx] = obs["observation"] self.buffer["achieved_goal"][self.pos][self.current_idx] = obs["achieved_goal"] self.buffer["desired_goal"][self.pos][self.current_idx] = obs["desired_goal"] @@ -234,7 +248,8 @@ def add( self.buffer["next_obs"][self.pos][self.current_idx] = next_obs["observation"] self.buffer["next_achieved_goal"][self.pos][self.current_idx] = next_obs["achieved_goal"] self.buffer["next_desired_goal"][self.pos][self.current_idx] = next_obs["desired_goal"] - self.buffer["infos"][self.pos][self.current_idx] = infos + + self.info_buffer[self.pos].append(infos) # update current pointer self.current_idx += 1 From 656a1a61599f551124a5512a23bf289bbfc6e3bf Mon Sep 17 00:00:00 2001 From: Megan Klaiber Date: Fri, 28 Aug 2020 17:28:25 +0200 Subject: [PATCH 39/81] Updated done flag. 
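The change below ("Updated done flag") stops marking a transition as terminal when the episode only ended because max_episode_length was reached. The point is bootstrapping: a timeout is not a real terminal state, so the value of the next state should still be taken into account. A minimal, generic sketch of the effect on a one-step TD target (not SB3's actual update code):

    # done=True drops the bootstrap term, done=False keeps it.
    def td_target(reward, next_q, done, gamma=0.99):
        return reward + (1.0 - float(done)) * gamma * next_q

    print(td_target(-1.0, 5.0, done=True))   # -1.0  (treated as a true terminal state)
    print(td_target(-1.0, 5.0, done=False))  #  3.95 (timeout: keep bootstrapping on next_q)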
--- stable_baselines3/her/her.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/stable_baselines3/her/her.py b/stable_baselines3/her/her.py index 87505282d4..39d71e5d40 100644 --- a/stable_baselines3/her/her.py +++ b/stable_baselines3/her/her.py @@ -110,10 +110,10 @@ def __init__( self.online_sampling = online_sampling self.her_ratio = 1 - (1.0 / (self.n_sampled_goal + 1)) self.max_episode_length = max_episode_length + self.max_episode_length = get_time_limit(self.env, self.max_episode_length) # counter for steps in episode self.episode_steps = 0 if self.online_sampling: - self.max_episode_length = get_time_limit(self.env, self.max_episode_length) self.model.replay_buffer = HerReplayBuffer( self.env, self.buffer_size, @@ -253,6 +253,8 @@ def collect_rollouts( # Perform action new_obs, reward, done, infos = env.step(action) + done = done if episode_timesteps < self.max_episode_length else False + # Only stop training if return value is False, not when it is None. if callback.on_step() is False: return RolloutReturn(0.0, total_steps, total_episodes, continue_training=False) @@ -380,7 +382,7 @@ def _store_transitions(self) -> None: new_obs = np.concatenate([new_observation["observation"], goal], axis=1) # store data in replay buffer - self.replay_buffer.add(obs, new_obs, action, new_reward, done) + self.replay_buffer.add(obs, new_obs, action, new_reward, np.array([False])) def __getattr__(self, item): """ From 59bbe804cd2b0d9c3ba2184dadc5322929b939e2 Mon Sep 17 00:00:00 2001 From: Antonin Raffin Date: Fri, 28 Aug 2020 19:26:44 +0200 Subject: [PATCH 40/81] Fixes for gSDE --- stable_baselines3/common/base_class.py | 2 +- stable_baselines3/her/her.py | 5 +++++ tests/test_her.py | 9 ++++++++- 3 files changed, 14 insertions(+), 2 deletions(-) diff --git a/stable_baselines3/common/base_class.py b/stable_baselines3/common/base_class.py index 2d16655fc6..13728d7257 100644 --- a/stable_baselines3/common/base_class.py +++ b/stable_baselines3/common/base_class.py @@ -159,7 +159,7 @@ def __init__( "Error: the model does not support multiple envs; it requires " "a single vectorized environment." 
) - if self.use_sde and not isinstance(self.observation_space, gym.spaces.Box): + if self.use_sde and not isinstance(self.action_space, gym.spaces.Box): raise ValueError("generalized State-Dependent Exploration (gSDE) can only be used with continuous actions.") def _wrap_env(self, env: GymEnv) -> VecEnv: diff --git a/stable_baselines3/her/her.py b/stable_baselines3/her/her.py index 39d71e5d40..8de34194c8 100644 --- a/stable_baselines3/her/her.py +++ b/stable_baselines3/her/her.py @@ -455,6 +455,10 @@ def load(cls, load_path: str, env: Optional[GymEnv] = None, **kwargs) -> "BaseAl if env is None and "env" in data: env = data["env"] + kwargs = {} + if "use_sde" in data and data["use_sde"]: + kwargs["use_sde"] = True + # noinspection PyArgumentList her_model = cls( policy=data["policy_class"], @@ -467,6 +471,7 @@ def load(cls, load_path: str, env: Optional[GymEnv] = None, **kwargs) -> "BaseAl max_episode_length=data["max_episode_length"], policy_kwargs=data["policy_kwargs"], _init_setup_model=True, # pytype: disable=not-instantiable,wrong-keyword-args + **kwargs, ) # load parameters diff --git a/tests/test_her.py b/tests/test_her.py index 34c197b224..615f306e42 100644 --- a/tests/test_her.py +++ b/tests/test_her.py @@ -126,10 +126,14 @@ def test_goal_selection_strategy(goal_selection_strategy, online_sampling): @pytest.mark.parametrize("model_class, policy", [(SAC, "MlpPolicy"), (TD3, "MlpPolicy"), (DDPG, "MlpPolicy")]) -def test_save_load(tmp_path, model_class, policy): +@pytest.mark.parametrize("use_sde", [False, True]) +def test_save_load(tmp_path, model_class, policy, use_sde): """ Test if 'save' and 'load' saves and loads model correctly """ + if use_sde and model_class != SAC: + pytest.skip("Only SAC has gSDE support") + n_bits = 4 env = BitFlippingEnv(n_bits=n_bits, continuous=True) env = DummyVecEnv([lambda: env]) @@ -143,6 +147,8 @@ def test_save_load(tmp_path, model_class, policy): 0.2 * np.ones((n_actions,)), ) + kwargs = dict(use_sde=True) if use_sde else {} + # create model model = HER( policy, @@ -163,6 +169,7 @@ def test_save_load(tmp_path, model_class, policy): train_freq=1, n_episodes_rollout=-1, max_episode_length=n_bits, + **kwargs ) model.learn(total_timesteps=500, callback=None) From 90dafc4a15db2141147866782aa4cc3fac33c259 Mon Sep 17 00:00:00 2001 From: Megan Klaiber Date: Wed, 16 Sep 2020 13:21:24 +0200 Subject: [PATCH 41/81] Offline her version uses now HerReplayBuffer as episode storage. 
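The patch below reuses HerReplayBuffer as a temporary per-episode storage for the offline setting: the episode is collected first, and only at episode end are the real transitions plus the relabelled ones written into the wrapped model's replay buffer. A compact sketch of that offline relabelling idea with the "future" strategy, using a simplified list-of-dicts episode rather than the array-per-key layout the buffer actually uses:

    import numpy as np

    def relabel_episode(episode, n_sampled_goal, compute_reward, rng=np.random):
        """Return the original transitions plus n_sampled_goal relabelled copies each.

        `episode` is assumed to be a list of dicts with keys: obs, action, reward, done,
        next_obs, achieved_goal, next_achieved_goal, desired_goal.
        """
        out = []
        for t, transition in enumerate(episode):
            out.append(transition)  # real transition, original desired goal
            if t == len(episode) - 1:
                continue  # last step: no future achieved goal to sample from
            for _ in range(n_sampled_goal):
                future = rng.randint(t + 1, len(episode))
                new_goal = episode[future]["achieved_goal"]
                virtual = dict(transition)
                virtual["desired_goal"] = new_goal
                # the reward must be recomputed for the substituted goal
                virtual["reward"] = compute_reward(transition["next_achieved_goal"], new_goal, {})
                virtual["done"] = False
                out.append(virtual)
        return out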
--- stable_baselines3/her/her.py | 75 +++++++++++++--------- stable_baselines3/her/her_replay_buffer.py | 21 +++--- tests/test_her.py | 1 - 3 files changed, 57 insertions(+), 40 deletions(-) diff --git a/stable_baselines3/her/her.py b/stable_baselines3/her/her.py index 8de34194c8..d5b21192d1 100644 --- a/stable_baselines3/her/her.py +++ b/stable_baselines3/her/her.py @@ -102,15 +102,25 @@ def __init__( self.goal_selection_strategy, GoalSelectionStrategy ), f"Invalid goal selection strategy, please use one of {list(GoalSelectionStrategy)}" + # maximum steps in episode + self.max_episode_length = get_time_limit(self.env, max_episode_length) # storage for transitions of current episode - self._episode_storage = [] + self._episode_storage = HerReplayBuffer( + self.env, + self.max_episode_length, + self.max_episode_length, + self.goal_selection_strategy, + self.env.observation_space, + self.env.action_space, + self.device, + self.n_envs, + 0.0, # pytype: disable=wrong-arg-types + ) self.n_sampled_goal = n_sampled_goal # if we sample her transitions online use custom replay buffer self.online_sampling = online_sampling self.her_ratio = 1 - (1.0 / (self.n_sampled_goal + 1)) - self.max_episode_length = max_episode_length - self.max_episode_length = get_time_limit(self.env, self.max_episode_length) # counter for steps in episode self.episode_steps = 0 if self.online_sampling: @@ -281,7 +291,7 @@ def collect_rollouts( self.replay_buffer.add(self._last_original_obs, new_obs_, buffer_action, reward_, done, infos) else: # add current transition to episode storage - self._episode_storage.append((self._last_original_obs, new_obs_, buffer_action, reward_, done, infos)) + self._episode_storage.add(self._last_original_obs, new_obs_, buffer_action, reward_, done, infos) self._last_obs = new_obs self.model._last_obs = self._last_obs @@ -311,10 +321,11 @@ def collect_rollouts( if self.online_sampling: self.replay_buffer.store_episode() else: + self._episode_storage.store_episode() # store episode in replay buffer self._store_transitions() # clear storage for current episode - self._episode_storage = [] + self._episode_storage.reset() total_episodes += 1 self._episode_num += 1 @@ -345,44 +356,50 @@ def _store_transitions(self) -> None: """ # iterate over current episodes transitions - for idx, trans in enumerate(self._episode_storage): - - observation, new_observation, action, reward, done, infos = trans + for idx in range(self._episode_storage.size()): + # get data of episode index + observation = self._episode_storage.buffer["observation"][0][idx] + desired_goal = self._episode_storage.buffer["desired_goal"][0][idx] + next_observation = self._episode_storage.buffer["next_obs"][0][idx] + next_achieved_goal = self._episode_storage.buffer["next_achieved_goal"][0][idx] + next_desired_goal = self._episode_storage.buffer["next_desired_goal"][0][idx] + action = self._episode_storage.buffer["action"][0][idx] + reward = self._episode_storage.buffer["reward"][0][idx] + done = self._episode_storage.buffer["done"][0][idx] + infos = self._episode_storage.info_buffer[0][idx] # concatenate observation with (desired) goal - obs = ObsDictWrapper.convert_dict(observation) - new_obs = ObsDictWrapper.convert_dict(new_observation) - + obs = np.concatenate([observation, desired_goal], axis=-1) + next_obs = np.concatenate([next_observation, next_desired_goal], axis=-1) # store data in replay buffer - self.replay_buffer.add(obs, new_obs, action, reward, done) + self.replay_buffer.add(obs, next_obs, action, reward, done) # We cannot 
sample a goal from the future in the last step of an episode - if idx == len(self._episode_storage) - 1 and self.goal_selection_strategy == GoalSelectionStrategy.FUTURE: + if idx == self._episode_storage.size() - 1 and self.goal_selection_strategy == GoalSelectionStrategy.FUTURE: break - # sample set of additional goals - obs_dim = observation["observation"].shape[1] - sampled_goals = [ - sample - for sample in ( - HerReplayBuffer.sample_goal( - self.goal_selection_strategy, idx, self._episode_storage, self.replay_buffer.observations, obs_dim - ) - for _ in range(self.n_sampled_goal) + # dimsension of observation + obs_dim = observation.shape[1] + + for _ in range(self.n_sampled_goal): + # sample goal + goal = self._episode_storage.sample_goal( + self.goal_selection_strategy, + idx, + self._episode_storage.buffer["achieved_goal"][0], + self.replay_buffer.observations, + obs_dim, ) - ] - # iterate over sampled new transitions in replay buffer - for goal in sampled_goals: # compute new reward with new goal - new_reward = self.env.env_method("compute_reward", new_observation["achieved_goal"], goal, infos) + new_reward = self.env.env_method("compute_reward", next_achieved_goal, goal, infos) # concatenate observation with (desired) goal - obs = np.concatenate([observation["observation"], goal], axis=1) - new_obs = np.concatenate([new_observation["observation"], goal], axis=1) + obs = np.concatenate([observation, goal], axis=1) + next_obs = np.concatenate([next_observation, goal], axis=1) # store data in replay buffer - self.replay_buffer.add(obs, new_obs, action, new_reward, np.array([False])) + self.replay_buffer.add(obs, next_obs, action, new_reward, np.array([False])) def __getattr__(self, item): """ diff --git a/stable_baselines3/her/her_replay_buffer.py b/stable_baselines3/her/her_replay_buffer.py index 8e06cd647f..3e33516021 100644 --- a/stable_baselines3/her/her_replay_buffer.py +++ b/stable_baselines3/her/her_replay_buffer.py @@ -1,5 +1,5 @@ from collections import deque -from typing import Dict, Optional, Union +from typing import Dict, List, Optional, Union import numpy as np import torch as th @@ -87,11 +87,11 @@ def sample(self, batch_size: int, env: Optional[VecNormalize] = None) -> ReplayB """ return self._sample_transitions(batch_size, env) - @staticmethod def sample_goal( + self, goal_selection_strategy: GoalSelectionStrategy, sample_idx: int, - episode: list, + achieved_goals: list, observations: Union[list, np.ndarray], obs_dim: int = None, ) -> np.ndarray: @@ -101,22 +101,22 @@ def sample_goal( :param goal_selection_strategy: (GoalSelectionStrategy ) Strategy for sampling goals for replay. One of ['episode', 'final', 'future', 'random'] :param sample_idx: (int) Index of current transition. - :param episode: (list) Current episode. + :param achieved_goals: (list) Achieved goals of Current episode. :param observations: (list or np.ndarray) :param obs_dim: (int) Dimension of real observation without goal. It is needed for the random strategy. :return: (np.ndarray) Return sampled goal. 
""" if goal_selection_strategy == GoalSelectionStrategy.FINAL: # replay with final state of current episode - return episode[-1][0]["achieved_goal"] + return achieved_goals[-1] elif goal_selection_strategy == GoalSelectionStrategy.FUTURE: # replay with random state which comes from the same episode and was observed after current transition - index = np.random.choice(np.arange(sample_idx + 1, len(episode))) - return episode[index][0]["achieved_goal"] + index = np.random.choice(np.arange(sample_idx + 1, len(achieved_goals))) + return achieved_goals[index] elif goal_selection_strategy == GoalSelectionStrategy.EPISODE: # replay with random state which comes from the same episode as current transition - index = np.random.choice(np.arange(len(episode))) - return episode[index][0]["achieved_goal"] + index = np.random.choice(np.arange(len(achieved_goals))) + return achieved_goals[index] elif goal_selection_strategy == GoalSelectionStrategy.RANDOM: # replay with random state from the entire replay buffer index = np.random.choice(np.arange(len(observations))) @@ -232,7 +232,8 @@ def add( action: np.ndarray, reward: np.ndarray, done: np.ndarray, - infos: Dict[str, np.ndarray], + # infos: Dict[str, np.ndarray], + infos: List[dict], ) -> None: if self.current_idx == 0 and self.full: diff --git a/tests/test_her.py b/tests/test_her.py index 615f306e42..5fa4980d37 100644 --- a/tests/test_her.py +++ b/tests/test_her.py @@ -234,7 +234,6 @@ def test_dqn_her(online_sampling, n_bits): """ env = BitFlippingEnv(n_bits=n_bits, continuous=False) - # offline model = HER( "MlpPolicy", env, From 655e4c33b6c1db8591b010ebbbd8bd68f4e2ae91 Mon Sep 17 00:00:00 2001 From: Antonin Raffin Date: Thu, 17 Sep 2020 16:28:14 +0200 Subject: [PATCH 42/81] Fix num_timesteps computation --- stable_baselines3/her/her.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/stable_baselines3/her/her.py b/stable_baselines3/her/her.py index d5b21192d1..c9b4701d21 100644 --- a/stable_baselines3/her/her.py +++ b/stable_baselines3/her/her.py @@ -265,6 +265,11 @@ def collect_rollouts( done = done if episode_timesteps < self.max_episode_length else False + self.num_timesteps += 1 + self.model.num_timesteps = self.num_timesteps + episode_timesteps += 1 + total_steps += 1 + # Only stop training if return value is False, not when it is None. 
if callback.on_step() is False: return RolloutReturn(0.0, total_steps, total_episodes, continue_training=False) @@ -300,10 +305,6 @@ def collect_rollouts( self._last_original_obs = new_obs_ self.model._last_original_obs = self._last_original_obs - self.num_timesteps += 1 - self.model.num_timesteps = self.num_timesteps - episode_timesteps += 1 - total_steps += 1 self.model._update_current_progress_remaining(self.num_timesteps, self._total_timesteps) # For DQN, check if the target network should be updated From 046088b3958b02fd6759196b1ad76d34264ef146 Mon Sep 17 00:00:00 2001 From: Antonin Raffin Date: Thu, 24 Sep 2020 18:24:33 +0200 Subject: [PATCH 43/81] Fix get torch params --- stable_baselines3/her/her.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/stable_baselines3/her/her.py b/stable_baselines3/her/her.py index c9b4701d21..aea1ccca85 100644 --- a/stable_baselines3/her/her.py +++ b/stable_baselines3/her/her.py @@ -411,8 +411,8 @@ def __getattr__(self, item): else: raise AttributeError - def get_torch_variables(self) -> Tuple[List[str], List[str]]: - return self.model.get_torch_variables() + def _get_torch_save_params(self) -> Tuple[List[str], List[str]]: + return self.model._get_torch_save_params() def save( self, From a68cc323ca6031d647a47c30e23ec52800b50edd Mon Sep 17 00:00:00 2001 From: Megan Klaiber Date: Tue, 6 Oct 2020 00:38:08 +0200 Subject: [PATCH 44/81] Vectorized version for offline sampling. --- stable_baselines3/her/her.py | 104 ++++++++++++++++++++--------------- 1 file changed, 60 insertions(+), 44 deletions(-) diff --git a/stable_baselines3/her/her.py b/stable_baselines3/her/her.py index aea1ccca85..426a3a07bc 100644 --- a/stable_baselines3/her/her.py +++ b/stable_baselines3/her/her.py @@ -295,6 +295,11 @@ def collect_rollouts( if self.online_sampling: self.replay_buffer.add(self._last_original_obs, new_obs_, buffer_action, reward_, done, infos) else: + # concatenate observation with (desired) goal + obs = ObsDictWrapper.convert_dict(self._last_original_obs) + next_obs = ObsDictWrapper.convert_dict(new_obs_) + # add to replay bufffer + self.replay_buffer.add(obs, next_obs, buffer_action, reward_, done) # add current transition to episode storage self._episode_storage.add(self._last_original_obs, new_obs_, buffer_action, reward_, done, infos) @@ -318,6 +323,7 @@ def collect_rollouts( if 0 < n_steps <= total_steps: break + # TODO check again if done or self.episode_steps == self.max_episode_length: if self.online_sampling: self.replay_buffer.store_episode() @@ -355,52 +361,62 @@ def _store_transitions(self) -> None: """ Store current episode in replay buffer. Sample additional goals and store new transitions in replay buffer. 
""" + # use vectorized sample goal function fom her_replay_buffer + episode_length = self._episode_storage.episode_lengths[0] + episode_indices = np.array(list(range(self._episode_storage.n_episodes_stored)) * episode_length * self.n_sampled_goal) + her_indices = np.arange(len(episode_indices)) + # repeat every transition index n_sampled_goals times + transitions_indices = np.array(list(range(episode_length)) * self.n_sampled_goal) + + if self._episode_storage.goal_selection_strategy == GoalSelectionStrategy.FUTURE: + # restrict the sampling domain when ep_length > 1 + # otherwise filter out the indices + # only consider transitions which are not the last one in the episode + her_indices = her_indices[episode_length > 1 and transitions_indices < episode_length - 1] + + # transitions + transitions = { + key: self._episode_storage.buffer[key][episode_indices, transitions_indices].copy() + for key in self._episode_storage.buffer.keys() + } + + # get sampled goals + new_goals = self._episode_storage.vectorized_sample_goal(episode_indices, her_indices, transitions_indices) + # assign new goals as desired goals + transitions["desired_goal"][her_indices] = new_goals + + # Convert to numpy array + # TODO: disable if not needed for faster computation + transitions["info"] = np.array( + [ + self._episode_storage.info_buffer[episode_idx][transition_idx] + for episode_idx, transition_idx in zip(episode_indices, transitions_indices) + ] + ) - # iterate over current episodes transitions - for idx in range(self._episode_storage.size()): - # get data of episode index - observation = self._episode_storage.buffer["observation"][0][idx] - desired_goal = self._episode_storage.buffer["desired_goal"][0][idx] - next_observation = self._episode_storage.buffer["next_obs"][0][idx] - next_achieved_goal = self._episode_storage.buffer["next_achieved_goal"][0][idx] - next_desired_goal = self._episode_storage.buffer["next_desired_goal"][0][idx] - action = self._episode_storage.buffer["action"][0][idx] - reward = self._episode_storage.buffer["reward"][0][idx] - done = self._episode_storage.buffer["done"][0][idx] - infos = self._episode_storage.info_buffer[0][idx] - - # concatenate observation with (desired) goal - obs = np.concatenate([observation, desired_goal], axis=-1) - next_obs = np.concatenate([next_observation, next_desired_goal], axis=-1) - # store data in replay buffer - self.replay_buffer.add(obs, next_obs, action, reward, done) - - # We cannot sample a goal from the future in the last step of an episode - if idx == self._episode_storage.size() - 1 and self.goal_selection_strategy == GoalSelectionStrategy.FUTURE: - break - - # dimsension of observation - obs_dim = observation.shape[1] - - for _ in range(self.n_sampled_goal): - # sample goal - goal = self._episode_storage.sample_goal( - self.goal_selection_strategy, - idx, - self._episode_storage.buffer["achieved_goal"][0], - self.replay_buffer.observations, - obs_dim, - ) - - # compute new reward with new goal - new_reward = self.env.env_method("compute_reward", next_achieved_goal, goal, infos) - - # concatenate observation with (desired) goal - obs = np.concatenate([observation, goal], axis=1) - next_obs = np.concatenate([next_observation, goal], axis=1) + # Vectorized computation + transitions["reward"][her_indices] = self.env.env_method( + "compute_reward", + transitions["next_achieved_goal"][her_indices], + transitions["desired_goal"][her_indices], + transitions["info"][her_indices], + ) - # store data in replay buffer - self.replay_buffer.add(obs, 
next_obs, action, new_reward, np.array([False])) + # concatenate observation with (desired) goal + observations = ObsDictWrapper.convert_dict(transitions) + next_observations = ObsDictWrapper.convert_dict(transitions, observation_key="next_obs") + + # TODO check random strategy -> with online_sampling flag? + # TODO done = False? or recompute -> compare desired and achieved goal + + # store data in replay buffer + for i in her_indices: + obs = observations[i] + next_obs = next_observations[i] + buffer_action = transitions["action"][i] + reward = transitions["reward"][i] + done = np.array([False]) + self.replay_buffer.add(obs, next_obs, buffer_action, reward, done) def __getattr__(self, item): """ From 8a25457c1cd184119140e27bc8a4a458f94ce092 Mon Sep 17 00:00:00 2001 From: Megan Klaiber Date: Tue, 6 Oct 2020 02:51:28 +0200 Subject: [PATCH 45/81] Modified offline her sampling to use sample method of her_replay_buffer --- stable_baselines3/her/her.py | 63 ++------ stable_baselines3/her/her_replay_buffer.py | 165 +++++++++++---------- 2 files changed, 99 insertions(+), 129 deletions(-) diff --git a/stable_baselines3/her/her.py b/stable_baselines3/her/her.py index 426a3a07bc..aa74bdfc30 100644 --- a/stable_baselines3/her/her.py +++ b/stable_baselines3/her/her.py @@ -102,6 +102,10 @@ def __init__( self.goal_selection_strategy, GoalSelectionStrategy ), f"Invalid goal selection strategy, please use one of {list(GoalSelectionStrategy)}" + self.n_sampled_goal = n_sampled_goal + # if we sample her transitions online use custom replay buffer + self.online_sampling = online_sampling + self.her_ratio = 1 - (1.0 / (self.n_sampled_goal + 1)) # maximum steps in episode self.max_episode_length = get_time_limit(self.env, max_episode_length) # storage for transitions of current episode @@ -114,13 +118,9 @@ def __init__( self.env.action_space, self.device, self.n_envs, - 0.0, # pytype: disable=wrong-arg-types + self.her_ratio, # pytype: disable=wrong-arg-types ) - self.n_sampled_goal = n_sampled_goal - # if we sample her transitions online use custom replay buffer - self.online_sampling = online_sampling - self.her_ratio = 1 - (1.0 / (self.n_sampled_goal + 1)) # counter for steps in episode self.episode_steps = 0 if self.online_sampling: @@ -361,54 +361,17 @@ def _store_transitions(self) -> None: """ Store current episode in replay buffer. Sample additional goals and store new transitions in replay buffer. 
""" - # use vectorized sample goal function fom her_replay_buffer - episode_length = self._episode_storage.episode_lengths[0] - episode_indices = np.array(list(range(self._episode_storage.n_episodes_stored)) * episode_length * self.n_sampled_goal) - her_indices = np.arange(len(episode_indices)) - # repeat every transition index n_sampled_goals times - transitions_indices = np.array(list(range(episode_length)) * self.n_sampled_goal) - - if self._episode_storage.goal_selection_strategy == GoalSelectionStrategy.FUTURE: - # restrict the sampling domain when ep_length > 1 - # otherwise filter out the indices - # only consider transitions which are not the last one in the episode - her_indices = her_indices[episode_length > 1 and transitions_indices < episode_length - 1] - - # transitions - transitions = { - key: self._episode_storage.buffer[key][episode_indices, transitions_indices].copy() - for key in self._episode_storage.buffer.keys() - } - - # get sampled goals - new_goals = self._episode_storage.vectorized_sample_goal(episode_indices, her_indices, transitions_indices) - # assign new goals as desired goals - transitions["desired_goal"][her_indices] = new_goals - - # Convert to numpy array - # TODO: disable if not needed for faster computation - transitions["info"] = np.array( - [ - self._episode_storage.info_buffer[episode_idx][transition_idx] - for episode_idx, transition_idx in zip(episode_indices, transitions_indices) - ] - ) - # Vectorized computation - transitions["reward"][her_indices] = self.env.env_method( - "compute_reward", - transitions["next_achieved_goal"][her_indices], - transitions["desired_goal"][her_indices], - transitions["info"][her_indices], + # sample goals and get new observations + observations, next_observations, transitions, her_indices = self._episode_storage.sample( + self.batch_size, + self.env, + self.online_sampling, + self.n_sampled_goal, + self.replay_buffer.observations, ) - # concatenate observation with (desired) goal - observations = ObsDictWrapper.convert_dict(transitions) - next_observations = ObsDictWrapper.convert_dict(transitions, observation_key="next_obs") - - # TODO check random strategy -> with online_sampling flag? - # TODO done = False? or recompute -> compare desired and achieved goal - + # TODO done = False? 
# store data in replay buffer for i in her_indices: obs = observations[i] diff --git a/stable_baselines3/her/her_replay_buffer.py b/stable_baselines3/her/her_replay_buffer.py index 3e33516021..86d26f8f7b 100644 --- a/stable_baselines3/her/her_replay_buffer.py +++ b/stable_baselines3/her/her_replay_buffer.py @@ -1,5 +1,5 @@ from collections import deque -from typing import Dict, List, Optional, Union +from typing import Dict, List, Optional, Tuple, Union import numpy as np import torch as th @@ -78,77 +78,49 @@ def __init__( # percentage of her indices self.her_ratio = her_ratio - def sample(self, batch_size: int, env: Optional[VecNormalize] = None) -> ReplayBufferSamples: + def sample( + self, + batch_size: int, + env: Optional[VecNormalize] = None, + online_sampling: bool = True, + n_sampled_goal: int = None, + replay_observations: np.ndarray = None, + ) -> Union[ReplayBufferSamples, Tuple]: """ :param batch_size: (int) Number of element to sample :param env: (Optional[VecNormalize]) associated gym VecEnv to normalize the observations/rewards when sampling - :return: (ReplayBufferSamples) - """ - return self._sample_transitions(batch_size, env) - - def sample_goal( - self, - goal_selection_strategy: GoalSelectionStrategy, - sample_idx: int, - achieved_goals: list, - observations: Union[list, np.ndarray], - obs_dim: int = None, - ) -> np.ndarray: + :param online_sampling: (bool) Using online_sampling for HER or not. + :param n_sampled_goal: (int) Number of sampled goals for replay. (offline sampling) + :param replay_observations: (np.ndarray) Observations of the offline replay buffer. Needed for 'RANDOM' goal strategy. + :return: (ReplayBufferSamples or Tuple) """ - Sample a goal based on goal_selection_strategy. - - :param goal_selection_strategy: (GoalSelectionStrategy ) Strategy for sampling goals for replay. - One of ['episode', 'final', 'future', 'random'] - :param sample_idx: (int) Index of current transition. - :param achieved_goals: (list) Achieved goals of Current episode. - :param observations: (list or np.ndarray) - :param obs_dim: (int) Dimension of real observation without goal. It is needed for the random strategy. - :return: (np.ndarray) Return sampled goal. 
- """ - if goal_selection_strategy == GoalSelectionStrategy.FINAL: - # replay with final state of current episode - return achieved_goals[-1] - elif goal_selection_strategy == GoalSelectionStrategy.FUTURE: - # replay with random state which comes from the same episode and was observed after current transition - index = np.random.choice(np.arange(sample_idx + 1, len(achieved_goals))) - return achieved_goals[index] - elif goal_selection_strategy == GoalSelectionStrategy.EPISODE: - # replay with random state which comes from the same episode as current transition - index = np.random.choice(np.arange(len(achieved_goals))) - return achieved_goals[index] - elif goal_selection_strategy == GoalSelectionStrategy.RANDOM: - # replay with random state from the entire replay buffer - index = np.random.choice(np.arange(len(observations))) - obs = observations[index] - # get only the observation part - # TODO: check that line (or the comment at least) - obs_array = obs[:, :obs_dim] - return obs_array - else: - raise ValueError("Strategy for sampling goals not supported!") + return self._sample_transitions(batch_size, env, online_sampling, n_sampled_goal, replay_observations) def vectorized_sample_goal( - self, episode_indices: np.ndarray, her_indices: np.ndarray, transitions_indices: np.ndarray + self, + episode_indices: np.ndarray, + her_indices: np.ndarray, + transitions_indices: np.ndarray, + online_sampling: bool = True, + replay_observations: np.ndarray = None, ) -> np.ndarray: """ Sample goals based on goal_selection_strategy. - This is the vectorized (faster) version of ``sample_goal()`` - - :param goal_selection_strategy: (GoalSelectionStrategy ) Strategy for sampling goals for replay. - One of ['episode', 'final', 'future', 'random'] - :param sample_idx: (int) Index of current transition. - :param episode: (list) Current episode. - :param observations: (list or np.ndarray) - :param obs_dim: (int) Dimension of real observation without goal. It is needed for the random strategy. - :param online_sampling: (bool) Sample HER transitions online. + This is a vectorized (fast) version. + + :param episode_indices: (np.ndarray) Episode indices to use. + :param her_indices: (np.ndarray) HER indices. + :param transitions_indices: (np.ndarray) Transition indices to use. + :param online_sampling: (bool) Using online_sampling for HER or not. + :param replay_observations: (np.ndarray) Observations of the offline replay buffer. Needed for 'RANDOM' goal strategy. :return: (np.ndarray) Return sampled goals. 
""" her_episode_indices = episode_indices[her_indices] if self.goal_selection_strategy == GoalSelectionStrategy.FINAL: # replay with final state of current episode - transitions_indices = self.episode_lengths[her_indices] - 1 + transitions_indices = self.episode_lengths[her_episode_indices] - 1 elif self.goal_selection_strategy == GoalSelectionStrategy.FUTURE: # replay with random state which comes from the same episode and was observed after current transition @@ -161,37 +133,70 @@ def vectorized_sample_goal( transitions_indices = np.random.randint(self.episode_lengths[her_episode_indices]) elif self.goal_selection_strategy == GoalSelectionStrategy.RANDOM: - # replay with random state from the entire replay buffer - her_episode_indices = np.random.randint(self.n_episodes_stored, size=len(her_indices)) - transitions_indices = np.random.randint(self.episode_lengths[her_episode_indices]) + if online_sampling: + # replay with random state from the entire replay buffer + her_episode_indices = np.random.randint(self.n_episodes_stored, size=len(her_indices)) + transitions_indices = np.random.randint(self.episode_lengths[her_episode_indices]) + else: + # replay with random state from the entire replay buffer + index = np.random.choice(np.arange(len(replay_observations)), len(her_indices)) + obs = replay_observations[index] + # get only the observation part of the state + obs_dim = self.env.obs_dim + obs_array = obs[:, :, :obs_dim] + return obs_array else: raise ValueError("Strategy for sampling goals not supported!") return self.buffer["achieved_goal"][her_episode_indices, transitions_indices] - def _sample_transitions(self, batch_size: int, env: Optional[VecNormalize]) -> ReplayBufferSamples: + def _sample_transitions( + self, + batch_size: int, + env: Optional[VecNormalize], + online_sampling: bool = True, + n_sampled_goal: int = None, + replay_observations: np.ndarray = None, + ) -> Union[ReplayBufferSamples, Tuple]: """ :param batch_size: (int) Number of element to sample :param env: (Optional[VecNormalize]) associated gym VecEnv to normalize the observations/rewards when sampling - :return: (ReplayBufferSamples) + :param online_sampling: (bool) Using online_sampling for HER or not. + :param n_sampled_goal: (int) Number of sampled goals for replay. (offline sampling) + :param replay_observations: (np.ndarray) Observations of the offline replay buffer. Needed for 'RANDOM' goal strategy. 
+ :return: (ReplayBufferSamples or Tuple) """ # Select which episodes to use - episode_indices = np.random.randint(0, self.n_episodes_stored, batch_size) - her_indices = np.arange(batch_size)[: int(self.her_ratio * batch_size)] - # her_indices = np.random.permutation(batch_size)[: int(self.her_ratio * batch_size)] - ep_length = self.episode_lengths[episode_indices] + if online_sampling: + episode_indices = np.random.randint(0, self.n_episodes_stored, batch_size) + her_indices = np.arange(batch_size)[: int(self.her_ratio * batch_size)] + ep_length = self.episode_lengths[episode_indices] + else: + episode_length = self.episode_lengths[0] + episode_indices = np.array(list(range(self.n_episodes_stored)) * episode_length * n_sampled_goal) + her_indices = np.arange(len(episode_indices)) + # repeat every transition index n_sampled_goals times + transitions_indices = np.array(list(range(episode_length)) * n_sampled_goal) if self.goal_selection_strategy == GoalSelectionStrategy.FUTURE: # restrict the sampling domain when ep_length > 1 # otherwise filter out the indices - her_indices = her_indices[ep_length[her_indices] > 1] - ep_length[her_indices] -= 1 - - transitions_indices = np.random.randint(ep_length) + if online_sampling: + her_indices = her_indices[ep_length[her_indices] > 1] + ep_length[her_indices] -= 1 + else: + her_indices = her_indices[episode_length > 1 and transitions_indices < episode_length - 1] + + if online_sampling: + # Select which transitions to use + transitions_indices = np.random.randint(ep_length) + # get selected transitions transitions = {key: self.buffer[key][episode_indices, transitions_indices].copy() for key in self.buffer.keys()} - new_goals = self.vectorized_sample_goal(episode_indices, her_indices, transitions_indices) + new_goals = self.vectorized_sample_goal( + episode_indices, her_indices, transitions_indices, online_sampling, replay_observations + ) transitions["desired_goal"][her_indices] = new_goals # Convert to numpy array @@ -215,15 +220,18 @@ def _sample_transitions(self, batch_size: int, env: Optional[VecNormalize]) -> R observations = ObsDictWrapper.convert_dict(transitions) next_observations = ObsDictWrapper.convert_dict(transitions, observation_key="next_obs") - data = ( - self._normalize_obs(observations, env), - transitions["action"], - self._normalize_obs(next_observations, env), - transitions["done"], - self._normalize_reward(transitions["reward"], env), - ) + if online_sampling: + data = ( + self._normalize_obs(observations, env), + transitions["action"], + self._normalize_obs(next_observations, env), + transitions["done"], + self._normalize_reward(transitions["reward"], env), + ) - return ReplayBufferSamples(*tuple(map(self.to_torch, data))) + return ReplayBufferSamples(*tuple(map(self.to_torch, data))) + else: + return observations, next_observations, transitions, her_indices def add( self, @@ -232,7 +240,6 @@ def add( action: np.ndarray, reward: np.ndarray, done: np.ndarray, - # infos: Dict[str, np.ndarray], infos: List[dict], ) -> None: From c125d0890770db880ab61e90ebf4fcdbd2300851 Mon Sep 17 00:00:00 2001 From: Megan Klaiber Date: Tue, 6 Oct 2020 03:55:30 +0200 Subject: [PATCH 46/81] Updated HER tests. 
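The batch construction in the buffer hunk above splits the sampled indices into a HER part (which gets relabelled with a new goal) and a regular part, and the FUTURE strategy additionally forbids relabelling the last transition of an episode. The following standalone sketch mirrors that index bookkeeping; n_episodes_stored, episode_lengths and her_ratio are stand-in names with made-up sizes, not the real HerReplayBuffer attributes.

import numpy as np

# Stand-in buffer state (illustrative only, not the real HerReplayBuffer attributes)
n_episodes_stored = 10
episode_lengths = np.random.randint(2, 8, size=n_episodes_stored)
her_ratio = 1 - 1.0 / (4 + 1)  # with n_sampled_goal=4, 80% of a batch is relabelled

batch_size = 32
episode_indices = np.random.randint(0, n_episodes_stored, batch_size)
# the first her_ratio * batch_size samples of the batch get a hindsight goal
her_indices = np.arange(batch_size)[: int(her_ratio * batch_size)]

ep_length = episode_lengths[episode_indices]
# FUTURE strategy: the new goal must be achieved *after* the transition,
# so the last transition of an episode cannot be relabelled
her_indices = her_indices[ep_length[her_indices] > 1]
ep_length[her_indices] -= 1

# one transition per sampled episode
transitions_indices = np.random.randint(ep_length)
# for the relabelled part, draw a goal index strictly after the transition
future_goal_indices = np.random.randint(
    transitions_indices[her_indices] + 1, episode_lengths[episode_indices[her_indices]]
)
print(her_indices.shape, transitions_indices.shape, future_goal_indices.shape)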
--- stable_baselines3/her/her.py | 1 - tests/test_her.py | 55 ++---------------------------------- 2 files changed, 3 insertions(+), 53 deletions(-) diff --git a/stable_baselines3/her/her.py b/stable_baselines3/her/her.py index aa74bdfc30..49539c8f38 100644 --- a/stable_baselines3/her/her.py +++ b/stable_baselines3/her/her.py @@ -323,7 +323,6 @@ def collect_rollouts( if 0 < n_steps <= total_steps: break - # TODO check again if done or self.episode_steps == self.max_episode_length: if self.online_sampling: self.replay_buffer.store_episode() diff --git a/tests/test_her.py b/tests/test_her.py index 5fa4980d37..b99fdea87b 100644 --- a/tests/test_her.py +++ b/tests/test_her.py @@ -24,30 +24,12 @@ def test_her(model_class, policy, online_sampling): env = BitFlippingEnv(n_bits=n_bits, continuous=True) env = DummyVecEnv([lambda: env]) - # Create action noise - n_actions = env.action_space.shape[0] - action_noise = OrnsteinUhlenbeckActionNoise( - np.zeros( - n_actions, - ), - 0.2 * np.ones((n_actions,)), - ) - model = HER( policy, env, model_class, - n_sampled_goal=5, goal_selection_strategy="future", online_sampling=online_sampling, - action_noise=action_noise, - verbose=0, - tau=0.05, - batch_size=128, - learning_rate=0.001, - policy_kwargs=dict(net_arch=[64]), - buffer_size=int(1e6), - gamma=0.98, gradient_steps=1, train_freq=1, n_episodes_rollout=-1, @@ -56,39 +38,6 @@ def test_her(model_class, policy, online_sampling): model.learn(total_timesteps=500, callback=None) - # Evaluate the agent - n_eval_episodes = 5 - n_episodes = 0 - episode_rewards = [] - episode_reward = 0.0 - - eval_env = BitFlippingEnv(n_bits=4, continuous=True) - - observation = eval_env.reset() - - while n_episodes < n_eval_episodes: - - obs = np.concatenate([observation["observation"], observation["desired_goal"]]) - - with th.no_grad(): - obs_ = th.FloatTensor(np.array(obs).reshape(1, -1)).to(model.model.device) - action = model.model.policy.predict(obs_)[0][0] - - observation, reward, done, _ = eval_env.step(action) - - # Render the env - # eval_env.render() - - episode_reward += reward - - if done: - n_episodes += 1 - observation = eval_env.reset() - episode_rewards.append(episode_reward) - episode_reward = 0.0 - - eval_env.close() - @pytest.mark.parametrize( "goal_selection_strategy", @@ -252,4 +201,6 @@ def test_dqn_her(online_sampling, n_bits): batch_size=32, ) - model.learn(total_timesteps=20000) + model.learn(total_timesteps=10000) + + assert np.mean(model.ep_success_buffer) > 0.0 From a70b47b6c5e8f3ecaa2975ab8cb3f00f618cad41 Mon Sep 17 00:00:00 2001 From: Megan Klaiber Date: Tue, 6 Oct 2020 09:08:06 +0200 Subject: [PATCH 47/81] Updated documentation --- docs/modules/her.rst | 72 ++++++++++++++++++++++---------------------- 1 file changed, 36 insertions(+), 36 deletions(-) diff --git a/docs/modules/her.rst b/docs/modules/her.rst index 32532ba0f0..11f01fc453 100644 --- a/docs/modules/her.rst +++ b/docs/modules/her.rst @@ -12,7 +12,7 @@ HER is an algorithm that works with Off policy methods (DQN, SAC, TD3 and DDPG f .. warning:: - HER requires the environment to inherits from `gym.GoalEnv `_ + HER requires the environment to inherits from `gym.GoalEnv `_ Notes @@ -28,54 +28,54 @@ Notes Can I use? ---------- -Please refer to the wrapped model (DQN, SAC, TD3 or DDPG) for that section. +Please refer to the used model (DQN, SAC, TD3 or DDPG) for that section. Example ------- .. 
code-block:: python - from stable_baselines3 import DDPG, DQN, SAC, TD3 - from stable_baselines3.her.her import HER - from stable_baselines3.her.goal_selection_strategy import GoalSelectionStrategy - from stable_baselines3.common.bit_flipping_env import BitFlippingEnv - from stable_baselines3.common.vec_env import DummyVecEnv - from stable_baselines3.common.vec_env.obs_dict_wrapper import ObsDictWrapper + from stable_baselines3 import DDPG, DQN, SAC, TD3 + from stable_baselines3.her.her import HER + from stable_baselines3.her.goal_selection_strategy import GoalSelectionStrategy + from stable_baselines3.common.bit_flipping_env import BitFlippingEnv + from stable_baselines3.common.vec_env import DummyVecEnv + from stable_baselines3.common.vec_env.obs_dict_wrapper import ObsDictWrapper - model_class = DQN # works also with SAC, DDPG and TD3 - N_BITS = 15 + model_class = DQN # works also with SAC, DDPG and TD3 + N_BITS = 15 - env = BitFlippingEnv(n_bits=N_BITS, continuous=model_class in [DDPG, SAC, TD3], max_steps=N_BITS) + env = BitFlippingEnv(n_bits=N_BITS, continuous=model_class in [DDPG, SAC, TD3], max_steps=N_BITS) - # Available strategies (cf paper): future, final, episode, random - goal_selection_strategy = 'future' # equivalent to GoalSelectionStrategy.FUTURE + # Available strategies (cf paper): future, final, episode, random + goal_selection_strategy = 'future' # equivalent to GoalSelectionStrategy.FUTURE - # If True the HER transitions will get sampled online - online_sampling = True - # Time limit for the episodes in online sampling (to deactivate for offline use the default value -1) - max_episode_length = N_BITS + # If True the HER transitions will get sampled online + online_sampling = True + # Time limit for the episodes + max_episode_length = N_BITS - # Initialize the model - model = HER('MlpPolicy', env, model_class, n_sampled_goal=4, goal_selection_strategy=goal_selection_strategy, online_sampling=online_sampling, - verbose=1, max_episode_length=max_episode_length) - # Train the model - model.learn(1000) + # Initialize the model + model = HER('MlpPolicy', env, model_class, n_sampled_goal=4, goal_selection_strategy=goal_selection_strategy, online_sampling=online_sampling, + verbose=1, max_episode_length=max_episode_length) + # Train the model + model.learn(1000) - model.save("./her_bit_env") + model.save("./her_bit_env") - # WARNING: you must pass an VecEnv - env = DummyVecEnv([lambda: env]) - model = HER.load('./her_bit_env', env=env) + # WARNING: you must pass an VecEnv + env = DummyVecEnv([lambda: env]) + model = HER.load('./her_bit_env', env=env) - obs = env.reset() - for _ in range(100): - # we need to convert the observation dict - obs = ObsDictWrapper.convert_dict(obs) - action, _ = model.model.predict(obs) - obs, reward, done, _ = env.step(action) + obs = env.reset() + for _ in range(100): + # we need to convert the observation dict + obs = ObsDictWrapper.convert_dict(obs) + action, _ = model.model.predict(obs) + obs, reward, done, _ = env.step(action) - if done: - obs = env.reset() + if done: + obs = env.reset() Parameters @@ -90,7 +90,7 @@ Goal Selection Strategies .. autoclass:: GoalSelectionStrategy :members: :inherited-members: - :undoc-members: + :undoc-members: Obs Dict Wrapper @@ -99,7 +99,7 @@ Obs Dict Wrapper .. 
autoclass:: ObsDictWrapper :members: :inherited-members: - :undoc-members: + :undoc-members: HER Replay Buffer From aaa80c820cee9564c5ca43e8739d7f236ac9455e Mon Sep 17 00:00:00 2001 From: Antonin RAFFIN Date: Wed, 7 Oct 2020 11:10:19 +0200 Subject: [PATCH 48/81] Cleanup docstrings --- docs/misc/changelog.rst | 4 +- .../common/vec_env/obs_dict_wrapper.py | 10 ++-- stable_baselines3/her/her.py | 42 +++++++-------- stable_baselines3/her/her_replay_buffer.py | 54 +++++++++---------- 4 files changed, 55 insertions(+), 55 deletions(-) diff --git a/docs/misc/changelog.rst b/docs/misc/changelog.rst index c10a50d15e..c922e9291b 100644 --- a/docs/misc/changelog.rst +++ b/docs/misc/changelog.rst @@ -12,6 +12,7 @@ Breaking Changes: New Features: ^^^^^^^^^^^^^ +- Added Hindsight Experience Replay ``HER``. (@megan-klaiber) Bug Fixes: ^^^^^^^^^^ @@ -49,7 +50,6 @@ Breaking Changes: New Features: ^^^^^^^^^^^^^ -- Added Hindsight Experience Replay ``HER``. (@megan-klaiber) - Added ``unwrap_vec_wrapper()`` to ``common.vec_env`` to extract ``VecEnvWrapper`` if needed - Added ``StopTrainingOnMaxEpisodes`` to callback collection (@xicocaio) - Added ``device`` keyword argument to ``BaseAlgorithm.load()`` (@liorcohen5) @@ -452,4 +452,4 @@ And all the contributors: @MarvineGothic @jdossgollin @SyllogismRXS @rusu24edward @jbulow @Antymon @seheevic @justinkterry @edbeeching @flodorner @KuKuXia @NeoExtended @PartiallyTyped @mmcenta @richardwu @kinalmehta @rolandgvc @tkelestemur @mloo3 @tirafesi @blurLake @koulakis @joeljosephjin @shwang @rk37 @andyshih12 @RaphaelWag @xicocaio -@diditforlulz273 @liorcohen5 @ManifoldFR @mloo3 @megan-klaiber \ No newline at end of file +@diditforlulz273 @liorcohen5 @ManifoldFR @mloo3 @megan-klaiber diff --git a/stable_baselines3/common/vec_env/obs_dict_wrapper.py b/stable_baselines3/common/vec_env/obs_dict_wrapper.py index 22fbae4060..e05b30b875 100644 --- a/stable_baselines3/common/vec_env/obs_dict_wrapper.py +++ b/stable_baselines3/common/vec_env/obs_dict_wrapper.py @@ -8,7 +8,7 @@ class ObsDictWrapper(VecEnvWrapper): """ Wrapper for a VecEnv which overrides the observation space for Hindsight Experience Replay to support dict observations. - :param env: (VecEnv) The vectorized environment to wrap. + :param env: The vectorized environment to wrap. """ def __init__(self, venv: VecEnv): @@ -58,9 +58,9 @@ def convert_dict( """ Concatenate observation and (desired) goal of observation dict. - :param observation_dict: (dict) Dictionary with observation. - :param observation_key: (str) Key of observation in dicitonary. - :param goal_key: (str) Key of (desired) goal in dicitonary. - :return: (np.ndarray) + :param observation_dict: Dictionary with observation. + :param observation_key: Key of observation in dicitonary. + :param goal_key: Key of (desired) goal in dicitonary. + :return: """ return np.concatenate([observation_dict[observation_key], observation_dict[goal_key]], axis=-1) diff --git a/stable_baselines3/her/her.py b/stable_baselines3/her/her.py index 49539c8f38..590106d9b7 100644 --- a/stable_baselines3/her/her.py +++ b/stable_baselines3/her/her.py @@ -24,9 +24,9 @@ def get_time_limit(env: VecEnv, current_max_episode_length: Optional[int]) -> in """ Get time limit from environment. - :param env: (VecEnv) Environment from which we want to get the time limit. - :param current_max_episode_length: (int) Current value for max_episode_length. - :return: (int) max episode length + :param env: Environment from which we want to get the time limit. 
+ :param current_max_episode_length: Current value for max_episode_length. + :return: max episode length """ # try to get the attribute from environment if current_max_episode_length is None: @@ -49,16 +49,16 @@ class HER(BaseAlgorithm): Paper: https://arxiv.org/abs/1707.01495 - :param policy: (BasePolicy or str) The policy model to use. - :param env: (GymEnv or str) The environment to learn from (if registered in Gym, can be str) - :param model_class: (OffPolicyAlgorithm) Off policy model which will be used with hindsight experience replay. (SAC, TD3) - :param n_sampled_goal: (int) Number of sampled goals for replay. (offline sampling) - :param goal_selection_strategy: (GoalSelectionStrategy or str) Strategy for sampling goals for replay. + :param policy: The policy model to use. + :param env: The environment to learn from (if registered in Gym, can be str) + :param model_class: Off policy model which will be used with hindsight experience replay. (SAC, TD3) + :param n_sampled_goal: Number of sampled goals for replay. (offline sampling) + :param goal_selection_strategy: Strategy for sampling goals for replay. One of ['episode', 'final', 'future', 'random'] - :param online_sampling: (bool) Sample HER transitions online. - :param learning_rate: (float or callable) learning rate for the optimizer, + :param online_sampling: Sample HER transitions online. + :param learning_rate: learning rate for the optimizer, it can be a function of the current progress remaining (from 1 to 0) - :param max_episode_length: (int) The maximum length of an episode. If not specified, + :param max_episode_length: The maximum length of an episode. If not specified, it will be automatically inferred if the environment uses a ``gym.wrappers.TimeLimit`` wrapper """ @@ -215,20 +215,20 @@ def collect_rollouts( """ Collect experiences and store them into a ReplayBuffer. - :param env: (VecEnv) The training environment - :param callback: (BaseCallback) Callback that will be called at each step + :param env: The training environment + :param callback: Callback that will be called at each step (and at the beginning and end of the rollout) - :param n_episodes: (int) Number of episodes to use to collect rollout data + :param n_episodes: Number of episodes to use to collect rollout data You can also specify a ``n_steps`` instead - :param n_steps: (int) Number of steps to use to collect rollout data + :param n_steps: Number of steps to use to collect rollout data You can also specify a ``n_episodes`` instead. - :param action_noise: (Optional[ActionNoise]) Action noise that will be used for exploration + :param action_noise: Action noise that will be used for exploration Required for deterministic policy (e.g. TD3). This can also be used in addition to the stochastic policy for SAC. - :param learning_starts: (int) Number of steps before learning for the warm-up phase. - :param replay_buffer: (ReplayBuffer or HerReplayBuffer) - :param log_interval: (int) Log data every ``log_interval`` episodes - :return: (RolloutReturn) + :param learning_starts: Number of steps before learning for the warm-up phase. + :param replay_buffer: + :param log_interval: Log data every ``log_interval`` episodes + :return: """ episode_rewards, total_timesteps = [], [] @@ -401,7 +401,7 @@ def save( """ Save all the attributes of the object and the model parameters in a zip-file. 
- :param path: (Union[str, pathlib.Path, io.BufferedIOBase]) path to the file where the rl agent should be saved + :param path: path to the file where the rl agent should be saved :param exclude: name of parameters that should be excluded in addition to the default one :param include: name of parameters that might be excluded but should be included anyway """ diff --git a/stable_baselines3/her/her_replay_buffer.py b/stable_baselines3/her/her_replay_buffer.py index 86d26f8f7b..381f574e89 100644 --- a/stable_baselines3/her/her_replay_buffer.py +++ b/stable_baselines3/her/her_replay_buffer.py @@ -17,16 +17,16 @@ class HerReplayBuffer(BaseBuffer): Replay Buffer for sampling HER (Hindsight Experience Replay) transitions online. These transitions will not be saved in the Buffer. - :param env: (VecEnv) The training environment - :param buffer_size: (int) The size of the buffer measured in transitions. - :param max_episode_length: (int) The length of an episode. (time horizon) - :param goal_selection_strategy: (GoalSelectionStrategy ) Strategy for sampling goals for replay. + :param env: The training environment + :param buffer_size: The size of the buffer measured in transitions. + :param max_episode_length: The length of an episode. (time horizon) + :param goal_selection_strategy: Strategy for sampling goals for replay. One of ['episode', 'final', 'future', 'random'] - :param observation_space: (spaces.Space) Observation space - :param action_space: (spaces.Space) Action space - :param device: (Union[th.device, str]) PyTorch device + :param observation_space: Observation space + :param action_space: Action space + :param device: PyTorch device to which the values will be converted - :param n_envs: (int) Number of parallel environments + :param n_envs: Number of parallel environments :her_ratio: (float) The ratio between HER replays and regular replays in percent (between 0 and 1, for online sampling) """ @@ -87,13 +87,13 @@ def sample( replay_observations: np.ndarray = None, ) -> Union[ReplayBufferSamples, Tuple]: """ - :param batch_size: (int) Number of element to sample - :param env: (Optional[VecNormalize]) associated gym VecEnv + :param batch_size: Number of element to sample + :param env: associated gym VecEnv to normalize the observations/rewards when sampling - :param online_sampling: (bool) Using online_sampling for HER or not. - :param n_sampled_goal: (int) Number of sampled goals for replay. (offline sampling) - :param replay_observations: (np.ndarray) Observations of the offline replay buffer. Needed for 'RANDOM' goal strategy. - :return: (ReplayBufferSamples or Tuple) + :param online_sampling: Using online_sampling for HER or not. + :param n_sampled_goal: Number of sampled goals for replay. (offline sampling) + :param replay_observations: Observations of the offline replay buffer. Needed for 'RANDOM' goal strategy. + :return: """ return self._sample_transitions(batch_size, env, online_sampling, n_sampled_goal, replay_observations) @@ -109,12 +109,12 @@ def vectorized_sample_goal( Sample goals based on goal_selection_strategy. This is a vectorized (fast) version. - :param episode_indices: (np.ndarray) Episode indices to use. - :param her_indices: (np.ndarray) HER indices. - :param transitions_indices: (np.ndarray) Transition indices to use. - :param online_sampling: (bool) Using online_sampling for HER or not. - :param replay_observations: (np.ndarray) Observations of the offline replay buffer. Needed for 'RANDOM' goal strategy. - :return: (np.ndarray) Return sampled goals. 
+ :param episode_indices: Episode indices to use. + :param her_indices: HER indices. + :param transitions_indices: Transition indices to use. + :param online_sampling: Using online_sampling for HER or not. + :param replay_observations: Observations of the offline replay buffer. Needed for 'RANDOM' goal strategy. + :return: Return sampled goals. """ her_episode_indices = episode_indices[her_indices] @@ -159,13 +159,13 @@ def _sample_transitions( replay_observations: np.ndarray = None, ) -> Union[ReplayBufferSamples, Tuple]: """ - :param batch_size: (int) Number of element to sample - :param env: (Optional[VecNormalize]) associated gym VecEnv + :param batch_size: Number of element to sample + :param env: associated gym VecEnv to normalize the observations/rewards when sampling - :param online_sampling: (bool) Using online_sampling for HER or not. - :param n_sampled_goal: (int) Number of sampled goals for replay. (offline sampling) - :param replay_observations: (np.ndarray) Observations of the offline replay buffer. Needed for 'RANDOM' goal strategy. - :return: (ReplayBufferSamples or Tuple) + :param online_sampling: Using online_sampling for HER or not. + :param n_sampled_goal: Number of sampled goals for replay. (offline sampling) + :param replay_observations: Observations of the offline replay buffer. Needed for 'RANDOM' goal strategy. + :return: """ # Select which episodes to use if online_sampling: @@ -288,6 +288,6 @@ def clear_buffer(self): def size(self) -> int: """ - :return: (int) The current size of the buffer in transitions. + :return: The current size of the buffer in transitions. """ return int(np.sum(self.episode_lengths)) From 362ea5c627f6af960ca346e644926966b3768d9f Mon Sep 17 00:00:00 2001 From: Megan Klaiber Date: Thu, 8 Oct 2020 10:46:20 +0200 Subject: [PATCH 49/81] Updated to review comments --- docs/modules/her.rst | 10 +--- stable_baselines3/common/base_class.py | 2 +- stable_baselines3/common/buffers.py | 2 + .../common/off_policy_algorithm.py | 7 +++ stable_baselines3/her/her.py | 57 ++++++++++++++----- stable_baselines3/her/her_replay_buffer.py | 44 ++++++++------ tests/test_her.py | 47 ++++++--------- 7 files changed, 96 insertions(+), 73 deletions(-) diff --git a/docs/modules/her.rst b/docs/modules/her.rst index 11f01fc453..8f8c8f36c6 100644 --- a/docs/modules/her.rst +++ b/docs/modules/her.rst @@ -35,8 +35,7 @@ Example .. 
code-block:: python - from stable_baselines3 import DDPG, DQN, SAC, TD3 - from stable_baselines3.her.her import HER + from stable_baselines3 import HER, DDPG, DQN, SAC, TD3 from stable_baselines3.her.goal_selection_strategy import GoalSelectionStrategy from stable_baselines3.common.bit_flipping_env import BitFlippingEnv from stable_baselines3.common.vec_env import DummyVecEnv @@ -62,16 +61,11 @@ Example model.learn(1000) model.save("./her_bit_env") - - # WARNING: you must pass an VecEnv - env = DummyVecEnv([lambda: env]) model = HER.load('./her_bit_env', env=env) obs = env.reset() for _ in range(100): - # we need to convert the observation dict - obs = ObsDictWrapper.convert_dict(obs) - action, _ = model.model.predict(obs) + action, _ = model.model.predict(obs, deterministic=True) obs, reward, done, _ = env.step(action) if done: diff --git a/stable_baselines3/common/base_class.py b/stable_baselines3/common/base_class.py index 667a1c2155..97d0035599 100644 --- a/stable_baselines3/common/base_class.py +++ b/stable_baselines3/common/base_class.py @@ -173,7 +173,7 @@ def _wrap_env(self, env: GymEnv) -> VecEnv: print("Wrapping the env in a VecTransposeImage.") env = VecTransposeImage(env) - # check if wrapper for dict support is needed + # check if wrapper for dict support is needed when using HER if isinstance(env.observation_space, gym.spaces.dict.Dict): env = ObsDictWrapper(env) diff --git a/stable_baselines3/common/buffers.py b/stable_baselines3/common/buffers.py index 2063d6c302..bd07069854 100644 --- a/stable_baselines3/common/buffers.py +++ b/stable_baselines3/common/buffers.py @@ -83,6 +83,8 @@ def extend(self, *args, **kwargs) -> None: """ # Do a for loop along the batch axis for data in zip(*args): + # import ipdb + # ipdb.set_trace() self.add(*data) def reset(self) -> None: diff --git a/stable_baselines3/common/off_policy_algorithm.py b/stable_baselines3/common/off_policy_algorithm.py index cf08b4444e..e3ffeb61f0 100644 --- a/stable_baselines3/common/off_policy_algorithm.py +++ b/stable_baselines3/common/off_policy_algorithm.py @@ -67,6 +67,8 @@ class OffPolicyAlgorithm(BaseAlgorithm): :param use_sde_at_warmup: Whether to use gSDE instead of uniform sampling during the warm up phase (before learning starts) :param sde_support: Whether the model support gSDE or not + :param remove_time_limit_termination: Remove terminations (dones) that are due to time limit. + See https://github.com/hill-a/stable-baselines/issues/863 """ def __init__( @@ -97,6 +99,7 @@ def __init__( sde_sample_freq: int = -1, use_sde_at_warmup: bool = False, sde_support: bool = True, + remove_time_limit_termination: bool = False, ): super(OffPolicyAlgorithm, self).__init__( @@ -126,6 +129,10 @@ def __init__( self.action_noise = action_noise self.optimize_memory_usage = optimize_memory_usage + # Remove terminations (dones) that are due to time limit + # see https://github.com/hill-a/stable-baselines/issues/863 + self.remove_time_limit_termination = remove_time_limit_termination + if train_freq > 0 and n_episodes_rollout > 0: warnings.warn( "You passed a positive value for `train_freq` and `n_episodes_rollout`." 
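The new ``remove_time_limit_termination`` flag relies on the ``TimeLimit.truncated`` key that gym's ``TimeLimit`` wrapper writes into ``info`` when an episode ends only because the step limit was reached. A minimal, standalone illustration of that signal (the environment id and the 50-step limit are arbitrary examples, not part of the patch):

import gym

# Any environment works; Pendulum-v0 and max_episode_steps=50 are just examples.
env = gym.wrappers.TimeLimit(gym.make("Pendulum-v0").unwrapped, max_episode_steps=50)

obs = env.reset()
done = False
while not done:
    obs, reward, done, info = env.step(env.action_space.sample())

truncated = info.get("TimeLimit.truncated", False)
# A truncated episode is not a "real" termination: the value of the next state
# should still be bootstrapped, which is what the flag is meant to preserve.
true_done = done and not truncated
print(done, truncated, true_done)

The HER rollout collection in the her.py hunk that follows applies the same check to decide which done signal and next observation to store.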
diff --git a/stable_baselines3/her/her.py b/stable_baselines3/her/her.py index 590106d9b7..c1565137b8 100644 --- a/stable_baselines3/her/her.py +++ b/stable_baselines3/her/her.py @@ -1,6 +1,6 @@ import io import pathlib -from typing import Callable, Iterable, List, Optional, Tuple, Type, Union +from typing import Any, Callable, Iterable, List, Optional, Tuple, Type, Union import gym import numpy as np @@ -11,10 +11,11 @@ from stable_baselines3.common.noise import ActionNoise from stable_baselines3.common.off_policy_algorithm import OffPolicyAlgorithm from stable_baselines3.common.policies import BasePolicy +from stable_baselines3.common.preprocessing import is_image_space from stable_baselines3.common.save_util import load_from_zip_file, recursive_getattr, recursive_setattr from stable_baselines3.common.type_aliases import GymEnv, MaybeCallback, RolloutReturn from stable_baselines3.common.utils import check_for_correct_spaces -from stable_baselines3.common.vec_env import VecEnv +from stable_baselines3.common.vec_env import DummyVecEnv, VecEnv, VecTransposeImage from stable_baselines3.common.vec_env.obs_dict_wrapper import ObsDictWrapper from stable_baselines3.her.goal_selection_strategy import KEY_TO_GOAL_STRATEGY, GoalSelectionStrategy from stable_baselines3.her.her_replay_buffer import HerReplayBuffer @@ -43,6 +44,7 @@ def get_time_limit(env: VecEnv, current_max_episode_length: Optional[int]) -> in return current_max_episode_length +# TODO: rewrite HER class as soon as dict obs are supported class HER(BaseAlgorithm): """ Hindsight Experience Replay (HER) @@ -51,7 +53,7 @@ class HER(BaseAlgorithm): :param policy: The policy model to use. :param env: The environment to learn from (if registered in Gym, can be str) - :param model_class: Off policy model which will be used with hindsight experience replay. (SAC, TD3) + :param model_class: Off policy model which will be used with hindsight experience replay. (SAC, TD3, DDPG, DQN) :param n_sampled_goal: Number of sampled goals for replay. (offline sampling) :param goal_selection_strategy: Strategy for sampling goals for replay. 
One of ['episode', 'final', 'future', 'random'] @@ -67,7 +69,7 @@ def __init__( policy: Union[str, Type[BasePolicy]], env: Union[GymEnv, str], model_class: Type[OffPolicyAlgorithm], - n_sampled_goal: int = 5, + n_sampled_goal: int = 4, goal_selection_strategy: Union[GoalSelectionStrategy, str] = "future", online_sampling: bool = False, learning_rate: Union[float, Callable] = 3e-4, @@ -105,6 +107,7 @@ def __init__( self.n_sampled_goal = n_sampled_goal # if we sample her transitions online use custom replay buffer self.online_sampling = online_sampling + # compute ratio between HER replays and regular replays in percent for online HER sampling self.her_ratio = 1 - (1.0 / (self.n_sampled_goal + 1)) # maximum steps in episode self.max_episode_length = get_time_limit(self.env, max_episode_length) @@ -263,8 +266,6 @@ def collect_rollouts( # Perform action new_obs, reward, done, infos = env.step(action) - done = done if episode_timesteps < self.max_episode_length else False - self.num_timesteps += 1 self.model.num_timesteps = self.num_timesteps episode_timesteps += 1 @@ -292,16 +293,30 @@ def collect_rollouts( self._last_original_obs, new_obs_, reward_ = observation, new_obs, reward self.model._last_original_obs = self._last_original_obs + # Remove termination signal due to timelimit if needed + # NOTE: this may cause issue when using memory optimized replay + # or n-step replay + if self.remove_time_limit_termination and infos[0].get("TimeLimit.truncated", False): + done_ = np.array([False]) + # As the VecEnv resets automatically, new_obs is already the + # first observation of the next episode + next_obs = infos[0]["terminal_observation"] + if self._vec_normalize_env is not None: + next_obs = self._vec_normalize_env.unnormalize_obs(next_obs) + else: + done_ = done + next_obs = new_obs_ + if self.online_sampling: - self.replay_buffer.add(self._last_original_obs, new_obs_, buffer_action, reward_, done, infos) + self.replay_buffer.add(self._last_original_obs, next_obs, buffer_action, reward_, done_, infos) else: # concatenate observation with (desired) goal obs = ObsDictWrapper.convert_dict(self._last_original_obs) - next_obs = ObsDictWrapper.convert_dict(new_obs_) - # add to replay bufffer - self.replay_buffer.add(obs, next_obs, buffer_action, reward_, done) + next_obs_ = ObsDictWrapper.convert_dict(next_obs) + # add to replay buffer + self.replay_buffer.add(obs, next_obs_, buffer_action, reward_, done_) # add current transition to episode storage - self._episode_storage.add(self._last_original_obs, new_obs_, buffer_action, reward_, done, infos) + self._episode_storage.add(self._last_original_obs, next_obs, buffer_action, reward_, done_, infos) self._last_obs = new_obs self.model._last_obs = self._last_obs @@ -370,8 +385,9 @@ def _store_transitions(self) -> None: self.replay_buffer.observations, ) - # TODO done = False? # store data in replay buffer + # self.replay_buffer.extend(observations, next_observations, transitions["action"], transitions["reward"], np.array([False])) + for i in her_indices: obs = observations[i] next_obs = next_observations[i] @@ -380,14 +396,14 @@ def _store_transitions(self) -> None: done = np.array([False]) self.replay_buffer.add(obs, next_obs, buffer_action, reward, done) - def __getattr__(self, item): + def __getattr__(self, item: str) -> Any: """ Find attribute from model class if this class does not have it. 
""" if hasattr(self.model, item): return getattr(self.model, item) else: - raise AttributeError + raise AttributeError(f"{self} has no attribute {item}") def _get_torch_save_params(self) -> Tuple[List[str], List[str]]: return self.model._get_torch_save_params() @@ -444,11 +460,22 @@ def load(cls, load_path: str, env: Optional[GymEnv] = None, **kwargs) -> "BaseAl # check if given env is valid if env is not None: # check if wrapper for dict support is needed + # if isinstance(env.observation_space, gym.spaces.dict.Dict): + # env = ObsDictWrapper(env) + + if not isinstance(env, VecEnv): + env = DummyVecEnv([lambda: env]) + + if is_image_space(env.observation_space) and not isinstance(env, VecTransposeImage): + env = VecTransposeImage(env) + + # check if wrapper for dict support when using HER is needed if isinstance(env.observation_space, gym.spaces.dict.Dict): env = ObsDictWrapper(env) + check_for_correct_spaces(env, data["observation_space"], data["action_space"]) # if no new env was given use stored env if possible - if env is None and "env" in data: + if env is None: env = data["env"] kwargs = {} diff --git a/stable_baselines3/her/her_replay_buffer.py b/stable_baselines3/her/her_replay_buffer.py index 381f574e89..4c9f017c75 100644 --- a/stable_baselines3/her/her_replay_buffer.py +++ b/stable_baselines3/her/her_replay_buffer.py @@ -6,7 +6,7 @@ from gym import spaces from stable_baselines3.common.buffers import BaseBuffer -from stable_baselines3.common.type_aliases import ReplayBufferSamples +from stable_baselines3.common.type_aliases import ReplayBufferSamples, RolloutBufferSamples from stable_baselines3.common.vec_env import VecNormalize from stable_baselines3.common.vec_env.obs_dict_wrapper import ObsDictWrapper from stable_baselines3.her.goal_selection_strategy import GoalSelectionStrategy @@ -27,7 +27,7 @@ class HerReplayBuffer(BaseBuffer): :param device: PyTorch device to which the values will be converted :param n_envs: Number of parallel environments - :her_ratio: (float) The ratio between HER replays and regular replays in percent (between 0 and 1, for online sampling) + :her_ratio: The ratio between HER replays and regular replays in percent (between 0 and 1, for online sampling) """ def __init__( @@ -40,7 +40,7 @@ def __init__( action_space: spaces.Space, device: Union[th.device, str] = "cpu", n_envs: int = 1, - her_ratio: float = 0.6, + her_ratio: float = 0.8, ): super(HerReplayBuffer, self).__init__(buffer_size, observation_space, action_space, device, n_envs) @@ -78,6 +78,11 @@ def __init__( # percentage of her indices self.her_ratio = her_ratio + def _get_samples( + self, batch_inds: np.ndarray, env: Optional[VecNormalize] = None + ) -> Union[ReplayBufferSamples, RolloutBufferSamples]: + pass + def sample( self, batch_size: int, @@ -88,16 +93,16 @@ def sample( ) -> Union[ReplayBufferSamples, Tuple]: """ :param batch_size: Number of element to sample - :param env: associated gym VecEnv + :param env: Associated gym VecEnv to normalize the observations/rewards when sampling :param online_sampling: Using online_sampling for HER or not. :param n_sampled_goal: Number of sampled goals for replay. (offline sampling) :param replay_observations: Observations of the offline replay buffer. Needed for 'RANDOM' goal strategy. - :return: + :return: Samples. 
""" return self._sample_transitions(batch_size, env, online_sampling, n_sampled_goal, replay_observations) - def vectorized_sample_goal( + def sample_goal( self, episode_indices: np.ndarray, her_indices: np.ndarray, @@ -165,7 +170,7 @@ def _sample_transitions( :param online_sampling: Using online_sampling for HER or not. :param n_sampled_goal: Number of sampled goals for replay. (offline sampling) :param replay_observations: Observations of the offline replay buffer. Needed for 'RANDOM' goal strategy. - :return: + :return: Samples. """ # Select which episodes to use if online_sampling: @@ -176,17 +181,23 @@ def _sample_transitions( episode_length = self.episode_lengths[0] episode_indices = np.array(list(range(self.n_episodes_stored)) * episode_length * n_sampled_goal) her_indices = np.arange(len(episode_indices)) + ep_length = self.episode_lengths[episode_indices] # repeat every transition index n_sampled_goals times transitions_indices = np.array(list(range(episode_length)) * n_sampled_goal) if self.goal_selection_strategy == GoalSelectionStrategy.FUTURE: # restrict the sampling domain when ep_length > 1 # otherwise filter out the indices + if online_sampling: her_indices = her_indices[ep_length[her_indices] > 1] ep_length[her_indices] -= 1 else: her_indices = her_indices[episode_length > 1 and transitions_indices < episode_length - 1] + """ + her_indices = her_indices[ep_length[her_indices] > 1] + ep_length[her_indices] -= 1 + """ if online_sampling: # Select which transitions to use @@ -194,13 +205,10 @@ def _sample_transitions( # get selected transitions transitions = {key: self.buffer[key][episode_indices, transitions_indices].copy() for key in self.buffer.keys()} - new_goals = self.vectorized_sample_goal( - episode_indices, her_indices, transitions_indices, online_sampling, replay_observations - ) + new_goals = self.sample_goal(episode_indices, her_indices, transitions_indices, online_sampling, replay_observations) transitions["desired_goal"][her_indices] = new_goals - # Convert to numpy array - # TODO: disable if not needed for faster computation + # Convert info buffer to numpy array transitions["info"] = np.array( [ self.info_buffer[episode_idx][transition_idx] @@ -209,11 +217,11 @@ def _sample_transitions( ) # Vectorized computation - transitions["reward"][her_indices] = self.env.env_method( + transitions["reward"][her_indices, 0] = self.env.env_method( "compute_reward", - transitions["next_achieved_goal"][her_indices], - transitions["desired_goal"][her_indices], - transitions["info"][her_indices], + transitions["next_achieved_goal"][her_indices, 0], + transitions["desired_goal"][her_indices, 0], + transitions["info"][her_indices, 0], ) # concatenate observation with (desired) goal @@ -222,9 +230,9 @@ def _sample_transitions( if online_sampling: data = ( - self._normalize_obs(observations, env), + self._normalize_obs(observations[:, 0], env), transitions["action"], - self._normalize_obs(next_observations, env), + self._normalize_obs(next_observations[:, 0], env), transitions["done"], self._normalize_reward(transitions["reward"], env), ) diff --git a/tests/test_her.py b/tests/test_her.py index b99fdea87b..5ffc38288b 100644 --- a/tests/test_her.py +++ b/tests/test_her.py @@ -5,27 +5,23 @@ import pytest import torch as th -from stable_baselines3 import DDPG, DQN, SAC, TD3 +from stable_baselines3 import DDPG, DQN, HER, SAC, TD3 from stable_baselines3.common.bit_flipping_env import BitFlippingEnv -from stable_baselines3.common.noise import OrnsteinUhlenbeckActionNoise -from 
stable_baselines3.common.vec_env import DummyVecEnv from stable_baselines3.common.vec_env.obs_dict_wrapper import ObsDictWrapper from stable_baselines3.her.goal_selection_strategy import GoalSelectionStrategy -from stable_baselines3.her.her import HER -@pytest.mark.parametrize("model_class, policy", [(SAC, "MlpPolicy"), (TD3, "MlpPolicy"), (DDPG, "MlpPolicy")]) +@pytest.mark.parametrize("model_class", [SAC, TD3, DDPG, DQN]) @pytest.mark.parametrize("online_sampling", [True, False]) -def test_her(model_class, policy, online_sampling): +def test_her(model_class, online_sampling): """ Test Hindsight Experience Replay. """ n_bits = 4 - env = BitFlippingEnv(n_bits=n_bits, continuous=True) - env = DummyVecEnv([lambda: env]) + env = BitFlippingEnv(n_bits=n_bits, continuous=not (model_class == DQN)) model = HER( - policy, + "MlpPolicy", env, model_class, goal_selection_strategy="future", @@ -34,9 +30,10 @@ def test_her(model_class, policy, online_sampling): train_freq=1, n_episodes_rollout=-1, max_episode_length=n_bits, + policy_kwargs=dict(net_arch=[64]), ) - model.learn(total_timesteps=500, callback=None) + model.learn(total_timesteps=500) @pytest.mark.parametrize( @@ -58,7 +55,6 @@ def test_goal_selection_strategy(goal_selection_strategy, online_sampling): Test different goal strategies. """ env = BitFlippingEnv(continuous=True) - env = DummyVecEnv([lambda: env]) model = HER( "MlpPolicy", @@ -70,13 +66,14 @@ def test_goal_selection_strategy(goal_selection_strategy, online_sampling): train_freq=1, n_episodes_rollout=-1, max_episode_length=10, + policy_kwargs=dict(net_arch=[64]), ) - model.learn(total_timesteps=200, callback=None) + model.learn(total_timesteps=200) -@pytest.mark.parametrize("model_class, policy", [(SAC, "MlpPolicy"), (TD3, "MlpPolicy"), (DDPG, "MlpPolicy")]) +@pytest.mark.parametrize("model_class", [SAC, TD3, DDPG, DQN]) @pytest.mark.parametrize("use_sde", [False, True]) -def test_save_load(tmp_path, model_class, policy, use_sde): +def test_save_load(tmp_path, model_class, use_sde): """ Test if 'save' and 'load' saves and loads model correctly """ @@ -84,29 +81,18 @@ def test_save_load(tmp_path, model_class, policy, use_sde): pytest.skip("Only SAC has gSDE support") n_bits = 4 - env = BitFlippingEnv(n_bits=n_bits, continuous=True) - env = DummyVecEnv([lambda: env]) - - # Create action noise - n_actions = env.action_space.shape[0] - action_noise = OrnsteinUhlenbeckActionNoise( - np.zeros( - n_actions, - ), - 0.2 * np.ones((n_actions,)), - ) + env = BitFlippingEnv(n_bits=n_bits, continuous=not (model_class == DQN)) kwargs = dict(use_sde=True) if use_sde else {} # create model model = HER( - policy, + "MlpPolicy", env, model_class, n_sampled_goal=5, goal_selection_strategy="future", online_sampling=True, - action_noise=action_noise, verbose=0, tau=0.05, batch_size=128, @@ -121,17 +107,16 @@ def test_save_load(tmp_path, model_class, policy, use_sde): **kwargs ) - model.learn(total_timesteps=500, callback=None) + model.learn(total_timesteps=500) env.reset() observations_list = [] for _ in range(10): - obs = env.step([env.action_space.sample()])[0] + obs = env.step(env.action_space.sample())[0] observation = ObsDictWrapper.convert_dict(obs) observations_list.append(observation) - - observations = np.concatenate(observations_list, axis=0) + observations = np.array(observations_list) # Get dictionary of current parameters params = deepcopy(model.model.policy.state_dict()) From 7f8b63617e97a921d1abbc38ce7dfdc2a1f08c0b Mon Sep 17 00:00:00 2001 From: Antonin RAFFIN Date: Mon, 12 Oct 2020 
10:07:39 +0200 Subject: [PATCH 50/81] Fix pytype --- stable_baselines3/her/her_replay_buffer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/stable_baselines3/her/her_replay_buffer.py b/stable_baselines3/her/her_replay_buffer.py index 4c9f017c75..f5a2f3e339 100644 --- a/stable_baselines3/her/her_replay_buffer.py +++ b/stable_baselines3/her/her_replay_buffer.py @@ -81,7 +81,7 @@ def __init__( def _get_samples( self, batch_inds: np.ndarray, env: Optional[VecNormalize] = None ) -> Union[ReplayBufferSamples, RolloutBufferSamples]: - pass + raise NotImplementedError() def sample( self, From 39a63b8a6428d8b6624ddd5331efb5fdaf034a9a Mon Sep 17 00:00:00 2001 From: Megan Klaiber Date: Wed, 14 Oct 2020 00:00:30 +0200 Subject: [PATCH 51/81] Update according to review comments. --- stable_baselines3/common/buffers.py | 2 - .../common/vec_env/obs_dict_wrapper.py | 6 +- stable_baselines3/her/her.py | 101 +++++++++--------- stable_baselines3/her/her_replay_buffer.py | 16 ++- 4 files changed, 58 insertions(+), 67 deletions(-) diff --git a/stable_baselines3/common/buffers.py b/stable_baselines3/common/buffers.py index bd07069854..2063d6c302 100644 --- a/stable_baselines3/common/buffers.py +++ b/stable_baselines3/common/buffers.py @@ -83,8 +83,6 @@ def extend(self, *args, **kwargs) -> None: """ # Do a for loop along the batch axis for data in zip(*args): - # import ipdb - # ipdb.set_trace() self.add(*data) def reset(self) -> None: diff --git a/stable_baselines3/common/vec_env/obs_dict_wrapper.py b/stable_baselines3/common/vec_env/obs_dict_wrapper.py index e05b30b875..5b1dd1a106 100644 --- a/stable_baselines3/common/vec_env/obs_dict_wrapper.py +++ b/stable_baselines3/common/vec_env/obs_dict_wrapper.py @@ -1,3 +1,5 @@ +from typing import Dict + import numpy as np from gym import spaces @@ -53,7 +55,7 @@ def step_wait(self): @staticmethod def convert_dict( - observation_dict: dict, observation_key: str = "observation", goal_key: str = "desired_goal" + observation_dict: Dict[str, np.ndarray], observation_key: str = "observation", goal_key: str = "desired_goal" ) -> np.ndarray: """ Concatenate observation and (desired) goal of observation dict. @@ -61,6 +63,6 @@ def convert_dict( :param observation_dict: Dictionary with observation. :param observation_key: Key of observation in dicitonary. :param goal_key: Key of (desired) goal in dicitonary. - :return: + :return: Concatenated observation. """ return np.concatenate([observation_dict[observation_key], observation_dict[goal_key]], axis=-1) diff --git a/stable_baselines3/her/her.py b/stable_baselines3/her/her.py index c1565137b8..e495415b03 100644 --- a/stable_baselines3/her/her.py +++ b/stable_baselines3/her/her.py @@ -4,6 +4,7 @@ import gym import numpy as np +import torch as th from stable_baselines3.common.base_class import BaseAlgorithm from stable_baselines3.common.buffers import ReplayBuffer @@ -48,9 +49,12 @@ def get_time_limit(env: VecEnv, current_max_episode_length: Optional[int]) -> in class HER(BaseAlgorithm): """ Hindsight Experience Replay (HER) - Paper: https://arxiv.org/abs/1707.01495 + WARNING: Requires maximum episode length provided either by the environment or by the user! + + For additional offline algorithm specific arguments please have a look at the corresponding documentation. + :param policy: The policy model to use. :param env: The environment to learn from (if registered in Gym, can be str) :param model_class: Off policy model which will be used with hindsight experience replay. 
(SAC, TD3, DDPG, DQN) @@ -61,7 +65,7 @@ class HER(BaseAlgorithm): :param learning_rate: learning rate for the optimizer, it can be a function of the current progress remaining (from 1 to 0) :param max_episode_length: The maximum length of an episode. If not specified, - it will be automatically inferred if the environment uses a ``gym.wrappers.TimeLimit`` wrapper + it will be automatically inferred if the environment uses a ``gym.wrappers.TimeLimit`` wrapper. """ def __init__( @@ -72,20 +76,18 @@ def __init__( n_sampled_goal: int = 4, goal_selection_strategy: Union[GoalSelectionStrategy, str] = "future", online_sampling: bool = False, - learning_rate: Union[float, Callable] = 3e-4, max_episode_length: Optional[int] = None, *args, **kwargs, ): - super(HER, self).__init__(policy=BasePolicy, env=env, policy_base=BasePolicy, learning_rate=learning_rate) + super(HER, self).__init__(policy=BasePolicy, env=env, policy_base=BasePolicy, learning_rate=3e-4) # model initialization self.model_class = model_class self.model = model_class( policy=policy, env=self.env, - learning_rate=learning_rate, *args, **kwargs, # pytype: disable=wrong-keyword-args ) @@ -114,7 +116,7 @@ def __init__( # storage for transitions of current episode self._episode_storage = HerReplayBuffer( self.env, - self.max_episode_length, + self.buffer_size, self.max_episode_length, self.goal_selection_strategy, self.env.observation_space, @@ -124,20 +126,12 @@ def __init__( self.her_ratio, # pytype: disable=wrong-arg-types ) + # assign episode storage to replay buffer when using online HER sampling + if self.online_sampling: + self.model.replay_buffer = self._episode_storage + # counter for steps in episode self.episode_steps = 0 - if self.online_sampling: - self.model.replay_buffer = HerReplayBuffer( - self.env, - self.buffer_size, - self.max_episode_length, - self.goal_selection_strategy, - self.env.observation_space, - self.env.action_space, - self.device, - self.n_envs, - self.her_ratio, # pytype: disable=wrong-arg-types - ) def _setup_model(self) -> None: self.model._setup_model() @@ -338,15 +332,15 @@ def collect_rollouts( if 0 < n_steps <= total_steps: break - if done or self.episode_steps == self.max_episode_length: + if done or self.episode_steps >= self.max_episode_length: if self.online_sampling: self.replay_buffer.store_episode() else: self._episode_storage.store_episode() # store episode in replay buffer self._store_transitions() - # clear storage for current episode - self._episode_storage.reset() + # clear storage for current episode + self._episode_storage.reset() total_episodes += 1 self._episode_num += 1 @@ -361,8 +355,6 @@ def collect_rollouts( if log_interval is not None and self._episode_num % log_interval == 0: self._dump_logs() - # reset if done or episode length is reached - self.env.reset() self.episode_steps = 0 mean_reward = np.mean(episode_rewards) if total_episodes > 0 else 0.0 @@ -377,7 +369,7 @@ def _store_transitions(self) -> None: """ # sample goals and get new observations - observations, next_observations, transitions, her_indices = self._episode_storage.sample( + observations, next_observations, transitions = self._episode_storage.sample( self.batch_size, self.env, self.online_sampling, @@ -386,15 +378,8 @@ def _store_transitions(self) -> None: ) # store data in replay buffer - # self.replay_buffer.extend(observations, next_observations, transitions["action"], transitions["reward"], np.array([False])) - - for i in her_indices: - obs = observations[i] - next_obs = next_observations[i] - 
buffer_action = transitions["action"][i] - reward = transitions["reward"][i] - done = np.array([False]) - self.replay_buffer.add(obs, next_obs, buffer_action, reward, done) + dones = np.zeros((len(observations)), dtype=bool) + self.replay_buffer.extend(observations, next_observations, transitions["action"], transitions["reward"], dones) def __getattr__(self, item: str) -> Any: """ @@ -429,24 +414,37 @@ def save( self.model.model_class = self.model_class self.model.max_episode_length = self.max_episode_length + # exclude episode storage + if exclude is None: + exclude = [] + exclude = ["_episode_storage"].extend(exclude) + self.model.save(path, exclude, include) @classmethod - def load(cls, load_path: str, env: Optional[GymEnv] = None, **kwargs) -> "BaseAlgorithm": + def load( + cls, + path: Union[str, pathlib.Path, io.BufferedIOBase], + env: Optional[GymEnv] = None, + device: Union[th.device, str] = "auto", + **kwargs, + ) -> "BaseAlgorithm": """ Load the model from a zip-file - :param load_path: the location of the saved data + :param path: path to the file (or a file-like) where to + load the agent from :param env: the new environment to run the loaded model on (can be None if you only need prediction from a trained model) has priority over any saved environment + :param device: Device on which the code should run. :param kwargs: extra arguments to change the model when loading """ - data, params, tensors = load_from_zip_file(load_path) + data, params, pytorch_variables = load_from_zip_file(path, device=device) + # Remove stored device information and replace with ours if "policy_kwargs" in data: - for arg_to_remove in ["device"]: - if arg_to_remove in data["policy_kwargs"]: - del data["policy_kwargs"][arg_to_remove] + if "device" in data["policy_kwargs"]: + del data["policy_kwargs"]["device"] if "policy_kwargs" in kwargs and kwargs["policy_kwargs"] != data["policy_kwargs"]: raise ValueError( @@ -457,12 +455,10 @@ def load(cls, load_path: str, env: Optional[GymEnv] = None, **kwargs) -> "BaseAl # check if observation space and action space are part of the saved parameters if "observation_space" not in data or "action_space" not in data: raise KeyError("The observation_space and action_space were not given, can't verify new environments") + # check if given env is valid if env is not None: # check if wrapper for dict support is needed - # if isinstance(env.observation_space, gym.spaces.dict.Dict): - # env = ObsDictWrapper(env) - if not isinstance(env, VecEnv): env = DummyVecEnv([lambda: env]) @@ -475,8 +471,10 @@ def load(cls, load_path: str, env: Optional[GymEnv] = None, **kwargs) -> "BaseAl check_for_correct_spaces(env, data["observation_space"], data["action_space"]) # if no new env was given use stored env if possible - if env is None: - env = data["env"] + else: + # Use stored env, if one exists. 
If not, continue as is (can be used for predict) + if "env" in data: + env = data["env"] kwargs = {} if "use_sde" in data and data["use_sde"]: @@ -490,7 +488,6 @@ def load(cls, load_path: str, env: Optional[GymEnv] = None, **kwargs) -> "BaseAl n_sampled_goal=data["n_sampled_goal"], goal_selection_strategy=data["goal_selection_strategy"], online_sampling=data["online_sampling"], - learning_rate=data["learning_rate"], max_episode_length=data["max_episode_length"], policy_kwargs=data["policy_kwargs"], _init_setup_model=True, # pytype: disable=not-instantiable,wrong-keyword-args @@ -506,14 +503,12 @@ def load(cls, load_path: str, env: Optional[GymEnv] = None, **kwargs) -> "BaseAl her_model._episode_num = her_model.model._episode_num # put state_dicts back in place - for name in params: - attr = recursive_getattr(her_model.model, name) - attr.load_state_dict(params[name]) - - # put tensors back in place - if tensors is not None: - for name in tensors: - recursive_setattr(her_model.model, name, tensors[name]) + her_model.model.set_parameters(params, exact_match=True, device=device) + + # put other pytorch variables back in place + if pytorch_variables is not None: + for name in pytorch_variables: + recursive_setattr(her_model.model, name, pytorch_variables[name]) # Sample gSDE exploration matrix, so it uses the right device # see issue #44 diff --git a/stable_baselines3/her/her_replay_buffer.py b/stable_baselines3/her/her_replay_buffer.py index f5a2f3e339..b4db3b2609 100644 --- a/stable_baselines3/her/her_replay_buffer.py +++ b/stable_baselines3/her/her_replay_buffer.py @@ -14,8 +14,8 @@ class HerReplayBuffer(BaseBuffer): """ - Replay Buffer for sampling HER (Hindsight Experience Replay) transitions online. - These transitions will not be saved in the Buffer. + Replay Buffer for sampling HER (Hindsight Experience Replay) transitions. + In the online sampling case these new transitions will not be saved in the Buffer. :param env: The training environment :param buffer_size: The size of the buffer measured in transitions. 
@@ -148,6 +148,7 @@ def sample_goal( obs = replay_observations[index] # get only the observation part of the state obs_dim = self.env.obs_dim + # get from every observation from first env the observation part (without concatenated desired goal) obs_array = obs[:, :, :obs_dim] return obs_array else: @@ -179,29 +180,24 @@ def _sample_transitions( ep_length = self.episode_lengths[episode_indices] else: episode_length = self.episode_lengths[0] - episode_indices = np.array(list(range(self.n_episodes_stored)) * episode_length * n_sampled_goal) + episode_indices = np.array(list(range(1)) * episode_length * n_sampled_goal) her_indices = np.arange(len(episode_indices)) - ep_length = self.episode_lengths[episode_indices] # repeat every transition index n_sampled_goals times transitions_indices = np.array(list(range(episode_length)) * n_sampled_goal) if self.goal_selection_strategy == GoalSelectionStrategy.FUTURE: # restrict the sampling domain when ep_length > 1 # otherwise filter out the indices - if online_sampling: her_indices = her_indices[ep_length[her_indices] > 1] ep_length[her_indices] -= 1 else: her_indices = her_indices[episode_length > 1 and transitions_indices < episode_length - 1] - """ - her_indices = her_indices[ep_length[her_indices] > 1] - ep_length[her_indices] -= 1 - """ if online_sampling: # Select which transitions to use transitions_indices = np.random.randint(ep_length) + # get selected transitions transitions = {key: self.buffer[key][episode_indices, transitions_indices].copy() for key in self.buffer.keys()} @@ -239,7 +235,7 @@ def _sample_transitions( return ReplayBufferSamples(*tuple(map(self.to_torch, data))) else: - return observations, next_observations, transitions, her_indices + return observations, next_observations, transitions def add( self, From 258deff02019be5eab39d23e5e6dc53e4b4f9e7b Mon Sep 17 00:00:00 2001 From: Megan Klaiber Date: Wed, 14 Oct 2020 15:03:58 +0200 Subject: [PATCH 52/81] Removed random goal strategy. Updated sample transitions. 
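With the 'random' strategy removed, three goal selection strategies remain, and in the offline case every stored transition is duplicated n_sampled_goal times before relabelling. A toy sketch of both ideas on a single stored episode (array contents and sizes are made up for illustration, not taken from the buffer):

import numpy as np

# One stored episode of achieved goals, shape (episode_length, goal_dim); values are made up.
achieved_goals = np.arange(10).reshape(5, 2)
episode_length = len(achieved_goals)
transition_idx = 2  # transition being relabelled

final_goal = achieved_goals[-1]                                                       # FINAL
future_goal = achieved_goals[np.random.randint(transition_idx + 1, episode_length)]   # FUTURE
episode_goal = achieved_goals[np.random.randint(episode_length)]                      # EPISODE

# Offline sampling: relabel every transition of the episode n_sampled_goal times.
n_sampled_goal = 4
transitions_indices = np.tile(np.arange(episode_length), n_sampled_goal)
print(final_goal, future_goal, episode_goal, transitions_indices)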
--- docs/modules/her.rst | 2 +- .../her/goal_selection_strategy.py | 5 -- stable_baselines3/her/her.py | 5 +- stable_baselines3/her/her_replay_buffer.py | 56 +++++++------------ tests/test_her.py | 6 +- 5 files changed, 26 insertions(+), 48 deletions(-) diff --git a/docs/modules/her.rst b/docs/modules/her.rst index 8f8c8f36c6..167e6b6ab0 100644 --- a/docs/modules/her.rst +++ b/docs/modules/her.rst @@ -46,7 +46,7 @@ Example env = BitFlippingEnv(n_bits=N_BITS, continuous=model_class in [DDPG, SAC, TD3], max_steps=N_BITS) - # Available strategies (cf paper): future, final, episode, random + # Available strategies (cf paper): future, final, episode goal_selection_strategy = 'future' # equivalent to GoalSelectionStrategy.FUTURE # If True the HER transitions will get sampled online diff --git a/stable_baselines3/her/goal_selection_strategy.py b/stable_baselines3/her/goal_selection_strategy.py index 5f434be277..d4c6a93e4f 100644 --- a/stable_baselines3/her/goal_selection_strategy.py +++ b/stable_baselines3/her/goal_selection_strategy.py @@ -15,10 +15,6 @@ class GoalSelectionStrategy(Enum): FINAL = 1 # Select a goal that was achieved in the episode EPISODE = 2 - # Select a goal that was achieved - # at some point in the training procedure - # (and that is present in the replay buffer) - RANDOM = 3 # For convenience @@ -27,5 +23,4 @@ class GoalSelectionStrategy(Enum): "future": GoalSelectionStrategy.FUTURE, "final": GoalSelectionStrategy.FINAL, "episode": GoalSelectionStrategy.EPISODE, - "random": GoalSelectionStrategy.RANDOM, } diff --git a/stable_baselines3/her/her.py b/stable_baselines3/her/her.py index e495415b03..43ef1a499e 100644 --- a/stable_baselines3/her/her.py +++ b/stable_baselines3/her/her.py @@ -369,17 +369,16 @@ def _store_transitions(self) -> None: """ # sample goals and get new observations - observations, next_observations, transitions = self._episode_storage.sample( + observations, next_observations, actions, rewards = self._episode_storage.sample( self.batch_size, self.env, self.online_sampling, self.n_sampled_goal, - self.replay_buffer.observations, ) # store data in replay buffer dones = np.zeros((len(observations)), dtype=bool) - self.replay_buffer.extend(observations, next_observations, transitions["action"], transitions["reward"], dones) + self.replay_buffer.extend(observations, next_observations, actions, rewards, dones) def __getattr__(self, item: str) -> Any: """ diff --git a/stable_baselines3/her/her_replay_buffer.py b/stable_baselines3/her/her_replay_buffer.py index b4db3b2609..bbb98ad286 100644 --- a/stable_baselines3/her/her_replay_buffer.py +++ b/stable_baselines3/her/her_replay_buffer.py @@ -81,6 +81,9 @@ def __init__( def _get_samples( self, batch_inds: np.ndarray, env: Optional[VecNormalize] = None ) -> Union[ReplayBufferSamples, RolloutBufferSamples]: + """ + Abstract method from base class. + """ raise NotImplementedError() def sample( @@ -89,7 +92,6 @@ def sample( env: Optional[VecNormalize] = None, online_sampling: bool = True, n_sampled_goal: int = None, - replay_observations: np.ndarray = None, ) -> Union[ReplayBufferSamples, Tuple]: """ :param batch_size: Number of element to sample @@ -97,18 +99,15 @@ def sample( to normalize the observations/rewards when sampling :param online_sampling: Using online_sampling for HER or not. :param n_sampled_goal: Number of sampled goals for replay. (offline sampling) - :param replay_observations: Observations of the offline replay buffer. Needed for 'RANDOM' goal strategy. :return: Samples. 
""" - return self._sample_transitions(batch_size, env, online_sampling, n_sampled_goal, replay_observations) + return self._sample_transitions(batch_size, env, online_sampling, n_sampled_goal) - def sample_goal( + def sample_goals( self, episode_indices: np.ndarray, her_indices: np.ndarray, transitions_indices: np.ndarray, - online_sampling: bool = True, - replay_observations: np.ndarray = None, ) -> np.ndarray: """ Sample goals based on goal_selection_strategy. @@ -117,8 +116,6 @@ def sample_goal( :param episode_indices: Episode indices to use. :param her_indices: HER indices. :param transitions_indices: Transition indices to use. - :param online_sampling: Using online_sampling for HER or not. - :param replay_observations: Observations of the offline replay buffer. Needed for 'RANDOM' goal strategy. :return: Return sampled goals. """ her_episode_indices = episode_indices[her_indices] @@ -137,20 +134,6 @@ def sample_goal( # replay with random state which comes from the same episode as current transition transitions_indices = np.random.randint(self.episode_lengths[her_episode_indices]) - elif self.goal_selection_strategy == GoalSelectionStrategy.RANDOM: - if online_sampling: - # replay with random state from the entire replay buffer - her_episode_indices = np.random.randint(self.n_episodes_stored, size=len(her_indices)) - transitions_indices = np.random.randint(self.episode_lengths[her_episode_indices]) - else: - # replay with random state from the entire replay buffer - index = np.random.choice(np.arange(len(replay_observations)), len(her_indices)) - obs = replay_observations[index] - # get only the observation part of the state - obs_dim = self.env.obs_dim - # get from every observation from first env the observation part (without concatenated desired goal) - obs_array = obs[:, :, :obs_dim] - return obs_array else: raise ValueError("Strategy for sampling goals not supported!") @@ -162,7 +145,6 @@ def _sample_transitions( env: Optional[VecNormalize], online_sampling: bool = True, n_sampled_goal: int = None, - replay_observations: np.ndarray = None, ) -> Union[ReplayBufferSamples, Tuple]: """ :param batch_size: Number of element to sample @@ -170,38 +152,42 @@ def _sample_transitions( to normalize the observations/rewards when sampling :param online_sampling: Using online_sampling for HER or not. :param n_sampled_goal: Number of sampled goals for replay. (offline sampling) - :param replay_observations: Observations of the offline replay buffer. Needed for 'RANDOM' goal strategy. :return: Samples. 
""" # Select which episodes to use if online_sampling: episode_indices = np.random.randint(0, self.n_episodes_stored, batch_size) her_indices = np.arange(batch_size)[: int(self.her_ratio * batch_size)] - ep_length = self.episode_lengths[episode_indices] else: episode_length = self.episode_lengths[0] - episode_indices = np.array(list(range(1)) * episode_length * n_sampled_goal) + episode_indices = np.tile(0, (episode_length * n_sampled_goal)) + # episode_indices = np.array(list(range(1)) * episode_length * n_sampled_goal) her_indices = np.arange(len(episode_indices)) - # repeat every transition index n_sampled_goals times - transitions_indices = np.array(list(range(episode_length)) * n_sampled_goal) + + ep_length = self.episode_lengths[episode_indices] if self.goal_selection_strategy == GoalSelectionStrategy.FUTURE: # restrict the sampling domain when ep_length > 1 # otherwise filter out the indices - if online_sampling: - her_indices = her_indices[ep_length[her_indices] > 1] - ep_length[her_indices] -= 1 - else: - her_indices = her_indices[episode_length > 1 and transitions_indices < episode_length - 1] + her_indices = her_indices[ep_length[her_indices] > 1] + ep_length[her_indices] -= 1 if online_sampling: # Select which transitions to use transitions_indices = np.random.randint(ep_length) + else: + if her_indices.size == 0: + return np.empty(0), np.empty(0), np.empty(0), np.empty(0) + else: + # repeat every transition index n_sampled_goals times + transitions_indices = np.tile(np.arange(ep_length[0]), n_sampled_goal) + episode_indices = episode_indices[transitions_indices] + her_indices = np.arange(len(episode_indices)) # get selected transitions transitions = {key: self.buffer[key][episode_indices, transitions_indices].copy() for key in self.buffer.keys()} - new_goals = self.sample_goal(episode_indices, her_indices, transitions_indices, online_sampling, replay_observations) + new_goals = self.sample_goals(episode_indices, her_indices, transitions_indices) transitions["desired_goal"][her_indices] = new_goals # Convert info buffer to numpy array @@ -235,7 +221,7 @@ def _sample_transitions( return ReplayBufferSamples(*tuple(map(self.to_torch, data))) else: - return observations, next_observations, transitions + return observations, next_observations, transitions["action"], transitions["reward"] def add( self, diff --git a/tests/test_her.py b/tests/test_her.py index 5ffc38288b..bb045fb29f 100644 --- a/tests/test_her.py +++ b/tests/test_her.py @@ -42,11 +42,9 @@ def test_her(model_class, online_sampling): "final", "episode", "future", - "random", - GoalSelectionStrategy.FUTURE, - GoalSelectionStrategy.RANDOM, - GoalSelectionStrategy.EPISODE, GoalSelectionStrategy.FINAL, + GoalSelectionStrategy.EPISODE, + GoalSelectionStrategy.FUTURE, ], ) @pytest.mark.parametrize("online_sampling", [True, False]) From 381d927da93568add74004d1261a628ee960f2d8 Mon Sep 17 00:00:00 2001 From: Megan Klaiber Date: Wed, 14 Oct 2020 15:48:27 +0200 Subject: [PATCH 53/81] Updated migration. Removed time signal removal. --- docs/guide/migration.rst | 7 +++++++ stable_baselines3/her/her.py | 22 ++++------------------ 2 files changed, 11 insertions(+), 18 deletions(-) diff --git a/docs/guide/migration.rst b/docs/guide/migration.rst index 82fbc69bdf..89a1df2928 100644 --- a/docs/guide/migration.rst +++ b/docs/guide/migration.rst @@ -163,6 +163,13 @@ Despite this change, no change in performance should be expected. 
To match SB2 behavior, you need to explicitly pass ``deterministic=True`` +HER +^^^ + +The HER implementation now also supports online sampling of the new goals. This is done in a vectorized version. +The goal selection strategy ``RANDOM`` is no longer supported. + + New logger API -------------- diff --git a/stable_baselines3/her/her.py b/stable_baselines3/her/her.py index 43ef1a499e..ff502598a5 100644 --- a/stable_baselines3/her/her.py +++ b/stable_baselines3/her/her.py @@ -287,30 +287,16 @@ def collect_rollouts( self._last_original_obs, new_obs_, reward_ = observation, new_obs, reward self.model._last_original_obs = self._last_original_obs - # Remove termination signal due to timelimit if needed - # NOTE: this may cause issue when using memory optimized replay - # or n-step replay - if self.remove_time_limit_termination and infos[0].get("TimeLimit.truncated", False): - done_ = np.array([False]) - # As the VecEnv resets automatically, new_obs is already the - # first observation of the next episode - next_obs = infos[0]["terminal_observation"] - if self._vec_normalize_env is not None: - next_obs = self._vec_normalize_env.unnormalize_obs(next_obs) - else: - done_ = done - next_obs = new_obs_ - if self.online_sampling: - self.replay_buffer.add(self._last_original_obs, next_obs, buffer_action, reward_, done_, infos) + self.replay_buffer.add(self._last_original_obs, new_obs_, buffer_action, reward_, done, infos) else: # concatenate observation with (desired) goal obs = ObsDictWrapper.convert_dict(self._last_original_obs) - next_obs_ = ObsDictWrapper.convert_dict(next_obs) + next_obs = ObsDictWrapper.convert_dict(new_obs_) # add to replay buffer - self.replay_buffer.add(obs, next_obs_, buffer_action, reward_, done_) + self.replay_buffer.add(obs, next_obs, buffer_action, reward_, done) # add current transition to episode storage - self._episode_storage.add(self._last_original_obs, next_obs, buffer_action, reward_, done_, infos) + self._episode_storage.add(self._last_original_obs, new_obs_, buffer_action, reward_, done, infos) self._last_obs = new_obs self.model._last_obs = self._last_obs From c10b26aec466f9366d2c279a699ab6317ca97397 Mon Sep 17 00:00:00 2001 From: Antonin Raffin Date: Wed, 14 Oct 2020 18:10:15 +0200 Subject: [PATCH 54/81] Update doc --- docs/guide/examples.rst | 75 +++++++++++++++++++++++++++++++++++++++-- 1 file changed, 73 insertions(+), 2 deletions(-) diff --git a/docs/guide/examples.rst b/docs/guide/examples.rst index a6b804071b..49b2b3c2fa 100644 --- a/docs/guide/examples.rst +++ b/docs/guide/examples.rst @@ -18,8 +18,7 @@ notebooks: - `Atari Games`_ - `RL Baselines zoo`_ - `PyBullet`_ - -.. - `Hindsight Experience Replay`_ +- `Hindsight Experience Replay`_ .. _Getting Started: https://colab.research.google.com/github/Stable-Baselines-Team/rl-colab-notebooks/blob/sb3/stable_baselines_getting_started.ipynb .. _Training, Saving, Loading: https://colab.research.google.com/github/Stable-Baselines-Team/rl-colab-notebooks/blob/sb3/saving_loading_dqn.ipynb @@ -343,6 +342,78 @@ will compute a running average and standard deviation of input features (it can env.norm_reward = False +Hindsight Experience Replay (HER) +--------------------------------- + +For this example, we are using `Highway-Env `_ by `@eleurent `_. + + +.. image:: ../_static/img/colab-badge.svg + :target: https://colab.research.google.com/github/Stable-Baselines-Team/rl-colab-notebooks/blob/sb3/stable_baselines_her.ipynb + + +.. 
figure:: https://raw.githubusercontent.com/eleurent/highway-env/gh-media/docs/media/parking-env.gif + + The highway-parking-v0 environment. + +The parking env is a goal-conditioned continuous control task, in which the vehicle must park in a given space with the appropriate heading. + +.. note:: + + The hyperparameters in the following example were optimized for that environment. + + +.. code-block:: python + + import gym + import highway_env + import numpy as np + + from stable_baselines3 import HER, SAC, DDPG, TD3 + from stable_baselines3.common.noise import NormalActionNoise + + env = gym.make("parking-v0") + + # Create 4 artificial transitions per real transition + n_sampled_goal = 4 + + # SAC hyperparams: + model = HER( + "MlpPolicy", + env, + SAC, + n_sampled_goal=n_sampled_goal, + goal_selection_strategy="future", + verbose=1, + buffer_size=int(1e6), + learning_rate=1e-3, + gamma=0.95, + batch_size=256, + online_sampling=True, + policy_kwargs=dict(net_arch=[256, 256, 256]), + ) + + model.learn(int(2e5)) + model.save("her_sac_highway") + + # Load saved model + model = HER.load("her_sac_highway", env=env) + + obs = env.reset() + + # Evaluate the agent + episode_reward = 0 + for _ in range(100): + action, _ = model.predict(obs, deterministic=True) + obs, reward, done, info = env.step(action) + env.render() + episode_reward += reward + if done or info.get("is_success", False): + print("Reward:", episode_reward, "Success?", info.get("is_success", False)) + episode_reward = 0.0 + obs = env.reset() + + Record a Video -------------- From 9d5c83ebe315fddf9407dd621a7bfad1891c4dc1 Mon Sep 17 00:00:00 2001 From: Antonin Raffin Date: Wed, 14 Oct 2020 18:10:36 +0200 Subject: [PATCH 55/81] Fix potential load issue --- docs/misc/changelog.rst | 1 + stable_baselines3/common/base_class.py | 31 +++++++++++++------- stable_baselines3/common/vec_env/__init__.py | 11 +++++++ stable_baselines3/her/her.py | 23 ++++----------- 4 files changed, 39 insertions(+), 27 deletions(-) diff --git a/docs/misc/changelog.rst b/docs/misc/changelog.rst index fc198a6fab..6671d68233 100644 --- a/docs/misc/changelog.rst +++ b/docs/misc/changelog.rst @@ -18,6 +18,7 @@ New Features: Bug Fixes: ^^^^^^^^^^ - Fix GAE computation for on-policy algorithms (off-by one for the last value) (thanks @Wovchena) +- Fixed potential issue when loading a different environment Deprecations: ^^^^^^^^^^^^^ diff --git a/stable_baselines3/common/base_class.py b/stable_baselines3/common/base_class.py index 97d0035599..f270c6032f 100644 --- a/stable_baselines3/common/base_class.py +++ b/stable_baselines3/common/base_class.py @@ -26,7 +26,14 @@ set_random_seed, update_learning_rate, ) -from stable_baselines3.common.vec_env import DummyVecEnv, VecEnv, VecNormalize, VecTransposeImage, unwrap_vec_normalize +from stable_baselines3.common.vec_env import ( + DummyVecEnv, + VecEnv, + VecNormalize, + VecTransposeImage, + is_wrapped, + unwrap_vec_normalize, +) from stable_baselines3.common.vec_env.obs_dict_wrapper import ObsDictWrapper @@ -147,7 +154,7 @@ def __init__( self.eval_env = maybe_make_env(env, monitor_wrapper, self.verbose) env = maybe_make_env(env, monitor_wrapper, self.verbose) - env = self._wrap_env(env) + env = self._wrap_env(env, self.verbose) self.observation_space = env.observation_space self.action_space = env.action_space @@ -162,14 +169,15 @@ def __init__( if self.use_sde and not isinstance(self.action_space, gym.spaces.Box): raise ValueError("generalized State-Dependent Exploration (gSDE) can only be used with continuous actions.") - 
def _wrap_env(self, env: GymEnv) -> VecEnv: + @staticmethod + def _wrap_env(env: GymEnv, verbose: int = 0) -> VecEnv: if not isinstance(env, VecEnv): - if self.verbose >= 1: + if verbose >= 1: print("Wrapping the env in a DummyVecEnv.") env = DummyVecEnv([lambda: env]) - if is_image_space(env.observation_space) and not isinstance(env, VecTransposeImage): - if self.verbose >= 1: + if is_image_space(env.observation_space) and not is_wrapped(env, VecTransposeImage): + if verbose >= 1: print("Wrapping the env in a VecTransposeImage.") env = VecTransposeImage(env) @@ -194,7 +202,7 @@ def _get_eval_env(self, eval_env: Optional[GymEnv]) -> Optional[GymEnv]: eval_env = self.eval_env if eval_env is not None: - eval_env = self._wrap_env(eval_env) + eval_env = self._wrap_env(eval_env, self.verbose) assert eval_env.num_envs == 1 return eval_env @@ -408,10 +416,11 @@ def set_env(self, env: GymEnv) -> None: :param env: The environment for learning a policy """ - check_for_correct_spaces(env, self.observation_space, self.action_space) - # it must be coherent now # if it is not a VecEnv, make it a VecEnv - env = self._wrap_env(env) + # and do other transformations (dict obs, image transpose) if needed + env = self._wrap_env(env, self.verbose) + # Check that the observation spaces match + check_for_correct_spaces(env, self.observation_space, self.action_space) self.n_envs = env.num_envs self.env = env @@ -582,6 +591,8 @@ def load( raise KeyError("The observation_space and action_space were not given, can't verify new environments") if env is not None: + # Wrap first if needed + cls._wrap_env(env, data["verbose"]) # Check if given env is valid check_for_correct_spaces(env, data["observation_space"], data["action_space"]) else: diff --git a/stable_baselines3/common/vec_env/__init__.py b/stable_baselines3/common/vec_env/__init__.py index 1940f20c04..0002788895 100644 --- a/stable_baselines3/common/vec_env/__init__.py +++ b/stable_baselines3/common/vec_env/__init__.py @@ -41,6 +41,17 @@ def unwrap_vec_normalize(env: Union["GymEnv", VecEnv]) -> Optional[VecNormalize] return unwrap_vec_wrapper(env, VecNormalize) # pytype:disable=bad-return-type +def is_wrapped(env: Union["GymEnv", VecEnv], vec_wrapper_class: Type[VecEnvWrapper]) -> bool: + """ + Check if an environment is already wrapped by a given ``VecEnvWrapper``. 
+ + :param env: + :param vec_wrapper_class: + :return: + """ + return unwrap_vec_wrapper(env, vec_wrapper_class) is not None + + # Define here to avoid circular import def sync_envs_normalization(env: "GymEnv", eval_env: "GymEnv") -> None: """ diff --git a/stable_baselines3/her/her.py b/stable_baselines3/her/her.py index ff502598a5..eb4a13c516 100644 --- a/stable_baselines3/her/her.py +++ b/stable_baselines3/her/her.py @@ -1,8 +1,7 @@ import io import pathlib -from typing import Any, Callable, Iterable, List, Optional, Tuple, Type, Union +from typing import Any, Iterable, List, Optional, Tuple, Type, Union -import gym import numpy as np import torch as th @@ -12,11 +11,10 @@ from stable_baselines3.common.noise import ActionNoise from stable_baselines3.common.off_policy_algorithm import OffPolicyAlgorithm from stable_baselines3.common.policies import BasePolicy -from stable_baselines3.common.preprocessing import is_image_space -from stable_baselines3.common.save_util import load_from_zip_file, recursive_getattr, recursive_setattr +from stable_baselines3.common.save_util import load_from_zip_file, recursive_setattr from stable_baselines3.common.type_aliases import GymEnv, MaybeCallback, RolloutReturn from stable_baselines3.common.utils import check_for_correct_spaces -from stable_baselines3.common.vec_env import DummyVecEnv, VecEnv, VecTransposeImage +from stable_baselines3.common.vec_env import VecEnv from stable_baselines3.common.vec_env.obs_dict_wrapper import ObsDictWrapper from stable_baselines3.her.goal_selection_strategy import KEY_TO_GOAL_STRATEGY, GoalSelectionStrategy from stable_baselines3.her.her_replay_buffer import HerReplayBuffer @@ -443,19 +441,10 @@ def load( # check if given env is valid if env is not None: - # check if wrapper for dict support is needed - if not isinstance(env, VecEnv): - env = DummyVecEnv([lambda: env]) - - if is_image_space(env.observation_space) and not isinstance(env, VecTransposeImage): - env = VecTransposeImage(env) - - # check if wrapper for dict support when using HER is needed - if isinstance(env.observation_space, gym.spaces.dict.Dict): - env = ObsDictWrapper(env) - + # Wrap first if needed + env = cls._wrap_env(env, data["verbose"]) + # Check if given env is valid check_for_correct_spaces(env, data["observation_space"], data["action_space"]) - # if no new env was given use stored env if possible else: # Use stored env, if one exists. 
If not, continue as is (can be used for predict) if "env" in data: From 46c6d29a5643ac68adcdfe17730ed14c5bd06159 Mon Sep 17 00:00:00 2001 From: Antonin RAFFIN Date: Fri, 16 Oct 2020 15:36:39 +0200 Subject: [PATCH 56/81] Add VecNormalize support for dict obs --- Makefile | 2 +- docs/guide/migration.rst | 4 +- docs/misc/changelog.rst | 1 + docs/modules/her.rst | 5 + stable_baselines3/common/bit_flipping_env.py | 6 +- stable_baselines3/common/buffers.py | 8 +- stable_baselines3/common/utils.py | 11 +- .../common/vec_env/vec_normalize.py | 80 ++++++++++--- stable_baselines3/her/her.py | 11 +- stable_baselines3/her/her_replay_buffer.py | 10 +- tests/test_her.py | 25 ++-- tests/test_vec_normalize.py | 109 +++++++++++++++--- 12 files changed, 202 insertions(+), 70 deletions(-) diff --git a/Makefile b/Makefile index 749bc026b2..9954c7d7b1 100644 --- a/Makefile +++ b/Makefile @@ -5,7 +5,7 @@ pytest: ./scripts/run_tests.sh type: - pytype + pytype -j auto lint: # stop the build if there are Python syntax errors or undefined names diff --git a/docs/guide/migration.rst b/docs/guide/migration.rst index 89a1df2928..f1e0225a01 100644 --- a/docs/guide/migration.rst +++ b/docs/guide/migration.rst @@ -166,9 +166,9 @@ Despite this change, no change in performance should be expected. HER ^^^ -The HER implementation now also supports online sampling of the new goals. This is done in a vectorized version. +The ``HER`` implementation now also supports online sampling of the new goals. This is done in a vectorized version. The goal selection strategy ``RANDOM`` is no longer supported. - +``HER`` now supports ``VecNormalize`` wrapper but only when ``online_sampling=True`` New logger API diff --git a/docs/misc/changelog.rst b/docs/misc/changelog.rst index ba54366f60..03cacf45a1 100644 --- a/docs/misc/changelog.rst +++ b/docs/misc/changelog.rst @@ -14,6 +14,7 @@ New Features: ^^^^^^^^^^^^^ - Allow custom actor/critic network architectures using ``net_arch=dict(qf=[400, 300], pi=[64, 64])`` for off-policy algorithms (SAC, TD3, DDPG) - Added Hindsight Experience Replay ``HER``. (@megan-klaiber) +- ``VecNormalize`` now supports ``gym.spaces.Dict`` observation spaces Bug Fixes: ^^^^^^^^^^ diff --git a/docs/modules/her.rst b/docs/modules/her.rst index 167e6b6ab0..6befbc1731 100644 --- a/docs/modules/her.rst +++ b/docs/modules/her.rst @@ -15,6 +15,11 @@ HER is an algorithm that works with Off policy methods (DQN, SAC, TD3 and DDPG f HER requires the environment to inherits from `gym.GoalEnv `_ +.. 
warning:: + + ``HER`` supports ``VecNormalize`` wrapper but only when ``online_sampling=True`` + + Notes ----- diff --git a/stable_baselines3/common/bit_flipping_env.py b/stable_baselines3/common/bit_flipping_env.py index 999ada32e2..d38ff73cc7 100644 --- a/stable_baselines3/common/bit_flipping_env.py +++ b/stable_baselines3/common/bit_flipping_env.py @@ -1,5 +1,5 @@ from collections import OrderedDict -from typing import Dict, Optional, Union +from typing import Any, Dict, Optional, Union import numpy as np from gym import GoalEnv, spaces @@ -111,7 +111,9 @@ def step(self, action: Union[np.ndarray, int]) -> GymStepReturn: done = done or self.current_step >= self.max_steps return obs, reward, done, info - def compute_reward(self, achieved_goal: np.ndarray, desired_goal: np.ndarray, _info) -> np.float32: + def compute_reward( + self, achieved_goal: Union[int, np.ndarray], desired_goal: Union[int, np.ndarray], _info: Optional[Dict[str, Any]] + ) -> np.float32: # Deceptive reward: it is positive only when the goal is achieved # vectorized version distance = np.linalg.norm(achieved_goal - desired_goal, axis=-1) diff --git a/stable_baselines3/common/buffers.py b/stable_baselines3/common/buffers.py index 563ddea755..83e6554898 100644 --- a/stable_baselines3/common/buffers.py +++ b/stable_baselines3/common/buffers.py @@ -1,6 +1,6 @@ import warnings from abc import ABC, abstractmethod -from typing import Generator, Optional, Union +from typing import Dict, Generator, Optional, Union import numpy as np import torch as th @@ -129,9 +129,11 @@ def to_torch(self, array: np.ndarray, copy: bool = True) -> th.Tensor: return th.as_tensor(array).to(self.device) @staticmethod - def _normalize_obs(obs: np.ndarray, env: Optional[VecNormalize] = None) -> np.ndarray: + def _normalize_obs( + obs: Union[np.ndarray, Dict[str, np.ndarray]], env: Optional[VecNormalize] = None + ) -> Union[np.ndarray, Dict[str, np.ndarray]]: if env is not None: - return env.normalize_obs(obs).astype(np.float32) + return env.normalize_obs(obs) return obs @staticmethod diff --git a/stable_baselines3/common/utils.py b/stable_baselines3/common/utils.py index 57872657b8..2f02ed74db 100644 --- a/stable_baselines3/common/utils.py +++ b/stable_baselines3/common/utils.py @@ -16,9 +16,7 @@ SummaryWriter = None from stable_baselines3.common import logger -from stable_baselines3.common.preprocessing import is_image_space from stable_baselines3.common.type_aliases import GymEnv -from stable_baselines3.common.vec_env import VecTransposeImage def set_random_seed(seed: int, using_cuda: bool = False) -> None: @@ -204,14 +202,7 @@ def check_for_correct_spaces(env: GymEnv, observation_space: gym.spaces.Space, a :param observation_space: Observation space to check against :param action_space: Action space to check against """ - if ( - observation_space != env.observation_space - # Special cases for images that need to be transposed - and not ( - is_image_space(env.observation_space) - and observation_space == VecTransposeImage.transpose_space(env.observation_space) - ) - ): + if observation_space != env.observation_space: raise ValueError(f"Observation spaces do not match: {observation_space} != {env.observation_space}") if action_space != env.action_space: raise ValueError(f"Action spaces do not match: {action_space} != {env.action_space}") diff --git a/stable_baselines3/common/vec_env/vec_normalize.py b/stable_baselines3/common/vec_env/vec_normalize.py index 39a5d1128a..fcdefd8edd 100644 --- a/stable_baselines3/common/vec_env/vec_normalize.py +++ 
b/stable_baselines3/common/vec_env/vec_normalize.py @@ -1,8 +1,11 @@ import pickle -from typing import Any, Dict +from copy import deepcopy +from typing import Any, Dict, Union +import gym import numpy as np +from stable_baselines3.common import utils from stable_baselines3.common.running_mean_std import RunningMeanStd from stable_baselines3.common.vec_env.base_vec_env import VecEnv, VecEnvStepReturn, VecEnvWrapper @@ -34,7 +37,19 @@ def __init__( epsilon: float = 1e-8, ): VecEnvWrapper.__init__(self, venv) - self.obs_rms = RunningMeanStd(shape=self.observation_space.shape) + + assert isinstance( + self.observation_space, (gym.spaces.Box, gym.spaces.Dict) + ), "VecNormalize only support `gym.spaces.Box` and `gym.spaces.Dict` observation spaces" + + if isinstance(self.observation_space, gym.spaces.Dict): + self.obs_keys = set(self.observation_space.spaces.keys()) + self.obs_spaces = self.observation_space.spaces + self.obs_rms = {key: RunningMeanStd(shape=space.shape) for key, space in self.obs_spaces.items()} + else: + self.obs_keys, self.obs_spaces = None, None + self.obs_rms = RunningMeanStd(shape=self.observation_space.shape) + self.ret_rms = RunningMeanStd(shape=()) self.clip_obs = clip_obs self.clip_reward = clip_reward @@ -83,8 +98,9 @@ def set_venv(self, venv: VecEnv) -> None: if self.venv is not None: raise ValueError("Trying to set venv of already initialized VecNormalize wrapper.") VecEnvWrapper.__init__(self, venv) - if self.obs_rms.mean.shape != self.observation_space.shape: - raise ValueError("venv is incompatible with current statistics.") + + # Check only that the observation_space match + utils.check_for_correct_spaces(venv, self.observation_space, venv.action_space) self.ret = np.zeros(self.num_envs) def step_wait(self) -> VecEnvStepReturn: @@ -99,7 +115,12 @@ def step_wait(self) -> VecEnvStepReturn: self.old_reward = rews if self.training: - self.obs_rms.update(obs) + if isinstance(obs, dict) and isinstance(self.obs_rms, dict): + for key in self.obs_rms.keys(): + self.obs_rms[key].update(obs[key]) + else: + self.obs_rms.update(obs) + obs = self.normalize_obs(obs) if self.training: @@ -114,14 +135,38 @@ def _update_reward(self, reward: np.ndarray) -> None: self.ret = self.ret * self.gamma + reward self.ret_rms.update(self.ret) - def normalize_obs(self, obs: np.ndarray) -> np.ndarray: + def _normalize_obs(self, obs: np.ndarray, obs_rms: RunningMeanStd) -> np.ndarray: + """ + Helper to normalize observation. + :param obs: + :param obs_rms: associated statistics + :return: normalized observation + """ + return np.clip((obs - obs_rms.mean) / np.sqrt(obs_rms.var + self.epsilon), -self.clip_obs, self.clip_obs) + + def _unnormalize_obs(self, obs: np.ndarray, obs_rms: RunningMeanStd) -> np.ndarray: + """ + Helper to unnormalize observation. + :param obs: + :param obs_rms: associated statistics + :return: unnormalized observation + """ + return (obs * np.sqrt(obs_rms.var + self.epsilon)) + obs_rms.mean + + def normalize_obs(self, obs: Union[np.ndarray, Dict[str, np.ndarray]]) -> Union[np.ndarray, Dict[str, np.ndarray]]: """ Normalize observations using this VecNormalize's observations statistics. Calling this method does not update statistics. 
""" + # Avoid modifying by reference the original object + obs_ = deepcopy(obs) if self.norm_obs: - obs = np.clip((obs - self.obs_rms.mean) / np.sqrt(self.obs_rms.var + self.epsilon), -self.clip_obs, self.clip_obs) - return obs + if isinstance(obs, dict) and isinstance(self.obs_rms, dict): + for key in self.obs_rms.keys(): + obs_[key] = self._normalize_obs(obs[key], self.obs_rms[key]).astype(np.float32) + else: + obs_ = self._normalize_obs(obs, self.obs_rms).astype(np.float32) + return obs_ def normalize_reward(self, reward: np.ndarray) -> np.ndarray: """ @@ -132,22 +177,28 @@ def normalize_reward(self, reward: np.ndarray) -> np.ndarray: reward = np.clip(reward / np.sqrt(self.ret_rms.var + self.epsilon), -self.clip_reward, self.clip_reward) return reward - def unnormalize_obs(self, obs: np.ndarray) -> np.ndarray: + def unnormalize_obs(self, obs: Union[np.ndarray, Dict[str, np.ndarray]]) -> Union[np.ndarray, Dict[str, np.ndarray]]: + # Avoid modifying by reference the original object + obs_ = deepcopy(obs) if self.norm_obs: - return (obs * np.sqrt(self.obs_rms.var + self.epsilon)) + self.obs_rms.mean - return obs + if isinstance(obs, dict) and isinstance(self.obs_rms, dict): + for key in self.obs_rms.keys(): + obs_[key] = self._unnormalize_obs(obs[key], self.obs_rms[key]) + else: + obs_ = self._unnormalize_obs(obs, self.obs_rms) + return obs_ def unnormalize_reward(self, reward: np.ndarray) -> np.ndarray: if self.norm_reward: return reward * np.sqrt(self.ret_rms.var + self.epsilon) return reward - def get_original_obs(self) -> np.ndarray: + def get_original_obs(self) -> Union[np.ndarray, Dict[str, np.ndarray]]: """ Returns an unnormalized version of the observations from the most recent step or reset. """ - return self.old_obs.copy() + return deepcopy(self.old_obs) def get_original_reward(self) -> np.ndarray: """ @@ -155,9 +206,10 @@ def get_original_reward(self) -> np.ndarray: """ return self.old_reward.copy() - def reset(self) -> np.ndarray: + def reset(self) -> Union[np.ndarray, Dict[str, np.ndarray]]: """ Reset all environments + :return: first observation of the episode """ obs = self.venv.reset() self.old_obs = obs diff --git a/stable_baselines3/her/her.py b/stable_baselines3/her/her.py index eb4a13c516..bd71b3e07f 100644 --- a/stable_baselines3/her/her.py +++ b/stable_baselines3/her/her.py @@ -6,7 +6,6 @@ import torch as th from stable_baselines3.common.base_class import BaseAlgorithm -from stable_baselines3.common.buffers import ReplayBuffer from stable_baselines3.common.callbacks import BaseCallback from stable_baselines3.common.noise import ActionNoise from stable_baselines3.common.off_policy_algorithm import OffPolicyAlgorithm @@ -81,6 +80,9 @@ def __init__( super(HER, self).__init__(policy=BasePolicy, env=env, policy_base=BasePolicy, learning_rate=3e-4) + if self.get_vec_normalize_env() is not None: + assert online_sampling, "You must pass `online_sampling=True` if you want to use `VecNormalize` with `HER`" + # model initialization self.model_class = model_class self.model = model_class( @@ -179,7 +181,6 @@ def learn( action_noise=self.action_noise, callback=callback, learning_starts=self.learning_starts, - replay_buffer=self.replay_buffer, log_interval=log_interval, ) @@ -204,7 +205,6 @@ def collect_rollouts( n_steps: int = -1, action_noise: Optional[ActionNoise] = None, learning_starts: int = 0, - replay_buffer: Union[ReplayBuffer, HerReplayBuffer] = None, log_interval: Optional[int] = None, ) -> RolloutReturn: """ @@ -221,7 +221,6 @@ def collect_rollouts( Required for 
deterministic policy (e.g. TD3). This can also be used in addition to the stochastic policy for SAC. :param learning_starts: Number of steps before learning for the warm-up phase. - :param replay_buffer: :param log_interval: Log data every ``log_interval`` episodes :return: """ @@ -275,7 +274,7 @@ def collect_rollouts( self.model.ep_success_buffer = self.ep_success_buffer # Store episode in episode storage - if replay_buffer is not None: + if self.replay_buffer is not None: # Store only the unnormalized version if self._vec_normalize_env is not None: new_obs_ = self._vec_normalize_env.get_original_obs() @@ -355,7 +354,7 @@ def _store_transitions(self) -> None: # sample goals and get new observations observations, next_observations, actions, rewards = self._episode_storage.sample( self.batch_size, - self.env, + self.get_vec_normalize_env(), self.online_sampling, self.n_sampled_goal, ) diff --git a/stable_baselines3/her/her_replay_buffer.py b/stable_baselines3/her/her_replay_buffer.py index bbb98ad286..f2d33af6bd 100644 --- a/stable_baselines3/her/her_replay_buffer.py +++ b/stable_baselines3/her/her_replay_buffer.py @@ -207,14 +207,16 @@ def _sample_transitions( ) # concatenate observation with (desired) goal - observations = ObsDictWrapper.convert_dict(transitions) - next_observations = ObsDictWrapper.convert_dict(transitions, observation_key="next_obs") + observations = ObsDictWrapper.convert_dict(self._normalize_obs(transitions, env)) + # HACK to make normalize obs work with the next observation + transitions["observation"] = transitions["next_obs"] + next_observations = ObsDictWrapper.convert_dict(self._normalize_obs(transitions, env)) if online_sampling: data = ( - self._normalize_obs(observations[:, 0], env), + observations[:, 0], transitions["action"], - self._normalize_obs(next_observations[:, 0], env), + next_observations[:, 0], transitions["done"], self._normalize_reward(transitions["reward"], env), ) diff --git a/tests/test_her.py b/tests/test_her.py index bb045fb29f..9989663f0a 100644 --- a/tests/test_her.py +++ b/tests/test_her.py @@ -31,9 +31,10 @@ def test_her(model_class, online_sampling): n_episodes_rollout=-1, max_episode_length=n_bits, policy_kwargs=dict(net_arch=[64]), + learning_starts=100, ) - model.learn(total_timesteps=500) + model.learn(total_timesteps=300) @pytest.mark.parametrize( @@ -65,8 +66,9 @@ def test_goal_selection_strategy(goal_selection_strategy, online_sampling): n_episodes_rollout=-1, max_episode_length=10, policy_kwargs=dict(net_arch=[64]), + learning_starts=100, ) - model.learn(total_timesteps=200) + model.learn(total_timesteps=300) @pytest.mark.parametrize("model_class", [SAC, TD3, DDPG, DQN]) @@ -99,7 +101,8 @@ def test_save_load(tmp_path, model_class, use_sde): buffer_size=int(1e6), gamma=0.98, gradient_steps=1, - train_freq=1, + train_freq=4, + learning_starts=100, n_episodes_rollout=-1, max_episode_length=n_bits, **kwargs @@ -152,17 +155,18 @@ def test_save_load(tmp_path, model_class, use_sde): assert np.allclose(selected_actions, new_selected_actions, 1e-4) # check if learn still works - model.learn(total_timesteps=1000, eval_freq=500) + model.learn(total_timesteps=300) # clear file from os os.remove(tmp_path / "test_save.zip") @pytest.mark.parametrize("online_sampling", [False, True]) -@pytest.mark.parametrize("n_bits", [15]) -def test_dqn_her(online_sampling, n_bits): +@pytest.mark.parametrize("n_bits", [10]) +def test_performance_her(online_sampling, n_bits): """ - Test HER with DQN for BitFlippingEnv. 
+ That that DQN+HER can solve BitFlippingEnv. + It should not work when n_sampled_goal=0 (DQN alone). """ env = BitFlippingEnv(n_bits=n_bits, continuous=False) @@ -174,7 +178,7 @@ def test_dqn_her(online_sampling, n_bits): goal_selection_strategy="future", online_sampling=online_sampling, verbose=1, - learning_rate=0.0005, + learning_rate=5e-4, max_episode_length=n_bits, train_freq=1, learning_starts=100, @@ -184,6 +188,7 @@ def test_dqn_her(online_sampling, n_bits): batch_size=32, ) - model.learn(total_timesteps=10000) + model.learn(total_timesteps=5000, log_interval=50) - assert np.mean(model.ep_success_buffer) > 0.0 + # 90% training success + assert np.mean(model.ep_success_buffer) > 0.90 diff --git a/tests/test_vec_normalize.py b/tests/test_vec_normalize.py index 311e3c92e1..75b017c782 100644 --- a/tests/test_vec_normalize.py +++ b/tests/test_vec_normalize.py @@ -1,8 +1,9 @@ import gym import numpy as np import pytest +from gym import spaces -from stable_baselines3 import SAC, TD3 +from stable_baselines3 import HER, SAC, TD3 from stable_baselines3.common.running_mean_std import RunningMeanStd from stable_baselines3.common.vec_env import ( DummyVecEnv, @@ -15,14 +16,68 @@ ENV_ID = "Pendulum-v0" +class DummyDictEnv(gym.GoalEnv): + """ + Dummy gym goal env for testing purposes + """ + + def __init__(self): + super(DummyDictEnv, self).__init__() + self.observation_space = spaces.Dict( + { + "observation": spaces.Box(low=-20.0, high=20.0, shape=(4,), dtype=np.float32), + "achieved_goal": spaces.Box(low=-20.0, high=20.0, shape=(4,), dtype=np.float32), + "desired_goal": spaces.Box(low=-20.0, high=20.0, shape=(4,), dtype=np.float32), + } + ) + self.action_space = spaces.Box(low=-1, high=1, shape=(3,), dtype=np.float32) + + def reset(self): + return self.observation_space.sample() + + def step(self, action): + obs = self.observation_space.sample() + reward = self.compute_reward(obs["achieved_goal"], obs["desired_goal"], {}) + done = np.random.rand() > 0.8 + return obs, reward, done, {} + + def compute_reward(self, achieved_goal: np.ndarray, desired_goal: np.ndarray, _info) -> np.float32: + distance = np.linalg.norm(achieved_goal - desired_goal, axis=-1) + return -(distance > 0).astype(np.float32) + + +def allclose(obs_1, obs_2): + """ + Generalized np.allclose() to work with dict spaces. 
+ """ + if isinstance(obs_1, dict): + all_close = True + for key in obs_1.keys(): + if not np.allclose(obs_1[key], obs_2[key]): + all_close = False + break + return all_close + return np.allclose(obs_1, obs_2) + + def make_env(): return gym.make(ENV_ID) +def make_dict_env(): + return DummyDictEnv() + + def check_rms_equal(rmsa, rmsb): - assert np.all(rmsa.mean == rmsb.mean) - assert np.all(rmsa.var == rmsb.var) - assert np.all(rmsa.count == rmsb.count) + if isinstance(rmsa, dict): + for key in rmsa.keys(): + assert np.all(rmsa[key].mean == rmsb[key].mean) + assert np.all(rmsa[key].var == rmsb[key].var) + assert np.all(rmsa[key].count == rmsb[key].count) + else: + assert np.all(rmsa.mean == rmsb.mean) + assert np.all(rmsa.var == rmsb.var) + assert np.all(rmsa.count == rmsb.count) def check_vec_norm_equal(norma, normb): @@ -56,6 +111,19 @@ def _make_warmstart_cartpole(): return venv +def _make_warmstart_dict_env(): + """Warm-start VecNormalize by stepping through BitFlippingEnv""" + venv = DummyVecEnv([make_dict_env]) + venv = VecNormalize(venv) + venv.reset() + venv.get_original_obs() + + for _ in range(100): + actions = [venv.action_space.sample()] + venv.step(actions) + return venv + + def test_runningmeanstd(): """Test RunningMeanStd object""" for (x_1, x_2, x_3) in [ @@ -123,21 +191,24 @@ def test_normalize_external(): assert np.all(norm_rewards < 1) -@pytest.mark.parametrize("model_class", [SAC, TD3]) +@pytest.mark.parametrize("model_class", [SAC, TD3, HER]) def test_offpolicy_normalization(model_class): - env = DummyVecEnv([make_env]) + make_env_ = make_dict_env if model_class == HER else make_env + env = DummyVecEnv([make_env_]) env = VecNormalize(env, norm_obs=True, norm_reward=True, clip_obs=10.0, clip_reward=10.0) - eval_env = DummyVecEnv([make_env]) + eval_env = DummyVecEnv([make_env_]) eval_env = VecNormalize(eval_env, training=False, norm_obs=True, norm_reward=False, clip_obs=10.0, clip_reward=10.0) - model = model_class("MlpPolicy", env, verbose=1, policy_kwargs=dict(net_arch=[64])) - model.learn(total_timesteps=1000, eval_env=eval_env, eval_freq=500) + kwargs = dict(model_class=SAC, max_episode_length=200, online_sampling=True) if model_class == HER else {} + model = model_class("MlpPolicy", env, verbose=1, learning_starts=100, policy_kwargs=dict(net_arch=[64]), **kwargs) + model.learn(total_timesteps=500, eval_env=eval_env, eval_freq=250) # Check getter assert isinstance(model.get_vec_normalize_env(), VecNormalize) -def test_sync_vec_normalize(): +@pytest.mark.parametrize("make_env", [make_env, make_dict_env]) +def test_sync_vec_normalize(make_env): env = DummyVecEnv([make_env]) assert unwrap_vec_normalize(env) is None @@ -146,13 +217,15 @@ def test_sync_vec_normalize(): assert isinstance(unwrap_vec_normalize(env), VecNormalize) - env = VecFrameStack(env, 1) - - assert isinstance(unwrap_vec_normalize(env), VecNormalize) + if not isinstance(env.observation_space, spaces.Dict): + env = VecFrameStack(env, 1) + assert isinstance(unwrap_vec_normalize(env), VecNormalize) eval_env = DummyVecEnv([make_env]) eval_env = VecNormalize(eval_env, training=False, norm_obs=True, norm_reward=True, clip_obs=100.0, clip_reward=100.0) - eval_env = VecFrameStack(eval_env, 1) + + if not isinstance(env.observation_space, spaces.Dict): + eval_env = VecFrameStack(eval_env, 1) env.seed(0) env.action_space.seed(0) @@ -171,12 +244,12 @@ def test_sync_vec_normalize(): dummy_rewards = np.random.rand(10) original_obs = env.get_original_obs() # Check that unnormalization works - assert 
np.allclose(original_obs, env.unnormalize_obs(obs)) + assert allclose(original_obs, env.unnormalize_obs(obs)) # Normalization must be different (between different environments) - assert not np.allclose(obs, eval_env.normalize_obs(original_obs)) + assert not allclose(obs, eval_env.normalize_obs(original_obs)) # Test syncing of parameters sync_envs_normalization(env, eval_env) # Now they must be synced - assert np.allclose(obs, eval_env.normalize_obs(original_obs)) - assert np.allclose(env.normalize_reward(dummy_rewards), eval_env.normalize_reward(dummy_rewards)) + assert allclose(obs, eval_env.normalize_obs(original_obs)) + assert allclose(env.normalize_reward(dummy_rewards), eval_env.normalize_reward(dummy_rewards)) From 1cfc790b041a79a4e16de728681afcd16265d182 Mon Sep 17 00:00:00 2001 From: Megan Klaiber Date: Fri, 16 Oct 2020 15:57:30 +0200 Subject: [PATCH 57/81] Updated saving/loading replay buffer for HER. --- stable_baselines3/her/her_replay_buffer.py | 38 ++++++++++++++++++++-- tests/test_her.py | 33 +++++++++++++++++++ 2 files changed, 68 insertions(+), 3 deletions(-) diff --git a/stable_baselines3/her/her_replay_buffer.py b/stable_baselines3/her/her_replay_buffer.py index f2d33af6bd..4ade7ec047 100644 --- a/stable_baselines3/her/her_replay_buffer.py +++ b/stable_baselines3/her/her_replay_buffer.py @@ -1,18 +1,18 @@ from collections import deque -from typing import Dict, List, Optional, Tuple, Union +from typing import Any, Dict, List, Optional, Tuple, Union import numpy as np import torch as th from gym import spaces -from stable_baselines3.common.buffers import BaseBuffer +from stable_baselines3.common.buffers import ReplayBuffer from stable_baselines3.common.type_aliases import ReplayBufferSamples, RolloutBufferSamples from stable_baselines3.common.vec_env import VecNormalize from stable_baselines3.common.vec_env.obs_dict_wrapper import ObsDictWrapper from stable_baselines3.her.goal_selection_strategy import GoalSelectionStrategy -class HerReplayBuffer(BaseBuffer): +class HerReplayBuffer(ReplayBuffer): """ Replay Buffer for sampling HER (Hindsight Experience Replay) transitions. In the online sampling case these new transitions will not be saved in the Buffer. @@ -78,6 +78,38 @@ def __init__( # percentage of her indices self.her_ratio = her_ratio + def __getstate__(self) -> Dict[str, Any]: + """ + Gets state for pickling. + + Excludes self.env, as in general Env's may not be pickleable.""" + state = self.__dict__.copy() + # these attributes are not pickleable + del state["env"] + return state + + def __setstate__(self, state: Dict[str, Any]) -> None: + """ + Restores pickled state. + + User must call set_env() after unpickling before using. + + :param state: + """ + self.__dict__.update(state) + assert "env" not in state + self.env = None + + def set_env(self, env: ObsDictWrapper) -> None: + """ + Sets the environment. 
+ :param env: + """ + if self.env is not None: + raise ValueError("Trying to set env of already initialized environment.") + + self.env = env + def _get_samples( self, batch_inds: np.ndarray, env: Optional[VecNormalize] = None ) -> Union[ReplayBufferSamples, RolloutBufferSamples]: diff --git a/tests/test_her.py b/tests/test_her.py index 9989663f0a..b5a2382b30 100644 --- a/tests/test_her.py +++ b/tests/test_her.py @@ -1,4 +1,5 @@ import os +import pathlib from copy import deepcopy import numpy as np @@ -161,6 +162,38 @@ def test_save_load(tmp_path, model_class, use_sde): os.remove(tmp_path / "test_save.zip") +@pytest.mark.parametrize("model_class", [HER]) +def test_save_load_replay_buffer(tmp_path, model_class): + path = pathlib.Path(tmp_path / "logs/replay_buffer.pkl") + path.parent.mkdir(exist_ok=True, parents=True) # to not raise a warning + env = BitFlippingEnv(n_bits=4, continuous=True) + model = HER( + "MlpPolicy", + env, + SAC, + goal_selection_strategy="future", + online_sampling=True, + gradient_steps=1, + train_freq=1, + n_episodes_rollout=-1, + max_episode_length=4, + policy_kwargs=dict(net_arch=[64]), + ) + model.learn(300) + old_replay_buffer = deepcopy(model.replay_buffer) + model.save_replay_buffer(path) + model.model.replay_buffer = None + model.load_replay_buffer(path) + # set environment + model.replay_buffer.set_env(env) + + assert np.allclose(old_replay_buffer.buffer["observation"], model.replay_buffer.buffer["observation"]) + assert np.allclose(old_replay_buffer.buffer["next_obs"], model.replay_buffer.buffer["next_obs"]) + assert np.allclose(old_replay_buffer.buffer["action"], model.replay_buffer.buffer["action"]) + assert np.allclose(old_replay_buffer.buffer["reward"], model.replay_buffer.buffer["reward"]) + assert np.allclose(old_replay_buffer.buffer["done"], model.replay_buffer.buffer["done"]) + + @pytest.mark.parametrize("online_sampling", [False, True]) @pytest.mark.parametrize("n_bits", [10]) def test_performance_her(online_sampling, n_bits): From f738f3227cfa115b21b130dbff865f363f0fb309 Mon Sep 17 00:00:00 2001 From: Antonin RAFFIN Date: Fri, 16 Oct 2020 16:20:10 +0200 Subject: [PATCH 58/81] Fix test memory usage --- tests/test_save_load.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_save_load.py b/tests/test_save_load.py index 18fba37a2e..77ec75e4eb 100644 --- a/tests/test_save_load.py +++ b/tests/test_save_load.py @@ -305,7 +305,7 @@ def test_save_load_policy(tmp_path, model_class, policy_str): if policy_str == "MlpPolicy": env = select_env(model_class) else: - if model_class in [SAC, TD3, DQN]: + if model_class in [SAC, TD3, DQN, DDPG]: # Avoid memory error when using replay buffer # Reduce the size of the features kwargs = dict(buffer_size=250) From d7a787f5f0832d3debc04467e2f1c3abc57eb5b3 Mon Sep 17 00:00:00 2001 From: Megan Klaiber Date: Mon, 19 Oct 2020 14:03:48 +0200 Subject: [PATCH 59/81] Fixed save/load replay buffer. 
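
With online sampling, the replay buffer drops its environment reference when it
is pickled, so the env has to be re-attached after loading. A rough usage sketch
(the 4-bit env, the buffer path and the step counts are placeholders, not a
benchmark):

.. code-block:: python

    from stable_baselines3 import HER, SAC
    from stable_baselines3.common.bit_flipping_env import BitFlippingEnv

    env = BitFlippingEnv(n_bits=4, continuous=True)
    model = HER("MlpPolicy", env, SAC, online_sampling=True, max_episode_length=4)
    model.learn(300)

    # the env is excluded from the pickle (see HerReplayBuffer.__getstate__)
    model.save_replay_buffer("her_replay_buffer.pkl")

    # later: load the buffer back; with online sampling, HER.load_replay_buffer
    # re-attaches the current env to the loaded buffer
    model.load_replay_buffer("her_replay_buffer.pkl")
    model.learn(300)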
--- stable_baselines3/her/her.py | 17 ++++++++++++----- tests/test_her.py | 34 +++++++++++++++++++++------------- 2 files changed, 33 insertions(+), 18 deletions(-) diff --git a/stable_baselines3/her/her.py b/stable_baselines3/her/her.py index bd71b3e07f..aa1e161a7a 100644 --- a/stable_baselines3/her/her.py +++ b/stable_baselines3/her/her.py @@ -396,11 +396,6 @@ def save( self.model.model_class = self.model_class self.model.max_episode_length = self.max_episode_length - # exclude episode storage - if exclude is None: - exclude = [] - exclude = ["_episode_storage"].extend(exclude) - self.model.save(path, exclude, include) @classmethod @@ -488,3 +483,15 @@ def load( if her_model.model.use_sde: her_model.model.policy.reset_noise() # pytype: disable=attribute-error return her_model + + def load_replay_buffer(self, path: Union[str, pathlib.Path, io.BufferedIOBase]) -> None: + """ + Load a replay buffer from a pickle file and set environment for replay buffer (only online sampling). + + :param path: Path to the pickled replay buffer. + """ + self.model.load_replay_buffer(path=path) + + if self.online_sampling: + # set environment + self.replay_buffer.set_env(self.env) diff --git a/tests/test_her.py b/tests/test_her.py index b5a2382b30..d3bade3619 100644 --- a/tests/test_her.py +++ b/tests/test_her.py @@ -74,7 +74,8 @@ def test_goal_selection_strategy(goal_selection_strategy, online_sampling): @pytest.mark.parametrize("model_class", [SAC, TD3, DDPG, DQN]) @pytest.mark.parametrize("use_sde", [False, True]) -def test_save_load(tmp_path, model_class, use_sde): +@pytest.mark.parametrize("online_sampling", [False, True]) +def test_save_load(tmp_path, model_class, use_sde, online_sampling): """ Test if 'save' and 'load' saves and loads model correctly """ @@ -93,7 +94,7 @@ def test_save_load(tmp_path, model_class, use_sde): model_class, n_sampled_goal=5, goal_selection_strategy="future", - online_sampling=True, + online_sampling=online_sampling, verbose=0, tau=0.05, batch_size=128, @@ -162,8 +163,11 @@ def test_save_load(tmp_path, model_class, use_sde): os.remove(tmp_path / "test_save.zip") -@pytest.mark.parametrize("model_class", [HER]) -def test_save_load_replay_buffer(tmp_path, model_class): +@pytest.mark.parametrize("online_sampling", [False, True]) +def test_save_load_replay_buffer(tmp_path, online_sampling): + """ + Test if 'save_replay_buffer' and 'load_replay_buffer' works correctly + """ path = pathlib.Path(tmp_path / "logs/replay_buffer.pkl") path.parent.mkdir(exist_ok=True, parents=True) # to not raise a warning env = BitFlippingEnv(n_bits=4, continuous=True) @@ -172,7 +176,7 @@ def test_save_load_replay_buffer(tmp_path, model_class): env, SAC, goal_selection_strategy="future", - online_sampling=True, + online_sampling=online_sampling, gradient_steps=1, train_freq=1, n_episodes_rollout=-1, @@ -184,21 +188,25 @@ def test_save_load_replay_buffer(tmp_path, model_class): model.save_replay_buffer(path) model.model.replay_buffer = None model.load_replay_buffer(path) - # set environment - model.replay_buffer.set_env(env) - assert np.allclose(old_replay_buffer.buffer["observation"], model.replay_buffer.buffer["observation"]) - assert np.allclose(old_replay_buffer.buffer["next_obs"], model.replay_buffer.buffer["next_obs"]) - assert np.allclose(old_replay_buffer.buffer["action"], model.replay_buffer.buffer["action"]) - assert np.allclose(old_replay_buffer.buffer["reward"], model.replay_buffer.buffer["reward"]) - assert np.allclose(old_replay_buffer.buffer["done"], model.replay_buffer.buffer["done"]) + 
if online_sampling: + assert np.allclose(old_replay_buffer.buffer["observation"], model.replay_buffer.buffer["observation"], equal_nan=True) + assert np.allclose(old_replay_buffer.buffer["next_obs"], model.replay_buffer.buffer["next_obs"], equal_nan=True) + assert np.allclose(old_replay_buffer.buffer["action"], model.replay_buffer.buffer["action"], equal_nan=True) + assert np.allclose(old_replay_buffer.buffer["reward"], model.replay_buffer.buffer["reward"], equal_nan=True) + assert np.allclose(old_replay_buffer.buffer["done"], model.replay_buffer.buffer["done"], equal_nan=True) + else: + assert np.allclose(old_replay_buffer.observations, model.replay_buffer.observations) + assert np.allclose(old_replay_buffer.actions, model.replay_buffer.actions) + assert np.allclose(old_replay_buffer.rewards, model.replay_buffer.rewards) + assert np.allclose(old_replay_buffer.dones, model.replay_buffer.dones) @pytest.mark.parametrize("online_sampling", [False, True]) @pytest.mark.parametrize("n_bits", [10]) def test_performance_her(online_sampling, n_bits): """ - That that DQN+HER can solve BitFlippingEnv. + That DQN+HER can solve BitFlippingEnv. It should not work when n_sampled_goal=0 (DQN alone). """ env = BitFlippingEnv(n_bits=n_bits, continuous=False) From c8ebaa93f5b0decde267baa193a09f3230f0aa7c Mon Sep 17 00:00:00 2001 From: Megan Klaiber Date: Mon, 19 Oct 2020 16:26:52 +0200 Subject: [PATCH 60/81] Fixed save/load replay buffer --- tests/test_her.py | 23 ++++++++++++++++++----- 1 file changed, 18 insertions(+), 5 deletions(-) diff --git a/tests/test_her.py b/tests/test_her.py index d3bade3619..331fe5269d 100644 --- a/tests/test_her.py +++ b/tests/test_her.py @@ -190,11 +190,24 @@ def test_save_load_replay_buffer(tmp_path, online_sampling): model.load_replay_buffer(path) if online_sampling: - assert np.allclose(old_replay_buffer.buffer["observation"], model.replay_buffer.buffer["observation"], equal_nan=True) - assert np.allclose(old_replay_buffer.buffer["next_obs"], model.replay_buffer.buffer["next_obs"], equal_nan=True) - assert np.allclose(old_replay_buffer.buffer["action"], model.replay_buffer.buffer["action"], equal_nan=True) - assert np.allclose(old_replay_buffer.buffer["reward"], model.replay_buffer.buffer["reward"], equal_nan=True) - assert np.allclose(old_replay_buffer.buffer["done"], model.replay_buffer.buffer["done"], equal_nan=True) + n_episodes_stored = old_replay_buffer.n_episodes_stored + assert np.allclose( + old_replay_buffer.buffer["observation"][:n_episodes_stored], + model.replay_buffer.buffer["observation"][:n_episodes_stored], + ) + assert np.allclose( + old_replay_buffer.buffer["next_obs"][:n_episodes_stored], + model.replay_buffer.buffer["next_obs"][:n_episodes_stored], + ) + assert np.allclose( + old_replay_buffer.buffer["action"][:n_episodes_stored], model.replay_buffer.buffer["action"][:n_episodes_stored] + ) + assert np.allclose( + old_replay_buffer.buffer["reward"][:n_episodes_stored], model.replay_buffer.buffer["reward"][:n_episodes_stored] + ) + assert np.allclose( + old_replay_buffer.buffer["done"][:n_episodes_stored], model.replay_buffer.buffer["done"][:n_episodes_stored] + ) else: assert np.allclose(old_replay_buffer.observations, model.replay_buffer.observations) assert np.allclose(old_replay_buffer.actions, model.replay_buffer.actions) From 11f0fa2734c472d2a6d7de1a3a43a075d84b84dd Mon Sep 17 00:00:00 2001 From: Megan Klaiber Date: Tue, 20 Oct 2020 00:38:46 +0200 Subject: [PATCH 61/81] Fixed transition index after loading replay buffer in online sampling --- 
stable_baselines3/her/her.py | 1 + 1 file changed, 1 insertion(+) diff --git a/stable_baselines3/her/her.py b/stable_baselines3/her/her.py index aa1e161a7a..0157b46e8f 100644 --- a/stable_baselines3/her/her.py +++ b/stable_baselines3/her/her.py @@ -495,3 +495,4 @@ def load_replay_buffer(self, path: Union[str, pathlib.Path, io.BufferedIOBase]) if self.online_sampling: # set environment self.replay_buffer.set_env(self.env) + self.replay_buffer.current_idx = 0 From ee39e38997835673250ad1fc4cb637a37dc81382 Mon Sep 17 00:00:00 2001 From: Antonin RAFFIN Date: Tue, 20 Oct 2020 12:59:21 +0200 Subject: [PATCH 62/81] Better error handling --- stable_baselines3/her/her.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/stable_baselines3/her/her.py b/stable_baselines3/her/her.py index 0157b46e8f..537978ba5c 100644 --- a/stable_baselines3/her/her.py +++ b/stable_baselines3/her/her.py @@ -31,6 +31,9 @@ def get_time_limit(env: VecEnv, current_max_episode_length: Optional[int]) -> in if current_max_episode_length is None: try: current_max_episode_length = env.get_attr("spec")[0].max_episode_steps + # Raise the error because the attribute is present but is None + if current_max_episode_length is None: + raise AttributeError # if not available check if a valid value was passed as an argument except AttributeError: raise ValueError( From 3821e4dc61ea0795a0702a9dc451ff872c947d3f Mon Sep 17 00:00:00 2001 From: Antonin RAFFIN Date: Tue, 20 Oct 2020 16:17:40 +0200 Subject: [PATCH 63/81] Add tests for get_time_limit --- stable_baselines3/common/bit_flipping_env.py | 3 ++ stable_baselines3/her/her.py | 4 +- tests/test_her.py | 40 +++++++++++++++++++- 3 files changed, 44 insertions(+), 3 deletions(-) diff --git a/stable_baselines3/common/bit_flipping_env.py b/stable_baselines3/common/bit_flipping_env.py index d38ff73cc7..62f07100fd 100644 --- a/stable_baselines3/common/bit_flipping_env.py +++ b/stable_baselines3/common/bit_flipping_env.py @@ -3,6 +3,7 @@ import numpy as np from gym import GoalEnv, spaces +from gym.envs.registration import EnvSpec from stable_baselines3.common.type_aliases import GymStepReturn @@ -22,6 +23,8 @@ class BitFlippingEnv(GoalEnv): version or not, by default, it uses the MultiBinary one """ + spec = EnvSpec("BitFlippingEnv-v0") + def __init__( self, n_bits: int = 10, continuous: bool = False, max_steps: Optional[int] = None, discrete_obs_space: bool = False ): diff --git a/stable_baselines3/her/her.py b/stable_baselines3/her/her.py index 537978ba5c..0727175a91 100644 --- a/stable_baselines3/her/her.py +++ b/stable_baselines3/her/her.py @@ -37,8 +37,8 @@ def get_time_limit(env: VecEnv, current_max_episode_length: Optional[int]) -> in # if not available check if a valid value was passed as an argument except AttributeError: raise ValueError( - "The max episode length could not be inferred." 
- "You must specify a `max_episode_steps` when registering the environment, " + "The max episode length could not be inferred.\n" + "You must specify a `max_episode_steps` when registering the environment,\n" "use a `gym.wrappers.TimeLimit` wrapper " "or pass `max_episode_length` to the model constructor" ) diff --git a/tests/test_her.py b/tests/test_her.py index 331fe5269d..b3e2e9a119 100644 --- a/tests/test_her.py +++ b/tests/test_her.py @@ -2,14 +2,17 @@ import pathlib from copy import deepcopy +import gym import numpy as np import pytest import torch as th from stable_baselines3 import DDPG, DQN, HER, SAC, TD3 from stable_baselines3.common.bit_flipping_env import BitFlippingEnv +from stable_baselines3.common.vec_env import DummyVecEnv from stable_baselines3.common.vec_env.obs_dict_wrapper import ObsDictWrapper from stable_baselines3.her.goal_selection_strategy import GoalSelectionStrategy +from stable_baselines3.her.her import get_time_limit @pytest.mark.parametrize("model_class", [SAC, TD3, DDPG, DQN]) @@ -110,7 +113,7 @@ def test_save_load(tmp_path, model_class, use_sde, online_sampling): **kwargs ) - model.learn(total_timesteps=500) + model.learn(total_timesteps=300) env.reset() @@ -215,6 +218,41 @@ def test_save_load_replay_buffer(tmp_path, online_sampling): assert np.allclose(old_replay_buffer.dones, model.replay_buffer.dones) +def test_get_max_episode_length(): + dict_env = DummyVecEnv([lambda: BitFlippingEnv()]) + + # Cannot infer max epsiode length + with pytest.raises(ValueError): + get_time_limit(dict_env, current_max_episode_length=None) + + default_length = 10 + assert get_time_limit(dict_env, current_max_episode_length=default_length) == default_length + + env = gym.make("CartPole-v1") + vec_env = DummyVecEnv([lambda: env]) + + assert get_time_limit(vec_env, current_max_episode_length=None) == 500 + # Overwrite max_episode_steps + assert get_time_limit(vec_env, current_max_episode_length=default_length) == default_length + + # Set max_episode_steps to None + env.spec.max_episode_steps = None + vec_env = DummyVecEnv([lambda: env]) + with pytest.raises(ValueError): + get_time_limit(vec_env, current_max_episode_length=None) + + # Initialize HER and specify max_episode_length, should not raise an issue + HER("MlpPolicy", dict_env, DQN, max_episode_length=5) + + with pytest.raises(ValueError): + HER("MlpPolicy", dict_env, DQN) + + # Wrapped in a timelimit, should be fine + # Note: it requires env.spec to be defined + env = DummyVecEnv([lambda: gym.wrappers.TimeLimit(BitFlippingEnv(), 10)]) + HER("MlpPolicy", env, DQN) + + @pytest.mark.parametrize("online_sampling", [False, True]) @pytest.mark.parametrize("n_bits", [10]) def test_performance_her(online_sampling, n_bits): From dca958259fcccb7e09a36a92a35a7aa0dd2e5f07 Mon Sep 17 00:00:00 2001 From: Antonin RAFFIN Date: Tue, 20 Oct 2020 16:34:16 +0200 Subject: [PATCH 64/81] More tests for VecNormalize with dict obs --- tests/test_vec_normalize.py | 29 +++++++++++++++++++++++++++-- 1 file changed, 27 insertions(+), 2 deletions(-) diff --git a/tests/test_vec_normalize.py b/tests/test_vec_normalize.py index 75b017c782..a68e1b2fcd 100644 --- a/tests/test_vec_normalize.py +++ b/tests/test_vec_normalize.py @@ -142,7 +142,8 @@ def test_runningmeanstd(): assert np.allclose(moments_1, moments_2) -def test_vec_env(tmp_path): +@pytest.mark.parametrize("make_env", [make_env, make_dict_env]) +def test_vec_env(tmp_path, make_env): """Test VecNormalize Object""" clip_obs = 0.5 clip_reward = 5.0 @@ -153,7 +154,11 @@ def test_vec_env(tmp_path): 
while not done[0]: actions = [norm_venv.action_space.sample()] obs, rew, done, _ = norm_venv.step(actions) - assert np.max(np.abs(obs)) <= clip_obs + if isinstance(obs, dict): + for key in obs.keys(): + assert np.max(np.abs(obs[key])) <= clip_obs + else: + assert np.max(np.abs(obs)) <= clip_obs assert np.max(np.abs(rew)) <= clip_reward path = tmp_path / "vec_normalize" @@ -181,6 +186,26 @@ def test_get_original(): np.testing.assert_allclose(venv.normalize_reward(orig_rewards), rewards) +def test_get_original_dict(): + venv = _make_warmstart_dict_env() + for _ in range(3): + actions = [venv.action_space.sample()] + obs, rewards, _, _ = venv.step(actions) + # obs = obs[0] + orig_obs = venv.get_original_obs() + rewards = rewards[0] + orig_rewards = venv.get_original_reward()[0] + + for key in orig_obs.keys(): + assert orig_obs[key].shape == obs[key].shape + assert orig_rewards.dtype == rewards.dtype + + assert not allclose(orig_obs, obs) + assert not np.array_equal(orig_rewards, rewards) + assert allclose(venv.normalize_obs(orig_obs), obs) + np.testing.assert_allclose(venv.normalize_reward(orig_rewards), rewards) + + def test_normalize_external(): venv = _make_warmstart_cartpole() From 631cc9c6a0ac9cbe7edcfdd3fb6171f4cef64c55 Mon Sep 17 00:00:00 2001 From: Antonin RAFFIN Date: Tue, 20 Oct 2020 18:03:29 +0200 Subject: [PATCH 65/81] Update doc --- docs/guide/examples.rst | 3 +++ docs/guide/migration.rst | 3 ++- docs/modules/her.rst | 8 ++++++++ 3 files changed, 13 insertions(+), 1 deletion(-) diff --git a/docs/guide/examples.rst b/docs/guide/examples.rst index 49b2b3c2fa..3b9029d33a 100644 --- a/docs/guide/examples.rst +++ b/docs/guide/examples.rst @@ -384,6 +384,9 @@ The parking env is a goal-conditioned continuous control task, in which the vehi SAC, n_sampled_goal=n_sampled_goal, goal_selection_strategy="future", + # IMPORTANT: because the env is not wrapped with a TimeLimit wrapper + # we have to manually specify the max number of steps per episode + max_episode_length=100, verbose=1, buffer_size=int(1e6), learning_rate=1e-3, diff --git a/docs/guide/migration.rst b/docs/guide/migration.rst index f1e0225a01..0899242438 100644 --- a/docs/guide/migration.rst +++ b/docs/guide/migration.rst @@ -168,7 +168,8 @@ HER The ``HER`` implementation now also supports online sampling of the new goals. This is done in a vectorized version. The goal selection strategy ``RANDOM`` is no longer supported. -``HER`` now supports ``VecNormalize`` wrapper but only when ``online_sampling=True`` +``HER`` now supports ``VecNormalize`` wrapper but only when ``online_sampling=True``. +For performance reasons, the maximum number of steps per episodes must be specified (see :ref:`HER ` documentation). New logger API diff --git a/docs/modules/her.rst b/docs/modules/her.rst index 6befbc1731..1fac42add1 100644 --- a/docs/modules/her.rst +++ b/docs/modules/her.rst @@ -15,6 +15,14 @@ HER is an algorithm that works with Off policy methods (DQN, SAC, TD3 and DDPG f HER requires the environment to inherits from `gym.GoalEnv `_ +.. warning:: + + For performance reasons, the maximum number of steps per episodes must be specified. + In most cases, it will be inferred if you specify ```max_episode_steps`` when registering the environment + or if you use a ``gym.wrappers.TimeLimit`` (and ``env.spec`` is not None). + Otherwise, you can directly pass ``max_episode_length`` to the model constructor + + .. 
warning:: ``HER`` supports ``VecNormalize`` wrapper but only when ``online_sampling=True`` From ba0a7e4f81da4a61b5877e0e6c0cb64f15dce1f8 Mon Sep 17 00:00:00 2001 From: Antonin RAFFIN Date: Tue, 20 Oct 2020 18:10:28 +0200 Subject: [PATCH 66/81] Improve HER description --- docs/modules/her.rst | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/docs/modules/her.rst b/docs/modules/her.rst index 1fac42add1..31d1fac3a1 100644 --- a/docs/modules/her.rst +++ b/docs/modules/her.rst @@ -8,7 +8,11 @@ HER `Hindsight Experience Replay (HER) `_ -HER is an algorithm that works with Off policy methods (DQN, SAC, TD3 and DDPG for example). +HER is an algorithm that works with off-policy methods (DQN, SAC, TD3 and DDPG for example). +HER uses the fact that even if a desired goal was not achieved, other goal may have been achieved during a rollout. +It creates "virtual" transitions by relabeling transitions (changing the desired goal) from past episodes. + + .. warning:: From 907bcffe16a689e53c43f653637c8069332c2f33 Mon Sep 17 00:00:00 2001 From: Antonin RAFFIN Date: Tue, 20 Oct 2020 18:21:20 +0200 Subject: [PATCH 67/81] Add test for sde support --- tests/test_sde.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/tests/test_sde.py b/tests/test_sde.py index 3b010d36d8..74853a0f99 100644 --- a/tests/test_sde.py +++ b/tests/test_sde.py @@ -54,6 +54,11 @@ def test_state_dependent_exploration_grad(): assert sigma_hat.grad.allclose(grad) +def test_sde_check(): + with pytest.raises(ValueError): + PPO("MlpPolicy", "CartPole-v1", use_sde=True) + + @pytest.mark.parametrize("model_class", [SAC, A2C, PPO]) @pytest.mark.parametrize("sde_net_arch", [None, [32, 16], []]) @pytest.mark.parametrize("use_expln", [False, True]) @@ -65,9 +70,9 @@ def test_state_dependent_offpolicy_noise(model_class, sde_net_arch, use_expln): seed=None, create_eval_env=True, verbose=1, - policy_kwargs=dict(log_std_init=-2, sde_net_arch=sde_net_arch, use_expln=use_expln), + policy_kwargs=dict(log_std_init=-2, sde_net_arch=sde_net_arch, use_expln=use_expln, net_arch=[64]), ) - model.learn(total_timesteps=int(500), eval_freq=250) + model.learn(total_timesteps=int(300), eval_freq=250) model.policy.reset_noise() if model_class == SAC: model.policy.actor.get_std() From f650934c69111dd0cb9c154692deeb4e3f8980c1 Mon Sep 17 00:00:00 2001 From: Antonin RAFFIN Date: Tue, 20 Oct 2020 18:50:33 +0200 Subject: [PATCH 68/81] Add comments --- stable_baselines3/her/her.py | 5 +- stable_baselines3/her/her_replay_buffer.py | 55 ++++++++++++++-------- 2 files changed, 38 insertions(+), 22 deletions(-) diff --git a/stable_baselines3/her/her.py b/stable_baselines3/her/her.py index 0727175a91..44b6377813 100644 --- a/stable_baselines3/her/her.py +++ b/stable_baselines3/her/her.py @@ -351,13 +351,14 @@ def collect_rollouts( def _store_transitions(self) -> None: """ - Store current episode in replay buffer. Sample additional goals and store new transitions in replay buffer. + Store current episode in replay buffer when using offline sampling. + Sample additional goals and store new transitions in replay buffer. 
""" # sample goals and get new observations observations, next_observations, actions, rewards = self._episode_storage.sample( self.batch_size, - self.get_vec_normalize_env(), + None, # we should store unnormalized transitions, they will be normalized at sampling time self.online_sampling, self.n_sampled_goal, ) diff --git a/stable_baselines3/her/her_replay_buffer.py b/stable_baselines3/her/her_replay_buffer.py index 4ade7ec047..929415ad75 100644 --- a/stable_baselines3/her/her_replay_buffer.py +++ b/stable_baselines3/her/her_replay_buffer.py @@ -14,20 +14,23 @@ class HerReplayBuffer(ReplayBuffer): """ - Replay Buffer for sampling HER (Hindsight Experience Replay) transitions. - In the online sampling case these new transitions will not be saved in the Buffer. + Replay buffer for sampling HER (Hindsight Experience Replay) transitions. + In the online sampling case, these new transitions will not be saved in the replay buffer + and will only be created at sampling time. :param env: The training environment :param buffer_size: The size of the buffer measured in transitions. :param max_episode_length: The length of an episode. (time horizon) :param goal_selection_strategy: Strategy for sampling goals for replay. - One of ['episode', 'final', 'future', 'random'] + One of ['episode', 'final', 'future'] :param observation_space: Observation space :param action_space: Action space :param device: PyTorch device - to which the values will be converted :param n_envs: Number of parallel environments - :her_ratio: The ratio between HER replays and regular replays in percent (between 0 and 1, for online sampling) + :her_ratio: The ratio between HER transitions and regular transitions in percent + (between 0 and 1, for online sampling) + The default value ``her_ratio=0.8`` corresponds to 4 virtual transitions + for one real transition (4 / (4 + 1) = 0.8) """ def __init__( @@ -70,6 +73,7 @@ def __init__( key: np.empty((self.max_episode_stored, self.max_episode_length, *dim), dtype=np.float32) for key, dim in input_shape.items() } + # Store info dicts are it can be used to compute the reward (e.g. continuity cost) self.info_buffer = [deque(maxlen=self.max_episode_length) for _ in range(self.max_episode_stored)] # episode length storage, needed for episodes which has less steps than the maximum length self.episode_lengths = np.zeros(self.max_episode_stored, dtype=np.int64) @@ -92,7 +96,7 @@ def __setstate__(self, state: Dict[str, Any]) -> None: """ Restores pickled state. - User must call set_env() after unpickling before using. + User must call ``set_env()`` after unpickling before using. :param state: """ @@ -103,6 +107,7 @@ def __setstate__(self, state: Dict[str, Any]) -> None: def set_env(self, env: ObsDictWrapper) -> None: """ Sets the environment. 
+ :param env: """ if self.env is not None: @@ -167,21 +172,21 @@ def sample_goals( transitions_indices = np.random.randint(self.episode_lengths[her_episode_indices]) else: - raise ValueError("Strategy for sampling goals not supported!") + raise ValueError(f"Strategy {self.goal_selection_strategy} for sampling goals not supported!") return self.buffer["achieved_goal"][her_episode_indices, transitions_indices] def _sample_transitions( self, batch_size: int, - env: Optional[VecNormalize], + maybe_vec_env: Optional[VecNormalize], online_sampling: bool = True, n_sampled_goal: int = None, ) -> Union[ReplayBufferSamples, Tuple]: """ :param batch_size: Number of element to sample - :param env: associated gym VecEnv - to normalize the observations/rewards when sampling + :param env: associated gym VecEnv to normalize the observations/rewards + Only valid when using online sampling :param online_sampling: Using online_sampling for HER or not. :param n_sampled_goal: Number of sampled goals for replay. (offline sampling) :return: Samples. @@ -191,9 +196,13 @@ def _sample_transitions( episode_indices = np.random.randint(0, self.n_episodes_stored, batch_size) her_indices = np.arange(batch_size)[: int(self.her_ratio * batch_size)] else: + assert maybe_vec_env is None, "Transitions must be stored unnormalized in the replay buffer" + # Offline sampling: there is only one episode stored episode_length = self.episode_lengths[0] + # we sample n_sampled_goal per timestep in the episode (only one is stored). episode_indices = np.tile(0, (episode_length * n_sampled_goal)) - # episode_indices = np.array(list(range(1)) * episode_length * n_sampled_goal) + # we only sample virtual transitions + # as real transitions are already stored in the replay buffer her_indices = np.arange(len(episode_indices)) ep_length = self.episode_lengths[episode_indices] @@ -209,9 +218,13 @@ def _sample_transitions( transitions_indices = np.random.randint(ep_length) else: if her_indices.size == 0: + # Episode of one timestep, not enough for using the "future" strategy + # no virtual transitions are created in that case return np.empty(0), np.empty(0), np.empty(0), np.empty(0) else: - # repeat every transition index n_sampled_goals times + # Repeat every transition index n_sampled_goals times + # to sample n_sampled_goal per timestep in the episode (only one is stored). 
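To make the offline index layout described in the comments above concrete, a toy NumPy sketch (the numbers are made up; only the tiling pattern matters):

import numpy as np

ep_length, n_sampled_goal = 3, 2
transitions_indices = np.tile(np.arange(ep_length), n_sampled_goal)
# array([0, 1, 2, 0, 1, 2]): every stored timestep of the single episode is
# revisited n_sampled_goal times, and each copy receives a different relabeled goal.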
+ # Now with the corrected episode length when using "future" strategy transitions_indices = np.tile(np.arange(ep_length[0]), n_sampled_goal) episode_indices = episode_indices[transitions_indices] her_indices = np.arange(len(episode_indices)) @@ -219,6 +232,7 @@ def _sample_transitions( # get selected transitions transitions = {key: self.buffer[key][episode_indices, transitions_indices].copy() for key in self.buffer.keys()} + # sample new desired goals and relabel the transitions new_goals = self.sample_goals(episode_indices, her_indices, transitions_indices) transitions["desired_goal"][her_indices] = new_goals @@ -239,10 +253,10 @@ def _sample_transitions( ) # concatenate observation with (desired) goal - observations = ObsDictWrapper.convert_dict(self._normalize_obs(transitions, env)) + observations = ObsDictWrapper.convert_dict(self._normalize_obs(transitions, maybe_vec_env)) # HACK to make normalize obs work with the next observation transitions["observation"] = transitions["next_obs"] - next_observations = ObsDictWrapper.convert_dict(self._normalize_obs(transitions, env)) + next_observations = ObsDictWrapper.convert_dict(self._normalize_obs(transitions, maybe_vec_env)) if online_sampling: data = ( @@ -250,7 +264,7 @@ def _sample_transitions( transitions["action"], next_observations[:, 0], transitions["done"], - self._normalize_reward(transitions["reward"], env), + self._normalize_reward(transitions["reward"], maybe_vec_env), ) return ReplayBufferSamples(*tuple(map(self.to_torch, data))) @@ -286,7 +300,11 @@ def add( # update current pointer self.current_idx += 1 - def store_episode(self): + def store_episode(self) -> None: + """ + Increment episode counter + and reset transition pointer. + """ # add episode length to length storage self.episode_lengths[self.pos] = self.current_idx @@ -302,14 +320,11 @@ def store_episode(self): self.current_idx = 0 @property - def n_episodes_stored(self): + def n_episodes_stored(self) -> int: if self.full: return self.max_episode_stored return self.pos - def clear_buffer(self): - self.buffer = {} - def size(self) -> int: """ :return: The current size of the buffer in transitions. From 03c41041e2a469fc7aeca1965060eb0a2b83126f Mon Sep 17 00:00:00 2001 From: Antonin RAFFIN Date: Tue, 20 Oct 2020 19:16:48 +0200 Subject: [PATCH 69/81] Add comments --- stable_baselines3/her/her.py | 12 ++++++------ stable_baselines3/her/her_replay_buffer.py | 8 +++++++- 2 files changed, 13 insertions(+), 7 deletions(-) diff --git a/stable_baselines3/her/her.py b/stable_baselines3/her/her.py index 44b6377813..24aa674fd7 100644 --- a/stable_baselines3/her/her.py +++ b/stable_baselines3/her/her.py @@ -323,8 +323,8 @@ def collect_rollouts( self.replay_buffer.store_episode() else: self._episode_storage.store_episode() - # store episode in replay buffer - self._store_transitions() + # sample virtual transitions and store them in replay buffer + self._sample_her_transitions() # clear storage for current episode self._episode_storage.reset() @@ -349,16 +349,16 @@ def collect_rollouts( return RolloutReturn(mean_reward, total_steps, total_episodes, continue_training) - def _store_transitions(self) -> None: + def _sample_her_transitions(self) -> None: """ - Store current episode in replay buffer when using offline sampling. - Sample additional goals and store new transitions in replay buffer. 
+ Sample additional goals and store new transitions in replay buffer + when using offline sampling """ # sample goals and get new observations observations, next_observations, actions, rewards = self._episode_storage.sample( self.batch_size, - None, # we should store unnormalized transitions, they will be normalized at sampling time + None, # we should store unnormalized transitions, they will be normalized at sampling time self.online_sampling, self.n_sampled_goal, ) diff --git a/stable_baselines3/her/her_replay_buffer.py b/stable_baselines3/her/her_replay_buffer.py index 929415ad75..ca40d84b9b 100644 --- a/stable_baselines3/her/her_replay_buffer.py +++ b/stable_baselines3/her/her_replay_buffer.py @@ -244,9 +244,15 @@ def _sample_transitions( ] ) - # Vectorized computation + # Vectorized computation of the new reward transitions["reward"][her_indices, 0] = self.env.env_method( "compute_reward", + # the new state depends on the previous state and action + # s_{t+1} = f(s_t, a_t) + # so the next_achieved_goal depends also on the previous state and action + # because we are in a GoalEnv: + # r_t = reward(s_t, a_t) = reward(next_achieved_goal, desired_goal) + # therefore we have to use "next_achieved_goal" and not "achieved_goal" transitions["next_achieved_goal"][her_indices, 0], transitions["desired_goal"][her_indices, 0], transitions["info"][her_indices, 0], From 6c18e4cde8cb7a03cc986f232f1c187dbba9c9ab Mon Sep 17 00:00:00 2001 From: Antonin RAFFIN Date: Tue, 20 Oct 2020 19:21:21 +0200 Subject: [PATCH 70/81] Remove check that was always valid --- stable_baselines3/her/her.py | 43 ++++++++++++++++++------------------ 1 file changed, 22 insertions(+), 21 deletions(-) diff --git a/stable_baselines3/her/her.py b/stable_baselines3/her/her.py index 24aa674fd7..e74cf7245d 100644 --- a/stable_baselines3/her/her.py +++ b/stable_baselines3/her/her.py @@ -276,31 +276,32 @@ def collect_rollouts( self.model.ep_info_buffer = self.ep_info_buffer self.model.ep_success_buffer = self.ep_success_buffer - # Store episode in episode storage - if self.replay_buffer is not None: + # == Store transition in the replay buffer and/or in the episode storage == + + if self._vec_normalize_env is not None: # Store only the unnormalized version - if self._vec_normalize_env is not None: - new_obs_ = self._vec_normalize_env.get_original_obs() - reward_ = self._vec_normalize_env.get_original_reward() - else: - # Avoid changing the original ones - self._last_original_obs, new_obs_, reward_ = observation, new_obs, reward - self.model._last_original_obs = self._last_original_obs - - if self.online_sampling: - self.replay_buffer.add(self._last_original_obs, new_obs_, buffer_action, reward_, done, infos) - else: - # concatenate observation with (desired) goal - obs = ObsDictWrapper.convert_dict(self._last_original_obs) - next_obs = ObsDictWrapper.convert_dict(new_obs_) - # add to replay buffer - self.replay_buffer.add(obs, next_obs, buffer_action, reward_, done) - # add current transition to episode storage - self._episode_storage.add(self._last_original_obs, new_obs_, buffer_action, reward_, done, infos) + new_obs_ = self._vec_normalize_env.get_original_obs() + reward_ = self._vec_normalize_env.get_original_reward() + else: + # Avoid changing the original ones + self._last_original_obs, new_obs_, reward_ = observation, new_obs, reward + self.model._last_original_obs = self._last_original_obs + + if self.online_sampling: + self.replay_buffer.add(self._last_original_obs, new_obs_, buffer_action, reward_, done, infos) + else: + # 
concatenate observation with (desired) goal + obs = ObsDictWrapper.convert_dict(self._last_original_obs) + next_obs = ObsDictWrapper.convert_dict(new_obs_) + # add to replay buffer + self.replay_buffer.add(obs, next_obs, buffer_action, reward_, done) + # add current transition to episode storage + self._episode_storage.add(self._last_original_obs, new_obs_, buffer_action, reward_, done, infos) self._last_obs = new_obs self.model._last_obs = self._last_obs - # Save the unnormalized observation + + # Save the unnormalized new observation if self._vec_normalize_env is not None: self._last_original_obs = new_obs_ self.model._last_original_obs = self._last_original_obs From 28b281df908d513084ab84c5279bc04826ee1f52 Mon Sep 17 00:00:00 2001 From: Antonin RAFFIN Date: Wed, 21 Oct 2020 11:09:43 +0200 Subject: [PATCH 71/81] Fix for terminal observation --- stable_baselines3/her/her.py | 20 +++++++++++++++----- tests/test_her.py | 6 ++++-- tests/test_save_load.py | 6 ++++-- 3 files changed, 23 insertions(+), 9 deletions(-) diff --git a/stable_baselines3/her/her.py b/stable_baselines3/her/her.py index e74cf7245d..15733093e9 100644 --- a/stable_baselines3/her/her.py +++ b/stable_baselines3/her/her.py @@ -287,16 +287,26 @@ def collect_rollouts( self._last_original_obs, new_obs_, reward_ = observation, new_obs, reward self.model._last_original_obs = self._last_original_obs + # As the VecEnv resets automatically, new_obs is already the + # first observation of the next episode + if done and infos[0].get("terminal_observation") is not None: + # The saved terminal_observation is not passed through other + # VecEnvWrapper, so no need to unnormalize + # NOTE: this may be an issue when using other wrappers + next_obs = infos[0]["terminal_observation"] + else: + next_obs = new_obs_ + if self.online_sampling: - self.replay_buffer.add(self._last_original_obs, new_obs_, buffer_action, reward_, done, infos) + self.replay_buffer.add(self._last_original_obs, next_obs, buffer_action, reward_, done, infos) else: # concatenate observation with (desired) goal - obs = ObsDictWrapper.convert_dict(self._last_original_obs) - next_obs = ObsDictWrapper.convert_dict(new_obs_) + flattened_obs = ObsDictWrapper.convert_dict(self._last_original_obs) + flattened_next_obs = ObsDictWrapper.convert_dict(next_obs) # add to replay buffer - self.replay_buffer.add(obs, next_obs, buffer_action, reward_, done) + self.replay_buffer.add(flattened_obs, flattened_next_obs, buffer_action, reward_, done) # add current transition to episode storage - self._episode_storage.add(self._last_original_obs, new_obs_, buffer_action, reward_, done, infos) + self._episode_storage.add(self._last_original_obs, next_obs, buffer_action, reward_, done, infos) self._last_obs = new_obs self.model._last_obs = self._last_obs diff --git a/tests/test_her.py b/tests/test_her.py index b3e2e9a119..a11eb0c7b8 100644 --- a/tests/test_her.py +++ b/tests/test_her.py @@ -184,16 +184,18 @@ def test_save_load_replay_buffer(tmp_path, online_sampling): train_freq=1, n_episodes_rollout=-1, max_episode_length=4, + buffer_size=int(2e4), + seed=0, policy_kwargs=dict(net_arch=[64]), ) - model.learn(300) + model.learn(200) old_replay_buffer = deepcopy(model.replay_buffer) model.save_replay_buffer(path) model.model.replay_buffer = None model.load_replay_buffer(path) if online_sampling: - n_episodes_stored = old_replay_buffer.n_episodes_stored + n_episodes_stored = model.replay_buffer.n_episodes_stored assert np.allclose( old_replay_buffer.buffer["observation"][:n_episodes_stored], 
model.replay_buffer.buffer["observation"][:n_episodes_stored], diff --git a/tests/test_save_load.py b/tests/test_save_load.py index 77ec75e4eb..e6230ebdd2 100644 --- a/tests/test_save_load.py +++ b/tests/test_save_load.py @@ -231,8 +231,10 @@ def test_exclude_include_saved_params(tmp_path, model_class): def test_save_load_replay_buffer(tmp_path, model_class): path = pathlib.Path(tmp_path / "logs/replay_buffer.pkl") path.parent.mkdir(exist_ok=True, parents=True) # to not raise a warning - model = model_class("MlpPolicy", select_env(model_class), buffer_size=1000) - model.learn(500) + model = model_class( + "MlpPolicy", select_env(model_class), buffer_size=1000, policy_kwargs=dict(net_arch=[64]), learning_starts=200 + ) + model.learn(300) old_replay_buffer = deepcopy(model.replay_buffer) model.save_replay_buffer(path) model.replay_buffer = None From d196aa26e13d32c67b3dae071174fb7b8290612c Mon Sep 17 00:00:00 2001 From: Megan Klaiber Date: Wed, 21 Oct 2020 11:23:35 +0200 Subject: [PATCH 72/81] Updated buffer size in offline version and reset of HER buffer --- stable_baselines3/her/her.py | 3 ++- stable_baselines3/her/her_replay_buffer.py | 10 ++++++++++ 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/stable_baselines3/her/her.py b/stable_baselines3/her/her.py index 15733093e9..272e5b56c1 100644 --- a/stable_baselines3/her/her.py +++ b/stable_baselines3/her/her.py @@ -117,9 +117,10 @@ def __init__( # maximum steps in episode self.max_episode_length = get_time_limit(self.env, max_episode_length) # storage for transitions of current episode + her_buffer_size = self.buffer_size if online_sampling else self.max_episode_length self._episode_storage = HerReplayBuffer( self.env, - self.buffer_size, + her_buffer_size, self.max_episode_length, self.goal_selection_strategy, self.env.observation_space, diff --git a/stable_baselines3/her/her_replay_buffer.py b/stable_baselines3/her/her_replay_buffer.py index ca40d84b9b..57572871be 100644 --- a/stable_baselines3/her/her_replay_buffer.py +++ b/stable_baselines3/her/her_replay_buffer.py @@ -336,3 +336,13 @@ def size(self) -> int: :return: The current size of the buffer in transitions. """ return int(np.sum(self.episode_lengths)) + + def reset(self) -> None: + """ + Reset the buffer. 
+ """ + self.pos = 0 + self.current_idx = 0 + self.full = False + self.episode_lengths = np.zeros(self.max_episode_stored, dtype=np.int64) + From 1f7ab9f2fb38616bb524151063086093471a6e08 Mon Sep 17 00:00:00 2001 From: Antonin RAFFIN Date: Wed, 21 Oct 2020 12:21:32 +0200 Subject: [PATCH 73/81] Reformat --- stable_baselines3/her/her_replay_buffer.py | 1 - 1 file changed, 1 deletion(-) diff --git a/stable_baselines3/her/her_replay_buffer.py b/stable_baselines3/her/her_replay_buffer.py index 57572871be..e727f2b73a 100644 --- a/stable_baselines3/her/her_replay_buffer.py +++ b/stable_baselines3/her/her_replay_buffer.py @@ -345,4 +345,3 @@ def reset(self) -> None: self.current_idx = 0 self.full = False self.episode_lengths = np.zeros(self.max_episode_stored, dtype=np.int64) - From 7da274fe9ea873dd96364ef99cfa0dc7e187f1d8 Mon Sep 17 00:00:00 2001 From: Antonin RAFFIN Date: Wed, 21 Oct 2020 14:13:33 +0200 Subject: [PATCH 74/81] Update doc --- README.md | 13 +------------ docs/misc/changelog.rst | 3 ++- docs/modules/a2c.rst | 21 ++++++++++++++++++++- docs/modules/ddpg.rst | 8 +++++--- docs/modules/dqn.rst | 4 ++++ docs/modules/her.rst | 2 +- docs/modules/ppo.rst | 19 +++++++++++++++++++ docs/modules/sac.rst | 11 +++++++---- docs/modules/td3.rst | 8 +++++--- stable_baselines3/her/her.py | 8 +++++++- 10 files changed, 71 insertions(+), 26 deletions(-) diff --git a/README.md b/README.md index 94c005d628..3415e4d091 100644 --- a/README.md +++ b/README.md @@ -35,20 +35,9 @@ These algorithms will make it easier for the research community and industry to | Type hints | :heavy_check_mark: | -### Roadmap to V1.0 - -Please look at the issue for more details. -Planned features: - -- [ ] HER - ### Planned features (v1.1+) -- [ ] DQN extensions (prioritized replay, double q-learning, ...) -- [ ] Support for `Tuple` and `Dict` observation spaces -- [ ] Recurrent Policies -- [ ] TRPO - +Please take a look at the [Roadmap](https://github.com/DLR-RM/stable-baselines3/issues/1) and [Milestones](https://github.com/DLR-RM/stable-baselines3/milestones). ## Migration guide: from Stable-Baselines (SB2) to Stable-Baselines3 (SB3) diff --git a/docs/misc/changelog.rst b/docs/misc/changelog.rst index c2db988b0b..cf9ef5960d 100644 --- a/docs/misc/changelog.rst +++ b/docs/misc/changelog.rst @@ -38,6 +38,7 @@ Others: Documentation: ^^^^^^^^^^^^^^ - Added first draft of migration guide +- Enabled doc for ``CnnPolicies`` Pre-Release 0.9.0 (2020-10-03) @@ -462,4 +463,4 @@ And all the contributors: @MarvineGothic @jdossgollin @SyllogismRXS @rusu24edward @jbulow @Antymon @seheevic @justinkterry @edbeeching @flodorner @KuKuXia @NeoExtended @PartiallyTyped @mmcenta @richardwu @kinalmehta @rolandgvc @tkelestemur @mloo3 @tirafesi @blurLake @koulakis @joeljosephjin @shwang @rk37 @andyshih12 @RaphaelWag @xicocaio -@diditforlulz273 @liorcohen5 @ManifoldFR @mloo3 @SwamyDev @wmmc88 @megan-klaiber \ No newline at end of file +@diditforlulz273 @liorcohen5 @ManifoldFR @mloo3 @SwamyDev @wmmc88 @megan-klaiber diff --git a/docs/modules/a2c.rst b/docs/modules/a2c.rst index 460d1a6e3b..9cd227ce9d 100644 --- a/docs/modules/a2c.rst +++ b/docs/modules/a2c.rst @@ -11,7 +11,7 @@ It uses multiple workers to avoid the use of a replay buffer. .. warning:: - + If you find training unstable or want to match performance of stable-baselines A2C, consider using ``RMSpropTFLike`` optimizer from ``stable_baselines3.common.sb2_compat.rmsprop_tf_like``. 
You can change optimizer with ``A2C(policy_kwargs=dict(optimizer_class=RMSpropTFLike))``. @@ -79,3 +79,22 @@ Parameters .. autoclass:: A2C :members: :inherited-members: + + +A2C Policies +------------- + +.. autoclass:: MlpPolicy + :members: + :inherited-members: + +.. autoclass:: stable_baselines3.common.policies.ActorCriticPolicy + :members: + :noindex: + +.. autoclass:: CnnPolicy + :members: + +.. autoclass:: stable_baselines3.common.policies.ActorCriticCnnPolicy + :members: + :noindex: diff --git a/docs/modules/ddpg.rst b/docs/modules/ddpg.rst index dd74f3a7d5..8add6982a5 100644 --- a/docs/modules/ddpg.rst +++ b/docs/modules/ddpg.rst @@ -98,7 +98,9 @@ DDPG Policies :members: :inherited-members: +.. autoclass:: stable_baselines3.td3.policies.TD3Policy + :members: + :noindex: -.. .. autoclass:: CnnPolicy -.. :members: -.. :inherited-members: +.. autoclass:: CnnPolicy + :members: diff --git a/docs/modules/dqn.rst b/docs/modules/dqn.rst index 034e5b81da..ca9ccca322 100644 --- a/docs/modules/dqn.rst +++ b/docs/modules/dqn.rst @@ -90,5 +90,9 @@ DQN Policies :members: :inherited-members: +.. autoclass:: stable_baselines3.dqn.policies.DQNPolicy + :members: + :noindex: + .. autoclass:: CnnPolicy :members: diff --git a/docs/modules/her.rst b/docs/modules/her.rst index 31d1fac3a1..355b36d496 100644 --- a/docs/modules/her.rst +++ b/docs/modules/her.rst @@ -22,7 +22,7 @@ It creates "virtual" transitions by relabeling transitions (changing the desired .. warning:: For performance reasons, the maximum number of steps per episodes must be specified. - In most cases, it will be inferred if you specify ```max_episode_steps`` when registering the environment + In most cases, it will be inferred if you specify ``max_episode_steps`` when registering the environment or if you use a ``gym.wrappers.TimeLimit`` (and ``env.spec`` is not None). Otherwise, you can directly pass ``max_episode_length`` to the model constructor diff --git a/docs/modules/ppo.rst b/docs/modules/ppo.rst index 038149d950..eca3e1b699 100644 --- a/docs/modules/ppo.rst +++ b/docs/modules/ppo.rst @@ -80,3 +80,22 @@ Parameters .. autoclass:: PPO :members: :inherited-members: + + +PPO Policies +------------- + +.. autoclass:: MlpPolicy + :members: + :inherited-members: + +.. autoclass:: stable_baselines3.common.policies.ActorCriticPolicy + :members: + :noindex: + +.. autoclass:: CnnPolicy + :members: + +.. autoclass:: stable_baselines3.common.policies.ActorCriticCnnPolicy + :members: + :noindex: diff --git a/docs/modules/sac.rst b/docs/modules/sac.rst index 6d559d4183..7b37974c93 100644 --- a/docs/modules/sac.rst +++ b/docs/modules/sac.rst @@ -82,7 +82,7 @@ Example obs = env.reset() while True: - action, _states = model.predict(obs) + action, _states = model.predict(obs, deterministic=True) obs, reward, done, info = env.step(action) env.render() if done: @@ -104,6 +104,9 @@ SAC Policies :members: :inherited-members: -.. .. autoclass:: CnnPolicy -.. :members: -.. :inherited-members: +.. autoclass:: stable_baselines3.sac.policies.SACPolicy + :members: + :noindex: + +.. autoclass:: CnnPolicy + :members: diff --git a/docs/modules/td3.rst b/docs/modules/td3.rst index 912fc1b97c..fbe6aabd50 100644 --- a/docs/modules/td3.rst +++ b/docs/modules/td3.rst @@ -101,7 +101,9 @@ TD3 Policies :members: :inherited-members: +.. autoclass:: stable_baselines3.td3.policies.TD3Policy + :members: + :noindex: -.. .. autoclass:: CnnPolicy -.. :members: -.. :inherited-members: +.. 
autoclass:: CnnPolicy + :members: diff --git a/stable_baselines3/her/her.py b/stable_baselines3/her/her.py index 272e5b56c1..bf7c486356 100644 --- a/stable_baselines3/her/her.py +++ b/stable_baselines3/her/her.py @@ -51,7 +51,13 @@ class HER(BaseAlgorithm): Hindsight Experience Replay (HER) Paper: https://arxiv.org/abs/1707.01495 - WARNING: Requires maximum episode length provided either by the environment or by the user! + .. warning:: + + For performance reasons, the maximum number of steps per episodes must be specified. + In most cases, it will be inferred if you specify ``max_episode_steps`` when registering the environment + or if you use a ``gym.wrappers.TimeLimit`` (and ``env.spec`` is not None). + Otherwise, you can directly pass ``max_episode_length`` to the model constructor + For additional offline algorithm specific arguments please have a look at the corresponding documentation. From 8bb5c7c670b13014402446d1ccea1660aadebd29 Mon Sep 17 00:00:00 2001 From: Antonin RAFFIN Date: Wed, 21 Oct 2020 17:23:49 +0200 Subject: [PATCH 75/81] Remove np.empty + add doc --- stable_baselines3/common/atari_wrappers.py | 2 +- stable_baselines3/her/her_replay_buffer.py | 26 +++++++++++++--------- tests/test_her.py | 19 +++++++++------- 3 files changed, 27 insertions(+), 20 deletions(-) diff --git a/stable_baselines3/common/atari_wrappers.py b/stable_baselines3/common/atari_wrappers.py index 7cc6836aaa..b0c52959bf 100644 --- a/stable_baselines3/common/atari_wrappers.py +++ b/stable_baselines3/common/atari_wrappers.py @@ -34,7 +34,7 @@ def reset(self, **kwargs) -> np.ndarray: else: noops = self.unwrapped.np_random.randint(1, self.noop_max + 1) assert noops > 0 - obs = np.empty(0) + obs = np.zeros(0) for _ in range(noops): obs, _, done, _ = self.env.step(self.noop_action) if done: diff --git a/stable_baselines3/her/her_replay_buffer.py b/stable_baselines3/her/her_replay_buffer.py index e727f2b73a..5ca20eb71b 100644 --- a/stable_baselines3/her/her_replay_buffer.py +++ b/stable_baselines3/her/her_replay_buffer.py @@ -70,7 +70,7 @@ def __init__( "done": (1,), } self.buffer = { - key: np.empty((self.max_episode_stored, self.max_episode_length, *dim), dtype=np.float32) + key: np.zeros((self.max_episode_stored, self.max_episode_length, *dim), dtype=np.float32) for key, dim in input_shape.items() } # Store info dicts are it can be used to compute the reward (e.g. 
continuity cost) @@ -129,7 +129,7 @@ def sample( env: Optional[VecNormalize] = None, online_sampling: bool = True, n_sampled_goal: int = None, - ) -> Union[ReplayBufferSamples, Tuple]: + ) -> Union[ReplayBufferSamples, Tuple[np.ndarray, ...]]: """ :param batch_size: Number of element to sample :param env: Associated gym VecEnv @@ -182,7 +182,7 @@ def _sample_transitions( maybe_vec_env: Optional[VecNormalize], online_sampling: bool = True, n_sampled_goal: int = None, - ) -> Union[ReplayBufferSamples, Tuple]: + ) -> Union[ReplayBufferSamples, Tuple[np.ndarray, ...]]: """ :param batch_size: Number of element to sample :param env: associated gym VecEnv to normalize the observations/rewards @@ -194,6 +194,7 @@ def _sample_transitions( # Select which episodes to use if online_sampling: episode_indices = np.random.randint(0, self.n_episodes_stored, batch_size) + # A subset of the transitions will be relabeled using HER algorithm her_indices = np.arange(batch_size)[: int(self.her_ratio * batch_size)] else: assert maybe_vec_env is None, "Transitions must be stored unnormalized in the replay buffer" @@ -205,27 +206,29 @@ def _sample_transitions( # as real transitions are already stored in the replay buffer her_indices = np.arange(len(episode_indices)) - ep_length = self.episode_lengths[episode_indices] + ep_lengths = self.episode_lengths[episode_indices] + # Special case when using the "future" goal sampling strategy + # we cannot sample all transitions, we have to remove the last timestep if self.goal_selection_strategy == GoalSelectionStrategy.FUTURE: - # restrict the sampling domain when ep_length > 1 + # restrict the sampling domain when ep_lengths > 1 # otherwise filter out the indices - her_indices = her_indices[ep_length[her_indices] > 1] - ep_length[her_indices] -= 1 + her_indices = her_indices[ep_lengths[her_indices] > 1] + ep_lengths[her_indices] -= 1 if online_sampling: # Select which transitions to use - transitions_indices = np.random.randint(ep_length) + transitions_indices = np.random.randint(ep_lengths) else: if her_indices.size == 0: # Episode of one timestep, not enough for using the "future" strategy # no virtual transitions are created in that case - return np.empty(0), np.empty(0), np.empty(0), np.empty(0) + return np.zeros(0), np.zeros(0), np.zeros(0), np.zeros(0) else: # Repeat every transition index n_sampled_goals times # to sample n_sampled_goal per timestep in the episode (only one is stored). # Now with the corrected episode length when using "future" strategy - transitions_indices = np.tile(np.arange(ep_length[0]), n_sampled_goal) + transitions_indices = np.tile(np.arange(ep_lengths[0]), n_sampled_goal) episode_indices = episode_indices[transitions_indices] her_indices = np.arange(len(episode_indices)) @@ -254,6 +257,7 @@ def _sample_transitions( # r_t = reward(s_t, a_t) = reward(next_achieved_goal, desired_goal) # therefore we have to use "next_achieved_goal" and not "achieved_goal" transitions["next_achieved_goal"][her_indices, 0], + # here we use the new desired goal transitions["desired_goal"][her_indices, 0], transitions["info"][her_indices, 0], ) @@ -333,7 +337,7 @@ def n_episodes_stored(self) -> int: def size(self) -> int: """ - :return: The current size of the buffer in transitions. + :return: The current number of transitions in the buffer. 
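The relabeling these comments describe can be sketched outside the buffer with the ``BitFlippingEnv`` used in the tests (a simplified single-transition illustration assuming the standard ``gym.GoalEnv`` interface; the buffer itself does this in a vectorized way via ``env_method``):

from stable_baselines3.common.bit_flipping_env import BitFlippingEnv

env = BitFlippingEnv(n_bits=4)
obs = env.reset()
next_obs, reward, done, info = env.step(env.action_space.sample())
# HER relabeling: pretend the desired goal was what was actually achieved next
new_goal = next_obs["achieved_goal"]
# recompute the reward for the new goal, using next_achieved_goal as explained above
new_reward = env.compute_reward(next_obs["achieved_goal"], new_goal, info)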
""" return int(np.sum(self.episode_lengths)) diff --git a/tests/test_her.py b/tests/test_her.py index a11eb0c7b8..17be564f90 100644 --- a/tests/test_her.py +++ b/tests/test_her.py @@ -125,15 +125,15 @@ def test_save_load(tmp_path, model_class, use_sde, online_sampling): observations = np.array(observations_list) # Get dictionary of current parameters - params = deepcopy(model.model.policy.state_dict()) + params = deepcopy(model.policy.state_dict()) # Modify all parameters to be random values random_params = dict((param_name, th.rand_like(param)) for param_name, param in params.items()) # Update model parameters with the new random values - model.model.policy.load_state_dict(random_params) + model.policy.load_state_dict(random_params) - new_params = model.model.policy.state_dict() + new_params = model.policy.state_dict() # Check that all params are different now for k in params: assert not th.allclose(params[k], new_params[k]), "Parameters did not change as expected." @@ -141,7 +141,7 @@ def test_save_load(tmp_path, model_class, use_sde, online_sampling): params = new_params # get selected actions - selected_actions, _ = model.model.predict(observations, deterministic=True) + selected_actions, _ = model.predict(observations, deterministic=True) # Check model.save(tmp_path / "test_save.zip") @@ -149,14 +149,14 @@ def test_save_load(tmp_path, model_class, use_sde, online_sampling): model = HER.load(str(tmp_path / "test_save.zip"), env=env) # check if params are still the same after load - new_params = model.model.policy.state_dict() + new_params = model.policy.state_dict() # Check that all params are the same as before save load procedure now for key in params: assert th.allclose(params[key], new_params[key]), "Model parameters not the same after save and load." 
# check if model still selects the same actions - new_selected_actions, _ = model.model.predict(observations, deterministic=True) + new_selected_actions, _ = model.predict(observations, deterministic=True) assert np.allclose(selected_actions, new_selected_actions, 1e-4) # check if learn still works @@ -185,13 +185,16 @@ def test_save_load_replay_buffer(tmp_path, online_sampling): n_episodes_rollout=-1, max_episode_length=4, buffer_size=int(2e4), - seed=0, policy_kwargs=dict(net_arch=[64]), ) model.learn(200) old_replay_buffer = deepcopy(model.replay_buffer) model.save_replay_buffer(path) - model.model.replay_buffer = None + del model.model.replay_buffer + + with pytest.raises(AttributeError): + model.replay_buffer + model.load_replay_buffer(path) if online_sampling: From d884f9c591ec622874645bc0667d809c8a69bfa6 Mon Sep 17 00:00:00 2001 From: Antonin RAFFIN Date: Wed, 21 Oct 2020 18:36:57 +0200 Subject: [PATCH 76/81] Fix loading --- stable_baselines3/her/her.py | 38 +++++++++++++++++++++++++++--------- tests/test_her.py | 5 +++++ 2 files changed, 34 insertions(+), 9 deletions(-) diff --git a/stable_baselines3/her/her.py b/stable_baselines3/her/her.py index bf7c486356..fd4dd4692d 100644 --- a/stable_baselines3/her/her.py +++ b/stable_baselines3/her/her.py @@ -87,16 +87,22 @@ def __init__( **kwargs, ): - super(HER, self).__init__(policy=BasePolicy, env=env, policy_base=BasePolicy, learning_rate=3e-4) + # we will use the policy and learning rate from the model + super(HER, self).__init__(policy=BasePolicy, env=env, policy_base=BasePolicy, learning_rate=0.0) + del self.policy, self.learning_rate if self.get_vec_normalize_env() is not None: assert online_sampling, "You must pass `online_sampling=True` if you want to use `VecNormalize` with `HER`" + _init_setup_model = kwargs.get("_init_setup_model", True) + if "_init_setup_model" in kwargs: + del kwargs["_init_setup_model"] # model initialization self.model_class = model_class self.model = model_class( policy=policy, env=self.env, + _init_setup_model=False, # pytype: disable=wrong-keyword-args *args, **kwargs, # pytype: disable=wrong-keyword-args ) @@ -122,7 +128,8 @@ def __init__( self.her_ratio = 1 - (1.0 / (self.n_sampled_goal + 1)) # maximum steps in episode self.max_episode_length = get_time_limit(self.env, max_episode_length) - # storage for transitions of current episode + # storage for transitions of current episode for offline sampling + # for online sampling, it replaces the "classic" replay buffer completely her_buffer_size = self.buffer_size if online_sampling else self.max_episode_length self._episode_storage = HerReplayBuffer( self.env, @@ -136,15 +143,17 @@ def __init__( self.her_ratio, # pytype: disable=wrong-arg-types ) - # assign episode storage to replay buffer when using online HER sampling - if self.online_sampling: - self.model.replay_buffer = self._episode_storage - # counter for steps in episode self.episode_steps = 0 + if _init_setup_model: + self._setup_model() + def _setup_model(self) -> None: self.model._setup_model() + # assign episode storage to replay buffer when using online HER sampling + if self.online_sampling: + self.model.replay_buffer = self._episode_storage def predict( self, @@ -466,10 +475,20 @@ def load( if "env" in data: env = data["env"] - kwargs = {} if "use_sde" in data and data["use_sde"]: kwargs["use_sde"] = True + # Keys that cannot be changed + for key in {"model_class", "online_sampling", "max_episode_length"}: + if key in kwargs: + del kwargs[key] + + # Keys that can be changed + for key in 
{"n_sampled_goal", "goal_selection_strategy"}: + if key in kwargs: + data[key] = kwargs[key] # pytype: disable=unsupported-operands + del kwargs[key] + # noinspection PyArgumentList her_model = cls( policy=data["policy_class"], @@ -480,13 +499,14 @@ def load( online_sampling=data["online_sampling"], max_episode_length=data["max_episode_length"], policy_kwargs=data["policy_kwargs"], - _init_setup_model=True, # pytype: disable=not-instantiable,wrong-keyword-args + _init_setup_model=False, # pytype: disable=not-instantiable,wrong-keyword-args **kwargs, ) # load parameters her_model.model.__dict__.update(data) - her_model.__dict__.update(kwargs) + her_model.model.__dict__.update(kwargs) + her_model._setup_model() her_model._total_timesteps = her_model.model._total_timesteps her_model.num_timesteps = her_model.model.num_timesteps diff --git a/tests/test_her.py b/tests/test_her.py index 17be564f90..6c4fe0ef16 100644 --- a/tests/test_her.py +++ b/tests/test_her.py @@ -162,6 +162,11 @@ def test_save_load(tmp_path, model_class, use_sde, online_sampling): # check if learn still works model.learn(total_timesteps=300) + # Test that the change of parameters works + model = HER.load(str(tmp_path / "test_save.zip"), env=env, verbose=3, learning_rate=2.0) + assert model.model.learning_rate == 2.0 + assert model.verbose == 3 + # clear file from os os.remove(tmp_path / "test_save.zip") From 0ba127270f3170e5ad60eac8027b4ac189d95207 Mon Sep 17 00:00:00 2001 From: Megan Klaiber Date: Wed, 21 Oct 2020 22:46:58 +0200 Subject: [PATCH 77/81] Updated loading replay buffer --- stable_baselines3/her/her.py | 27 +++++++++++++++++++++++++-- tests/test_her.py | 13 +++++++++---- 2 files changed, 34 insertions(+), 6 deletions(-) diff --git a/stable_baselines3/her/her.py b/stable_baselines3/her/her.py index fd4dd4692d..c15453bd3c 100644 --- a/stable_baselines3/her/her.py +++ b/stable_baselines3/her/her.py @@ -1,5 +1,6 @@ import io import pathlib +import warnings from typing import Any, Iterable, List, Optional, Tuple, Type, Union import numpy as np @@ -526,15 +527,37 @@ def load( her_model.model.policy.reset_noise() # pytype: disable=attribute-error return her_model - def load_replay_buffer(self, path: Union[str, pathlib.Path, io.BufferedIOBase]) -> None: + def load_replay_buffer( + self, path: Union[str, pathlib.Path, io.BufferedIOBase], truncate_last_trajectory: bool = True + ) -> None: """ Load a replay buffer from a pickle file and set environment for replay buffer (only online sampling). :param path: Path to the pickled replay buffer. + :param truncate_last_trajectory: + If set to ``True`` we assume that the last trajectory in the replay buffer was finished. + If it is set to ``False`` we assume it is the same trajectory where we continue. """ self.model.load_replay_buffer(path=path) if self.online_sampling: # set environment self.replay_buffer.set_env(self.env) - self.replay_buffer.current_idx = 0 + + # truncate interrupted episode + if truncate_last_trajectory: + warnings.warn( + "The last trajectory in the replay buffer will be truncated, " + "You should use `truncate_last_trajectory=False` to avoid that issue." 
+ ) + # get current episode and transition index + pos = self.replay_buffer.pos + current_idx = self.replay_buffer.current_idx + # set episode length for current episode + self.replay_buffer.episode_lengths[pos] = current_idx + # set done = True for current episode + self.replay_buffer.buffer["done"][pos][current_idx] = np.array([True], dtype=np.float32) + # reset current transition index + self.replay_buffer.current_idx = 0 + # increment episode counter + self.replay_buffer.pos += 1 diff --git a/tests/test_her.py b/tests/test_her.py index 6c4fe0ef16..c21a3d78d4 100644 --- a/tests/test_her.py +++ b/tests/test_her.py @@ -171,8 +171,8 @@ def test_save_load(tmp_path, model_class, use_sde, online_sampling): os.remove(tmp_path / "test_save.zip") -@pytest.mark.parametrize("online_sampling", [False, True]) -def test_save_load_replay_buffer(tmp_path, online_sampling): +@pytest.mark.parametrize("online_sampling, truncate_last_trajectory", [(False, None), (True, True), (True, False)]) +def test_save_load_replay_buffer(tmp_path, online_sampling, truncate_last_trajectory): """ Test if 'save_replay_buffer' and 'load_replay_buffer' works correctly """ @@ -200,7 +200,7 @@ def test_save_load_replay_buffer(tmp_path, online_sampling): with pytest.raises(AttributeError): model.replay_buffer - model.load_replay_buffer(path) + model.load_replay_buffer(path, truncate_last_trajectory) if online_sampling: n_episodes_stored = model.replay_buffer.n_episodes_stored @@ -218,8 +218,10 @@ def test_save_load_replay_buffer(tmp_path, online_sampling): assert np.allclose( old_replay_buffer.buffer["reward"][:n_episodes_stored], model.replay_buffer.buffer["reward"][:n_episodes_stored] ) + # we might change the last done of the last trajectory so we don't compare it assert np.allclose( - old_replay_buffer.buffer["done"][:n_episodes_stored], model.replay_buffer.buffer["done"][:n_episodes_stored] + old_replay_buffer.buffer["done"][: n_episodes_stored - 1], + model.replay_buffer.buffer["done"][: n_episodes_stored - 1], ) else: assert np.allclose(old_replay_buffer.observations, model.replay_buffer.observations) @@ -227,6 +229,9 @@ def test_save_load_replay_buffer(tmp_path, online_sampling): assert np.allclose(old_replay_buffer.rewards, model.replay_buffer.rewards) assert np.allclose(old_replay_buffer.dones, model.replay_buffer.dones) + # test if continuing training works properly + model.learn(200) + def test_get_max_episode_length(): dict_env = DummyVecEnv([lambda: BitFlippingEnv()]) From 403421784e1dcf8ddaa931c53180ea1f64ee0ac0 Mon Sep 17 00:00:00 2001 From: Antonin RAFFIN Date: Thu, 22 Oct 2020 10:56:47 +0200 Subject: [PATCH 78/81] Separate online and offline sampling + bug fixes --- stable_baselines3/her/her.py | 29 ++++++++-------- stable_baselines3/her/her_replay_buffer.py | 39 ++++++++++++++++------ tests/test_her.py | 20 +++++++++-- 3 files changed, 63 insertions(+), 25 deletions(-) diff --git a/stable_baselines3/her/her.py b/stable_baselines3/her/her.py index c15453bd3c..83b1be4563 100644 --- a/stable_baselines3/her/her.py +++ b/stable_baselines3/her/her.py @@ -380,15 +380,14 @@ def collect_rollouts( def _sample_her_transitions(self) -> None: """ Sample additional goals and store new transitions in replay buffer - when using offline sampling + when using offline sampling. 
""" - # sample goals and get new observations - observations, next_observations, actions, rewards = self._episode_storage.sample( - self.batch_size, - None, # we should store unnormalized transitions, they will be normalized at sampling time - self.online_sampling, - self.n_sampled_goal, + # Sample goals and get new observations + # maybe_vec_env=None as we should store unnormalized transitions, + # they will be normalized at sampling time + observations, next_observations, actions, rewards = self._episode_storage.sample_offline( + n_sampled_goal=self.n_sampled_goal ) # store data in replay buffer @@ -534,9 +533,9 @@ def load_replay_buffer( Load a replay buffer from a pickle file and set environment for replay buffer (only online sampling). :param path: Path to the pickled replay buffer. - :param truncate_last_trajectory: + :param truncate_last_trajectory: Only for online sampling. If set to ``True`` we assume that the last trajectory in the replay buffer was finished. - If it is set to ``False`` we assume it is the same trajectory where we continue. + If it is set to ``False`` we assume that we continue the same trajectory (same episode). """ self.model.load_replay_buffer(path=path) @@ -547,8 +546,9 @@ def load_replay_buffer( # truncate interrupted episode if truncate_last_trajectory: warnings.warn( - "The last trajectory in the replay buffer will be truncated, " - "You should use `truncate_last_trajectory=False` to avoid that issue." + "The last trajectory in the replay buffer will be truncated.\n" + "If you are in the same episode as when the replay buffer was saved,\n" + "you should use `truncate_last_trajectory=False` to avoid that issue." ) # get current episode and transition index pos = self.replay_buffer.pos @@ -556,8 +556,11 @@ def load_replay_buffer( # set episode length for current episode self.replay_buffer.episode_lengths[pos] = current_idx # set done = True for current episode - self.replay_buffer.buffer["done"][pos][current_idx] = np.array([True], dtype=np.float32) + # current_idx was already incremented + self.replay_buffer.buffer["done"][pos][current_idx - 1] = np.array([True], dtype=np.float32) # reset current transition index self.replay_buffer.current_idx = 0 # increment episode counter - self.replay_buffer.pos += 1 + self.replay_buffer.pos = (self.replay_buffer.pos + 1) % self.replay_buffer.max_episode_stored + # update "full" indicator + self.replay_buffer.full = self.replay_buffer.full or self.replay_buffer.pos == 0 diff --git a/stable_baselines3/her/her_replay_buffer.py b/stable_baselines3/her/her_replay_buffer.py index 5ca20eb71b..5bbf1b9774 100644 --- a/stable_baselines3/her/her_replay_buffer.py +++ b/stable_baselines3/her/her_replay_buffer.py @@ -126,19 +126,36 @@ def _get_samples( def sample( self, batch_size: int, - env: Optional[VecNormalize] = None, - online_sampling: bool = True, - n_sampled_goal: int = None, + env: Optional[VecNormalize], ) -> Union[ReplayBufferSamples, Tuple[np.ndarray, ...]]: """ + Sample function for online sampling of HER transition, + this replaces the "regular" replay buffer ``sample()`` + method in the ``train()`` function. + :param batch_size: Number of element to sample :param env: Associated gym VecEnv to normalize the observations/rewards when sampling - :param online_sampling: Using online_sampling for HER or not. - :param n_sampled_goal: Number of sampled goals for replay. (offline sampling) :return: Samples. 
""" - return self._sample_transitions(batch_size, env, online_sampling, n_sampled_goal) + return self._sample_transitions(batch_size, maybe_vec_env=env, online_sampling=True) + + def sample_offline( + self, + n_sampled_goal: Optional[int] = None, + ) -> Union[ReplayBufferSamples, Tuple[np.ndarray, ...]]: + """ + Sample function for offline sampling of HER transition, + in that case, only one episode is used and transitions + are added to the regular replay buffer. + + :param n_sampled_goal: Number of sampled goals for replay + :return: at most(n_sampled_goal * episode_length) HER transitions. + """ + # env=None as we should store unnormalized transitions, they will be normalized at sampling time + return self._sample_transitions( + batch_size=None, maybe_vec_env=None, online_sampling=False, n_sampled_goal=n_sampled_goal + ) def sample_goals( self, @@ -178,13 +195,13 @@ def sample_goals( def _sample_transitions( self, - batch_size: int, + batch_size: Optional[int], maybe_vec_env: Optional[VecNormalize], - online_sampling: bool = True, - n_sampled_goal: int = None, + online_sampling: bool, + n_sampled_goal: Optional[int] = None, ) -> Union[ReplayBufferSamples, Tuple[np.ndarray, ...]]: """ - :param batch_size: Number of element to sample + :param batch_size: Number of element to sample (only used for online sampling) :param env: associated gym VecEnv to normalize the observations/rewards Only valid when using online sampling :param online_sampling: Using online_sampling for HER or not. @@ -193,11 +210,13 @@ def _sample_transitions( """ # Select which episodes to use if online_sampling: + assert batch_size is not None, "No batch_size specified for online sampling of HER transitions" episode_indices = np.random.randint(0, self.n_episodes_stored, batch_size) # A subset of the transitions will be relabeled using HER algorithm her_indices = np.arange(batch_size)[: int(self.her_ratio * batch_size)] else: assert maybe_vec_env is None, "Transitions must be stored unnormalized in the replay buffer" + assert n_sampled_goal is not None, "No n_sampled_goal specified for offline sampling of HER transitions" # Offline sampling: there is only one episode stored episode_length = self.episode_lengths[0] # we sample n_sampled_goal per timestep in the episode (only one is stored). 
diff --git a/tests/test_her.py b/tests/test_her.py index c21a3d78d4..bd2e36cfd1 100644 --- a/tests/test_her.py +++ b/tests/test_her.py @@ -1,5 +1,6 @@ import os import pathlib +import warnings from copy import deepcopy import gym @@ -172,10 +173,14 @@ def test_save_load(tmp_path, model_class, use_sde, online_sampling): @pytest.mark.parametrize("online_sampling, truncate_last_trajectory", [(False, None), (True, True), (True, False)]) -def test_save_load_replay_buffer(tmp_path, online_sampling, truncate_last_trajectory): +def test_save_load_replay_buffer(tmp_path, recwarn, online_sampling, truncate_last_trajectory): """ Test if 'save_replay_buffer' and 'load_replay_buffer' works correctly """ + # remove gym warnings + warnings.filterwarnings(action="ignore", category=DeprecationWarning) + warnings.filterwarnings(action="ignore", category=UserWarning, module="gym") + path = pathlib.Path(tmp_path / "logs/replay_buffer.pkl") path.parent.mkdir(exist_ok=True, parents=True) # to not raise a warning env = BitFlippingEnv(n_bits=4, continuous=True) @@ -200,8 +205,18 @@ def test_save_load_replay_buffer(tmp_path, online_sampling, truncate_last_trajec with pytest.raises(AttributeError): model.replay_buffer + # Check that there is no warning + assert len(recwarn) == 0 + model.load_replay_buffer(path, truncate_last_trajectory) + if truncate_last_trajectory: + assert len(recwarn) == 1 + warning = recwarn.pop(UserWarning) + assert "The last trajectory in the replay buffer will be truncated" in str(warning.message) + else: + assert len(recwarn) == 0 + if online_sampling: n_episodes_stored = model.replay_buffer.n_episodes_stored assert np.allclose( @@ -230,7 +245,8 @@ def test_save_load_replay_buffer(tmp_path, online_sampling, truncate_last_trajec assert np.allclose(old_replay_buffer.dones, model.replay_buffer.dones) # test if continuing training works properly - model.learn(200) + reset_num_timesteps = False if truncate_last_trajectory is False else True + model.learn(200, reset_num_timesteps=reset_num_timesteps) def test_get_max_episode_length(): From aacd9363b61480eb3140760a8366a536b38bcad6 Mon Sep 17 00:00:00 2001 From: Antonin RAFFIN Date: Thu, 22 Oct 2020 11:02:23 +0200 Subject: [PATCH 79/81] Update tensorboard log name --- stable_baselines3/her/her.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/stable_baselines3/her/her.py b/stable_baselines3/her/her.py index 83b1be4563..e80035e6e9 100644 --- a/stable_baselines3/her/her.py +++ b/stable_baselines3/her/her.py @@ -174,7 +174,7 @@ def learn( eval_env: Optional[GymEnv] = None, eval_freq: int = -1, n_eval_episodes: int = 5, - tb_log_name: str = "run", + tb_log_name: str = "HER", eval_log_path: Optional[str] = None, reset_num_timesteps: bool = True, ) -> BaseAlgorithm: From 940ee2c0c9bef10689dbf06b66b997046d814814 Mon Sep 17 00:00:00 2001 From: Antonin RAFFIN Date: Thu, 22 Oct 2020 11:08:48 +0200 Subject: [PATCH 80/81] Version bump --- docs/misc/changelog.rst | 2 +- stable_baselines3/version.txt | 2 +- tests/test_her.py | 1 + 3 files changed, 3 insertions(+), 2 deletions(-) diff --git a/docs/misc/changelog.rst b/docs/misc/changelog.rst index cf9ef5960d..1d6b2a4723 100644 --- a/docs/misc/changelog.rst +++ b/docs/misc/changelog.rst @@ -4,7 +4,7 @@ Changelog ========== -Pre-Release 0.10.0a0 (WIP) +Pre-Release 0.10.0a1 (WIP) ------------------------------ Breaking Changes: diff --git a/stable_baselines3/version.txt b/stable_baselines3/version.txt index 37f1777fc3..8dabd1f602 100644 --- a/stable_baselines3/version.txt +++ 
b/stable_baselines3/version.txt @@ -1 +1 @@ -0.10.0a0 +0.10.0a1 diff --git a/tests/test_her.py b/tests/test_her.py index bd2e36cfd1..09d1a78580 100644 --- a/tests/test_her.py +++ b/tests/test_her.py @@ -196,6 +196,7 @@ def test_save_load_replay_buffer(tmp_path, recwarn, online_sampling, truncate_la max_episode_length=4, buffer_size=int(2e4), policy_kwargs=dict(net_arch=[64]), + seed=0, ) model.learn(200) old_replay_buffer = deepcopy(model.replay_buffer) From 3bb19a7618180ddf0838da0886f6f61792b6f9fe Mon Sep 17 00:00:00 2001 From: Antonin RAFFIN Date: Thu, 22 Oct 2020 11:33:33 +0200 Subject: [PATCH 81/81] Bug fix for special case --- stable_baselines3/her/her.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/stable_baselines3/her/her.py b/stable_baselines3/her/her.py index e80035e6e9..658abc6fe7 100644 --- a/stable_baselines3/her/her.py +++ b/stable_baselines3/her/her.py @@ -542,9 +542,11 @@ def load_replay_buffer( if self.online_sampling: # set environment self.replay_buffer.set_env(self.env) + # If we are at the start of an episode, no need to truncate + current_idx = self.replay_buffer.current_idx # truncate interrupted episode - if truncate_last_trajectory: + if truncate_last_trajectory and current_idx > 0: warnings.warn( "The last trajectory in the replay buffer will be truncated.\n" "If you are in the same episode as when the replay buffer was saved,\n" @@ -552,7 +554,6 @@ def load_replay_buffer( ) # get current episode and transition index pos = self.replay_buffer.pos - current_idx = self.replay_buffer.current_idx # set episode length for current episode self.replay_buffer.episode_lengths[pos] = current_idx # set done = True for current episode
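Taken together, the patches above converge on the following usage pattern; a small end-to-end sketch mirroring the tests (parameter values and file names are illustrative, not recommended settings):

from stable_baselines3 import HER, SAC
from stable_baselines3.common.bit_flipping_env import BitFlippingEnv

env = BitFlippingEnv(n_bits=4, continuous=True)

model = HER(
    "MlpPolicy",
    env,
    SAC,
    n_sampled_goal=4,
    goal_selection_strategy="future",
    online_sampling=True,
    # the env is not wrapped in a gym.wrappers.TimeLimit, so the episode length
    # cannot be inferred and must be passed explicitly
    max_episode_length=4,
    buffer_size=int(2e4),
    verbose=0,
)
model.learn(200)

# save the agent and its (online) HER replay buffer
model.save("her_bit_flipping")
model.save_replay_buffer("her_replay_buffer.pkl")

# restore and continue training
model = HER.load("her_bit_flipping", env=env)
# truncate_last_trajectory=True assumes the last stored episode was interrupted
model.load_replay_buffer("her_replay_buffer.pkl", truncate_last_trajectory=True)
model.learn(200, reset_num_timesteps=False)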