|
| 1 | +.. _her: |
| 2 | + |
| 3 | +.. automodule:: stable_baselines3.her |
| 4 | + |
| 5 | + |
| 6 | +HER |
| 7 | +==== |
| 8 | + |
| 9 | +`Hindsight Experience Replay (HER) <https://arxiv.org/abs/1707.01495>`_ |
| 10 | + |
| 11 | +HER is an algorithm that works with off-policy methods (DQN, SAC, TD3 and DDPG for example). |
| 12 | +HER uses the fact that even if a desired goal was not achieved, other goals may have been achieved during a rollout. |
| 13 | +It creates "virtual" transitions by relabeling transitions (changing the desired goal) from past episodes. |
| 14 | + |
| 15 | + |
| 16 | + |
| 17 | +.. warning:: |
| 18 | + |
| 19 | +    HER requires the environment to inherit from `gym.GoalEnv <https://github.com/openai/gym/blob/3394e245727c1ae6851b504a50ba77c73cd4c65b/gym/core.py#L160>`_ |
| 20 | + |
| 21 | + |
| 22 | +.. warning:: |
| 23 | + |
| 24 | +    For performance reasons, the maximum number of steps per episode must be specified. |
| 25 | + In most cases, it will be inferred if you specify ``max_episode_steps`` when registering the environment |
| 26 | + or if you use a ``gym.wrappers.TimeLimit`` (and ``env.spec`` is not None). |
| 27 | +    Otherwise, you can directly pass ``max_episode_length`` to the model constructor. |
| 28 | + |
| 29 | + |
| 30 | +.. warning:: |
| 31 | + |
| 32 | + ``HER`` supports ``VecNormalize`` wrapper but only when ``online_sampling=True`` |
| 33 | + |
| 34 | + |
| 35 | +Notes |
| 36 | +----- |
| 37 | + |
| 38 | +- Original paper: https://arxiv.org/abs/1707.01495 |
| 39 | +- OpenAI paper: `Plappert et al. (2018)`_ |
| 40 | +- OpenAI blog post: https://openai.com/blog/ingredients-for-robotics-research/ |
| 41 | + |
| 42 | + |
| 43 | +.. _Plappert et al. (2018): https://arxiv.org/abs/1802.09464 |
| 44 | + |
| 45 | +Can I use? |
| 46 | +---------- |
| 47 | + |
| 48 | +Please refer to the documentation of the underlying model (DQN, SAC, TD3 or DDPG) for this section. |
| 49 | + |
| 50 | +Example |
| 51 | +------- |
| 52 | + |
| 53 | +.. code-block:: python |
| 54 | +
|
| 55 | + from stable_baselines3 import HER, DDPG, DQN, SAC, TD3 |
| 56 | + from stable_baselines3.her.goal_selection_strategy import GoalSelectionStrategy |
| 57 | + from stable_baselines3.common.bit_flipping_env import BitFlippingEnv |
| 58 | + from stable_baselines3.common.vec_env import DummyVecEnv |
| 59 | + from stable_baselines3.common.vec_env.obs_dict_wrapper import ObsDictWrapper |
| 60 | +
|
| 61 | + model_class = DQN # works also with SAC, DDPG and TD3 |
| 62 | + N_BITS = 15 |
| 63 | +
|
| 64 | + env = BitFlippingEnv(n_bits=N_BITS, continuous=model_class in [DDPG, SAC, TD3], max_steps=N_BITS) |
| 65 | +
|
| 66 | + # Available strategies (cf paper): future, final, episode |
| 67 | + goal_selection_strategy = 'future' # equivalent to GoalSelectionStrategy.FUTURE |
| 68 | +
|
| 69 | +    # If True, the HER transitions will get sampled online |
| 70 | + online_sampling = True |
| 71 | + # Time limit for the episodes |
| 72 | + max_episode_length = N_BITS |
| 73 | +
|
| 74 | + # Initialize the model |
| 75 | + model = HER('MlpPolicy', env, model_class, n_sampled_goal=4, goal_selection_strategy=goal_selection_strategy, online_sampling=online_sampling, |
| 76 | + verbose=1, max_episode_length=max_episode_length) |
| 77 | + # Train the model |
| 78 | + model.learn(1000) |
| 79 | +
|
| 80 | + model.save("./her_bit_env") |
| 81 | + model = HER.load('./her_bit_env', env=env) |
| 82 | +
|
| 83 | + obs = env.reset() |
| 84 | + for _ in range(100): |
| 85 | + action, _ = model.model.predict(obs, deterministic=True) |
| 86 | + obs, reward, done, _ = env.step(action) |
| 87 | +
|
| 88 | + if done: |
| 89 | + obs = env.reset() |
| 90 | +
|
| 91 | +
|
| 92 | +Parameters |
| 93 | +---------- |
| 94 | + |
| 95 | +.. autoclass:: HER |
| 96 | + :members: |
| 97 | + |
| 98 | +Goal Selection Strategies |
| 99 | +------------------------- |
| 100 | + |
| 101 | +.. autoclass:: GoalSelectionStrategy |
| 102 | + :members: |
| 103 | + :inherited-members: |
| 104 | + :undoc-members: |
| 105 | + |
| 106 | + |
| 107 | +Obs Dict Wrapper |
| 108 | +---------------- |
| 109 | + |
| 110 | +.. autoclass:: ObsDictWrapper |
| 111 | + :members: |
| 112 | + :inherited-members: |
| 113 | + :undoc-members: |
| 114 | + |
| 115 | + |
| 116 | +HER Replay Buffer |
| 117 | +----------------- |
| 118 | + |
| 119 | +.. autoclass:: HerReplayBuffer |
| 120 | + :members: |
| 121 | + :inherited-members: |
0 commit comments