Skip to content
Merged
Show file tree
Hide file tree
Changes from 7 commits
Commits
Show all changes
33 commits
Select commit Hold shift + click to select a range
f779a9f
Feat: adding TRPO algorithm (WIP)
cyprienc Aug 7, 2021
98bc5b2
Feat: adding TRPO algorithm (WIP)
cyprienc Aug 9, 2021
97ece67
Feat: adding TRPO algorithm (WIP)
cyprienc Aug 17, 2021
799b140
Feat: adding TRPO algorithm (WIP)
cyprienc Aug 17, 2021
dc73462
Feat: adding TRPO algorithm (WIP)
cyprienc Aug 19, 2021
9b8a222
feat: TRPO - addressing PR comments
cyprienc Sep 11, 2021
869dce9
refactor: TRPO - policier
cyprienc Sep 11, 2021
347dcc0
feat: using updated ActorCriticPolicy from SB3
cyprienc Sep 11, 2021
35d7256
Bump version for `get_distribution` support
araffin Sep 13, 2021
9cfcb54
Add basic test
araffin Sep 13, 2021
974174a
Reformat
araffin Sep 13, 2021
b6bd449
[ci skip] Fix changelog
araffin Sep 13, 2021
c88951c
fix: setting train mode for trpo
cyprienc Sep 13, 2021
1f7e99d
fix: batch_size type hint in trpo.py
cyprienc Sep 13, 2021
6540371
style: renaming variables + docstring in trpo.py
cyprienc Sep 15, 2021
3a26c05
Merge branch 'master' into master
araffin Sep 23, 2021
f003e88
Merge branch 'master' into master
araffin Sep 27, 2021
a33409e
Merge branch 'master' into master
araffin Sep 29, 2021
8ecf40e
Rename + cleanup
araffin Sep 29, 2021
45f4ea6
Move grad computation to separate method
araffin Sep 29, 2021
cc4b5ab
Remove grad norm clipping
araffin Sep 29, 2021
fc7a6c7
Remove n epochs and add sub-sampling
araffin Sep 29, 2021
66723ff
Update defaults
araffin Sep 29, 2021
63a263f
Merge branch 'master' into master
araffin Dec 1, 2021
bf583de
Merge branch 'master' into cyprienc/master
araffin Dec 10, 2021
e983348
Add Doc
araffin Dec 27, 2021
439d79b
Add more test and fixes for CNN
araffin Dec 27, 2021
d9483dc
Update doc + add benchmark
araffin Dec 28, 2021
fff84e4
Add tests + update doc
araffin Dec 28, 2021
95dddf4
Fix doc
araffin Dec 28, 2021
661fe15
Improve names for conjugate gradient
araffin Dec 29, 2021
a24e7c0
Update comments
araffin Dec 29, 2021
342fe53
Update changelog
araffin Dec 29, 2021
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions sb3_contrib/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@

from sb3_contrib.qrdqn import QRDQN
from sb3_contrib.tqc import TQC
from sb3_contrib.trpo import TRPO


# Read version from file
version_file = os.path.join(os.path.dirname(__file__), "version.txt")
Expand Down
28 changes: 28 additions & 0 deletions sb3_contrib/common/policies.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
"""Policies: abstract base class and concrete implementations."""

from stable_baselines3.common.distributions import Distribution
from stable_baselines3.common.policies import ActorCriticCnnPolicy as _ActorCriticCnnPolicy
from stable_baselines3.common.policies import ActorCriticPolicy as _ActorCriticPolicy
from stable_baselines3.common.policies import MultiInputActorCriticPolicy as _MultiInputActorCriticPolicy


class ActorCriticPolicy(_ActorCriticPolicy):
    """
    Policy class for actor-critic algorithms (has both policy and value prediction).
    Used by A2C, PPO and the likes.

    Thin extension of SB3's ``ActorCriticPolicy`` that exposes the last
    action distribution via :meth:`get_distribution`.
    """

    def get_distribution(self) -> Distribution:
        """
        Get the current action distribution

        :return: Action distribution
        """
        # NOTE(review): ``action_dist`` is an attribute of the SB3 parent class;
        # presumably it is refreshed by the parent's forward/evaluate calls, so the
        # value returned here reflects the most recent forward pass and may be
        # stale before any pass has run — confirm against the pinned SB3 version.
        return self.action_dist


# Propagate get_distribution to the CNN policy.
# Inheriting only from our (MLP) ActorCriticPolicy would drop the CNN-specific
# defaults of SB3's ActorCriticCnnPolicy (its features extractor); inheriting
# from both keeps those defaults while adding get_distribution via the MRO.
class ActorCriticCnnPolicy(ActorCriticPolicy, _ActorCriticCnnPolicy):
    """
    CNN policy class for actor-critic algorithms
    (has both policy and value prediction), with ``get_distribution`` support.
    """


# Propagate get_distribution to the multi-input (Dict observation) policy.
# Inheriting only from our (MLP) ActorCriticPolicy would drop the defaults of
# SB3's MultiInputActorCriticPolicy (its combined features extractor);
# inheriting from both keeps those defaults while adding get_distribution.
class MultiInputActorCriticPolicy(ActorCriticPolicy, _MultiInputActorCriticPolicy):
    """
    Multi-input (Dict observation space) policy class for actor-critic
    algorithms, with ``get_distribution`` support.
    """
91 changes: 90 additions & 1 deletion sb3_contrib/common/utils.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from typing import Optional
from typing import Optional, Sequence, Callable

import torch as th
from torch import nn


def quantile_huber_loss(
Expand Down Expand Up @@ -67,3 +68,91 @@ def quantile_huber_loss(
else:
loss = loss.mean()
return loss


# TODO: write regression tests
def conjugate_gradient_solver(
    matrix_vector_dot_func: Callable[[th.Tensor], th.Tensor],
    b: th.Tensor,
    max_iter: int = 10,
    residual_tol: float = 1e-10,
) -> th.Tensor:
    """
    Finds an approximate solution to a set of linear equations Ax = b

    Source: https://github.com/ajlangley/trpo-pytorch/blob/master/conjugate_gradient.py

    :param matrix_vector_dot_func:
        a function that right multiplies a matrix A by a vector v
    :param b:
        the right hand term in the set of linear equations Ax = b
        (expected to be a flat 1D tensor, since the residual dot products
        use ``th.matmul(r, r)``)
    :param max_iter:
        the maximum number of iterations (default is 10)
    :param residual_tol:
        residual tolerance for early stopping of the solving (default is 1e-10)
    :return x:
        the approximate solution to the system of equations defined by
        ``matrix_vector_dot_func`` and b
    """

    # The vector is not initialized at 0 because of the instability issues when the gradient becomes small.
    # A small random gaussian noise is used for the initialization.
    x = 1e-4 * th.randn_like(b)
    r = b - matrix_vector_dot_func(x)
    r_dot = th.matmul(r, r)

    if r_dot < residual_tol:
        # If the gradient becomes extremely small
        # The denominator in alpha will become zero
        # Leading to a division by zero
        return x

    p = r.clone()

    for i in range(max_iter):
        Avp = matrix_vector_dot_func(p)

        alpha = r_dot / p.dot(Avp)
        x += alpha * p

        # Last iteration: skip the (now useless) residual update
        if i == max_iter - 1:
            return x

        r -= alpha * Avp
        new_r_dot = th.matmul(r, r)

        if new_r_dot < residual_tol:
            return x

        beta = new_r_dot / r_dot
        r_dot = new_r_dot
        p = r + beta * p

    # Only reached when max_iter <= 0: return the (noisy) initial guess
    # instead of implicitly returning None, which would crash callers.
    return x


# TODO: test
def flat_grad(
    output,
    parameters: Sequence[nn.parameter.Parameter],
    create_graph: bool = False,
    retain_graph: bool = False,
) -> th.Tensor:
    """
    Compute the gradients of ``output`` w.r.t. ``parameters`` and
    concatenate them into one flat tensor.
    Order of parameters is preserved.

    :param output: functional output to compute the gradient for
    :param parameters: sequence of ``Parameter``
    :param create_graph: If ``True``, graph of the derivative will be constructed,
        allowing to compute higher order derivative products. Default: ``False``.
    :param retain_graph: If ``False``, the graph used to compute the grad will be freed.
        Defaults to the value of ``create_graph``.
    :return: Tensor containing the flattened gradients
    """
    all_grads = th.autograd.grad(
        output,
        parameters,
        create_graph=create_graph,
        retain_graph=retain_graph,
        allow_unused=True,
    )
    # Parameters unused in the computation graph yield None gradients;
    # drop those before flattening.
    flattened = [grad.reshape(-1) for grad in all_grads if grad is not None]
    return th.cat(flattened)
2 changes: 2 additions & 0 deletions sb3_contrib/trpo/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
from sb3_contrib.trpo.policies import CnnPolicy, MlpPolicy, MultiInputPolicy
from sb3_contrib.trpo.trpo import TRPO
13 changes: 13 additions & 0 deletions sb3_contrib/trpo/policies.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
# This file is here just to define MlpPolicy/CnnPolicy
# that work for TRPO
from sb3_contrib.common.policies import ActorCriticPolicy, ActorCriticCnnPolicy, MultiInputActorCriticPolicy
from stable_baselines3.common.policies import register_policy


# Aliases following the SB3 naming convention for policy classes
MlpPolicy = ActorCriticPolicy
CnnPolicy = ActorCriticCnnPolicy
MultiInputPolicy = MultiInputActorCriticPolicy

# Register the policies under their string names so algorithms can be
# instantiated with e.g. TRPO("MlpPolicy", ...); use the aliases
# consistently (they are the same objects as the classes above).
register_policy("MlpPolicy", MlpPolicy)
register_policy("CnnPolicy", CnnPolicy)
register_policy("MultiInputPolicy", MultiInputPolicy)
Loading