From 2eff07baf9f82cf123d6c9a08c69c6fb090d5e1f Mon Sep 17 00:00:00 2001
From: Frederick Robinson
Date: Thu, 17 Jul 2025 21:29:19 -0700
Subject: [PATCH 1/7] enable mypy for one module

---
 .pre-commit-config.yaml               |  7 ++++++-
 pyproject.toml                        | 16 ++++++++++++++++
 tests/special_sanity/check_license.py |  4 ++++
 3 files changed, 26 insertions(+), 1 deletion(-)

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index e68644a010a..4b4c7b8435c 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -7,6 +7,11 @@ repos:
         exclude: ^.*\.(ipynb)$
       - id: ruff-format
 
+  - repo: https://github.com/pre-commit/mirrors-mypy
+    rev: 'v1.17.0'
+    hooks:
+      - id: mypy
+
   - repo: local
     hooks:
       - id: autogen-trainer-cfg
@@ -29,4 +34,4 @@ repos:
         name: Check license
        entry: python3 tests/special_sanity/check_license.py --directory .
         language: python
-        pass_filenames: false
\ No newline at end of file
+        pass_filenames: false

diff --git a/pyproject.toml b/pyproject.toml
index b508516adc6..f41aec66f41 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -65,6 +65,22 @@ ignore = [
     "UP035",
 ]
 
+# -------------------------------
+# tool.mypy - typechecking config
+# -------------------------------
+[tool.mypy]
+pretty = true
+ignore_missing_imports = true
+explicit_package_bases = true
+follow_imports = "skip"
+
+# Blanket silence
+ignore_errors = true
+
+[[tool.mypy.overrides]]
+module = ["verl.trainer.ppo.core_algos"]
+ignore_errors = false
+
 # -------------------------------
 # tool.setuptools - Additional config
 # -------------------------------

diff --git a/tests/special_sanity/check_license.py b/tests/special_sanity/check_license.py
index a02afeb3d91..b1066c75b76 100644
--- a/tests/special_sanity/check_license.py
+++ b/tests/special_sanity/check_license.py
@@ -43,6 +43,10 @@
 for path in pathlist:
     # because path is object not string
     path_in_str = str(path.absolute())
+
+    if ".venv" in path_in_str:
+        continue
+
     print(path_in_str)
     with open(path_in_str, encoding="utf-8") as f:
         file_content = f.read()

From 0493d344950cee4fdc5a84e2ca4eb89caf0496e1 Mon Sep 17 00:00:00 2001
From: Frederick Robinson
Date: Thu, 17 Jul 2025 21:37:22 -0700
Subject: [PATCH 2/7] precommit passes

---
 verl/trainer/ppo/core_algos.py | 50 +++++++++++++++++++++++++-------------------------
 1 file changed, 25 insertions(+), 25 deletions(-)

diff --git a/verl/trainer/ppo/core_algos.py b/verl/trainer/ppo/core_algos.py
index 5f02675817b..aa5e52d30c3 100644
--- a/verl/trainer/ppo/core_algos.py
+++ b/verl/trainer/ppo/core_algos.py
@@ -22,7 +22,7 @@
 
 from collections import defaultdict
 from enum import Enum
-from typing import Optional
+from typing import Any, Optional
 
 import numpy as np
 import torch
@@ -68,10 +68,30 @@ def get_policy_loss_fn(name):
     return POLICY_LOSS_REGISTRY[loss_name]
 
 
-ADV_ESTIMATOR_REGISTRY = {}
+class AdvantageEstimator(str, Enum):
+    """Using an enumeration class to avoid spelling errors in adv_estimator.
+
+    Note(haibin.lin): this enum class is immutable after creation. Extending this
+    enum for new estimators may not be necessary since users can always just call
+    `verl.trainer.ppo.core_algos.register` with string name for a custom advantage
+    estimator instead.
+    """
+
+    GAE = "gae"
+    GRPO = "grpo"
+    REINFORCE_PLUS_PLUS = "reinforce_plus_plus"
+    REINFORCE_PLUS_PLUS_BASELINE = "reinforce_plus_plus_baseline"
+    REMAX = "remax"
+    RLOO = "rloo"
+    OPO = "opo"
+    GRPO_PASSK = "grpo_passk"
+    GPG = "gpg"
+
+
+ADV_ESTIMATOR_REGISTRY: dict[str, Any] = {}
 
 
-def register_adv_est(name_or_enum):
+def register_adv_est(name_or_enum: str | AdvantageEstimator) -> Any:
     """Decorator to register an advantage estimator function with a given name.
 
     Args:
@@ -108,26 +128,6 @@ def get_adv_estimator_fn(name_or_enum):
     return ADV_ESTIMATOR_REGISTRY[name]
 
 
-class AdvantageEstimator(str, Enum):
-    """Using an enumeration class to avoid spelling errors in adv_estimator.
-
-    Note(haibin.lin): this enum class is immutable after creation. Extending this
-    enum for new estimators may not be necessary since users can always just call
-    `verl.trainer.ppo.core_algos.register` with string name for a custom advantage
-    estimator instead.
-    """
-
-    GAE = "gae"
-    GRPO = "grpo"
-    REINFORCE_PLUS_PLUS = "reinforce_plus_plus"
-    REINFORCE_PLUS_PLUS_BASELINE = "reinforce_plus_plus_baseline"
-    REMAX = "remax"
-    RLOO = "rloo"
-    OPO = "opo"
-    GRPO_PASSK = "grpo_passk"
-    GPG = "gpg"
-
-
 class AdaptiveKLController:
     """
     Adaptive KL controller described in the paper:
@@ -822,7 +822,7 @@ def compute_policy_loss_clip_cov(
     advantages: torch.Tensor,
     response_mask: torch.Tensor,
     loss_agg_mode: str = "token-mean",
-    config: Optional[AlgoConfig] = None,
+    config: Any = None,
 ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
     """
     Compute the clipped policy objective and related metrics for Clip-Cov.
@@ -912,7 +912,7 @@ def compute_policy_loss_kl_cov(
     advantages: torch.Tensor,
     response_mask: torch.Tensor,
     loss_agg_mode: str = "token-mean",
-    config: Optional[AlgoConfig] = None,
+    config: Any = None,
 ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
     """
     Compute the clipped policy objective and related metrics for KL-Cov.

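[Series note, not part of any commit] Patch 2 keeps the advantage-estimator
registry open-ended: anything registered under a string key is reachable by
name, as the AdvantageEstimator docstring suggests. A minimal sketch of that
workflow, assuming only the register_adv_est/get_adv_estimator_fn pair shown
in the diff; the "masked_reward" name, the estimator's signature, and its body
are hypothetical, not verl built-ins:

    import torch

    from verl.trainer.ppo.core_algos import get_adv_estimator_fn, register_adv_est


    @register_adv_est("masked_reward")  # hypothetical name, not a built-in estimator
    def compute_masked_reward_advantage(
        token_level_rewards: torch.Tensor,
        response_mask: torch.Tensor,
        **kwargs,
    ) -> tuple[torch.Tensor, torch.Tensor]:
        # Deliberately trivial: reuse the masked per-token rewards as both the
        # advantages and the returns. Signature and return shape are assumptions;
        # the registry itself (dict[str, Any]) imposes no interface.
        advantages = token_level_rewards * response_mask
        return advantages, advantages


    # Lookup accepts either the registered string or an AdvantageEstimator member.
    adv_fn = get_adv_estimator_fn("masked_reward")

Because ADV_ESTIMATOR_REGISTRY is typed dict[str, Any], mypy only constrains
the registry keys here; the PolicyLossFn alias introduced in patch 4 is the
stricter, per-signature version of the same idea.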
From 0fdbe54118729ab0a571e9f08c9929a6e9f7f375 Mon Sep 17 00:00:00 2001
From: Frederick Robinson
Date: Thu, 17 Jul 2025 21:44:16 -0700
Subject: [PATCH 3/7] remove my license hack

---
 tests/special_sanity/check_license.py | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/tests/special_sanity/check_license.py b/tests/special_sanity/check_license.py
index b1066c75b76..a02afeb3d91 100644
--- a/tests/special_sanity/check_license.py
+++ b/tests/special_sanity/check_license.py
@@ -43,10 +43,6 @@
 for path in pathlist:
     # because path is object not string
     path_in_str = str(path.absolute())
-
-    if ".venv" in path_in_str:
-        continue
-
     print(path_in_str)
     with open(path_in_str, encoding="utf-8") as f:
         file_content = f.read()

From fe0645bf8247d0efb272759389052789c2556539 Mon Sep 17 00:00:00 2001
From: Frederick Robinson
Date: Tue, 22 Jul 2025 23:30:51 -0700
Subject: [PATCH 4/7] remove any type, add PolicyLossFn type

---
 verl/trainer/ppo/core_algos.py | 29 +++++++++++++++++++++++------
 1 file changed, 23 insertions(+), 6 deletions(-)

diff --git a/verl/trainer/ppo/core_algos.py b/verl/trainer/ppo/core_algos.py
index aa5e52d30c3..5508213d713 100644
--- a/verl/trainer/ppo/core_algos.py
+++ b/verl/trainer/ppo/core_algos.py
@@ -22,18 +22,31 @@
 
 from collections import defaultdict
 from enum import Enum
-from typing import Any, Optional
+from typing import Any, Callable, Optional
 
 import numpy as np
 import torch
+from omegaconf import DictConfig
 
 import verl.utils.torch_functional as verl_F
 from verl.trainer.config import AlgoConfig
 
-POLICY_LOSS_REGISTRY = {}
+PolicyLossFn = Callable[
+    [
+        torch.Tensor,  # old_log_prob
+        torch.Tensor,  # log_prob
+        torch.Tensor,  # advantages
+        torch.Tensor,  # response_mask
+        str,  # loss_agg_mode
+        Optional[DictConfig | AlgoConfig],  # config
+    ],
+    tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor],
+]
 
+POLICY_LOSS_REGISTRY: dict[str, PolicyLossFn] = {}
 
-def register_policy_loss(name):
+
+def register_policy_loss(name: str) -> Callable[[PolicyLossFn], PolicyLossFn]:
     """Register a policy loss function with the given name.
 
     Args:
@@ -43,7 +56,7 @@ def register_policy_loss(name):
         function: Decorator function that registers the policy loss function.
     """
 
-    def decorator(func):
+    def decorator(func: PolicyLossFn) -> PolicyLossFn:
         POLICY_LOSS_REGISTRY[name] = func
         return func
 
@@ -822,7 +835,7 @@ def compute_policy_loss_clip_cov(
     advantages: torch.Tensor,
     response_mask: torch.Tensor,
     loss_agg_mode: str = "token-mean",
-    config: Any = None,
+    config: Optional[DictConfig | AlgoConfig] = None,
 ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
     """
     Compute the clipped policy objective and related metrics for Clip-Cov.
@@ -855,6 +868,8 @@ def compute_policy_loss_clip_cov(
         clip_cov_ub (float, optional):
            Upper bound for clipping covariance. Defaults to 5.0.
     """
+    assert config is not None
+
     clip_cov_ratio = config.policy_loss.clip_cov_ratio if config.policy_loss.clip_cov_ratio is not None else 0.0002
     cliprange = config.clip_ratio
     cliprange_low = config.clip_ratio_low if config.clip_ratio_low is not None else cliprange
@@ -912,7 +927,7 @@ def compute_policy_loss_kl_cov(
     advantages: torch.Tensor,
     response_mask: torch.Tensor,
     loss_agg_mode: str = "token-mean",
-    config: Any = None,
+    config: Optional[DictConfig | AlgoConfig] = None,
 ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
     """
     Compute the clipped policy objective and related metrics for KL-Cov.
@@ -936,6 +951,8 @@ def compute_policy_loss_kl_cov(
            Coefficient for the KL penalty term in the loss.
            Defaults to 1.
     """
+    assert config is not None
+
     kl_cov_ratio = config.policy_loss.kl_cov_ratio if config.policy_loss.kl_cov_ratio is not None else 0.0002
     ppo_kl_coef = config.policy_loss.ppo_kl_coef if config.policy_loss.ppo_kl_coef is not None else 1.0

From cc2972c11431a5ab189813ee855eaa0429e1baf1 Mon Sep 17 00:00:00 2001
From: Frederick Robinson
Date: Tue, 22 Jul 2025 23:53:58 -0700
Subject: [PATCH 5/7] add `verl.trainer.config.algorithm` to whitelist, related fixes

---
 pyproject.toml                   |  5 ++++-
 verl/trainer/config/algorithm.py | 13 +++++++++++++
 verl/trainer/ppo/core_algos.py   |  2 ++
 3 files changed, 19 insertions(+), 1 deletion(-)

diff --git a/pyproject.toml b/pyproject.toml
index f41aec66f41..9d24ae95b35 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -78,7 +78,10 @@ follow_imports = "skip"
 ignore_errors = true
 
 [[tool.mypy.overrides]]
-module = ["verl.trainer.ppo.core_algos"]
+module = [
+"verl.trainer.config.algorithm",
+"verl.trainer.ppo.core_algos",
+]
 ignore_errors = false
 
 # -------------------------------

diff --git a/verl/trainer/config/algorithm.py b/verl/trainer/config/algorithm.py
index 5bc6cf94369..e3d1842d9c8 100644
--- a/verl/trainer/config/algorithm.py
+++ b/verl/trainer/config/algorithm.py
@@ -73,6 +73,15 @@ class FilterGroupsConfig(BaseConfig):
     max_num_gen_batches: int = 0
 
 
+@dataclass
+class PolicyLossConfig(BaseConfig):
+    ppo_kl_coef: float = 0.1
+    kl_cov_ratio: float = 0.0002
+    clip_cov_lb: float = 1.0
+    clip_cov_ub: float = 5.0
+    clip_cov_ratio: float = 0.0002
+
+
 @dataclass
 class AlgoConfig(BaseConfig):
     """Configuration for the algorithm.
@@ -112,3 +121,7 @@ class AlgoConfig(BaseConfig):
     use_pf_ppo: bool = False
     pf_ppo: Optional[PFPPOConfig] = None
     filter_groups: Optional[FilterGroupsConfig] = None
+    policy_loss: Optional[PolicyLossConfig] = field(default_factory=PolicyLossConfig)
+    clip_ratio_high: float = 0.2
+    clip_ratio_low: float = 0.2
+    clip_ratio: float = 0.2

diff --git a/verl/trainer/ppo/core_algos.py b/verl/trainer/ppo/core_algos.py
index 5508213d713..be2e885c7c3 100644
--- a/verl/trainer/ppo/core_algos.py
+++ b/verl/trainer/ppo/core_algos.py
@@ -869,6 +869,7 @@ def compute_policy_loss_clip_cov(
            Upper bound for clipping covariance. Defaults to 5.0.
     """
     assert config is not None
+    assert config.policy_loss is not None
 
     clip_cov_ratio = config.policy_loss.clip_cov_ratio if config.policy_loss.clip_cov_ratio is not None else 0.0002
     cliprange = config.clip_ratio
@@ -952,6 +953,7 @@ def compute_policy_loss_kl_cov(
            Coefficient for the KL penalty term in the loss.
            Defaults to 1.
     """
     assert config is not None
+    assert config.policy_loss is not None
 
     kl_cov_ratio = config.policy_loss.kl_cov_ratio if config.policy_loss.kl_cov_ratio is not None else 0.0002
     ppo_kl_coef = config.policy_loss.ppo_kl_coef if config.policy_loss.ppo_kl_coef is not None else 1.0

From 1bc626c434c117f097f98043240653d3dd5610d0 Mon Sep 17 00:00:00 2001
From: Fred
Date: Thu, 24 Jul 2025 21:27:11 +0000
Subject: [PATCH 6/7] Revert "add `verl.trainer.config.algorithm` to whitelist, related fixes"

This reverts commit cc2972c11431a5ab189813ee855eaa0429e1baf1.

---
 pyproject.toml                   |  5 +----
 verl/trainer/config/algorithm.py | 13 -------------
 verl/trainer/ppo/core_algos.py   |  2 --
 3 files changed, 1 insertion(+), 19 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index 9d24ae95b35..f41aec66f41 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -78,10 +78,7 @@ follow_imports = "skip"
 ignore_errors = true
 
 [[tool.mypy.overrides]]
-module = [
-"verl.trainer.config.algorithm",
-"verl.trainer.ppo.core_algos",
-]
+module = ["verl.trainer.ppo.core_algos"]
 ignore_errors = false
 
 # -------------------------------

diff --git a/verl/trainer/config/algorithm.py b/verl/trainer/config/algorithm.py
index e3d1842d9c8..5bc6cf94369 100644
--- a/verl/trainer/config/algorithm.py
+++ b/verl/trainer/config/algorithm.py
@@ -73,15 +73,6 @@ class FilterGroupsConfig(BaseConfig):
     max_num_gen_batches: int = 0
 
 
-@dataclass
-class PolicyLossConfig(BaseConfig):
-    ppo_kl_coef: float = 0.1
-    kl_cov_ratio: float = 0.0002
-    clip_cov_lb: float = 1.0
-    clip_cov_ub: float = 5.0
-    clip_cov_ratio: float = 0.0002
-
-
 @dataclass
 class AlgoConfig(BaseConfig):
     """Configuration for the algorithm.
@@ -121,7 +112,3 @@ class AlgoConfig(BaseConfig):
     use_pf_ppo: bool = False
     pf_ppo: Optional[PFPPOConfig] = None
     filter_groups: Optional[FilterGroupsConfig] = None
-    policy_loss: Optional[PolicyLossConfig] = field(default_factory=PolicyLossConfig)
-    clip_ratio_high: float = 0.2
-    clip_ratio_low: float = 0.2
-    clip_ratio: float = 0.2

diff --git a/verl/trainer/ppo/core_algos.py b/verl/trainer/ppo/core_algos.py
index be2e885c7c3..5508213d713 100644
--- a/verl/trainer/ppo/core_algos.py
+++ b/verl/trainer/ppo/core_algos.py
@@ -869,7 +869,6 @@ def compute_policy_loss_clip_cov(
            Upper bound for clipping covariance. Defaults to 5.0.
     """
     assert config is not None
-    assert config.policy_loss is not None
 
     clip_cov_ratio = config.policy_loss.clip_cov_ratio if config.policy_loss.clip_cov_ratio is not None else 0.0002
     cliprange = config.clip_ratio
@@ -952,7 +952,6 @@ def compute_policy_loss_kl_cov(
            Coefficient for the KL penalty term in the loss.
            Defaults to 1.
     """
     assert config is not None
-    assert config.policy_loss is not None
 
     kl_cov_ratio = config.policy_loss.kl_cov_ratio if config.policy_loss.kl_cov_ratio is not None else 0.0002
     ppo_kl_coef = config.policy_loss.ppo_kl_coef if config.policy_loss.ppo_kl_coef is not None else 1.0

From 3560b75dcfbf4dc7d9e686db60b258fa3bf7c0a5 Mon Sep 17 00:00:00 2001
From: Fred
Date: Thu, 24 Jul 2025 21:40:13 +0000
Subject: [PATCH 7/7] Revert "Revert "add `verl.trainer.config.algorithm` to whitelist, related fixes""

This reverts commit 1bc626c434c117f097f98043240653d3dd5610d0.

---
 pyproject.toml                 | 5 ++++-
 verl/trainer/ppo/core_algos.py | 4 ++++
 2 files changed, 8 insertions(+), 1 deletion(-)

diff --git a/pyproject.toml b/pyproject.toml
index f41aec66f41..9d24ae95b35 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -78,7 +78,10 @@ follow_imports = "skip"
 ignore_errors = true
 
 [[tool.mypy.overrides]]
-module = ["verl.trainer.ppo.core_algos"]
+module = [
+"verl.trainer.config.algorithm",
+"verl.trainer.ppo.core_algos",
+]
 ignore_errors = false
 
 # -------------------------------

diff --git a/verl/trainer/ppo/core_algos.py b/verl/trainer/ppo/core_algos.py
index 5508213d713..c916b6c91b2 100644
--- a/verl/trainer/ppo/core_algos.py
+++ b/verl/trainer/ppo/core_algos.py
@@ -869,6 +869,8 @@ def compute_policy_loss_clip_cov(
            Upper bound for clipping covariance. Defaults to 5.0.
""" assert config is not None + assert not isinstance(config, AlgoConfig), "passing AlgoConfig not supported yet" + assert config.policy_loss is not None clip_cov_ratio = config.policy_loss.clip_cov_ratio if config.policy_loss.clip_cov_ratio is not None else 0.0002 cliprange = config.clip_ratio @@ -952,6 +954,8 @@ def compute_policy_loss_kl_cov( Coefficient for the KL penalty term in the loss. Defaults to 1. """ assert config is not None + assert not isinstance(config, AlgoConfig), "passing AlgoConfig not supported yet" + assert config.policy_loss is not None kl_cov_ratio = config.policy_loss.kl_cov_ratio if config.policy_loss.kl_cov_ratio is not None else 0.0002 ppo_kl_coef = config.policy_loss.ppo_kl_coef if config.policy_loss.ppo_kl_coef is not None else 1.0