From 2eff07baf9f82cf123d6c9a08c69c6fb090d5e1f Mon Sep 17 00:00:00 2001
From: Frederick Robinson
Date: Thu, 17 Jul 2025 21:29:19 -0700
Subject: [PATCH 1/7] enable mypy for one module

---
 .pre-commit-config.yaml               |  7 ++++++-
 pyproject.toml                        | 16 ++++++++++++++++
 tests/special_sanity/check_license.py |  4 ++++
 3 files changed, 26 insertions(+), 1 deletion(-)

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index e68644a010a..4b4c7b8435c 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -7,6 +7,11 @@ repos:
         exclude: ^.*\.(ipynb)$
       - id: ruff-format
 
+  - repo: https://github.com/pre-commit/mirrors-mypy
+    rev: 'v1.17.0'
+    hooks:
+      - id: mypy
+
   - repo: local
     hooks:
       - id: autogen-trainer-cfg
@@ -29,4 +34,4 @@ repos:
         name: Check license
        entry: python3 tests/special_sanity/check_license.py --directory .
         language: python
-        pass_filenames: false
\ No newline at end of file
+        pass_filenames: false

diff --git a/pyproject.toml b/pyproject.toml
index b508516adc6..f41aec66f41 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -65,6 +65,22 @@ ignore = [
     "UP035",
 ]
 
+# -------------------------------
+# tool.mypy - typechecking config
+# -------------------------------
+[tool.mypy]
+pretty = true
+ignore_missing_imports = true
+explicit_package_bases = true
+follow_imports = "skip"
+
+# Blanket silence
+ignore_errors = true
+
+[[tool.mypy.overrides]]
+module = ["verl.trainer.ppo.core_algos"]
+ignore_errors = false
+
 # -------------------------------
 # tool.setuptools - Additional config
 # -------------------------------

diff --git a/tests/special_sanity/check_license.py b/tests/special_sanity/check_license.py
index a02afeb3d91..b1066c75b76 100644
--- a/tests/special_sanity/check_license.py
+++ b/tests/special_sanity/check_license.py
@@ -43,6 +43,10 @@
 for path in pathlist:
     # because path is object not string
     path_in_str = str(path.absolute())
+
+    if ".venv" in path_in_str:
+        continue
+
     print(path_in_str)
     with open(path_in_str, encoding="utf-8") as f:
         file_content = f.read()

From 0493d344950cee4fdc5a84e2ca4eb89caf0496e1 Mon Sep 17 00:00:00 2001
From: Frederick Robinson
Date: Thu, 17 Jul 2025 21:37:22 -0700
Subject: [PATCH 2/7] precommit passes

---
 verl/trainer/ppo/core_algos.py | 50 +++++++++++++++++++++++++-------------------------
 1 file changed, 25 insertions(+), 25 deletions(-)

diff --git a/verl/trainer/ppo/core_algos.py b/verl/trainer/ppo/core_algos.py
index 5f02675817b..aa5e52d30c3 100644
--- a/verl/trainer/ppo/core_algos.py
+++ b/verl/trainer/ppo/core_algos.py
@@ -22,7 +22,7 @@
 
 from collections import defaultdict
 from enum import Enum
-from typing import Optional
+from typing import Any, Optional
 
 import numpy as np
 import torch
@@ -68,10 +68,30 @@ def get_policy_loss_fn(name):
     return POLICY_LOSS_REGISTRY[loss_name]
 
 
-ADV_ESTIMATOR_REGISTRY = {}
+class AdvantageEstimator(str, Enum):
+    """Using an enumeration class to avoid spelling errors in adv_estimator.
+
+    Note(haibin.lin): this enum class is immutable after creation. Extending this
+    enum for new estimators may not be necessary since users can always just call
+    `verl.trainer.ppo.core_algos.register` with string name for a custom advantage
+    estimator instead.
+    """
+
+    GAE = "gae"
+    GRPO = "grpo"
+    REINFORCE_PLUS_PLUS = "reinforce_plus_plus"
+    REINFORCE_PLUS_PLUS_BASELINE = "reinforce_plus_plus_baseline"
+    REMAX = "remax"
+    RLOO = "rloo"
+    OPO = "opo"
+    GRPO_PASSK = "grpo_passk"
+    GPG = "gpg"
+
+
+ADV_ESTIMATOR_REGISTRY: dict[str, Any] = {}
 
 
-def register_adv_est(name_or_enum):
+def register_adv_est(name_or_enum: str | AdvantageEstimator) -> Any:
     """Decorator to register an advantage estimator function with a given name.
 
     Args:
@@ -108,26 +128,6 @@ def get_adv_estimator_fn(name_or_enum):
     return ADV_ESTIMATOR_REGISTRY[name]
 
 
-class AdvantageEstimator(str, Enum):
-    """Using an enumeration class to avoid spelling errors in adv_estimator.
-
-    Note(haibin.lin): this enum class is immutable after creation. Extending this
-    enum for new estimators may not be necessary since users can always just call
-    `verl.trainer.ppo.core_algos.register` with string name for a custom advantage
-    estimator instead.
-    """
-
-    GAE = "gae"
-    GRPO = "grpo"
-    REINFORCE_PLUS_PLUS = "reinforce_plus_plus"
-    REINFORCE_PLUS_PLUS_BASELINE = "reinforce_plus_plus_baseline"
-    REMAX = "remax"
-    RLOO = "rloo"
-    OPO = "opo"
-    GRPO_PASSK = "grpo_passk"
-    GPG = "gpg"
-
-
 class AdaptiveKLController:
     """
     Adaptive KL controller described in the paper:
@@ -822,7 +822,7 @@ def compute_policy_loss_clip_cov(
     advantages: torch.Tensor,
     response_mask: torch.Tensor,
     loss_agg_mode: str = "token-mean",
-    config: Optional[AlgoConfig] = None,
+    config: Any = None,
 ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
     """
     Compute the clipped policy objective and related metrics for Clip-Cov.
@@ -912,7 +912,7 @@ def compute_policy_loss_kl_cov(
     advantages: torch.Tensor,
     response_mask: torch.Tensor,
     loss_agg_mode: str = "token-mean",
-    config: Optional[AlgoConfig] = None,
+    config: Any = None,
 ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
     """
     Compute the clipped policy objective and related metrics for KL-Cov.

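[Series note, not part of any commit] Patch 2 keeps the advantage-estimator
registry open-ended: anything registered under a string key is reachable by
name, as the AdvantageEstimator docstring suggests. A minimal sketch of that
workflow, assuming only the register_adv_est/get_adv_estimator_fn pair shown
in the diff; the "masked_reward" name, the estimator's signature, and its body
are hypothetical, not verl built-ins:

    import torch

    from verl.trainer.ppo.core_algos import get_adv_estimator_fn, register_adv_est


    @register_adv_est("masked_reward")  # hypothetical name, not a built-in estimator
    def compute_masked_reward_advantage(
        token_level_rewards: torch.Tensor,
        response_mask: torch.Tensor,
        **kwargs,
    ) -> tuple[torch.Tensor, torch.Tensor]:
        # Deliberately trivial: reuse the masked per-token rewards as both the
        # advantages and the returns. Signature and return shape are assumptions;
        # the registry itself (dict[str, Any]) imposes no interface.
        advantages = token_level_rewards * response_mask
        return advantages, advantages


    # Lookup accepts either the registered string or an AdvantageEstimator member.
    adv_fn = get_adv_estimator_fn("masked_reward")

Because ADV_ESTIMATOR_REGISTRY is typed dict[str, Any], mypy only constrains
the registry keys here; the PolicyLossFn alias introduced in patch 4 is the
stricter, per-signature version of the same idea.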
From 0fdbe54118729ab0a571e9f08c9929a6e9f7f375 Mon Sep 17 00:00:00 2001
From: Frederick Robinson
Date: Thu, 17 Jul 2025 21:44:16 -0700
Subject: [PATCH 3/7] remove my license hack

---
 tests/special_sanity/check_license.py | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/tests/special_sanity/check_license.py b/tests/special_sanity/check_license.py
index b1066c75b76..a02afeb3d91 100644
--- a/tests/special_sanity/check_license.py
+++ b/tests/special_sanity/check_license.py
@@ -43,10 +43,6 @@
 for path in pathlist:
     # because path is object not string
     path_in_str = str(path.absolute())
-
-    if ".venv" in path_in_str:
-        continue
-
     print(path_in_str)
     with open(path_in_str, encoding="utf-8") as f:
         file_content = f.read()

From fe0645bf8247d0efb272759389052789c2556539 Mon Sep 17 00:00:00 2001
From: Frederick Robinson
Date: Tue, 22 Jul 2025 23:30:51 -0700
Subject: [PATCH 4/7] remove any type, add PolicyLossFn type

---
 verl/trainer/ppo/core_algos.py | 29 +++++++++++++++++++++++------
 1 file changed, 23 insertions(+), 6 deletions(-)

diff --git a/verl/trainer/ppo/core_algos.py b/verl/trainer/ppo/core_algos.py
index aa5e52d30c3..5508213d713 100644
--- a/verl/trainer/ppo/core_algos.py
+++ b/verl/trainer/ppo/core_algos.py
@@ -22,18 +22,31 @@
 
 from collections import defaultdict
 from enum import Enum
-from typing import Any, Optional
+from typing import Any, Callable, Optional
 
 import numpy as np
 import torch
+from omegaconf import DictConfig
 
 import verl.utils.torch_functional as verl_F
 from verl.trainer.config import AlgoConfig
 
-POLICY_LOSS_REGISTRY = {}
+PolicyLossFn = Callable[
+    [
+        torch.Tensor,  # old_log_prob
+        torch.Tensor,  # log_prob
+        torch.Tensor,  # advantages
+        torch.Tensor,  # response_mask
+        str,  # loss_agg_mode
+        Optional[DictConfig | AlgoConfig],  # config
+    ],
+    tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor],
+]
 
+POLICY_LOSS_REGISTRY: dict[str, PolicyLossFn] = {}
 
-def register_policy_loss(name):
+
+def register_policy_loss(name: str) -> Callable[[PolicyLossFn], PolicyLossFn]:
     """Register a policy loss function with the given name.
 
     Args:
@@ -43,7 +56,7 @@ def register_policy_loss(name):
         function: Decorator function that registers the policy loss function.
     """
 
-    def decorator(func):
+    def decorator(func: PolicyLossFn) -> PolicyLossFn:
         POLICY_LOSS_REGISTRY[name] = func
         return func
 
@@ -822,7 +835,7 @@ def compute_policy_loss_clip_cov(
     advantages: torch.Tensor,
     response_mask: torch.Tensor,
     loss_agg_mode: str = "token-mean",
-    config: Any = None,
+    config: Optional[DictConfig | AlgoConfig] = None,
 ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
     """
     Compute the clipped policy objective and related metrics for Clip-Cov.
@@ -855,6 +868,8 @@ def compute_policy_loss_clip_cov(
         clip_cov_ub (float, optional):
            Upper bound for clipping covariance. Defaults to 5.0.
     """
+    assert config is not None
+
     clip_cov_ratio = config.policy_loss.clip_cov_ratio if config.policy_loss.clip_cov_ratio is not None else 0.0002
     cliprange = config.clip_ratio
     cliprange_low = config.clip_ratio_low if config.clip_ratio_low is not None else cliprange
@@ -912,7 +927,7 @@ def compute_policy_loss_kl_cov(
     advantages: torch.Tensor,
     response_mask: torch.Tensor,
     loss_agg_mode: str = "token-mean",
-    config: Any = None,
+    config: Optional[DictConfig | AlgoConfig] = None,
 ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
     """
     Compute the clipped policy objective and related metrics for KL-Cov.
@@ -936,6 +951,8 @@ def compute_policy_loss_kl_cov(
            Coefficient for the KL penalty term in the loss.
            Defaults to 1.
     """
+    assert config is not None
+
     kl_cov_ratio = config.policy_loss.kl_cov_ratio if config.policy_loss.kl_cov_ratio is not None else 0.0002
     ppo_kl_coef = config.policy_loss.ppo_kl_coef if config.policy_loss.ppo_kl_coef is not None else 1.0

From cc2972c11431a5ab189813ee855eaa0429e1baf1 Mon Sep 17 00:00:00 2001
From: Frederick Robinson
Date: Tue, 22 Jul 2025 23:53:58 -0700
Subject: [PATCH 5/7] add `verl.trainer.config.algorithm` to whitelist, related fixes

---
 pyproject.toml                   |  5 ++++-
 verl/trainer/config/algorithm.py | 13 +++++++++++++
 verl/trainer/ppo/core_algos.py   |  2 ++
 3 files changed, 19 insertions(+), 1 deletion(-)

diff --git a/pyproject.toml b/pyproject.toml
index f41aec66f41..9d24ae95b35 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -78,7 +78,10 @@ follow_imports = "skip"
 ignore_errors = true
 
 [[tool.mypy.overrides]]
-module = ["verl.trainer.ppo.core_algos"]
+module = [
+"verl.trainer.config.algorithm",
+"verl.trainer.ppo.core_algos",
+]
 ignore_errors = false
 
 # -------------------------------

diff --git a/verl/trainer/config/algorithm.py b/verl/trainer/config/algorithm.py
index 5bc6cf94369..e3d1842d9c8 100644
--- a/verl/trainer/config/algorithm.py
+++ b/verl/trainer/config/algorithm.py
@@ -73,6 +73,15 @@ class FilterGroupsConfig(BaseConfig):
     max_num_gen_batches: int = 0
 
 
+@dataclass
+class PolicyLossConfig(BaseConfig):
+    ppo_kl_coef: float = 0.1
+    kl_cov_ratio: float = 0.0002
+    clip_cov_lb: float = 1.0
+    clip_cov_ub: float = 5.0
+    clip_cov_ratio: float = 0.0002
+
+
 @dataclass
 class AlgoConfig(BaseConfig):
     """Configuration for the algorithm.
@@ -112,3 +121,7 @@ class AlgoConfig(BaseConfig):
     use_pf_ppo: bool = False
     pf_ppo: Optional[PFPPOConfig] = None
     filter_groups: Optional[FilterGroupsConfig] = None
+    policy_loss: Optional[PolicyLossConfig] = field(default_factory=PolicyLossConfig)
+    clip_ratio_high: float = 0.2
+    clip_ratio_low: float = 0.2
+    clip_ratio: float = 0.2

diff --git a/verl/trainer/ppo/core_algos.py b/verl/trainer/ppo/core_algos.py
index 5508213d713..be2e885c7c3 100644
--- a/verl/trainer/ppo/core_algos.py
+++ b/verl/trainer/ppo/core_algos.py
@@ -869,6 +869,7 @@ def compute_policy_loss_clip_cov(
            Upper bound for clipping covariance. Defaults to 5.0.
     """
     assert config is not None
+    assert config.policy_loss is not None
 
     clip_cov_ratio = config.policy_loss.clip_cov_ratio if config.policy_loss.clip_cov_ratio is not None else 0.0002
     cliprange = config.clip_ratio
@@ -952,6 +953,7 @@ def compute_policy_loss_kl_cov(
            Coefficient for the KL penalty term in the loss.
            Defaults to 1.
     """
     assert config is not None
+    assert config.policy_loss is not None
 
     kl_cov_ratio = config.policy_loss.kl_cov_ratio if config.policy_loss.kl_cov_ratio is not None else 0.0002
     ppo_kl_coef = config.policy_loss.ppo_kl_coef if config.policy_loss.ppo_kl_coef is not None else 1.0

From 1bc626c434c117f097f98043240653d3dd5610d0 Mon Sep 17 00:00:00 2001
From: Fred
Date: Thu, 24 Jul 2025 21:27:11 +0000
Subject: [PATCH 6/7] Revert "add `verl.trainer.config.algorithm` to whitelist, related fixes"

This reverts commit cc2972c11431a5ab189813ee855eaa0429e1baf1.

---
 pyproject.toml                   |  5 +----
 verl/trainer/config/algorithm.py | 13 -------------
 verl/trainer/ppo/core_algos.py   |  2 --
 3 files changed, 1 insertion(+), 19 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index 9d24ae95b35..f41aec66f41 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -78,10 +78,7 @@ follow_imports = "skip"
 ignore_errors = true
 
 [[tool.mypy.overrides]]
-module = [
-"verl.trainer.config.algorithm",
-"verl.trainer.ppo.core_algos",
-]
+module = ["verl.trainer.ppo.core_algos"]
 ignore_errors = false
 
 # -------------------------------

diff --git a/verl/trainer/config/algorithm.py b/verl/trainer/config/algorithm.py
index e3d1842d9c8..5bc6cf94369 100644
--- a/verl/trainer/config/algorithm.py
+++ b/verl/trainer/config/algorithm.py
@@ -73,15 +73,6 @@ class FilterGroupsConfig(BaseConfig):
     max_num_gen_batches: int = 0
 
 
-@dataclass
-class PolicyLossConfig(BaseConfig):
-    ppo_kl_coef: float = 0.1
-    kl_cov_ratio: float = 0.0002
-    clip_cov_lb: float = 1.0
-    clip_cov_ub: float = 5.0
-    clip_cov_ratio: float = 0.0002
-
-
 @dataclass
 class AlgoConfig(BaseConfig):
     """Configuration for the algorithm.
@@ -121,7 +112,3 @@ class AlgoConfig(BaseConfig):
     use_pf_ppo: bool = False
     pf_ppo: Optional[PFPPOConfig] = None
     filter_groups: Optional[FilterGroupsConfig] = None
-    policy_loss: Optional[PolicyLossConfig] = field(default_factory=PolicyLossConfig)
-    clip_ratio_high: float = 0.2
-    clip_ratio_low: float = 0.2
-    clip_ratio: float = 0.2

diff --git a/verl/trainer/ppo/core_algos.py b/verl/trainer/ppo/core_algos.py
index be2e885c7c3..5508213d713 100644
--- a/verl/trainer/ppo/core_algos.py
+++ b/verl/trainer/ppo/core_algos.py
@@ -869,7 +869,6 @@ def compute_policy_loss_clip_cov(
            Upper bound for clipping covariance. Defaults to 5.0.
     """
     assert config is not None
-    assert config.policy_loss is not None
 
     clip_cov_ratio = config.policy_loss.clip_cov_ratio if config.policy_loss.clip_cov_ratio is not None else 0.0002
     cliprange = config.clip_ratio
@@ -952,7 +952,6 @@ def compute_policy_loss_kl_cov(
            Coefficient for the KL penalty term in the loss.
            Defaults to 1.
     """
     assert config is not None
-    assert config.policy_loss is not None
 
     kl_cov_ratio = config.policy_loss.kl_cov_ratio if config.policy_loss.kl_cov_ratio is not None else 0.0002
     ppo_kl_coef = config.policy_loss.ppo_kl_coef if config.policy_loss.ppo_kl_coef is not None else 1.0

From 3560b75dcfbf4dc7d9e686db60b258fa3bf7c0a5 Mon Sep 17 00:00:00 2001
From: Fred
Date: Thu, 24 Jul 2025 21:40:13 +0000
Subject: [PATCH 7/7] Revert "Revert "add `verl.trainer.config.algorithm` to whitelist, related fixes""

This reverts commit 1bc626c434c117f097f98043240653d3dd5610d0.

---
 pyproject.toml                 | 5 ++++-
 verl/trainer/ppo/core_algos.py | 4 ++++
 2 files changed, 8 insertions(+), 1 deletion(-)

diff --git a/pyproject.toml b/pyproject.toml
index f41aec66f41..9d24ae95b35 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -78,7 +78,10 @@ follow_imports = "skip"
 ignore_errors = true
 
 [[tool.mypy.overrides]]
-module = ["verl.trainer.ppo.core_algos"]
+module = [
+"verl.trainer.config.algorithm",
+"verl.trainer.ppo.core_algos",
+]
 ignore_errors = false
 
 # -------------------------------

diff --git a/verl/trainer/ppo/core_algos.py b/verl/trainer/ppo/core_algos.py
index 5508213d713..c916b6c91b2 100644
--- a/verl/trainer/ppo/core_algos.py
+++ b/verl/trainer/ppo/core_algos.py
@@ -869,6 +869,8 @@ def compute_policy_loss_clip_cov(
            Upper bound for clipping covariance. Defaults to 5.0.
""" assert config is not None + assert not isinstance(config, AlgoConfig), "passing AlgoConfig not supported yet" + assert config.policy_loss is not None clip_cov_ratio = config.policy_loss.clip_cov_ratio if config.policy_loss.clip_cov_ratio is not None else 0.0002 cliprange = config.clip_ratio @@ -952,6 +954,8 @@ def compute_policy_loss_kl_cov( Coefficient for the KL penalty term in the loss. Defaults to 1. """ assert config is not None + assert not isinstance(config, AlgoConfig), "passing AlgoConfig not supported yet" + assert config.policy_loss is not None kl_cov_ratio = config.policy_loss.kl_cov_ratio if config.policy_loss.kl_cov_ratio is not None else 0.0002 ppo_kl_coef = config.policy_loss.ppo_kl_coef if config.policy_loss.ppo_kl_coef is not None else 1.0