hiyouga
diff --git a/‎pyproject.toml‎
Lines changed: 1 addition & 1 deletion b/‎pyproject.toml‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎verl/protocol.py‎
Lines changed: 14 additions & 0 deletions b/‎verl/protocol.py‎
Lines changed: 14 additions & 0 deletions
diff --git a/‎verl/single_controller/base/__init__.py‎
Lines changed: 1 addition & 1 deletion b/‎verl/single_controller/base/__init__.py‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎verl/single_controller/ray/base.py‎
Lines changed: 22 additions & 2 deletions b/‎verl/single_controller/ray/base.py‎
Lines changed: 22 additions & 2 deletions
diff --git a/‎verl/trainer/main.py‎
Lines changed: 21 additions & 20 deletions b/‎verl/trainer/main.py‎
Lines changed: 21 additions & 20 deletions
diff --git a/‎verl/trainer/metrics.py‎
Lines changed: 137 additions & 0 deletions b/‎verl/trainer/metrics.py‎
Lines changed: 137 additions & 0 deletions
@@ -14,7 +14,7 @@ indent-width = 4
 
 [tool.ruff.lint]
 ignore = ["C901", "E501", "E741", "W605", "C408"]
-select = ["C", "E", "F", "I", "W"]
+select = ["C", "E", "F", "I", "W", "RUF022"]
 
 [tool.ruff.lint.per-file-ignores]
 "__init__.py" = ["E402", "F401", "F403", "F811"]
 
@@ -28,9 +28,11 @@
 import torch
 from numpy.typing import NDArray
 from tensordict import TensorDict
+from torch.distributed import ProcessGroup
 from torch.utils.data import DataLoader
 
 from .utils.py_functional import union_two_dict
+from .utils.torch_functional import allgather_dict_tensors
 
 
 try:
@@ -620,3 +622,15 @@ def get(self):
         if self.dispatch_fn is not None:
             output = self.dispatch_fn(output)  # split in batch dim, select using dp
         return output
+
+
+def all_gather_data_proto(data: DataProto, size: int, group: ProcessGroup) -> None:
+    """All-gather `data` across `group`, overwriting `data.batch` and `data.non_tensor_batch` in place.
+
+    Args:
+        data: the local DataProto; both of its batch fields are replaced by the gathered results.
+        size: number of entries to gather (presumably the world size of `group` -- confirm with callers).
+        group: the process group handed to both collective calls.
+
+    Returns:
+        None. The result is written back into `data`.
+    """
+    # Note that this is an in-place operation, just like torch.distributed.all_gather
+    # the tensor batch is moved to the current CUDA device for the gather and moved back to its previous device after
+    prev_device = data.batch.device
+    data.batch = data.batch.cuda(device=torch.cuda.current_device())
+    data.batch = allgather_dict_tensors(data.batch.contiguous(), size=size, group=group, dim=0)
+    data.batch = data.batch.to(prev_device)
+    # all gather non_tensor_batch: collect `size` dicts, then concatenate their values key by key
+    all_non_tensor_batch = [None for _ in range(size)]
+    torch.distributed.all_gather_object(all_non_tensor_batch, data.non_tensor_batch, group=group)
+    # only the keys of the local non_tensor_batch are looked up in the gathered dicts
+    data.non_tensor_batch = {k: np.concatenate([d[k] for d in all_non_tensor_batch]) for k in data.non_tensor_batch}
@@ -16,4 +16,4 @@
 from .worker_group import ClassWithInitArgs, ResourcePool, WorkerGroup
 
 
-__all__ = ["Worker", "WorkerGroup", "ClassWithInitArgs", "ResourcePool"]
+__all__ = ["ClassWithInitArgs", "ResourcePool", "Worker", "WorkerGroup"]
@@ -49,6 +49,26 @@ def func(*args, **kwargs):
     return func
 
 
+def sort_placement_group_by_node_ip(pgs: List[PlacementGroup]) -> List[PlacementGroup]:
+    """
+    Sort the placement groups by node ip, all bundles in a single placement group should be on the same node.
+
+    FSDPCheckpointManager saves sharded model states and optimizer states in local storage, which requires RANK
+    to be consistent across nodes when resuming from a checkpoint.
+
+    With this function, if there's only one resource pool and there's no node change, RANK should be consistent
+    across nodes in multiple ray jobs, even if the whole ray cluster is restarted.
+
+    Returns a new list; the input list `pgs` is not reordered.
+    """
+    # map every node id reported by ray.nodes() to that node's address
+    node_ip = {node["NodeID"]: node["NodeManagerAddress"] for node in ray.nodes()}
+    pg_ip = {}
+    for pg in pgs:
+        # NOTE(review): ray._private is not a public API and may change between ray releases
+        specs = ray._private.state.state.placement_group_table(pg.id)
+        # all bundles should be on the same node, so the node of bundle 0 is used for the whole group
+        node_id = specs["bundles_to_node_id"][0]
+        pg_ip[pg.id] = node_ip[node_id]
+    # NOTE(review): the addresses are compared as-is; if they are strings the order is lexicographic, not numeric
+    return sorted(pgs, key=lambda pg: pg_ip[pg.id])
+
+
 class RayResourcePool(ResourcePool):
     def __init__(
         self,
@@ -231,8 +251,8 @@ def _init_with_resource_pool(self, resource_pool, ray_cls_with_init, bin_pack, d
         num_gpus = 1 / resource_pool.max_collocate_count
 
         rank = -1
-        for pg_idx, local_world_size in enumerate(resource_pool.store):
-            pg = pgs[pg_idx]
+        local_world_size = resource_pool.store[0]
+        for pg_idx, pg in enumerate(sort_placement_group_by_node_ip(pgs)):
             assert local_world_size <= pg.bundle_count, f"when generating for {self.name_prefix}, for the "
             for local_rank in range(local_world_size):
                 rank += 1
 
@@ -28,26 +28,13 @@
 from .ray_trainer import RayPPOTrainer, ResourcePoolManager, Role
 
 
-def main():
-    cli_args = OmegaConf.from_cli()
-    file_config = OmegaConf.load(cli_args.config)
-    del cli_args.config
-
-    default_config = OmegaConf.structured(PPOConfig())
-    ppo_config = OmegaConf.merge(default_config, file_config, cli_args)
-    ppo_config = OmegaConf.to_object(ppo_config)
-
-    if not ray.is_initialized():
-        # this is for local ray cluster
-        ray.init(runtime_env={"env_vars": {"TOKENIZERS_PARALLELISM": "true", "NCCL_DEBUG": "WARN"}})
-
-    ray.get(main_task.remote(ppo_config))
-
-
-@ray.remote(num_cpus=1)  # please make sure main_task is not scheduled on head
+@ray.remote(num_cpus=1)
 def main_task(config: PPOConfig):
+    # please make sure main_task is not scheduled on head
+    # print config
     config.deep_post_init()
     print(json.dumps(config.to_dict(), indent=2))
+
     # instantiate tokenizer
     tokenizer = get_tokenizer(
         config.worker.actor.model.model_path,
@@ -67,7 +54,6 @@ def main_task(config: PPOConfig):
         Role.Critic: ray.remote(FSDPWorker),
         Role.RefPolicy: ray.remote(FSDPWorker),
     }
-
     global_pool_id = "global_pool"
     resource_pool_spec = {
         global_pool_id: [config.trainer.n_gpus_per_node] * config.trainer.nnodes,
@@ -77,6 +63,7 @@ def main_task(config: PPOConfig):
         Role.Critic: global_pool_id,
         Role.RefPolicy: global_pool_id,
     }
+    resource_pool_manager = ResourcePoolManager(resource_pool_spec=resource_pool_spec, mapping=mapping)
 
     reward_fn = CustomRewardManager(
         tokenizer=tokenizer, num_examine=1, compute_score=config.worker.reward.compute_score
@@ -85,8 +72,6 @@ def main_task(config: PPOConfig):
         tokenizer=tokenizer, num_examine=1, compute_score=config.worker.reward.compute_score
     )
 
-    resource_pool_manager = ResourcePoolManager(resource_pool_spec=resource_pool_spec, mapping=mapping)
-
     trainer = RayPPOTrainer(
         config=config,
         tokenizer=tokenizer,
@@ -101,5 +86,21 @@ def main_task(config: PPOConfig):
     trainer.fit()
 
 
+def main():
+    """Build the PPO config from defaults, a config file and CLI overrides, then run `main_task` on ray."""
+    cli_args = OmegaConf.from_cli()
+    # the `config` CLI entry is the path of the config file to load
+    file_config = OmegaConf.load(getattr(cli_args, "config"))
+    # drop the path so that it is not merged into the PPO config below
+    cli_args.pop("config", None)
+
+    # merge order: defaults first, then the config file, then the CLI arguments
+    # (OmegaConf.merge gives precedence to later arguments)
+    default_config = OmegaConf.structured(PPOConfig())
+    ppo_config = OmegaConf.merge(default_config, file_config, cli_args)
+    ppo_config = OmegaConf.to_object(ppo_config)
+
+    if not ray.is_initialized():
+        # this is for local ray cluster
+        ray.init(runtime_env={"env_vars": {"TOKENIZERS_PARALLELISM": "true", "NCCL_DEBUG": "WARN"}})
+
+    # block until the remote task has finished
+    ray.get(main_task.remote(ppo_config))
+
+
 if __name__ == "__main__":
     main()
@@ -0,0 +1,137 @@
+# Copyright 2024 Bytedance Ltd. and/or its affiliates
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Any, Dict, List
+
+import numpy as np
+import torch
+
+from ..protocol import DataProto
+
+
+def _compute_response_info(batch: DataProto) -> Dict[str, Any]:
+    """Split the attention mask into prompt and response parts and count the attended positions of each.
+
+    Returns a dict with:
+        response_mask: the last `responses.shape[-1]` columns of the attention mask.
+        prompt_length: per-row sum of the remaining (leading) columns, as float.
+        response_length: per-row sum of `response_mask`, as float.
+    """
+    # here `response_length` is the size of the last dimension of `responses` (a plain int)
+    response_length = batch.batch["responses"].shape[-1]
+    prompt_mask = batch.batch["attention_mask"][:, :-response_length]
+    response_mask = batch.batch["attention_mask"][:, -response_length:]
+    prompt_length = prompt_mask.sum(-1).float()
+    # from here on `response_length` is rebound to the per-row tensor of mask sums
+    response_length = response_mask.sum(-1).float()  # (batch_size,)
+    return dict(
+        response_mask=response_mask,
+        prompt_length=prompt_length,
+        response_length=response_length,
+    )
+
+
+def reduce_metrics(metrics: Dict[str, List[Any]]) -> Dict[str, Any]:
+    """Collapse every metric's collected values into their `np.mean`, keeping the keys unchanged."""
+    return {key: np.mean(value) for key, value in metrics.items()}
+
+
+def compute_data_metrics(batch: DataProto, use_critic: bool = False) -> Dict[str, Any]:
+    """Summarise scores, rewards, advantages, returns and sequence lengths of `batch` as Python scalars.
+
+    Args:
+        batch: DataProto whose `batch` provides "token_level_scores", "token_level_rewards", "advantages",
+            "returns", "responses" and "attention_mask" (plus "values" when `use_critic` is True).
+        use_critic: when True, value statistics and "critic/vf_explained_var" are added.
+
+    Returns:
+        A flat dict of mean/max/min statistics under the "critic/*", "response_length/*" and
+        "prompt_length/*" keys.
+    """
+    # per-sequence totals: the token-level values are summed over the last dimension
+    sequence_score = batch.batch["token_level_scores"].sum(-1)
+    sequence_reward = batch.batch["token_level_rewards"].sum(-1)
+
+    advantages = batch.batch["advantages"]
+    returns = batch.batch["returns"]
+
+    max_response_length = batch.batch["responses"].size(-1)
+
+    # the last `max_response_length` columns of the attention mask cover the response, the rest the prompt
+    prompt_mask = batch.batch["attention_mask"][:, :-max_response_length].bool()
+    response_mask = batch.batch["attention_mask"][:, -max_response_length:].bool()
+
+    max_prompt_length = prompt_mask.size(-1)
+
+    response_info = _compute_response_info(batch)
+    prompt_length = response_info["prompt_length"]
+    response_length = response_info["response_length"]
+
+    # keep only the entries selected by the response mask
+    valid_adv = torch.masked_select(advantages, response_mask)
+    valid_returns = torch.masked_select(returns, response_mask)
+
+    # these locals exist only when use_critic is True; the dict below reads them under the same condition
+    if use_critic:
+        values = batch.batch["values"]
+        valid_values = torch.masked_select(values, response_mask)
+        return_diff_var = torch.var(valid_returns - valid_values)
+        return_var = torch.var(valid_returns)
+
+    metrics = {
+        # score
+        "critic/score/mean": torch.mean(sequence_score).detach().item(),
+        "critic/score/max": torch.max(sequence_score).detach().item(),
+        "critic/score/min": torch.min(sequence_score).detach().item(),
+        # reward
+        "critic/rewards/mean": torch.mean(sequence_reward).detach().item(),
+        "critic/rewards/max": torch.max(sequence_reward).detach().item(),
+        "critic/rewards/min": torch.min(sequence_reward).detach().item(),
+        # adv
+        "critic/advantages/mean": torch.mean(valid_adv).detach().item(),
+        "critic/advantages/max": torch.max(valid_adv).detach().item(),
+        "critic/advantages/min": torch.min(valid_adv).detach().item(),
+        # returns
+        "critic/returns/mean": torch.mean(valid_returns).detach().item(),
+        "critic/returns/max": torch.max(valid_returns).detach().item(),
+        "critic/returns/min": torch.min(valid_returns).detach().item(),
+        **(
+            {
+                # values
+                "critic/values/mean": torch.mean(valid_values).detach().item(),
+                "critic/values/max": torch.max(valid_values).detach().item(),
+                "critic/values/min": torch.min(valid_values).detach().item(),
+                # vf explained var: 1 - Var(returns - values) / (Var(returns) + 1e-5)
+                "critic/vf_explained_var": (1.0 - return_diff_var / (return_var + 1e-5)).detach().item(),
+            }
+            if use_critic
+            else {}
+        ),
+        # response length
+        "response_length/mean": torch.mean(response_length).detach().item(),
+        "response_length/max": torch.max(response_length).detach().item(),
+        "response_length/min": torch.min(response_length).detach().item(),
+        # clip ratio: fraction of rows whose response length equals max_response_length
+        "response_length/clip_ratio": torch.mean(torch.eq(response_length, max_response_length).float())
+        .detach()
+        .item(),
+        # prompt length
+        "prompt_length/mean": torch.mean(prompt_length).detach().item(),
+        "prompt_length/max": torch.max(prompt_length).detach().item(),
+        "prompt_length/min": torch.min(prompt_length).detach().item(),
+        # clip ratio: fraction of rows whose prompt length equals max_prompt_length
+        "prompt_length/clip_ratio": torch.mean(torch.eq(prompt_length, max_prompt_length).float()).detach().item(),
+    }
+    return metrics
+
+
+def compute_timing_metrics(batch: DataProto, timing_raw: Dict[str, float]) -> Dict[str, Any]:
+    """Report every raw timing plus a per-token timing for the sections with a known token count.
+
+    Args:
+        batch: DataProto used to count prompt and response tokens via `_compute_response_info`.
+        timing_raw: section name -> measured duration (emitted under "timing_s/", so presumably seconds).
+
+    Returns:
+        "timing_s/<name>" for every entry of `timing_raw`, and "timing_per_token_ms/<name>" for the names
+        that also appear in the per-section token table below.
+    """
+    response_info = _compute_response_info(batch)
+    num_prompt_tokens = torch.sum(response_info["prompt_length"]).item()
+    num_response_tokens = torch.sum(response_info["response_length"]).item()
+    num_overall_tokens = num_prompt_tokens + num_response_tokens
+    # "gen" is normalised by the response tokens only, every other listed section by prompt + response tokens
+    num_tokens_of_section = {
+        "gen": num_response_tokens,
+        **{name: num_overall_tokens for name in ["ref", "values", "adv", "update_critic", "update_actor"]},
+    }
+    # NOTE(review): a token count of zero would raise ZeroDivisionError in the per-token division
+    return {
+        **{f"timing_s/{name}": value for name, value in timing_raw.items()},
+        **{
+            # scaled by 1000 for the "_ms" key
+            f"timing_per_token_ms/{name}": timing_raw[name] * 1000 / num_tokens_of_section[name]
+            for name in set(num_tokens_of_section.keys()) & set(timing_raw.keys())
+        },
+    }
+
+
+def compute_throughout_metrics(batch: DataProto, timing_raw: Dict[str, float], n_gpus: int) -> Dict[str, Any]:
+    """Compute token throughput for one step, normalised by step time and GPU count.
+
+    NOTE(review): the name reads like a typo for "throughput"; it is left as-is because callers import it.
+
+    Args:
+        batch: DataProto whose `meta_info["global_token_num"]` is summed to get the token total.
+        timing_raw: must contain the "step" entry (the duration of the step).
+        n_gpus: number of GPUs the throughput is divided by.
+
+    Returns:
+        "perf/total_num_tokens", "perf/time_per_step" and "perf/throughput"
+        (= total_num_tokens / (time * n_gpus)).
+    """
+    total_num_tokens = sum(batch.meta_info["global_token_num"])
+    time = timing_raw["step"]
+    return {
+        "perf/total_num_tokens": total_num_tokens,
+        "perf/time_per_step": time,
+        "perf/throughput": total_num_tokens / (time * n_gpus),
+    }
Original file line number	Diff line number	Diff line change
`@@ -16,4 +16,4 @@`
`16`	`16`	`from .worker_group import ClassWithInitArgs, ResourcePool, WorkerGroup`
`17`	`17`
`18`	`18`
`19`		`-__all__ = ["Worker", "WorkerGroup", "ClassWithInitArgs", "ResourcePool"]`
	`19`	`+__all__ = ["ClassWithInitArgs", "ResourcePool", "Worker", "WorkerGroup"]`