 This trainer supports model-agnostic model initialization with huggingface
 """
 
+import json
 import os
 import uuid
 from collections import defaultdict
@@ -176,6 +177,10 @@ def __init__(
         self.reward_fn = reward_fn
         self.val_reward_fn = val_reward_fn
 
+        self.val_reward_score = 0.0
+        self.best_val_reward_score = -1.0
+        self.best_global_step = None
+
         self.hybrid_engine = config.worker.hybrid_engine
         self.role_worker_mapping = role_worker_mapping
         self.resource_pool_manager = resource_pool_manager
@@ -258,6 +263,7 @@ def _validate(self) -> Dict[str, Any]:
         # Lists to collect samples for the table
         sample_inputs, sample_outputs, sample_labels, sample_scores = [], [], [], []
         reward_metrics_lst = defaultdict(list)
+        print("Start validation...")
         for batch_dict in self.val_dataloader:
             test_batch = DataProto.from_single_dict(batch_dict)
             # Store original inputs
@@ -295,9 +301,10 @@ def _validate(self) -> Dict[str, Any]:
                 reward_metrics_lst[key].extend(value)
 
         self._maybe_log_val_generations(sample_inputs, sample_outputs, sample_labels, sample_scores)
-        reward_score = torch.cat(reward_tensor_lst, dim=0).sum(-1).mean().item()
+        self.val_reward_score = torch.cat(reward_tensor_lst, dim=0).sum(-1).mean().item()
         val_reward_metrics = {f"val/{key}_reward": value for key, value in reduce_metrics(reward_metrics_lst).items()}
-        return {"val/reward_score": reward_score, **val_reward_metrics}
+        print("Finish validation.")
+        return {"val/reward_score": self.val_reward_score, **val_reward_metrics}
 
     def init_workers(self) -> None:
         """Init resource pool and worker group"""
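
Editor's note: the reduction stored in self.val_reward_score concatenates the per-batch reward tensors, sums token-level rewards per sample, and averages over the whole validation set. A tiny worked example under an assumed [batch, response_len] shape (illustrative only, not part of this commit):

# Worked example of the val_reward_score reduction used in _validate.
# Assumption: each entry of reward_tensor_lst is a [batch, response_len]
# tensor of token-level rewards for one validation batch.
import torch

reward_tensor_lst = [
    torch.tensor([[0.0, 1.0], [0.0, 0.0]]),  # per-sample returns: 1.0, 0.0
    torch.tensor([[1.0, 1.0]]),              # per-sample return: 2.0
]
# cat over batches -> [3, 2]; sum(-1) -> per-sample returns; mean -> scalar
score = torch.cat(reward_tensor_lst, dim=0).sum(-1).mean().item()
print(score)  # (1.0 + 0.0 + 2.0) / 3 = 1.0
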
@@ -359,24 +366,37 @@ def init_workers(self) -> None:
 
     def _save_checkpoint(self) -> None:
         # path: {save_checkpoint_path}/global_step_{global_step}/{actor,critic}
+        if self.val_reward_score > self.best_val_reward_score:
+            self.best_val_reward_score = self.val_reward_score
+            self.best_global_step = self.global_step
+
         remove_obsolete_ckpt(
-            self.config.trainer.save_checkpoint_path, self.global_step, self.config.trainer.save_limit
+            self.config.trainer.save_checkpoint_path,
+            self.global_step,
+            self.best_global_step,
+            self.config.trainer.save_limit,
         )
         folder_path = os.path.join(self.config.trainer.save_checkpoint_path, f"global_step_{self.global_step}")
         actor_path = os.path.join(folder_path, "actor")
         self.actor_rollout_ref_wg.save_checkpoint(actor_path)
 
         if self.use_critic:
             critic_path = os.path.join(folder_path, "critic")
-            self.critic_wg.save_checkpoint(critic_path)
+            self.critic_wg.save_checkpoint(critic_path, save_model_only=self.config.trainer.save_model_only)
 
         dataloader_path = os.path.join(folder_path, "dataloader.pt")
         dataloader_state_dict = self.train_dataloader.state_dict()
         torch.save(dataloader_state_dict, dataloader_path)
 
-        last_global_step_path = os.path.join(self.config.trainer.save_checkpoint_path, CHECKPOINT_TRACKER)
-        with open(last_global_step_path, "w") as f:
-            f.write(str(self.global_step))
+        checkpointer_tracker_info = {
+            "best_global_step": self.best_global_step,
+            "best_val_reward_score": round(self.best_val_reward_score, 2),
+            "last_global_step": self.global_step,
+            "last_actor_path": os.path.abspath(actor_path),
+        }
+        checkpointer_tracker_path = os.path.join(self.config.trainer.save_checkpoint_path, CHECKPOINT_TRACKER)
+        with open(checkpointer_tracker_path, "w") as f:
+            json.dump(checkpointer_tracker_info, f, ensure_ascii=False, indent=2)
 
     def _load_checkpoint(self) -> None:
         if self.config.trainer.load_checkpoint_path is None:
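
Editor's note: the tracker file now stores JSON instead of a bare step number, so any code that resumes training or evaluates a saved model has to parse it accordingly. Below is a minimal sketch of reading it back, assuming the same CHECKPOINT_TRACKER filename constant and global_step_{n} folder layout used above; the helper read_checkpoint_tracker and the example paths are hypothetical, not part of this commit.

# Hypothetical helper for consuming the JSON tracker written by _save_checkpoint.
import json
import os
from typing import Optional


def read_checkpoint_tracker(save_checkpoint_path: str, tracker_filename: str) -> Optional[dict]:
    """Return the tracker dict, or None if no checkpoint has been saved yet."""
    tracker_path = os.path.join(save_checkpoint_path, tracker_filename)
    if not os.path.exists(tracker_path):
        return None
    with open(tracker_path) as f:
        return json.load(f)


# Example usage: choose the latest or the best-scoring actor checkpoint.
tracker = read_checkpoint_tracker("/path/to/ckpts", "checkpoint_tracker.json")  # filename is illustrative
if tracker is not None:
    last_actor = tracker["last_actor_path"]
    best_actor = os.path.join("/path/to/ckpts", f"global_step_{tracker['best_global_step']}", "actor")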