diff --git a/README.md b/README.md
index 6394026c..b6aa77bc 100644
--- a/README.md
+++ b/README.md
@@ -32,7 +32,11 @@ We provide a [Dockerfile](./Dockerfile) to easily build environments.
 
 ### Hardware Requirements
 
-At least **8*80GB VRAM** is needed to train a 7B model. If you have less computation resource, please consider using smaller (1.5B, 3B) models.
+\* *estimated*
+
+| Method                   | Bits | 1.5B   | 3B     | 7B     |
+| ------------------------ | ---- | ------ | ------ | ------ |
+| GRPO Full Fine-Tuning    | AMP  | 2*40GB | 4*40GB | 4*80GB |
 
 > [!NOTE]
 > We are working hard to reduce the VRAM in RL training, LoRA support will be integrated in next updates.
@@ -101,10 +105,10 @@ We also thank Guangming Sheng and Chi Zhang for helpful discussions.
 
 ```bibtex
 @misc{zheng2025easyr1,
-  title = {EasyR1: An Efficient, Scalable, Multi-Modality RL Training Framework},
-  author = {Yaowei Zheng, Junting Lu, Shenzhi Wang, Yuwen Xiong},
+  title        = {EasyR1: An Efficient, Scalable, Multi-Modality RL Training Framework},
+  author       = {Yaowei Zheng, Junting Lu, Shenzhi Wang, Yuwen Xiong},
   howpublished = {\url{https://github.com/hiyouga/EasyR1}},
-  year = {2025}
+  year         = {2025}
 }
 ```
 
diff --git a/examples/grpo_example.yaml b/examples/grpo_example.yaml
index 045aa4b7..f5117521 100644
--- a/examples/grpo_example.yaml
+++ b/examples/grpo_example.yaml
@@ -2,9 +2,13 @@ data:
   train_files: hiyouga/math12k@train
   val_files: hiyouga/math12k@test
   prompt_key: problem
-  rollout_batch_size: 512
   max_prompt_length: 2048
   max_response_length: 2048
+  rollout_batch_size: 512
+  shuffle: true
+  seed: 1
+  max_pixels: 4194304
+  min_pixels: 262144
 
 algorithm:
   adv_estimator: grpo
@@ -13,8 +17,9 @@ algorithm:
 worker:
   actor:
     global_batch_size: 128
-    micro_batch_size_per_device_for_update: 2
+    micro_batch_size_per_device_for_update: 1
     micro_batch_size_per_device_for_experience: 2
+    max_grad_norm: 1.0
     use_kl_loss: true
     kl_loss_coef: 1.0e-3
     kl_loss_type: low_var_kl
@@ -23,13 +28,19 @@ worker:
       enable_gradient_checkpointing: true
     optim:
       lr: 1.0e-6
+      weight_decay: 1.0e-2
+    fsdp:
+      param_offload: false
+      optimizer_offload: false
+      torch_dtype: null
     offload:
       param_offload: true
       optimizer_offload: true
 
   rollout:
+    temperature: 1.0
     tensor_parallel_size: 2
-    gpu_memory_utilization: 0.7
+    gpu_memory_utilization: 0.6
     n: 5
     enable_chunked_prefill: true
 
@@ -38,6 +49,7 @@ worker:
       param_offload: true
 
   reward:
+    reward_type: function
     compute_score: math
 
 trainer:
@@ -49,3 +61,6 @@ trainer:
   nnodes: 1
   save_freq: 5
   test_freq: 5
+  val_before_train: true
+  val_only: false
+  save_checkpoint_path: null
diff --git a/examples/run_qwen2_5_7b_math.sh b/examples/run_qwen2_5_7b_math.sh
index a96e34ea..275a1b89 100644
--- a/examples/run_qwen2_5_7b_math.sh
+++ b/examples/run_qwen2_5_7b_math.sh
@@ -6,4 +6,5 @@ MODEL_PATH=Qwen/Qwen2.5-7B-Instruct  # replace it with your local file path
 
 python3 -m verl.trainer.main \
     config=examples/grpo_example.yaml \
-    worker.actor.model.model_path=${MODEL_PATH}
+    worker.actor.model.model_path=${MODEL_PATH} \
+    trainer.n_gpus_per_node=4
diff --git a/examples/run_qwen2_5_vl_3b_geo.sh b/examples/run_qwen2_5_vl_3b_geo.sh
new file mode 100644
index 00000000..beecc9c9
--- /dev/null
+++ b/examples/run_qwen2_5_vl_3b_geo.sh
@@ -0,0 +1,16 @@
+set -x
+
+export VLLM_ATTENTION_BACKEND=XFORMERS
+
+MODEL_PATH=Qwen/Qwen2.5-VL-3B-Instruct  # replace it with your local file path
+
+python3 -m verl.trainer.main \
+    config=examples/grpo_example.yaml \
+    data.train_files=hiyouga/geometry3k@train \
+    data.val_files=hiyouga/geometry3k@test \
+    data.max_prompt_length=4096 \
+    worker.actor.model.model_path=${MODEL_PATH} \
+    worker.rollout.tensor_parallel_size=1 \
+    worker.rollout.enable_chunked_prefill=false \
+    trainer.experiment_name=qwen2_5_vl_3b_geo \
+    trainer.n_gpus_per_node=2
diff --git a/examples/run_qwen2_5_vl_7b_geo.sh b/examples/run_qwen2_5_vl_7b_geo.sh
index 43ce14a8..3d785e50 100644
--- a/examples/run_qwen2_5_vl_7b_geo.sh
+++ b/examples/run_qwen2_5_vl_7b_geo.sh
@@ -11,4 +11,5 @@ python3 -m verl.trainer.main \
     data.max_prompt_length=4096 \
     worker.actor.model.model_path=${MODEL_PATH} \
     worker.rollout.enable_chunked_prefill=false \
-    trainer.experiment_name=qwen2_5_vl_7b_geo
+    trainer.experiment_name=qwen2_5_vl_7b_geo \
+    trainer.n_gpus_per_node=4
diff --git a/verl/workers/fsdp_workers.py b/verl/workers/fsdp_workers.py
index 7ee834b2..03340104 100644
--- a/verl/workers/fsdp_workers.py
+++ b/verl/workers/fsdp_workers.py
@@ -188,6 +188,7 @@ def _build_model_optimizer(
         )
 
         assert isinstance(model, PreTrainedModel)  # lint
+        model.tie_weights()  # avoid hanging
         model = model.to(torch_dtype)
         if model_config.enable_gradient_checkpointing:
             model.gradient_checkpointing_enable(gradient_checkpointing_kwargs={"use_reentrant": False})
diff --git a/verl/workers/reward/config.py b/verl/workers/reward/config.py
index 53e1ebdd..fbf860ef 100644
--- a/verl/workers/reward/config.py
+++ b/verl/workers/reward/config.py
@@ -20,5 +20,5 @@
 
 @dataclass
 class RewardConfig:
-    reward_type: str = "custom"
+    reward_type: str = "function"
     compute_score: str = "math"
diff --git a/verl/workers/sharding_manager/fsdp_vllm.py b/verl/workers/sharding_manager/fsdp_vllm.py
index 65792b07..a4c3fa8d 100644
--- a/verl/workers/sharding_manager/fsdp_vllm.py
+++ b/verl/workers/sharding_manager/fsdp_vllm.py
@@ -56,9 +56,9 @@ def __init__(
             self.gen_random_states = None
 
     def __enter__(self):
-        log_gpu_memory_usage("Before state_dict() in sharding manager memory")
+        log_gpu_memory_usage("Before state_dict() in sharding manager")
         actor_weights = self.module.state_dict()
-        log_gpu_memory_usage("After state_dict() in sharding manager memory")
+        log_gpu_memory_usage("After state_dict() in sharding manager")
 
         self.inference_engine.wake_up()
         load_dtensor_weights(