
Commit a1dd922

fix: set torch dtype to auto (#749)
### Issue
As transformers' `from_pretrained()` automatically loads checkpoints in the default `torch.dtype` (here, fp32) rather than following the dtype of the weights themselves, it often causes high CPU memory usage or even OOM, which crashes the training.

### Fix
Following huggingface/transformers#34743; tested on loading Llama-3.1-405B-Instruct.

```
- model = AutoModelForCausalLM.from_pretrained(model_file)
+ model = AutoModelForCausalLM.from_pretrained(model_file, torch_dtype="auto")
```

### Potential problem
Some CPUs do not support `bfloat16`, and converting back to `float32` may take some time.

Co-authored-by: Changlong Yu <[email protected]>
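For context, a minimal sketch (not part of this commit) of what `torch_dtype="auto"` changes when loading; the model name below is a hypothetical placeholder for any checkpoint saved in bf16:

```
import torch
from transformers import AutoModelForCausalLM

# Default behavior: weights are upcast to the framework default (float32),
# roughly doubling CPU memory for a bf16 checkpoint.
model_fp32 = AutoModelForCausalLM.from_pretrained("some-org/some-bf16-model")
print(model_fp32.dtype)  # torch.float32

# With torch_dtype="auto", the dtype recorded in the checkpoint is respected,
# so a bf16 checkpoint stays in bf16 and CPU memory stays close to the file size.
model_auto = AutoModelForCausalLM.from_pretrained("some-org/some-bf16-model", torch_dtype="auto")
print(model_auto.dtype)  # torch.bfloat16 (if the checkpoint was saved in bf16)
```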
1 parent fc27caf commit a1dd922

File tree

2 files changed (+2, -2 lines)


verl/utils/checkpoint/megatron_checkpoint_manager.py

Lines changed: 1 addition & 1 deletion

```
@@ -267,7 +267,7 @@ def save_checkpoint(self, local_path: str, hdfs_path: str = None, global_step: i
                 state_dict['score.weight'] = state_dict['score.weight']
             else:
                 from transformers import AutoModelForCausalLM
-                model = AutoModelForCausalLM.from_pretrained(self.config.model.path)
+                model = AutoModelForCausalLM.from_pretrained(self.config.model.path, torch_dtype="auto")
             model.save_pretrained(hf_model_ckpt_path, state_dict=state_dict)
             if hdfs_path is not None:
                 print(f'Uploading checkpoint to {hdfs_path}')
```

verl/utils/model.py

Lines changed: 1 addition & 1 deletion

```
@@ -316,7 +316,7 @@ def load_megatron_model_weights(config,
                     'model.embed_tokens.weight'][:32000]  # workaround, 32001 -> 32000
                 is_value_model = True
             else:
-                model = AutoModelForCausalLM.from_pretrained(local_model_path)
+                model = AutoModelForCausalLM.from_pretrained(local_model_path, torch_dtype="auto")
             state_dict = model.state_dict()

             from verl.models.weight_loader_registry import get_weight_loader
```
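Regarding the potential problem noted in the commit message, a minimal sketch (an assumption, not something this commit implements) of falling back to `float32` when the host CPU handles bf16 poorly; `local_model_path` here is a hypothetical local path:

```
import torch
from transformers import AutoModelForCausalLM

local_model_path = "/path/to/local/checkpoint"  # hypothetical path, not from the commit

# Load in the dtype stored in the checkpoint (e.g. bf16 for Llama-3.1 checkpoints).
model = AutoModelForCausalLM.from_pretrained(local_model_path, torch_dtype="auto")

# If this CPU does not support bfloat16 well, cast back to float32; this roughly
# doubles memory and can take noticeable time for very large models.
if model.dtype == torch.bfloat16:
    model = model.to(torch.float32)
```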
