# Launch script for run_clm_sft_with_peft.py via torchrun (1 node, 1 process).
# All tunables are collected as shell variables first, then forwarded as
# command-line flags to the training script.

# Optimizer / LoRA settings forwarded as --learning_rate, --lora_rank,
# --lora_alpha, --trainable, --modules_to_save and --lora_dropout.
lr=1e-4
lora_rank=64
lora_alpha=128
lora_trainable="q_proj,v_proj,k_proj,o_proj,gate_proj,down_proj,up_proj"
modules_to_save="embed_tokens,lm_head"
lora_dropout=0.05

# Model, tokenizer and dataset locations (the model and tokenizer point at the
# same directory here).
pretrained_model=/mnt/wx_feature/home/chopinyan/codework/Chinese-LLaMA-Alpaca-2/scripts/chinese-alpaca-2-7b/
chinese_tokenizer_path=/mnt/wx_feature/home/chopinyan/codework/Chinese-LLaMA-Alpaca-2/scripts/chinese-alpaca-2-7b/
dataset_dir=/mnt/wx_feature/home/chopinyan/codework/Chinese-LLaMA-Alpaca-2/scripts/firefly/

# Batch sizing and sequence length.
per_device_train_batch_size=1
per_device_eval_batch_size=1
gradient_accumulation_steps=1
max_seq_length=512

# NOTE: output_dir and deepspeed_config_file are relative paths, so they are
# resolved against the directory this script is started from.
output_dir=output_dir
validation_file=/mnt/wx_feature/home/chopinyan/codework/Chinese-LLaMA-Alpaca-2/scripts/firefly/alpaca_data-0-3252.json
deepspeed_config_file=ds_zero2_no_offload.json

# Every expansion is double-quoted so a value containing whitespace or glob
# characters is still passed as exactly one argument.
# --do_train and --do_eval sit on separate continuation lines: writing
# "--do_train \ --do_eval" escapes the space and hands the script the single
# word " --do_eval" (leading space included) instead of the flag.
# --seed uses $RANDOM, so each launch gets a different seed.
torchrun --nnodes 1 --nproc_per_node 1 run_clm_sft_with_peft.py \
    --deepspeed "${deepspeed_config_file}" \
    --model_name_or_path "${pretrained_model}" \
    --tokenizer_name_or_path "${chinese_tokenizer_path}" \
    --dataset_dir "${dataset_dir}" \
    --per_device_train_batch_size "${per_device_train_batch_size}" \
    --per_device_eval_batch_size "${per_device_eval_batch_size}" \
    --do_train \
    --do_eval \
    --seed "$RANDOM" \
    --fp16 \
    --num_train_epochs 1 \
    --lr_scheduler_type cosine \
    --learning_rate "${lr}" \
    --warmup_ratio 0.03 \
    --weight_decay 0 \
    --logging_strategy steps \
    --logging_steps 10 \
    --save_strategy steps \
    --save_total_limit 3 \
    --evaluation_strategy steps \
    --eval_steps 100 \
    --save_steps 200 \
    --gradient_accumulation_steps "${gradient_accumulation_steps}" \
    --preprocessing_num_workers 8 \
    --max_seq_length "${max_seq_length}" \
    --output_dir "${output_dir}" \
    --overwrite_output_dir \
    --ddp_timeout 30000 \
    --logging_first_step True \
    --lora_rank "${lora_rank}" \
    --lora_alpha "${lora_alpha}" \
    --trainable "${lora_trainable}" \
    --lora_dropout "${lora_dropout}" \
    --modules_to_save "${modules_to_save}" \
    --torch_dtype float16 \
    --validation_file "${validation_file}" \
    --load_in_kbits 16 \
    --gradient_checkpointing \
    --ddp_find_unused_parameters False
[INFO|trainer.py:386] 2023-11-13 20:05:21,910 >> You have loaded a model on multiple GPUs. `is_model_parallel` attribute will be force-set to `True` to avoid any unexpected behavior such as device placement mismatching.
Traceback (most recent call last):
File "run_clm_sft_with_peft.py", line 531, in <module>
main()
File "run_clm_sft_with_peft.py", line 504, in main
train_result = trainer.train(resume_from_checkpoint=checkpoint)
File "/usr/local/python/lib/python3.8/site-packages/transformers/trainer.py", line 1544, in train
return inner_training_loop(
File "/usr/local/python/lib/python3.8/site-packages/transformers/trainer.py", line 1558, in _inner_training_loop
train_dataloader = self.get_train_dataloader()
File "/usr/local/python/lib/python3.8/site-packages/transformers/trainer.py", line 856, in get_train_dataloader
for step, inputs in enumerate(train_dataloader):
File "/usr/local/python/lib/python3.8/site-packages/accelerate/data_loader.py", line 451, in __iter__
current_batch = next(dataloader_iter)
File "/usr/local/python/lib/python3.8/site-packages/torch/utils/data/dataloader.py", line 634, in __next__
data = self._next_data()
File "/usr/local/python/lib/python3.8/site-packages/torch/utils/data/dataloader.py", line 678, in _next_data
data = self._dataset_fetcher.fetch(index) # may raise StopIteration
File "/usr/local/python/lib/python3.8/site-packages/torch/utils/data/_utils/fetch.py", line 54, in fetch
return self.collate_fn(data)
File "/mnt/wx_feature/home/chopinyan/codework/Chinese-LLaMA-Alpaca-2/scripts/training/build_dataset.py", line 96, in __call__
input_ids = torch.nn.utils.rnn.pad_sequence(
File "/usr/local/python/lib/python3.8/site-packages/torch/nn/utils/rnn.py", line 399, in pad_sequence
return torch._C._nn.pad_sequence(sequences, batch_first, padding_value)
TypeError: pad_sequence(): argument 'padding_value' (position 3) must be float, not NoneType
ERROR:torch.distributed.elastic.multiprocessing.api:failed (exitcode: 1) local_rank: 0 (pid: 129986) of binary: /usr/local/python/bin/python3.8
提交前必须检查以下项目
问题类型
其他问题
基础模型
Chinese-Alpaca-2 (7B/13B)
操作系统
Linux
详细描述问题
依赖情况(代码类问题务必提供)
运行日志或截图