add single_model network and use intermediate api #9412
Changes from all commits
New file (113 lines): launch script for GPT-3 13B auto-parallel pretraining.

```bash
# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
# Requires PaddlePaddle 3.0-beta2 or later; please upgrade older installs.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

unset PADDLE_ELASTIC_JOB_ID
unset PADDLE_TRAINER_ENDPOINTS
unset DISTRIBUTED_TRAINER_ENDPOINTS
unset FLAGS_START_PORT
unset PADDLE_ELASTIC_TIMEOUT

export NNODES=1
export PADDLE_TRAINERS_NUM=1
export FLAGS_benchmark=false

export GLOG_v=0
export FLAGS_print_ir=0
# These optional flags are for comparing training precision.
export FLAGS_cudnn_deterministic=1
export FLAGS_embedding_deterministic=1
# export NVIDIA_TF32_OVERRIDE=0

export FLAGS_call_stack_level=3
export FLAGS_enable_pir_api=1
export PYTHONPATH=../../../:$PYTHONPATH

set -x
unset CUDA_VISIBLE_DEVICES

task_name="gpt3_13b_atuo_perf"
log_dir="log/$task_name"
rm -rf $log_dir
to_static=1
# export PYTHONPATH=../../../:$PYTHONPATH

python -u -m paddle.distributed.launch \
    --gpus "0,1,2,3" \
    --log_dir ${log_dir} \
    run_pretrain_auto.py \
    --model_name_or_path gpt3-13B-en \
    --tokenizer_name_or_path gpt3-13B-en \
    --to_static ${to_static} \
    --enable_auto_parallel 1 \
    --input_dir "../gpt_data" \
    --output_dir "output/$task_name" \
    --split 949,50,1 \
    --max_seq_length 4096 \
    --per_device_train_batch_size 1 \
    --per_device_eval_batch_size 1 \
    --scale_loss 1024 \
    --learning_rate 0.00001 \
    --min_learning_rate 0.000001 \
    --max_steps 10000 \
    --save_steps 50000 \
    --weight_decay 0.01 \
    --warmup_ratio 0.01 \
    --logging_steps 1 \
    --continue_training 0 \
    --dataloader_num_workers 4 \
    --eval_steps 100000 \
    --report_to "visualdl" \
    --disable_tqdm true \
    --do_train \
    --do_eval \
    --device "gpu" \
    --model_type "gpt_network" \
    --use_intermediate_api true \
    --sharding "stage1" \
    --tensor_parallel_degree 1 \
    --pipeline_parallel_degree 2 \
    --virtual_pp_degree 2 \
    --pipeline_schedule_mode "1F1B" \
    --virtual_pipeline_seg_method 'GPTDecoderLayerAuto' \
    --sequence_parallel 0 \
    --use_flash_attention 1 \
    --fused_linear 1 \
    --fuse_attention_ffn 1 \
    --fuse_attention_qkv 1 \
    --fused_linear_param_grad_add 1 \
    --recompute 0 \
    --recompute_use_reentrant true \
    --recompute_granularity "full" \
    --pp_recompute_interval 1 \
    --gradient_accumulation_steps 32 \
    --max_grad_norm 0 \
    --bf16 1 \
    --fp16_opt_level "O2" \
    --amp_master_grad true \
    --attention_probs_dropout_prob 0.1 \
    --hidden_dropout_prob 0.1 \
    --sharding_parallel_config "enable_stage1_tensor_fusion enable_stage1_overlap" \
    --tensor_parallel_config "enable_mp_async_allreduce" \
    --data_parallel_config "enable_allreduce_avg_in_gradinent_scale gradient_sync_after_accumulate" \
    --pipeline_parallel_config "enable_send_recv_overlap enable_split_backward" \
    --num_hidden_layers 2
    # Optional flags kept for reference:
    # --auto_parallel_resume_form_hybrid_parallel true \
    # --resume_from_checkpoint "output/gpt3_13b_hand_perf/checkpoint-1" \
    # --fused_linear 1 \
    # --use_fast_layer_norm 1 \
    # --use_fused_dropout_add 1 \
    # --pipeline_parallel_config "enable_send_recv_overlap enable_split_backward" \
```
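For orientation, here is the parallel layout the flags above imply. This is back-of-the-envelope arithmetic over the flag values, not code from the PR:

```python
# Layout implied by the GPT launch flags (own arithmetic, not part of the PR).
gpus = 4                      # --gpus "0,1,2,3"
tensor_parallel_degree = 1    # --tensor_parallel_degree 1
pipeline_parallel_degree = 2  # --pipeline_parallel_degree 2

# Data parallelism takes whatever cards remain after tp and pp:
data_parallel_degree = gpus // (tensor_parallel_degree * pipeline_parallel_degree)  # 2
# --sharding "stage1" then shards optimizer state across this dp group of 2.

per_device_train_batch_size = 1
gradient_accumulation_steps = 32
global_batch_size = (
    per_device_train_batch_size * data_parallel_degree * gradient_accumulation_steps
)
print(global_batch_size)  # 64

# --virtual_pp_degree 2 splits the model into 2 * 2 = 4 virtual stages,
# interleaved over the 2 pipeline ranks under the "1F1B" schedule.
```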
Changes to the GPT run_pretrain_auto.py register the new single-model ("network") classes:

```diff
@@ -38,13 +38,16 @@
     CosineAnnealingWithWarmupDecay,
     GPTConfig,
     GPTForCausalLMAuto,
+    GPTForCausalLMNet,
     GPTPretrainingCriterionAuto,
+    GPTPretrainingCriterionNet,
     LinearAnnealingWithWarmupDecay,
 )
 from paddlenlp.utils.log import logger

 MODEL_CLASSES = {
     "gpt": (GPTConfig, GPTForCausalLMAuto, GPTPretrainingCriterionAuto),
+    "gpt_network": (GPTConfig, GPTForCausalLMNet, GPTPretrainingCriterionNet),
 }

 from paddlenlp.data.causal_dataset import (
```

**Contributor:** Could we consider unifying on a single run_pretrain_auto.py? Maintaining a separate script per model is a significant maintenance burden.

**Contributor (author):** Changing the launch script touches many CI/CE scripts, so the plan is to submit a follow-up PR that unifies them after this one is merged.
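For context, here is a hypothetical sketch of how the launch flag `--model_type "gpt_network"` would select these classes downstream; the consuming code is not shown in this excerpt, so the call sites are assumptions:

```python
# Hypothetical consumer of the registry above (call sites assumed, not from the diff).
config_class, model_class, criterion_class = MODEL_CLASSES["gpt_network"]

config = config_class.from_pretrained("gpt3-13B-en")       # GPTConfig
model = model_class.from_config(config, dtype="bfloat16")  # GPTForCausalLMNet
criterion = criterion_class(config)                        # GPTPretrainingCriterionNet
```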
```diff
@@ -91,6 +94,10 @@ class PreTrainingArguments(AutoTrainingArguments):
         default=False,
         metadata={"help": "Whether to run benchmark by autotuner. True for from_scratch and pad_max_length."},
     )
+    use_intermediate_api: bool = field(
+        default=False,
+        metadata={"help": "Whether to use the auto_parallel intermediate api"},
+    )

     def __post_init__(self):
         super().__post_init__()
```
```diff
@@ -529,9 +536,9 @@ def main():
     dtype = "float16"
     if training_args.bf16:
         dtype = "bfloat16"
-
-    model = model_class.from_config(config, dtype=dtype)
-    criterion = criterion_class(config)
+    with paddle.LazyGuard():
+        model = model_class.from_config(config, dtype=dtype)
+        criterion = criterion_class(config)
     if training_args.recompute:

         def fn(layer):
```
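The `paddle.LazyGuard` context defers model initialization. A minimal standalone sketch of the behavior, assuming PaddlePaddle 3.0-beta2 or later:

```python
import paddle

# Layers built under LazyGuard declare parameter shapes without allocating or
# initializing storage, so a 13B-parameter model can be constructed cheaply
# and materialized later, once auto parallel has decided placements.
with paddle.LazyGuard():
    layer = paddle.nn.Linear(4096, 4096)  # weight/bias not materialized yet
```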
New file (86 lines): launch script for Llama-7B auto-parallel pretraining.

```bash
# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
# Requires PaddlePaddle 3.0-beta2 or later; please upgrade older installs.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

set -x

export FLAGS_cudnn_deterministic=1
export FLAGS_embedding_deterministic=1
export FLAGS_max_inplace_grad_add=65536

task_name="llama_auto"
rm -rf output
rm -rf single

export PYTHONPATH=../../../:$PYTHONPATH

python -u -m paddle.distributed.launch \
    --gpus "0,1,2,3,4,5,6,7" \
    --log_dir "single" \
    ./run_pretrain_auto.py \
    --model_name_or_path "facebook/llama-7b" \
    --tokenizer_name_or_path "facebook/llama-7b" \
    --input_dir "./data" \
    --output_dir "./output" \
    --split 949,50,1 \
    --to_static false \
    --seed 1234 \
    --pipeline_parallel_degree 2 \
    --sharding_parallel_degree 2 \
    --tensor_parallel_degree 2 \
    --virtual_pp_degree 1 \
    --weight_decay 0.01 \
    --warmup_ratio 0.01 \
    --max_grad_norm 1.0 \
    --learning_rate 3e-05 \
    --min_learning_rate 3e-06 \
    --max_steps 2000 \
    --logging_steps 1 \
    --eval_steps 10000 \
    --save_steps 2000 \
    --auto_parallel_resume_form_hybrid_parallel true \
    --continue_training 0 \
    --do_train true \
    --do_eval false \
    --do_predict false \
    --disable_tqdm true \
    --save_total_limit 2 \
    --device gpu \
    --dataloader_num_workers 4 \
    --distributed_dataloader 0 \
    --enable_auto_parallel 1 \
    --per_device_train_batch_size 1 \
    --gradient_accumulation_steps 8 \
    --per_device_eval_batch_size 1 \
    --recompute false \
    --recompute_use_reentrant true \
    --skip_profile_timer true \
    --recompute_granularity full \
    --pp_recompute_interval 0 \
    --bf16 true \
    --fp16_opt_level "O2" \
    --amp_master_grad true \
    --fuse_attention_ffn false \
    --fuse_attention_qkv true \
    --use_flash_attention true \
    --use_fused_rope true \
    --use_fused_rms_norm true \
    --max_seq_length 4096 \
    --sequence_parallel true \
    --sharding "stage1" \
    --sharding_parallel_config "enable_stage1_tensor_fusion enable_stage1_overlap" \
    --tensor_parallel_config "enable_mp_async_allreduce" \
    --model_type "llama_network" \
    --ignore_load_lr_and_optim true \
    --ignore_save_lr_and_optim true \
    --use_intermediate_api true
```
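A quick consistency check on the degrees in this script; the arithmetic is mine, not part of the PR:

```python
# The 8 cards must factor exactly into the configured degrees.
gpus = 8
tensor_parallel = 2    # --tensor_parallel_degree 2
pipeline_parallel = 2  # --pipeline_parallel_degree 2
data_parallel = gpus // (tensor_parallel * pipeline_parallel)
assert data_parallel == 2  # matches --sharding_parallel_degree 2

per_device, grad_accum = 1, 8
print(per_device * data_parallel * grad_accum)  # global batch size: 16
```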
Changes to the Llama run_pretrain_auto.py follow the same pattern as the GPT script:

```diff
@@ -41,12 +41,15 @@
     LinearAnnealingWithWarmupDecay,
     LlamaConfig,
     LlamaForCausalLM3DAuto,
+    LlamaForCausalLMNet,
     LlamaPretrainingCriterion3DAuto,
+    LlamaPretrainingCriterionNet,
 )
 from paddlenlp.utils.log import logger

 MODEL_CLASSES = {
     "llama": (LlamaConfig, LlamaForCausalLM3DAuto, LlamaPretrainingCriterion3DAuto),
+    "llama_network": (LlamaConfig, LlamaForCausalLMNet, LlamaPretrainingCriterionNet),
 }
```

```diff
@@ -100,6 +103,10 @@ class PreTrainingArguments(AutoTrainingArguments):
         default=False,
         metadata={"help": "Whether to run benchmark by autotuner. True for from_scratch and pad_max_length."},
     )
+    use_intermediate_api: bool = field(
+        default=False,
+        metadata={"help": "Whether to use the auto_parallel intermediate api"},
+    )

     def __post_init__(self):
         super().__post_init__()
```
```diff
@@ -544,6 +551,7 @@ def main():
     config.use_recompute = training_args.recompute
     config.tensor_parallel_degree = training_args.tensor_parallel_degree
     config.tensor_parallel_rank = training_args.tensor_parallel_rank
+    config.sharding_parallel_degree = training_args.sharding_parallel_degree

     if training_args.strategy.pipeline.enable and config.virtual_pp_degree > 1:
         pipeline = training_args.strategy.pipeline
```

**Contributor:** Looking at the code, why is sharding_degree left at its default of 1 where the Topology is created, around line 400?

**Contributor (author):** In auto parallel, sharding is not orthogonal to dp, mp, and pp.

**Contributor (author):** dp_degree already subsumes sharding_degree, so setting sharding_degree to 1 there is correct.
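Restating that exchange concretely, a small illustrative sketch; this is my reading of the thread, not code from the PR:

```python
# In the auto-parallel path, sharding is not an independent mesh axis: the
# stage1 sharding groups live inside the data-parallel axis. A hybrid-style
# Topology can therefore keep its default sharding_degree of 1.
gpus = 8
tp, pp = 2, 2
dp = gpus // (tp * pp)        # 2 -- this axis already carries the sharding groups
topology_sharding_degree = 1  # sharding folded into dp, per the author's reply
```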
```diff
@@ -564,10 +572,6 @@ def main():
         model = model_class.from_config(config, dtype="float32")
         criterion = criterion_class(config)

-        for param in model.parameters():
-            assert not param._is_initialized()
-            param.initialize()
-
     if training_args.recompute:

         def fn(layer):
```
```diff
@@ -621,6 +625,7 @@ def fn(layer):
         eval_dataset=eval_dataset if training_args.do_eval else None,
         optimizers=(None, lr_scheduler),
         tokenizer=tokenizer,
+        model_args=model_args,
     )

     checkpoint = None
```
New file (99 lines): debug launch script for Qwen-14B auto-parallel pretraining.

```bash
# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
# Requires PaddlePaddle 3.0-beta2 or later; please upgrade older installs.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Just for debugging.

set -x
unset PADDLE_ELASTIC_JOB_ID
unset PADDLE_TRAINER_ENDPOINTS
unset DISTRIBUTED_TRAINER_ENDPOINTS
unset FLAGS_START_PORT
unset PADDLE_ELASTIC_TIMEOUT

export NNODES=1
export PADDLE_TRAINERS_NUM=1
export FLAGS_call_stack_level=3
export FLAGS_use_cuda_managed_memory=true

task_name="llama_auto"
rm -rf output/$task_name/
rm -rf "output/$task_name""_log"

export SOT_LOG_LEVEL=4
export PYTHONPATH=../../../:$PYTHONPATH

rm -rf ./log/auto_3d_auto

export FLAGS_embedding_deterministic=1
export FLAGS_cudnn_deterministic=1
export FLAGS_max_inplace_grad_add=65536
export NVIDIA_TF32_OVERRIDE=0
export FLAGS_enable_pir_in_executor=1
export FLAGS_enable_pir_api=1

python -u -m paddle.distributed.launch \
    --gpus "4,5" \
    --log_dir "log/auto_3d_auto" \
    run_pretrain_3D_auto.py \
    --model_name_or_path "qwen/qwen-14b" \
    --tokenizer_name_or_path "qwen/qwen-14b" \
    --model_type "qwen_network" \
    --use_intermediate_api true \
    --input_dir "../data" \
    --output_dir "./checkpoints/qwen_pretrain_ckpts" \
    --per_device_train_batch_size 1 \
    --gradient_accumulation_steps 32 \
    --per_device_eval_batch_size 16 \
    --sharding "stage1" \
    --sharding_parallel_degree 1 \
    --tensor_parallel_degree 2 \
    --pipeline_parallel_degree 1 \
    --virtual_pp_degree 1 \
    --use_flash_attention false \
    --use_fused_rms_norm false \
    --use_fused_rope false \
    --max_seq_length 4096 \
    --learning_rate 3e-05 \
    --min_learning_rate 3e-06 \
    --scale_loss 1024 \
    --warmup_steps 30 \
    --logging_steps 1 \
    --max_steps 10000 \
    --save_steps 1000 \
    --eval_steps 10000 \
    --weight_decay 0.01 \
    --bf16 true \
    --fp16_opt_level "O2" \
    --amp_master_grad true \
    --warmup_ratio 0.01 \
    --max_grad_norm 0.0 \
    --dataloader_num_workers 4 \
    --continue_training 0 \
    --do_train true \
    --do_eval false \
    --do_predict false \
    --disable_tqdm true \
    --recompute false \
    --recompute_granularity "core_attn" \
    --recompute_use_reentrant true \
    --distributed_dataloader 0 \
    --save_total_limit 2 \
    --enable_auto_parallel 1 \
    --to_static 1 \
    --num_hidden_layers 1 \
    --attention_probs_dropout_prob 0 \
    --hidden_dropout_prob 0 \
    --auto_parallel_resume_form_hybrid_parallel true
```
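One observation about this debug layout (mine, not from the PR): with two visible cards fully consumed by tensor parallelism, no data-parallel dimension remains.

```python
# With 2 cards and tensor_parallel_degree 2, dp collapses to 1, so the
# "stage1" sharding group has size 1 and effectively shards nothing.
gpus = 2               # --gpus "4,5"
tp, pp = 2, 1
dp = gpus // (tp * pp)
print(dp)  # 1
```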
**Reviewer:** Could the required Paddle version be stated explicitly, for example in the README?

**Author:** Done.