Merged

43 commits
b26a6ca
add single_model network and use intermediate api
blacksheep-Aristotle Nov 12, 2024
3f01533
[AutoParallel]: fix llama_model_network run error
blacksheep-Aristotle Nov 15, 2024
09d102f
New version of auto config
FeixLiu Nov 19, 2024
38e1e03
bug fix
FeixLiu Nov 20, 2024
bf5f2b0
[AutoParallel]:add qwen run intermediate api script
blacksheep-Aristotle Nov 20, 2024
d9a776e
fix sharding
FeixLiu Nov 27, 2024
ddffc98
support attn_mask not None for pp
deepllz Nov 28, 2024
5367921
fix for mp
FeixLiu Nov 28, 2024
edcd14d
update api calling
FeixLiu Dec 4, 2024
f2a3574
update the script
FeixLiu Dec 4, 2024
38de88f
add qwen mp/sp/pp config and process attn_mask and position_id
jeff41404 Nov 28, 2024
2eab58c
fix issue of pipeline and add lazy_init
jeff41404 Nov 29, 2024
9d12f50
fix gpt_network to use intermediate_api
blacksheep-Aristotle Dec 3, 2024
149b42c
fix gpt_network to use intermediate_api
blacksheep-Aristotle Dec 5, 2024
311a200
[Auto Parallel] add llama2 7b/13b intermediate benchmark config
deepllz Dec 5, 2024
e3ceb0d
update api
FeixLiu Dec 5, 2024
b223d15
update plan
FeixLiu Dec 9, 2024
02d1adf
qwen fit base api
FeixLiu Dec 9, 2024
43643c6
fix alibi and attn_mask
deepllz Dec 9, 2024
7f6876f
bug fix
FeixLiu Dec 11, 2024
e446376
[AutoParallel]:gpt single network support tp to share_embedding
blacksheep-Aristotle Dec 12, 2024
63b4532
[AutoParallel]:gpt single network support tp to share_embedding
blacksheep-Aristotle Dec 12, 2024
b552361
add intermediate ci
blacksheep-Aristotle Dec 18, 2024
b8b2f71
add single_model network and use intermediate api
blacksheep-Aristotle Nov 12, 2024
575ee33
New version of auto config
FeixLiu Nov 19, 2024
30c93d4
fix sharding
FeixLiu Nov 27, 2024
c5d967d
fix gpt_network to use intermediate_api
blacksheep-Aristotle Dec 3, 2024
41f65c5
fix gpt_network to use intermediate_api
blacksheep-Aristotle Dec 5, 2024
ffde7ad
update ci loss baseline
blacksheep-Aristotle Dec 18, 2024
5d0d2eb
update gpt run_pretrain_py
blacksheep-Aristotle Dec 19, 2024
5a28b85
fix sharding error
blacksheep-Aristotle Dec 19, 2024
49a1cec
fix gpt format error
blacksheep-Aristotle Dec 19, 2024
88c512e
[AutoParallel]:fix llama vpp ci error
blacksheep-Aristotle Dec 19, 2024
e0f6d0f
[AutoParallel]:fix llama A100 ci failed
blacksheep-Aristotle Dec 23, 2024
f00161a
[AutoParallel]:update model format
blacksheep-Aristotle Dec 24, 2024
78be8b0
[AutoParallel]:fix ipp error
blacksheep-Aristotle Dec 24, 2024
27df94b
[AutoParallel]:fix a100 ci error
blacksheep-Aristotle Dec 24, 2024
2102ad6
[AutoParallel]:fix a100 ci error
blacksheep-Aristotle Dec 24, 2024
d295a76
[AutoParallel]:update model_network format
blacksheep-Aristotle Dec 25, 2024
9fe59d9
[AutoParallel]:add explanatory note
blacksheep-Aristotle Dec 27, 2024
cdcfcc6
[AutoParallel]:add explanatory note
blacksheep-Aristotle Dec 27, 2024
4b03203
[AutoParallel]:add explanatory note
blacksheep-Aristotle Dec 27, 2024
67bb667
Delete =1.0.0
blacksheep-Aristotle Dec 27, 2024
113 changes: 113 additions & 0 deletions llm/auto_parallel/gpt-3/gpt_with_intermediate.sh
@@ -0,0 +1,113 @@
# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
Contributor:
Could the required Paddle version be stated explicitly? It could be reflected in the README.

Contributor Author:
done

# PaddlePaddle 3.0-beta2 or higher is required; please upgrade your PaddlePaddle installation if necessary.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


unset PADDLE_ELASTIC_JOB_ID
unset PADDLE_TRAINER_ENDPOINTS
unset DISTRIBUTED_TRAINER_ENDPOINTS
unset FLAGS_START_PORT
unset PADDLE_ELASTIC_TIMEOUT

export NNODES=1
export PADDLE_TRAINERS_NUM=1
export FLAGS_benchmark=false

export GLOG_v=0
export FLAGS_print_ir=0
# The following flags are optional; enable them to compare training precision across runs.
export FLAGS_cudnn_deterministic=1
export FLAGS_embedding_deterministic=1
# export NVIDIA_TF32_OVERRIDE=0

export FLAGS_call_stack_level=3
export FLAGS_enable_pir_api=1
export PYTHONPATH=../../../:$PYTHONPATH

set -x
unset CUDA_VISIBLE_DEVICES

task_name="gpt3_13b_auto_perf"
log_dir="log/$task_name"
rm -rf $log_dir
to_static=1
# export PYTHONPATH=../../../:$PYTHONPATH

python -u -m paddle.distributed.launch \
--gpus "0,1,2,3" \
--log_dir ${log_dir} \
run_pretrain_auto.py \
--model_name_or_path gpt3-13B-en \
--tokenizer_name_or_path gpt3-13B-en \
--to_static ${to_static} \
--enable_auto_parallel 1 \
--input_dir "../gpt_data" \
--output_dir "output/$task_name" \
--split 949,50,1 \
--max_seq_length 4096 \
--per_device_train_batch_size 1 \
--per_device_eval_batch_size 1 \
--scale_loss 1024 \
--learning_rate 0.00001 \
--min_learning_rate 0.000001 \
--max_steps 10000 \
--save_steps 50000 \
--weight_decay 0.01 \
--warmup_ratio 0.01 \
    --logging_steps 1 \
    --continue_training 0 \
--dataloader_num_workers 4 \
--eval_steps 100000 \
--report_to "visualdl" \
--disable_tqdm true \
--do_train \
--do_eval \
--device "gpu" \
--model_type "gpt_network" \
--use_intermediate_api true \
--sharding "stage1" \
--tensor_parallel_degree 1 \
--pipeline_parallel_degree 2 \
--virtual_pp_degree 2 \
--pipeline_schedule_mode "1F1B" \
--virtual_pipeline_seg_method 'GPTDecoderLayerAuto' \
--sequence_parallel 0 \
--use_flash_attention 1 \
--fused_linear 1 \
--fuse_attention_ffn 1 \
--fuse_attention_qkv 1 \
--fused_linear_param_grad_add 1 \
--recompute 0 \
--recompute_use_reentrant true \
--recompute_granularity "full" \
--pp_recompute_interval 1 \
--gradient_accumulation_steps 32 \
--max_grad_norm 0 \
--bf16 1 \
--fp16_opt_level "O2" \
--amp_master_grad true \
--attention_probs_dropout_prob 0.1 \
--hidden_dropout_prob 0.1 \
--sharding_parallel_config "enable_stage1_tensor_fusion enable_stage1_overlap" \
--tensor_parallel_config "enable_mp_async_allreduce" \
--data_parallel_config "enable_allreduce_avg_in_gradinent_scale gradient_sync_after_accumulate" \
--pipeline_parallel_config "enable_send_recv_overlap enable_split_backward" \
--num_hidden_layers 2 \
# --auto_parallel_resume_form_hybrid_parallel true \
# --resume_from_checkpoint "output/gpt3_13b_hand_perf/checkpoint-1" \
# --fused_linear 1 \
# --use_fast_layer_norm 1 \
# --use_fused_dropout_add 1 \
# --pipeline_parallel_config "enable_send_recv_overlap enable_split_backward" \
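
The header comment above (and the review thread) requires PaddlePaddle 3.0-beta2 or newer. As an illustration only, not part of this PR, a launcher could enforce that requirement with a small Python guard before training starts; `packaging` is an assumed extra dependency, and the exact version string of a given Paddle build (e.g. a dev build) may differ.

```python
# Illustrative version guard (assumption: not included in this PR).
import paddle
from packaging.version import Version

MIN_VERSION = Version("3.0.0b2")  # i.e. 3.0-beta2

installed = Version(paddle.__version__)
if installed < MIN_VERSION:
    raise RuntimeError(
        f"PaddlePaddle >= {MIN_VERSION} is required for the intermediate API, "
        f"but {installed} is installed."
    )
```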


13 changes: 10 additions & 3 deletions llm/auto_parallel/gpt-3/run_pretrain_auto.py
@@ -38,13 +38,16 @@
CosineAnnealingWithWarmupDecay,
GPTConfig,
GPTForCausalLMAuto,
Contributor:
Could we consider unifying everything into a single run_pretrain_auto.py? Maintaining a separate script per model is costly.

Contributor Author:
Since changing the launch scripts touches many CI/CE scripts, the plan is to submit a follow-up PR that unifies them after this one is merged.

GPTForCausalLMNet,
GPTPretrainingCriterionAuto,
GPTPretrainingCriterionNet,
LinearAnnealingWithWarmupDecay,
)
from paddlenlp.utils.log import logger

MODEL_CLASSES = {
"gpt": (GPTConfig, GPTForCausalLMAuto, GPTPretrainingCriterionAuto),
"gpt_network": (GPTConfig, GPTForCausalLMNet, GPTPretrainingCriterionNet),
}

from paddlenlp.data.causal_dataset import (
@@ -91,6 +94,10 @@ class PreTrainingArguments(AutoTrainingArguments):
default=False,
metadata={"help": "Whether to run benchmark by autotuner. True for from_scratch and pad_max_length."},
)
use_intermediate_api: bool = field(
default=False,
metadata={"help": "Whether to use the auto_parallel intermediate api"},
)

def __post_init__(self):
super().__post_init__()
@@ -529,9 +536,9 @@ def main():
dtype = "float16"
if training_args.bf16:
dtype = "bfloat16"

model = model_class.from_config(config, dtype=dtype)
criterion = criterion_class(config)
with paddle.LazyGuard():
model = model_class.from_config(config, dtype=dtype)
criterion = criterion_class(config)
if training_args.recompute:

def fn(layer):
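
A minimal sketch of how the pieces in the diff above fit together: `--model_type "gpt_network"` selects the new `GPTForCausalLMNet` / `GPTPretrainingCriterionNet` pair from `MODEL_CLASSES`, and the model is now built under `paddle.LazyGuard` so parameters are not materialized eagerly. The import path and the default-config construction below are assumptions for illustration; the real script loads the config from `--model_name_or_path`.

```python
# Sketch only; assumes these classes are exported by paddlenlp.transformers
# as in the script's import block.
import paddle
from paddlenlp.transformers import (
    GPTConfig,
    GPTForCausalLMAuto,
    GPTForCausalLMNet,
    GPTPretrainingCriterionAuto,
    GPTPretrainingCriterionNet,
)

MODEL_CLASSES = {
    "gpt": (GPTConfig, GPTForCausalLMAuto, GPTPretrainingCriterionAuto),
    "gpt_network": (GPTConfig, GPTForCausalLMNet, GPTPretrainingCriterionNet),
}


def build_model(model_type: str = "gpt_network", dtype: str = "bfloat16"):
    config_class, model_class, criterion_class = MODEL_CLASSES[model_type]
    config = config_class()  # the real script loads this from --model_name_or_path
    # LazyGuard defers parameter initialization, letting the auto-parallel
    # engine decide placement before weights are materialized.
    with paddle.LazyGuard():
        model = model_class.from_config(config, dtype=dtype)
        criterion = criterion_class(config)
    return model, criterion
```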
86 changes: 86 additions & 0 deletions llm/auto_parallel/llama/llama_with_api.sh
@@ -0,0 +1,86 @@
# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
# PaddlePaddle 3.0-beta2 or higher is required; please upgrade your PaddlePaddle installation if necessary.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

set -x

export FLAGS_cudnn_deterministic=1
export FLAGS_embedding_deterministic=1
export FLAGS_max_inplace_grad_add=65536

task_name="llama_auto"
rm -rf output
rm -rf single

export PYTHONPATH=../../../:$PYTHONPATH

python -u -m paddle.distributed.launch \
--gpus "0,1,2,3,4,5,6,7" \
--log_dir "single" \
./run_pretrain_auto.py \
--model_name_or_path "facebook/llama-7b" \
--tokenizer_name_or_path "facebook/llama-7b" \
--input_dir "./data" \
--output_dir "./output" \
--split 949,50,1 \
--to_static false \
--seed 1234 \
--pipeline_parallel_degree 2 \
--sharding_parallel_degree 2 \
--tensor_parallel_degree 2 \
--virtual_pp_degree 1 \
--weight_decay 0.01 \
--warmup_ratio 0.01 \
--max_grad_norm 1.0 \
--learning_rate 3e-05 \
--min_learning_rate 3e-06 \
--max_steps 2000 \
--logging_steps 1 \
--eval_steps 10000 \
--save_steps 2000 \
--auto_parallel_resume_form_hybrid_parallel true \
--continue_training 0 \
--do_train true \
--do_eval false \
--do_predict false \
--disable_tqdm true \
--save_total_limit 2 \
--device gpu \
--dataloader_num_workers 4 \
--distributed_dataloader 0 \
--enable_auto_parallel 1 \
--per_device_train_batch_size 1 \
--gradient_accumulation_steps 8 \
--per_device_eval_batch_size 1 \
--recompute false \
--recompute_use_reentrant true \
--skip_profile_timer true \
--recompute_granularity full \
--pp_recompute_interval 0 \
--bf16 true \
--fp16_opt_level "O2" \
--amp_master_grad true \
--fuse_attention_ffn false \
--fuse_attention_qkv true \
--use_flash_attention true \
--use_fused_rope true \
--use_fused_rms_norm true \
--max_seq_length 4096 \
--sequence_parallel true \
--sharding "stage1" \
--sharding_parallel_config "enable_stage1_tensor_fusion enable_stage1_overlap" \
--tensor_parallel_config "enable_mp_async_allreduce" \
--model_type "llama_network" \
--ignore_load_lr_and_optim true \
--ignore_save_lr_and_optim true \
--use_intermediate_api true \
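
Both run_pretrain_auto.py diffs in this PR add a `use_intermediate_api` field to `PreTrainingArguments`, which is what turns the `--use_intermediate_api true` flag in the script above into a boolean attribute on the training arguments. Below is a hedged, trimmed-down sketch: the real class extends `AutoTrainingArguments`, and `PdArgumentParser` normally reads `sys.argv` rather than an explicit list.

```python
# Sketch of the argument plumbing; the standalone dataclass and the explicit
# argv list below are simplifications for illustration.
from dataclasses import dataclass, field

from paddlenlp.trainer import PdArgumentParser


@dataclass
class PreTrainingArguments:
    use_intermediate_api: bool = field(
        default=False,
        metadata={"help": "Whether to use the auto_parallel intermediate api"},
    )


(args,) = PdArgumentParser([PreTrainingArguments]).parse_args_into_dataclasses(
    ["--use_intermediate_api", "true"]
)
print(args.use_intermediate_api)  # True
```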
13 changes: 9 additions & 4 deletions llm/auto_parallel/llama/run_pretrain_auto.py
@@ -41,12 +41,15 @@
LinearAnnealingWithWarmupDecay,
LlamaConfig,
LlamaForCausalLM3DAuto,
LlamaForCausalLMNet,
LlamaPretrainingCriterion3DAuto,
LlamaPretrainingCriterionNet,
)
from paddlenlp.utils.log import logger

MODEL_CLASSES = {
"llama": (LlamaConfig, LlamaForCausalLM3DAuto, LlamaPretrainingCriterion3DAuto),
"llama_network": (LlamaConfig, LlamaForCausalLMNet, LlamaPretrainingCriterionNet),
}


@@ -100,6 +103,10 @@ class PreTrainingArguments(AutoTrainingArguments):
default=False,
metadata={"help": "Whether to run benchmark by autotuner. True for from_scratch and pad_max_length."},
)
use_intermediate_api: bool = field(
default=False,
metadata={"help": "Whether to use the auto_parallel intermediate api"},
)

def __post_init__(self):
super().__post_init__()
@@ -544,6 +551,7 @@ def main():
config.use_recompute = training_args.recompute
config.tensor_parallel_degree = training_args.tensor_parallel_degree
config.tensor_parallel_rank = training_args.tensor_parallel_rank
config.sharding_parallel_degree = training_args.sharding_parallel_degree
Contributor:
Looking at the code, why is sharding_degree left at its default of 1 where the Topology is created (around line 400)?

Contributor Author:
auto_parallel's sharding is not orthogonal with dp, mp and pp.

Contributor Author:
dp_degree already subsumes sharding_degree, so setting sharding_degree to 1 is sufficient.


if training_args.strategy.pipeline.enable and config.virtual_pp_degree > 1:
pipeline = training_args.strategy.pipeline
@@ -564,10 +572,6 @@ def main():
model = model_class.from_config(config, dtype="float32")
criterion = criterion_class(config)

for param in model.parameters():
assert not param._is_initialized()
param.initialize()

if training_args.recompute:

def fn(layer):
@@ -621,6 +625,7 @@ def fn(layer):
eval_dataset=eval_dataset if training_args.do_eval else None,
optimizers=(None, lr_scheduler),
tokenizer=tokenizer,
model_args=model_args,
)

checkpoint = None
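
The review thread above explains the change that sets `config.sharding_parallel_degree` while leaving the Topology's sharding_degree at 1: under auto parallel, sharding is not an independent mesh axis, and `dp_degree` already subsumes `sharding_degree`. A small illustration of that bookkeeping follows; all values and variable names below are assumptions for the example, not the script's.

```python
# Illustration of the degree bookkeeping discussed in the review thread.
world_size = 8
tensor_parallel_degree = 2
pipeline_parallel_degree = 2

# With stage-1 sharding under auto parallel, optimizer states are partitioned
# across the same process group as data parallelism, so no extra axis is needed.
dp_degree = world_size // (tensor_parallel_degree * pipeline_parallel_degree)

topology_sharding_degree = 1  # sharding reuses the dp group above
print(dp_degree, topology_sharding_degree)  # 2 1
```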
99 changes: 99 additions & 0 deletions llm/auto_parallel/qwen/run_intermediate_api.sh
@@ -0,0 +1,99 @@
# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
# PaddlePaddle 3.0-beta2 or higher is required; please upgrade your PaddlePaddle installation if necessary.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# For debugging only

set -x
unset PADDLE_ELASTIC_JOB_ID
unset PADDLE_TRAINER_ENDPOINTS
unset DISTRIBUTED_TRAINER_ENDPOINTS
unset FLAGS_START_PORT
unset PADDLE_ELASTIC_TIMEOUT

export NNODES=1
export PADDLE_TRAINERS_NUM=1
export FLAGS_call_stack_level=3
export FLAGS_use_cuda_managed_memory=true

task_name="llama_auto"
rm -rf output/$task_name/
rm -rf "output/$task_name""_log"

export SOT_LOG_LEVEL=4
export PYTHONPATH=../../../:$PYTHONPATH


rm -rf ./log/auto_3d_auto

export FLAGS_embedding_deterministic=1
export FLAGS_cudnn_deterministic=1
export FLAGS_max_inplace_grad_add=65536
export NVIDIA_TF32_OVERRIDE=0
export FLAGS_enable_pir_in_executor=1
export FLAGS_enable_pir_api=1


python -u -m paddle.distributed.launch \
--gpus "4,5" \
--log_dir "log/auto_3d_auto" \
run_pretrain_3D_auto.py \
--model_name_or_path "qwen/qwen-14b" \
--tokenizer_name_or_path "qwen/qwen-14b" \
--model_type "qwen_network" \
--use_intermediate_api true \
--input_dir "../data" \
--output_dir "./checkpoints/qwen_pretrain_ckpts" \
--per_device_train_batch_size 1\
--gradient_accumulation_steps 32\
--per_device_eval_batch_size 16\
--sharding "stage1" \
--sharding_parallel_degree 1\
--tensor_parallel_degree 2\
--pipeline_parallel_degree 1\
--virtual_pp_degree 1\
--use_flash_attention false\
--use_fused_rms_norm false\
--use_fused_rope false\
--max_seq_length 4096\
--learning_rate 3e-05\
--min_learning_rate 3e-06\
--scale_loss 1024\
--warmup_steps 30\
--logging_steps 1\
--max_steps 10000\
--save_steps 1000\
--eval_steps 10000\
--weight_decay 0.01\
--bf16 true\
--fp16_opt_level "O2"\
--amp_master_grad true \
--warmup_ratio 0.01\
--max_grad_norm 0.0\
--dataloader_num_workers 4\
--continue_training 0\
--do_train true\
--do_eval false\
--do_predict false \
--disable_tqdm true\
--recompute false\
--recompute_granularity "core_attn"\
--recompute_use_reentrant true\
--distributed_dataloader 0\
--save_total_limit 2\
--enable_auto_parallel 1\
--to_static 1 \
--num_hidden_layers 1 \
--attention_probs_dropout_prob 0 \
--hidden_dropout_prob 0 \
--auto_parallel_resume_form_hybrid_parallel true \
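
All three launch scripts export determinism switches (FLAGS_cudnn_deterministic, FLAGS_embedding_deterministic, and optionally NVIDIA_TF32_OVERRIDE=0) so losses can be compared across runs. As a hedged aside, the same switches can also be set from Python before paddle is imported, which is convenient when not launching through a shell script.

```python
# Optional determinism setup, equivalent to the exports in the shell scripts.
# Set the environment variables before importing paddle so they take effect.
import os

os.environ["FLAGS_cudnn_deterministic"] = "1"
os.environ["FLAGS_embedding_deterministic"] = "1"
os.environ["NVIDIA_TF32_OVERRIDE"] = "0"  # disable TF32 for bitwise-comparable runs

import paddle  # noqa: E402
```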