From 7ac0efcd14e237c4778f7faacf5ae19f7c3289d1 Mon Sep 17 00:00:00 2001 From: zhangyuqin1998 Date: Wed, 8 May 2024 14:00:50 +0000 Subject: [PATCH] Fix rng_state in llm models --- llm/ernie-3.5-se/modeling.py | 36 +++++++++++++++------ paddlenlp/transformers/gemma/modeling.py | 20 +++++++++--- paddlenlp/transformers/gpt/modeling.py | 15 ++++++--- paddlenlp/transformers/gpt/modeling_auto.py | 15 ++++++--- paddlenlp/transformers/llama/modeling.py | 15 ++++++--- paddlenlp/transformers/mixtral/modeling.py | 15 ++++++--- paddlenlp/transformers/qwen/modeling.py | 16 ++++++--- 7 files changed, 98 insertions(+), 34 deletions(-) diff --git a/llm/ernie-3.5-se/modeling.py b/llm/ernie-3.5-se/modeling.py index 9e1165e71a65..c4ce1e72ea6a 100644 --- a/llm/ernie-3.5-se/modeling.py +++ b/llm/ernie-3.5-se/modeling.py @@ -1181,17 +1181,35 @@ def __init__(self, config): else: vocab_size = config.vocab_size - self.weight = self.create_parameter( - shape=[vocab_size, config.hidden_size] if config.tie_word_embeddings else [config.hidden_size, vocab_size], - dtype=paddle.get_default_dtype(), - ) - if config.weight_share_add_bias and config.use_bias: - self.bias = self.create_parameter( - shape=[vocab_size], + if vocab_size != config.vocab_size: + with get_rng_state_tracker().rng_state(): + self.weight = self.create_parameter( + shape=[vocab_size, config.hidden_size] + if config.tie_word_embeddings + else [config.hidden_size, vocab_size], + dtype=paddle.get_default_dtype(), + ) + if config.weight_share_add_bias and config.use_bias: + self.bias = self.create_parameter( + shape=[vocab_size], + dtype=paddle.get_default_dtype(), + ) + else: + self.bias = None + else: + self.weight = self.create_parameter( + shape=[vocab_size, config.hidden_size] + if config.tie_word_embeddings + else [config.hidden_size, vocab_size], dtype=paddle.get_default_dtype(), ) - else: - self.bias = None + if config.weight_share_add_bias and config.use_bias: + self.bias = self.create_parameter( + shape=[vocab_size], + dtype=paddle.get_default_dtype(), + ) + else: + self.bias = None # Must set distributed attr for Tensor Parallel ! self.weight.is_distributed = True if (vocab_size != config.vocab_size) else False diff --git a/paddlenlp/transformers/gemma/modeling.py b/paddlenlp/transformers/gemma/modeling.py index 299b47e52e86..be358769d640 100644 --- a/paddlenlp/transformers/gemma/modeling.py +++ b/paddlenlp/transformers/gemma/modeling.py @@ -1366,11 +1366,21 @@ def __init__(self, config: GemmaConfig): else: vocab_size = config.vocab_size - self.weight = self.create_parameter( - shape=[vocab_size, config.hidden_size] if config.tie_word_embeddings else [config.hidden_size, vocab_size], - dtype=paddle.get_default_dtype(), - ) - + if vocab_size != config.vocab_size: + with get_rng_state_tracker().rng_state(): + self.weight = self.create_parameter( + shape=[vocab_size, config.hidden_size] + if config.tie_word_embeddings + else [config.hidden_size, vocab_size], + dtype=paddle.get_default_dtype(), + ) + else: + self.weight = self.create_parameter( + shape=[vocab_size, config.hidden_size] + if config.tie_word_embeddings + else [config.hidden_size, vocab_size], + dtype=paddle.get_default_dtype(), + ) # Must set distributed attr for Tensor Parallel ! self.weight.is_distributed = True if (vocab_size != config.vocab_size) else False if self.weight.is_distributed: diff --git a/paddlenlp/transformers/gpt/modeling.py b/paddlenlp/transformers/gpt/modeling.py index 14569835f078..ffeae172b2b6 100644 --- a/paddlenlp/transformers/gpt/modeling.py +++ b/paddlenlp/transformers/gpt/modeling.py @@ -1432,10 +1432,17 @@ def __init__(self, config: GPTConfig, embedding_weights=None): else: vocab_size = config.vocab_size - self.weight = self.create_parameter( - shape=[vocab_size, config.hidden_size], - dtype=paddle.get_default_dtype(), - ) + if vocab_size != config.vocab_size: + with get_rng_state_tracker().rng_state(): + self.weight = self.create_parameter( + shape=[vocab_size, config.hidden_size], + dtype=paddle.get_default_dtype(), + ) + else: + self.weight = self.create_parameter( + shape=[vocab_size, config.hidden_size], + dtype=paddle.get_default_dtype(), + ) # Must set distributed attr for Tensor Parallel ! self.weight.is_distributed = True if (vocab_size != config.vocab_size) else False if self.weight.is_distributed: diff --git a/paddlenlp/transformers/gpt/modeling_auto.py b/paddlenlp/transformers/gpt/modeling_auto.py index fb30cbc9bbc2..719d4ca4a37b 100644 --- a/paddlenlp/transformers/gpt/modeling_auto.py +++ b/paddlenlp/transformers/gpt/modeling_auto.py @@ -1138,10 +1138,17 @@ def __init__(self, config: GPTConfig, embedding_weights=None, ipp=None): else: vocab_size = config.vocab_size - self.weight = self.create_parameter( - shape=[vocab_size, config.hidden_size], - dtype=paddle.get_default_dtype(), - ) + if vocab_size != config.vocab_size: + with get_rng_state_tracker().rng_state(): + self.weight = self.create_parameter( + shape=[vocab_size, config.hidden_size], + dtype=paddle.get_default_dtype(), + ) + else: + self.weight = self.create_parameter( + shape=[vocab_size, config.hidden_size], + dtype=paddle.get_default_dtype(), + ) # Must set distributed attr for Tensor Parallel ! self.weight.is_distributed = True if (vocab_size != config.vocab_size) else False if self.weight.is_distributed: diff --git a/paddlenlp/transformers/llama/modeling.py b/paddlenlp/transformers/llama/modeling.py index 37c573189821..bfd01d2a6962 100755 --- a/paddlenlp/transformers/llama/modeling.py +++ b/paddlenlp/transformers/llama/modeling.py @@ -1765,10 +1765,17 @@ def __init__(self, config: LlamaConfig): else: vocab_size = config.vocab_size - self.weight = self.create_parameter( - shape=[config.hidden_size, vocab_size], - dtype=paddle.get_default_dtype(), - ) + if vocab_size != config.vocab_size: + with get_rng_state_tracker().rng_state(): + self.weight = self.create_parameter( + shape=[config.hidden_size, vocab_size], + dtype=paddle.get_default_dtype(), + ) + else: + self.weight = self.create_parameter( + shape=[config.hidden_size, vocab_size], + dtype=paddle.get_default_dtype(), + ) # Must set distributed attr for Tensor Parallel ! self.weight.is_distributed = True if (vocab_size != config.vocab_size) else False if self.weight.is_distributed: diff --git a/paddlenlp/transformers/mixtral/modeling.py b/paddlenlp/transformers/mixtral/modeling.py index 068f48f5ef1a..c4c642b0b992 100644 --- a/paddlenlp/transformers/mixtral/modeling.py +++ b/paddlenlp/transformers/mixtral/modeling.py @@ -1338,10 +1338,17 @@ def __init__(self, config: MixtralConfig): else: vocab_size = config.vocab_size - self.weight = self.create_parameter( - shape=[config.hidden_size, vocab_size], - dtype=paddle.get_default_dtype(), - ) + if vocab_size != config.vocab_size: + with get_rng_state_tracker().rng_state(): + self.weight = self.create_parameter( + shape=[config.hidden_size, vocab_size], + dtype=paddle.get_default_dtype(), + ) + else: + self.weight = self.create_parameter( + shape=[config.hidden_size, vocab_size], + dtype=paddle.get_default_dtype(), + ) # Must set distributed attr for Tensor Parallel ! self.weight.is_distributed = True if (vocab_size != config.vocab_size) else False if self.weight.is_distributed: diff --git a/paddlenlp/transformers/qwen/modeling.py b/paddlenlp/transformers/qwen/modeling.py index 8cb171ab2ee1..056ae08cbe12 100755 --- a/paddlenlp/transformers/qwen/modeling.py +++ b/paddlenlp/transformers/qwen/modeling.py @@ -22,6 +22,7 @@ import paddle.nn.functional as F from paddle import Tensor, nn from paddle.distributed import fleet +from paddle.distributed.fleet.layers.mpu.random import get_rng_state_tracker from paddle.distributed.fleet.utils import recompute from paddle.utils import try_import @@ -784,10 +785,17 @@ def __init__(self, config: QWenConfig): else: vocab_size = config.vocab_size - self.weight = self.create_parameter( - shape=[config.hidden_size, vocab_size], - dtype=paddle.get_default_dtype(), - ) + if vocab_size != config.vocab_size: + with get_rng_state_tracker().rng_state(): + self.weight = self.create_parameter( + shape=[config.hidden_size, vocab_size], + dtype=paddle.get_default_dtype(), + ) + else: + self.weight = self.create_parameter( + shape=[config.hidden_size, vocab_size], + dtype=paddle.get_default_dtype(), + ) # Must set distributed attr for Tensor Parallel ! self.weight.is_distributed = True if (vocab_size != config.vocab_size) else False if self.weight.is_distributed: