
Commit ea023f0

xhaihao, Zjq9409 and Wei-Lin-Intel committed
Add support for Baichuan2
Below is an example for baichuan-inc/Baichuan2-7B-Chat:

python3 run_generation.py \
    --model_name_or_path baichuan-inc/Baichuan2-7B-Chat \
    --bf16 --trim_logits --batch_size 1 \
    --max_input_tokens 1024 --max_new_tokens 512 \
    --use_kv_cache --use_hpu_graphs --use_flash_attention \
    --reuse_cache \
    --no-ignore_eos

Below is an example for baichuan-inc/Baichuan2-13B-Chat:

python3 run_generation.py \
    --model_name_or_path baichuan-inc/Baichuan2-13B-Chat \
    --bf16 --trim_logits --batch_size 1 \
    --max_input_tokens 1024 --max_new_tokens 512 \
    --use_kv_cache --use_hpu_graphs --bucket_size 256 \
    --bucket_internal --reuse_cache \
    --no-ignore_eos

Co-authored-by: Jianqian Zhou <jianqian.zhou@intel.com>
Co-authored-by: Wei Lin <wei2.lin@intel.com>
Signed-off-by: Haihao Xiang <haihao.xiang@intel.com>
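Those commands drive examples/text-generation/run_generation.py. For orientation, here is a rough programmatic sketch of the same flow, not the script itself: it assumes a Gaudi machine with optimum-habana installed, leaves out the HPU-graph, bucketing and flash-attention flags, and the exact tokenizer-loading flags may vary:

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

from optimum.habana.transformers.modeling_utils import adapt_transformers_to_gaudi

adapt_transformers_to_gaudi()  # registers the Gaudi-optimized Baichuan classes (see modeling_utils.py below)

name = "baichuan-inc/Baichuan2-7B-Chat"
tokenizer = AutoTokenizer.from_pretrained(name, use_fast=False)
model = AutoModelForCausalLM.from_pretrained(name, torch_dtype=torch.bfloat16).to("hpu")

inputs = tokenizer("Tell me about Gaudi.", return_tensors="pt").to("hpu")
outputs = model.generate(**inputs, max_new_tokens=512)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```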
1 parent d6914e9 commit ea023f0

12 files changed: 1,969 additions and 10 deletions


examples/language-modeling/run_clm.py

Lines changed: 5 additions & 3 deletions
```diff
@@ -472,9 +472,11 @@ def main():
 
     # We resize the embeddings only when necessary to avoid index errors. If you are creating a model from scratch
     # on a small vocab and want a smaller embedding size, remove this test.
-    embedding_size = model.get_input_embeddings().weight.shape[0]
-    if len(tokenizer) > embedding_size:
-        model.resize_token_embeddings(len(tokenizer))
+    # We need to skip this test for Baichuan pretraining
+    if config.model_type not in ("baichuan",):
+        embedding_size = model.get_input_embeddings().weight.shape[0]
+        if len(tokenizer) > embedding_size:
+            model.resize_token_embeddings(len(tokenizer))
 
     # Preprocessing the datasets.
     # First we tokenize all the texts.
```
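One subtlety in the guard above: `not in` against a parenthesized string is a substring test, not tuple membership, so the one-element tuple needs its trailing comma. A standalone illustration in plain Python:

```python
model_type = "baichuan"

# ("baichuan") is just a parenthesized string; ("baichuan",) is a tuple.
assert ("baichuan") == "baichuan"
assert model_type in ("baichuan",)   # exact tuple membership
assert "chuan" in ("baichuan")       # substring match on a plain string!
assert "chuan" not in ("baichuan",)  # not an element of the tuple
```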

examples/text-generation/run_lm_eval.py

Lines changed: 2 additions & 1 deletion
```diff
@@ -111,13 +111,14 @@ def __init__(self, tokenizer, model, args, options):
             "gptj",
             "starcoder2",
             "gemma",
+            "baichuan",
         ]:
             self.model_inputs.update(
                 {
                     "reuse_cache": self.options.reuse_cache,
                 }
             )
-        if self.model.config.model_type in ["llama", "mistral", "qwen2", "falcon", "starcoder2", "gemma"]:
+        if self.model.config.model_type in ["llama", "mistral", "qwen2", "falcon", "starcoder2", "gemma", "baichuan"]:
             if self.model.config.model_type != "falcon":
                 self.model_inputs.update(
                     {
```

optimum/habana/transformers/generation/utils.py

Lines changed: 8 additions & 2 deletions
```diff
@@ -112,6 +112,7 @@
     "paligemma",
     "idefics2",
     "mllama",
+    "baichuan",
 ]
 
 
@@ -1081,8 +1082,9 @@ def generate(
                     "qwen2_moe",
                     "gemma",
                     "gemma2",
+                    "baichuan",
                 ]
-            ), "reuse_cache only supported by llama, mistral, falcon, mixtral, phi, qwen2, qwen2_moe, gemma, gemma2 and starcoder2 at the moment"
+            ), "reuse_cache only supported by llama, mistral, falcon, mixtral, phi, qwen2, qwen2_moe, gemma, gemma2, starcoder2 and baichuan at the moment"
             if not generation_config.bucket_internal:
                 assert (
                     generation_config.bucket_size <= 0
@@ -1288,8 +1290,12 @@ def generate(
                 "gemma",
                 "gemma2",
                 "qwen2_moe",
+                "baichuan",
             ]:
-                if self.config.max_position_embeddings < calculated_max_length:
+                if (
+                    hasattr(self.config, "max_position_embeddings")
+                    and self.config.max_position_embeddings < calculated_max_length
+                ):
                     unwrap_deepspeed_model(self).update_sincos_cache(seq_len=calculated_max_length)
 
         # 8. determine generation mode
```
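The new `hasattr` guard matters because the Baichuan 13B config never sets `max_position_embeddings` (see configuration_baichuan.py below). A minimal standalone illustration of the short-circuit, using a hypothetical stand-in config class:

```python
class FakeConfig:
    """Stand-in for a 13B-style Baichuan config with no max_position_embeddings."""

cfg = FakeConfig()
calculated_max_length = 1536

# Without the guard, cfg.max_position_embeddings would raise AttributeError.
needs_sincos_update = (
    hasattr(cfg, "max_position_embeddings")
    and cfg.max_position_embeddings < calculated_max_length
)
assert needs_sincos_update is False  # `and` short-circuits safely
```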

optimum/habana/transformers/modeling_utils.py

Lines changed: 8 additions & 0 deletions
```diff
@@ -28,6 +28,9 @@
 )
 from .models import (
     GAUDI_WHISPER_ATTENTION_CLASSES,
+    BaichuanConfig,
+    BaichuanForCausalLM,
+    BaichuanTokenizer,
     DeciLMConfig,
     DeciLMForCausalLM,
     Gaudi2Idefics2ImageProcessor,
@@ -676,3 +679,8 @@ def adapt_transformers_to_gaudi():
     transformers.models.xglm.modeling_xglm.XGLMModel.forward = gaudi_xglm_model_forward
     transformers.models.xglm.modeling_xglm.XGLMAttention.forward = gaudi_xglm_attention_forward
     transformers.models.xglm.modeling_xglm.XGLMDecoderLayer.forward = gaudi_xglm_decoder_layer_forward
+
+    # Optimization for Baichuan2 on Gaudi
+    transformers.AutoConfig.register("baichuan", BaichuanConfig)
+    transformers.AutoTokenizer.register(BaichuanConfig, slow_tokenizer_class=BaichuanTokenizer)
+    transformers.AutoModelForCausalLM.register(BaichuanConfig, BaichuanForCausalLM)
```
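With these registrations in place, the stock `Auto*` factories resolve the `baichuan` model type without `trust_remote_code`. A small sketch, assuming an environment where optimum-habana's Baichuan modeling code imports cleanly; the tiny config sizes are illustrative, the real ones come from the checkpoint's config.json:

```python
from transformers import AutoConfig, AutoModelForCausalLM

from optimum.habana.transformers.modeling_utils import adapt_transformers_to_gaudi
from optimum.habana.transformers.models import BaichuanConfig

adapt_transformers_to_gaudi()

# AutoConfig.for_model looks up the "baichuan" model_type registered above.
config = AutoConfig.for_model("baichuan")
assert isinstance(config, BaichuanConfig)

# A deliberately tiny, non-pretrained model, just to show the class mapping.
tiny = AutoModelForCausalLM.from_config(
    AutoConfig.for_model(
        "baichuan",
        vocab_size=512,
        hidden_size=128,
        intermediate_size=256,
        num_hidden_layers=2,
        num_attention_heads=4,
    )
)
print(type(tiny).__name__)  # BaichuanForCausalLM
```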

optimum/habana/transformers/models/__init__.py

Lines changed: 5 additions & 0 deletions
```diff
@@ -1,4 +1,9 @@
 from .albert import gaudi_albert_forward
+from .baichuan import (
+    BaichuanConfig,
+    BaichuanForCausalLM,
+    BaichuanTokenizer,
+)
 from .bart import (
     gaudi_BartAttention_forward,
     gaudi_BartDecoder_forward,
```
optimum/habana/transformers/models/baichuan/__init__.py

Lines changed: 5 additions & 0 deletions

```diff
@@ -0,0 +1,5 @@
+from .configuration_baichuan import BaichuanConfig
+from .modeling_baichuan import (
+    BaichuanForCausalLM,
+)
+from .tokenization_baichuan import BaichuanTokenizer
```
optimum/habana/transformers/models/baichuan/configuration_baichuan.py

Lines changed: 79 additions & 0 deletions

```diff
@@ -0,0 +1,79 @@
+# Copyright 2023 Baichuan Inc. All Rights Reserved.
+
+# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
+#
+# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
+# and OPT implementations in this library. It has been modified from its
+# original forms to accommodate minor architectural differences compared
+# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+Adapted from the following sources:
+https://huggingface.co/baichuan-inc/Baichuan2-7B-Chat/blob/main/configuration_baichuan.py
+https://huggingface.co/baichuan-inc/Baichuan2-13B-Chat/blob/main/configuration_baichuan.py
+"""
+
+import sys
+
+from transformers.configuration_utils import PretrainedConfig
+
+
+class BaichuanConfig(PretrainedConfig):
+    model_type = "baichuan"
+    keys_to_ignore_at_inference = ["past_key_values"]
+
+    def __init__(
+        self,
+        vocab_size=125696,
+        hidden_size=4096,
+        intermediate_size=11008,
+        num_hidden_layers=32,
+        num_attention_heads=32,
+        hidden_act="silu",
+        max_position_embeddings=sys.maxsize,
+        model_max_length=4096,
+        initializer_range=0.02,
+        rms_norm_eps=1e-6,
+        use_cache=True,
+        pad_token_id=0,
+        bos_token_id=1,
+        eos_token_id=2,
+        tie_word_embeddings=False,
+        gradient_checkpointing=False,
+        z_loss_weight=0,
+        **kwargs,
+    ):
+        self.vocab_size = vocab_size
+        # The 13B config doesn't have max_position_embeddings
+        if max_position_embeddings < sys.maxsize:
+            self.max_position_embeddings = max_position_embeddings
+        self.model_max_length = model_max_length
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.hidden_act = hidden_act
+        self.initializer_range = initializer_range
+        self.rms_norm_eps = rms_norm_eps
+        self.use_cache = use_cache
+        self.z_loss_weight = z_loss_weight
+        self.gradient_checkpointing = gradient_checkpointing
+        super().__init__(
+            pad_token_id=pad_token_id,
+            bos_token_id=bos_token_id,
+            eos_token_id=eos_token_id,
+            tie_word_embeddings=tie_word_embeddings,
+            **kwargs,
+        )
```
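One quirk worth noting: because `max_position_embeddings` defaults to `sys.maxsize`, the attribute is only stored when a finite value is passed (the 7B case), which is exactly what the `hasattr` guard added to `generate()` above relies on. A quick check, assuming the import path exposed by models/__init__.py:

```python
from optimum.habana.transformers.models import BaichuanConfig

cfg_7b_style = BaichuanConfig(max_position_embeddings=4096)
assert cfg_7b_style.max_position_embeddings == 4096

cfg_13b_style = BaichuanConfig()  # default sys.maxsize -> attribute never set
assert not hasattr(cfg_13b_style, "max_position_embeddings")
```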
optimum/habana/transformers/models/baichuan/generation_utils.py

Lines changed: 108 additions & 0 deletions

```diff
@@ -0,0 +1,108 @@
+# Copyright 2023 Baichuan Inc. All Rights Reserved.
+
+# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
+#
+# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
+# and OPT implementations in this library. It has been modified from its
+# original forms to accommodate minor architectural differences compared
+# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+Adapted from the following sources:
+https://huggingface.co/baichuan-inc/Baichuan2-7B-Chat/blob/main/generation_utils.py
+https://huggingface.co/baichuan-inc/Baichuan2-13B-Chat/blob/main/generation_utils.py
+"""
+
+from queue import Queue
+from typing import List
+
+import torch
+
+
+def build_chat_input(model, tokenizer, messages: List[dict], max_new_tokens: int = 0):
+    def _parse_messages(messages, split_role="user"):
+        system, rounds = "", []
+        round = []
+        for i, message in enumerate(messages):
+            if message["role"] == "system":
+                assert i == 0
+                system = message["content"]
+                continue
+            if message["role"] == split_role and round:
+                rounds.append(round)
+                round = []
+            round.append(message)
+        if round:
+            rounds.append(round)
+        return system, rounds
+
+    max_new_tokens = max_new_tokens or model.generation_config.max_new_tokens
+    max_input_tokens = model.config.model_max_length - max_new_tokens
+    system, rounds = _parse_messages(messages, split_role="user")
+    system_tokens = tokenizer.encode(system)
+    max_history_tokens = max_input_tokens - len(system_tokens)
+
+    history_tokens = []
+    for round in rounds[::-1]:
+        round_tokens = []
+        for message in round:
+            if message["role"] == "user":
+                round_tokens.append(model.generation_config.user_token_id)
+            else:
+                round_tokens.append(model.generation_config.assistant_token_id)
+            round_tokens.extend(tokenizer.encode(message["content"]))
+        if len(history_tokens) == 0 or len(history_tokens) + len(round_tokens) <= max_history_tokens:
+            history_tokens = round_tokens + history_tokens  # concat left
+            if len(history_tokens) < max_history_tokens:
+                continue
+        break
+
+    input_tokens = system_tokens + history_tokens
+    if messages[-1]["role"] != "assistant":
+        input_tokens.append(model.generation_config.assistant_token_id)
+    input_tokens = input_tokens[-max_input_tokens:]  # truncate left
+    return torch.LongTensor([input_tokens]).to(model.device)
+
+
+class TextIterStreamer:
+    def __init__(self, tokenizer, skip_prompt=False, skip_special_tokens=False):
+        self.tokenizer = tokenizer
+        self.skip_prompt = skip_prompt
+        self.skip_special_tokens = skip_special_tokens
+        self.tokens = []
+        self.text_queue = Queue()
+        self.next_tokens_are_prompt = True
+
+    def put(self, value):
+        if self.skip_prompt and self.next_tokens_are_prompt:
+            self.next_tokens_are_prompt = False
+        else:
+            if len(value.shape) > 1:
+                value = value[0]
+            self.tokens.extend(value.tolist())
+            self.text_queue.put(self.tokenizer.decode(self.tokens, skip_special_tokens=self.skip_special_tokens))
+
+    def end(self):
+        self.text_queue.put(None)
+
+    def __iter__(self):
+        return self
+
+    def __next__(self):
+        value = self.text_queue.get()
+        if value is None:
+            raise StopIteration()
+        else:
+            return value
```
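A hedged usage sketch: `build_chat_input` packs a chat history right-to-left into the model's context window, and `TextIterStreamer` yields the cumulative decoded text as tokens arrive. Here `model` and `tokenizer` are assumed to be a loaded Baichuan2 chat checkpoint and its tokenizer; `generate` runs on a worker thread because it blocks:

```python
from threading import Thread

messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "Summarize what HPU graphs do."},
]

# Drops the oldest rounds first, then appends the assistant token id.
input_ids = build_chat_input(model, tokenizer, messages, max_new_tokens=512)

streamer = TextIterStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
Thread(
    target=model.generate,
    kwargs={"inputs": input_ids, "streamer": streamer, "max_new_tokens": 512},
).start()

for partial_text in streamer:  # each item is the full decoded text so far
    print(partial_text, end="\r")
```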
