paddlenlp/data/data_collator.py (29 changes: 1 addition & 28 deletions)

@@ -370,11 +370,7 @@ def __call__(self, features, return_tensors=None):
         if return_tensors is None:
             return_tensors = self.return_tensors
         labels = [feature["labels"] for feature in batch] if "labels" in batch[0].keys() else None
-        use_attn_mask_startend_row_indices = (
-            [feature["attn_mask_startend_row_indices"] for feature in batch]
-            if "attn_mask_startend_row_indices" in batch[0].keys()
-            else None
-        )
+
         # We have to pad the labels before calling `tokenizer.pad` as this method won't pad them and needs them of the
         # same length to return tensors.
         if labels is not None:
@@ -401,29 +397,6 @@ def __call__(self, features, return_tensors=None):
                     feature["labels"] = np.concatenate([feature["labels"], remainder]).astype(np.int64)
                 else:
                     feature["labels"] = np.concatenate([remainder, feature["labels"]]).astype(np.int64)
-        if use_attn_mask_startend_row_indices is not None:
-            if self.max_length is not None:
-                max_length = self.max_length
-            else:
-                max_length = max(len(l) for l in use_attn_mask_startend_row_indices)
-            if self.pad_to_multiple_of is not None:
-                max_length = (
-                    (max_length + self.pad_to_multiple_of - 1) // self.pad_to_multiple_of * self.pad_to_multiple_of
-                )
-
-            for feature in batch:
-                pad_len = max_length - len(feature["attn_mask_startend_row_indices"])
-                remainder = np.zeros([1, pad_len], dtype=np.int32)
-                feature["attn_mask_startend_row_indices"] = (
-                    np.concatenate(
-                        [remainder, np.array([feature["attn_mask_startend_row_indices"]], dtype=np.int32) + pad_len],
-                        axis=-1,
-                    )
-                    if padding_side == "left"
-                    else np.concatenate(
-                        [np.array([feature["attn_mask_startend_row_indices"]], dtype=np.int32), remainder], axis=-1
-                    )
-                )

         batch = self.tokenizer.pad(
             batch,
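
Note on the removed collator branch: it padded each feature's attn_mask_startend_row_indices out to max_length, and when padding on the left it also added pad_len to the surviving indices so they point at the same rows after the shift. A standalone numpy sketch of that arithmetic (hypothetical values, not code from this PR):

    import numpy as np

    # One hypothetical feature of 4 tokens; index values chosen for illustration.
    row_indices = [4, 4, 4, 4]
    max_length, padding_side = 8, "left"

    pad_len = max_length - len(row_indices)
    remainder = np.zeros([1, pad_len], dtype=np.int32)
    if padding_side == "left":
        # Shift the kept indices by pad_len so they still refer to the same rows.
        padded = np.concatenate([remainder, np.array([row_indices], dtype=np.int32) + pad_len], axis=-1)
    else:
        padded = np.concatenate([np.array([row_indices], dtype=np.int32), remainder], axis=-1)

    print(padded)  # [[0 0 0 0 8 8 8 8]] for "left", [[4 4 4 4 0 0 0 0]] for "right"

The (max_length + self.pad_to_multiple_of - 1) // self.pad_to_multiple_of * self.pad_to_multiple_of expression above it is ordinary round-up-to-a-multiple arithmetic: with pad_to_multiple_of=8, a batch max of 10 becomes 16.
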

paddlenlp/transformers/bloom/tokenizer.py (63 changes: 0 additions & 63 deletions)

@@ -18,14 +18,11 @@
 import os
 import shutil
 from functools import lru_cache
-from typing import Dict, Literal, Optional, Union
 
-import numpy as np
 from paddle.utils import try_import
 
 from paddlenlp.transformers import AddedToken, PretrainedTokenizer
 
-from ..tokenizer_utils_base import BatchEncoding, EncodedInput, PaddingStrategy
 from .configuration import (
     BLOOM_PRETRAINED_MODEL_ARCHIVE_LIST,
     _construct_resource_file_url,
@@ -353,63 +350,3 @@ def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
             return output
 
         return output + bos_token_ids + token_ids_1
-
-    def _pad(
-        self,
-        encoded_inputs: Union[Dict[str, EncodedInput], BatchEncoding],
-        max_length: Optional[int] = None,
-        padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
-        pad_to_multiple_of: Optional[int] = None,
-        padding_side: Optional[Literal["right", "left"]] = None,
-        return_attention_mask: Optional[bool] = None,
-    ) -> dict:
-        """
-        Pad encoded inputs (on left/right and up to predefined length or max length in the batch)
-
-        Args:
-            encoded_inputs:
-                Dictionary of tokenized inputs (`List[int]`) or batch of tokenized inputs (`List[List[int]]`).
-            max_length: maximum length of the returned list and optionally padding length (see below).
-                Will truncate by taking into account the special tokens.
-            padding_strategy: PaddingStrategy to use for padding.
-
-                - PaddingStrategy.LONGEST Pad to the longest sequence in the batch
-                - PaddingStrategy.MAX_LENGTH: Pad to the max length (default)
-                - PaddingStrategy.DO_NOT_PAD: Do not pad
-                The tokenizer padding sides are defined in `padding_side` argument:
-
-                - 'left': pads on the left of the sequences
-                - 'right': pads on the right of the sequences
-            pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value.
-                This is especially useful to enable the use of Tensor Core on NVIDIA hardware with compute capability
-                >= 7.5 (Volta).
-            padding_side: (optional) The side on which the model should have padding applied.
-                Should be selected between ['right', 'left'].
-                Default value is picked from the class attribute of the same name.
-            return_attention_mask:
-                (optional) Set to False to avoid returning attention mask (default: set to model specifics)
-        """
-        # Load from model defaults
-        if "attention_mask" in encoded_inputs and len(np.shape(encoded_inputs["attention_mask"])) > 2:
-            attention_mask = encoded_inputs["attention_mask"]
-            encoded_inputs.pop("attention_mask")
-        else:
-            attention_mask = None
-
-        required_input = encoded_inputs[self.model_input_names[0]]
-        encoded_inputs = super()._pad(
-            encoded_inputs, max_length, padding_strategy, pad_to_multiple_of, padding_side, return_attention_mask
-        )
-        if attention_mask is not None and len(np.shape(attention_mask)) > 2:
-            encoded_inputs["attention_mask"] = attention_mask
-            needs_to_be_padded = padding_strategy != PaddingStrategy.DO_NOT_PAD and len(required_input) != max_length
-            if needs_to_be_padded:
-                difference = max_length - len(required_input)
-                if "attention_mask" in encoded_inputs:
-                    encoded_inputs["attention_mask"] = np.pad(
-                        encoded_inputs["attention_mask"],
-                        pad_width=[(0, 0), (difference, 0), (difference, 0)],
-                        mode="constant",
-                        constant_values=0,
-                    )
-        return encoded_inputs
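
For context, the _pad override removed here (the gemma and gpt tokenizers below carried near-identical copies) protected a pre-built attention mask of shape [1, seq_len, seq_len]: since the base class pads flat token lists and 2D masks, the override popped the mask, let super()._pad handle everything else, then re-padded the mask's last two axes itself. A minimal standalone sketch of that np.pad call (hypothetical sizes, not code from this PR):

    import numpy as np

    # Hypothetical 3-token causal mask of shape [1, seq_len, seq_len].
    mask = np.tril(np.ones((1, 3, 3), dtype=np.int64))
    difference = 2  # padding the sequence from 3 tokens out to 5

    # pad_width=[(0, 0), (difference, 0), (difference, 0)] grows only the front
    # of the last two axes; the new rows and columns are zeros, i.e. masked out.
    padded = np.pad(
        mask,
        pad_width=[(0, 0), (difference, 0), (difference, 0)],
        mode="constant",
        constant_values=0,
    )
    print(padded.shape)  # (1, 5, 5); padded[0, :2, :] and padded[0, :, :2] are all zeros
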

paddlenlp/transformers/chatglm_v2/tokenizer.py (40 changes: 7 additions & 33 deletions)

@@ -17,7 +17,6 @@
 import re
 from typing import Any, Dict, List, Literal, Optional, Union
 
-import numpy as np
 from sentencepiece import SentencePieceProcessor
 
 from .. import PretrainedTokenizer
@@ -249,14 +248,12 @@ def _pad(
     ) -> dict:
         """
         Pad encoded inputs (on left/right and up to predefined length or max length in the batch)
-
         Args:
             encoded_inputs:
                 Dictionary of tokenized inputs (`List[int]`) or batch of tokenized inputs (`List[List[int]]`).
             max_length: maximum length of the returned list and optionally padding length (see below).
                 Will truncate by taking into account the special tokens.
             padding_strategy: PaddingStrategy to use for padding.
-
                 - PaddingStrategy.LONGEST Pad to the longest sequence in the batch
                 - PaddingStrategy.MAX_LENGTH: Pad to the max length (default)
                 - PaddingStrategy.DO_NOT_PAD: Do not pad
@@ -280,39 +277,16 @@ def _pad(
         required_input = encoded_inputs[self.model_input_names[0]]
         seq_length = len(required_input)
 
-        if padding_strategy == PaddingStrategy.LONGEST:
-            max_length = len(required_input)
-
-        if max_length is not None and pad_to_multiple_of is not None and (max_length % pad_to_multiple_of != 0):
-            max_length = ((max_length // pad_to_multiple_of) + 1) * pad_to_multiple_of
-
-        needs_to_be_padded = padding_strategy != PaddingStrategy.DO_NOT_PAD and len(required_input) != max_length
-
-        # Initialize attention mask if not present.
-        if "attention_mask" not in encoded_inputs:
-            encoded_inputs["attention_mask"] = [1] * seq_length
-
         if "position_ids" not in encoded_inputs:
             encoded_inputs["position_ids"] = list(range(seq_length))
 
-        if needs_to_be_padded:
-            difference = max_length - len(required_input)
-
-            if "attention_mask" in encoded_inputs:
-                # 3D/4D attention mask
-                if len(np.shape(encoded_inputs["attention_mask"])) > 2:
-                    encoded_inputs["attention_mask"] = np.pad(
-                        encoded_inputs["attention_mask"],
-                        pad_width=[(0, 0), (difference, 0), (difference, 0)],
-                        mode="constant",
-                        constant_values=0,
-                    )
-                # 2D attention mask
-                else:
-                    encoded_inputs["attention_mask"] = [0] * difference + encoded_inputs["attention_mask"]
-            if "position_ids" in encoded_inputs:
-                encoded_inputs["position_ids"] = [0] * difference + encoded_inputs["position_ids"]
-            encoded_inputs[self.model_input_names[0]] = [self.pad_token_id] * difference + required_input
+        super()._pad(
+            encoded_inputs=encoded_inputs,
+            max_length=max_length,
+            padding_strategy=padding_strategy,
+            pad_to_multiple_of=pad_to_multiple_of,
+            return_attention_mask=return_attention_mask,
+        )
 
         return encoded_inputs
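
The deleted chatglm_v2 branch duplicated work the base _pad already performs (attention-mask initialization, left padding, and rounding max_length up to a multiple of pad_to_multiple_of); only the position_ids default stays local, presumably because the base method pads existing keys rather than creating them. Its guarded round-up and the collator's branch-free form compute the same ceiling; a quick illustrative check (helper names are ours, for illustration only):

    def ceil_guarded(max_length: int, multiple: int) -> int:
        # chatglm_v2 style: round up only when not already aligned
        if max_length % multiple != 0:
            max_length = ((max_length // multiple) + 1) * multiple
        return max_length

    def ceil_unguarded(max_length: int, multiple: int) -> int:
        # data_collator style: branch-free ceiling to a multiple
        return (max_length + multiple - 1) // multiple * multiple

    assert all(ceil_guarded(n, 8) == ceil_unguarded(n, 8) for n in range(1, 129))
    print(ceil_unguarded(10, 8))  # 16
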

paddlenlp/transformers/gemma/tokenizer.py (72 changes: 2 additions & 70 deletions)

@@ -15,19 +15,13 @@

 import os
 from shutil import copyfile
-from typing import Any, Dict, List, Literal, Optional, Tuple, Union
+from typing import Any, Dict, List, Optional, Tuple
 
-import numpy as np
 import sentencepiece as spm
 
 from ...utils.log import logger
 from .. import PretrainedTokenizer
-from ..tokenizer_utils_base import (
-    AddedToken,
-    BatchEncoding,
-    EncodedInput,
-    PaddingStrategy,
-)
+from ..tokenizer_utils_base import AddedToken
 
 __all__ = ["GemmaTokenizer"]
 
@@ -316,65 +310,3 @@ def create_token_type_ids_from_sequences(
             output += [1] * len(bos_token_id + token_ids_1 + eos_token_id)
 
         return output
-
-    def _pad(
-        self,
-        encoded_inputs: Union[Dict[str, EncodedInput], BatchEncoding],
-        max_length: Optional[int] = None,
-        padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
-        pad_to_multiple_of: Optional[int] = None,
-        padding_side: Optional[Literal["right", "left"]] = None,
-        return_attention_mask: Optional[bool] = None,
-    ) -> dict:
-        """
-        For Zero Padding, Copied from llama
-
-        Args:
-            encoded_inputs:
-                Dictionary of tokenized inputs (`List[int]`) or batch of tokenized inputs (`List[List[int]]`).
-            max_length: maximum length of the returned list and optionally padding length (see below).
-                Will truncate by taking into account the special tokens.
-            padding_strategy: PaddingStrategy to use for padding.
-
-                - PaddingStrategy.LONGEST Pad to the longest sequence in the batch
-                - PaddingStrategy.MAX_LENGTH: Pad to the max length (default)
-                - PaddingStrategy.DO_NOT_PAD: Do not pad
-                The tokenizer padding sides are defined in self.padding_side:
-
-                - 'left': pads on the left of the sequences
-                - 'right': pads on the right of the sequences
-            pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value.
-                This is especially useful to enable the use of Tensor Core on NVIDIA hardware with compute capability
-                >= 7.5 (Volta).
-            padding_side: (optional) The side on which the model should have padding applied.
-                Should be selected between ['right', 'left'].
-                Default value is picked from the class attribute of the same name.
-            return_attention_mask:
-                (optional) Set to False to avoid returning attention mask (default: set to model specifics)
-        """
-        # Load from model defaults
-
-        # attention_mask shape [1,seq_len,seq_len]
-        if "attention_mask" in encoded_inputs and len(np.shape(encoded_inputs["attention_mask"])) > 2:
-            attention_mask = encoded_inputs["attention_mask"]
-            encoded_inputs.pop("attention_mask")
-        else:
-            attention_mask = None
-
-        required_input = encoded_inputs[self.model_input_names[0]]
-        encoded_inputs = super()._pad(
-            encoded_inputs, max_length, padding_strategy, pad_to_multiple_of, padding_side, return_attention_mask
-        )
-        if attention_mask is not None and len(np.shape(attention_mask)) > 2:
-            encoded_inputs["attention_mask"] = attention_mask
-            needs_to_be_padded = padding_strategy != PaddingStrategy.DO_NOT_PAD and len(required_input) != max_length
-            if needs_to_be_padded:
-                difference = max_length - len(required_input)
-                if "attention_mask" in encoded_inputs:
-                    encoded_inputs["attention_mask"] = np.pad(
-                        encoded_inputs["attention_mask"],
-                        pad_width=[(0, 0), (difference, 0), (difference, 0)],
-                        mode="constant",
-                        constant_values=0,
-                    )
-        return encoded_inputs

paddlenlp/transformers/gpt/tokenizer.py (65 changes: 0 additions & 65 deletions)

@@ -17,15 +17,12 @@
 import os
 import shutil
 from functools import lru_cache
-from typing import Dict, Literal, Optional, Union
 
 import jieba
-import numpy as np
 import sentencepiece as spm
 from paddle.utils import try_import
 
 from .. import AddedToken, PretrainedTokenizer
-from ..tokenizer_utils_base import BatchEncoding, EncodedInput, PaddingStrategy
 
 __all__ = [
     "GPTTokenizer",
@@ -577,65 +574,3 @@ def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
             return output
 
         return output + bos_token_ids + token_ids_1
-
-    def _pad(
-        self,
-        encoded_inputs: Union[Dict[str, EncodedInput], BatchEncoding],
-        max_length: Optional[int] = None,
-        padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
-        pad_to_multiple_of: Optional[int] = None,
-        padding_side: Optional[Literal["right", "left"]] = None,
-        return_attention_mask: Optional[bool] = None,
-    ) -> dict:
-        """
-        Pad encoded inputs (on left/right and up to predefined length or max length in the batch)
-
-        Args:
-            encoded_inputs:
-                Dictionary of tokenized inputs (`List[int]`) or batch of tokenized inputs (`List[List[int]]`).
-            max_length: maximum length of the returned list and optionally padding length (see below).
-                Will truncate by taking into account the special tokens.
-            padding_strategy: PaddingStrategy to use for padding.
-
-                - PaddingStrategy.LONGEST Pad to the longest sequence in the batch
-                - PaddingStrategy.MAX_LENGTH: Pad to the max length (default)
-                - PaddingStrategy.DO_NOT_PAD: Do not pad
-                The tokenizer padding sides are defined in `padding_side` argument:
-
-                - 'left': pads on the left of the sequences
-                - 'right': pads on the right of the sequences
-            pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value.
-                This is especially useful to enable the use of Tensor Core on NVIDIA hardware with compute capability
-                >= 7.5 (Volta).
-            padding_side: (optional) The side on which the model should have padding applied.
-                Should be selected between ['right', 'left'].
-                Default value is picked from the class attribute of the same name.
-            return_attention_mask:
-                (optional) Set to False to avoid returning attention mask (default: set to model specifics)
-        """
-        # Load from model defaults
-
-        # attention_mask shape [1,seq_len,seq_len]
-        if "attention_mask" in encoded_inputs and len(np.shape(encoded_inputs["attention_mask"])) > 2:
-            attention_mask = encoded_inputs["attention_mask"]
-            encoded_inputs.pop("attention_mask")
-        else:
-            attention_mask = None
-
-        required_input = encoded_inputs[self.model_input_names[0]]
-        encoded_inputs = super()._pad(
-            encoded_inputs, max_length, padding_strategy, pad_to_multiple_of, padding_side, return_attention_mask
-        )
-        if attention_mask is not None and len(np.shape(attention_mask)) > 2:
-            encoded_inputs["attention_mask"] = attention_mask
-            needs_to_be_padded = padding_strategy != PaddingStrategy.DO_NOT_PAD and len(required_input) != max_length
-            if needs_to_be_padded:
-                difference = max_length - len(required_input)
-                if "attention_mask" in encoded_inputs:
-                    encoded_inputs["attention_mask"] = np.pad(
-                        encoded_inputs["attention_mask"],
-                        pad_width=[(0, 0), (difference, 0), (difference, 0)],
-                        mode="constant",
-                        constant_values=0,
-                    )
-        return encoded_inputs