paddlenlp/data/data_collator.py (29 changes: 1 addition & 28 deletions)

@@ -370,11 +370,7 @@ def __call__(self, features, return_tensors=None):
         if return_tensors is None:
             return_tensors = self.return_tensors
         labels = [feature["labels"] for feature in batch] if "labels" in batch[0].keys() else None
-        use_attn_mask_startend_row_indices = (
-            [feature["attn_mask_startend_row_indices"] for feature in batch]
-            if "attn_mask_startend_row_indices" in batch[0].keys()
-            else None
-        )
+
         # We have to pad the labels before calling `tokenizer.pad` as this method won't pad them and needs them of the
         # same length to return tensors.
         if labels is not None:
@@ -401,29 +397,6 @@ def __call__(self, features, return_tensors=None):
                     feature["labels"] = np.concatenate([feature["labels"], remainder]).astype(np.int64)
                 else:
                     feature["labels"] = np.concatenate([remainder, feature["labels"]]).astype(np.int64)
-        if use_attn_mask_startend_row_indices is not None:
-            if self.max_length is not None:
-                max_length = self.max_length
-            else:
-                max_length = max(len(l) for l in use_attn_mask_startend_row_indices)
-            if self.pad_to_multiple_of is not None:
-                max_length = (
-                    (max_length + self.pad_to_multiple_of - 1) // self.pad_to_multiple_of * self.pad_to_multiple_of
-                )
-
-            for feature in batch:
-                pad_len = max_length - len(feature["attn_mask_startend_row_indices"])
-                remainder = np.zeros([1, pad_len], dtype=np.int32)
-                feature["attn_mask_startend_row_indices"] = (
-                    np.concatenate(
-                        [remainder, np.array([feature["attn_mask_startend_row_indices"]], dtype=np.int32) + pad_len],
-                        axis=-1,
-                    )
-                    if padding_side == "left"
-                    else np.concatenate(
-                        [np.array([feature["attn_mask_startend_row_indices"]], dtype=np.int32), remainder], axis=-1
-                    )
-                )

         batch = self.tokenizer.pad(
             batch,
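
Note on the removed collator branch: it padded each feature's attn_mask_startend_row_indices out to max_length, and when padding on the left it also added pad_len to the surviving indices so they point at the same rows after the shift. A standalone numpy sketch of that arithmetic (hypothetical values, not code from this PR):

    import numpy as np

    # One hypothetical feature of 4 tokens; index values chosen for illustration.
    row_indices = [4, 4, 4, 4]
    max_length, padding_side = 8, "left"

    pad_len = max_length - len(row_indices)
    remainder = np.zeros([1, pad_len], dtype=np.int32)
    if padding_side == "left":
        # Shift the kept indices by pad_len so they still refer to the same rows.
        padded = np.concatenate([remainder, np.array([row_indices], dtype=np.int32) + pad_len], axis=-1)
    else:
        padded = np.concatenate([np.array([row_indices], dtype=np.int32), remainder], axis=-1)

    print(padded)  # [[0 0 0 0 8 8 8 8]] for "left", [[4 4 4 4 0 0 0 0]] for "right"

The (max_length + self.pad_to_multiple_of - 1) // self.pad_to_multiple_of * self.pad_to_multiple_of expression above it is ordinary round-up-to-a-multiple arithmetic: with pad_to_multiple_of=8, a batch max of 10 becomes 16.
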

paddlenlp/transformers/bloom/tokenizer.py (63 changes: 0 additions & 63 deletions)

@@ -18,14 +18,11 @@
 import os
 import shutil
 from functools import lru_cache
-from typing import Dict, Literal, Optional, Union
 
-import numpy as np
 from paddle.utils import try_import
 
 from paddlenlp.transformers import AddedToken, PretrainedTokenizer
 
-from ..tokenizer_utils_base import BatchEncoding, EncodedInput, PaddingStrategy
 from .configuration import (
     BLOOM_PRETRAINED_MODEL_ARCHIVE_LIST,
     _construct_resource_file_url,
@@ -353,63 +350,3 @@ def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
             return output
 
         return output + bos_token_ids + token_ids_1
-
-    def _pad(
-        self,
-        encoded_inputs: Union[Dict[str, EncodedInput], BatchEncoding],
-        max_length: Optional[int] = None,
-        padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
-        pad_to_multiple_of: Optional[int] = None,
-        padding_side: Optional[Literal["right", "left"]] = None,
-        return_attention_mask: Optional[bool] = None,
-    ) -> dict:
-        """
-        Pad encoded inputs (on left/right and up to predefined length or max length in the batch)
-
-        Args:
-            encoded_inputs:
-                Dictionary of tokenized inputs (`List[int]`) or batch of tokenized inputs (`List[List[int]]`).
-            max_length: maximum length of the returned list and optionally padding length (see below).
-                Will truncate by taking into account the special tokens.
-            padding_strategy: PaddingStrategy to use for padding.
-
-                - PaddingStrategy.LONGEST Pad to the longest sequence in the batch
-                - PaddingStrategy.MAX_LENGTH: Pad to the max length (default)
-                - PaddingStrategy.DO_NOT_PAD: Do not pad
-                The tokenizer padding sides are defined in `padding_side` argument:
-
-                - 'left': pads on the left of the sequences
-                - 'right': pads on the right of the sequences
-            pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value.
-                This is especially useful to enable the use of Tensor Core on NVIDIA hardware with compute capability
-                >= 7.5 (Volta).
-            padding_side: (optional) The side on which the model should have padding applied.
-                Should be selected between ['right', 'left'].
-                Default value is picked from the class attribute of the same name.
-            return_attention_mask:
-                (optional) Set to False to avoid returning attention mask (default: set to model specifics)
-        """
-        # Load from model defaults
-        if "attention_mask" in encoded_inputs and len(np.shape(encoded_inputs["attention_mask"])) > 2:
-            attention_mask = encoded_inputs["attention_mask"]
-            encoded_inputs.pop("attention_mask")
-        else:
-            attention_mask = None
-
-        required_input = encoded_inputs[self.model_input_names[0]]
-        encoded_inputs = super()._pad(
-            encoded_inputs, max_length, padding_strategy, pad_to_multiple_of, padding_side, return_attention_mask
-        )
-        if attention_mask is not None and len(np.shape(attention_mask)) > 2:
-            encoded_inputs["attention_mask"] = attention_mask
-            needs_to_be_padded = padding_strategy != PaddingStrategy.DO_NOT_PAD and len(required_input) != max_length
-            if needs_to_be_padded:
-                difference = max_length - len(required_input)
-                if "attention_mask" in encoded_inputs:
-                    encoded_inputs["attention_mask"] = np.pad(
-                        encoded_inputs["attention_mask"],
-                        pad_width=[(0, 0), (difference, 0), (difference, 0)],
-                        mode="constant",
-                        constant_values=0,
-                    )
-        return encoded_inputs
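
For context, the _pad override removed here (the gemma and gpt tokenizers below carried near-identical copies) protected a pre-built attention mask of shape [1, seq_len, seq_len]: since the base class pads flat token lists and 2D masks, the override popped the mask, let super()._pad handle everything else, then re-padded the mask's last two axes itself. A minimal standalone sketch of that np.pad call (hypothetical sizes, not code from this PR):

    import numpy as np

    # Hypothetical 3-token causal mask of shape [1, seq_len, seq_len].
    mask = np.tril(np.ones((1, 3, 3), dtype=np.int64))
    difference = 2  # padding the sequence from 3 tokens out to 5

    # pad_width=[(0, 0), (difference, 0), (difference, 0)] grows only the front
    # of the last two axes; the new rows and columns are zeros, i.e. masked out.
    padded = np.pad(
        mask,
        pad_width=[(0, 0), (difference, 0), (difference, 0)],
        mode="constant",
        constant_values=0,
    )
    print(padded.shape)  # (1, 5, 5); padded[0, :2, :] and padded[0, :, :2] are all zeros
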

paddlenlp/transformers/chatglm_v2/tokenizer.py (40 changes: 7 additions & 33 deletions)

@@ -17,7 +17,6 @@
 import re
 from typing import Any, Dict, List, Literal, Optional, Union
 
-import numpy as np
 from sentencepiece import SentencePieceProcessor
 
 from .. import PretrainedTokenizer
@@ -249,14 +248,12 @@ def _pad(
     ) -> dict:
         """
         Pad encoded inputs (on left/right and up to predefined length or max length in the batch)
-
         Args:
             encoded_inputs:
                 Dictionary of tokenized inputs (`List[int]`) or batch of tokenized inputs (`List[List[int]]`).
             max_length: maximum length of the returned list and optionally padding length (see below).
                 Will truncate by taking into account the special tokens.
             padding_strategy: PaddingStrategy to use for padding.
-
                 - PaddingStrategy.LONGEST Pad to the longest sequence in the batch
                 - PaddingStrategy.MAX_LENGTH: Pad to the max length (default)
                 - PaddingStrategy.DO_NOT_PAD: Do not pad
@@ -280,39 +277,16 @@ def _pad(
         required_input = encoded_inputs[self.model_input_names[0]]
         seq_length = len(required_input)
 
-        if padding_strategy == PaddingStrategy.LONGEST:
-            max_length = len(required_input)
-
-        if max_length is not None and pad_to_multiple_of is not None and (max_length % pad_to_multiple_of != 0):
-            max_length = ((max_length // pad_to_multiple_of) + 1) * pad_to_multiple_of
-
-        needs_to_be_padded = padding_strategy != PaddingStrategy.DO_NOT_PAD and len(required_input) != max_length
-
-        # Initialize attention mask if not present.
-        if "attention_mask" not in encoded_inputs:
-            encoded_inputs["attention_mask"] = [1] * seq_length
-
         if "position_ids" not in encoded_inputs:
             encoded_inputs["position_ids"] = list(range(seq_length))
 
-        if needs_to_be_padded:
-            difference = max_length - len(required_input)
-
-            if "attention_mask" in encoded_inputs:
-                # 3D/4D attention mask
-                if len(np.shape(encoded_inputs["attention_mask"])) > 2:
-                    encoded_inputs["attention_mask"] = np.pad(
-                        encoded_inputs["attention_mask"],
-                        pad_width=[(0, 0), (difference, 0), (difference, 0)],
-                        mode="constant",
-                        constant_values=0,
-                    )
-                # 2D attention mask
-                else:
-                    encoded_inputs["attention_mask"] = [0] * difference + encoded_inputs["attention_mask"]
-            if "position_ids" in encoded_inputs:
-                encoded_inputs["position_ids"] = [0] * difference + encoded_inputs["position_ids"]
-            encoded_inputs[self.model_input_names[0]] = [self.pad_token_id] * difference + required_input
+        super()._pad(
+            encoded_inputs=encoded_inputs,
+            max_length=max_length,
+            padding_strategy=padding_strategy,
+            pad_to_multiple_of=pad_to_multiple_of,
+            return_attention_mask=return_attention_mask,
+        )
 
         return encoded_inputs
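
The deleted chatglm_v2 branch duplicated work the base _pad already performs (attention-mask initialization, left padding, and rounding max_length up to a multiple of pad_to_multiple_of); only the position_ids default stays local, presumably because the base method pads existing keys rather than creating them. Its guarded round-up and the collator's branch-free form compute the same ceiling; a quick illustrative check (helper names are ours, for illustration only):

    def ceil_guarded(max_length: int, multiple: int) -> int:
        # chatglm_v2 style: round up only when not already aligned
        if max_length % multiple != 0:
            max_length = ((max_length // multiple) + 1) * multiple
        return max_length

    def ceil_unguarded(max_length: int, multiple: int) -> int:
        # data_collator style: branch-free ceiling to a multiple
        return (max_length + multiple - 1) // multiple * multiple

    assert all(ceil_guarded(n, 8) == ceil_unguarded(n, 8) for n in range(1, 129))
    print(ceil_unguarded(10, 8))  # 16
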

paddlenlp/transformers/gemma/tokenizer.py (72 changes: 2 additions & 70 deletions)

@@ -15,19 +15,13 @@

 import os
 from shutil import copyfile
-from typing import Any, Dict, List, Literal, Optional, Tuple, Union
+from typing import Any, Dict, List, Optional, Tuple
 
-import numpy as np
 import sentencepiece as spm
 
 from ...utils.log import logger
 from .. import PretrainedTokenizer
-from ..tokenizer_utils_base import (
-    AddedToken,
-    BatchEncoding,
-    EncodedInput,
-    PaddingStrategy,
-)
+from ..tokenizer_utils_base import AddedToken
 
 __all__ = ["GemmaTokenizer"]
 
@@ -316,65 +310,3 @@ def create_token_type_ids_from_sequences(
             output += [1] * len(bos_token_id + token_ids_1 + eos_token_id)
 
         return output
-
-    def _pad(
-        self,
-        encoded_inputs: Union[Dict[str, EncodedInput], BatchEncoding],
-        max_length: Optional[int] = None,
-        padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
-        pad_to_multiple_of: Optional[int] = None,
-        padding_side: Optional[Literal["right", "left"]] = None,
-        return_attention_mask: Optional[bool] = None,
-    ) -> dict:
-        """
-        For Zero Padding, Copied from llama
-
-        Args:
-            encoded_inputs:
-                Dictionary of tokenized inputs (`List[int]`) or batch of tokenized inputs (`List[List[int]]`).
-            max_length: maximum length of the returned list and optionally padding length (see below).
-                Will truncate by taking into account the special tokens.
-            padding_strategy: PaddingStrategy to use for padding.
-
-                - PaddingStrategy.LONGEST Pad to the longest sequence in the batch
-                - PaddingStrategy.MAX_LENGTH: Pad to the max length (default)
-                - PaddingStrategy.DO_NOT_PAD: Do not pad
-                The tokenizer padding sides are defined in self.padding_side:
-
-                - 'left': pads on the left of the sequences
-                - 'right': pads on the right of the sequences
-            pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value.
-                This is especially useful to enable the use of Tensor Core on NVIDIA hardware with compute capability
-                >= 7.5 (Volta).
-            padding_side: (optional) The side on which the model should have padding applied.
-                Should be selected between ['right', 'left'].
-                Default value is picked from the class attribute of the same name.
-            return_attention_mask:
-                (optional) Set to False to avoid returning attention mask (default: set to model specifics)
-        """
-        # Load from model defaults
-
-        # attention_mask shape [1,seq_len,seq_len]
-        if "attention_mask" in encoded_inputs and len(np.shape(encoded_inputs["attention_mask"])) > 2:
-            attention_mask = encoded_inputs["attention_mask"]
-            encoded_inputs.pop("attention_mask")
-        else:
-            attention_mask = None
-
-        required_input = encoded_inputs[self.model_input_names[0]]
-        encoded_inputs = super()._pad(
-            encoded_inputs, max_length, padding_strategy, pad_to_multiple_of, padding_side, return_attention_mask
-        )
-        if attention_mask is not None and len(np.shape(attention_mask)) > 2:
-            encoded_inputs["attention_mask"] = attention_mask
-            needs_to_be_padded = padding_strategy != PaddingStrategy.DO_NOT_PAD and len(required_input) != max_length
-            if needs_to_be_padded:
-                difference = max_length - len(required_input)
-                if "attention_mask" in encoded_inputs:
-                    encoded_inputs["attention_mask"] = np.pad(
-                        encoded_inputs["attention_mask"],
-                        pad_width=[(0, 0), (difference, 0), (difference, 0)],
-                        mode="constant",
-                        constant_values=0,
-                    )
-        return encoded_inputs

paddlenlp/transformers/gpt/tokenizer.py (65 changes: 0 additions & 65 deletions)

@@ -17,15 +17,12 @@
 import os
 import shutil
 from functools import lru_cache
-from typing import Dict, Literal, Optional, Union
 
 import jieba
-import numpy as np
 import sentencepiece as spm
 from paddle.utils import try_import
 
 from .. import AddedToken, PretrainedTokenizer
-from ..tokenizer_utils_base import BatchEncoding, EncodedInput, PaddingStrategy
 
 __all__ = [
     "GPTTokenizer",
@@ -577,65 +574,3 @@ def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
             return output
 
         return output + bos_token_ids + token_ids_1
-
-    def _pad(
-        self,
-        encoded_inputs: Union[Dict[str, EncodedInput], BatchEncoding],
-        max_length: Optional[int] = None,
-        padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
-        pad_to_multiple_of: Optional[int] = None,
-        padding_side: Optional[Literal["right", "left"]] = None,
-        return_attention_mask: Optional[bool] = None,
-    ) -> dict:
-        """
-        Pad encoded inputs (on left/right and up to predefined length or max length in the batch)
-
-        Args:
-            encoded_inputs:
-                Dictionary of tokenized inputs (`List[int]`) or batch of tokenized inputs (`List[List[int]]`).
-            max_length: maximum length of the returned list and optionally padding length (see below).
-                Will truncate by taking into account the special tokens.
-            padding_strategy: PaddingStrategy to use for padding.
-
-                - PaddingStrategy.LONGEST Pad to the longest sequence in the batch
-                - PaddingStrategy.MAX_LENGTH: Pad to the max length (default)
-                - PaddingStrategy.DO_NOT_PAD: Do not pad
-                The tokenizer padding sides are defined in `padding_side` argument:
-
-                - 'left': pads on the left of the sequences
-                - 'right': pads on the right of the sequences
-            pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value.
-                This is especially useful to enable the use of Tensor Core on NVIDIA hardware with compute capability
-                >= 7.5 (Volta).
-            padding_side: (optional) The side on which the model should have padding applied.
-                Should be selected between ['right', 'left'].
-                Default value is picked from the class attribute of the same name.
-            return_attention_mask:
-                (optional) Set to False to avoid returning attention mask (default: set to model specifics)
-        """
-        # Load from model defaults
-
-        # attention_mask shape [1,seq_len,seq_len]
-        if "attention_mask" in encoded_inputs and len(np.shape(encoded_inputs["attention_mask"])) > 2:
-            attention_mask = encoded_inputs["attention_mask"]
-            encoded_inputs.pop("attention_mask")
-        else:
-            attention_mask = None
-
-        required_input = encoded_inputs[self.model_input_names[0]]
-        encoded_inputs = super()._pad(
-            encoded_inputs, max_length, padding_strategy, pad_to_multiple_of, padding_side, return_attention_mask
-        )
-        if attention_mask is not None and len(np.shape(attention_mask)) > 2:
-            encoded_inputs["attention_mask"] = attention_mask
-            needs_to_be_padded = padding_strategy != PaddingStrategy.DO_NOT_PAD and len(required_input) != max_length
-            if needs_to_be_padded:
-                difference = max_length - len(required_input)
-                if "attention_mask" in encoded_inputs:
-                    encoded_inputs["attention_mask"] = np.pad(
-                        encoded_inputs["attention_mask"],
-                        pad_width=[(0, 0), (difference, 0), (difference, 0)],
-                        mode="constant",
-                        constant_values=0,
-                    )
-        return encoded_inputs