
Commit cd22b0d

[Tokenizer] Add BertTokenizerFast, support register new tokenizer (#9353)
* Add BertTokenizerFast, support register tokenizer, fix some typos
* add more tests
* CustomTokenizerFast2 -> CustomTokenizerFastWithoutSlow
* lint
1 parent 5217a3b commit cd22b0d

13 files changed

Lines changed: 522 additions & 25 deletions

paddlenlp/transformers/auto/tokenizer.py

Lines changed: 51 additions & 1 deletion
@@ -24,6 +24,7 @@
 from ...utils.import_utils import import_module
 from ...utils.log import logger
 from ..configuration_utils import PretrainedConfig
+from ..tokenizer_utils import PretrainedTokenizer
 from ..tokenizer_utils_base import TOKENIZER_CONFIG_FILE
 from ..tokenizer_utils_fast import PretrainedTokenizerFast
 from .configuration import (
@@ -45,7 +46,13 @@
     [
         ("albert", (("AlbertChineseTokenizer", "AlbertEnglishTokenizer"), None)),
         ("bart", "BartTokenizer"),
-        ("bert", "BertTokenizer"),
+        (
+            "bert",
+            (
+                "BertTokenizer",
+                "BertTokenizerFast" if is_tokenizers_available() else None,
+            ),
+        ),
         ("blenderbot", "BlenderbotTokenizer"),
         ("bloom", "BloomTokenizer"),
         ("clip", "CLIPTokenizer"),
@@ -459,3 +466,46 @@ def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
                 "- or a correct model-identifier of community-contributed pretrained models,\n"
                 "- or the correct path to a directory containing relevant tokenizer files.\n"
             )
+
+    def register(config_class, slow_tokenizer_class=None, fast_tokenizer_class=None, exist_ok=False):
+        """
+        Register a new tokenizer in this mapping.
+
+        Args:
+            config_class ([`PretrainedConfig`]):
+                The configuration corresponding to the model to register.
+            slow_tokenizer_class ([`PretrainedTokenizer`], *optional*):
+                The slow tokenizer to register.
+            fast_tokenizer_class ([`PretrainedTokenizerFast`], *optional*):
+                The fast tokenizer to register.
+        """
+        if slow_tokenizer_class is None and fast_tokenizer_class is None:
+            raise ValueError("You need to pass either a `slow_tokenizer_class` or a `fast_tokenizer_class`")
+        if slow_tokenizer_class is not None and issubclass(slow_tokenizer_class, PretrainedTokenizerFast):
+            raise ValueError("You passed a fast tokenizer in the `slow_tokenizer_class`.")
+        if fast_tokenizer_class is not None and issubclass(fast_tokenizer_class, PretrainedTokenizer):
+            raise ValueError("You passed a slow tokenizer in the `fast_tokenizer_class`.")
+
+        if (
+            slow_tokenizer_class is not None
+            and fast_tokenizer_class is not None
+            and issubclass(fast_tokenizer_class, PretrainedTokenizerFast)
+            and fast_tokenizer_class.slow_tokenizer_class != slow_tokenizer_class
+        ):
+            raise ValueError(
+                "The fast tokenizer class you are passing has a `slow_tokenizer_class` attribute that is not "
+                "consistent with the slow tokenizer class you passed (fast tokenizer has "
+                f"{fast_tokenizer_class.slow_tokenizer_class} and you passed {slow_tokenizer_class}. Fix one of those "
+                "so they match!"
+            )
+
+        # Avoid resetting a set slow/fast tokenizer if we are passing just the other ones.
+        if config_class in TOKENIZER_MAPPING._extra_content:
+            existing_slow, existing_fast = TOKENIZER_MAPPING[config_class]
+            if slow_tokenizer_class is None:
+                slow_tokenizer_class = existing_slow
+            if fast_tokenizer_class is None:
+                fast_tokenizer_class = existing_fast
+
+        TOKENIZER_MAPPING.register(config_class, (slow_tokenizer_class, fast_tokenizer_class), exist_ok=exist_ok)
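
For reference, a minimal sketch of how the new registration hook can be used. The Custom* classes below are hypothetical stand-ins, and it is assumed that this `register` function is exposed as `AutoTokenizer.register`, mirroring the existing `AutoConfig.register` used in the tests:

from paddlenlp.transformers import AutoConfig, AutoTokenizer
from paddlenlp.transformers.configuration_utils import PretrainedConfig
from paddlenlp.transformers.tokenizer_utils import PretrainedTokenizer
from paddlenlp.transformers.tokenizer_utils_fast import PretrainedTokenizerFast


class CustomConfig(PretrainedConfig):  # hypothetical config class
    model_type = "custom"


class CustomTokenizer(PretrainedTokenizer):  # hypothetical slow tokenizer
    pass


class CustomTokenizerFast(PretrainedTokenizerFast):  # hypothetical fast tokenizer
    slow_tokenizer_class = CustomTokenizer


# Register the config type first, then the (slow, fast) tokenizer pair for it;
# the consistency checks above ensure the pair really belongs together.
AutoConfig.register("custom", CustomConfig)
AutoTokenizer.register(
    CustomConfig,
    slow_tokenizer_class=CustomTokenizer,
    fast_tokenizer_class=CustomTokenizerFast,
)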
paddlenlp/transformers/bert/tokenizer_fast.py

Lines changed: 164 additions & 0 deletions
@@ -0,0 +1,164 @@
+# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import json
+from typing import List, Optional, Tuple
+
+from tokenizers import normalizers
+
+from ..tokenizer_utils_fast import PretrainedTokenizerFast
+from .tokenizer import BertTokenizer
+
+VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt", "tokenizer_file": "tokenizer.json"}
+
+
+class BertTokenizerFast(PretrainedTokenizerFast):
+    r"""
+
+    This tokenizer inherits from [`PretrainedTokenizerFast`] which contains most of the main methods. Users should
+    refer to this superclass for more information regarding those methods.
+
+    Args:
+        vocab_file (`str`):
+            File containing the vocabulary.
+        do_lower_case (`bool`, *optional*, defaults to `True`):
+            Whether or not to lowercase the input when tokenizing.
+        unk_token (`str`, *optional*, defaults to `"[UNK]"`):
+            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
+            token instead.
+        sep_token (`str`, *optional*, defaults to `"[SEP]"`):
+            The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
+            sequence classification or for a text and a question for question answering. It is also used as the last
+            token of a sequence built with special tokens.
+        pad_token (`str`, *optional*, defaults to `"[PAD]"`):
+            The token used for padding, for example when batching sequences of different lengths.
+        cls_token (`str`, *optional*, defaults to `"[CLS]"`):
+            The classifier token which is used when doing sequence classification (classification of the whole sequence
+            instead of per-token classification). It is the first token of the sequence when built with special tokens.
+        mask_token (`str`, *optional*, defaults to `"[MASK]"`):
+            The token used for masking values. This is the token used when training this model with masked language
+            modeling. This is the token which the model will try to predict.
+        clean_text (`bool`, *optional*, defaults to `True`):
+            Whether or not to clean the text before tokenization by removing any control characters and replacing all
+            whitespaces by the classic one.
+        tokenize_chinese_chars (`bool`, *optional*, defaults to `True`):
+            Whether or not to tokenize Chinese characters. This should likely be deactivated for Japanese.
+        strip_accents (`bool`, *optional*):
+            Whether or not to strip all accents. If this option is not specified, then it will be determined by the
+            value for `lowercase` (as in the original BERT).
+        wordpieces_prefix (`str`, *optional*, defaults to `"##"`):
+            The prefix for subwords.
+    """
+
+    resource_files_names = VOCAB_FILES_NAMES
+    slow_tokenizer_class = BertTokenizer
+
+    def __init__(
+        self,
+        vocab_file=None,
+        tokenizer_file=None,
+        do_lower_case=True,
+        unk_token="[UNK]",
+        sep_token="[SEP]",
+        pad_token="[PAD]",
+        cls_token="[CLS]",
+        mask_token="[MASK]",
+        tokenize_chinese_chars=True,
+        strip_accents=None,
+        **kwargs,
+    ):
+        super().__init__(
+            vocab_file,
+            tokenizer_file=tokenizer_file,
+            do_lower_case=do_lower_case,
+            unk_token=unk_token,
+            sep_token=sep_token,
+            pad_token=pad_token,
+            cls_token=cls_token,
+            mask_token=mask_token,
+            tokenize_chinese_chars=tokenize_chinese_chars,
+            strip_accents=strip_accents,
+            **kwargs,
+        )
+
+        normalizer_state = json.loads(self.backend_tokenizer.normalizer.__getstate__())
+        if (
+            normalizer_state.get("lowercase", do_lower_case) != do_lower_case
+            or normalizer_state.get("strip_accents", strip_accents) != strip_accents
+            or normalizer_state.get("handle_chinese_chars", tokenize_chinese_chars) != tokenize_chinese_chars
+        ):
+            normalizer_class = getattr(normalizers, normalizer_state.pop("type"))
+            normalizer_state["lowercase"] = do_lower_case
+            normalizer_state["strip_accents"] = strip_accents
+            normalizer_state["handle_chinese_chars"] = tokenize_chinese_chars
+            self.backend_tokenizer.normalizer = normalizer_class(**normalizer_state)
+
+        self.do_lower_case = do_lower_case
+
+    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
+        """
+        Build model inputs from a sequence or a pair of sequences for sequence classification tasks by concatenating
+        and adding special tokens. A BERT sequence has the following format:
+
+        - single sequence: `[CLS] X [SEP]`
+        - pair of sequences: `[CLS] A [SEP] B [SEP]`
+
+        Args:
+            token_ids_0 (`List[int]`):
+                List of IDs to which the special tokens will be added.
+            token_ids_1 (`List[int]`, *optional*):
+                Optional second list of IDs for sequence pairs.
+
+        Returns:
+            `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
+        """
+        output = [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
+
+        if token_ids_1 is not None:
+            output += token_ids_1 + [self.sep_token_id]
+
+        return output
+
+    def create_token_type_ids_from_sequences(
+        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
+    ) -> List[int]:
+        """
+        Create a mask from the two sequences passed to be used in a sequence-pair classification task. A BERT sequence
+        pair mask has the following format:
+
+        ```
+        0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
+        | first sequence    | second sequence |
+        ```
+
+        If `token_ids_1` is `None`, this method only returns the first portion of the mask (0s).
+
+        Args:
+            token_ids_0 (`List[int]`):
+                List of IDs.
+            token_ids_1 (`List[int]`, *optional*):
+                Optional second list of IDs for sequence pairs.
+
+        Returns:
+            `List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given sequence(s).
+        """
+        sep = [self.sep_token_id]
+        cls = [self.cls_token_id]
+        if token_ids_1 is None:
+            return len(cls + token_ids_0 + sep) * [0]
+        return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1]
+
+    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
+        files = self._tokenizer.model.save(save_directory, name=filename_prefix)
+        return tuple(files)
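
As a quick illustration of the new class (a sketch, not part of the diff; it assumes `BertTokenizerFast` is exported from `paddlenlp.transformers` and that the `bert-base-uncased` checkpoint resolves to a usable vocab.txt/tokenizer.json, either bundled or converted on the fly via the converter added below):

from paddlenlp.transformers import BertTokenizerFast

tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")

# Pair encoding goes through build_inputs_with_special_tokens /
# create_token_type_ids_from_sequences defined above.
encoded = tokenizer("how are fast tokenizers built?", "with the tokenizers library")
print(encoded["input_ids"])       # [CLS] ... [SEP] ... [SEP]
print(encoded["token_type_ids"])  # 0s for the first segment, 1s for the second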

paddlenlp/transformers/convert_slow_tokenizer.py

Lines changed: 47 additions & 1 deletion
@@ -15,6 +15,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from collections import OrderedDict
 from typing import Dict, List, Optional, Tuple
 
 import tokenizers
@@ -28,7 +29,7 @@
     pre_tokenizers,
     processors,
 )
-from tokenizers.models import BPE, Unigram
+from tokenizers.models import BPE, Unigram, WordPiece
 
 from paddlenlp.utils.import_utils import (
     is_protobuf_available,
@@ -330,6 +331,50 @@ def converted(self) -> Tokenizer:
         return tokenizer
 
 
+class BertConverter(Converter):
+    def converted(self) -> Tokenizer:
+        vocab = self.original_tokenizer.vocab
+        tokenizer = Tokenizer(
+            WordPiece(
+                OrderedDict([(vocab._idx_to_token[i], i) for i in range(len(vocab))]),
+                unk_token=str(self.original_tokenizer.unk_token),
+            )
+        )
+
+        tokenize_chinese_chars = False
+        strip_accents = False
+        do_lower_case = False
+        if hasattr(self.original_tokenizer, "basic_tokenizer"):
+            tokenize_chinese_chars = self.original_tokenizer.basic_tokenizer.tokenize_chinese_chars
+            strip_accents = self.original_tokenizer.basic_tokenizer.strip_accents
+            do_lower_case = self.original_tokenizer.basic_tokenizer.do_lower_case
+
+        tokenizer.normalizer = normalizers.BertNormalizer(
+            clean_text=True,
+            handle_chinese_chars=tokenize_chinese_chars,
+            strip_accents=strip_accents,
+            lowercase=do_lower_case,
+        )
+        tokenizer.pre_tokenizer = pre_tokenizers.BertPreTokenizer()
+
+        cls = str(self.original_tokenizer.cls_token)
+        sep = str(self.original_tokenizer.sep_token)
+        cls_token_id = self.original_tokenizer.cls_token_id
+        sep_token_id = self.original_tokenizer.sep_token_id
+
+        tokenizer.post_processor = processors.TemplateProcessing(
+            single=f"{cls}:0 $A:0 {sep}:0",
+            pair=f"{cls}:0 $A:0 {sep}:0 $B:1 {sep}:1",
+            special_tokens=[
+                (cls, cls_token_id),
+                (sep, sep_token_id),
+            ],
+        )
+        tokenizer.decoder = decoders.WordPiece(prefix="##")
+
+        return tokenizer
+
+
 class LlamaConverter(SpmConverter):
     handle_byte_fallback = True
 
@@ -399,6 +444,7 @@ def pre_tokenizer(self, replacement, add_prefix_space):
 
 SLOW_TO_FAST_CONVERTERS = {
     "LlamaTokenizer": LlamaConverter,
+    "BertTokenizer": BertConverter,
 }
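
For orientation, a sketch of what the new converter produces. Only `BertConverter` and the `SLOW_TO_FAST_CONVERTERS` entry are part of the diff; the `Converter(slow_tokenizer)` constructor signature and the checkpoint name are assumptions here:

from paddlenlp.transformers import BertTokenizer
from paddlenlp.transformers.convert_slow_tokenizer import SLOW_TO_FAST_CONVERTERS

slow = BertTokenizer.from_pretrained("bert-base-uncased")

# Look up the converter registered for this slow tokenizer class ...
converter_cls = SLOW_TO_FAST_CONVERTERS[type(slow).__name__]  # -> BertConverter

# ... and build the `tokenizers.Tokenizer` backend: a WordPiece model plus
# BertNormalizer, BertPreTokenizer and a [CLS]/[SEP] template post-processor.
backend = converter_cls(slow).converted()
print(backend.encode("hello world").tokens)  # e.g. ['[CLS]', 'hello', 'world', '[SEP]']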

paddlenlp/transformers/tokenizer_utils_base.py

Lines changed: 3 additions & 0 deletions
@@ -1851,6 +1851,9 @@ def convert_added_tokens(obj: Union[AddedToken, Any], add_type_field=True):
 
         # Add tokenizer class to the tokenizer config to be able to reload it with from_pretrained
         tokenizer_class = self.__class__.__name__
+        # Remove the Fast at the end unless we have a special `PreTrainedTokenizerFast`
+        if tokenizer_class.endswith("Fast") and tokenizer_class != "PreTrainedTokenizerFast":
+            tokenizer_class = tokenizer_class[:-4]
         tokenizer_config["tokenizer_class"] = tokenizer_class
 
         with io.open(tokenizer_config_file, "w", encoding="utf-8") as f:
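
The practical effect, sketched below (the save directory and the tokenizer_config.json layout follow the usual `save_pretrained` behaviour and are assumptions here): a fast tokenizer now records the base class name, so `AutoTokenizer` can later reload either the slow or the fast variant from the same config.

import json

# fast_tokenizer: a hypothetical BertTokenizerFast instance
fast_tokenizer.save_pretrained("./bert_ckpt")
with open("./bert_ckpt/tokenizer_config.json") as f:
    print(json.load(f)["tokenizer_class"])  # "BertTokenizer" -- the trailing "Fast" is stripped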

paddlenlp/transformers/tokenizer_utils_fast.py

Lines changed: 1 addition & 1 deletion
@@ -750,7 +750,7 @@ def train_new_from_iterator(
                 Additional keyword arguments passed along to the trainer from the 🤗 Tokenizers library.
 
         Returns:
-            [`PreTrainedTokenizerFast`]: A new tokenizer of the same type as the original one, trained on
+            [`PretrainedTokenizerFast`]: A new tokenizer of the same type as the original one, trained on
             `text_iterator`.
 
         """

tests/transformers/auto/test_confiugration.py

Lines changed: 2 additions & 8 deletions
@@ -23,9 +23,10 @@
 from paddlenlp.transformers import AutoConfig
 from paddlenlp.transformers.auto.configuration import CONFIG_MAPPING
 from paddlenlp.transformers.bert.configuration import BertConfig
-from paddlenlp.transformers.configuration_utils import PretrainedConfig
 from paddlenlp.utils.env import CONFIG_NAME
 
+from ...utils.test_module.custom_configuration import CustomConfig
+
 
 class AutoConfigTest(unittest.TestCase):
     def test_built_in_model_class_config(self):
@@ -90,13 +91,6 @@ def test_load_from_legacy_config(self):
         self.assertEqual(auto_config.hidden_size, number)
 
     def test_new_config_registration(self):
-        class CustomConfig(PretrainedConfig):
-            model_type = "custom"
-
-            def __init__(self, attribute=1, **kwargs):
-                self.attribute = attribute
-                super().__init__(**kwargs)
-
         try:
             AutoConfig.register("custom", CustomConfig)
             # Wrong model type will raise an error
