10 changes: 5 additions & 5 deletions src/transformers/tokenization_utils.py
@@ -20,7 +20,7 @@
import logging
import re
import unicodedata
-from typing import Any, Dict, List, Optional, Tuple, Union
+from typing import Any, Dict, List, Optional, Set, Tuple, Union

from .file_utils import add_end_docstrings
from .tokenization_utils_base import (
@@ -118,7 +118,7 @@ def __init__(self, **kwargs):
# until the serialization of Fast tokenizers is updated
self.added_tokens_encoder: Dict[str, int] = {}
self.added_tokens_decoder: Dict[int, str] = {}
-self.unique_no_split_tokens: List[str] = []
+self.unique_no_split_tokens: Set[str] = set()

@property
def is_fast(self) -> bool:
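
A minimal standalone sketch (the names below are illustrative, not part of the diff) of why a set suits unique_no_split_tokens: duplicates collapse automatically, and membership checks are O(1) on average instead of O(n) for a list.

    no_split_tokens = set()
    no_split_tokens |= {"[CLS]", "[SEP]"}
    no_split_tokens |= {"[SEP]", "[MASK]"}  # re-adding "[SEP]" is a no-op
    assert no_split_tokens == {"[CLS]", "[SEP]", "[MASK]"}
    assert "[SEP]" in no_split_tokens       # average O(1) lookup vs O(n) on a list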
@@ -207,10 +207,10 @@ def _add_tokens(self, new_tokens: Union[List[str], List[AddedToken]], special_to

# Make sure we don't split on any special tokens (even if they were already in the vocab before, e.g. for Albert)
if special_tokens:
-self.unique_no_split_tokens = list(set(self.unique_no_split_tokens).union(set(new_tokens)))
+self.unique_no_split_tokens = set(self.unique_no_split_tokens).union(set(new_tokens))
else:
# Or on the newly added tokens
-self.unique_no_split_tokens = list(set(self.unique_no_split_tokens).union(set(tokens_to_add)))
+self.unique_no_split_tokens = set(self.unique_no_split_tokens).union(set(tokens_to_add))

return len(tokens_to_add)

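A short sketch of what the updated assignment does (the token strings are made up for illustration): set.union already returns a deduplicated set, so the previous list(...) round trip is no longer needed and the attribute stays a set across calls.

    existing = {"<unk>", "<pad>"}
    new_tokens = ["<extra_0>", "<pad>"]  # "<pad>" is already tracked
    updated = set(existing).union(set(new_tokens))
    assert updated == {"<unk>", "<pad>", "<extra_0>"}
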
@@ -347,7 +347,7 @@ def split_on_tokens(tok_list, text):
)

no_split_token = self.unique_no_split_tokens
-tokenized_text = split_on_tokens(no_split_token, text)
+tokenized_text = split_on_tokens(list(no_split_token), text)
return tokenized_text

def _tokenize(self, text, **kwargs):
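
A hedged end-to-end check of the behavior this hunk preserves, assuming the bert-base-uncased checkpoint can be loaded (the model name and added token are illustrative): the set is materialized with list(...) so split_on_tokens still receives a concrete sequence, and added tokens remain unsplit.

    from transformers import BertTokenizer

    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")  # slow (Python) tokenizer
    tokenizer.add_tokens(["<new_token>"])  # lands in unique_no_split_tokens, now a set
    print(tokenizer.tokenize("hello <new_token> world"))
    # expected: ['hello', '<new_token>', 'world']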