From b7dcfc0531273734e6319acdab1ea47be119712a Mon Sep 17 00:00:00 2001
From: Quentin Lhoest
Date: Thu, 13 Aug 2020 14:24:05 +0200
Subject: [PATCH 1/3] change unique_no_split_tokens's type to set

---
 src/transformers/tokenization_utils.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/src/transformers/tokenization_utils.py b/src/transformers/tokenization_utils.py
index cbe9b34beeff..851afd540c92 100644
--- a/src/transformers/tokenization_utils.py
+++ b/src/transformers/tokenization_utils.py
@@ -20,7 +20,7 @@
 import logging
 import re
 import unicodedata
-from typing import Any, Dict, List, Optional, Tuple, Union
+from typing import Any, Dict, List, Optional, Set, Tuple, Union
 
 from .file_utils import add_end_docstrings
 from .tokenization_utils_base import (
@@ -118,7 +118,7 @@ def __init__(self, **kwargs):
         # until the serialization of Fast tokenizers is updated
         self.added_tokens_encoder: Dict[str, int] = {}
         self.added_tokens_decoder: Dict[int, str] = {}
-        self.unique_no_split_tokens: List[str] = []
+        self.unique_no_split_tokens: Set[str] = set()
 
     @property
     def is_fast(self) -> bool:
@@ -207,10 +207,10 @@ def _add_tokens(self, new_tokens: Union[List[str], List[AddedToken]], special_to
 
         # Make sure we don't split on any special tokens (even they were already in the vocab before e.g. for Albert)
         if special_tokens:
-            self.unique_no_split_tokens = list(set(self.unique_no_split_tokens).union(set(new_tokens)))
+            self.unique_no_split_tokens = set(self.unique_no_split_tokens).union(set(new_tokens))
         else:
             # Or on the newly added tokens
-            self.unique_no_split_tokens = list(set(self.unique_no_split_tokens).union(set(tokens_to_add)))
+            self.unique_no_split_tokens = set(self.unique_no_split_tokens).union(set(tokens_to_add))
 
         return len(tokens_to_add)
 
@@ -347,7 +347,7 @@ def split_on_tokens(tok_list, text):
            )
 
         no_split_token = self.unique_no_split_tokens
-        tokenized_text = split_on_tokens(no_split_token, text)
+        tokenized_text = split_on_tokens(list(no_split_token), text)
         return tokenized_text
 
     def _tokenize(self, text, **kwargs):

From b3aa1391068869710d41d36ae1c7b5493d72dcfd Mon Sep 17 00:00:00 2001
From: Quentin Lhoest
Date: Thu, 13 Aug 2020 15:08:06 +0200
Subject: [PATCH 2/3] use sorted list instead of set

---
 src/transformers/tokenization_utils.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/transformers/tokenization_utils.py b/src/transformers/tokenization_utils.py
index 851afd540c92..b376d3a295fe 100644
--- a/src/transformers/tokenization_utils.py
+++ b/src/transformers/tokenization_utils.py
@@ -118,7 +118,7 @@ def __init__(self, **kwargs):
         # until the serialization of Fast tokenizers is updated
         self.added_tokens_encoder: Dict[str, int] = {}
         self.added_tokens_decoder: Dict[int, str] = {}
-        self.unique_no_split_tokens: Set[str] = set()
+        self.unique_no_split_tokens: List[str] = []
 
     @property
     def is_fast(self) -> bool:
@@ -207,10 +207,10 @@ def _add_tokens(self, new_tokens: Union[List[str], List[AddedToken]], special_to
 
         # Make sure we don't split on any special tokens (even they were already in the vocab before e.g. for Albert)
         if special_tokens:
-            self.unique_no_split_tokens = set(self.unique_no_split_tokens).union(set(new_tokens))
+            self.unique_no_split_tokens = sorted(set(self.unique_no_split_tokens).union(set(new_tokens)))
         else:
             # Or on the newly added tokens
-            self.unique_no_split_tokens = set(self.unique_no_split_tokens).union(set(tokens_to_add))
+            self.unique_no_split_tokens = sorted(set(self.unique_no_split_tokens).union(set(tokens_to_add)))
 
         return len(tokens_to_add)
 
@@ -347,7 +347,7 @@ def split_on_tokens(tok_list, text):
            )
 
         no_split_token = self.unique_no_split_tokens
-        tokenized_text = split_on_tokens(list(no_split_token), text)
+        tokenized_text = split_on_tokens(no_split_token, text)
         return tokenized_text
 
     def _tokenize(self, text, **kwargs):

From dfb7549b71cc0affaf2550fd6156017ebeea9010 Mon Sep 17 00:00:00 2001
From: Quentin Lhoest
Date: Thu, 13 Aug 2020 15:12:24 +0200
Subject: [PATCH 3/3] style

---
 src/transformers/tokenization_utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/transformers/tokenization_utils.py b/src/transformers/tokenization_utils.py
index b376d3a295fe..3121980c0d5b 100644
--- a/src/transformers/tokenization_utils.py
+++ b/src/transformers/tokenization_utils.py
@@ -20,7 +20,7 @@
 import logging
 import re
 import unicodedata
-from typing import Any, Dict, List, Optional, Set, Tuple, Union
+from typing import Any, Dict, List, Optional, Tuple, Union
 
 from .file_utils import add_end_docstrings
 from .tokenization_utils_base import (
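
Note, not part of the patch series: a minimal standalone sketch of the pattern the final revision settles on, using made-up token strings. Iteration order of a set of strings depends on Python's hash randomization and can differ between processes, so sorting the deduplicated union is what makes the resulting no-split token list deterministic across runs.

    # Illustration only; token strings are hypothetical.
    special_tokens = ["<mask>", "<pad>"]
    new_tokens = ["[ENT]", "[REL]"]

    # Pattern used after patch 2/3: deduplicate via set union, then sort
    # so the order is stable regardless of hash randomization.
    unique_no_split_tokens = sorted(set(special_tokens).union(new_tokens))
    print(unique_no_split_tokens)  # ['<mask>', '<pad>', '[ENT]', '[REL]']

    # Repeated additions keep the collection deduplicated and in a stable order.
    unique_no_split_tokens = sorted(set(unique_no_split_tokens).union(["<cls>"]))
    print(unique_no_split_tokens)  # ['<cls>', '<mask>', '<pad>', '[ENT]', '[REL]']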