From b7dcfc0531273734e6319acdab1ea47be119712a Mon Sep 17 00:00:00 2001
From: Quentin Lhoest
Date: Thu, 13 Aug 2020 14:24:05 +0200
Subject: [PATCH 1/3] change unique_no_split_tokens's type to set

---
 src/transformers/tokenization_utils.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/src/transformers/tokenization_utils.py b/src/transformers/tokenization_utils.py
index cbe9b34beeff..851afd540c92 100644
--- a/src/transformers/tokenization_utils.py
+++ b/src/transformers/tokenization_utils.py
@@ -20,7 +20,7 @@
 import logging
 import re
 import unicodedata
-from typing import Any, Dict, List, Optional, Tuple, Union
+from typing import Any, Dict, List, Optional, Set, Tuple, Union
 
 from .file_utils import add_end_docstrings
 from .tokenization_utils_base import (
@@ -118,7 +118,7 @@ def __init__(self, **kwargs):
         # until the serialization of Fast tokenizers is updated
         self.added_tokens_encoder: Dict[str, int] = {}
         self.added_tokens_decoder: Dict[int, str] = {}
-        self.unique_no_split_tokens: List[str] = []
+        self.unique_no_split_tokens: Set[str] = set()
 
     @property
     def is_fast(self) -> bool:
@@ -207,10 +207,10 @@ def _add_tokens(self, new_tokens: Union[List[str], List[AddedToken]], special_to
 
         # Make sure we don't split on any special tokens (even they were already in the vocab before e.g. for Albert)
         if special_tokens:
-            self.unique_no_split_tokens = list(set(self.unique_no_split_tokens).union(set(new_tokens)))
+            self.unique_no_split_tokens = set(self.unique_no_split_tokens).union(set(new_tokens))
         else:
             # Or on the newly added tokens
-            self.unique_no_split_tokens = list(set(self.unique_no_split_tokens).union(set(tokens_to_add)))
+            self.unique_no_split_tokens = set(self.unique_no_split_tokens).union(set(tokens_to_add))
 
         return len(tokens_to_add)
 
@@ -347,7 +347,7 @@ def split_on_tokens(tok_list, text):
            )
 
         no_split_token = self.unique_no_split_tokens
-        tokenized_text = split_on_tokens(no_split_token, text)
+        tokenized_text = split_on_tokens(list(no_split_token), text)
         return tokenized_text
 
     def _tokenize(self, text, **kwargs):

From b3aa1391068869710d41d36ae1c7b5493d72dcfd Mon Sep 17 00:00:00 2001
From: Quentin Lhoest
Date: Thu, 13 Aug 2020 15:08:06 +0200
Subject: [PATCH 2/3] use sorted list instead of set

---
 src/transformers/tokenization_utils.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/transformers/tokenization_utils.py b/src/transformers/tokenization_utils.py
index 851afd540c92..b376d3a295fe 100644
--- a/src/transformers/tokenization_utils.py
+++ b/src/transformers/tokenization_utils.py
@@ -118,7 +118,7 @@ def __init__(self, **kwargs):
         # until the serialization of Fast tokenizers is updated
         self.added_tokens_encoder: Dict[str, int] = {}
         self.added_tokens_decoder: Dict[int, str] = {}
-        self.unique_no_split_tokens: Set[str] = set()
+        self.unique_no_split_tokens: List[str] = []
 
     @property
     def is_fast(self) -> bool:
@@ -207,10 +207,10 @@ def _add_tokens(self, new_tokens: Union[List[str], List[AddedToken]], special_to
 
         # Make sure we don't split on any special tokens (even they were already in the vocab before e.g. for Albert)
         if special_tokens:
-            self.unique_no_split_tokens = set(self.unique_no_split_tokens).union(set(new_tokens))
+            self.unique_no_split_tokens = sorted(set(self.unique_no_split_tokens).union(set(new_tokens)))
         else:
             # Or on the newly added tokens
-            self.unique_no_split_tokens = set(self.unique_no_split_tokens).union(set(tokens_to_add))
+            self.unique_no_split_tokens = sorted(set(self.unique_no_split_tokens).union(set(tokens_to_add)))
 
         return len(tokens_to_add)
 
@@ -347,7 +347,7 @@ def split_on_tokens(tok_list, text):
            )
 
         no_split_token = self.unique_no_split_tokens
-        tokenized_text = split_on_tokens(list(no_split_token), text)
+        tokenized_text = split_on_tokens(no_split_token, text)
         return tokenized_text
 
     def _tokenize(self, text, **kwargs):

From dfb7549b71cc0affaf2550fd6156017ebeea9010 Mon Sep 17 00:00:00 2001
From: Quentin Lhoest
Date: Thu, 13 Aug 2020 15:12:24 +0200
Subject: [PATCH 3/3] style

---
 src/transformers/tokenization_utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/transformers/tokenization_utils.py b/src/transformers/tokenization_utils.py
index b376d3a295fe..3121980c0d5b 100644
--- a/src/transformers/tokenization_utils.py
+++ b/src/transformers/tokenization_utils.py
@@ -20,7 +20,7 @@
 import logging
 import re
 import unicodedata
-from typing import Any, Dict, List, Optional, Set, Tuple, Union
+from typing import Any, Dict, List, Optional, Tuple, Union
 
 from .file_utils import add_end_docstrings
 from .tokenization_utils_base import (
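
Note, not part of the patch series: a minimal standalone sketch of the pattern the final revision settles on, using made-up token strings. Iteration order of a set of strings depends on Python's hash randomization and can differ between processes, so sorting the deduplicated union is what makes the resulting no-split token list deterministic across runs.

    # Illustration only; token strings are hypothetical.
    special_tokens = ["<mask>", "<pad>"]
    new_tokens = ["[ENT]", "[REL]"]

    # Pattern used after patch 2/3: deduplicate via set union, then sort
    # so the order is stable regardless of hash randomization.
    unique_no_split_tokens = sorted(set(special_tokens).union(new_tokens))
    print(unique_no_split_tokens)  # ['<mask>', '<pad>', '[ENT]', '[REL]']

    # Repeated additions keep the collection deduplicated and in a stable order.
    unique_no_split_tokens = sorted(set(unique_no_split_tokens).union(["<cls>"]))
    print(unique_no_split_tokens)  # ['<cls>', '<mask>', '<pad>', '[ENT]', '[REL]']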