From 4bc3b3f6cb886324a79a0951539048c512f9ae7e Mon Sep 17 00:00:00 2001 From: Thomas Wolf Date: Mon, 26 Oct 2020 23:11:51 +0100 Subject: [PATCH 01/24] Fixing roberta for slow-fast tests --- src/transformers/tokenization_roberta_fast.py | 27 ++++++++ src/transformers/tokenization_utils_base.py | 4 +- tests/test_pipelines_common.py | 67 ++++++++++++++++++- 3 files changed, 96 insertions(+), 2 deletions(-) diff --git a/src/transformers/tokenization_roberta_fast.py b/src/transformers/tokenization_roberta_fast.py index 3709aec944fe..c94891542e01 100644 --- a/src/transformers/tokenization_roberta_fast.py +++ b/src/transformers/tokenization_roberta_fast.py @@ -18,6 +18,7 @@ from .tokenization_gpt2_fast import GPT2TokenizerFast from .tokenization_roberta import RobertaTokenizer +from .tokenization_utils_base import AddedToken from .utils import logging @@ -172,6 +173,32 @@ def __init__( **kwargs, ) + @property + def mask_token(self) -> str: + """ + :obj:`str`: Mask token, to use when training a model with masked-language modeling. Log an error if used while + not having been set. + + Roberta tokenizer has a special mask token to be usble in the fill-mask pipeline. + The mask token will greedily comprise the space before the ``. + """ + if self._mask_token is None and self.verbose: + logger.error("Using mask_token, but it is not set yet.") + return None + return str(self._mask_token) + + @mask_token.setter + def mask_token(self, value): + """ Overriding the default behavior of the mask token to have it eat the space before it. + + This is needed to preserve backward compatibility with all the previously used models + based on Roberta. + """ + # Mask token behave like a normal word, i.e. include the space before it + # So we set lstrip to True + value = AddedToken(value, lstrip=True, rstrip=False) if isinstance(value, str) else value + self._mask_token = value + def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None): output = [self.bos_token_id] + token_ids_0 + [self.eos_token_id] if token_ids_1 is None: diff --git a/src/transformers/tokenization_utils_base.py b/src/transformers/tokenization_utils_base.py index 0622e78c8b3a..7e7b10c5096f 100644 --- a/src/transformers/tokenization_utils_base.py +++ b/src/transformers/tokenization_utils_base.py @@ -180,7 +180,9 @@ def to_py_obj(obj): """ Convert a TensorFlow tensor, PyTorch tensor, Numpy array or python list to a python list. 
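The hunk that follows extends `to_py_obj` so that dict-like objects (including `BatchEncoding`) are converted recursively; the new slow/fast comparison tests rely on this when they normalize the captured model inputs before comparing them. A minimal, self-contained sketch of the same recursion — using plain dicts and NumPy arrays for illustration, whereas the real helper also handles PyTorch and TensorFlow tensors — could look like this:

```python
from typing import Any

import numpy as np


def to_py_obj_sketch(obj: Any) -> Any:
    """Recursively convert dicts, lists/tuples and NumPy arrays to plain Python objects."""
    if isinstance(obj, dict):  # the real code also matches BatchEncoding here
        return {k: to_py_obj_sketch(v) for k, v in obj.items()}
    if isinstance(obj, (list, tuple)):
        return [to_py_obj_sketch(o) for o in obj]
    if isinstance(obj, np.ndarray):
        return obj.tolist()
    return obj


batch = {"input_ids": np.array([[0, 31414, 2]]), "attention_mask": [np.array([1, 1, 1])]}
print(to_py_obj_sketch(batch))
# {'input_ids': [[0, 31414, 2]], 'attention_mask': [[1, 1, 1]]}
```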
""" - if isinstance(obj, (list, tuple)): + if isinstance(obj, (dict, BatchEncoding)): + return {k: to_py_obj(v) for k, v in obj.items()} + elif isinstance(obj, (list, tuple)): return [to_py_obj(o) for o in obj] elif is_tf_available() and isinstance(obj, tf.Tensor): return obj.numpy().tolist() diff --git a/tests/test_pipelines_common.py b/tests/test_pipelines_common.py index d6acea2da6cc..37a348449554 100644 --- a/tests/test_pipelines_common.py +++ b/tests/test_pipelines_common.py @@ -1,7 +1,9 @@ import unittest +from unittest import mock from typing import List, Optional from transformers import is_tf_available, is_torch_available, pipeline +from transformers.tokenization_utils_base import to_py_obj from transformers.pipelines import DefaultArgumentHandler, Pipeline from transformers.testing_utils import _run_slow_tests, is_pipeline_test, require_tf, require_torch, slow @@ -83,7 +85,7 @@ def _test_pipeline(self, nlp: Pipeline): raise NotImplementedError -@is_pipeline_test +# @is_pipeline_test class MonoInputPipelineCommonMixin: pipeline_task = None pipeline_loading_kwargs = {} # Additional kwargs to load the pipeline with @@ -139,6 +141,69 @@ def test_tf_small(self): ) self._test_pipeline(nlp) + @require_torch + def test_compare_slow_fast_torch(self): + for model_name in self.small_models: + nlp_slow = pipeline( + task=self.pipeline_task, + model=model_name, + tokenizer=model_name, + framework="pt", + use_fast=False, + **self.pipeline_loading_kwargs, + ) + nlp_fast = pipeline( + task=self.pipeline_task, + model=model_name, + tokenizer=model_name, + framework="pt", + use_fast=True, + **self.pipeline_loading_kwargs, + ) + self._compare_slow_fast_pipelines(nlp_slow, nlp_fast) + + @require_tf + def test_compare_slow_fast_tf(self): + for model_name in self.small_models: + nlp_slow = pipeline( + task=self.pipeline_task, + model=model_name, + tokenizer=model_name, + framework="tf", + use_fast=False, + **self.pipeline_loading_kwargs, + ) + nlp_fast = pipeline( + task=self.pipeline_task, + model=model_name, + tokenizer=model_name, + framework="tf", + use_fast=True, + **self.pipeline_loading_kwargs, + ) + self._compare_slow_fast_pipelines(nlp_slow, nlp_fast) + + def _compare_slow_fast_pipelines(self, nlp_slow: Pipeline, nlp_fast: Pipeline): + with mock.patch.object(nlp_slow.model, 'forward', wraps=nlp_slow.model.forward) as mock_slow,\ + mock.patch.object(nlp_fast.model, 'forward', wraps=nlp_fast.model.forward) as mock_fast: + for inputs in self.valid_inputs: + outputs_slow = nlp_slow(inputs, **self.pipeline_running_kwargs) + outputs_fast = nlp_fast(inputs, **self.pipeline_running_kwargs) + + mock_slow.assert_called() + mock_fast.assert_called() + + slow_call_args, slow_call_kwargs = mock_slow.call_args + fast_call_args, fast_call_kwargs = mock_fast.call_args + + slow_call_args, slow_call_kwargs = to_py_obj(slow_call_args), to_py_obj(slow_call_kwargs) + fast_call_args, fast_call_kwargs = to_py_obj(fast_call_args), to_py_obj(fast_call_kwargs) + + self.assertEqual(slow_call_args, fast_call_args) + self.assertDictEqual(slow_call_kwargs, fast_call_kwargs) + + self.assertEqual(outputs_slow, outputs_fast) + @require_torch @slow def test_torch_large(self): From 1ce1c638350158f5ebeb2130b1f3a4f550252f32 Mon Sep 17 00:00:00 2001 From: Thomas Wolf Date: Mon, 26 Oct 2020 23:37:09 +0100 Subject: [PATCH 02/24] WIP getting equivalence on pipelines --- src/transformers/tokenization_roberta_fast.py | 6 +- tests/test_pipelines_common.py | 166 +++++++----------- tests/test_pipelines_dialog.py | 1 + 
tests/test_pipelines_zero_shot.py | 12 ++ 4 files changed, 82 insertions(+), 103 deletions(-) diff --git a/src/transformers/tokenization_roberta_fast.py b/src/transformers/tokenization_roberta_fast.py index c94891542e01..02b779bf8281 100644 --- a/src/transformers/tokenization_roberta_fast.py +++ b/src/transformers/tokenization_roberta_fast.py @@ -189,10 +189,10 @@ def mask_token(self) -> str: @mask_token.setter def mask_token(self, value): - """ Overriding the default behavior of the mask token to have it eat the space before it. + """Overriding the default behavior of the mask token to have it eat the space before it. - This is needed to preserve backward compatibility with all the previously used models - based on Roberta. + This is needed to preserve backward compatibility with all the previously used models + based on Roberta. """ # Mask token behave like a normal word, i.e. include the space before it # So we set lstrip to True diff --git a/tests/test_pipelines_common.py b/tests/test_pipelines_common.py index 37a348449554..5a6713aaedf8 100644 --- a/tests/test_pipelines_common.py +++ b/tests/test_pipelines_common.py @@ -1,22 +1,24 @@ import unittest -from unittest import mock from typing import List, Optional +from unittest import mock from transformers import is_tf_available, is_torch_available, pipeline -from transformers.tokenization_utils_base import to_py_obj from transformers.pipelines import DefaultArgumentHandler, Pipeline from transformers.testing_utils import _run_slow_tests, is_pipeline_test, require_tf, require_torch, slow +from transformers.tokenization_utils_base import to_py_obj VALID_INPUTS = ["A simple string", ["list of strings"]] -@is_pipeline_test +# @is_pipeline_test class CustomInputPipelineCommonMixin: pipeline_task = None - pipeline_loading_kwargs = {} - small_models = None # Models tested without the @slow decorator - large_models = None # Models tested with the @slow decorator + pipeline_loading_kwargs = {} # Additional kwargs to load the pipeline with + pipeline_running_kwargs = {} # Additional kwargs to run the pipeline with + small_models = [] # Models tested without the @slow decorator + large_models = [] # Models tested with the @slow decorator + valid_inputs = VALID_INPUTS # Some inputs which are valid to compare fast and slow tokenizers def setUp(self) -> None: if not is_tf_available() and not is_torch_available(): @@ -48,78 +50,41 @@ def setUp(self) -> None: @require_torch @slow def test_pt_defaults(self): - pipeline(self.pipeline_task, framework="pt") + pipeline(self.pipeline_task, framework="pt", **self.pipeline_loading_kwargs) @require_tf @slow def test_tf_defaults(self): - pipeline(self.pipeline_task, framework="tf") + pipeline(self.pipeline_task, framework="tf", **self.pipeline_loading_kwargs) @require_torch def test_torch_small(self): for model_name in self.small_models: - nlp = pipeline(task=self.pipeline_task, model=model_name, tokenizer=model_name, framework="pt") + nlp = pipeline( + task=self.pipeline_task, + model=model_name, + tokenizer=model_name, + framework="pt", + **self.pipeline_loading_kwargs, + ) self._test_pipeline(nlp) @require_tf def test_tf_small(self): for model_name in self.small_models: - nlp = pipeline(task=self.pipeline_task, model=model_name, tokenizer=model_name, framework="tf") + nlp = pipeline( + task=self.pipeline_task, + model=model_name, + tokenizer=model_name, + framework="tf", + **self.pipeline_loading_kwargs, + ) self._test_pipeline(nlp) @require_torch @slow def test_torch_large(self): for model_name in 
self.large_models: - nlp = pipeline(task=self.pipeline_task, model=model_name, tokenizer=model_name, framework="pt") - self._test_pipeline(nlp) - - @require_tf - @slow - def test_tf_large(self): - for model_name in self.large_models: - nlp = pipeline(task=self.pipeline_task, model=model_name, tokenizer=model_name, framework="tf") - self._test_pipeline(nlp) - - def _test_pipeline(self, nlp: Pipeline): - raise NotImplementedError - - -# @is_pipeline_test -class MonoInputPipelineCommonMixin: - pipeline_task = None - pipeline_loading_kwargs = {} # Additional kwargs to load the pipeline with - pipeline_running_kwargs = {} # Additional kwargs to run the pipeline with - small_models = [] # Models tested without the @slow decorator - large_models = [] # Models tested with the @slow decorator - mandatory_keys = {} # Keys which should be in the output - valid_inputs = VALID_INPUTS # inputs which are valid - invalid_inputs = [None] # inputs which are not allowed - expected_multi_result: Optional[List] = None - expected_check_keys: Optional[List[str]] = None - - def setUp(self) -> None: - if not is_tf_available() and not is_torch_available(): - return # Currently no JAX pipelines - - for model_name in self.small_models: - pipeline(self.pipeline_task, model=model_name, tokenizer=model_name, **self.pipeline_loading_kwargs) - for model_name in self.large_models: - pipeline(self.pipeline_task, model=model_name, tokenizer=model_name, **self.pipeline_loading_kwargs) - - @require_torch - @slow - def test_pt_defaults_loads(self): - pipeline(self.pipeline_task, framework="pt", **self.pipeline_loading_kwargs) - - @require_tf - @slow - def test_tf_defaults_loads(self): - pipeline(self.pipeline_task, framework="tf", **self.pipeline_loading_kwargs) - - @require_torch - def test_torch_small(self): - for model_name in self.small_models: nlp = pipeline( task=self.pipeline_task, model=model_name, @@ -130,8 +95,9 @@ def test_torch_small(self): self._test_pipeline(nlp) @require_tf - def test_tf_small(self): - for model_name in self.small_models: + @slow + def test_tf_large(self): + for model_name in self.large_models: nlp = pipeline( task=self.pipeline_task, model=model_name, @@ -141,6 +107,9 @@ def test_tf_small(self): ) self._test_pipeline(nlp) + def _test_pipeline(self, nlp: Pipeline): + raise NotImplementedError + @require_torch def test_compare_slow_fast_torch(self): for model_name in self.small_models: @@ -160,7 +129,7 @@ def test_compare_slow_fast_torch(self): use_fast=True, **self.pipeline_loading_kwargs, ) - self._compare_slow_fast_pipelines(nlp_slow, nlp_fast) + self._compare_slow_fast_pipelines(nlp_slow, nlp_fast, method="forward") @require_tf def test_compare_slow_fast_tf(self): @@ -181,54 +150,51 @@ def test_compare_slow_fast_tf(self): use_fast=True, **self.pipeline_loading_kwargs, ) - self._compare_slow_fast_pipelines(nlp_slow, nlp_fast) - - def _compare_slow_fast_pipelines(self, nlp_slow: Pipeline, nlp_fast: Pipeline): - with mock.patch.object(nlp_slow.model, 'forward', wraps=nlp_slow.model.forward) as mock_slow,\ - mock.patch.object(nlp_fast.model, 'forward', wraps=nlp_fast.model.forward) as mock_fast: + self._compare_slow_fast_pipelines(nlp_slow, nlp_fast, method="call") + + def _compare_slow_fast_pipelines(self, nlp_slow: Pipeline, nlp_fast: Pipeline, method: str): + """We check that the inputs to the models forward passes are identical for + slow and fast tokenizers. 
+ """ + with mock.patch.object( + nlp_slow.model, method, wraps=getattr(nlp_slow.model, method) + ) as mock_slow, mock.patch.object(nlp_fast.model, method, wraps=getattr(nlp_fast.model, method)) as mock_fast: for inputs in self.valid_inputs: - outputs_slow = nlp_slow(inputs, **self.pipeline_running_kwargs) - outputs_fast = nlp_fast(inputs, **self.pipeline_running_kwargs) + if isinstance(inputs, dict): + inputs.update(self.pipeline_running_kwargs) + _ = nlp_slow(**inputs) + _ = nlp_fast(**inputs) + else: + _ = nlp_slow(inputs, **self.pipeline_running_kwargs) + _ = nlp_fast(inputs, **self.pipeline_running_kwargs) mock_slow.assert_called() mock_fast.assert_called() - slow_call_args, slow_call_kwargs = mock_slow.call_args - fast_call_args, fast_call_kwargs = mock_fast.call_args + self.assertEqual(len(mock_slow.call_args_list), len(mock_fast.call_args_list)) + for mock_slow_call_args, mock_fast_call_args in zip( + mock_slow.call_args_list, mock_slow.call_args_list + ): + slow_call_args, slow_call_kwargs = mock_slow_call_args + fast_call_args, fast_call_kwargs = mock_fast_call_args - slow_call_args, slow_call_kwargs = to_py_obj(slow_call_args), to_py_obj(slow_call_kwargs) - fast_call_args, fast_call_kwargs = to_py_obj(fast_call_args), to_py_obj(fast_call_kwargs) + slow_call_args, slow_call_kwargs = to_py_obj(slow_call_args), to_py_obj(slow_call_kwargs) + fast_call_args, fast_call_kwargs = to_py_obj(fast_call_args), to_py_obj(fast_call_kwargs) - self.assertEqual(slow_call_args, fast_call_args) - self.assertDictEqual(slow_call_kwargs, fast_call_kwargs) + self.assertEqual(slow_call_args, fast_call_args) + self.assertDictEqual(slow_call_kwargs, fast_call_kwargs) - self.assertEqual(outputs_slow, outputs_fast) - @require_torch - @slow - def test_torch_large(self): - for model_name in self.large_models: - nlp = pipeline( - task=self.pipeline_task, - model=model_name, - tokenizer=model_name, - framework="pt", - **self.pipeline_loading_kwargs, - ) - self._test_pipeline(nlp) +@is_pipeline_test +class MonoInputPipelineCommonMixin(CustomInputPipelineCommonMixin): + """A version of the CustomInputPipelineCommonMixin + with a predefined `_test_pipeline` method. 
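The `_compare_slow_fast_pipelines` helper above leans on `unittest.mock.patch.object(..., wraps=...)`, which leaves the wrapped method fully functional while recording every call, so the inputs reaching the slow-tokenizer and fast-tokenizer models can be compared afterwards. A self-contained sketch of that spying pattern, with a stand-in class instead of a real model:

```python
from unittest import mock


class Adder:
    def forward(self, x, scale=1):
        return x * scale


model = Adder()

# `wraps=` keeps the original behaviour but records every call on the mock.
with mock.patch.object(model, "forward", wraps=model.forward) as spy:
    model.forward(3, scale=2)
    model.forward(5)

spy.assert_called()
print(len(spy.call_args_list))        # 2
args, kwargs = spy.call_args_list[0]  # each entry unpacks into (args, kwargs)
print(args, kwargs)                   # (3,) {'scale': 2}
```

Because `wraps` delegates to the real bound method, the pipelines still produce their normal outputs while the mocks accumulate `call_args_list` for the equality assertions.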
+ """ - @require_tf - @slow - def test_tf_large(self): - for model_name in self.large_models: - nlp = pipeline( - task=self.pipeline_task, - model=model_name, - tokenizer=model_name, - framework="tf", - **self.pipeline_loading_kwargs, - ) - self._test_pipeline(nlp) + mandatory_keys = {} # Keys which should be in the output + invalid_inputs = [None] # inputs which are not allowed + expected_multi_result: Optional[List] = None + expected_check_keys: Optional[List[str]] = None def _test_pipeline(self, nlp: Pipeline): self.assertIsNotNone(nlp) diff --git a/tests/test_pipelines_dialog.py b/tests/test_pipelines_dialog.py index 751d4b2b3e5f..9413441378d2 100644 --- a/tests/test_pipelines_dialog.py +++ b/tests/test_pipelines_dialog.py @@ -9,6 +9,7 @@ class DialoguePipelineTests(CustomInputPipelineCommonMixin, unittest.TestCase): pipeline_task = "conversational" small_models = [] # Default model - Models tested without the @slow decorator large_models = ["microsoft/DialoGPT-medium"] # Models tested with the @slow decorator + valid_inputs = [Conversation("Hi there!"), [Conversation("Hi there!"), Conversation("How are you?")]] def _test_pipeline(self, nlp: Pipeline): valid_inputs = [Conversation("Hi there!"), [Conversation("Hi there!"), Conversation("How are you?")]] diff --git a/tests/test_pipelines_zero_shot.py b/tests/test_pipelines_zero_shot.py index 42adfc27ced0..25db16cb96db 100644 --- a/tests/test_pipelines_zero_shot.py +++ b/tests/test_pipelines_zero_shot.py @@ -11,6 +11,18 @@ class ZeroShotClassificationPipelineTests(CustomInputPipelineCommonMixin, unitte "sshleifer/tiny-distilbert-base-uncased-finetuned-sst-2-english" ] # Models tested without the @slow decorator large_models = ["roberta-large-mnli"] # Models tested with the @slow decorator + valid_inputs = [ + {"sequences": "Who are you voting for in 2020?", "candidate_labels": "politics"}, + {"sequences": "Who are you voting for in 2020?", "candidate_labels": ["politics"]}, + {"sequences": "Who are you voting for in 2020?", "candidate_labels": "politics, public health"}, + {"sequences": "Who are you voting for in 2020?", "candidate_labels": ["politics", "public health"]}, + {"sequences": ["Who are you voting for in 2020?"], "candidate_labels": "politics"}, + { + "sequences": "Who are you voting for in 2020?", + "candidate_labels": "politics", + "hypothesis_template": "This text is about {}", + }, + ] def _test_scores_sum_to_one(self, result): sum = 0.0 From 15350e80d14c9fa91939ca6eac9ef2976eb1607d Mon Sep 17 00:00:00 2001 From: Thomas Wolf Date: Tue, 27 Oct 2020 16:18:05 +0100 Subject: [PATCH 03/24] slow-to-fast equivalence - working on question-answering pipeline --- src/transformers/data/processors/squad.py | 45 ++++-- src/transformers/pipelines.py | 135 ++++++++++++++---- src/transformers/tokenization_roberta_fast.py | 10 +- tests/test_pipelines_conversational.py | 2 +- tests/test_pipelines_dialog.py | 30 ---- tests/test_pipelines_question_answering.py | 12 ++ tests/test_tokenization_common.py | 18 +++ 7 files changed, 175 insertions(+), 77 deletions(-) delete mode 100644 tests/test_pipelines_dialog.py diff --git a/src/transformers/data/processors/squad.py b/src/transformers/data/processors/squad.py index 41daa06e99e2..4f928e690969 100644 --- a/src/transformers/data/processors/squad.py +++ b/src/transformers/data/processors/squad.py @@ -8,7 +8,7 @@ from ...file_utils import is_tf_available, is_torch_available from ...tokenization_bert import whitespace_tokenize -from ...tokenization_utils_base import PreTrainedTokenizerBase, 
TruncationStrategy +from ...tokenization_utils_base import BatchEncoding, PreTrainedTokenizerBase, TruncationStrategy from ...utils import logging from .utils import DataProcessor @@ -350,24 +350,39 @@ def squad_convert_examples_to_features( # Defining helper methods features = [] - threads = min(threads, cpu_count()) - with Pool(threads, initializer=squad_convert_example_to_features_init, initargs=(tokenizer,)) as p: - annotate_ = partial( - squad_convert_example_to_features, + ################# + squad_convert_example_to_features_init(tokenizer) + for example in examples: + feature = squad_convert_example_to_features( + example, max_seq_length=max_seq_length, doc_stride=doc_stride, max_query_length=max_query_length, padding_strategy=padding_strategy, is_training=is_training, ) - features = list( - tqdm( - p.imap(annotate_, examples, chunksize=32), - total=len(examples), - desc="convert squad examples to features", - disable=not tqdm_enabled, - ) - ) + features.append(feature) + + ################# + # threads = min(threads, cpu_count()) + # with Pool(threads, initializer=squad_convert_example_to_features_init, initargs=(tokenizer,)) as p: + # annotate_ = partial( + # squad_convert_example_to_features, + # max_seq_length=max_seq_length, + # doc_stride=doc_stride, + # max_query_length=max_query_length, + # padding_strategy=padding_strategy, + # is_training=is_training, + # ) + # features = list( + # tqdm( + # p.imap(annotate_, examples, chunksize=32), + # total=len(examples), + # desc="convert squad examples to features", + # disable=not tqdm_enabled, + # ) + # ) + ################# new_features = [] unique_id = 1000000000 @@ -765,6 +780,7 @@ class SquadFeatures: token_to_orig_map: mapping between the tokens and the original text, needed in order to identify the answer. start_position: start of the answer token index end_position: end of the answer token index + encoding: optionally store the BatchEncoding with the fast-tokenizer alignement methods. """ def __init__( @@ -784,6 +800,7 @@ def __init__( end_position, is_impossible, qas_id: str = None, + encoding: BatchEncoding = None, ): self.input_ids = input_ids self.attention_mask = attention_mask @@ -803,6 +820,8 @@ def __init__( self.is_impossible = is_impossible self.qas_id = qas_id + self.encoding = encoding + class SquadResult: """ diff --git a/src/transformers/pipelines.py b/src/transformers/pipelines.py index 4a7c42fd863c..a3a9a7a01a35 100755 --- a/src/transformers/pipelines.py +++ b/src/transformers/pipelines.py @@ -32,7 +32,7 @@ from .configuration_auto import AutoConfig from .configuration_utils import PretrainedConfig -from .data import SquadExample, squad_convert_examples_to_features +from .data import SquadExample, SquadFeatures, squad_convert_examples_to_features from .file_utils import add_end_docstrings, is_tf_available, is_torch_available from .modelcard import ModelCard from .tokenization_auto import AutoTokenizer @@ -1721,6 +1721,7 @@ def __call__(self, *args, **kwargs): - **answer** (:obj:`str`) -- The answer to the question. 
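The fast-tokenizer branch added just below builds the question-answering `p_mask` directly from the encoded inputs instead of going through `squad_convert_examples_to_features`: question tokens and special tokens are marked as impossible answer positions, and the CLS token is re-enabled so models can still point to it for unanswerable questions. A NumPy-only sketch of that masking logic, with made-up token ids and a hypothetical `cls_token_id`, assuming the question comes first (padding side "right"):

```python
import numpy as np

# One span of a question-first encoding: [CLS] q q [SEP] c c c [SEP] (hypothetical ids)
cls_token_id = 101
input_ids = np.array([[101, 2054, 2003, 102, 3000, 2003, 2307, 102]])
token_type_ids = np.array([[0, 0, 0, 0, 1, 1, 1, 1]])
special_tokens_mask = np.array([[1, 0, 0, 1, 0, 0, 0, 1]])

question_first = True  # i.e. tokenizer.padding_side == "right"

# 1 = token cannot be part of the answer, 0 = token can be part of the answer
p_mask = token_type_ids == (0 if question_first else 1)  # mask the question tokens
p_mask = p_mask | special_tokens_mask.astype(bool)        # also mask the special tokens

# Keep CLS un-masked so models can use it to signal "no answer"
cls_index = np.nonzero(input_ids == cls_token_id)
p_mask[cls_index] = False

print(p_mask.astype(int))
# [[0 1 1 1 0 0 0 1]]
```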
""" # Set defaults values + kwargs.setdefault("padding", "longest") kwargs.setdefault("topk", 1) kwargs.setdefault("doc_stride", 128) kwargs.setdefault("max_answer_len", 15) @@ -1736,19 +1737,83 @@ def __call__(self, *args, **kwargs): # Convert inputs to features examples = self._args_parser(*args, **kwargs) - features_list = [ - squad_convert_examples_to_features( - examples=[example], - tokenizer=self.tokenizer, - max_seq_length=kwargs["max_seq_len"], - doc_stride=kwargs["doc_stride"], - max_query_length=kwargs["max_question_len"], - padding_strategy=PaddingStrategy.MAX_LENGTH.value, - is_training=False, - tqdm_enabled=False, - ) - for example in examples - ] + if not self.tokenizer.is_fast: + features_list = [ + squad_convert_examples_to_features( + examples=[example], + tokenizer=self.tokenizer, + max_seq_length=kwargs["max_seq_len"], + doc_stride=kwargs["doc_stride"], + max_query_length=kwargs["max_question_len"], + padding_strategy=PaddingStrategy.MAX_LENGTH.value, + is_training=False, + tqdm_enabled=False, + ) + for example in examples + ] + else: + features_list = [] + for example in examples: + # Define the side we want to truncate / pad and the text/pair sorting + question_first = bool(self.tokenizer.padding_side == "right") + if question_first: + texts = example.question_text + pairs = example.context_text + truncation = "only_second" + else: + texts = example.context_text + pairs = example.question_text + truncation = "only_first" + + encoded_inputs = self.tokenizer( + texts, + pairs, + padding=kwargs["padding"], + truncation=truncation, + max_length=kwargs["max_seq_len"], + stride=kwargs["doc_stride"], + return_tensors="np", + return_token_type_ids=True, + return_overflowing_tokens=True, + return_offsets_mapping=True, + return_special_tokens_mask=True, + ) + + # p_mask: mask with 1 for token than cannot be in the answer (0 for token which can be in an answer) + p_mask = encoded_inputs["token_type_ids"] == (0 if question_first else 1) # Mask the question + p_mask = p_mask | encoded_inputs["special_tokens_mask"] # And mask the special tokens + cls_index = 0 + if self.tokenizer.cls_token_id: + # kKep the cls_token unmasked (some models use it to indicate unanswerable questions) + cls_index = np.where(encoded_inputs["input_ids"] == self.tokenizer.cls_token_id) + p_mask[cls_index] = 0 + + features = [] + num_spans = len(encoded_inputs["input_ids"]) + for span_idx in range(num_spans): + features.append( + SquadFeatures( + encoded_inputs["input_ids"][span_idx], + encoded_inputs["attention_mask"][span_idx], + encoded_inputs["token_type_ids"][span_idx], + cls_index[span_idx], + p_mask[span_idx].tolist(), + encoding=encoded_inputs[span_idx], + # We don't use the rest of the values + token_to_orig_map={}, + example_index=0, + unique_id=0, + paragraph_len=0, + token_is_max_context=0, + tokens=[], + start_position=0, + end_position=0, + is_impossible=False, + qas_id=None, + ) + ) + features_list.append(features) + all_answers = [] for features, example in zip(features_list, examples): model_input_names = self.tokenizer.model_input_names + ["input_ids"] @@ -1791,20 +1856,34 @@ def __call__(self, *args, **kwargs): start_[0] = end_[0] = 0.0 starts, ends, scores = self.decode(start_, end_, kwargs["topk"], kwargs["max_answer_len"]) - char_to_word = np.array(example.char_to_word_offset) - - # Convert the answer (tokens) back to the original text - answers += [ - { - "score": score.item(), - "start": np.where(char_to_word == feature.token_to_orig_map[s])[0][0].item(), - "end": 
np.where(char_to_word == feature.token_to_orig_map[e])[0][-1].item(), - "answer": " ".join( - example.doc_tokens[feature.token_to_orig_map[s] : feature.token_to_orig_map[e] + 1] - ), - } - for s, e, score in zip(starts, ends, scores) - ] + if not self.tokenizer.is_fast: + char_to_word = np.array(example.char_to_word_offset) + + # Convert the answer (tokens) back to the original text + answers += [ + { + "score": score.item(), + "start": np.where(char_to_word == feature.token_to_orig_map[s])[0][0].item(), + "end": np.where(char_to_word == feature.token_to_orig_map[e])[0][-1].item(), + "answer": " ".join( + example.doc_tokens[feature.token_to_orig_map[s] : feature.token_to_orig_map[e] + 1] + ), + } + for s, e, score in zip(starts, ends, scores) + ] + else: + # Convert the answer (tokens) back to the original text + answers += [ + { + "score": score.item(), + "start": np.where(char_to_word == feature.token_to_orig_map[s])[0][0].item(), + "end": np.where(char_to_word == feature.token_to_orig_map[e])[0][-1].item(), + "answer": " ".join( + example.doc_tokens[feature.token_to_orig_map[s] : feature.token_to_orig_map[e] + 1] + ), + } + for s, e, score in zip(starts, ends, scores) + ] if kwargs["handle_impossible_answer"]: answers.append({"score": min_null_score, "start": 0, "end": 0, "answer": ""}) diff --git a/src/transformers/tokenization_roberta_fast.py b/src/transformers/tokenization_roberta_fast.py index 02b779bf8281..696c43bf53ba 100644 --- a/src/transformers/tokenization_roberta_fast.py +++ b/src/transformers/tokenization_roberta_fast.py @@ -179,8 +179,8 @@ def mask_token(self) -> str: :obj:`str`: Mask token, to use when training a model with masked-language modeling. Log an error if used while not having been set. - Roberta tokenizer has a special mask token to be usble in the fill-mask pipeline. - The mask token will greedily comprise the space before the ``. + Roberta tokenizer has a special mask token to be usble in the fill-mask pipeline. The mask token will greedily + comprise the space before the ``. """ if self._mask_token is None and self.verbose: logger.error("Using mask_token, but it is not set yet.") @@ -189,10 +189,10 @@ def mask_token(self) -> str: @mask_token.setter def mask_token(self, value): - """Overriding the default behavior of the mask token to have it eat the space before it. + """ + Overriding the default behavior of the mask token to have it eat the space before it. - This is needed to preserve backward compatibility with all the previously used models - based on Roberta. + This is needed to preserve backward compatibility with all the previously used models based on Roberta. """ # Mask token behave like a normal word, i.e. 
include the space before it # So we set lstrip to True diff --git a/tests/test_pipelines_conversational.py b/tests/test_pipelines_conversational.py index 3492283479b7..e70bb8a843ed 100644 --- a/tests/test_pipelines_conversational.py +++ b/tests/test_pipelines_conversational.py @@ -9,7 +9,7 @@ DEFAULT_DEVICE_NUM = -1 if torch_device == "cpu" else 0 -class TextGenerationPipelineTests(MonoInputPipelineCommonMixin, unittest.TestCase): +class ConversationalPipelineTests(MonoInputPipelineCommonMixin, unittest.TestCase): pipeline_task = "conversational" small_models = [] # Models tested without the @slow decorator large_models = ["microsoft/DialoGPT-medium"] # Models tested with the @slow decorator diff --git a/tests/test_pipelines_dialog.py b/tests/test_pipelines_dialog.py deleted file mode 100644 index 9413441378d2..000000000000 --- a/tests/test_pipelines_dialog.py +++ /dev/null @@ -1,30 +0,0 @@ -import unittest - -from transformers.pipelines import Conversation, Pipeline - -from .test_pipelines_common import CustomInputPipelineCommonMixin - - -class DialoguePipelineTests(CustomInputPipelineCommonMixin, unittest.TestCase): - pipeline_task = "conversational" - small_models = [] # Default model - Models tested without the @slow decorator - large_models = ["microsoft/DialoGPT-medium"] # Models tested with the @slow decorator - valid_inputs = [Conversation("Hi there!"), [Conversation("Hi there!"), Conversation("How are you?")]] - - def _test_pipeline(self, nlp: Pipeline): - valid_inputs = [Conversation("Hi there!"), [Conversation("Hi there!"), Conversation("How are you?")]] - invalid_inputs = ["Hi there!", Conversation()] - self.assertIsNotNone(nlp) - - mono_result = nlp(valid_inputs[0]) - self.assertIsInstance(mono_result, Conversation) - - multi_result = nlp(valid_inputs[1]) - self.assertIsInstance(multi_result, list) - self.assertIsInstance(multi_result[0], Conversation) - # Inactive conversations passed to the pipeline raise a ValueError - self.assertRaises(ValueError, nlp, valid_inputs[1]) - - for bad_input in invalid_inputs: - self.assertRaises(Exception, nlp, bad_input) - self.assertRaises(Exception, nlp, invalid_inputs) diff --git a/tests/test_pipelines_question_answering.py b/tests/test_pipelines_question_answering.py index 3f3f6dc83a72..379c58938f9f 100644 --- a/tests/test_pipelines_question_answering.py +++ b/tests/test_pipelines_question_answering.py @@ -7,10 +7,22 @@ class QAPipelineTests(CustomInputPipelineCommonMixin, unittest.TestCase): pipeline_task = "question-answering" + pipeline_running_kwargs = { + "padding": "max_length", + "max_seq_len": 25, + "doc_stride": 5, + } # Default is 'longest' but we use 'max_length' to test equivalence between slow/fast tokenizers small_models = [ "sshleifer/tiny-distilbert-base-cased-distilled-squad" ] # Models tested without the @slow decorator large_models = [] # Models tested with the @slow decorator + valid_inputs = [ + {"question": "Where was HuggingFace founded ?", "context": "HuggingFace was founded in Paris."}, + { + "question": "In what field is HuggingFace working ?", + "context": "HuggingFace is a startup based in New-York founded in Paris which is trying to solve NLP.", + }, + ] def _test_pipeline(self, nlp: Pipeline): output_keys = {"score", "answer", "start", "end"} diff --git a/tests/test_tokenization_common.py b/tests/test_tokenization_common.py index 0090c0f47d30..a3df25152121 100644 --- a/tests/test_tokenization_common.py +++ b/tests/test_tokenization_common.py @@ -576,6 +576,24 @@ def test_mask_output(self): sequences, mask = 
information["input_ids"], information["token_type_ids"] self.assertEqual(len(sequences), len(mask)) + def test_token_type_ids(self): + tokenizers = self.get_tokenizers() + for tokenizer in tokenizers: + with self.subTest(f"{tokenizer.__class__.__name__}"): + seq_0 = "Test this method." + seq_1 = "With these inputs." + + # We want to have sequence 0 and sequence 1 are tagged + # respectively with 0 and 1 token_ids + # (regardeless of weither the model use token type ids) + # We use this assumption in the QA pipeline among other place + output = tokenizer(seq_0, return_token_type_ids=True) + self.assertIn(0, output["token_type_ids"]) + + output = tokenizer(seq_0, seq_1, return_token_type_ids=True) + self.assertIn(0, output["token_type_ids"]) + self.assertIn(1, output["token_type_ids"]) + def test_number_of_added_tokens(self): tokenizers = self.get_tokenizers(do_lower_case=False) for tokenizer in tokenizers: From 449e346d79a68893cbc70b6f5abc75db0f8cc59d Mon Sep 17 00:00:00 2001 From: Thomas Wolf Date: Mon, 2 Nov 2020 17:17:09 +0100 Subject: [PATCH 04/24] optional FAISS tests --- tests/test_retrieval_rag.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/tests/test_retrieval_rag.py b/tests/test_retrieval_rag.py index 93774be18382..fde5ac529c2f 100644 --- a/tests/test_retrieval_rag.py +++ b/tests/test_retrieval_rag.py @@ -9,7 +9,7 @@ import numpy as np from datasets import Dataset -import faiss +from transformers import is_faiss_available from transformers.configuration_bart import BartConfig from transformers.configuration_dpr import DPRConfig from transformers.configuration_rag import RagConfig @@ -26,6 +26,9 @@ from transformers.tokenization_dpr import DPRQuestionEncoderTokenizer from transformers.tokenization_roberta import VOCAB_FILES_NAMES as BART_VOCAB_FILES_NAMES +if is_faiss_available(): + import faiss + @require_faiss @require_datasets From eb375bcec8d9d51179d42ac5299d1f3bdb01181e Mon Sep 17 00:00:00 2001 From: Thomas Wolf Date: Mon, 9 Nov 2020 17:43:21 +0100 Subject: [PATCH 05/24] Pipeline Q&A --- src/transformers/data/processors/squad.py | 50 ++++----- src/transformers/pipelines.py | 67 +++++++----- src/transformers/tokenization_utils_base.py | 41 ++++++-- src/transformers/tokenization_utils_fast.py | 39 ++++--- tests/test_retrieval_rag.py | 1 + tests/test_tokenization_common.py | 108 ++++++++++++++++++++ 6 files changed, 229 insertions(+), 77 deletions(-) diff --git a/src/transformers/data/processors/squad.py b/src/transformers/data/processors/squad.py index 4f928e690969..06cd170a4dc6 100644 --- a/src/transformers/data/processors/squad.py +++ b/src/transformers/data/processors/squad.py @@ -351,37 +351,37 @@ def squad_convert_examples_to_features( features = [] ################# - squad_convert_example_to_features_init(tokenizer) - for example in examples: - feature = squad_convert_example_to_features( - example, - max_seq_length=max_seq_length, - doc_stride=doc_stride, - max_query_length=max_query_length, - padding_strategy=padding_strategy, - is_training=is_training, - ) - features.append(feature) - - ################# - # threads = min(threads, cpu_count()) - # with Pool(threads, initializer=squad_convert_example_to_features_init, initargs=(tokenizer,)) as p: - # annotate_ = partial( - # squad_convert_example_to_features, + # squad_convert_example_to_features_init(tokenizer) + # for example in examples: + # feature = squad_convert_example_to_features( + # example, # max_seq_length=max_seq_length, # doc_stride=doc_stride, # max_query_length=max_query_length, 
# padding_strategy=padding_strategy, # is_training=is_training, # ) - # features = list( - # tqdm( - # p.imap(annotate_, examples, chunksize=32), - # total=len(examples), - # desc="convert squad examples to features", - # disable=not tqdm_enabled, - # ) - # ) + # features.append(feature) + + ################# + threads = min(threads, cpu_count()) + with Pool(threads, initializer=squad_convert_example_to_features_init, initargs=(tokenizer,)) as p: + annotate_ = partial( + squad_convert_example_to_features, + max_seq_length=max_seq_length, + doc_stride=doc_stride, + max_query_length=max_query_length, + padding_strategy=padding_strategy, + is_training=is_training, + ) + features = list( + tqdm( + p.imap(annotate_, examples, chunksize=32), + total=len(examples), + desc="convert squad examples to features", + disable=not tqdm_enabled, + ) + ) ################# new_features = [] diff --git a/src/transformers/pipelines.py b/src/transformers/pipelines.py index a3a9a7a01a35..4b796db0303a 100755 --- a/src/transformers/pipelines.py +++ b/src/transformers/pipelines.py @@ -1756,20 +1756,12 @@ def __call__(self, *args, **kwargs): for example in examples: # Define the side we want to truncate / pad and the text/pair sorting question_first = bool(self.tokenizer.padding_side == "right") - if question_first: - texts = example.question_text - pairs = example.context_text - truncation = "only_second" - else: - texts = example.context_text - pairs = example.question_text - truncation = "only_first" encoded_inputs = self.tokenizer( - texts, - pairs, + text=example.question_text if question_first else example.context_text, + text_pair=example.context_text if question_first else example.question_text, padding=kwargs["padding"], - truncation=truncation, + truncation="only_second" if question_first else "only_first", max_length=kwargs["max_seq_len"], stride=kwargs["doc_stride"], return_tensors="np", @@ -1779,27 +1771,28 @@ def __call__(self, *args, **kwargs): return_special_tokens_mask=True, ) + num_spans = len(encoded_inputs["input_ids"]) + # p_mask: mask with 1 for token than cannot be in the answer (0 for token which can be in an answer) p_mask = encoded_inputs["token_type_ids"] == (0 if question_first else 1) # Mask the question p_mask = p_mask | encoded_inputs["special_tokens_mask"] # And mask the special tokens - cls_index = 0 if self.tokenizer.cls_token_id: - # kKep the cls_token unmasked (some models use it to indicate unanswerable questions) - cls_index = np.where(encoded_inputs["input_ids"] == self.tokenizer.cls_token_id) + # keep the cls_token unmasked (some models use it to indicate unanswerable questions) + cls_index = np.nonzero(encoded_inputs["input_ids"] == self.tokenizer.cls_token_id) p_mask[cls_index] = 0 features = [] - num_spans = len(encoded_inputs["input_ids"]) for span_idx in range(num_spans): features.append( SquadFeatures( - encoded_inputs["input_ids"][span_idx], - encoded_inputs["attention_mask"][span_idx], - encoded_inputs["token_type_ids"][span_idx], - cls_index[span_idx], - p_mask[span_idx].tolist(), + input_ids=encoded_inputs["input_ids"][span_idx], + attention_mask=encoded_inputs["attention_mask"][span_idx], + token_type_ids=encoded_inputs["token_type_ids"][span_idx], + p_mask=p_mask[span_idx].tolist(), encoding=encoded_inputs[span_idx], - # We don't use the rest of the values + # We don't use the rest of the values - and actually + # for Fast tokenizer we could totally avoid using SquadFeatures and SquadExample + cls_index=None, token_to_orig_map={}, example_index=0, unique_id=0, @@ 
-1860,6 +1853,10 @@ def __call__(self, *args, **kwargs): char_to_word = np.array(example.char_to_word_offset) # Convert the answer (tokens) back to the original text + # Score: score from the model + # Start: Index of the first character of the answer in the context string + # End: Index of the character following the last character of the answer in the context string + # Answer: Plain text of the answer answers += [ { "score": score.item(), @@ -1873,14 +1870,32 @@ def __call__(self, *args, **kwargs): ] else: # Convert the answer (tokens) back to the original text + # Score: score from the model + # Start: Index of the first character of the answer in the context string + # End: Index of the character following the last character of the answer in the context string + # Answer: Plain text of the answer + question_first = bool(self.tokenizer.padding_side == "right") + enc = feature.encoding + + # Sometimes the max probability token is in the middle of a word so: + # - we start by finding the right word containing the token with `token_to_word` + # - then we convert this word in a character span with `word_to_chars` answers += [ { "score": score.item(), - "start": np.where(char_to_word == feature.token_to_orig_map[s])[0][0].item(), - "end": np.where(char_to_word == feature.token_to_orig_map[e])[0][-1].item(), - "answer": " ".join( - example.doc_tokens[feature.token_to_orig_map[s] : feature.token_to_orig_map[e] + 1] - ), + "start": enc.word_to_chars( + enc.token_to_word(s), sequence_index=1 if question_first else 0 + )[0], + "end": enc.word_to_chars(enc.token_to_word(e), sequence_index=1 if question_first else 0)[ + 1 + ], + "answer": example.context_text[ + enc.word_to_chars(enc.token_to_word(s), sequence_index=1 if question_first else 0)[ + 0 + ] : enc.word_to_chars(enc.token_to_word(e), sequence_index=1 if question_first else 0)[ + 1 + ] + ], } for s, e, score in zip(starts, ends, scores) ] diff --git a/src/transformers/tokenization_utils_base.py b/src/transformers/tokenization_utils_base.py index 7e7b10c5096f..010066eaf26b 100644 --- a/src/transformers/tokenization_utils_base.py +++ b/src/transformers/tokenization_utils_base.py @@ -365,9 +365,11 @@ def token_to_word(self, batch_or_token_index: int, token_index: Optional[int] = token_index = self._seq_len + token_index return self._encodings[batch_index].token_to_word(token_index) - def word_to_tokens(self, batch_or_word_index: int, word_index: Optional[int] = None) -> Optional[TokenSpan]: + def word_to_tokens( + self, batch_or_word_index: int, word_index: Optional[int] = None, sequence_index: int = 0 + ) -> Optional[TokenSpan]: """ - Get the encoded token span corresponding to a word in the sequence of the batch. + Get the encoded token span corresponding to a word in a sequence of the batch. Token spans are returned as a :class:`~transformers.tokenization_utils_base.TokenSpan` with: @@ -376,8 +378,9 @@ def word_to_tokens(self, batch_or_word_index: int, word_index: Optional[int] = N Can be called as: - - ``self.word_to_tokens(word_index)`` if batch size is 1 - - ``self.word_to_tokens(batch_index, word_index)`` if batch size is greater or equal to 1 + - ``self.word_to_tokens(word_index, sequence_index: int = 0)`` if batch size is 1 + - ``self.word_to_tokens(batch_index, word_index, sequence_index: int = 0)`` if batch size is greater or equal + to 1 This method is particularly suited when the input sequences are provided as pre-tokenized sequences (i.e. words are defined by the user). 
In this case it allows to easily associate encoded tokens with provided tokenized @@ -390,6 +393,9 @@ def word_to_tokens(self, batch_or_word_index: int, word_index: Optional[int] = N word_index (:obj:`int`, `optional`): If a batch index is provided in `batch_or_token_index`, this can be the index of the word in the sequence. + sequence_index (:obj:`int`, `optional`, defaults to 0): + If pair of sequences are encoded in the batch this can be used to specify which sequence in the pair (0 + or 1) the provided word index belongs to. Returns: Optional :class:`~transformers.tokenization_utils_base.TokenSpan` Span of tokens in the encoded sequence. @@ -407,7 +413,7 @@ def word_to_tokens(self, batch_or_word_index: int, word_index: Optional[int] = N batch_index = self._batch_size + batch_index if word_index < 0: word_index = self._seq_len + word_index - span = self._encodings[batch_index].word_to_tokens(word_index) + span = self._encodings[batch_index].word_to_tokens(word_index, sequence_index) return TokenSpan(*span) if span is not None else None def token_to_chars(self, batch_or_token_index: int, token_index: Optional[int] = None) -> CharSpan: @@ -446,7 +452,9 @@ def token_to_chars(self, batch_or_token_index: int, token_index: Optional[int] = token_index = batch_or_token_index return CharSpan(*(self._encodings[batch_index].token_to_chars(token_index))) - def char_to_token(self, batch_or_char_index: int, char_index: Optional[int] = None) -> int: + def char_to_token( + self, batch_or_char_index: int, char_index: Optional[int] = None, sequence_index: int = 0 + ) -> int: """ Get the index of the token in the encoded output comprising a character in the original string for a sequence of the batch. @@ -467,6 +475,9 @@ def char_to_token(self, batch_or_char_index: int, char_index: Optional[int] = No char_index (:obj:`int`, `optional`): If a batch index is provided in `batch_or_token_index`, this can be the index of the word in the sequence. + sequence_index (:obj:`int`, `optional`, defaults to 0): + If pair of sequences are encoded in the batch this can be used to specify which sequence in the pair (0 + or 1) the provided character index belongs to. Returns: @@ -480,9 +491,11 @@ def char_to_token(self, batch_or_char_index: int, char_index: Optional[int] = No else: batch_index = 0 char_index = batch_or_char_index - return self._encodings[batch_index].char_to_token(char_index) + return self._encodings[batch_index].char_to_token(char_index, sequence_index) - def word_to_chars(self, batch_or_word_index: int, word_index: Optional[int] = None) -> CharSpan: + def word_to_chars( + self, batch_or_word_index: int, word_index: Optional[int] = None, sequence_index: int = 0 + ) -> CharSpan: """ Get the character span in the original string corresponding to given word in a sequence of the batch. @@ -503,6 +516,9 @@ def word_to_chars(self, batch_or_word_index: int, word_index: Optional[int] = No word_index (:obj:`int`, `optional`): If a batch index is provided in `batch_or_token_index`, this can be the index of the word in the sequence. + sequence_index (:obj:`int`, `optional`, defaults to 0): + If pair of sequences are encoded in the batch this can be used to specify which sequence in the pair (0 + or 1) the provided word index belongs to. Returns: :obj:`CharSpan` or :obj:`List[CharSpan]`: Span(s) of the associated character or characters in the string. 
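The `sequence_index` argument threaded through these alignment methods is what lets the question-answering pipeline translate a predicted token span back into character offsets inside the context (sequence 1 when the question comes first). A sketch of the intended usage once this change is in place — the checkpoint name and the "predicted" token index are illustrative assumptions, and a fast tokenizer must be downloadable:

```python
from transformers import AutoTokenizer

# Illustrative checkpoint; any fast tokenizer should behave the same way.
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-cased-distilled-squad", use_fast=True)

question = "Where was HuggingFace founded ?"
context = "HuggingFace was founded in Paris."
enc = tokenizer(question, context)

# Pretend the model pointed at the token covering "Paris" in the context
s = e = enc.char_to_token(context.index("Paris"), sequence_index=1)

# Token -> word -> character span, all relative to the context (sequence_index=1)
start_char = enc.word_to_chars(enc.token_to_word(s), sequence_index=1)[0]
end_char = enc.word_to_chars(enc.token_to_word(e), sequence_index=1)[1]
print(context[start_char:end_char])  # "Paris"
```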
@@ -520,9 +536,9 @@ def word_to_chars(self, batch_or_word_index: int, word_index: Optional[int] = No else: batch_index = 0 word_index = batch_or_word_index - return CharSpan(*(self._encodings[batch_index].word_to_chars(word_index))) + return CharSpan(*(self._encodings[batch_index].word_to_chars(word_index, sequence_index))) - def char_to_word(self, batch_or_char_index: int, char_index: Optional[int] = None) -> int: + def char_to_word(self, batch_or_char_index: int, char_index: Optional[int] = None, sequence_index: int = 0) -> int: """ Get the word in the original string corresponding to a character in the original string of a sequence of the batch. @@ -543,6 +559,9 @@ def char_to_word(self, batch_or_char_index: int, char_index: Optional[int] = Non char_index (:obj:`int`, `optional`): If a batch index is provided in `batch_or_token_index`, this can be the index of the character in the orginal string. + sequence_index (:obj:`int`, `optional`, defaults to 0): + If pair of sequences are encoded in the batch this can be used to specify which sequence in the pair (0 + or 1) the provided character index belongs to. Returns: @@ -556,7 +575,7 @@ def char_to_word(self, batch_or_char_index: int, char_index: Optional[int] = Non else: batch_index = 0 char_index = batch_or_char_index - return self._encodings[batch_index].char_to_word(char_index) + return self._encodings[batch_index].char_to_word(char_index, sequence_index) def convert_to_tensors( self, tensor_type: Optional[Union[str, TensorType]] = None, prepend_batch_axis: bool = False diff --git a/src/transformers/tokenization_utils_fast.py b/src/transformers/tokenization_utils_fast.py index 8754c3334db8..7f7d479893b3 100644 --- a/src/transformers/tokenization_utils_fast.py +++ b/src/transformers/tokenization_utils_fast.py @@ -169,9 +169,10 @@ def _convert_encoding( return_offsets_mapping: bool = False, return_length: bool = False, verbose: bool = True, - ) -> Dict[str, Any]: + ) -> Tuple[Dict[str, Any], List[EncodingFast]]: """ - Convert the encoding representation (from low-level HuggingFace tokenizer output) to a python Dict. + Convert the encoding representation (from low-level HuggingFace tokenizer output) to a python Dict and a list + of encodings, take care of building a batch from overflowing tokens. Overflowing tokens are converted to additional examples (like batches) so the output values of the dict are lists (overflows) of lists (tokens). 
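The `_batch_encode_plus` update below keeps the fast `Encoding` objects alongside the Python dicts and flattens the per-sample overflow dimension into a single batch, recording in `overflow_to_sample_mapping` which original sample each overflowing window came from. A self-contained sketch of that flattening step, with made-up ids standing in for real tokenizer output:

```python
# Each entry mimics one input sample: (dict of lists over overflow windows, fast encodings)
tokens_and_encodings = [
    ({"input_ids": [[0, 11, 12, 2], [0, 12, 13, 2]]}, ["enc0a", "enc0b"]),  # sample 0 -> 2 windows
    ({"input_ids": [[0, 21, 22, 2]]}, ["enc1a"]),                           # sample 1 -> 1 window
]

sanitized_tokens = {}
for key in tokens_and_encodings[0][0]:
    sanitized_tokens[key] = [e for item, _ in tokens_and_encodings for e in item[key]]
sanitized_encodings = [e for _, item in tokens_and_encodings for e in item]

overflow_to_sample_mapping = []
for i, (toks, _) in enumerate(tokens_and_encodings):
    overflow_to_sample_mapping += [i] * len(toks["input_ids"])

print(sanitized_tokens["input_ids"])  # [[0, 11, 12, 2], [0, 12, 13, 2], [0, 21, 22, 2]]
print(sanitized_encodings)            # ['enc0a', 'enc0b', 'enc1a']
print(overflow_to_sample_mapping)     # [0, 0, 1]
```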
@@ -203,7 +204,7 @@ def _convert_encoding( if return_length: encoding_dict["length"].append(len(e.ids)) - return encoding_dict + return encoding_dict, encodings def convert_tokens_to_ids(self, tokens: Union[str, List[str]]) -> Union[int, List[int]]: """ @@ -390,9 +391,12 @@ def _batch_encode_plus( ) # Convert encoding to dict - # `Tokens` has type: List[Dict[str, List[List[int]]]] or List[Dict[str, 2D-Tensor]] + # `Tokens` has type: Tuple[ + # List[Dict[str, List[List[int]]]] or List[Dict[str, 2D-Tensor]], + # List[EncodingFast] + # ] # with nested dimensions corresponding to batch, overflows, sequence length - tokens = [ + tokens_and_encodings = [ self._convert_encoding( encoding=encoding, return_token_type_ids=return_token_type_ids, @@ -406,22 +410,27 @@ def _batch_encode_plus( for encoding in encodings ] - # Convert the output to have dict[list] from list[dict] - sanitized = {} - for key in tokens[0].keys(): - # To List[List[List[int]]] of shape (batch, overflows, sequence length) - stack = [e for item in tokens for e in item[key]] - sanitized[key] = stack + # Convert the output to have dict[list] from list[dict] and remove the additional overflows dimension + # From (variable) shape (batch, overflows, sequence length) to ~ (batch * overflows, sequence length) + # (we say ~ because the number of overflow varies with the example in the batch) + # + # To match each overflowing sample with the original sample in the batch + # we add an overflow_to_sample_mapping array (see below) + sanitized_tokens = {} + for key in tokens_and_encodings[0][0].keys(): + stack = [e for item, _ in tokens_and_encodings for e in item[key]] + sanitized_tokens[key] = stack + sanitized_encodings = [e for _, item in tokens_and_encodings for e in item] # If returning overflowing tokens, we need to return a mapping # from the batch idx to the original sample if return_overflowing_tokens: overflow_to_sample_mapping = [] - for i, enc in enumerate(tokens): - overflow_to_sample_mapping += [i] * len(enc["input_ids"]) - sanitized["overflow_to_sample_mapping"] = overflow_to_sample_mapping + for i, (toks, _) in enumerate(tokens_and_encodings): + overflow_to_sample_mapping += [i] * len(toks["input_ids"]) + sanitized_tokens["overflow_to_sample_mapping"] = overflow_to_sample_mapping - return BatchEncoding(sanitized, encodings, tensor_type=return_tensors) + return BatchEncoding(sanitized_tokens, sanitized_encodings, tensor_type=return_tensors) def _encode_plus( self, diff --git a/tests/test_retrieval_rag.py b/tests/test_retrieval_rag.py index fde5ac529c2f..a95324535b82 100644 --- a/tests/test_retrieval_rag.py +++ b/tests/test_retrieval_rag.py @@ -26,6 +26,7 @@ from transformers.tokenization_dpr import DPRQuestionEncoderTokenizer from transformers.tokenization_roberta import VOCAB_FILES_NAMES as BART_VOCAB_FILES_NAMES + if is_faiss_available(): import faiss diff --git a/tests/test_tokenization_common.py b/tests/test_tokenization_common.py index a3df25152121..2903cca33fd6 100644 --- a/tests/test_tokenization_common.py +++ b/tests/test_tokenization_common.py @@ -1896,6 +1896,114 @@ def test_alignement_methods(self): batch_encoding.word_to_chars(last_batch_index, last_word_index).end, last_char_index + 1 ) + # Pair of input sequences + + words = ["Wonderful", "no", "inspiration", "example", "with", "subtoken"] + text = " ".join(words) + pair_words = ["Amazing", "example", "full", "of", "inspiration"] + pair_text = " ".join(pair_words) + batch_size = 3 + index_word_in_first_seq = words.index("inspiration") + index_word_in_pair_seq = 
pair_words.index("inspiration") + index_char_in_first_seq = text.find("inspiration") + index_char_in_pair_seq = pair_text.find("inspiration") + + pair_encoding = tokenizer_r.encode_plus(text, pair_text, add_special_tokens=False) + + pair_batch_encoding = tokenizer_r.batch_encode_plus( + [(text, pair_text)] * batch_size, add_special_tokens=False + ) + num_tokens = len(encoding["input_ids"]) + + last_word_index = len(words) - 1 + last_token_index = num_tokens - 1 + last_batch_index = batch_size - 1 + last_char_index = len(text) - 1 + + # Assert word_to_tokens + self.assertNotEqual( + pair_encoding.word_to_tokens(index_word_in_first_seq, sequence_index=0).start, + pair_encoding.word_to_tokens(index_word_in_pair_seq, sequence_index=1).start, + ) + self.assertEqual( + pair_encoding["input_ids"][ + pair_encoding.word_to_tokens(index_word_in_first_seq, sequence_index=0).start + ], + pair_encoding["input_ids"][ + pair_encoding.word_to_tokens(index_word_in_pair_seq, sequence_index=1).start + ], + ) + self.assertNotEqual( + pair_batch_encoding.word_to_tokens(1, index_word_in_first_seq, sequence_index=0).start, + pair_batch_encoding.word_to_tokens(1, index_word_in_pair_seq, sequence_index=1).start, + ) + self.assertEqual( + pair_batch_encoding["input_ids"][1][ + pair_batch_encoding.word_to_tokens(1, index_word_in_first_seq, sequence_index=0).start + ], + pair_batch_encoding["input_ids"][1][ + pair_batch_encoding.word_to_tokens(1, index_word_in_pair_seq, sequence_index=1).start + ], + ) + + # Assert char_to_token + self.assertNotEqual( + pair_encoding.char_to_token(index_char_in_first_seq, sequence_index=0), + pair_encoding.char_to_token(index_char_in_pair_seq, sequence_index=1), + ) + self.assertEqual( + pair_encoding["input_ids"][pair_encoding.char_to_token(index_char_in_first_seq, sequence_index=0)], + pair_encoding["input_ids"][pair_encoding.char_to_token(index_char_in_pair_seq, sequence_index=1)], + ) + self.assertNotEqual( + pair_batch_encoding.char_to_token(1, index_char_in_first_seq, sequence_index=0), + pair_batch_encoding.char_to_token(1, index_char_in_pair_seq, sequence_index=1), + ) + self.assertEqual( + pair_batch_encoding["input_ids"][1][ + pair_batch_encoding.char_to_token(1, index_char_in_first_seq, sequence_index=0) + ], + pair_batch_encoding["input_ids"][1][ + pair_batch_encoding.char_to_token(1, index_char_in_pair_seq, sequence_index=1) + ], + ) + + # Assert char_to_word + self.assertNotEqual( + pair_encoding.char_to_word(index_char_in_first_seq, sequence_index=0), + pair_encoding.char_to_word(index_char_in_pair_seq, sequence_index=1), + ) + self.assertEqual( + words[pair_encoding.char_to_word(index_char_in_first_seq, sequence_index=0)], + pair_words[pair_encoding.char_to_word(index_char_in_pair_seq, sequence_index=1)], + ) + self.assertNotEqual( + pair_batch_encoding.char_to_word(1, index_char_in_first_seq, sequence_index=0), + pair_batch_encoding.char_to_word(1, index_char_in_pair_seq, sequence_index=1), + ) + self.assertEqual( + words[pair_batch_encoding.char_to_word(1, index_char_in_first_seq, sequence_index=0)], + pair_words[pair_batch_encoding.char_to_word(1, index_char_in_pair_seq, sequence_index=1)], + ) + + # Assert word_to_chars + self.assertNotEqual( + pair_encoding.word_to_chars(index_word_in_first_seq, sequence_index=0).start, + pair_encoding.word_to_chars(index_word_in_pair_seq, sequence_index=1).start, + ) + self.assertEqual( + text[pair_encoding.word_to_chars(index_word_in_first_seq, sequence_index=0).start], + 
pair_text[pair_encoding.word_to_chars(index_word_in_pair_seq, sequence_index=1).start], + ) + self.assertNotEqual( + pair_batch_encoding.word_to_chars(1, index_word_in_first_seq, sequence_index=0).start, + pair_batch_encoding.word_to_chars(1, index_word_in_pair_seq, sequence_index=1).start, + ) + self.assertEqual( + text[pair_batch_encoding.word_to_chars(1, index_word_in_first_seq, sequence_index=0).start], + pair_text[pair_batch_encoding.word_to_chars(1, index_word_in_pair_seq, sequence_index=1).start], + ) + def test_tokenization_python_rust_equals(self): for tokenizer, pretrained_name, kwargs in self.tokenizers_list: with self.subTest("{} ({})".format(tokenizer.__class__.__name__, pretrained_name)): From 336759326a05c2f1ab3d82737f67e217fc11a09b Mon Sep 17 00:00:00 2001 From: Thomas Wolf Date: Mon, 9 Nov 2020 17:57:00 +0100 Subject: [PATCH 06/24] Move pipeline tests to their own test job again --- tests/test_pipelines_common.py | 77 +--------------------------------- 1 file changed, 1 insertion(+), 76 deletions(-) diff --git a/tests/test_pipelines_common.py b/tests/test_pipelines_common.py index c73252811705..736ac9612081 100644 --- a/tests/test_pipelines_common.py +++ b/tests/test_pipelines_common.py @@ -2,8 +2,6 @@ from unittest import mock from transformers import is_tf_available, is_torch_available, pipeline - -# from transformers.pipelines import DefaultArgumentHandler, Pipeline from transformers.pipelines import Pipeline from transformers.testing_utils import _run_slow_tests, is_pipeline_test, require_tf, require_torch, slow from transformers.tokenization_utils_base import to_py_obj @@ -12,7 +10,7 @@ VALID_INPUTS = ["A simple string", ["list of strings"]] -# @is_pipeline_test +@is_pipeline_test class CustomInputPipelineCommonMixin: pipeline_task = None pipeline_loading_kwargs = {} # Additional kwargs to load the pipeline with @@ -230,76 +228,3 @@ def _test_pipeline(self, nlp: Pipeline): self.assertIn(key, result) self.assertRaises(Exception, nlp, self.invalid_inputs) - - -# @is_pipeline_test -# class DefaultArgumentHandlerTestCase(unittest.TestCase): -# def setUp(self) -> None: -# self.handler = DefaultArgumentHandler() -# -# def test_kwargs_x(self): -# mono_data = {"X": "This is a sample input"} -# mono_args = self.handler(**mono_data) -# -# self.assertTrue(isinstance(mono_args, list)) -# self.assertEqual(len(mono_args), 1) -# -# multi_data = {"x": ["This is a sample input", "This is a second sample input"]} -# multi_args = self.handler(**multi_data) -# -# self.assertTrue(isinstance(multi_args, list)) -# self.assertEqual(len(multi_args), 2) -# -# def test_kwargs_data(self): -# mono_data = {"data": "This is a sample input"} -# mono_args = self.handler(**mono_data) -# -# self.assertTrue(isinstance(mono_args, list)) -# self.assertEqual(len(mono_args), 1) -# -# multi_data = {"data": ["This is a sample input", "This is a second sample input"]} -# multi_args = self.handler(**multi_data) -# -# self.assertTrue(isinstance(multi_args, list)) -# self.assertEqual(len(multi_args), 2) -# -# def test_multi_kwargs(self): -# mono_data = {"data": "This is a sample input", "X": "This is a sample input 2"} -# mono_args = self.handler(**mono_data) -# -# self.assertTrue(isinstance(mono_args, list)) -# self.assertEqual(len(mono_args), 2) -# -# multi_data = { -# "data": ["This is a sample input", "This is a second sample input"], -# "test": ["This is a sample input 2", "This is a second sample input 2"], -# } -# multi_args = self.handler(**multi_data) -# -# self.assertTrue(isinstance(multi_args, list)) 
-# self.assertEqual(len(multi_args), 4) -# -# def test_args(self): -# mono_data = "This is a sample input" -# mono_args = self.handler(mono_data) -# -# self.assertTrue(isinstance(mono_args, list)) -# self.assertEqual(len(mono_args), 1) -# -# mono_data = ["This is a sample input"] -# mono_args = self.handler(mono_data) -# -# self.assertTrue(isinstance(mono_args, list)) -# self.assertEqual(len(mono_args), 1) -# -# multi_data = ["This is a sample input", "This is a second sample input"] -# multi_args = self.handler(multi_data) -# -# self.assertTrue(isinstance(multi_args, list)) -# self.assertEqual(len(multi_args), 2) -# -# multi_data = ["This is a sample input", "This is a second sample input"] -# multi_args = self.handler(*multi_data) -# -# self.assertTrue(isinstance(multi_args, list)) -# self.assertEqual(len(multi_args), 2) From 36e0900148d401c291f1d2faf7a470ce6084ce25 Mon Sep 17 00:00:00 2001 From: Thomas Wolf Date: Mon, 9 Nov 2020 20:38:06 +0100 Subject: [PATCH 07/24] update tokenizer to add sequence id methods --- src/transformers/tokenization_utils_base.py | 99 +++++++++++++++++++++ tests/test_tokenization_common.py | 56 +++++++++++- 2 files changed, 151 insertions(+), 4 deletions(-) diff --git a/src/transformers/tokenization_utils_base.py b/src/transformers/tokenization_utils_base.py index 1cd6e491e8ec..b7f1a71ddd0c 100644 --- a/src/transformers/tokenization_utils_base.py +++ b/src/transformers/tokenization_utils_base.py @@ -216,6 +216,9 @@ class BatchEncoding(UserDict): initialization. prepend_batch_axis (:obj:`bool`, `optional`, defaults to :obj:`False`): Whether or not to add a batch axis when converting to tensors (see :obj:`tensor_type` above). + n_sequences (:obj:`Optional[int]`, `optional`): + You can give a tensor_type here to convert the lists of integers in PyTorch/TensorFlow/Numpy Tensors at + initialization. """ def __init__( @@ -224,6 +227,7 @@ def __init__( encoding: Optional[Union[EncodingFast, Sequence[EncodingFast]]] = None, tensor_type: Union[None, str, TensorType] = None, prepend_batch_axis: bool = False, + n_sequences: Optional[int] = None, ): super().__init__(data) @@ -232,8 +236,22 @@ def __init__( self._encodings = encoding + if n_sequences is None and encoding is not None and len(encoding): + n_sequences = encoding[0].n_sequences + + self._n_sequences = n_sequences + self.convert_to_tensors(tensor_type=tensor_type, prepend_batch_axis=prepend_batch_axis) + @property + def n_sequences(self) -> Optional[int]: + """ + :obj:`Optional[int]`: The number of sequences used to generate each sample from the batch encoded in this + :class:`~transformers.BatchEncoding`. Currently can be one of :obj:`None` (unknown), :obj:`1` (a single + sentence) or :obj:`2` (a pair of sentences) + """ + return self.n_sequences + @property def is_fast(self) -> bool: """ @@ -311,6 +329,27 @@ def tokens(self, batch_index: int = 0) -> List[str]: raise ValueError("tokens() is not available when using Python-based tokenizers") return self._encodings[batch_index].tokens + def sequence_ids(self, batch_index: int = 0) -> List[Optional[int]]: + """ + Return a list mapping the tokens to the id of their original sentences: + + - :obj:`None` for special tokens added around or between sequences, + - :obj:`0` for tokens coresponding to words in the first sequence, + - :obj:`1` for tokens coresponding to words in the second sequence when a pair of sequences was jointly + encoded. + + Args: + batch_index (:obj:`int`, `optional`, defaults to 0): The index to access in the batch. 
+ + Returns: + :obj:`List[Optional[int]]`: A list indicating the sequence id corresponding to each token. Special tokens + added by the tokenizer are mapped to :obj:`None` and other tokens are mapped to the index of their + corresponding sequence. + """ + if not self._encodings: + raise ValueError("sequence_ids() is not available when using Python-based tokenizers") + return self._encodings[batch_index].sequences + def words(self, batch_index: int = 0) -> List[Optional[int]]: """ Return a list mapping the tokens to their actual word in the initial sentence for a fast tokenizer. @@ -325,8 +364,68 @@ def words(self, batch_index: int = 0) -> List[Optional[int]]: """ if not self._encodings: raise ValueError("words() is not available when using Python-based tokenizers") + warnings.warn( + "`BatchEncoding.words(batch_index: int = 0)` propperty is deprecated and should be replaced with the identical, " + "but more self-explanatory `BatchEncoding.words(batch_index: int = 0)` property.", + FutureWarning, + ) + return self.word_ids(batch_index) + + def word_ids(self, batch_index: int = 0) -> List[Optional[int]]: + """ + Return a list mapping the tokens to their actual word in the initial sentence for a fast tokenizer. + + Args: + batch_index (:obj:`int`, `optional`, defaults to 0): The index to access in the batch. + + Returns: + :obj:`List[Optional[int]]`: A list indicating the word corresponding to each token. Special tokens added by + the tokenizer are mapped to :obj:`None` and other tokens are mapped to the index of their corresponding + word (several tokens will be mapped to the same word index if they are parts of that word). + """ + if not self._encodings: + raise ValueError("word_ids() is not available when using Python-based tokenizers") return self._encodings[batch_index].words + def token_to_sequence(self, batch_or_token_index: int, token_index: Optional[int] = None) -> int: + """ + Get the index of the sequence represented by the given token. In the general use case, this method returns + :obj:`0` for a single sequence or the first sequence of a pair, and :obj:`1` for the second sequence of a pair + + Can be called as: + + - ``self.token_to_sequence(token_index)`` if batch size is 1 + - ``self.token_to_sequence(batch_index, token_index)`` if batch size is greater than 1 + + This method is particularly suited when the input sequences are provided as pre-tokenized sequences (i.e., + words are defined by the user). In this case it allows to easily associate encoded tokens with provided + tokenized words. + + Args: + batch_or_token_index (:obj:`int`): + Index of the sequence in the batch. If the batch only comprise one sequence, this can be the index of + the token in the sequence. + token_index (:obj:`int`, `optional`): + If a batch index is provided in `batch_or_token_index`, this can be the index of the token in the + sequence. + + Returns: + :obj:`int`: Index of the word in the input sequence. 
+ """ + + if not self._encodings: + raise ValueError("token_to_sequence() is not available when using Python based tokenizers") + if token_index is not None: + batch_index = batch_or_token_index + else: + batch_index = 0 + token_index = batch_or_token_index + if batch_index < 0: + batch_index = self._batch_size + batch_index + if token_index < 0: + token_index = self._seq_len + token_index + return self._encodings[batch_index].token_to_sequence(token_index) + def token_to_word(self, batch_or_token_index: int, token_index: Optional[int] = None) -> int: """ Get the index of the word corresponding (i.e. comprising) to an encoded token in a sequence of the batch. diff --git a/tests/test_tokenization_common.py b/tests/test_tokenization_common.py index 2903cca33fd6..376616a0b5de 100644 --- a/tests/test_tokenization_common.py +++ b/tests/test_tokenization_common.py @@ -581,7 +581,6 @@ def test_token_type_ids(self): for tokenizer in tokenizers: with self.subTest(f"{tokenizer.__class__.__name__}"): seq_0 = "Test this method." - seq_1 = "With these inputs." # We want to have sequence 0 and sequence 1 are tagged # respectively with 0 and 1 token_ids @@ -590,9 +589,28 @@ def test_token_type_ids(self): output = tokenizer(seq_0, return_token_type_ids=True) self.assertIn(0, output["token_type_ids"]) - output = tokenizer(seq_0, seq_1, return_token_type_ids=True) - self.assertIn(0, output["token_type_ids"]) - self.assertIn(1, output["token_type_ids"]) + def test_sequence_ids(self): + tokenizers = self.get_tokenizers() + for tokenizer in tokenizers: + if not tokenizer.is_fast: + continue + with self.subTest(f"{tokenizer.__class__.__name__}"): + seq_0 = "Test this method." + seq_1 = "With these inputs." + + # We want to have sequence 0 and sequence 1 are tagged + # respectively with 0 and 1 token_ids + # (regardeless of weither the model use token type ids) + # We use this assumption in the QA pipeline among other place + output = tokenizer(seq_0) + self.assertIn(0, output.sequence_ids()) + + output = tokenizer(seq_0, seq_1) + self.assertIn(0, output.sequence_ids()) + self.assertIn(1, output.sequence_ids()) + + if tokenizer.num_special_tokens_to_add(pair=True): + self.assertIn(None, output.sequence_ids()) def test_number_of_added_tokens(self): tokenizers = self.get_tokenizers(do_lower_case=False) @@ -1896,6 +1914,13 @@ def test_alignement_methods(self): batch_encoding.word_to_chars(last_batch_index, last_word_index).end, last_char_index + 1 ) + # Assert token_to_sequence + self.assertEqual(encoding.token_to_sequence(num_tokens // 2), 0) + self.assertEqual(encoding.token_to_sequence(0, num_tokens // 2), 0) + self.assertEqual(batch_encoding.token_to_sequence(1, num_tokens // 2), 0) + self.assertEqual(batch_encoding.token_to_sequence(0, num_tokens // 2), 0) + self.assertEqual(batch_encoding.token_to_sequence(last_batch_index, num_tokens // 2), 0) + # Pair of input sequences words = ["Wonderful", "no", "inspiration", "example", "with", "subtoken"] @@ -2004,6 +2029,29 @@ def test_alignement_methods(self): pair_text[pair_batch_encoding.word_to_chars(1, index_word_in_pair_seq, sequence_index=1).start], ) + # Assert token_to_sequence + pair_encoding = tokenizer_r.encode_plus(text, pair_text, add_special_tokens=True) + + pair_sequence_ids = [ + pair_encoding.token_to_sequence(i) for i in range(len(pair_encoding["input_ids"])) + ] + self.assertIn(0, pair_sequence_ids) + self.assertIn(1, pair_sequence_ids) + if tokenizer_r.num_special_tokens_to_add(pair=True): + self.assertIn(None, pair_sequence_ids) + + 
pair_batch_encoding = tokenizer_r.batch_encode_plus( + [(text, pair_text)] * batch_size, add_special_tokens=True + ) + pair_batch_sequence_ids = [ + pair_batch_encoding.token_to_sequence(1, i) + for i in range(len(pair_batch_encoding["input_ids"][0])) + ] + self.assertIn(0, pair_batch_sequence_ids) + self.assertIn(1, pair_batch_sequence_ids) + if tokenizer_r.num_special_tokens_to_add(pair=True): + self.assertIn(None, pair_batch_sequence_ids) + def test_tokenization_python_rust_equals(self): for tokenizer, pretrained_name, kwargs in self.tokenizers_list: with self.subTest("{} ({})".format(tokenizer.__class__.__name__, pretrained_name)): From ef4919ba22879945fc831fdb9dfc8a9bbfa8eae4 Mon Sep 17 00:00:00 2001 From: Thomas Wolf Date: Tue, 10 Nov 2020 10:42:22 +0100 Subject: [PATCH 08/24] update to tokenizers 0.9.4 --- setup.py | 4 ++-- src/transformers/tokenization_utils_base.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/setup.py b/setup.py index 04c51912fdc9..a13a5b9bda5e 100644 --- a/setup.py +++ b/setup.py @@ -96,7 +96,7 @@ extras["retrieval"] = ["faiss-cpu", "datasets"] extras["flax"] = ["jaxlib==0.1.55", "jax>=0.2.0", "flax==0.2.2"] -extras["tokenizers"] = ["tokenizers==0.9.2"] +extras["tokenizers"] = ["tokenizers==0.9.4"] extras["onnxruntime"] = ["onnxruntime>=1.4.0", "onnxruntime-tools>=1.4.2"] extras["serving"] = ["pydantic", "uvicorn", "fastapi", "starlette"] @@ -129,7 +129,7 @@ packages=find_packages("src"), install_requires=[ "numpy", - "tokenizers == 0.9.3", + "tokenizers == 0.9.4", # dataclasses for Python versions that don't have it "dataclasses;python_version<'3.7'", # utilities from PyPA to e.g. compare versions diff --git a/src/transformers/tokenization_utils_base.py b/src/transformers/tokenization_utils_base.py index b7f1a71ddd0c..f9eb8566da44 100644 --- a/src/transformers/tokenization_utils_base.py +++ b/src/transformers/tokenization_utils_base.py @@ -348,7 +348,7 @@ def sequence_ids(self, batch_index: int = 0) -> List[Optional[int]]: """ if not self._encodings: raise ValueError("sequence_ids() is not available when using Python-based tokenizers") - return self._encodings[batch_index].sequences + return self._encodings[batch_index].sequence_ids def words(self, batch_index: int = 0) -> List[Optional[int]]: """ @@ -385,7 +385,7 @@ def word_ids(self, batch_index: int = 0) -> List[Optional[int]]: """ if not self._encodings: raise ValueError("word_ids() is not available when using Python-based tokenizers") - return self._encodings[batch_index].words + return self._encodings[batch_index].word_ids def token_to_sequence(self, batch_or_token_index: int, token_index: Optional[int] = None) -> int: """ From dab816880c7a6762ba898153024e739d740c2ce6 Mon Sep 17 00:00:00 2001 From: Thomas Wolf Date: Tue, 10 Nov 2020 13:07:03 +0100 Subject: [PATCH 09/24] set sentencepiecce as optional --- setup.py | 2 +- tests/test_tokenization_xlm_prophetnet.py | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index a13a5b9bda5e..fba83903a205 100644 --- a/setup.py +++ b/setup.py @@ -143,7 +143,7 @@ # for OpenAI GPT "regex != 2019.12.17", # for SentencePiece models - "sentencepiece == 0.1.91", + # "sentencepiece == 0.1.91", "protobuf", # for XLM "sacremoses", diff --git a/tests/test_tokenization_xlm_prophetnet.py b/tests/test_tokenization_xlm_prophetnet.py index 83097ff71d71..7dfdee6b5f8a 100644 --- a/tests/test_tokenization_xlm_prophetnet.py +++ b/tests/test_tokenization_xlm_prophetnet.py @@ -18,7 +18,7 @@ import unittest from 
transformers.file_utils import cached_property
-from transformers.testing_utils import slow
+from transformers.testing_utils import require_sentencepiece, slow
 from transformers.tokenization_xlm_prophetnet import SPIECE_UNDERLINE, XLMProphetNetTokenizer
 
 from .test_tokenization_common import TokenizerTesterMixin
@@ -27,6 +27,7 @@
 SAMPLE_VOCAB = os.path.join(os.path.dirname(os.path.abspath(__file__)), "fixtures/test_sentencepiece.model")
 
 
+@require_sentencepiece
 class XLMProphetNetTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
 
     tokenizer_class = XLMProphetNetTokenizer

From 84bc2444decc6f910061bf76598f5917c599a30a Mon Sep 17 00:00:00 2001
From: Thomas Wolf
Date: Tue, 10 Nov 2020 14:29:11 +0100
Subject: [PATCH 10/24] clean up squad

---
 src/transformers/data/processors/squad.py | 15 ---------------
 1 file changed, 15 deletions(-)

diff --git a/src/transformers/data/processors/squad.py b/src/transformers/data/processors/squad.py
index 5dbfed40cb9e..167cf3ee48d9 100644
--- a/src/transformers/data/processors/squad.py
+++ b/src/transformers/data/processors/squad.py
@@ -350,20 +350,6 @@ def squad_convert_examples_to_features(
     # Defining helper methods
     features = []
 
-    #################
-    # squad_convert_example_to_features_init(tokenizer)
-    # for example in examples:
-    #     feature = squad_convert_example_to_features(
-    #         example,
-    #         max_seq_length=max_seq_length,
-    #         doc_stride=doc_stride,
-    #         max_query_length=max_query_length,
-    #         padding_strategy=padding_strategy,
-    #         is_training=is_training,
-    #     )
-    #     features.append(feature)
-
-    #################
     threads = min(threads, cpu_count())
     with Pool(threads, initializer=squad_convert_example_to_features_init, initargs=(tokenizer,)) as p:
         annotate_ = partial(
@@ -382,7 +368,6 @@ def squad_convert_examples_to_features(
             disable=not tqdm_enabled,
         )
     )
-    #################
 
     new_features = []
     unique_id = 1000000000

From 751ee692d7486c33f719b81768b658ab70b0d025 Mon Sep 17 00:00:00 2001
From: Thomas Wolf
Date: Tue, 10 Nov 2020 14:43:10 +0100
Subject: [PATCH 11/24] clean up pipelines to use sequence_ids

---
 src/transformers/pipelines.py | 14 +++++++++++---
 1 file changed, 11 insertions(+), 3 deletions(-)

diff --git a/src/transformers/pipelines.py b/src/transformers/pipelines.py
index 55a8b033a06e..ce3dcab24d36 100755
--- a/src/transformers/pipelines.py
+++ b/src/transformers/pipelines.py
@@ -1808,13 +1808,21 @@ def __call__(self, *args, **kwargs):
                 return_special_tokens_mask=True,
             )
 
+            # When the input is too long, it's converted into a batch of inputs with overflowing tokens
+            # and a stride of overlap between the inputs. If a batch of inputs is given, a special output
+            # "overflow_to_sample_mapping" indicates which member of the encoded batch belongs to which original batch sample.
+            # Here we tokenize examples one-by-one so we don't need to use "overflow_to_sample_mapping".
+            # "num_spans" is the number of output samples generated from the overflowing tokens.
num_spans = len(encoded_inputs["input_ids"]) # p_mask: mask with 1 for token than cannot be in the answer (0 for token which can be in an answer) - p_mask = encoded_inputs["token_type_ids"] == (0 if question_first else 1) # Mask the question - p_mask = p_mask | encoded_inputs["special_tokens_mask"] # And mask the special tokens + # We put 0 on the tokens from the context and 1 everywhere else (question and special tokens) + p_mask = np.asarray([[tok != 1 if question_first else 0 + for tok in encoded_inputs.sequence_ids(span_id)] + for span_id in range(num_spans)]) + + # keep the cls_token unmasked (some models use it to indicate unanswerable questions) if self.tokenizer.cls_token_id: - # keep the cls_token unmasked (some models use it to indicate unanswerable questions) cls_index = np.nonzero(encoded_inputs["input_ids"] == self.tokenizer.cls_token_id) p_mask[cls_index] = 0 From 0e8d7f7020ff0787b7da7f88d5fd89d71712a30a Mon Sep 17 00:00:00 2001 From: Thomas Wolf Date: Tue, 10 Nov 2020 15:50:44 +0100 Subject: [PATCH 12/24] style/quality --- src/transformers/pipelines.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/src/transformers/pipelines.py b/src/transformers/pipelines.py index ce3dcab24d36..aaba696c702e 100755 --- a/src/transformers/pipelines.py +++ b/src/transformers/pipelines.py @@ -1817,9 +1817,12 @@ def __call__(self, *args, **kwargs): # p_mask: mask with 1 for token than cannot be in the answer (0 for token which can be in an answer) # We put 0 on the tokens from the context and 1 everywhere else (question and special tokens) - p_mask = np.asarray([[tok != 1 if question_first else 0 - for tok in encoded_inputs.sequence_ids(span_id)] - for span_id in range(num_spans)]) + p_mask = np.asarray( + [ + [tok != 1 if question_first else 0 for tok in encoded_inputs.sequence_ids(span_id)] + for span_id in range(num_spans) + ] + ) # keep the cls_token unmasked (some models use it to indicate unanswerable questions) if self.tokenizer.cls_token_id: From eb72b1fe5cb98275e5ff1eda5ce21db69a27a34d Mon Sep 17 00:00:00 2001 From: Thomas Wolf Date: Tue, 10 Nov 2020 16:01:06 +0100 Subject: [PATCH 13/24] wording --- src/transformers/tokenization_utils_base.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/transformers/tokenization_utils_base.py b/src/transformers/tokenization_utils_base.py index 4a9ffd1d3323..5d0e50add8f1 100644 --- a/src/transformers/tokenization_utils_base.py +++ b/src/transformers/tokenization_utils_base.py @@ -367,8 +367,8 @@ def words(self, batch_index: int = 0) -> List[Optional[int]]: if not self._encodings: raise ValueError("words() is not available when using Python-based tokenizers") warnings.warn( - "`BatchEncoding.words(batch_index: int = 0)` propperty is deprecated and should be replaced with the identical, " - "but more self-explanatory `BatchEncoding.words(batch_index: int = 0)` property.", + "`BatchEncoding.words()` property is deprecated and should be replaced with the identical, " + "but more self-explanatory `BatchEncoding.word_ids()` property.", FutureWarning, ) return self.word_ids(batch_index) From 16da2c54244e448ff5da7e16380e7b56ce2b1ac6 Mon Sep 17 00:00:00 2001 From: Thomas Wolf Date: Tue, 10 Nov 2020 16:06:13 +0100 Subject: [PATCH 14/24] Switch to use_fast = True by default --- src/transformers/pipelines.py | 4 ++-- src/transformers/tokenization_auto.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/transformers/pipelines.py b/src/transformers/pipelines.py index 
aaba696c702e..8cc533d980ee 100755 --- a/src/transformers/pipelines.py +++ b/src/transformers/pipelines.py @@ -2840,7 +2840,7 @@ def pipeline( tokenizer: Optional[Union[str, PreTrainedTokenizer]] = None, framework: Optional[str] = None, revision: Optional[str] = None, - use_fast: bool = False, + use_fast: bool = True, **kwargs ) -> Pipeline: """ @@ -2898,7 +2898,7 @@ def pipeline( When passing a task name or a string model identifier: The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a git-based system for storing models and other artifacts on huggingface.co, so ``revision`` can be any identifier allowed by git. - use_fast (:obj:`bool`, `optional`, defaults to :obj:`False`): + use_fast (:obj:`bool`, `optional`, defaults to :obj:`True`): Whether or not to use a Fast tokenizer if possible (a :class:`~transformers.PreTrainedTokenizerFast`). kwargs: Additional keyword arguments passed along to the specific pipeline init (see the documentation for the diff --git a/src/transformers/tokenization_auto.py b/src/transformers/tokenization_auto.py index 93c9fbfe64a9..7e375d05986b 100644 --- a/src/transformers/tokenization_auto.py +++ b/src/transformers/tokenization_auto.py @@ -280,7 +280,7 @@ def from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs): The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a git-based system for storing models and other artifacts on huggingface.co, so ``revision`` can be any identifier allowed by git. - use_fast (:obj:`bool`, `optional`, defaults to :obj:`False`): + use_fast (:obj:`bool`, `optional`, defaults to :obj:`True`): Whether or not to try to load the fast version of the tokenizer. kwargs (additional keyword arguments, `optional`): Will be passed to the Tokenizer ``__init__()`` method. 
Can be used to set special tokens like @@ -308,7 +308,7 @@ def from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs): if "bert-base-japanese" in str(pretrained_model_name_or_path): return BertJapaneseTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs) - use_fast = kwargs.pop("use_fast", False) + use_fast = kwargs.pop("use_fast", True) if config.tokenizer_class is not None: if use_fast and not config.tokenizer_class.endswith("Fast"): From 0f03fdb6e73aa4d9679dc16462556821878279d2 Mon Sep 17 00:00:00 2001 From: Thomas Wolf Date: Tue, 10 Nov 2020 16:25:38 +0100 Subject: [PATCH 15/24] update tests for use_fast at True by default --- tests/test_pipelines_ner.py | 2 +- tests/test_tokenization_auto.py | 4 ++-- tests/test_tokenization_rag.py | 8 ++++---- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/tests/test_pipelines_ner.py b/tests/test_pipelines_ner.py index bc12900d8422..44f47d66d99b 100644 --- a/tests/test_pipelines_ner.py +++ b/tests/test_pipelines_ner.py @@ -149,7 +149,7 @@ def test_pt_ignore_subwords_slow_tokenizer_raises(self): tokenizer = AutoTokenizer.from_pretrained(model_name) with self.assertRaises(ValueError): - pipeline(task="ner", model=model_name, tokenizer=tokenizer, ignore_subwords=True) + pipeline(task="ner", model=model_name, tokenizer=tokenizer, ignore_subwords=True, use_fast=False) @require_torch def test_pt_defaults_slow_tokenizer(self): diff --git a/tests/test_tokenization_auto.py b/tests/test_tokenization_auto.py index 390e89b08939..e06d7800bb1d 100644 --- a/tests/test_tokenization_auto.py +++ b/tests/test_tokenization_auto.py @@ -116,5 +116,5 @@ def test_parents_and_children_in_mappings(self): @require_tokenizers def test_from_pretrained_use_fast_toggle(self): - self.assertIsInstance(AutoTokenizer.from_pretrained("bert-base-cased"), BertTokenizer) - self.assertIsInstance(AutoTokenizer.from_pretrained("bert-base-cased", use_fast=True), BertTokenizerFast) + self.assertIsInstance(AutoTokenizer.from_pretrained("bert-base-cased", use_fast=False), BertTokenizer) + self.assertIsInstance(AutoTokenizer.from_pretrained("bert-base-cased"), BertTokenizerFast) diff --git a/tests/test_tokenization_rag.py b/tests/test_tokenization_rag.py index 158aadca6940..3a2551b3859e 100644 --- a/tests/test_tokenization_rag.py +++ b/tests/test_tokenization_rag.py @@ -4,13 +4,12 @@ import tempfile from unittest import TestCase +from transformers import BartTokenizer, DPRQuestionEncoderTokenizer, DPRQuestionEncoderTokenizerFast from transformers.configuration_bart import BartConfig from transformers.configuration_dpr import DPRConfig from transformers.file_utils import is_datasets_available, is_faiss_available, is_torch_available -from transformers.testing_utils import require_datasets, require_faiss, require_torch, slow -from transformers.tokenization_bart import BartTokenizer +from transformers.testing_utils import require_datasets, require_faiss, require_tokenizers, require_torch, slow from transformers.tokenization_bert import VOCAB_FILES_NAMES as DPR_VOCAB_FILES_NAMES -from transformers.tokenization_dpr import DPRQuestionEncoderTokenizer from transformers.tokenization_roberta import VOCAB_FILES_NAMES as BART_VOCAB_FILES_NAMES @@ -96,6 +95,7 @@ def get_bart_tokenizer(self) -> BartTokenizer: def tearDown(self): shutil.rmtree(self.tmpdirname) + @require_tokenizers def test_save_load_pretrained_with_saved_config(self): save_dir = os.path.join(self.tmpdirname, "rag_tokenizer") @@ -104,7 +104,7 @@ def 
test_save_load_pretrained_with_saved_config(self): rag_config.save_pretrained(save_dir) rag_tokenizer.save_pretrained(save_dir) new_rag_tokenizer = RagTokenizer.from_pretrained(save_dir, config=rag_config) - self.assertIsInstance(new_rag_tokenizer.question_encoder, DPRQuestionEncoderTokenizer) + self.assertIsInstance(new_rag_tokenizer.question_encoder, DPRQuestionEncoderTokenizerFast) self.assertEqual(new_rag_tokenizer.question_encoder.vocab, rag_tokenizer.question_encoder.vocab) self.assertIsInstance(new_rag_tokenizer.generator, BartTokenizer) self.assertEqual(new_rag_tokenizer.generator.encoder, rag_tokenizer.generator.encoder) From 87cb801a7f115c835128aa408025958044aec5f6 Mon Sep 17 00:00:00 2001 From: Thomas Wolf Date: Tue, 10 Nov 2020 16:45:24 +0100 Subject: [PATCH 16/24] fix rag tokenizer test --- tests/test_tokenization_rag.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/test_tokenization_rag.py b/tests/test_tokenization_rag.py index 3a2551b3859e..63bdb541e61d 100644 --- a/tests/test_tokenization_rag.py +++ b/tests/test_tokenization_rag.py @@ -4,7 +4,7 @@ import tempfile from unittest import TestCase -from transformers import BartTokenizer, DPRQuestionEncoderTokenizer, DPRQuestionEncoderTokenizerFast +from transformers import BartTokenizer, BartTokenizerFast, DPRQuestionEncoderTokenizer, DPRQuestionEncoderTokenizerFast from transformers.configuration_bart import BartConfig from transformers.configuration_dpr import DPRConfig from transformers.file_utils import is_datasets_available, is_faiss_available, is_torch_available @@ -105,9 +105,9 @@ def test_save_load_pretrained_with_saved_config(self): rag_tokenizer.save_pretrained(save_dir) new_rag_tokenizer = RagTokenizer.from_pretrained(save_dir, config=rag_config) self.assertIsInstance(new_rag_tokenizer.question_encoder, DPRQuestionEncoderTokenizerFast) - self.assertEqual(new_rag_tokenizer.question_encoder.vocab, rag_tokenizer.question_encoder.vocab) - self.assertIsInstance(new_rag_tokenizer.generator, BartTokenizer) - self.assertEqual(new_rag_tokenizer.generator.encoder, rag_tokenizer.generator.encoder) + self.assertEqual(new_rag_tokenizer.question_encoder.get_vocab(), rag_tokenizer.question_encoder.get_vocab()) + self.assertIsInstance(new_rag_tokenizer.generator, BartTokenizerFast) + self.assertEqual(new_rag_tokenizer.generator.get_vocab(), rag_tokenizer.generator.get_vocab()) @slow def test_pretrained_token_nq_tokenizer(self): From 77ee69ff77d922a9c732d5276feace12f32716bc Mon Sep 17 00:00:00 2001 From: Thomas Wolf Date: Tue, 10 Nov 2020 17:00:14 +0100 Subject: [PATCH 17/24] removing protobuf from required dependencies --- setup.py | 4 +--- src/transformers/convert_slow_tokenizer.py | 19 +++++++-------- src/transformers/file_utils.py | 27 ++++++++++++++++++++++ 3 files changed, 36 insertions(+), 14 deletions(-) diff --git a/setup.py b/setup.py index fba83903a205..7a43c7085aa3 100644 --- a/setup.py +++ b/setup.py @@ -101,6 +101,7 @@ extras["serving"] = ["pydantic", "uvicorn", "fastapi", "starlette"] +extras["protobuf"] = ["protobuf"] extras["sentencepiece"] = ["sentencepiece==0.1.91"] extras["retrieval"] = ["faiss-cpu", "datasets"] extras["testing"] = ["pytest", "pytest-xdist", "timeout-decorator", "parameterized", "psutil"] + extras["retrieval"] @@ -142,9 +143,6 @@ "tqdm >= 4.27", # for OpenAI GPT "regex != 2019.12.17", - # for SentencePiece models - # "sentencepiece == 0.1.91", - "protobuf", # for XLM "sacremoses", ], diff --git a/src/transformers/convert_slow_tokenizer.py 
b/src/transformers/convert_slow_tokenizer.py index 8c765943c217..e856d1196768 100644 --- a/src/transformers/convert_slow_tokenizer.py +++ b/src/transformers/convert_slow_tokenizer.py @@ -24,10 +24,7 @@ from tokenizers import Tokenizer, decoders, normalizers, pre_tokenizers, processors from tokenizers.models import BPE, Unigram, WordPiece -# from transformers.tokenization_openai import OpenAIGPTTokenizer -from transformers.utils import sentencepiece_model_pb2 as model - -from .file_utils import requires_sentencepiece +from .file_utils import requires_sentencepiece, requires_protobuf class SentencePieceExtractor: @@ -64,12 +61,6 @@ def check_number_comma(piece: str) -> bool: return len(piece) < 2 or piece[-1] != "," or not piece[-2].isdigit() -def get_proto(filename: str): - m = model.ModelProto() - m.ParseFromString(open(filename, "rb").read()) - return m - - class Converter: def __init__(self, original_tokenizer): self.original_tokenizer = original_tokenizer @@ -292,8 +283,14 @@ def converted(self) -> Tokenizer: class SpmConverter(Converter): def __init__(self, *args): + requires_protobuf(self) + super().__init__(*args) - self.proto = get_proto(self.original_tokenizer.vocab_file) + + from .utils import sentencepiece_model_pb2 as model_pb2 + m = model_pb2.ModelProto() + m.ParseFromString(open(self.original_tokenizer.vocab_file, "rb").read()) + self.proto = m def vocab(self, proto): return [(piece.piece, piece.score) for piece in proto.pieces] diff --git a/src/transformers/file_utils.py b/src/transformers/file_utils.py index f6b63fa8962f..3c03a451e899 100644 --- a/src/transformers/file_utils.py +++ b/src/transformers/file_utils.py @@ -185,6 +185,15 @@ _sentencepiece_available = False +try: + import protobuf # noqa: F401 + + _protobuf_available = True + +except ImportError: + _protobuf_available = False + + try: import tokenizers # noqa: F401 @@ -270,6 +279,10 @@ def is_sentencepiece_available(): return _sentencepiece_available +def is_protobuf_available(): + return _protobuf_available + + def is_tokenizers_available(): return _tokenizers_available @@ -330,6 +343,14 @@ def wrapper(*args, **kwargs): """ +# docstyle-ignore +PROTOBUF_IMPORT_ERROR = """ +{0} requires the protobuf library but it was not found in your environment. Checkout the instructions on the +installation page of its repo: https://github.com/protocolbuffers/protobuf/tree/master/python#installation and follow the ones +that match your environment. +""" + + # docstyle-ignore FAISS_IMPORT_ERROR = """ {0} requires the faiss library but it was not found in your environment. 
Checkout the instructions on the @@ -420,6 +441,12 @@ def requires_sentencepiece(obj): raise ImportError(SENTENCEPIECE_IMPORT_ERROR.format(name)) +def requires_protobuf(obj): + name = obj.__name__ if hasattr(obj, "__name__") else obj.__class__.__name__ + if not is_protobuf_available(): + raise ImportError(PROTOBUF_IMPORT_ERROR.format(name)) + + def add_start_docstrings(*docstr): def docstring_decorator(fn): fn.__doc__ = "".join(docstr) + (fn.__doc__ if fn.__doc__ is not None else "") From 14839277cf9bb8d71801989c4309d8770cc63fd4 Mon Sep 17 00:00:00 2001 From: Thomas Wolf Date: Tue, 10 Nov 2020 17:16:30 +0100 Subject: [PATCH 18/24] fix NER test for use_fast = True by default --- src/transformers/convert_slow_tokenizer.py | 3 ++- tests/test_pipelines_ner.py | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/src/transformers/convert_slow_tokenizer.py b/src/transformers/convert_slow_tokenizer.py index e856d1196768..7e988e7fdd73 100644 --- a/src/transformers/convert_slow_tokenizer.py +++ b/src/transformers/convert_slow_tokenizer.py @@ -24,7 +24,7 @@ from tokenizers import Tokenizer, decoders, normalizers, pre_tokenizers, processors from tokenizers.models import BPE, Unigram, WordPiece -from .file_utils import requires_sentencepiece, requires_protobuf +from .file_utils import requires_protobuf, requires_sentencepiece class SentencePieceExtractor: @@ -288,6 +288,7 @@ def __init__(self, *args): super().__init__(*args) from .utils import sentencepiece_model_pb2 as model_pb2 + m = model_pb2.ModelProto() m.ParseFromString(open(self.original_tokenizer.vocab_file, "rb").read()) self.proto = m diff --git a/tests/test_pipelines_ner.py b/tests/test_pipelines_ner.py index 44f47d66d99b..58da4aded63e 100644 --- a/tests/test_pipelines_ner.py +++ b/tests/test_pipelines_ner.py @@ -146,7 +146,7 @@ def test_tf_small_ignore_subwords_available_for_fast_tokenizers(self): @require_torch def test_pt_ignore_subwords_slow_tokenizer_raises(self): for model_name in self.small_models: - tokenizer = AutoTokenizer.from_pretrained(model_name) + tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=False) with self.assertRaises(ValueError): pipeline(task="ner", model=model_name, tokenizer=tokenizer, ignore_subwords=True, use_fast=False) From b115646c8a27b0c8aa610756e58a061e3f759d61 Mon Sep 17 00:00:00 2001 From: Thomas Wolf Date: Tue, 10 Nov 2020 17:55:40 +0100 Subject: [PATCH 19/24] fixing example tests (Q&A examples use slow tokenizers for now) --- examples/question-answering/run_squad.py | 6 +++++- examples/question-answering/run_squad_trainer.py | 1 + src/transformers/tokenization_utils_base.py | 2 ++ src/transformers/tokenization_utils_fast.py | 2 ++ 4 files changed, 10 insertions(+), 1 deletion(-) diff --git a/examples/question-answering/run_squad.py b/examples/question-answering/run_squad.py index 59550347c275..4f8fe05a8645 100644 --- a/examples/question-answering/run_squad.py +++ b/examples/question-answering/run_squad.py @@ -730,6 +730,7 @@ def main(): args.tokenizer_name if args.tokenizer_name else args.model_name_or_path, do_lower_case=args.do_lower_case, cache_dir=args.cache_dir if args.cache_dir else None, + use_fast=False, # SquadDataset is not compatible with Fast tokenizers which have a smarter overflow handeling ) model = AutoModelForQuestionAnswering.from_pretrained( args.model_name_or_path, @@ -778,7 +779,10 @@ def main(): # Load a trained model and vocabulary that you have fine-tuned model = AutoModelForQuestionAnswering.from_pretrained(args.output_dir) # , force_download=True) - 
tokenizer = AutoTokenizer.from_pretrained(args.output_dir, do_lower_case=args.do_lower_case) + + # SquadDataset is not compatible with Fast tokenizers which have a smarter overflow handeling + # So we use use_fast=False here for now until Fast-tokenizer-compatible-examples are out + tokenizer = AutoTokenizer.from_pretrained(args.output_dir, do_lower_case=args.do_lower_case, use_fast=False) model.to(args.device) # Evaluation - we can ask to evaluate all the checkpoints (sub-directories) in a directory diff --git a/examples/question-answering/run_squad_trainer.py b/examples/question-answering/run_squad_trainer.py index d5fc0723164a..0bb357b21e8e 100644 --- a/examples/question-answering/run_squad_trainer.py +++ b/examples/question-answering/run_squad_trainer.py @@ -107,6 +107,7 @@ def main(): tokenizer = AutoTokenizer.from_pretrained( model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path, cache_dir=model_args.cache_dir, + use_fast=False, # SquadDataset is not compatible with Fast tokenizers which have a smarter overflow handeling ) model = AutoModelForQuestionAnswering.from_pretrained( model_args.model_name_or_path, diff --git a/src/transformers/tokenization_utils_base.py b/src/transformers/tokenization_utils_base.py index 5d0e50add8f1..a05e06aad63a 100644 --- a/src/transformers/tokenization_utils_base.py +++ b/src/transformers/tokenization_utils_base.py @@ -1992,6 +1992,8 @@ def _save_pretrained( "Only fast tokenizers (instances of PretrainedTokenizerFast) can be saved in non legacy format." ) + save_directory = str(save_directory) + added_tokens_file = os.path.join( save_directory, (filename_prefix + "-" if filename_prefix else "") + ADDED_TOKENS_FILE ) diff --git a/src/transformers/tokenization_utils_fast.py b/src/transformers/tokenization_utils_fast.py index 6c68c44e6741..c672a0b02ef2 100644 --- a/src/transformers/tokenization_utils_fast.py +++ b/src/transformers/tokenization_utils_fast.py @@ -527,6 +527,8 @@ def _save_pretrained( Fast tokenizers can also be saved in a unique JSON file containing {config + vocab + added-tokens} using the specific :meth:`~transformers.PreTrainedTokenizerFast._save_pretrained` """ + save_directory = str(save_directory) + if legacy_format: added_tokens_file = os.path.join( save_directory, (filename_prefix + "-" if filename_prefix else "") + ADDED_TOKENS_FILE From 56f77e878586356a24c5199824e783742a85712d Mon Sep 17 00:00:00 2001 From: Thomas Wolf Date: Tue, 10 Nov 2020 18:09:35 +0100 Subject: [PATCH 20/24] protobuf in main deps extras["sentencepiece"] and example deps --- examples/requirements.txt | 1 + setup.py | 3 +-- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/requirements.txt b/examples/requirements.txt index 9c2704796789..1ce783440f6e 100644 --- a/examples/requirements.txt +++ b/examples/requirements.txt @@ -18,3 +18,4 @@ fire pytest conllu sentencepiece != 0.1.92 +protobuf diff --git a/setup.py b/setup.py index 7a43c7085aa3..7e7e34661b6f 100644 --- a/setup.py +++ b/setup.py @@ -101,8 +101,7 @@ extras["serving"] = ["pydantic", "uvicorn", "fastapi", "starlette"] -extras["protobuf"] = ["protobuf"] -extras["sentencepiece"] = ["sentencepiece==0.1.91"] +extras["sentencepiece"] = ["sentencepiece==0.1.91", "protobuf"] extras["retrieval"] = ["faiss-cpu", "datasets"] extras["testing"] = ["pytest", "pytest-xdist", "timeout-decorator", "parameterized", "psutil"] + extras["retrieval"] # sphinx-rtd-theme==0.5.0 introduced big changes in the style. 
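For reference, a minimal usage sketch of the sequence-id API added earlier in this series (illustrative only; the checkpoint name and the mask convention are assumptions drawn from the diffs above, not part of any commit):

    from transformers import AutoTokenizer

    # After this series, AutoTokenizer returns a fast (Rust-backed) tokenizer by default.
    tokenizer = AutoTokenizer.from_pretrained("roberta-base")  # any fast tokenizer works here

    question = "Who wrote the book?"
    context = "The book was written by a famous author."
    encoded = tokenizer(question, context)

    # None for special tokens, 0 for question tokens, 1 for context tokens.
    print(encoded.sequence_ids())

    # word_ids() is the new name for the deprecated words() accessor.
    print(encoded.word_ids())

    # Roughly how the question-answering pipeline now builds p_mask:
    # mask (1) every token that does not belong to the context (sequence id != 1).
    p_mask = [0 if seq_id == 1 else 1 for seq_id in encoded.sequence_ids()]
    print(p_mask)

Unlike the previous token_type_ids-based masking, this works for tokenizers such as RoBERTa's that do not produce distinguishing token type ids, which is the assumption the slow/fast equivalence tests in this series rely on.
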
From 6894fc0b8bc15718adffde664f3b3e6c9e745f24 Mon Sep 17 00:00:00 2001 From: Thomas Wolf Date: Tue, 10 Nov 2020 19:32:03 +0100 Subject: [PATCH 21/24] fix protobug install test --- src/transformers/file_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/file_utils.py b/src/transformers/file_utils.py index 3c03a451e899..374b10dafabe 100644 --- a/src/transformers/file_utils.py +++ b/src/transformers/file_utils.py @@ -186,7 +186,7 @@ try: - import protobuf # noqa: F401 + import google.protobuf # noqa: F401 _protobuf_available = True From 2441d401ceb02c97f6783395f42816c3408f65c7 Mon Sep 17 00:00:00 2001 From: Thomas Wolf Date: Tue, 10 Nov 2020 19:38:02 +0100 Subject: [PATCH 22/24] try to fix seq2seq by switching to slow tokenizers for now --- examples/seq2seq/test_datasets.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/seq2seq/test_datasets.py b/examples/seq2seq/test_datasets.py index 625b6da347d3..4cbce79eaa92 100644 --- a/examples/seq2seq/test_datasets.py +++ b/examples/seq2seq/test_datasets.py @@ -197,7 +197,7 @@ def test_distributed_sortish_sampler_splits_indices_between_procs(self): ) @require_torch_non_multigpu_but_fix_me def test_dataset_kwargs(self, tok_name): - tokenizer = AutoTokenizer.from_pretrained(tok_name) + tokenizer = AutoTokenizer.from_pretrained(tok_name, use_fast=False) if tok_name == MBART_TINY: train_dataset = Seq2SeqDataset( tokenizer, From fc2daadeeb3cfe2fb43ce5edf49478bb734f5f1d Mon Sep 17 00:00:00 2001 From: Thomas Wolf Date: Tue, 10 Nov 2020 21:01:58 +0100 Subject: [PATCH 23/24] Update src/transformers/tokenization_utils_base.py Co-authored-by: Lysandre Debut --- src/transformers/tokenization_utils_base.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/transformers/tokenization_utils_base.py b/src/transformers/tokenization_utils_base.py index a05e06aad63a..744e2440ba3a 100644 --- a/src/transformers/tokenization_utils_base.py +++ b/src/transformers/tokenization_utils_base.py @@ -336,8 +336,8 @@ def sequence_ids(self, batch_index: int = 0) -> List[Optional[int]]: Return a list mapping the tokens to the id of their original sentences: - :obj:`None` for special tokens added around or between sequences, - - :obj:`0` for tokens coresponding to words in the first sequence, - - :obj:`1` for tokens coresponding to words in the second sequence when a pair of sequences was jointly + - :obj:`0` for tokens corresponding to words in the first sequence, + - :obj:`1` for tokens corresponding to words in the second sequence when a pair of sequences was jointly encoded. Args: From 002848bf7dcd4c18ea957c542e3b0f099a641214 Mon Sep 17 00:00:00 2001 From: Thomas Wolf Date: Tue, 10 Nov 2020 21:02:08 +0100 Subject: [PATCH 24/24] Update src/transformers/tokenization_utils_base.py Co-authored-by: Lysandre Debut --- src/transformers/tokenization_utils_base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/tokenization_utils_base.py b/src/transformers/tokenization_utils_base.py index 744e2440ba3a..a7581b70f8c6 100644 --- a/src/transformers/tokenization_utils_base.py +++ b/src/transformers/tokenization_utils_base.py @@ -405,7 +405,7 @@ def token_to_sequence(self, batch_or_token_index: int, token_index: Optional[int Args: batch_or_token_index (:obj:`int`): - Index of the sequence in the batch. If the batch only comprise one sequence, this can be the index of + Index of the sequence in the batch. 
If the batch only comprises one sequence, this can be the index of the token in the sequence. token_index (:obj:`int`, `optional`): If a batch index is provided in `batch_or_token_index`, this can be the index of the token in the