From 4bc3b3f6cb886324a79a0951539048c512f9ae7e Mon Sep 17 00:00:00 2001 From: Thomas Wolf Date: Mon, 26 Oct 2020 23:11:51 +0100 Subject: [PATCH 01/24] Fixing roberta for slow-fast tests --- src/transformers/tokenization_roberta_fast.py | 27 ++++++++ src/transformers/tokenization_utils_base.py | 4 +- tests/test_pipelines_common.py | 67 ++++++++++++++++++- 3 files changed, 96 insertions(+), 2 deletions(-) diff --git a/src/transformers/tokenization_roberta_fast.py b/src/transformers/tokenization_roberta_fast.py index 3709aec944fe..c94891542e01 100644 --- a/src/transformers/tokenization_roberta_fast.py +++ b/src/transformers/tokenization_roberta_fast.py @@ -18,6 +18,7 @@ from .tokenization_gpt2_fast import GPT2TokenizerFast from .tokenization_roberta import RobertaTokenizer +from .tokenization_utils_base import AddedToken from .utils import logging @@ -172,6 +173,32 @@ def __init__( **kwargs, ) + @property + def mask_token(self) -> str: + """ + :obj:`str`: Mask token, to use when training a model with masked-language modeling. Log an error if used while + not having been set. + + Roberta tokenizer has a special mask token to be usble in the fill-mask pipeline. + The mask token will greedily comprise the space before the ``. + """ + if self._mask_token is None and self.verbose: + logger.error("Using mask_token, but it is not set yet.") + return None + return str(self._mask_token) + + @mask_token.setter + def mask_token(self, value): + """ Overriding the default behavior of the mask token to have it eat the space before it. + + This is needed to preserve backward compatibility with all the previously used models + based on Roberta. + """ + # Mask token behave like a normal word, i.e. include the space before it + # So we set lstrip to True + value = AddedToken(value, lstrip=True, rstrip=False) if isinstance(value, str) else value + self._mask_token = value + def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None): output = [self.bos_token_id] + token_ids_0 + [self.eos_token_id] if token_ids_1 is None: diff --git a/src/transformers/tokenization_utils_base.py b/src/transformers/tokenization_utils_base.py index 0622e78c8b3a..7e7b10c5096f 100644 --- a/src/transformers/tokenization_utils_base.py +++ b/src/transformers/tokenization_utils_base.py @@ -180,7 +180,9 @@ def to_py_obj(obj): """ Convert a TensorFlow tensor, PyTorch tensor, Numpy array or python list to a python list. 
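The hunk that follows extends `to_py_obj` so that dict-like objects (including `BatchEncoding`) are converted recursively; the new slow/fast comparison tests rely on this when they normalize the captured model inputs before comparing them. A minimal, self-contained sketch of the same recursion — using plain dicts and NumPy arrays for illustration, whereas the real helper also handles PyTorch and TensorFlow tensors — could look like this:

```python
from typing import Any

import numpy as np


def to_py_obj_sketch(obj: Any) -> Any:
    """Recursively convert dicts, lists/tuples and NumPy arrays to plain Python objects."""
    if isinstance(obj, dict):  # the real code also matches BatchEncoding here
        return {k: to_py_obj_sketch(v) for k, v in obj.items()}
    if isinstance(obj, (list, tuple)):
        return [to_py_obj_sketch(o) for o in obj]
    if isinstance(obj, np.ndarray):
        return obj.tolist()
    return obj


batch = {"input_ids": np.array([[0, 31414, 2]]), "attention_mask": [np.array([1, 1, 1])]}
print(to_py_obj_sketch(batch))
# {'input_ids': [[0, 31414, 2]], 'attention_mask': [[1, 1, 1]]}
```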
""" - if isinstance(obj, (list, tuple)): + if isinstance(obj, (dict, BatchEncoding)): + return {k: to_py_obj(v) for k, v in obj.items()} + elif isinstance(obj, (list, tuple)): return [to_py_obj(o) for o in obj] elif is_tf_available() and isinstance(obj, tf.Tensor): return obj.numpy().tolist() diff --git a/tests/test_pipelines_common.py b/tests/test_pipelines_common.py index d6acea2da6cc..37a348449554 100644 --- a/tests/test_pipelines_common.py +++ b/tests/test_pipelines_common.py @@ -1,7 +1,9 @@ import unittest +from unittest import mock from typing import List, Optional from transformers import is_tf_available, is_torch_available, pipeline +from transformers.tokenization_utils_base import to_py_obj from transformers.pipelines import DefaultArgumentHandler, Pipeline from transformers.testing_utils import _run_slow_tests, is_pipeline_test, require_tf, require_torch, slow @@ -83,7 +85,7 @@ def _test_pipeline(self, nlp: Pipeline): raise NotImplementedError -@is_pipeline_test +# @is_pipeline_test class MonoInputPipelineCommonMixin: pipeline_task = None pipeline_loading_kwargs = {} # Additional kwargs to load the pipeline with @@ -139,6 +141,69 @@ def test_tf_small(self): ) self._test_pipeline(nlp) + @require_torch + def test_compare_slow_fast_torch(self): + for model_name in self.small_models: + nlp_slow = pipeline( + task=self.pipeline_task, + model=model_name, + tokenizer=model_name, + framework="pt", + use_fast=False, + **self.pipeline_loading_kwargs, + ) + nlp_fast = pipeline( + task=self.pipeline_task, + model=model_name, + tokenizer=model_name, + framework="pt", + use_fast=True, + **self.pipeline_loading_kwargs, + ) + self._compare_slow_fast_pipelines(nlp_slow, nlp_fast) + + @require_tf + def test_compare_slow_fast_tf(self): + for model_name in self.small_models: + nlp_slow = pipeline( + task=self.pipeline_task, + model=model_name, + tokenizer=model_name, + framework="tf", + use_fast=False, + **self.pipeline_loading_kwargs, + ) + nlp_fast = pipeline( + task=self.pipeline_task, + model=model_name, + tokenizer=model_name, + framework="tf", + use_fast=True, + **self.pipeline_loading_kwargs, + ) + self._compare_slow_fast_pipelines(nlp_slow, nlp_fast) + + def _compare_slow_fast_pipelines(self, nlp_slow: Pipeline, nlp_fast: Pipeline): + with mock.patch.object(nlp_slow.model, 'forward', wraps=nlp_slow.model.forward) as mock_slow,\ + mock.patch.object(nlp_fast.model, 'forward', wraps=nlp_fast.model.forward) as mock_fast: + for inputs in self.valid_inputs: + outputs_slow = nlp_slow(inputs, **self.pipeline_running_kwargs) + outputs_fast = nlp_fast(inputs, **self.pipeline_running_kwargs) + + mock_slow.assert_called() + mock_fast.assert_called() + + slow_call_args, slow_call_kwargs = mock_slow.call_args + fast_call_args, fast_call_kwargs = mock_fast.call_args + + slow_call_args, slow_call_kwargs = to_py_obj(slow_call_args), to_py_obj(slow_call_kwargs) + fast_call_args, fast_call_kwargs = to_py_obj(fast_call_args), to_py_obj(fast_call_kwargs) + + self.assertEqual(slow_call_args, fast_call_args) + self.assertDictEqual(slow_call_kwargs, fast_call_kwargs) + + self.assertEqual(outputs_slow, outputs_fast) + @require_torch @slow def test_torch_large(self): From 1ce1c638350158f5ebeb2130b1f3a4f550252f32 Mon Sep 17 00:00:00 2001 From: Thomas Wolf Date: Mon, 26 Oct 2020 23:37:09 +0100 Subject: [PATCH 02/24] WIP getting equivalence on pipelines --- src/transformers/tokenization_roberta_fast.py | 6 +- tests/test_pipelines_common.py | 166 +++++++----------- tests/test_pipelines_dialog.py | 1 + 
tests/test_pipelines_zero_shot.py | 12 ++ 4 files changed, 82 insertions(+), 103 deletions(-) diff --git a/src/transformers/tokenization_roberta_fast.py b/src/transformers/tokenization_roberta_fast.py index c94891542e01..02b779bf8281 100644 --- a/src/transformers/tokenization_roberta_fast.py +++ b/src/transformers/tokenization_roberta_fast.py @@ -189,10 +189,10 @@ def mask_token(self) -> str: @mask_token.setter def mask_token(self, value): - """ Overriding the default behavior of the mask token to have it eat the space before it. + """Overriding the default behavior of the mask token to have it eat the space before it. - This is needed to preserve backward compatibility with all the previously used models - based on Roberta. + This is needed to preserve backward compatibility with all the previously used models + based on Roberta. """ # Mask token behave like a normal word, i.e. include the space before it # So we set lstrip to True diff --git a/tests/test_pipelines_common.py b/tests/test_pipelines_common.py index 37a348449554..5a6713aaedf8 100644 --- a/tests/test_pipelines_common.py +++ b/tests/test_pipelines_common.py @@ -1,22 +1,24 @@ import unittest -from unittest import mock from typing import List, Optional +from unittest import mock from transformers import is_tf_available, is_torch_available, pipeline -from transformers.tokenization_utils_base import to_py_obj from transformers.pipelines import DefaultArgumentHandler, Pipeline from transformers.testing_utils import _run_slow_tests, is_pipeline_test, require_tf, require_torch, slow +from transformers.tokenization_utils_base import to_py_obj VALID_INPUTS = ["A simple string", ["list of strings"]] -@is_pipeline_test +# @is_pipeline_test class CustomInputPipelineCommonMixin: pipeline_task = None - pipeline_loading_kwargs = {} - small_models = None # Models tested without the @slow decorator - large_models = None # Models tested with the @slow decorator + pipeline_loading_kwargs = {} # Additional kwargs to load the pipeline with + pipeline_running_kwargs = {} # Additional kwargs to run the pipeline with + small_models = [] # Models tested without the @slow decorator + large_models = [] # Models tested with the @slow decorator + valid_inputs = VALID_INPUTS # Some inputs which are valid to compare fast and slow tokenizers def setUp(self) -> None: if not is_tf_available() and not is_torch_available(): @@ -48,78 +50,41 @@ def setUp(self) -> None: @require_torch @slow def test_pt_defaults(self): - pipeline(self.pipeline_task, framework="pt") + pipeline(self.pipeline_task, framework="pt", **self.pipeline_loading_kwargs) @require_tf @slow def test_tf_defaults(self): - pipeline(self.pipeline_task, framework="tf") + pipeline(self.pipeline_task, framework="tf", **self.pipeline_loading_kwargs) @require_torch def test_torch_small(self): for model_name in self.small_models: - nlp = pipeline(task=self.pipeline_task, model=model_name, tokenizer=model_name, framework="pt") + nlp = pipeline( + task=self.pipeline_task, + model=model_name, + tokenizer=model_name, + framework="pt", + **self.pipeline_loading_kwargs, + ) self._test_pipeline(nlp) @require_tf def test_tf_small(self): for model_name in self.small_models: - nlp = pipeline(task=self.pipeline_task, model=model_name, tokenizer=model_name, framework="tf") + nlp = pipeline( + task=self.pipeline_task, + model=model_name, + tokenizer=model_name, + framework="tf", + **self.pipeline_loading_kwargs, + ) self._test_pipeline(nlp) @require_torch @slow def test_torch_large(self): for model_name in 
self.large_models: - nlp = pipeline(task=self.pipeline_task, model=model_name, tokenizer=model_name, framework="pt") - self._test_pipeline(nlp) - - @require_tf - @slow - def test_tf_large(self): - for model_name in self.large_models: - nlp = pipeline(task=self.pipeline_task, model=model_name, tokenizer=model_name, framework="tf") - self._test_pipeline(nlp) - - def _test_pipeline(self, nlp: Pipeline): - raise NotImplementedError - - -# @is_pipeline_test -class MonoInputPipelineCommonMixin: - pipeline_task = None - pipeline_loading_kwargs = {} # Additional kwargs to load the pipeline with - pipeline_running_kwargs = {} # Additional kwargs to run the pipeline with - small_models = [] # Models tested without the @slow decorator - large_models = [] # Models tested with the @slow decorator - mandatory_keys = {} # Keys which should be in the output - valid_inputs = VALID_INPUTS # inputs which are valid - invalid_inputs = [None] # inputs which are not allowed - expected_multi_result: Optional[List] = None - expected_check_keys: Optional[List[str]] = None - - def setUp(self) -> None: - if not is_tf_available() and not is_torch_available(): - return # Currently no JAX pipelines - - for model_name in self.small_models: - pipeline(self.pipeline_task, model=model_name, tokenizer=model_name, **self.pipeline_loading_kwargs) - for model_name in self.large_models: - pipeline(self.pipeline_task, model=model_name, tokenizer=model_name, **self.pipeline_loading_kwargs) - - @require_torch - @slow - def test_pt_defaults_loads(self): - pipeline(self.pipeline_task, framework="pt", **self.pipeline_loading_kwargs) - - @require_tf - @slow - def test_tf_defaults_loads(self): - pipeline(self.pipeline_task, framework="tf", **self.pipeline_loading_kwargs) - - @require_torch - def test_torch_small(self): - for model_name in self.small_models: nlp = pipeline( task=self.pipeline_task, model=model_name, @@ -130,8 +95,9 @@ def test_torch_small(self): self._test_pipeline(nlp) @require_tf - def test_tf_small(self): - for model_name in self.small_models: + @slow + def test_tf_large(self): + for model_name in self.large_models: nlp = pipeline( task=self.pipeline_task, model=model_name, @@ -141,6 +107,9 @@ def test_tf_small(self): ) self._test_pipeline(nlp) + def _test_pipeline(self, nlp: Pipeline): + raise NotImplementedError + @require_torch def test_compare_slow_fast_torch(self): for model_name in self.small_models: @@ -160,7 +129,7 @@ def test_compare_slow_fast_torch(self): use_fast=True, **self.pipeline_loading_kwargs, ) - self._compare_slow_fast_pipelines(nlp_slow, nlp_fast) + self._compare_slow_fast_pipelines(nlp_slow, nlp_fast, method="forward") @require_tf def test_compare_slow_fast_tf(self): @@ -181,54 +150,51 @@ def test_compare_slow_fast_tf(self): use_fast=True, **self.pipeline_loading_kwargs, ) - self._compare_slow_fast_pipelines(nlp_slow, nlp_fast) - - def _compare_slow_fast_pipelines(self, nlp_slow: Pipeline, nlp_fast: Pipeline): - with mock.patch.object(nlp_slow.model, 'forward', wraps=nlp_slow.model.forward) as mock_slow,\ - mock.patch.object(nlp_fast.model, 'forward', wraps=nlp_fast.model.forward) as mock_fast: + self._compare_slow_fast_pipelines(nlp_slow, nlp_fast, method="call") + + def _compare_slow_fast_pipelines(self, nlp_slow: Pipeline, nlp_fast: Pipeline, method: str): + """We check that the inputs to the models forward passes are identical for + slow and fast tokenizers. 
+ """ + with mock.patch.object( + nlp_slow.model, method, wraps=getattr(nlp_slow.model, method) + ) as mock_slow, mock.patch.object(nlp_fast.model, method, wraps=getattr(nlp_fast.model, method)) as mock_fast: for inputs in self.valid_inputs: - outputs_slow = nlp_slow(inputs, **self.pipeline_running_kwargs) - outputs_fast = nlp_fast(inputs, **self.pipeline_running_kwargs) + if isinstance(inputs, dict): + inputs.update(self.pipeline_running_kwargs) + _ = nlp_slow(**inputs) + _ = nlp_fast(**inputs) + else: + _ = nlp_slow(inputs, **self.pipeline_running_kwargs) + _ = nlp_fast(inputs, **self.pipeline_running_kwargs) mock_slow.assert_called() mock_fast.assert_called() - slow_call_args, slow_call_kwargs = mock_slow.call_args - fast_call_args, fast_call_kwargs = mock_fast.call_args + self.assertEqual(len(mock_slow.call_args_list), len(mock_fast.call_args_list)) + for mock_slow_call_args, mock_fast_call_args in zip( + mock_slow.call_args_list, mock_slow.call_args_list + ): + slow_call_args, slow_call_kwargs = mock_slow_call_args + fast_call_args, fast_call_kwargs = mock_fast_call_args - slow_call_args, slow_call_kwargs = to_py_obj(slow_call_args), to_py_obj(slow_call_kwargs) - fast_call_args, fast_call_kwargs = to_py_obj(fast_call_args), to_py_obj(fast_call_kwargs) + slow_call_args, slow_call_kwargs = to_py_obj(slow_call_args), to_py_obj(slow_call_kwargs) + fast_call_args, fast_call_kwargs = to_py_obj(fast_call_args), to_py_obj(fast_call_kwargs) - self.assertEqual(slow_call_args, fast_call_args) - self.assertDictEqual(slow_call_kwargs, fast_call_kwargs) + self.assertEqual(slow_call_args, fast_call_args) + self.assertDictEqual(slow_call_kwargs, fast_call_kwargs) - self.assertEqual(outputs_slow, outputs_fast) - @require_torch - @slow - def test_torch_large(self): - for model_name in self.large_models: - nlp = pipeline( - task=self.pipeline_task, - model=model_name, - tokenizer=model_name, - framework="pt", - **self.pipeline_loading_kwargs, - ) - self._test_pipeline(nlp) +@is_pipeline_test +class MonoInputPipelineCommonMixin(CustomInputPipelineCommonMixin): + """A version of the CustomInputPipelineCommonMixin + with a predefined `_test_pipeline` method. 
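The `_compare_slow_fast_pipelines` helper above leans on `unittest.mock.patch.object(..., wraps=...)`, which leaves the wrapped method fully functional while recording every call, so the inputs reaching the slow-tokenizer and fast-tokenizer models can be compared afterwards. A self-contained sketch of that spying pattern, with a stand-in class instead of a real model:

```python
from unittest import mock


class Adder:
    def forward(self, x, scale=1):
        return x * scale


model = Adder()

# `wraps=` keeps the original behaviour but records every call on the mock.
with mock.patch.object(model, "forward", wraps=model.forward) as spy:
    model.forward(3, scale=2)
    model.forward(5)

spy.assert_called()
print(len(spy.call_args_list))        # 2
args, kwargs = spy.call_args_list[0]  # each entry unpacks into (args, kwargs)
print(args, kwargs)                   # (3,) {'scale': 2}
```

Because `wraps` delegates to the real bound method, the pipelines still produce their normal outputs while the mocks accumulate `call_args_list` for the equality assertions.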
+ """ - @require_tf - @slow - def test_tf_large(self): - for model_name in self.large_models: - nlp = pipeline( - task=self.pipeline_task, - model=model_name, - tokenizer=model_name, - framework="tf", - **self.pipeline_loading_kwargs, - ) - self._test_pipeline(nlp) + mandatory_keys = {} # Keys which should be in the output + invalid_inputs = [None] # inputs which are not allowed + expected_multi_result: Optional[List] = None + expected_check_keys: Optional[List[str]] = None def _test_pipeline(self, nlp: Pipeline): self.assertIsNotNone(nlp) diff --git a/tests/test_pipelines_dialog.py b/tests/test_pipelines_dialog.py index 751d4b2b3e5f..9413441378d2 100644 --- a/tests/test_pipelines_dialog.py +++ b/tests/test_pipelines_dialog.py @@ -9,6 +9,7 @@ class DialoguePipelineTests(CustomInputPipelineCommonMixin, unittest.TestCase): pipeline_task = "conversational" small_models = [] # Default model - Models tested without the @slow decorator large_models = ["microsoft/DialoGPT-medium"] # Models tested with the @slow decorator + valid_inputs = [Conversation("Hi there!"), [Conversation("Hi there!"), Conversation("How are you?")]] def _test_pipeline(self, nlp: Pipeline): valid_inputs = [Conversation("Hi there!"), [Conversation("Hi there!"), Conversation("How are you?")]] diff --git a/tests/test_pipelines_zero_shot.py b/tests/test_pipelines_zero_shot.py index 42adfc27ced0..25db16cb96db 100644 --- a/tests/test_pipelines_zero_shot.py +++ b/tests/test_pipelines_zero_shot.py @@ -11,6 +11,18 @@ class ZeroShotClassificationPipelineTests(CustomInputPipelineCommonMixin, unitte "sshleifer/tiny-distilbert-base-uncased-finetuned-sst-2-english" ] # Models tested without the @slow decorator large_models = ["roberta-large-mnli"] # Models tested with the @slow decorator + valid_inputs = [ + {"sequences": "Who are you voting for in 2020?", "candidate_labels": "politics"}, + {"sequences": "Who are you voting for in 2020?", "candidate_labels": ["politics"]}, + {"sequences": "Who are you voting for in 2020?", "candidate_labels": "politics, public health"}, + {"sequences": "Who are you voting for in 2020?", "candidate_labels": ["politics", "public health"]}, + {"sequences": ["Who are you voting for in 2020?"], "candidate_labels": "politics"}, + { + "sequences": "Who are you voting for in 2020?", + "candidate_labels": "politics", + "hypothesis_template": "This text is about {}", + }, + ] def _test_scores_sum_to_one(self, result): sum = 0.0 From 15350e80d14c9fa91939ca6eac9ef2976eb1607d Mon Sep 17 00:00:00 2001 From: Thomas Wolf Date: Tue, 27 Oct 2020 16:18:05 +0100 Subject: [PATCH 03/24] slow-to-fast equivalence - working on question-answering pipeline --- src/transformers/data/processors/squad.py | 45 ++++-- src/transformers/pipelines.py | 135 ++++++++++++++---- src/transformers/tokenization_roberta_fast.py | 10 +- tests/test_pipelines_conversational.py | 2 +- tests/test_pipelines_dialog.py | 30 ---- tests/test_pipelines_question_answering.py | 12 ++ tests/test_tokenization_common.py | 18 +++ 7 files changed, 175 insertions(+), 77 deletions(-) delete mode 100644 tests/test_pipelines_dialog.py diff --git a/src/transformers/data/processors/squad.py b/src/transformers/data/processors/squad.py index 41daa06e99e2..4f928e690969 100644 --- a/src/transformers/data/processors/squad.py +++ b/src/transformers/data/processors/squad.py @@ -8,7 +8,7 @@ from ...file_utils import is_tf_available, is_torch_available from ...tokenization_bert import whitespace_tokenize -from ...tokenization_utils_base import PreTrainedTokenizerBase, 
TruncationStrategy +from ...tokenization_utils_base import BatchEncoding, PreTrainedTokenizerBase, TruncationStrategy from ...utils import logging from .utils import DataProcessor @@ -350,24 +350,39 @@ def squad_convert_examples_to_features( # Defining helper methods features = [] - threads = min(threads, cpu_count()) - with Pool(threads, initializer=squad_convert_example_to_features_init, initargs=(tokenizer,)) as p: - annotate_ = partial( - squad_convert_example_to_features, + ################# + squad_convert_example_to_features_init(tokenizer) + for example in examples: + feature = squad_convert_example_to_features( + example, max_seq_length=max_seq_length, doc_stride=doc_stride, max_query_length=max_query_length, padding_strategy=padding_strategy, is_training=is_training, ) - features = list( - tqdm( - p.imap(annotate_, examples, chunksize=32), - total=len(examples), - desc="convert squad examples to features", - disable=not tqdm_enabled, - ) - ) + features.append(feature) + + ################# + # threads = min(threads, cpu_count()) + # with Pool(threads, initializer=squad_convert_example_to_features_init, initargs=(tokenizer,)) as p: + # annotate_ = partial( + # squad_convert_example_to_features, + # max_seq_length=max_seq_length, + # doc_stride=doc_stride, + # max_query_length=max_query_length, + # padding_strategy=padding_strategy, + # is_training=is_training, + # ) + # features = list( + # tqdm( + # p.imap(annotate_, examples, chunksize=32), + # total=len(examples), + # desc="convert squad examples to features", + # disable=not tqdm_enabled, + # ) + # ) + ################# new_features = [] unique_id = 1000000000 @@ -765,6 +780,7 @@ class SquadFeatures: token_to_orig_map: mapping between the tokens and the original text, needed in order to identify the answer. start_position: start of the answer token index end_position: end of the answer token index + encoding: optionally store the BatchEncoding with the fast-tokenizer alignement methods. """ def __init__( @@ -784,6 +800,7 @@ def __init__( end_position, is_impossible, qas_id: str = None, + encoding: BatchEncoding = None, ): self.input_ids = input_ids self.attention_mask = attention_mask @@ -803,6 +820,8 @@ def __init__( self.is_impossible = is_impossible self.qas_id = qas_id + self.encoding = encoding + class SquadResult: """ diff --git a/src/transformers/pipelines.py b/src/transformers/pipelines.py index 4a7c42fd863c..a3a9a7a01a35 100755 --- a/src/transformers/pipelines.py +++ b/src/transformers/pipelines.py @@ -32,7 +32,7 @@ from .configuration_auto import AutoConfig from .configuration_utils import PretrainedConfig -from .data import SquadExample, squad_convert_examples_to_features +from .data import SquadExample, SquadFeatures, squad_convert_examples_to_features from .file_utils import add_end_docstrings, is_tf_available, is_torch_available from .modelcard import ModelCard from .tokenization_auto import AutoTokenizer @@ -1721,6 +1721,7 @@ def __call__(self, *args, **kwargs): - **answer** (:obj:`str`) -- The answer to the question. 
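The fast-tokenizer branch added just below builds the question-answering `p_mask` directly from the encoded inputs instead of going through `squad_convert_examples_to_features`: question tokens and special tokens are marked as impossible answer positions, and the CLS token is re-enabled so models can still point to it for unanswerable questions. A NumPy-only sketch of that masking logic, with made-up token ids and a hypothetical `cls_token_id`, assuming the question comes first (padding side "right"):

```python
import numpy as np

# One span of a question-first encoding: [CLS] q q [SEP] c c c [SEP] (hypothetical ids)
cls_token_id = 101
input_ids = np.array([[101, 2054, 2003, 102, 3000, 2003, 2307, 102]])
token_type_ids = np.array([[0, 0, 0, 0, 1, 1, 1, 1]])
special_tokens_mask = np.array([[1, 0, 0, 1, 0, 0, 0, 1]])

question_first = True  # i.e. tokenizer.padding_side == "right"

# 1 = token cannot be part of the answer, 0 = token can be part of the answer
p_mask = token_type_ids == (0 if question_first else 1)  # mask the question tokens
p_mask = p_mask | special_tokens_mask.astype(bool)        # also mask the special tokens

# Keep CLS un-masked so models can use it to signal "no answer"
cls_index = np.nonzero(input_ids == cls_token_id)
p_mask[cls_index] = False

print(p_mask.astype(int))
# [[0 1 1 1 0 0 0 1]]
```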
""" # Set defaults values + kwargs.setdefault("padding", "longest") kwargs.setdefault("topk", 1) kwargs.setdefault("doc_stride", 128) kwargs.setdefault("max_answer_len", 15) @@ -1736,19 +1737,83 @@ def __call__(self, *args, **kwargs): # Convert inputs to features examples = self._args_parser(*args, **kwargs) - features_list = [ - squad_convert_examples_to_features( - examples=[example], - tokenizer=self.tokenizer, - max_seq_length=kwargs["max_seq_len"], - doc_stride=kwargs["doc_stride"], - max_query_length=kwargs["max_question_len"], - padding_strategy=PaddingStrategy.MAX_LENGTH.value, - is_training=False, - tqdm_enabled=False, - ) - for example in examples - ] + if not self.tokenizer.is_fast: + features_list = [ + squad_convert_examples_to_features( + examples=[example], + tokenizer=self.tokenizer, + max_seq_length=kwargs["max_seq_len"], + doc_stride=kwargs["doc_stride"], + max_query_length=kwargs["max_question_len"], + padding_strategy=PaddingStrategy.MAX_LENGTH.value, + is_training=False, + tqdm_enabled=False, + ) + for example in examples + ] + else: + features_list = [] + for example in examples: + # Define the side we want to truncate / pad and the text/pair sorting + question_first = bool(self.tokenizer.padding_side == "right") + if question_first: + texts = example.question_text + pairs = example.context_text + truncation = "only_second" + else: + texts = example.context_text + pairs = example.question_text + truncation = "only_first" + + encoded_inputs = self.tokenizer( + texts, + pairs, + padding=kwargs["padding"], + truncation=truncation, + max_length=kwargs["max_seq_len"], + stride=kwargs["doc_stride"], + return_tensors="np", + return_token_type_ids=True, + return_overflowing_tokens=True, + return_offsets_mapping=True, + return_special_tokens_mask=True, + ) + + # p_mask: mask with 1 for token than cannot be in the answer (0 for token which can be in an answer) + p_mask = encoded_inputs["token_type_ids"] == (0 if question_first else 1) # Mask the question + p_mask = p_mask | encoded_inputs["special_tokens_mask"] # And mask the special tokens + cls_index = 0 + if self.tokenizer.cls_token_id: + # kKep the cls_token unmasked (some models use it to indicate unanswerable questions) + cls_index = np.where(encoded_inputs["input_ids"] == self.tokenizer.cls_token_id) + p_mask[cls_index] = 0 + + features = [] + num_spans = len(encoded_inputs["input_ids"]) + for span_idx in range(num_spans): + features.append( + SquadFeatures( + encoded_inputs["input_ids"][span_idx], + encoded_inputs["attention_mask"][span_idx], + encoded_inputs["token_type_ids"][span_idx], + cls_index[span_idx], + p_mask[span_idx].tolist(), + encoding=encoded_inputs[span_idx], + # We don't use the rest of the values + token_to_orig_map={}, + example_index=0, + unique_id=0, + paragraph_len=0, + token_is_max_context=0, + tokens=[], + start_position=0, + end_position=0, + is_impossible=False, + qas_id=None, + ) + ) + features_list.append(features) + all_answers = [] for features, example in zip(features_list, examples): model_input_names = self.tokenizer.model_input_names + ["input_ids"] @@ -1791,20 +1856,34 @@ def __call__(self, *args, **kwargs): start_[0] = end_[0] = 0.0 starts, ends, scores = self.decode(start_, end_, kwargs["topk"], kwargs["max_answer_len"]) - char_to_word = np.array(example.char_to_word_offset) - - # Convert the answer (tokens) back to the original text - answers += [ - { - "score": score.item(), - "start": np.where(char_to_word == feature.token_to_orig_map[s])[0][0].item(), - "end": 
np.where(char_to_word == feature.token_to_orig_map[e])[0][-1].item(), - "answer": " ".join( - example.doc_tokens[feature.token_to_orig_map[s] : feature.token_to_orig_map[e] + 1] - ), - } - for s, e, score in zip(starts, ends, scores) - ] + if not self.tokenizer.is_fast: + char_to_word = np.array(example.char_to_word_offset) + + # Convert the answer (tokens) back to the original text + answers += [ + { + "score": score.item(), + "start": np.where(char_to_word == feature.token_to_orig_map[s])[0][0].item(), + "end": np.where(char_to_word == feature.token_to_orig_map[e])[0][-1].item(), + "answer": " ".join( + example.doc_tokens[feature.token_to_orig_map[s] : feature.token_to_orig_map[e] + 1] + ), + } + for s, e, score in zip(starts, ends, scores) + ] + else: + # Convert the answer (tokens) back to the original text + answers += [ + { + "score": score.item(), + "start": np.where(char_to_word == feature.token_to_orig_map[s])[0][0].item(), + "end": np.where(char_to_word == feature.token_to_orig_map[e])[0][-1].item(), + "answer": " ".join( + example.doc_tokens[feature.token_to_orig_map[s] : feature.token_to_orig_map[e] + 1] + ), + } + for s, e, score in zip(starts, ends, scores) + ] if kwargs["handle_impossible_answer"]: answers.append({"score": min_null_score, "start": 0, "end": 0, "answer": ""}) diff --git a/src/transformers/tokenization_roberta_fast.py b/src/transformers/tokenization_roberta_fast.py index 02b779bf8281..696c43bf53ba 100644 --- a/src/transformers/tokenization_roberta_fast.py +++ b/src/transformers/tokenization_roberta_fast.py @@ -179,8 +179,8 @@ def mask_token(self) -> str: :obj:`str`: Mask token, to use when training a model with masked-language modeling. Log an error if used while not having been set. - Roberta tokenizer has a special mask token to be usble in the fill-mask pipeline. - The mask token will greedily comprise the space before the ``. + Roberta tokenizer has a special mask token to be usble in the fill-mask pipeline. The mask token will greedily + comprise the space before the ``. """ if self._mask_token is None and self.verbose: logger.error("Using mask_token, but it is not set yet.") @@ -189,10 +189,10 @@ def mask_token(self) -> str: @mask_token.setter def mask_token(self, value): - """Overriding the default behavior of the mask token to have it eat the space before it. + """ + Overriding the default behavior of the mask token to have it eat the space before it. - This is needed to preserve backward compatibility with all the previously used models - based on Roberta. + This is needed to preserve backward compatibility with all the previously used models based on Roberta. """ # Mask token behave like a normal word, i.e. 
include the space before it # So we set lstrip to True diff --git a/tests/test_pipelines_conversational.py b/tests/test_pipelines_conversational.py index 3492283479b7..e70bb8a843ed 100644 --- a/tests/test_pipelines_conversational.py +++ b/tests/test_pipelines_conversational.py @@ -9,7 +9,7 @@ DEFAULT_DEVICE_NUM = -1 if torch_device == "cpu" else 0 -class TextGenerationPipelineTests(MonoInputPipelineCommonMixin, unittest.TestCase): +class ConversationalPipelineTests(MonoInputPipelineCommonMixin, unittest.TestCase): pipeline_task = "conversational" small_models = [] # Models tested without the @slow decorator large_models = ["microsoft/DialoGPT-medium"] # Models tested with the @slow decorator diff --git a/tests/test_pipelines_dialog.py b/tests/test_pipelines_dialog.py deleted file mode 100644 index 9413441378d2..000000000000 --- a/tests/test_pipelines_dialog.py +++ /dev/null @@ -1,30 +0,0 @@ -import unittest - -from transformers.pipelines import Conversation, Pipeline - -from .test_pipelines_common import CustomInputPipelineCommonMixin - - -class DialoguePipelineTests(CustomInputPipelineCommonMixin, unittest.TestCase): - pipeline_task = "conversational" - small_models = [] # Default model - Models tested without the @slow decorator - large_models = ["microsoft/DialoGPT-medium"] # Models tested with the @slow decorator - valid_inputs = [Conversation("Hi there!"), [Conversation("Hi there!"), Conversation("How are you?")]] - - def _test_pipeline(self, nlp: Pipeline): - valid_inputs = [Conversation("Hi there!"), [Conversation("Hi there!"), Conversation("How are you?")]] - invalid_inputs = ["Hi there!", Conversation()] - self.assertIsNotNone(nlp) - - mono_result = nlp(valid_inputs[0]) - self.assertIsInstance(mono_result, Conversation) - - multi_result = nlp(valid_inputs[1]) - self.assertIsInstance(multi_result, list) - self.assertIsInstance(multi_result[0], Conversation) - # Inactive conversations passed to the pipeline raise a ValueError - self.assertRaises(ValueError, nlp, valid_inputs[1]) - - for bad_input in invalid_inputs: - self.assertRaises(Exception, nlp, bad_input) - self.assertRaises(Exception, nlp, invalid_inputs) diff --git a/tests/test_pipelines_question_answering.py b/tests/test_pipelines_question_answering.py index 3f3f6dc83a72..379c58938f9f 100644 --- a/tests/test_pipelines_question_answering.py +++ b/tests/test_pipelines_question_answering.py @@ -7,10 +7,22 @@ class QAPipelineTests(CustomInputPipelineCommonMixin, unittest.TestCase): pipeline_task = "question-answering" + pipeline_running_kwargs = { + "padding": "max_length", + "max_seq_len": 25, + "doc_stride": 5, + } # Default is 'longest' but we use 'max_length' to test equivalence between slow/fast tokenizers small_models = [ "sshleifer/tiny-distilbert-base-cased-distilled-squad" ] # Models tested without the @slow decorator large_models = [] # Models tested with the @slow decorator + valid_inputs = [ + {"question": "Where was HuggingFace founded ?", "context": "HuggingFace was founded in Paris."}, + { + "question": "In what field is HuggingFace working ?", + "context": "HuggingFace is a startup based in New-York founded in Paris which is trying to solve NLP.", + }, + ] def _test_pipeline(self, nlp: Pipeline): output_keys = {"score", "answer", "start", "end"} diff --git a/tests/test_tokenization_common.py b/tests/test_tokenization_common.py index 0090c0f47d30..a3df25152121 100644 --- a/tests/test_tokenization_common.py +++ b/tests/test_tokenization_common.py @@ -576,6 +576,24 @@ def test_mask_output(self): sequences, mask = 
information["input_ids"], information["token_type_ids"] self.assertEqual(len(sequences), len(mask)) + def test_token_type_ids(self): + tokenizers = self.get_tokenizers() + for tokenizer in tokenizers: + with self.subTest(f"{tokenizer.__class__.__name__}"): + seq_0 = "Test this method." + seq_1 = "With these inputs." + + # We want to have sequence 0 and sequence 1 are tagged + # respectively with 0 and 1 token_ids + # (regardeless of weither the model use token type ids) + # We use this assumption in the QA pipeline among other place + output = tokenizer(seq_0, return_token_type_ids=True) + self.assertIn(0, output["token_type_ids"]) + + output = tokenizer(seq_0, seq_1, return_token_type_ids=True) + self.assertIn(0, output["token_type_ids"]) + self.assertIn(1, output["token_type_ids"]) + def test_number_of_added_tokens(self): tokenizers = self.get_tokenizers(do_lower_case=False) for tokenizer in tokenizers: From 449e346d79a68893cbc70b6f5abc75db0f8cc59d Mon Sep 17 00:00:00 2001 From: Thomas Wolf Date: Mon, 2 Nov 2020 17:17:09 +0100 Subject: [PATCH 04/24] optional FAISS tests --- tests/test_retrieval_rag.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/tests/test_retrieval_rag.py b/tests/test_retrieval_rag.py index 93774be18382..fde5ac529c2f 100644 --- a/tests/test_retrieval_rag.py +++ b/tests/test_retrieval_rag.py @@ -9,7 +9,7 @@ import numpy as np from datasets import Dataset -import faiss +from transformers import is_faiss_available from transformers.configuration_bart import BartConfig from transformers.configuration_dpr import DPRConfig from transformers.configuration_rag import RagConfig @@ -26,6 +26,9 @@ from transformers.tokenization_dpr import DPRQuestionEncoderTokenizer from transformers.tokenization_roberta import VOCAB_FILES_NAMES as BART_VOCAB_FILES_NAMES +if is_faiss_available(): + import faiss + @require_faiss @require_datasets From eb375bcec8d9d51179d42ac5299d1f3bdb01181e Mon Sep 17 00:00:00 2001 From: Thomas Wolf Date: Mon, 9 Nov 2020 17:43:21 +0100 Subject: [PATCH 05/24] Pipeline Q&A --- src/transformers/data/processors/squad.py | 50 ++++----- src/transformers/pipelines.py | 67 +++++++----- src/transformers/tokenization_utils_base.py | 41 ++++++-- src/transformers/tokenization_utils_fast.py | 39 ++++--- tests/test_retrieval_rag.py | 1 + tests/test_tokenization_common.py | 108 ++++++++++++++++++++ 6 files changed, 229 insertions(+), 77 deletions(-) diff --git a/src/transformers/data/processors/squad.py b/src/transformers/data/processors/squad.py index 4f928e690969..06cd170a4dc6 100644 --- a/src/transformers/data/processors/squad.py +++ b/src/transformers/data/processors/squad.py @@ -351,37 +351,37 @@ def squad_convert_examples_to_features( features = [] ################# - squad_convert_example_to_features_init(tokenizer) - for example in examples: - feature = squad_convert_example_to_features( - example, - max_seq_length=max_seq_length, - doc_stride=doc_stride, - max_query_length=max_query_length, - padding_strategy=padding_strategy, - is_training=is_training, - ) - features.append(feature) - - ################# - # threads = min(threads, cpu_count()) - # with Pool(threads, initializer=squad_convert_example_to_features_init, initargs=(tokenizer,)) as p: - # annotate_ = partial( - # squad_convert_example_to_features, + # squad_convert_example_to_features_init(tokenizer) + # for example in examples: + # feature = squad_convert_example_to_features( + # example, # max_seq_length=max_seq_length, # doc_stride=doc_stride, # max_query_length=max_query_length, 
# padding_strategy=padding_strategy, # is_training=is_training, # ) - # features = list( - # tqdm( - # p.imap(annotate_, examples, chunksize=32), - # total=len(examples), - # desc="convert squad examples to features", - # disable=not tqdm_enabled, - # ) - # ) + # features.append(feature) + + ################# + threads = min(threads, cpu_count()) + with Pool(threads, initializer=squad_convert_example_to_features_init, initargs=(tokenizer,)) as p: + annotate_ = partial( + squad_convert_example_to_features, + max_seq_length=max_seq_length, + doc_stride=doc_stride, + max_query_length=max_query_length, + padding_strategy=padding_strategy, + is_training=is_training, + ) + features = list( + tqdm( + p.imap(annotate_, examples, chunksize=32), + total=len(examples), + desc="convert squad examples to features", + disable=not tqdm_enabled, + ) + ) ################# new_features = [] diff --git a/src/transformers/pipelines.py b/src/transformers/pipelines.py index a3a9a7a01a35..4b796db0303a 100755 --- a/src/transformers/pipelines.py +++ b/src/transformers/pipelines.py @@ -1756,20 +1756,12 @@ def __call__(self, *args, **kwargs): for example in examples: # Define the side we want to truncate / pad and the text/pair sorting question_first = bool(self.tokenizer.padding_side == "right") - if question_first: - texts = example.question_text - pairs = example.context_text - truncation = "only_second" - else: - texts = example.context_text - pairs = example.question_text - truncation = "only_first" encoded_inputs = self.tokenizer( - texts, - pairs, + text=example.question_text if question_first else example.context_text, + text_pair=example.context_text if question_first else example.question_text, padding=kwargs["padding"], - truncation=truncation, + truncation="only_second" if question_first else "only_first", max_length=kwargs["max_seq_len"], stride=kwargs["doc_stride"], return_tensors="np", @@ -1779,27 +1771,28 @@ def __call__(self, *args, **kwargs): return_special_tokens_mask=True, ) + num_spans = len(encoded_inputs["input_ids"]) + # p_mask: mask with 1 for token than cannot be in the answer (0 for token which can be in an answer) p_mask = encoded_inputs["token_type_ids"] == (0 if question_first else 1) # Mask the question p_mask = p_mask | encoded_inputs["special_tokens_mask"] # And mask the special tokens - cls_index = 0 if self.tokenizer.cls_token_id: - # kKep the cls_token unmasked (some models use it to indicate unanswerable questions) - cls_index = np.where(encoded_inputs["input_ids"] == self.tokenizer.cls_token_id) + # keep the cls_token unmasked (some models use it to indicate unanswerable questions) + cls_index = np.nonzero(encoded_inputs["input_ids"] == self.tokenizer.cls_token_id) p_mask[cls_index] = 0 features = [] - num_spans = len(encoded_inputs["input_ids"]) for span_idx in range(num_spans): features.append( SquadFeatures( - encoded_inputs["input_ids"][span_idx], - encoded_inputs["attention_mask"][span_idx], - encoded_inputs["token_type_ids"][span_idx], - cls_index[span_idx], - p_mask[span_idx].tolist(), + input_ids=encoded_inputs["input_ids"][span_idx], + attention_mask=encoded_inputs["attention_mask"][span_idx], + token_type_ids=encoded_inputs["token_type_ids"][span_idx], + p_mask=p_mask[span_idx].tolist(), encoding=encoded_inputs[span_idx], - # We don't use the rest of the values + # We don't use the rest of the values - and actually + # for Fast tokenizer we could totally avoid using SquadFeatures and SquadExample + cls_index=None, token_to_orig_map={}, example_index=0, unique_id=0, @@ 
-1860,6 +1853,10 @@ def __call__(self, *args, **kwargs): char_to_word = np.array(example.char_to_word_offset) # Convert the answer (tokens) back to the original text + # Score: score from the model + # Start: Index of the first character of the answer in the context string + # End: Index of the character following the last character of the answer in the context string + # Answer: Plain text of the answer answers += [ { "score": score.item(), @@ -1873,14 +1870,32 @@ def __call__(self, *args, **kwargs): ] else: # Convert the answer (tokens) back to the original text + # Score: score from the model + # Start: Index of the first character of the answer in the context string + # End: Index of the character following the last character of the answer in the context string + # Answer: Plain text of the answer + question_first = bool(self.tokenizer.padding_side == "right") + enc = feature.encoding + + # Sometimes the max probability token is in the middle of a word so: + # - we start by finding the right word containing the token with `token_to_word` + # - then we convert this word in a character span with `word_to_chars` answers += [ { "score": score.item(), - "start": np.where(char_to_word == feature.token_to_orig_map[s])[0][0].item(), - "end": np.where(char_to_word == feature.token_to_orig_map[e])[0][-1].item(), - "answer": " ".join( - example.doc_tokens[feature.token_to_orig_map[s] : feature.token_to_orig_map[e] + 1] - ), + "start": enc.word_to_chars( + enc.token_to_word(s), sequence_index=1 if question_first else 0 + )[0], + "end": enc.word_to_chars(enc.token_to_word(e), sequence_index=1 if question_first else 0)[ + 1 + ], + "answer": example.context_text[ + enc.word_to_chars(enc.token_to_word(s), sequence_index=1 if question_first else 0)[ + 0 + ] : enc.word_to_chars(enc.token_to_word(e), sequence_index=1 if question_first else 0)[ + 1 + ] + ], } for s, e, score in zip(starts, ends, scores) ] diff --git a/src/transformers/tokenization_utils_base.py b/src/transformers/tokenization_utils_base.py index 7e7b10c5096f..010066eaf26b 100644 --- a/src/transformers/tokenization_utils_base.py +++ b/src/transformers/tokenization_utils_base.py @@ -365,9 +365,11 @@ def token_to_word(self, batch_or_token_index: int, token_index: Optional[int] = token_index = self._seq_len + token_index return self._encodings[batch_index].token_to_word(token_index) - def word_to_tokens(self, batch_or_word_index: int, word_index: Optional[int] = None) -> Optional[TokenSpan]: + def word_to_tokens( + self, batch_or_word_index: int, word_index: Optional[int] = None, sequence_index: int = 0 + ) -> Optional[TokenSpan]: """ - Get the encoded token span corresponding to a word in the sequence of the batch. + Get the encoded token span corresponding to a word in a sequence of the batch. Token spans are returned as a :class:`~transformers.tokenization_utils_base.TokenSpan` with: @@ -376,8 +378,9 @@ def word_to_tokens(self, batch_or_word_index: int, word_index: Optional[int] = N Can be called as: - - ``self.word_to_tokens(word_index)`` if batch size is 1 - - ``self.word_to_tokens(batch_index, word_index)`` if batch size is greater or equal to 1 + - ``self.word_to_tokens(word_index, sequence_index: int = 0)`` if batch size is 1 + - ``self.word_to_tokens(batch_index, word_index, sequence_index: int = 0)`` if batch size is greater or equal + to 1 This method is particularly suited when the input sequences are provided as pre-tokenized sequences (i.e. words are defined by the user). 
In this case it allows to easily associate encoded tokens with provided tokenized @@ -390,6 +393,9 @@ def word_to_tokens(self, batch_or_word_index: int, word_index: Optional[int] = N word_index (:obj:`int`, `optional`): If a batch index is provided in `batch_or_token_index`, this can be the index of the word in the sequence. + sequence_index (:obj:`int`, `optional`, defaults to 0): + If pair of sequences are encoded in the batch this can be used to specify which sequence in the pair (0 + or 1) the provided word index belongs to. Returns: Optional :class:`~transformers.tokenization_utils_base.TokenSpan` Span of tokens in the encoded sequence. @@ -407,7 +413,7 @@ def word_to_tokens(self, batch_or_word_index: int, word_index: Optional[int] = N batch_index = self._batch_size + batch_index if word_index < 0: word_index = self._seq_len + word_index - span = self._encodings[batch_index].word_to_tokens(word_index) + span = self._encodings[batch_index].word_to_tokens(word_index, sequence_index) return TokenSpan(*span) if span is not None else None def token_to_chars(self, batch_or_token_index: int, token_index: Optional[int] = None) -> CharSpan: @@ -446,7 +452,9 @@ def token_to_chars(self, batch_or_token_index: int, token_index: Optional[int] = token_index = batch_or_token_index return CharSpan(*(self._encodings[batch_index].token_to_chars(token_index))) - def char_to_token(self, batch_or_char_index: int, char_index: Optional[int] = None) -> int: + def char_to_token( + self, batch_or_char_index: int, char_index: Optional[int] = None, sequence_index: int = 0 + ) -> int: """ Get the index of the token in the encoded output comprising a character in the original string for a sequence of the batch. @@ -467,6 +475,9 @@ def char_to_token(self, batch_or_char_index: int, char_index: Optional[int] = No char_index (:obj:`int`, `optional`): If a batch index is provided in `batch_or_token_index`, this can be the index of the word in the sequence. + sequence_index (:obj:`int`, `optional`, defaults to 0): + If pair of sequences are encoded in the batch this can be used to specify which sequence in the pair (0 + or 1) the provided character index belongs to. Returns: @@ -480,9 +491,11 @@ def char_to_token(self, batch_or_char_index: int, char_index: Optional[int] = No else: batch_index = 0 char_index = batch_or_char_index - return self._encodings[batch_index].char_to_token(char_index) + return self._encodings[batch_index].char_to_token(char_index, sequence_index) - def word_to_chars(self, batch_or_word_index: int, word_index: Optional[int] = None) -> CharSpan: + def word_to_chars( + self, batch_or_word_index: int, word_index: Optional[int] = None, sequence_index: int = 0 + ) -> CharSpan: """ Get the character span in the original string corresponding to given word in a sequence of the batch. @@ -503,6 +516,9 @@ def word_to_chars(self, batch_or_word_index: int, word_index: Optional[int] = No word_index (:obj:`int`, `optional`): If a batch index is provided in `batch_or_token_index`, this can be the index of the word in the sequence. + sequence_index (:obj:`int`, `optional`, defaults to 0): + If pair of sequences are encoded in the batch this can be used to specify which sequence in the pair (0 + or 1) the provided word index belongs to. Returns: :obj:`CharSpan` or :obj:`List[CharSpan]`: Span(s) of the associated character or characters in the string. 
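The `sequence_index` argument threaded through these alignment methods is what lets the question-answering pipeline translate a predicted token span back into character offsets inside the context (sequence 1 when the question comes first). A sketch of the intended usage once this change is in place — the checkpoint name and the "predicted" token index are illustrative assumptions, and a fast tokenizer must be downloadable:

```python
from transformers import AutoTokenizer

# Illustrative checkpoint; any fast tokenizer should behave the same way.
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-cased-distilled-squad", use_fast=True)

question = "Where was HuggingFace founded ?"
context = "HuggingFace was founded in Paris."
enc = tokenizer(question, context)

# Pretend the model pointed at the token covering "Paris" in the context
s = e = enc.char_to_token(context.index("Paris"), sequence_index=1)

# Token -> word -> character span, all relative to the context (sequence_index=1)
start_char = enc.word_to_chars(enc.token_to_word(s), sequence_index=1)[0]
end_char = enc.word_to_chars(enc.token_to_word(e), sequence_index=1)[1]
print(context[start_char:end_char])  # "Paris"
```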
@@ -520,9 +536,9 @@ def word_to_chars(self, batch_or_word_index: int, word_index: Optional[int] = No else: batch_index = 0 word_index = batch_or_word_index - return CharSpan(*(self._encodings[batch_index].word_to_chars(word_index))) + return CharSpan(*(self._encodings[batch_index].word_to_chars(word_index, sequence_index))) - def char_to_word(self, batch_or_char_index: int, char_index: Optional[int] = None) -> int: + def char_to_word(self, batch_or_char_index: int, char_index: Optional[int] = None, sequence_index: int = 0) -> int: """ Get the word in the original string corresponding to a character in the original string of a sequence of the batch. @@ -543,6 +559,9 @@ def char_to_word(self, batch_or_char_index: int, char_index: Optional[int] = Non char_index (:obj:`int`, `optional`): If a batch index is provided in `batch_or_token_index`, this can be the index of the character in the orginal string. + sequence_index (:obj:`int`, `optional`, defaults to 0): + If pair of sequences are encoded in the batch this can be used to specify which sequence in the pair (0 + or 1) the provided character index belongs to. Returns: @@ -556,7 +575,7 @@ def char_to_word(self, batch_or_char_index: int, char_index: Optional[int] = Non else: batch_index = 0 char_index = batch_or_char_index - return self._encodings[batch_index].char_to_word(char_index) + return self._encodings[batch_index].char_to_word(char_index, sequence_index) def convert_to_tensors( self, tensor_type: Optional[Union[str, TensorType]] = None, prepend_batch_axis: bool = False diff --git a/src/transformers/tokenization_utils_fast.py b/src/transformers/tokenization_utils_fast.py index 8754c3334db8..7f7d479893b3 100644 --- a/src/transformers/tokenization_utils_fast.py +++ b/src/transformers/tokenization_utils_fast.py @@ -169,9 +169,10 @@ def _convert_encoding( return_offsets_mapping: bool = False, return_length: bool = False, verbose: bool = True, - ) -> Dict[str, Any]: + ) -> Tuple[Dict[str, Any], List[EncodingFast]]: """ - Convert the encoding representation (from low-level HuggingFace tokenizer output) to a python Dict. + Convert the encoding representation (from low-level HuggingFace tokenizer output) to a python Dict and a list + of encodings, take care of building a batch from overflowing tokens. Overflowing tokens are converted to additional examples (like batches) so the output values of the dict are lists (overflows) of lists (tokens). 
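The `_batch_encode_plus` update below keeps the fast `Encoding` objects alongside the Python dicts and flattens the per-sample overflow dimension into a single batch, recording in `overflow_to_sample_mapping` which original sample each overflowing window came from. A self-contained sketch of that flattening step, with made-up ids standing in for real tokenizer output:

```python
# Each entry mimics one input sample: (dict of lists over overflow windows, fast encodings)
tokens_and_encodings = [
    ({"input_ids": [[0, 11, 12, 2], [0, 12, 13, 2]]}, ["enc0a", "enc0b"]),  # sample 0 -> 2 windows
    ({"input_ids": [[0, 21, 22, 2]]}, ["enc1a"]),                           # sample 1 -> 1 window
]

sanitized_tokens = {}
for key in tokens_and_encodings[0][0]:
    sanitized_tokens[key] = [e for item, _ in tokens_and_encodings for e in item[key]]
sanitized_encodings = [e for _, item in tokens_and_encodings for e in item]

overflow_to_sample_mapping = []
for i, (toks, _) in enumerate(tokens_and_encodings):
    overflow_to_sample_mapping += [i] * len(toks["input_ids"])

print(sanitized_tokens["input_ids"])  # [[0, 11, 12, 2], [0, 12, 13, 2], [0, 21, 22, 2]]
print(sanitized_encodings)            # ['enc0a', 'enc0b', 'enc1a']
print(overflow_to_sample_mapping)     # [0, 0, 1]
```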
@@ -203,7 +204,7 @@ def _convert_encoding( if return_length: encoding_dict["length"].append(len(e.ids)) - return encoding_dict + return encoding_dict, encodings def convert_tokens_to_ids(self, tokens: Union[str, List[str]]) -> Union[int, List[int]]: """ @@ -390,9 +391,12 @@ def _batch_encode_plus( ) # Convert encoding to dict - # `Tokens` has type: List[Dict[str, List[List[int]]]] or List[Dict[str, 2D-Tensor]] + # `Tokens` has type: Tuple[ + # List[Dict[str, List[List[int]]]] or List[Dict[str, 2D-Tensor]], + # List[EncodingFast] + # ] # with nested dimensions corresponding to batch, overflows, sequence length - tokens = [ + tokens_and_encodings = [ self._convert_encoding( encoding=encoding, return_token_type_ids=return_token_type_ids, @@ -406,22 +410,27 @@ def _batch_encode_plus( for encoding in encodings ] - # Convert the output to have dict[list] from list[dict] - sanitized = {} - for key in tokens[0].keys(): - # To List[List[List[int]]] of shape (batch, overflows, sequence length) - stack = [e for item in tokens for e in item[key]] - sanitized[key] = stack + # Convert the output to have dict[list] from list[dict] and remove the additional overflows dimension + # From (variable) shape (batch, overflows, sequence length) to ~ (batch * overflows, sequence length) + # (we say ~ because the number of overflow varies with the example in the batch) + # + # To match each overflowing sample with the original sample in the batch + # we add an overflow_to_sample_mapping array (see below) + sanitized_tokens = {} + for key in tokens_and_encodings[0][0].keys(): + stack = [e for item, _ in tokens_and_encodings for e in item[key]] + sanitized_tokens[key] = stack + sanitized_encodings = [e for _, item in tokens_and_encodings for e in item] # If returning overflowing tokens, we need to return a mapping # from the batch idx to the original sample if return_overflowing_tokens: overflow_to_sample_mapping = [] - for i, enc in enumerate(tokens): - overflow_to_sample_mapping += [i] * len(enc["input_ids"]) - sanitized["overflow_to_sample_mapping"] = overflow_to_sample_mapping + for i, (toks, _) in enumerate(tokens_and_encodings): + overflow_to_sample_mapping += [i] * len(toks["input_ids"]) + sanitized_tokens["overflow_to_sample_mapping"] = overflow_to_sample_mapping - return BatchEncoding(sanitized, encodings, tensor_type=return_tensors) + return BatchEncoding(sanitized_tokens, sanitized_encodings, tensor_type=return_tensors) def _encode_plus( self, diff --git a/tests/test_retrieval_rag.py b/tests/test_retrieval_rag.py index fde5ac529c2f..a95324535b82 100644 --- a/tests/test_retrieval_rag.py +++ b/tests/test_retrieval_rag.py @@ -26,6 +26,7 @@ from transformers.tokenization_dpr import DPRQuestionEncoderTokenizer from transformers.tokenization_roberta import VOCAB_FILES_NAMES as BART_VOCAB_FILES_NAMES + if is_faiss_available(): import faiss diff --git a/tests/test_tokenization_common.py b/tests/test_tokenization_common.py index a3df25152121..2903cca33fd6 100644 --- a/tests/test_tokenization_common.py +++ b/tests/test_tokenization_common.py @@ -1896,6 +1896,114 @@ def test_alignement_methods(self): batch_encoding.word_to_chars(last_batch_index, last_word_index).end, last_char_index + 1 ) + # Pair of input sequences + + words = ["Wonderful", "no", "inspiration", "example", "with", "subtoken"] + text = " ".join(words) + pair_words = ["Amazing", "example", "full", "of", "inspiration"] + pair_text = " ".join(pair_words) + batch_size = 3 + index_word_in_first_seq = words.index("inspiration") + index_word_in_pair_seq = 
pair_words.index("inspiration") + index_char_in_first_seq = text.find("inspiration") + index_char_in_pair_seq = pair_text.find("inspiration") + + pair_encoding = tokenizer_r.encode_plus(text, pair_text, add_special_tokens=False) + + pair_batch_encoding = tokenizer_r.batch_encode_plus( + [(text, pair_text)] * batch_size, add_special_tokens=False + ) + num_tokens = len(encoding["input_ids"]) + + last_word_index = len(words) - 1 + last_token_index = num_tokens - 1 + last_batch_index = batch_size - 1 + last_char_index = len(text) - 1 + + # Assert word_to_tokens + self.assertNotEqual( + pair_encoding.word_to_tokens(index_word_in_first_seq, sequence_index=0).start, + pair_encoding.word_to_tokens(index_word_in_pair_seq, sequence_index=1).start, + ) + self.assertEqual( + pair_encoding["input_ids"][ + pair_encoding.word_to_tokens(index_word_in_first_seq, sequence_index=0).start + ], + pair_encoding["input_ids"][ + pair_encoding.word_to_tokens(index_word_in_pair_seq, sequence_index=1).start + ], + ) + self.assertNotEqual( + pair_batch_encoding.word_to_tokens(1, index_word_in_first_seq, sequence_index=0).start, + pair_batch_encoding.word_to_tokens(1, index_word_in_pair_seq, sequence_index=1).start, + ) + self.assertEqual( + pair_batch_encoding["input_ids"][1][ + pair_batch_encoding.word_to_tokens(1, index_word_in_first_seq, sequence_index=0).start + ], + pair_batch_encoding["input_ids"][1][ + pair_batch_encoding.word_to_tokens(1, index_word_in_pair_seq, sequence_index=1).start + ], + ) + + # Assert char_to_token + self.assertNotEqual( + pair_encoding.char_to_token(index_char_in_first_seq, sequence_index=0), + pair_encoding.char_to_token(index_char_in_pair_seq, sequence_index=1), + ) + self.assertEqual( + pair_encoding["input_ids"][pair_encoding.char_to_token(index_char_in_first_seq, sequence_index=0)], + pair_encoding["input_ids"][pair_encoding.char_to_token(index_char_in_pair_seq, sequence_index=1)], + ) + self.assertNotEqual( + pair_batch_encoding.char_to_token(1, index_char_in_first_seq, sequence_index=0), + pair_batch_encoding.char_to_token(1, index_char_in_pair_seq, sequence_index=1), + ) + self.assertEqual( + pair_batch_encoding["input_ids"][1][ + pair_batch_encoding.char_to_token(1, index_char_in_first_seq, sequence_index=0) + ], + pair_batch_encoding["input_ids"][1][ + pair_batch_encoding.char_to_token(1, index_char_in_pair_seq, sequence_index=1) + ], + ) + + # Assert char_to_word + self.assertNotEqual( + pair_encoding.char_to_word(index_char_in_first_seq, sequence_index=0), + pair_encoding.char_to_word(index_char_in_pair_seq, sequence_index=1), + ) + self.assertEqual( + words[pair_encoding.char_to_word(index_char_in_first_seq, sequence_index=0)], + pair_words[pair_encoding.char_to_word(index_char_in_pair_seq, sequence_index=1)], + ) + self.assertNotEqual( + pair_batch_encoding.char_to_word(1, index_char_in_first_seq, sequence_index=0), + pair_batch_encoding.char_to_word(1, index_char_in_pair_seq, sequence_index=1), + ) + self.assertEqual( + words[pair_batch_encoding.char_to_word(1, index_char_in_first_seq, sequence_index=0)], + pair_words[pair_batch_encoding.char_to_word(1, index_char_in_pair_seq, sequence_index=1)], + ) + + # Assert word_to_chars + self.assertNotEqual( + pair_encoding.word_to_chars(index_word_in_first_seq, sequence_index=0).start, + pair_encoding.word_to_chars(index_word_in_pair_seq, sequence_index=1).start, + ) + self.assertEqual( + text[pair_encoding.word_to_chars(index_word_in_first_seq, sequence_index=0).start], + 
pair_text[pair_encoding.word_to_chars(index_word_in_pair_seq, sequence_index=1).start], + ) + self.assertNotEqual( + pair_batch_encoding.word_to_chars(1, index_word_in_first_seq, sequence_index=0).start, + pair_batch_encoding.word_to_chars(1, index_word_in_pair_seq, sequence_index=1).start, + ) + self.assertEqual( + text[pair_batch_encoding.word_to_chars(1, index_word_in_first_seq, sequence_index=0).start], + pair_text[pair_batch_encoding.word_to_chars(1, index_word_in_pair_seq, sequence_index=1).start], + ) + def test_tokenization_python_rust_equals(self): for tokenizer, pretrained_name, kwargs in self.tokenizers_list: with self.subTest("{} ({})".format(tokenizer.__class__.__name__, pretrained_name)): From 336759326a05c2f1ab3d82737f67e217fc11a09b Mon Sep 17 00:00:00 2001 From: Thomas Wolf Date: Mon, 9 Nov 2020 17:57:00 +0100 Subject: [PATCH 06/24] Move pipeline tests to their own test job again --- tests/test_pipelines_common.py | 77 +--------------------------------- 1 file changed, 1 insertion(+), 76 deletions(-) diff --git a/tests/test_pipelines_common.py b/tests/test_pipelines_common.py index c73252811705..736ac9612081 100644 --- a/tests/test_pipelines_common.py +++ b/tests/test_pipelines_common.py @@ -2,8 +2,6 @@ from unittest import mock from transformers import is_tf_available, is_torch_available, pipeline - -# from transformers.pipelines import DefaultArgumentHandler, Pipeline from transformers.pipelines import Pipeline from transformers.testing_utils import _run_slow_tests, is_pipeline_test, require_tf, require_torch, slow from transformers.tokenization_utils_base import to_py_obj @@ -12,7 +10,7 @@ VALID_INPUTS = ["A simple string", ["list of strings"]] -# @is_pipeline_test +@is_pipeline_test class CustomInputPipelineCommonMixin: pipeline_task = None pipeline_loading_kwargs = {} # Additional kwargs to load the pipeline with @@ -230,76 +228,3 @@ def _test_pipeline(self, nlp: Pipeline): self.assertIn(key, result) self.assertRaises(Exception, nlp, self.invalid_inputs) - - -# @is_pipeline_test -# class DefaultArgumentHandlerTestCase(unittest.TestCase): -# def setUp(self) -> None: -# self.handler = DefaultArgumentHandler() -# -# def test_kwargs_x(self): -# mono_data = {"X": "This is a sample input"} -# mono_args = self.handler(**mono_data) -# -# self.assertTrue(isinstance(mono_args, list)) -# self.assertEqual(len(mono_args), 1) -# -# multi_data = {"x": ["This is a sample input", "This is a second sample input"]} -# multi_args = self.handler(**multi_data) -# -# self.assertTrue(isinstance(multi_args, list)) -# self.assertEqual(len(multi_args), 2) -# -# def test_kwargs_data(self): -# mono_data = {"data": "This is a sample input"} -# mono_args = self.handler(**mono_data) -# -# self.assertTrue(isinstance(mono_args, list)) -# self.assertEqual(len(mono_args), 1) -# -# multi_data = {"data": ["This is a sample input", "This is a second sample input"]} -# multi_args = self.handler(**multi_data) -# -# self.assertTrue(isinstance(multi_args, list)) -# self.assertEqual(len(multi_args), 2) -# -# def test_multi_kwargs(self): -# mono_data = {"data": "This is a sample input", "X": "This is a sample input 2"} -# mono_args = self.handler(**mono_data) -# -# self.assertTrue(isinstance(mono_args, list)) -# self.assertEqual(len(mono_args), 2) -# -# multi_data = { -# "data": ["This is a sample input", "This is a second sample input"], -# "test": ["This is a sample input 2", "This is a second sample input 2"], -# } -# multi_args = self.handler(**multi_data) -# -# self.assertTrue(isinstance(multi_args, list)) 
-# self.assertEqual(len(multi_args), 4) -# -# def test_args(self): -# mono_data = "This is a sample input" -# mono_args = self.handler(mono_data) -# -# self.assertTrue(isinstance(mono_args, list)) -# self.assertEqual(len(mono_args), 1) -# -# mono_data = ["This is a sample input"] -# mono_args = self.handler(mono_data) -# -# self.assertTrue(isinstance(mono_args, list)) -# self.assertEqual(len(mono_args), 1) -# -# multi_data = ["This is a sample input", "This is a second sample input"] -# multi_args = self.handler(multi_data) -# -# self.assertTrue(isinstance(multi_args, list)) -# self.assertEqual(len(multi_args), 2) -# -# multi_data = ["This is a sample input", "This is a second sample input"] -# multi_args = self.handler(*multi_data) -# -# self.assertTrue(isinstance(multi_args, list)) -# self.assertEqual(len(multi_args), 2) From 36e0900148d401c291f1d2faf7a470ce6084ce25 Mon Sep 17 00:00:00 2001 From: Thomas Wolf Date: Mon, 9 Nov 2020 20:38:06 +0100 Subject: [PATCH 07/24] update tokenizer to add sequence id methods --- src/transformers/tokenization_utils_base.py | 99 +++++++++++++++++++++ tests/test_tokenization_common.py | 56 +++++++++++- 2 files changed, 151 insertions(+), 4 deletions(-) diff --git a/src/transformers/tokenization_utils_base.py b/src/transformers/tokenization_utils_base.py index 1cd6e491e8ec..b7f1a71ddd0c 100644 --- a/src/transformers/tokenization_utils_base.py +++ b/src/transformers/tokenization_utils_base.py @@ -216,6 +216,9 @@ class BatchEncoding(UserDict): initialization. prepend_batch_axis (:obj:`bool`, `optional`, defaults to :obj:`False`): Whether or not to add a batch axis when converting to tensors (see :obj:`tensor_type` above). + n_sequences (:obj:`Optional[int]`, `optional`): + You can give a tensor_type here to convert the lists of integers in PyTorch/TensorFlow/Numpy Tensors at + initialization. """ def __init__( @@ -224,6 +227,7 @@ def __init__( encoding: Optional[Union[EncodingFast, Sequence[EncodingFast]]] = None, tensor_type: Union[None, str, TensorType] = None, prepend_batch_axis: bool = False, + n_sequences: Optional[int] = None, ): super().__init__(data) @@ -232,8 +236,22 @@ def __init__( self._encodings = encoding + if n_sequences is None and encoding is not None and len(encoding): + n_sequences = encoding[0].n_sequences + + self._n_sequences = n_sequences + self.convert_to_tensors(tensor_type=tensor_type, prepend_batch_axis=prepend_batch_axis) + @property + def n_sequences(self) -> Optional[int]: + """ + :obj:`Optional[int]`: The number of sequences used to generate each sample from the batch encoded in this + :class:`~transformers.BatchEncoding`. Currently can be one of :obj:`None` (unknown), :obj:`1` (a single + sentence) or :obj:`2` (a pair of sentences) + """ + return self.n_sequences + @property def is_fast(self) -> bool: """ @@ -311,6 +329,27 @@ def tokens(self, batch_index: int = 0) -> List[str]: raise ValueError("tokens() is not available when using Python-based tokenizers") return self._encodings[batch_index].tokens + def sequence_ids(self, batch_index: int = 0) -> List[Optional[int]]: + """ + Return a list mapping the tokens to the id of their original sentences: + + - :obj:`None` for special tokens added around or between sequences, + - :obj:`0` for tokens coresponding to words in the first sequence, + - :obj:`1` for tokens coresponding to words in the second sequence when a pair of sequences was jointly + encoded. + + Args: + batch_index (:obj:`int`, `optional`, defaults to 0): The index to access in the batch. 
+ + Returns: + :obj:`List[Optional[int]]`: A list indicating the sequence id corresponding to each token. Special tokens + added by the tokenizer are mapped to :obj:`None` and other tokens are mapped to the index of their + corresponding sequence. + """ + if not self._encodings: + raise ValueError("sequence_ids() is not available when using Python-based tokenizers") + return self._encodings[batch_index].sequences + def words(self, batch_index: int = 0) -> List[Optional[int]]: """ Return a list mapping the tokens to their actual word in the initial sentence for a fast tokenizer. @@ -325,8 +364,68 @@ def words(self, batch_index: int = 0) -> List[Optional[int]]: """ if not self._encodings: raise ValueError("words() is not available when using Python-based tokenizers") + warnings.warn( + "`BatchEncoding.words(batch_index: int = 0)` propperty is deprecated and should be replaced with the identical, " + "but more self-explanatory `BatchEncoding.words(batch_index: int = 0)` property.", + FutureWarning, + ) + return self.word_ids(batch_index) + + def word_ids(self, batch_index: int = 0) -> List[Optional[int]]: + """ + Return a list mapping the tokens to their actual word in the initial sentence for a fast tokenizer. + + Args: + batch_index (:obj:`int`, `optional`, defaults to 0): The index to access in the batch. + + Returns: + :obj:`List[Optional[int]]`: A list indicating the word corresponding to each token. Special tokens added by + the tokenizer are mapped to :obj:`None` and other tokens are mapped to the index of their corresponding + word (several tokens will be mapped to the same word index if they are parts of that word). + """ + if not self._encodings: + raise ValueError("word_ids() is not available when using Python-based tokenizers") return self._encodings[batch_index].words + def token_to_sequence(self, batch_or_token_index: int, token_index: Optional[int] = None) -> int: + """ + Get the index of the sequence represented by the given token. In the general use case, this method returns + :obj:`0` for a single sequence or the first sequence of a pair, and :obj:`1` for the second sequence of a pair + + Can be called as: + + - ``self.token_to_sequence(token_index)`` if batch size is 1 + - ``self.token_to_sequence(batch_index, token_index)`` if batch size is greater than 1 + + This method is particularly suited when the input sequences are provided as pre-tokenized sequences (i.e., + words are defined by the user). In this case it allows to easily associate encoded tokens with provided + tokenized words. + + Args: + batch_or_token_index (:obj:`int`): + Index of the sequence in the batch. If the batch only comprise one sequence, this can be the index of + the token in the sequence. + token_index (:obj:`int`, `optional`): + If a batch index is provided in `batch_or_token_index`, this can be the index of the token in the + sequence. + + Returns: + :obj:`int`: Index of the word in the input sequence. 
+ """ + + if not self._encodings: + raise ValueError("token_to_sequence() is not available when using Python based tokenizers") + if token_index is not None: + batch_index = batch_or_token_index + else: + batch_index = 0 + token_index = batch_or_token_index + if batch_index < 0: + batch_index = self._batch_size + batch_index + if token_index < 0: + token_index = self._seq_len + token_index + return self._encodings[batch_index].token_to_sequence(token_index) + def token_to_word(self, batch_or_token_index: int, token_index: Optional[int] = None) -> int: """ Get the index of the word corresponding (i.e. comprising) to an encoded token in a sequence of the batch. diff --git a/tests/test_tokenization_common.py b/tests/test_tokenization_common.py index 2903cca33fd6..376616a0b5de 100644 --- a/tests/test_tokenization_common.py +++ b/tests/test_tokenization_common.py @@ -581,7 +581,6 @@ def test_token_type_ids(self): for tokenizer in tokenizers: with self.subTest(f"{tokenizer.__class__.__name__}"): seq_0 = "Test this method." - seq_1 = "With these inputs." # We want to have sequence 0 and sequence 1 are tagged # respectively with 0 and 1 token_ids @@ -590,9 +589,28 @@ def test_token_type_ids(self): output = tokenizer(seq_0, return_token_type_ids=True) self.assertIn(0, output["token_type_ids"]) - output = tokenizer(seq_0, seq_1, return_token_type_ids=True) - self.assertIn(0, output["token_type_ids"]) - self.assertIn(1, output["token_type_ids"]) + def test_sequence_ids(self): + tokenizers = self.get_tokenizers() + for tokenizer in tokenizers: + if not tokenizer.is_fast: + continue + with self.subTest(f"{tokenizer.__class__.__name__}"): + seq_0 = "Test this method." + seq_1 = "With these inputs." + + # We want to have sequence 0 and sequence 1 are tagged + # respectively with 0 and 1 token_ids + # (regardeless of weither the model use token type ids) + # We use this assumption in the QA pipeline among other place + output = tokenizer(seq_0) + self.assertIn(0, output.sequence_ids()) + + output = tokenizer(seq_0, seq_1) + self.assertIn(0, output.sequence_ids()) + self.assertIn(1, output.sequence_ids()) + + if tokenizer.num_special_tokens_to_add(pair=True): + self.assertIn(None, output.sequence_ids()) def test_number_of_added_tokens(self): tokenizers = self.get_tokenizers(do_lower_case=False) @@ -1896,6 +1914,13 @@ def test_alignement_methods(self): batch_encoding.word_to_chars(last_batch_index, last_word_index).end, last_char_index + 1 ) + # Assert token_to_sequence + self.assertEqual(encoding.token_to_sequence(num_tokens // 2), 0) + self.assertEqual(encoding.token_to_sequence(0, num_tokens // 2), 0) + self.assertEqual(batch_encoding.token_to_sequence(1, num_tokens // 2), 0) + self.assertEqual(batch_encoding.token_to_sequence(0, num_tokens // 2), 0) + self.assertEqual(batch_encoding.token_to_sequence(last_batch_index, num_tokens // 2), 0) + # Pair of input sequences words = ["Wonderful", "no", "inspiration", "example", "with", "subtoken"] @@ -2004,6 +2029,29 @@ def test_alignement_methods(self): pair_text[pair_batch_encoding.word_to_chars(1, index_word_in_pair_seq, sequence_index=1).start], ) + # Assert token_to_sequence + pair_encoding = tokenizer_r.encode_plus(text, pair_text, add_special_tokens=True) + + pair_sequence_ids = [ + pair_encoding.token_to_sequence(i) for i in range(len(pair_encoding["input_ids"])) + ] + self.assertIn(0, pair_sequence_ids) + self.assertIn(1, pair_sequence_ids) + if tokenizer_r.num_special_tokens_to_add(pair=True): + self.assertIn(None, pair_sequence_ids) + + 
pair_batch_encoding = tokenizer_r.batch_encode_plus( + [(text, pair_text)] * batch_size, add_special_tokens=True + ) + pair_batch_sequence_ids = [ + pair_batch_encoding.token_to_sequence(1, i) + for i in range(len(pair_batch_encoding["input_ids"][0])) + ] + self.assertIn(0, pair_batch_sequence_ids) + self.assertIn(1, pair_batch_sequence_ids) + if tokenizer_r.num_special_tokens_to_add(pair=True): + self.assertIn(None, pair_batch_sequence_ids) + def test_tokenization_python_rust_equals(self): for tokenizer, pretrained_name, kwargs in self.tokenizers_list: with self.subTest("{} ({})".format(tokenizer.__class__.__name__, pretrained_name)): From ef4919ba22879945fc831fdb9dfc8a9bbfa8eae4 Mon Sep 17 00:00:00 2001 From: Thomas Wolf Date: Tue, 10 Nov 2020 10:42:22 +0100 Subject: [PATCH 08/24] update to tokenizers 0.9.4 --- setup.py | 4 ++-- src/transformers/tokenization_utils_base.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/setup.py b/setup.py index 04c51912fdc9..a13a5b9bda5e 100644 --- a/setup.py +++ b/setup.py @@ -96,7 +96,7 @@ extras["retrieval"] = ["faiss-cpu", "datasets"] extras["flax"] = ["jaxlib==0.1.55", "jax>=0.2.0", "flax==0.2.2"] -extras["tokenizers"] = ["tokenizers==0.9.2"] +extras["tokenizers"] = ["tokenizers==0.9.4"] extras["onnxruntime"] = ["onnxruntime>=1.4.0", "onnxruntime-tools>=1.4.2"] extras["serving"] = ["pydantic", "uvicorn", "fastapi", "starlette"] @@ -129,7 +129,7 @@ packages=find_packages("src"), install_requires=[ "numpy", - "tokenizers == 0.9.3", + "tokenizers == 0.9.4", # dataclasses for Python versions that don't have it "dataclasses;python_version<'3.7'", # utilities from PyPA to e.g. compare versions diff --git a/src/transformers/tokenization_utils_base.py b/src/transformers/tokenization_utils_base.py index b7f1a71ddd0c..f9eb8566da44 100644 --- a/src/transformers/tokenization_utils_base.py +++ b/src/transformers/tokenization_utils_base.py @@ -348,7 +348,7 @@ def sequence_ids(self, batch_index: int = 0) -> List[Optional[int]]: """ if not self._encodings: raise ValueError("sequence_ids() is not available when using Python-based tokenizers") - return self._encodings[batch_index].sequences + return self._encodings[batch_index].sequence_ids def words(self, batch_index: int = 0) -> List[Optional[int]]: """ @@ -385,7 +385,7 @@ def word_ids(self, batch_index: int = 0) -> List[Optional[int]]: """ if not self._encodings: raise ValueError("word_ids() is not available when using Python-based tokenizers") - return self._encodings[batch_index].words + return self._encodings[batch_index].word_ids def token_to_sequence(self, batch_or_token_index: int, token_index: Optional[int] = None) -> int: """ From dab816880c7a6762ba898153024e739d740c2ce6 Mon Sep 17 00:00:00 2001 From: Thomas Wolf Date: Tue, 10 Nov 2020 13:07:03 +0100 Subject: [PATCH 09/24] set sentencepiecce as optional --- setup.py | 2 +- tests/test_tokenization_xlm_prophetnet.py | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index a13a5b9bda5e..fba83903a205 100644 --- a/setup.py +++ b/setup.py @@ -143,7 +143,7 @@ # for OpenAI GPT "regex != 2019.12.17", # for SentencePiece models - "sentencepiece == 0.1.91", + # "sentencepiece == 0.1.91", "protobuf", # for XLM "sacremoses", diff --git a/tests/test_tokenization_xlm_prophetnet.py b/tests/test_tokenization_xlm_prophetnet.py index 83097ff71d71..7dfdee6b5f8a 100644 --- a/tests/test_tokenization_xlm_prophetnet.py +++ b/tests/test_tokenization_xlm_prophetnet.py @@ -18,7 +18,7 @@ import unittest from 
transformers.file_utils import cached_property
-from transformers.testing_utils import slow
+from transformers.testing_utils import require_sentencepiece, slow
 from transformers.tokenization_xlm_prophetnet import SPIECE_UNDERLINE, XLMProphetNetTokenizer
 
 from .test_tokenization_common import TokenizerTesterMixin
@@ -27,6 +27,7 @@
 SAMPLE_VOCAB = os.path.join(os.path.dirname(os.path.abspath(__file__)), "fixtures/test_sentencepiece.model")
 
 
+@require_sentencepiece
 class XLMProphetNetTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
 
     tokenizer_class = XLMProphetNetTokenizer

From 84bc2444decc6f910061bf76598f5917c599a30a Mon Sep 17 00:00:00 2001
From: Thomas Wolf
Date: Tue, 10 Nov 2020 14:29:11 +0100
Subject: [PATCH 10/24] clean up squad

---
 src/transformers/data/processors/squad.py | 15 ---------------
 1 file changed, 15 deletions(-)

diff --git a/src/transformers/data/processors/squad.py b/src/transformers/data/processors/squad.py
index 5dbfed40cb9e..167cf3ee48d9 100644
--- a/src/transformers/data/processors/squad.py
+++ b/src/transformers/data/processors/squad.py
@@ -350,20 +350,6 @@ def squad_convert_examples_to_features(
     # Defining helper methods
     features = []
 
-    #################
-    # squad_convert_example_to_features_init(tokenizer)
-    # for example in examples:
-    #     feature = squad_convert_example_to_features(
-    #         example,
-    #         max_seq_length=max_seq_length,
-    #         doc_stride=doc_stride,
-    #         max_query_length=max_query_length,
-    #         padding_strategy=padding_strategy,
-    #         is_training=is_training,
-    #     )
-    #     features.append(feature)
-
-    #################
     threads = min(threads, cpu_count())
     with Pool(threads, initializer=squad_convert_example_to_features_init, initargs=(tokenizer,)) as p:
         annotate_ = partial(
@@ -382,7 +368,6 @@ def squad_convert_examples_to_features(
             disable=not tqdm_enabled,
         )
     )
-    #################
 
     new_features = []
     unique_id = 1000000000

From 751ee692d7486c33f719b81768b658ab70b0d025 Mon Sep 17 00:00:00 2001
From: Thomas Wolf
Date: Tue, 10 Nov 2020 14:43:10 +0100
Subject: [PATCH 11/24] clean up pipelines to use sequence_ids

---
 src/transformers/pipelines.py | 14 +++++++++++---
 1 file changed, 11 insertions(+), 3 deletions(-)

diff --git a/src/transformers/pipelines.py b/src/transformers/pipelines.py
index 55a8b033a06e..ce3dcab24d36 100755
--- a/src/transformers/pipelines.py
+++ b/src/transformers/pipelines.py
@@ -1808,13 +1808,21 @@ def __call__(self, *args, **kwargs):
                 return_special_tokens_mask=True,
             )
 
+            # When the input is too long, it's converted into a batch of inputs with overflowing tokens
+            # and a stride of overlap between the inputs. If a batch of inputs is given, a special output
+            # "overflow_to_sample_mapping" indicates which member of the encoded batch belongs to which original batch sample.
+            # Here we tokenize examples one-by-one so we don't need to use "overflow_to_sample_mapping".
+            # "num_spans" is the number of output samples generated from the overflowing tokens.
num_spans = len(encoded_inputs["input_ids"]) # p_mask: mask with 1 for token than cannot be in the answer (0 for token which can be in an answer) - p_mask = encoded_inputs["token_type_ids"] == (0 if question_first else 1) # Mask the question - p_mask = p_mask | encoded_inputs["special_tokens_mask"] # And mask the special tokens + # We put 0 on the tokens from the context and 1 everywhere else (question and special tokens) + p_mask = np.asarray([[tok != 1 if question_first else 0 + for tok in encoded_inputs.sequence_ids(span_id)] + for span_id in range(num_spans)]) + + # keep the cls_token unmasked (some models use it to indicate unanswerable questions) if self.tokenizer.cls_token_id: - # keep the cls_token unmasked (some models use it to indicate unanswerable questions) cls_index = np.nonzero(encoded_inputs["input_ids"] == self.tokenizer.cls_token_id) p_mask[cls_index] = 0 From 0e8d7f7020ff0787b7da7f88d5fd89d71712a30a Mon Sep 17 00:00:00 2001 From: Thomas Wolf Date: Tue, 10 Nov 2020 15:50:44 +0100 Subject: [PATCH 12/24] style/quality --- src/transformers/pipelines.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/src/transformers/pipelines.py b/src/transformers/pipelines.py index ce3dcab24d36..aaba696c702e 100755 --- a/src/transformers/pipelines.py +++ b/src/transformers/pipelines.py @@ -1817,9 +1817,12 @@ def __call__(self, *args, **kwargs): # p_mask: mask with 1 for token than cannot be in the answer (0 for token which can be in an answer) # We put 0 on the tokens from the context and 1 everywhere else (question and special tokens) - p_mask = np.asarray([[tok != 1 if question_first else 0 - for tok in encoded_inputs.sequence_ids(span_id)] - for span_id in range(num_spans)]) + p_mask = np.asarray( + [ + [tok != 1 if question_first else 0 for tok in encoded_inputs.sequence_ids(span_id)] + for span_id in range(num_spans) + ] + ) # keep the cls_token unmasked (some models use it to indicate unanswerable questions) if self.tokenizer.cls_token_id: From eb72b1fe5cb98275e5ff1eda5ce21db69a27a34d Mon Sep 17 00:00:00 2001 From: Thomas Wolf Date: Tue, 10 Nov 2020 16:01:06 +0100 Subject: [PATCH 13/24] wording --- src/transformers/tokenization_utils_base.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/transformers/tokenization_utils_base.py b/src/transformers/tokenization_utils_base.py index 4a9ffd1d3323..5d0e50add8f1 100644 --- a/src/transformers/tokenization_utils_base.py +++ b/src/transformers/tokenization_utils_base.py @@ -367,8 +367,8 @@ def words(self, batch_index: int = 0) -> List[Optional[int]]: if not self._encodings: raise ValueError("words() is not available when using Python-based tokenizers") warnings.warn( - "`BatchEncoding.words(batch_index: int = 0)` propperty is deprecated and should be replaced with the identical, " - "but more self-explanatory `BatchEncoding.words(batch_index: int = 0)` property.", + "`BatchEncoding.words()` property is deprecated and should be replaced with the identical, " + "but more self-explanatory `BatchEncoding.word_ids()` property.", FutureWarning, ) return self.word_ids(batch_index) From 16da2c54244e448ff5da7e16380e7b56ce2b1ac6 Mon Sep 17 00:00:00 2001 From: Thomas Wolf Date: Tue, 10 Nov 2020 16:06:13 +0100 Subject: [PATCH 14/24] Switch to use_fast = True by default --- src/transformers/pipelines.py | 4 ++-- src/transformers/tokenization_auto.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/transformers/pipelines.py b/src/transformers/pipelines.py index 
aaba696c702e..8cc533d980ee 100755 --- a/src/transformers/pipelines.py +++ b/src/transformers/pipelines.py @@ -2840,7 +2840,7 @@ def pipeline( tokenizer: Optional[Union[str, PreTrainedTokenizer]] = None, framework: Optional[str] = None, revision: Optional[str] = None, - use_fast: bool = False, + use_fast: bool = True, **kwargs ) -> Pipeline: """ @@ -2898,7 +2898,7 @@ def pipeline( When passing a task name or a string model identifier: The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a git-based system for storing models and other artifacts on huggingface.co, so ``revision`` can be any identifier allowed by git. - use_fast (:obj:`bool`, `optional`, defaults to :obj:`False`): + use_fast (:obj:`bool`, `optional`, defaults to :obj:`True`): Whether or not to use a Fast tokenizer if possible (a :class:`~transformers.PreTrainedTokenizerFast`). kwargs: Additional keyword arguments passed along to the specific pipeline init (see the documentation for the diff --git a/src/transformers/tokenization_auto.py b/src/transformers/tokenization_auto.py index 93c9fbfe64a9..7e375d05986b 100644 --- a/src/transformers/tokenization_auto.py +++ b/src/transformers/tokenization_auto.py @@ -280,7 +280,7 @@ def from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs): The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a git-based system for storing models and other artifacts on huggingface.co, so ``revision`` can be any identifier allowed by git. - use_fast (:obj:`bool`, `optional`, defaults to :obj:`False`): + use_fast (:obj:`bool`, `optional`, defaults to :obj:`True`): Whether or not to try to load the fast version of the tokenizer. kwargs (additional keyword arguments, `optional`): Will be passed to the Tokenizer ``__init__()`` method. 
Can be used to set special tokens like @@ -308,7 +308,7 @@ def from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs): if "bert-base-japanese" in str(pretrained_model_name_or_path): return BertJapaneseTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs) - use_fast = kwargs.pop("use_fast", False) + use_fast = kwargs.pop("use_fast", True) if config.tokenizer_class is not None: if use_fast and not config.tokenizer_class.endswith("Fast"): From 0f03fdb6e73aa4d9679dc16462556821878279d2 Mon Sep 17 00:00:00 2001 From: Thomas Wolf Date: Tue, 10 Nov 2020 16:25:38 +0100 Subject: [PATCH 15/24] update tests for use_fast at True by default --- tests/test_pipelines_ner.py | 2 +- tests/test_tokenization_auto.py | 4 ++-- tests/test_tokenization_rag.py | 8 ++++---- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/tests/test_pipelines_ner.py b/tests/test_pipelines_ner.py index bc12900d8422..44f47d66d99b 100644 --- a/tests/test_pipelines_ner.py +++ b/tests/test_pipelines_ner.py @@ -149,7 +149,7 @@ def test_pt_ignore_subwords_slow_tokenizer_raises(self): tokenizer = AutoTokenizer.from_pretrained(model_name) with self.assertRaises(ValueError): - pipeline(task="ner", model=model_name, tokenizer=tokenizer, ignore_subwords=True) + pipeline(task="ner", model=model_name, tokenizer=tokenizer, ignore_subwords=True, use_fast=False) @require_torch def test_pt_defaults_slow_tokenizer(self): diff --git a/tests/test_tokenization_auto.py b/tests/test_tokenization_auto.py index 390e89b08939..e06d7800bb1d 100644 --- a/tests/test_tokenization_auto.py +++ b/tests/test_tokenization_auto.py @@ -116,5 +116,5 @@ def test_parents_and_children_in_mappings(self): @require_tokenizers def test_from_pretrained_use_fast_toggle(self): - self.assertIsInstance(AutoTokenizer.from_pretrained("bert-base-cased"), BertTokenizer) - self.assertIsInstance(AutoTokenizer.from_pretrained("bert-base-cased", use_fast=True), BertTokenizerFast) + self.assertIsInstance(AutoTokenizer.from_pretrained("bert-base-cased", use_fast=False), BertTokenizer) + self.assertIsInstance(AutoTokenizer.from_pretrained("bert-base-cased"), BertTokenizerFast) diff --git a/tests/test_tokenization_rag.py b/tests/test_tokenization_rag.py index 158aadca6940..3a2551b3859e 100644 --- a/tests/test_tokenization_rag.py +++ b/tests/test_tokenization_rag.py @@ -4,13 +4,12 @@ import tempfile from unittest import TestCase +from transformers import BartTokenizer, DPRQuestionEncoderTokenizer, DPRQuestionEncoderTokenizerFast from transformers.configuration_bart import BartConfig from transformers.configuration_dpr import DPRConfig from transformers.file_utils import is_datasets_available, is_faiss_available, is_torch_available -from transformers.testing_utils import require_datasets, require_faiss, require_torch, slow -from transformers.tokenization_bart import BartTokenizer +from transformers.testing_utils import require_datasets, require_faiss, require_tokenizers, require_torch, slow from transformers.tokenization_bert import VOCAB_FILES_NAMES as DPR_VOCAB_FILES_NAMES -from transformers.tokenization_dpr import DPRQuestionEncoderTokenizer from transformers.tokenization_roberta import VOCAB_FILES_NAMES as BART_VOCAB_FILES_NAMES @@ -96,6 +95,7 @@ def get_bart_tokenizer(self) -> BartTokenizer: def tearDown(self): shutil.rmtree(self.tmpdirname) + @require_tokenizers def test_save_load_pretrained_with_saved_config(self): save_dir = os.path.join(self.tmpdirname, "rag_tokenizer") @@ -104,7 +104,7 @@ def 
test_save_load_pretrained_with_saved_config(self): rag_config.save_pretrained(save_dir) rag_tokenizer.save_pretrained(save_dir) new_rag_tokenizer = RagTokenizer.from_pretrained(save_dir, config=rag_config) - self.assertIsInstance(new_rag_tokenizer.question_encoder, DPRQuestionEncoderTokenizer) + self.assertIsInstance(new_rag_tokenizer.question_encoder, DPRQuestionEncoderTokenizerFast) self.assertEqual(new_rag_tokenizer.question_encoder.vocab, rag_tokenizer.question_encoder.vocab) self.assertIsInstance(new_rag_tokenizer.generator, BartTokenizer) self.assertEqual(new_rag_tokenizer.generator.encoder, rag_tokenizer.generator.encoder) From 87cb801a7f115c835128aa408025958044aec5f6 Mon Sep 17 00:00:00 2001 From: Thomas Wolf Date: Tue, 10 Nov 2020 16:45:24 +0100 Subject: [PATCH 16/24] fix rag tokenizer test --- tests/test_tokenization_rag.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/test_tokenization_rag.py b/tests/test_tokenization_rag.py index 3a2551b3859e..63bdb541e61d 100644 --- a/tests/test_tokenization_rag.py +++ b/tests/test_tokenization_rag.py @@ -4,7 +4,7 @@ import tempfile from unittest import TestCase -from transformers import BartTokenizer, DPRQuestionEncoderTokenizer, DPRQuestionEncoderTokenizerFast +from transformers import BartTokenizer, BartTokenizerFast, DPRQuestionEncoderTokenizer, DPRQuestionEncoderTokenizerFast from transformers.configuration_bart import BartConfig from transformers.configuration_dpr import DPRConfig from transformers.file_utils import is_datasets_available, is_faiss_available, is_torch_available @@ -105,9 +105,9 @@ def test_save_load_pretrained_with_saved_config(self): rag_tokenizer.save_pretrained(save_dir) new_rag_tokenizer = RagTokenizer.from_pretrained(save_dir, config=rag_config) self.assertIsInstance(new_rag_tokenizer.question_encoder, DPRQuestionEncoderTokenizerFast) - self.assertEqual(new_rag_tokenizer.question_encoder.vocab, rag_tokenizer.question_encoder.vocab) - self.assertIsInstance(new_rag_tokenizer.generator, BartTokenizer) - self.assertEqual(new_rag_tokenizer.generator.encoder, rag_tokenizer.generator.encoder) + self.assertEqual(new_rag_tokenizer.question_encoder.get_vocab(), rag_tokenizer.question_encoder.get_vocab()) + self.assertIsInstance(new_rag_tokenizer.generator, BartTokenizerFast) + self.assertEqual(new_rag_tokenizer.generator.get_vocab(), rag_tokenizer.generator.get_vocab()) @slow def test_pretrained_token_nq_tokenizer(self): From 77ee69ff77d922a9c732d5276feace12f32716bc Mon Sep 17 00:00:00 2001 From: Thomas Wolf Date: Tue, 10 Nov 2020 17:00:14 +0100 Subject: [PATCH 17/24] removing protobuf from required dependencies --- setup.py | 4 +--- src/transformers/convert_slow_tokenizer.py | 19 +++++++-------- src/transformers/file_utils.py | 27 ++++++++++++++++++++++ 3 files changed, 36 insertions(+), 14 deletions(-) diff --git a/setup.py b/setup.py index fba83903a205..7a43c7085aa3 100644 --- a/setup.py +++ b/setup.py @@ -101,6 +101,7 @@ extras["serving"] = ["pydantic", "uvicorn", "fastapi", "starlette"] +extras["protobuf"] = ["protobuf"] extras["sentencepiece"] = ["sentencepiece==0.1.91"] extras["retrieval"] = ["faiss-cpu", "datasets"] extras["testing"] = ["pytest", "pytest-xdist", "timeout-decorator", "parameterized", "psutil"] + extras["retrieval"] @@ -142,9 +143,6 @@ "tqdm >= 4.27", # for OpenAI GPT "regex != 2019.12.17", - # for SentencePiece models - # "sentencepiece == 0.1.91", - "protobuf", # for XLM "sacremoses", ], diff --git a/src/transformers/convert_slow_tokenizer.py 
b/src/transformers/convert_slow_tokenizer.py index 8c765943c217..e856d1196768 100644 --- a/src/transformers/convert_slow_tokenizer.py +++ b/src/transformers/convert_slow_tokenizer.py @@ -24,10 +24,7 @@ from tokenizers import Tokenizer, decoders, normalizers, pre_tokenizers, processors from tokenizers.models import BPE, Unigram, WordPiece -# from transformers.tokenization_openai import OpenAIGPTTokenizer -from transformers.utils import sentencepiece_model_pb2 as model - -from .file_utils import requires_sentencepiece +from .file_utils import requires_sentencepiece, requires_protobuf class SentencePieceExtractor: @@ -64,12 +61,6 @@ def check_number_comma(piece: str) -> bool: return len(piece) < 2 or piece[-1] != "," or not piece[-2].isdigit() -def get_proto(filename: str): - m = model.ModelProto() - m.ParseFromString(open(filename, "rb").read()) - return m - - class Converter: def __init__(self, original_tokenizer): self.original_tokenizer = original_tokenizer @@ -292,8 +283,14 @@ def converted(self) -> Tokenizer: class SpmConverter(Converter): def __init__(self, *args): + requires_protobuf(self) + super().__init__(*args) - self.proto = get_proto(self.original_tokenizer.vocab_file) + + from .utils import sentencepiece_model_pb2 as model_pb2 + m = model_pb2.ModelProto() + m.ParseFromString(open(self.original_tokenizer.vocab_file, "rb").read()) + self.proto = m def vocab(self, proto): return [(piece.piece, piece.score) for piece in proto.pieces] diff --git a/src/transformers/file_utils.py b/src/transformers/file_utils.py index f6b63fa8962f..3c03a451e899 100644 --- a/src/transformers/file_utils.py +++ b/src/transformers/file_utils.py @@ -185,6 +185,15 @@ _sentencepiece_available = False +try: + import protobuf # noqa: F401 + + _protobuf_available = True + +except ImportError: + _protobuf_available = False + + try: import tokenizers # noqa: F401 @@ -270,6 +279,10 @@ def is_sentencepiece_available(): return _sentencepiece_available +def is_protobuf_available(): + return _protobuf_available + + def is_tokenizers_available(): return _tokenizers_available @@ -330,6 +343,14 @@ def wrapper(*args, **kwargs): """ +# docstyle-ignore +PROTOBUF_IMPORT_ERROR = """ +{0} requires the protobuf library but it was not found in your environment. Checkout the instructions on the +installation page of its repo: https://github.com/protocolbuffers/protobuf/tree/master/python#installation and follow the ones +that match your environment. +""" + + # docstyle-ignore FAISS_IMPORT_ERROR = """ {0} requires the faiss library but it was not found in your environment. 
Checkout the instructions on the @@ -420,6 +441,12 @@ def requires_sentencepiece(obj): raise ImportError(SENTENCEPIECE_IMPORT_ERROR.format(name)) +def requires_protobuf(obj): + name = obj.__name__ if hasattr(obj, "__name__") else obj.__class__.__name__ + if not is_protobuf_available(): + raise ImportError(PROTOBUF_IMPORT_ERROR.format(name)) + + def add_start_docstrings(*docstr): def docstring_decorator(fn): fn.__doc__ = "".join(docstr) + (fn.__doc__ if fn.__doc__ is not None else "") From 14839277cf9bb8d71801989c4309d8770cc63fd4 Mon Sep 17 00:00:00 2001 From: Thomas Wolf Date: Tue, 10 Nov 2020 17:16:30 +0100 Subject: [PATCH 18/24] fix NER test for use_fast = True by default --- src/transformers/convert_slow_tokenizer.py | 3 ++- tests/test_pipelines_ner.py | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/src/transformers/convert_slow_tokenizer.py b/src/transformers/convert_slow_tokenizer.py index e856d1196768..7e988e7fdd73 100644 --- a/src/transformers/convert_slow_tokenizer.py +++ b/src/transformers/convert_slow_tokenizer.py @@ -24,7 +24,7 @@ from tokenizers import Tokenizer, decoders, normalizers, pre_tokenizers, processors from tokenizers.models import BPE, Unigram, WordPiece -from .file_utils import requires_sentencepiece, requires_protobuf +from .file_utils import requires_protobuf, requires_sentencepiece class SentencePieceExtractor: @@ -288,6 +288,7 @@ def __init__(self, *args): super().__init__(*args) from .utils import sentencepiece_model_pb2 as model_pb2 + m = model_pb2.ModelProto() m.ParseFromString(open(self.original_tokenizer.vocab_file, "rb").read()) self.proto = m diff --git a/tests/test_pipelines_ner.py b/tests/test_pipelines_ner.py index 44f47d66d99b..58da4aded63e 100644 --- a/tests/test_pipelines_ner.py +++ b/tests/test_pipelines_ner.py @@ -146,7 +146,7 @@ def test_tf_small_ignore_subwords_available_for_fast_tokenizers(self): @require_torch def test_pt_ignore_subwords_slow_tokenizer_raises(self): for model_name in self.small_models: - tokenizer = AutoTokenizer.from_pretrained(model_name) + tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=False) with self.assertRaises(ValueError): pipeline(task="ner", model=model_name, tokenizer=tokenizer, ignore_subwords=True, use_fast=False) From b115646c8a27b0c8aa610756e58a061e3f759d61 Mon Sep 17 00:00:00 2001 From: Thomas Wolf Date: Tue, 10 Nov 2020 17:55:40 +0100 Subject: [PATCH 19/24] fixing example tests (Q&A examples use slow tokenizers for now) --- examples/question-answering/run_squad.py | 6 +++++- examples/question-answering/run_squad_trainer.py | 1 + src/transformers/tokenization_utils_base.py | 2 ++ src/transformers/tokenization_utils_fast.py | 2 ++ 4 files changed, 10 insertions(+), 1 deletion(-) diff --git a/examples/question-answering/run_squad.py b/examples/question-answering/run_squad.py index 59550347c275..4f8fe05a8645 100644 --- a/examples/question-answering/run_squad.py +++ b/examples/question-answering/run_squad.py @@ -730,6 +730,7 @@ def main(): args.tokenizer_name if args.tokenizer_name else args.model_name_or_path, do_lower_case=args.do_lower_case, cache_dir=args.cache_dir if args.cache_dir else None, + use_fast=False, # SquadDataset is not compatible with Fast tokenizers which have a smarter overflow handeling ) model = AutoModelForQuestionAnswering.from_pretrained( args.model_name_or_path, @@ -778,7 +779,10 @@ def main(): # Load a trained model and vocabulary that you have fine-tuned model = AutoModelForQuestionAnswering.from_pretrained(args.output_dir) # , force_download=True) - 
tokenizer = AutoTokenizer.from_pretrained(args.output_dir, do_lower_case=args.do_lower_case) + + # SquadDataset is not compatible with Fast tokenizers which have a smarter overflow handeling + # So we use use_fast=False here for now until Fast-tokenizer-compatible-examples are out + tokenizer = AutoTokenizer.from_pretrained(args.output_dir, do_lower_case=args.do_lower_case, use_fast=False) model.to(args.device) # Evaluation - we can ask to evaluate all the checkpoints (sub-directories) in a directory diff --git a/examples/question-answering/run_squad_trainer.py b/examples/question-answering/run_squad_trainer.py index d5fc0723164a..0bb357b21e8e 100644 --- a/examples/question-answering/run_squad_trainer.py +++ b/examples/question-answering/run_squad_trainer.py @@ -107,6 +107,7 @@ def main(): tokenizer = AutoTokenizer.from_pretrained( model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path, cache_dir=model_args.cache_dir, + use_fast=False, # SquadDataset is not compatible with Fast tokenizers which have a smarter overflow handeling ) model = AutoModelForQuestionAnswering.from_pretrained( model_args.model_name_or_path, diff --git a/src/transformers/tokenization_utils_base.py b/src/transformers/tokenization_utils_base.py index 5d0e50add8f1..a05e06aad63a 100644 --- a/src/transformers/tokenization_utils_base.py +++ b/src/transformers/tokenization_utils_base.py @@ -1992,6 +1992,8 @@ def _save_pretrained( "Only fast tokenizers (instances of PretrainedTokenizerFast) can be saved in non legacy format." ) + save_directory = str(save_directory) + added_tokens_file = os.path.join( save_directory, (filename_prefix + "-" if filename_prefix else "") + ADDED_TOKENS_FILE ) diff --git a/src/transformers/tokenization_utils_fast.py b/src/transformers/tokenization_utils_fast.py index 6c68c44e6741..c672a0b02ef2 100644 --- a/src/transformers/tokenization_utils_fast.py +++ b/src/transformers/tokenization_utils_fast.py @@ -527,6 +527,8 @@ def _save_pretrained( Fast tokenizers can also be saved in a unique JSON file containing {config + vocab + added-tokens} using the specific :meth:`~transformers.PreTrainedTokenizerFast._save_pretrained` """ + save_directory = str(save_directory) + if legacy_format: added_tokens_file = os.path.join( save_directory, (filename_prefix + "-" if filename_prefix else "") + ADDED_TOKENS_FILE From 56f77e878586356a24c5199824e783742a85712d Mon Sep 17 00:00:00 2001 From: Thomas Wolf Date: Tue, 10 Nov 2020 18:09:35 +0100 Subject: [PATCH 20/24] protobuf in main deps extras["sentencepiece"] and example deps --- examples/requirements.txt | 1 + setup.py | 3 +-- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/requirements.txt b/examples/requirements.txt index 9c2704796789..1ce783440f6e 100644 --- a/examples/requirements.txt +++ b/examples/requirements.txt @@ -18,3 +18,4 @@ fire pytest conllu sentencepiece != 0.1.92 +protobuf diff --git a/setup.py b/setup.py index 7a43c7085aa3..7e7e34661b6f 100644 --- a/setup.py +++ b/setup.py @@ -101,8 +101,7 @@ extras["serving"] = ["pydantic", "uvicorn", "fastapi", "starlette"] -extras["protobuf"] = ["protobuf"] -extras["sentencepiece"] = ["sentencepiece==0.1.91"] +extras["sentencepiece"] = ["sentencepiece==0.1.91", "protobuf"] extras["retrieval"] = ["faiss-cpu", "datasets"] extras["testing"] = ["pytest", "pytest-xdist", "timeout-decorator", "parameterized", "psutil"] + extras["retrieval"] # sphinx-rtd-theme==0.5.0 introduced big changes in the style. 
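For reference, a minimal usage sketch of the sequence-id API added earlier in this series (illustrative only; the checkpoint name and the mask convention are assumptions drawn from the diffs above, not part of any commit):

    from transformers import AutoTokenizer

    # After this series, AutoTokenizer returns a fast (Rust-backed) tokenizer by default.
    tokenizer = AutoTokenizer.from_pretrained("roberta-base")  # any fast tokenizer works here

    question = "Who wrote the book?"
    context = "The book was written by a famous author."
    encoded = tokenizer(question, context)

    # None for special tokens, 0 for question tokens, 1 for context tokens.
    print(encoded.sequence_ids())

    # word_ids() is the new name for the deprecated words() accessor.
    print(encoded.word_ids())

    # Roughly how the question-answering pipeline now builds p_mask:
    # mask (1) every token that does not belong to the context (sequence id != 1).
    p_mask = [0 if seq_id == 1 else 1 for seq_id in encoded.sequence_ids()]
    print(p_mask)

Unlike the previous token_type_ids-based masking, this works for tokenizers such as RoBERTa's that do not produce distinguishing token type ids, which is the assumption the slow/fast equivalence tests in this series rely on.
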
From 6894fc0b8bc15718adffde664f3b3e6c9e745f24 Mon Sep 17 00:00:00 2001 From: Thomas Wolf Date: Tue, 10 Nov 2020 19:32:03 +0100 Subject: [PATCH 21/24] fix protobug install test --- src/transformers/file_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/file_utils.py b/src/transformers/file_utils.py index 3c03a451e899..374b10dafabe 100644 --- a/src/transformers/file_utils.py +++ b/src/transformers/file_utils.py @@ -186,7 +186,7 @@ try: - import protobuf # noqa: F401 + import google.protobuf # noqa: F401 _protobuf_available = True From 2441d401ceb02c97f6783395f42816c3408f65c7 Mon Sep 17 00:00:00 2001 From: Thomas Wolf Date: Tue, 10 Nov 2020 19:38:02 +0100 Subject: [PATCH 22/24] try to fix seq2seq by switching to slow tokenizers for now --- examples/seq2seq/test_datasets.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/seq2seq/test_datasets.py b/examples/seq2seq/test_datasets.py index 625b6da347d3..4cbce79eaa92 100644 --- a/examples/seq2seq/test_datasets.py +++ b/examples/seq2seq/test_datasets.py @@ -197,7 +197,7 @@ def test_distributed_sortish_sampler_splits_indices_between_procs(self): ) @require_torch_non_multigpu_but_fix_me def test_dataset_kwargs(self, tok_name): - tokenizer = AutoTokenizer.from_pretrained(tok_name) + tokenizer = AutoTokenizer.from_pretrained(tok_name, use_fast=False) if tok_name == MBART_TINY: train_dataset = Seq2SeqDataset( tokenizer, From fc2daadeeb3cfe2fb43ce5edf49478bb734f5f1d Mon Sep 17 00:00:00 2001 From: Thomas Wolf Date: Tue, 10 Nov 2020 21:01:58 +0100 Subject: [PATCH 23/24] Update src/transformers/tokenization_utils_base.py Co-authored-by: Lysandre Debut --- src/transformers/tokenization_utils_base.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/transformers/tokenization_utils_base.py b/src/transformers/tokenization_utils_base.py index a05e06aad63a..744e2440ba3a 100644 --- a/src/transformers/tokenization_utils_base.py +++ b/src/transformers/tokenization_utils_base.py @@ -336,8 +336,8 @@ def sequence_ids(self, batch_index: int = 0) -> List[Optional[int]]: Return a list mapping the tokens to the id of their original sentences: - :obj:`None` for special tokens added around or between sequences, - - :obj:`0` for tokens coresponding to words in the first sequence, - - :obj:`1` for tokens coresponding to words in the second sequence when a pair of sequences was jointly + - :obj:`0` for tokens corresponding to words in the first sequence, + - :obj:`1` for tokens corresponding to words in the second sequence when a pair of sequences was jointly encoded. Args: From 002848bf7dcd4c18ea957c542e3b0f099a641214 Mon Sep 17 00:00:00 2001 From: Thomas Wolf Date: Tue, 10 Nov 2020 21:02:08 +0100 Subject: [PATCH 24/24] Update src/transformers/tokenization_utils_base.py Co-authored-by: Lysandre Debut --- src/transformers/tokenization_utils_base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/tokenization_utils_base.py b/src/transformers/tokenization_utils_base.py index 744e2440ba3a..a7581b70f8c6 100644 --- a/src/transformers/tokenization_utils_base.py +++ b/src/transformers/tokenization_utils_base.py @@ -405,7 +405,7 @@ def token_to_sequence(self, batch_or_token_index: int, token_index: Optional[int Args: batch_or_token_index (:obj:`int`): - Index of the sequence in the batch. If the batch only comprise one sequence, this can be the index of + Index of the sequence in the batch. 
If the batch only comprises one sequence, this can be the index of the token in the sequence. token_index (:obj:`int`, `optional`): If a batch index is provided in `batch_or_token_index`, this can be the index of the token in the