[breaking|pipelines|tokenizers] Adding slow-fast tokenizers equivalence tests pipelines - Removing sentencepiece as a required dependency #8073
@@ -32,7 +32,7 @@
 from .configuration_auto import AutoConfig
 from .configuration_utils import PretrainedConfig
-from .data import SquadExample, squad_convert_examples_to_features
+from .data import SquadExample, SquadFeatures, squad_convert_examples_to_features
 from .file_utils import add_end_docstrings, is_tf_available, is_torch_available
 from .modelcard import ModelCard
 from .tokenization_auto import AutoTokenizer
@@ -1758,6 +1758,7 @@ def __call__(self, *args, **kwargs):
             - **answer** (:obj:`str`) -- The answer to the question.
         """
         # Set defaults values
+        kwargs.setdefault("padding", "longest")
         kwargs.setdefault("topk", 1)
         kwargs.setdefault("doc_stride", 128)
         kwargs.setdefault("max_answer_len", 15)
@@ -1773,19 +1774,87 @@ def __call__(self, *args, **kwargs):

         # Convert inputs to features
         examples = self._args_parser(*args, **kwargs)
-        features_list = [
-            squad_convert_examples_to_features(
-                examples=[example],
-                tokenizer=self.tokenizer,
-                max_seq_length=kwargs["max_seq_len"],
-                doc_stride=kwargs["doc_stride"],
-                max_query_length=kwargs["max_question_len"],
-                padding_strategy=PaddingStrategy.MAX_LENGTH.value,
-                is_training=False,
-                tqdm_enabled=False,
-            )
-            for example in examples
-        ]
+        if not self.tokenizer.is_fast:
+            features_list = [
+                squad_convert_examples_to_features(
+                    examples=[example],
+                    tokenizer=self.tokenizer,
+                    max_seq_length=kwargs["max_seq_len"],
+                    doc_stride=kwargs["doc_stride"],
+                    max_query_length=kwargs["max_question_len"],
+                    padding_strategy=PaddingStrategy.MAX_LENGTH.value,
+                    is_training=False,
+                    tqdm_enabled=False,
+                )
+                for example in examples
+            ]
+        else:
+            features_list = []
+            for example in examples:
+                # Define the side we want to truncate / pad and the text/pair sorting
+                question_first = bool(self.tokenizer.padding_side == "right")
+
+                encoded_inputs = self.tokenizer(
+                    text=example.question_text if question_first else example.context_text,
+                    text_pair=example.context_text if question_first else example.question_text,
+                    padding=kwargs["padding"],
+                    truncation="only_second" if question_first else "only_first",
+                    max_length=kwargs["max_seq_len"],
+                    stride=kwargs["doc_stride"],
+                    return_tensors="np",
+                    return_token_type_ids=True,
+                    return_overflowing_tokens=True,
+                    return_offsets_mapping=True,
+                    return_special_tokens_mask=True,
+                )
+
+                # When the input is too long, it is converted into a batch of inputs with overflowing tokens
+                # and a stride of overlap between the inputs. If a batch of inputs is given, a special output
+                # "overflow_to_sample_mapping" indicates which member of the encoded batch belongs to which original sample.
+                # Here we tokenize examples one-by-one so we don't need to use "overflow_to_sample_mapping".
+                # "num_spans" is the number of output samples generated from the overflowing tokens.
+                num_spans = len(encoded_inputs["input_ids"])
+
+                # p_mask: mask with 1 for tokens that cannot be in the answer (0 for tokens which can be in an answer)
+                # We put 0 on the tokens from the context and 1 everywhere else (question and special tokens)
+                p_mask = np.asarray(
+                    [
+                        [tok != 1 if question_first else 0 for tok in encoded_inputs.sequence_ids(span_id)]
+                        for span_id in range(num_spans)
+                    ]
+                )
+
+                # keep the cls_token unmasked (some models use it to indicate unanswerable questions)
+                if self.tokenizer.cls_token_id:
+                    cls_index = np.nonzero(encoded_inputs["input_ids"] == self.tokenizer.cls_token_id)
+                    p_mask[cls_index] = 0
+
+                features = []
+                for span_idx in range(num_spans):
+                    features.append(
+                        SquadFeatures(
+                            input_ids=encoded_inputs["input_ids"][span_idx],
+                            attention_mask=encoded_inputs["attention_mask"][span_idx],
+                            token_type_ids=encoded_inputs["token_type_ids"][span_idx],
+                            p_mask=p_mask[span_idx].tolist(),
+                            encoding=encoded_inputs[span_idx],
+                            # We don't use the rest of the values - and actually
+                            # for Fast tokenizer we could totally avoid using SquadFeatures and SquadExample
Collaborator: That would be nice, as a longer-term goal :-)

Member: That would be nice, indeed! If we plan on keeping the

Member (Author): No, we can remove them. I didn't spend too much time making the pipeline pretty since there is a big redesign coming soon which will likely get rid of the slow tokenizers.
+                            cls_index=None,
+                            token_to_orig_map={},
+                            example_index=0,
+                            unique_id=0,
+                            paragraph_len=0,
+                            token_is_max_context=0,
+                            tokens=[],
+                            start_position=0,
+                            end_position=0,
+                            is_impossible=False,
+                            qas_id=None,
+                        )
+                    )
+                features_list.append(features)

         all_answers = []
         for features, example in zip(features_list, examples):
             model_input_names = self.tokenizer.model_input_names + ["input_ids"]
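For readers following the fast-tokenizer branch above, here is a small self-contained sketch (not part of the diff) of the encoding step it relies on: calling a fast tokenizer with `return_overflowing_tokens=True` and `return_offsets_mapping=True` splits a long question/context pair into overlapping spans, and `sequence_ids()` is what the `p_mask` computation uses to separate question tokens from context tokens. The checkpoint name, the texts, and the lengths below are placeholders, not values from the PR.

```python
import numpy as np
from transformers import AutoTokenizer

# Placeholder checkpoint; any model with a fast tokenizer should behave similarly.
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased", use_fast=True)

question = "Who wrote the book?"
context = "The book was written by Jane Doe. " * 50  # long enough to overflow

encoded = tokenizer(
    text=question,
    text_pair=context,
    padding="longest",
    truncation="only_second",  # mirrors question_first=True in the pipeline
    max_length=64,
    stride=16,
    return_tensors="np",
    return_token_type_ids=True,
    return_overflowing_tokens=True,
    return_offsets_mapping=True,
    return_special_tokens_mask=True,
)

# One row per overlapping span of the context.
num_spans = len(encoded["input_ids"])
print(num_spans)  # > 1, because the context does not fit in 64 tokens

# p_mask as in the diff: 1 for tokens that cannot contain the answer
# (question + special/padding tokens), 0 for context tokens (sequence id 1).
p_mask = np.asarray(
    [[tok != 1 for tok in encoded.sequence_ids(span)] for span in range(num_spans)]
)
print(p_mask.shape)  # (num_spans, 64)
```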
@@ -1828,20 +1897,56 @@ def __call__(self, *args, **kwargs):
                 start_[0] = end_[0] = 0.0

                 starts, ends, scores = self.decode(start_, end_, kwargs["topk"], kwargs["max_answer_len"])
-                char_to_word = np.array(example.char_to_word_offset)
-
-                # Convert the answer (tokens) back to the original text
-                answers += [
-                    {
-                        "score": score.item(),
-                        "start": np.where(char_to_word == feature.token_to_orig_map[s])[0][0].item(),
-                        "end": np.where(char_to_word == feature.token_to_orig_map[e])[0][-1].item(),
-                        "answer": " ".join(
-                            example.doc_tokens[feature.token_to_orig_map[s] : feature.token_to_orig_map[e] + 1]
-                        ),
-                    }
-                    for s, e, score in zip(starts, ends, scores)
-                ]
+                if not self.tokenizer.is_fast:
+                    char_to_word = np.array(example.char_to_word_offset)
+
+                    # Convert the answer (tokens) back to the original text
+                    # Score: score from the model
+                    # Start: Index of the first character of the answer in the context string
+                    # End: Index of the character following the last character of the answer in the context string
+                    # Answer: Plain text of the answer
+                    answers += [
+                        {
+                            "score": score.item(),
+                            "start": np.where(char_to_word == feature.token_to_orig_map[s])[0][0].item(),
+                            "end": np.where(char_to_word == feature.token_to_orig_map[e])[0][-1].item(),
+                            "answer": " ".join(
+                                example.doc_tokens[feature.token_to_orig_map[s] : feature.token_to_orig_map[e] + 1]
+                            ),
+                        }
+                        for s, e, score in zip(starts, ends, scores)
+                    ]
+                else:
+                    # Convert the answer (tokens) back to the original text
+                    # Score: score from the model
+                    # Start: Index of the first character of the answer in the context string
+                    # End: Index of the character following the last character of the answer in the context string
+                    # Answer: Plain text of the answer
+                    question_first = bool(self.tokenizer.padding_side == "right")
+                    enc = feature.encoding
+
+                    # Sometimes the max probability token is in the middle of a word so:
+                    # - we start by finding the right word containing the token with `token_to_word`
+                    # - then we convert this word into a character span with `word_to_chars`
+                    answers += [
+                        {
+                            "score": score.item(),
+                            "start": enc.word_to_chars(
+                                enc.token_to_word(s), sequence_index=1 if question_first else 0
+                            )[0],
+                            "end": enc.word_to_chars(enc.token_to_word(e), sequence_index=1 if question_first else 0)[
+                                1
+                            ],
+                            "answer": example.context_text[
+                                enc.word_to_chars(enc.token_to_word(s), sequence_index=1 if question_first else 0)[
+                                    0
+                                ] : enc.word_to_chars(enc.token_to_word(e), sequence_index=1 if question_first else 0)[
+                                    1
+                                ]
+                            ],
+                        }
+                        for s, e, score in zip(starts, ends, scores)
+                    ]

             if kwargs["handle_impossible_answer"]:
                 answers.append({"score": min_null_score, "start": 0, "end": 0, "answer": ""})
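Separately, here is a hedged, standalone illustration (not from the PR) of the two-step mapping the fast-tokenizer branch above performs with `token_to_word` and `word_to_chars`: predicted answer token indices are turned into word indices, then into character offsets in the original context string. The checkpoint and the hard-coded token indices are hypothetical.

```python
from transformers import AutoTokenizer

# Placeholder checkpoint with a fast tokenizer.
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased", use_fast=True)

question = "Who wrote the book?"
context = "The novel was written by Jane Doe in 1999."

enc = tokenizer(question, context, return_offsets_mapping=True)

# Pretend the model predicted this start/end token pair inside the context
# (indices are hypothetical, chosen for illustration only).
s, e = 8, 9

# Token index -> word index, then word index -> character span in sequence 1 (the context).
char_start = enc.word_to_chars(enc.token_to_word(s), sequence_index=1)[0]
char_end = enc.word_to_chars(enc.token_to_word(e), sequence_index=1)[1]

print(context[char_start:char_end])  # the answer text, sliced straight from the context
```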
@@ -2735,7 +2840,7 @@ def pipeline(
     tokenizer: Optional[Union[str, PreTrainedTokenizer]] = None,
     framework: Optional[str] = None,
     revision: Optional[str] = None,
-    use_fast: bool = False,
+    use_fast: bool = True,
thomwolf marked this conversation as resolved.
     **kwargs
 ) -> Pipeline:
     """
@@ -2793,7 +2898,7 @@ def pipeline(
             When passing a task name or a string model identifier: The specific model version to use. It can be a
             branch name, a tag name, or a commit id, since we use a git-based system for storing models and other
             artifacts on huggingface.co, so ``revision`` can be any identifier allowed by git.
-        use_fast (:obj:`bool`, `optional`, defaults to :obj:`False`):
+        use_fast (:obj:`bool`, `optional`, defaults to :obj:`True`):
             Whether or not to use a Fast tokenizer if possible (a :class:`~transformers.PreTrainedTokenizerFast`).
         kwargs:
             Additional keyword arguments passed along to the specific pipeline init (see the documentation for the
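As a usage note: with `use_fast` now defaulting to `True`, `pipeline()` loads the Rust-backed fast tokenizer whenever the checkpoint provides one, and `use_fast=False` restores the previous behavior. A minimal sketch (the checkpoint and inputs are illustrative, not taken from the PR):

```python
from transformers import pipeline

# The fast tokenizer is now picked up by default when the checkpoint ships one.
qa = pipeline("question-answering", model="distilbert-base-cased-distilled-squad")
print(qa.tokenizer.is_fast)  # True for checkpoints with a fast tokenizer

result = qa(
    question="Who wrote the book?",
    context="The novel was written by Jane Doe in 1999.",
)
# `start`/`end` are character offsets into the context string, as described above.
print(result["answer"], result["start"], result["end"], result["score"])

# Opting back into the slow (Python) tokenizer:
qa_slow = pipeline(
    "question-answering",
    model="distilbert-base-cased-distilled-squad",
    use_fast=False,
)
```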
Second file in the diff (the RoBERTa fast tokenizer):
@@ -18,6 +18,7 @@

 from .tokenization_gpt2_fast import GPT2TokenizerFast
 from .tokenization_roberta import RobertaTokenizer
+from .tokenization_utils_base import AddedToken
 from .utils import logging
@@ -172,6 +173,32 @@
             **kwargs,
         )

+    @property
+    def mask_token(self) -> str:
+        """
+        :obj:`str`: Mask token, to use when training a model with masked-language modeling. Log an error if used while
+        not having been set.
+
+        Roberta tokenizer has a special mask token to be usable in the fill-mask pipeline. The mask token will greedily
+        comprise the space before the `<mask>`.
+        """
+        if self._mask_token is None and self.verbose:
+            logger.error("Using mask_token, but it is not set yet.")
|
Comment on lines
+185
to
+186
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Do we need the
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. No indeed, let me remove that in a follow-up PR |
+            return None
+        return str(self._mask_token)
+
+    @mask_token.setter
+    def mask_token(self, value):
+        """
+        Overriding the default behavior of the mask token to have it eat the space before it.
+
+        This is needed to preserve backward compatibility with all the previously used models based on Roberta.
+        """
+        # Mask token behaves like a normal word, i.e. include the space before it
+        # So we set lstrip to True
+        value = AddedToken(value, lstrip=True, rstrip=False) if isinstance(value, str) else value
+        self._mask_token = value
+
     def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
         output = [self.bos_token_id] + token_ids_0 + [self.eos_token_id]
         if token_ids_1 is None:
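To make the effect of the new setter concrete, here is a short sketch (not part of the diff) showing why `<mask>` is registered as an `AddedToken` with `lstrip=True`: the space in front of the mask placeholder is absorbed into the mask token, which keeps the fast tokenizer consistent with the slow RoBERTa tokenizer and with the fill-mask pipeline. The checkpoint name is illustrative.

```python
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("roberta-base", use_fast=True)

text = f"The capital of France is {tok.mask_token}."
tokens = tok.convert_ids_to_tokens(tok(text)["input_ids"])
print(tokens)
# The space before `<mask>` is consumed by the mask token (lstrip=True),
# matching how the slow RoBERTa tokenizer handles it.
```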