@@ -1454,36 +1454,41 @@ class LEDSeq2SeqQuestionAnsweringModelOutput(ModelOutput):
14541454"""
14551455
14561456LED_GENERATION_EXAMPLE = r"""
- Summarization example::
-
- >>> import torch >>> from transformers import LEDTokenizer, LEDForConditionalGeneration
-
- >>> model = LEDForConditionalGeneration.from_pretrained('allenai/led-large-16384-arxiv') >>> tokenizer =
- LEDTokenizer.from_pretrained('allenai/led-large-16384-arxiv')
-
- >>> ARTICLE_TO_SUMMARIZE = '''Transformers (Vaswani et al., 2017) have achieved state-of-the-art ... results in
- a wide range of natural language tasks including generative ... language modeling (Dai et al., 2019; Radford et
- al., 2019) and discriminative ... language understanding (Devlin et al., 2019). This success is partly due to
- ... the self-attention component which enables the network to capture contextual ... information from the
- entire sequence. While powerful, the memory and computational ... requirements of self-attention grow
- quadratically with sequence length, making ... it infeasible (or very expensive) to process long sequences. ...
- ... To address this limitation, we present Longformer, a modified Transformer ... architecture with a
- self-attention operation that scales linearly with the ... sequence length, making it versatile for processing
- long documents (Fig 1). This ... is an advantage for natural language tasks such as long document
- classification, ... question answering (QA), and coreference resolution, where existing approaches ...
- partition or shorten the long context into smaller sequences that fall within the ... typical 512 token limit
- of BERT-style pretrained models. Such partitioning could ... potentially result in loss of important
- cross-partition information, and to ... mitigate this problem, existing methods often rely on complex
- architectures to ... address such interactions. On the other hand, our proposed Longformer is able to ... build
- contextual representations of the entire context using multiple layers of ... attention, reducing the need for
- task-specific architectures.''' >>> inputs = tokenizer.encode(ARTICLE_TO_SUMMARIZE, return_tensors='pt')
-
- >>> # Global attention on the first token (cf. Beltagy et al. 2020) >>> global_attention_mask =
- torch.zeros_like(inputs) >>> global_attention_mask[:, 0] = 1
-
- >>> # Generate Summary >>> summary_ids = model.generate(inputs, global_attention_mask=global_attention_mask,
- ... num_beams=3, max_length=32, early_stopping=True) >>> print(tokenizer.decode(summary_ids[0],
- skip_special_tokens=True, clean_up_tokenization_spaces=True))
+ Summarization example:
+
+ ```python
+ >>> import torch
+ >>> from transformers import LEDTokenizer, LEDForConditionalGeneration
+
+ >>> model = LEDForConditionalGeneration.from_pretrained("allenai/led-large-16384-arxiv")
+ >>> tokenizer = LEDTokenizer.from_pretrained("allenai/led-large-16384-arxiv")
+
+ >>> ARTICLE_TO_SUMMARIZE = '''Transformers (Vaswani et al., 2017) have achieved state-of-the-art
+ ... results in a wide range of natural language tasks including generative language modeling
+ ... (Dai et al., 2019; Radford et al., 2019) and discriminative language understanding (Devlin et al., 2019).
+ ... This success is partly due to the self-attention component which enables the network to capture contextual
+ ... information from the entire sequence. While powerful, the memory and computational requirements of
+ ... self-attention grow quadratically with sequence length, making it infeasible (or very expensive) to
+ ... process long sequences. To address this limitation, we present Longformer, a modified Transformer
+ ... architecture with a self-attention operation that scales linearly with the sequence length, making it
+ ... versatile for processing long documents (Fig 1). This is an advantage for natural language tasks such as
+ ... long document classification, question answering (QA), and coreference resolution, where existing approaches
+ ... partition or shorten the long context into smaller sequences that fall within the typical 512 token limit
+ ... of BERT-style pretrained models. Such partitioning could potentially result in loss of important
+ ... cross-partition information, and to mitigate this problem, existing methods often rely on complex
+ ... architectures to address such interactions. On the other hand, our proposed Longformer is able to build
+ ... contextual representations of the entire context using multiple layers of attention, reducing the need for
+ ... task-specific architectures.'''
+ >>> inputs = tokenizer.encode(ARTICLE_TO_SUMMARIZE, return_tensors="pt")
+
+ >>> # Global attention on the first token (cf. Beltagy et al. 2020)
+ >>> global_attention_mask = torch.zeros_like(inputs)
+ >>> global_attention_mask[:, 0] = 1
+
+ >>> # Generate Summary
+ >>> summary_ids = model.generate(inputs, global_attention_mask=global_attention_mask, num_beams=3, max_length=32)
+ >>> print(tokenizer.decode(summary_ids[0], skip_special_tokens=True, clean_up_tokenization_spaces=True))
+ ```
14871492"""
14881493
14891494LED_INPUTS_DOCSTRING = r"""