@@ -1454,36 +1454,41 @@ class LEDSeq2SeqQuestionAnsweringModelOutput(ModelOutput):
14541454"""
14551455
14561456LED_GENERATION_EXAMPLE = r"""
- Summarization example::
-
- >>> import torch >>> from transformers import LEDTokenizer, LEDForConditionalGeneration
-
- >>> model = LEDForConditionalGeneration.from_pretrained('allenai/led-large-16384-arxiv') >>> tokenizer =
- LEDTokenizer.from_pretrained('allenai/led-large-16384-arxiv')
-
- >>> ARTICLE_TO_SUMMARIZE = '''Transformers (Vaswani et al., 2017) have achieved state-of-the-art ... results in
- a wide range of natural language tasks including generative ... language modeling (Dai et al., 2019; Radford et
- al., 2019) and discriminative ... language understanding (Devlin et al., 2019). This success is partly due to
- ... the self-attention component which enables the network to capture contextual ... information from the
- entire sequence. While powerful, the memory and computational ... requirements of self-attention grow
- quadratically with sequence length, making ... it infeasible (or very expensive) to process long sequences. ...
- ... To address this limitation, we present Longformer, a modified Transformer ... architecture with a
- self-attention operation that scales linearly with the ... sequence length, making it versatile for processing
- long documents (Fig 1). This ... is an advantage for natural language tasks such as long document
- classification, ... question answering (QA), and coreference resolution, where existing approaches ...
- partition or shorten the long context into smaller sequences that fall within the ... typical 512 token limit
- of BERT-style pretrained models. Such partitioning could ... potentially result in loss of important
- cross-partition information, and to ... mitigate this problem, existing methods often rely on complex
- architectures to ... address such interactions. On the other hand, our proposed Longformer is able to ... build
- contextual representations of the entire context using multiple layers of ... attention, reducing the need for
- task-specific architectures.''' >>> inputs = tokenizer.encode(ARTICLE_TO_SUMMARIZE, return_tensors='pt')
-
- >>> # Global attention on the first token (cf. Beltagy et al. 2020) >>> global_attention_mask =
- torch.zeros_like(inputs) >>> global_attention_mask[:, 0] = 1
-
- >>> # Generate Summary >>> summary_ids = model.generate(inputs, global_attention_mask=global_attention_mask,
- ... num_beams=3, max_length=32, early_stopping=True) >>> print(tokenizer.decode(summary_ids[0],
- skip_special_tokens=True, clean_up_tokenization_spaces=True))
+ Summarization example:
+
+ ```python
+ >>> import torch
+ >>> from transformers import LEDTokenizer, LEDForConditionalGeneration
+
+ >>> model = LEDForConditionalGeneration.from_pretrained("allenai/led-large-16384-arxiv")
+ >>> tokenizer = LEDTokenizer.from_pretrained("allenai/led-large-16384-arxiv")
+
+ >>> ARTICLE_TO_SUMMARIZE = '''Transformers (Vaswani et al., 2017) have achieved state-of-the-art
+ ... results in a wide range of natural language tasks including generative language modeling
+ ... (Dai et al., 2019; Radford et al., 2019) and discriminative language understanding (Devlin et al., 2019).
+ ... This success is partly due to the self-attention component which enables the network to capture contextual
+ ... information from the entire sequence. While powerful, the memory and computational requirements of
+ ... self-attention grow quadratically with sequence length, making it infeasible (or very expensive) to
+ ... process long sequences. To address this limitation, we present Longformer, a modified Transformer
+ ... architecture with a self-attention operation that scales linearly with the sequence length, making it
+ ... versatile for processing long documents (Fig 1). This is an advantage for natural language tasks such as
+ ... long document classification, question answering (QA), and coreference resolution, where existing approaches
+ ... partition or shorten the long context into smaller sequences that fall within the typical 512 token limit
+ ... of BERT-style pretrained models. Such partitioning could potentially result in loss of important
+ ... cross-partition information, and to mitigate this problem, existing methods often rely on complex
+ ... architectures to address such interactions. On the other hand, our proposed Longformer is able to build
+ ... contextual representations of the entire context using multiple layers of attention, reducing the need for
+ ... task-specific architectures.'''
+ >>> inputs = tokenizer.encode(ARTICLE_TO_SUMMARIZE, return_tensors="pt")
+
+ >>> # Global attention on the first token (cf. Beltagy et al. 2020)
+ >>> global_attention_mask = torch.zeros_like(inputs)
+ >>> global_attention_mask[:, 0] = 1
+
+ >>> # Generate Summary
+ >>> summary_ids = model.generate(inputs, global_attention_mask=global_attention_mask, num_beams=3, max_length=32)
+ >>> print(tokenizer.decode(summary_ids[0], skip_special_tokens=True, clean_up_tokenization_spaces=True))
+ ```
14871492"""
14881493
14891494LED_INPUTS_DOCSTRING = r"""