Skip to content

Commit 650fb02

Browse files
Use newer version of multi-news with fixes (#4451)
* Use newer version of multi-news with fixes
* Update dummy data and dataset_infos.json

Co-authored-by: mariosasko <[email protected]>
1 parent 39f02e4 commit 650fb02

File tree

3 files changed

+21
-13
lines changed

3 files changed

+21
-13
lines changed
Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
{"default": {"description": "\nMulti-News, consists of news articles and human-written summaries\nof these articles from the site newser.com.\nEach summary is professionally written by editors and\nincludes links to the original articles cited.\n\nThere are two features:\n - document: text of news articles seperated by special token \"|||||\".\n - summary: news summary.\n", "citation": "\n@misc{alex2019multinews,\n title={Multi-News: a Large-Scale Multi-Document Summarization Dataset and Abstractive Hierarchical Model},\n author={Alexander R. Fabbri and Irene Li and Tianwei She and Suyi Li and Dragomir R. Radev},\n year={2019},\n eprint={1906.01749},\n archivePrefix={arXiv},\n primaryClass={cs.CL}\n}\n", "homepage": "https://github.com/Alex-Fabbri/Multi-News", "license": "", "features": {"document": {"dtype": "string", "id": null, "_type": "Value"}, "summary": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": {"input": "document", "output": "summary"}, "task_templates": null, "builder_name": "multi_news", "config_name": "default", "version": {"version_str": "1.0.0", "description": null, "major": 1, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 561149267, "num_examples": 44972, "dataset_name": "multi_news"}, "validation": {"name": "validation", "num_bytes": 68617339, "num_examples": 5622, "dataset_name": "multi_news"}, "test": {"name": "test", "num_bytes": 70389647, "num_examples": 5622, "dataset_name": "multi_news"}}, "download_checksums": {"https://drive.google.com/uc?export=download&id=1vRY2wM6rlOZrf9exGTm5pXj5ExlVwJ0C": {"num_bytes": 256966232, "checksum": "64ae4d2483b248c9664b50bacfab6821f8a3e93f382c7587686fa4a127f77626"}}, "download_size": 256966232, "post_processing_size": null, "dataset_size": 700156253, "size_in_bytes": 957122485}}
1+
{"default": {"description": "\nMulti-News, consists of news articles and human-written summaries\nof these articles from the site newser.com.\nEach summary is professionally written by editors and\nincludes links to the original articles cited.\n\nThere are two features:\n - document: text of news articles seperated by special token \"|||||\".\n - summary: news summary.\n", "citation": "\n@misc{alex2019multinews,\n title={Multi-News: a Large-Scale Multi-Document Summarization Dataset and Abstractive Hierarchical Model},\n author={Alexander R. Fabbri and Irene Li and Tianwei She and Suyi Li and Dragomir R. Radev},\n year={2019},\n eprint={1906.01749},\n archivePrefix={arXiv},\n primaryClass={cs.CL}\n}\n", "homepage": "https://github.com/Alex-Fabbri/Multi-News", "license": "", "features": {"document": {"dtype": "string", "id": null, "_type": "Value"}, "summary": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": {"input": "document", "output": "summary"}, "task_templates": null, "builder_name": "multi_news", "config_name": "default", "version": {"version_str": "1.0.0", "description": null, "major": 1, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 558392305, "num_examples": 44972, "dataset_name": "multi_news"}, "validation": {"name": "validation", "num_bytes": 68272440, "num_examples": 5622, "dataset_name": "multi_news"}, "test": {"name": "test", "num_bytes": 70032132, "num_examples": 5622, "dataset_name": "multi_news"}}, "download_checksums": {"https://drive.google.com/uc?export=download&id=1wHAWDOwOoQWSj7HYpyJ3Aeud8WhhaJ7P": {"num_bytes": 547512283, "checksum": "627781c8ce55d528fcdacd495db45583a915e2d24b7983b0a5a6693ede933bb1"}, "https://drive.google.com/uc?export=download&id=1QVgswwhVTkd3VLCzajK6eVkcrSWEK6kq": {"num_bytes": 58793912, "checksum": "e9e82b8f413b0f1ed4eb7c883f93bb744f829c218c1608b6ba7615d687d07121"}, 
"https://drive.google.com/uc?export=download&id=1p_u9_jpz3Zbj0EL05QFX6wvJAahmOn6h": {"num_bytes": 66875522, "checksum": "f0a43902da366eea2b882e39ddd4c0975ad44aba6b61095a2ea90362e9e2bb65"}, "https://drive.google.com/uc?export=download&id=1Y1lBbBU5Q0aJMqLhYEOdEtTqQ85XnRRM": {"num_bytes": 7295302, "checksum": "bb08a078e0cb2b8ca9cc0fe3bfbe9d4098dee706bd00eb97449155e41b880157"}, "https://drive.google.com/uc?export=download&id=1-n_6fj-1nM7sWtBSNkQCSfl5Rb3zPVfr": {"num_bytes": 68999509, "checksum": "138d3ac2dc899cbcd2e3745aaa94d1c1db55fb7058d9df4ba3ef2dac05a3a186"}, "https://drive.google.com/uc?export=download&id=1CX_YcgQ3WwNC1fXBpMfwMXFPCqsd9Lbp": {"num_bytes": 7309099, "checksum": "fa97cf91a62ae82a0af6da88f2ddf8e06eb4e3b90f7971d8e0c516436518fae3"}}, "download_size": 756785627, "post_processing_size": null, "dataset_size": 696696877, "size_in_bytes": 1453482504}}
121 Bytes
Binary file not shown.

datasets/multi_news/multi_news.py

Lines changed: 20 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -16,9 +16,6 @@
1616
# Lint as: python3
1717
"""Multi-News dataset."""
1818

19-
20-
import os
21-
2219
import datasets
2320

2421

@@ -44,7 +41,20 @@
4441
- summary: news summary.
4542
"""
4643

47-
_URL = "https://drive.google.com/uc?export=download&id=1vRY2wM6rlOZrf9exGTm5pXj5ExlVwJ0C"
44+
_URLs = {
45+
"train": [
46+
"https://drive.google.com/uc?export=download&id=1wHAWDOwOoQWSj7HYpyJ3Aeud8WhhaJ7P",
47+
"https://drive.google.com/uc?export=download&id=1QVgswwhVTkd3VLCzajK6eVkcrSWEK6kq",
48+
],
49+
"val": [
50+
"https://drive.google.com/uc?export=download&id=1p_u9_jpz3Zbj0EL05QFX6wvJAahmOn6h",
51+
"https://drive.google.com/uc?export=download&id=1Y1lBbBU5Q0aJMqLhYEOdEtTqQ85XnRRM",
52+
],
53+
"test": [
54+
"https://drive.google.com/uc?export=download&id=1-n_6fj-1nM7sWtBSNkQCSfl5Rb3zPVfr",
55+
"https://drive.google.com/uc?export=download&id=1CX_YcgQ3WwNC1fXBpMfwMXFPCqsd9Lbp",
56+
],
57+
}
4858

4959
_DOCUMENT = "document"
5060
_SUMMARY = "summary"
@@ -66,27 +76,25 @@ def _info(self):
6676

6777
def _split_generators(self, dl_manager):
6878
"""Returns SplitGenerators."""
69-
extract_path = os.path.join(dl_manager.download_and_extract(_URL), "multi-news-original")
79+
files = dl_manager.download(_URLs)
7080
return [
7181
datasets.SplitGenerator(
7282
name=datasets.Split.TRAIN,
73-
gen_kwargs={"path": os.path.join(extract_path, "train")},
83+
gen_kwargs={"src_file": files["train"][0], "tgt_file": files["train"][1]},
7484
),
7585
datasets.SplitGenerator(
7686
name=datasets.Split.VALIDATION,
77-
gen_kwargs={"path": os.path.join(extract_path, "val")},
87+
gen_kwargs={"src_file": files["val"][0], "tgt_file": files["val"][1]},
7888
),
7989
datasets.SplitGenerator(
8090
name=datasets.Split.TEST,
81-
gen_kwargs={"path": os.path.join(extract_path, "test")},
91+
gen_kwargs={"src_file": files["test"][0], "tgt_file": files["test"][1]},
8292
),
8393
]
8494

85-
def _generate_examples(self, path=None):
95+
def _generate_examples(self, src_file, tgt_file):
8696
"""Yields examples."""
87-
with open(os.path.join(path + ".src"), encoding="utf-8") as src_f, open(
88-
os.path.join(path + ".tgt"), encoding="utf-8"
89-
) as tgt_f:
97+
with open(src_file, encoding="utf-8") as src_f, open(tgt_file, encoding="utf-8") as tgt_f:
9098
for i, (src_line, tgt_line) in enumerate(zip(src_f, tgt_f)):
9199
yield i, {
92100
# In original file, each line has one example and natural newline

0 commit comments

Comments
 (0)