Use newer version of multi-news with fixes (#4451)

JohnGiorgi · mariosasko · web-flow · commit 650fb02ee3b9 · 2022-06-07T19:14:44.000+02:00
* Use newer version of multi-news with fixes

* Update dummy data and dataset_infos.json

Co-authored-by: mariosasko <mariosasko777@gmail.com>
diff --git a/datasets/multi_news/dataset_infos.json b/datasets/multi_news/dataset_infos.json
@@ -1 +1 @@
-{"default": {"description": "\nMulti-News, consists of news articles and human-written summaries\nof these articles from the site newser.com.\nEach summary is professionally written by editors and\nincludes links to the original articles cited.\n\nThere are two features:\n  - document: text of news articles seperated by special token \"|||||\".\n  - summary: news summary.\n", "citation": "\n@misc{alex2019multinews,\n    title={Multi-News: a Large-Scale Multi-Document Summarization Dataset and Abstractive Hierarchical Model},\n    author={Alexander R. Fabbri and Irene Li and Tianwei She and Suyi Li and Dragomir R. Radev},\n    year={2019},\n    eprint={1906.01749},\n    archivePrefix={arXiv},\n    primaryClass={cs.CL}\n}\n", "homepage": "https://github.com/Alex-Fabbri/Multi-News", "license": "", "features": {"document": {"dtype": "string", "id": null, "_type": "Value"}, "summary": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": {"input": "document", "output": "summary"}, "task_templates": null, "builder_name": "multi_news", "config_name": "default", "version": {"version_str": "1.0.0", "description": null, "major": 1, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 561149267, "num_examples": 44972, "dataset_name": "multi_news"}, "validation": {"name": "validation", "num_bytes": 68617339, "num_examples": 5622, "dataset_name": "multi_news"}, "test": {"name": "test", "num_bytes": 70389647, "num_examples": 5622, "dataset_name": "multi_news"}}, "download_checksums": {"https://drive.google.com/uc?export=download&id=1vRY2wM6rlOZrf9exGTm5pXj5ExlVwJ0C": {"num_bytes": 256966232, "checksum": "64ae4d2483b248c9664b50bacfab6821f8a3e93f382c7587686fa4a127f77626"}}, "download_size": 256966232, "post_processing_size": null, "dataset_size": 700156253, "size_in_bytes": 957122485}}
+{"default": {"description": "\nMulti-News, consists of news articles and human-written summaries\nof these articles from the site newser.com.\nEach summary is professionally written by editors and\nincludes links to the original articles cited.\n\nThere are two features:\n  - document: text of news articles seperated by special token \"|||||\".\n  - summary: news summary.\n", "citation": "\n@misc{alex2019multinews,\n    title={Multi-News: a Large-Scale Multi-Document Summarization Dataset and Abstractive Hierarchical Model},\n    author={Alexander R. Fabbri and Irene Li and Tianwei She and Suyi Li and Dragomir R. Radev},\n    year={2019},\n    eprint={1906.01749},\n    archivePrefix={arXiv},\n    primaryClass={cs.CL}\n}\n", "homepage": "https://github.com/Alex-Fabbri/Multi-News", "license": "", "features": {"document": {"dtype": "string", "id": null, "_type": "Value"}, "summary": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": {"input": "document", "output": "summary"}, "task_templates": null, "builder_name": "multi_news", "config_name": "default", "version": {"version_str": "1.0.0", "description": null, "major": 1, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 558392305, "num_examples": 44972, "dataset_name": "multi_news"}, "validation": {"name": "validation", "num_bytes": 68272440, "num_examples": 5622, "dataset_name": "multi_news"}, "test": {"name": "test", "num_bytes": 70032132, "num_examples": 5622, "dataset_name": "multi_news"}}, "download_checksums": {"https://drive.google.com/uc?export=download&id=1wHAWDOwOoQWSj7HYpyJ3Aeud8WhhaJ7P": {"num_bytes": 547512283, "checksum": "627781c8ce55d528fcdacd495db45583a915e2d24b7983b0a5a6693ede933bb1"}, "https://drive.google.com/uc?export=download&id=1QVgswwhVTkd3VLCzajK6eVkcrSWEK6kq": {"num_bytes": 58793912, "checksum": "e9e82b8f413b0f1ed4eb7c883f93bb744f829c218c1608b6ba7615d687d07121"}, 
"https://drive.google.com/uc?export=download&id=1p_u9_jpz3Zbj0EL05QFX6wvJAahmOn6h": {"num_bytes": 66875522, "checksum": "f0a43902da366eea2b882e39ddd4c0975ad44aba6b61095a2ea90362e9e2bb65"}, "https://drive.google.com/uc?export=download&id=1Y1lBbBU5Q0aJMqLhYEOdEtTqQ85XnRRM": {"num_bytes": 7295302, "checksum": "bb08a078e0cb2b8ca9cc0fe3bfbe9d4098dee706bd00eb97449155e41b880157"}, "https://drive.google.com/uc?export=download&id=1-n_6fj-1nM7sWtBSNkQCSfl5Rb3zPVfr": {"num_bytes": 68999509, "checksum": "138d3ac2dc899cbcd2e3745aaa94d1c1db55fb7058d9df4ba3ef2dac05a3a186"}, "https://drive.google.com/uc?export=download&id=1CX_YcgQ3WwNC1fXBpMfwMXFPCqsd9Lbp": {"num_bytes": 7309099, "checksum": "fa97cf91a62ae82a0af6da88f2ddf8e06eb4e3b90f7971d8e0c516436518fae3"}}, "download_size": 756785627, "post_processing_size": null, "dataset_size": 696696877, "size_in_bytes": 1453482504}}
diff --git a/datasets/multi_news/dummy/1.0.0/dummy_data.zip b/datasets/multi_news/dummy/1.0.0/dummy_data.zip
diff --git a/datasets/multi_news/multi_news.py b/datasets/multi_news/multi_news.py
@@ -16,9 +16,6 @@
 # Lint as: python3
 """Multi-News dataset."""
 
-
-import os
-
 import datasets
 
 
@@ -44,7 +41,20 @@
   - summary: news summary.
 """
 
-_URL = "https://drive.google.com/uc?export=download&id=1vRY2wM6rlOZrf9exGTm5pXj5ExlVwJ0C"
+_URLs = {
+    "train": [
+        "https://drive.google.com/uc?export=download&id=1wHAWDOwOoQWSj7HYpyJ3Aeud8WhhaJ7P",
+        "https://drive.google.com/uc?export=download&id=1QVgswwhVTkd3VLCzajK6eVkcrSWEK6kq",
+    ],
+    "val": [
+        "https://drive.google.com/uc?export=download&id=1p_u9_jpz3Zbj0EL05QFX6wvJAahmOn6h",
+        "https://drive.google.com/uc?export=download&id=1Y1lBbBU5Q0aJMqLhYEOdEtTqQ85XnRRM",
+    ],
+    "test": [
+        "https://drive.google.com/uc?export=download&id=1-n_6fj-1nM7sWtBSNkQCSfl5Rb3zPVfr",
+        "https://drive.google.com/uc?export=download&id=1CX_YcgQ3WwNC1fXBpMfwMXFPCqsd9Lbp",
+    ],
+}
 
 _DOCUMENT = "document"
 _SUMMARY = "summary"
@@ -66,27 +76,25 @@ def _info(self):
 
     def _split_generators(self, dl_manager):
         """Returns SplitGenerators."""
-        extract_path = os.path.join(dl_manager.download_and_extract(_URL), "multi-news-original")
+        files = dl_manager.download(_URLs)
         return [
             datasets.SplitGenerator(
                 name=datasets.Split.TRAIN,
-                gen_kwargs={"path": os.path.join(extract_path, "train")},
+                gen_kwargs={"src_file": files["train"][0], "tgt_file": files["train"][1]},
             ),
             datasets.SplitGenerator(
                 name=datasets.Split.VALIDATION,
-                gen_kwargs={"path": os.path.join(extract_path, "val")},
+                gen_kwargs={"src_file": files["val"][0], "tgt_file": files["val"][1]},
             ),
             datasets.SplitGenerator(
                 name=datasets.Split.TEST,
-                gen_kwargs={"path": os.path.join(extract_path, "test")},
+                gen_kwargs={"src_file": files["test"][0], "tgt_file": files["test"][1]},
             ),
         ]
 
-    def _generate_examples(self, path=None):
+    def _generate_examples(self, src_file, tgt_file):
         """Yields examples."""
-        with open(os.path.join(path + ".src"), encoding="utf-8") as src_f, open(
-            os.path.join(path + ".tgt"), encoding="utf-8"
-        ) as tgt_f:
+        with open(src_file, encoding="utf-8") as src_f, open(tgt_file, encoding="utf-8") as tgt_f:
             for i, (src_line, tgt_line) in enumerate(zip(src_f, tgt_f)):
                 yield i, {
                     # In original file, each line has one example and natural newline

Original file line number	Diff line number	Diff line change
`@@ -1 +1 @@`
`1`		-{"default": {"description": "\nMulti-News, consists of news articles and human-written summaries\nof these articles from the site newser.com.\nEach summary is professionally written by editors and\nincludes links to the original articles cited.\n\nThere are two features:\n - document: text of news articles seperated by special token \"\|\|\|\|\|\".\n - summary: news summary.\n", "citation": "\n@misc{alex2019multinews,\n title={Multi-News: a Large-Scale Multi-Document Summarization Dataset and Abstractive Hierarchical Model},\n author={Alexander R. Fabbri and Irene Li and Tianwei She and Suyi Li and Dragomir R. Radev},\n year={2019},\n eprint={1906.01749},\n archivePrefix={arXiv},\n primaryClass={cs.CL}\n}\n", "homepage": "https://github.com/Alex-Fabbri/Multi-News", "license": "", "features": {"document": {"dtype": "string", "id": null, "_type": "Value"}, "summary": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": {"input": "document", "output": "summary"}, "task_templates": null, "builder_name": "multi_news", "config_name": "default", "version": {"version_str": "1.0.0", "description": null, "major": 1, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 561149267, "num_examples": 44972, "dataset_name": "multi_news"}, "validation": {"name": "validation", "num_bytes": 68617339, "num_examples": 5622, "dataset_name": "multi_news"}, "test": {"name": "test", "num_bytes": 70389647, "num_examples": 5622, "dataset_name": "multi_news"}}, "download_checksums": {"https://drive.google.com/uc?export=download&id=1vRY2wM6rlOZrf9exGTm5pXj5ExlVwJ0C": {"num_bytes": 256966232, "checksum": "64ae4d2483b248c9664b50bacfab6821f8a3e93f382c7587686fa4a127f77626"}}, "download_size": 256966232, "post_processing_size": null, "dataset_size": 700156253, "size_in_bytes": 957122485}}
	`1`	+{"default": {"description": "\nMulti-News, consists of news articles and human-written summaries\nof these articles from the site newser.com.\nEach summary is professionally written by editors and\nincludes links to the original articles cited.\n\nThere are two features:\n - document: text of news articles seperated by special token \"\|\|\|\|\|\".\n - summary: news summary.\n", "citation": "\n@misc{alex2019multinews,\n title={Multi-News: a Large-Scale Multi-Document Summarization Dataset and Abstractive Hierarchical Model},\n author={Alexander R. Fabbri and Irene Li and Tianwei She and Suyi Li and Dragomir R. Radev},\n year={2019},\n eprint={1906.01749},\n archivePrefix={arXiv},\n primaryClass={cs.CL}\n}\n", "homepage": "https://github.com/Alex-Fabbri/Multi-News", "license": "", "features": {"document": {"dtype": "string", "id": null, "_type": "Value"}, "summary": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": {"input": "document", "output": "summary"}, "task_templates": null, "builder_name": "multi_news", "config_name": "default", "version": {"version_str": "1.0.0", "description": null, "major": 1, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 558392305, "num_examples": 44972, "dataset_name": "multi_news"}, "validation": {"name": "validation", "num_bytes": 68272440, "num_examples": 5622, "dataset_name": "multi_news"}, "test": {"name": "test", "num_bytes": 70032132, "num_examples": 5622, "dataset_name": "multi_news"}}, "download_checksums": {"https://drive.google.com/uc?export=download&id=1wHAWDOwOoQWSj7HYpyJ3Aeud8WhhaJ7P": {"num_bytes": 547512283, "checksum": "627781c8ce55d528fcdacd495db45583a915e2d24b7983b0a5a6693ede933bb1"}, "https://drive.google.com/uc?export=download&id=1QVgswwhVTkd3VLCzajK6eVkcrSWEK6kq": {"num_bytes": 58793912, "checksum": "e9e82b8f413b0f1ed4eb7c883f93bb744f829c218c1608b6ba7615d687d07121"}, 
"https://drive.google.com/uc?export=download&id=1p_u9_jpz3Zbj0EL05QFX6wvJAahmOn6h": {"num_bytes": 66875522, "checksum": "f0a43902da366eea2b882e39ddd4c0975ad44aba6b61095a2ea90362e9e2bb65"}, "https://drive.google.com/uc?export=download&id=1Y1lBbBU5Q0aJMqLhYEOdEtTqQ85XnRRM": {"num_bytes": 7295302, "checksum": "bb08a078e0cb2b8ca9cc0fe3bfbe9d4098dee706bd00eb97449155e41b880157"}, "https://drive.google.com/uc?export=download&id=1-n_6fj-1nM7sWtBSNkQCSfl5Rb3zPVfr": {"num_bytes": 68999509, "checksum": "138d3ac2dc899cbcd2e3745aaa94d1c1db55fb7058d9df4ba3ef2dac05a3a186"}, "https://drive.google.com/uc?export=download&id=1CX_YcgQ3WwNC1fXBpMfwMXFPCqsd9Lbp": {"num_bytes": 7309099, "checksum": "fa97cf91a62ae82a0af6da88f2ddf8e06eb4e3b90f7971d8e0c516436518fae3"}}, "download_size": 756785627, "post_processing_size": null, "dataset_size": 696696877, "size_in_bytes": 1453482504}}