diff --git a/datasets/multi_news/dataset_infos.json b/datasets/multi_news/dataset_infos.json index 01f45dd2e28..353d790aebb 100644 --- a/datasets/multi_news/dataset_infos.json +++ b/datasets/multi_news/dataset_infos.json @@ -1 +1 @@ -{"default": {"description": "\nMulti-News, consists of news articles and human-written summaries\nof these articles from the site newser.com.\nEach summary is professionally written by editors and\nincludes links to the original articles cited.\n\nThere are two features:\n - document: text of news articles seperated by special token \"|||||\".\n - summary: news summary.\n", "citation": "\n@misc{alex2019multinews,\n title={Multi-News: a Large-Scale Multi-Document Summarization Dataset and Abstractive Hierarchical Model},\n author={Alexander R. Fabbri and Irene Li and Tianwei She and Suyi Li and Dragomir R. Radev},\n year={2019},\n eprint={1906.01749},\n archivePrefix={arXiv},\n primaryClass={cs.CL}\n}\n", "homepage": "https://github.com/Alex-Fabbri/Multi-News", "license": "", "features": {"document": {"dtype": "string", "id": null, "_type": "Value"}, "summary": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": {"input": "document", "output": "summary"}, "task_templates": null, "builder_name": "multi_news", "config_name": "default", "version": {"version_str": "1.0.0", "description": null, "major": 1, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 558392305, "num_examples": 44972, "dataset_name": "multi_news"}, "validation": {"name": "validation", "num_bytes": 68272440, "num_examples": 5622, "dataset_name": "multi_news"}, "test": {"name": "test", "num_bytes": 70032132, "num_examples": 5622, "dataset_name": "multi_news"}}, "download_checksums": {"https://drive.google.com/uc?export=download&id=1wHAWDOwOoQWSj7HYpyJ3Aeud8WhhaJ7P": {"num_bytes": 547512283, "checksum": "627781c8ce55d528fcdacd495db45583a915e2d24b7983b0a5a6693ede933bb1"}, "https://drive.google.com/uc?export=download&id=1QVgswwhVTkd3VLCzajK6eVkcrSWEK6kq": {"num_bytes": 58793912, "checksum": "e9e82b8f413b0f1ed4eb7c883f93bb744f829c218c1608b6ba7615d687d07121"}, "https://drive.google.com/uc?export=download&id=1p_u9_jpz3Zbj0EL05QFX6wvJAahmOn6h": {"num_bytes": 66875522, "checksum": "f0a43902da366eea2b882e39ddd4c0975ad44aba6b61095a2ea90362e9e2bb65"}, "https://drive.google.com/uc?export=download&id=1Y1lBbBU5Q0aJMqLhYEOdEtTqQ85XnRRM": {"num_bytes": 7295302, "checksum": "bb08a078e0cb2b8ca9cc0fe3bfbe9d4098dee706bd00eb97449155e41b880157"}, "https://drive.google.com/uc?export=download&id=1-n_6fj-1nM7sWtBSNkQCSfl5Rb3zPVfr": {"num_bytes": 68999509, "checksum": "138d3ac2dc899cbcd2e3745aaa94d1c1db55fb7058d9df4ba3ef2dac05a3a186"}, "https://drive.google.com/uc?export=download&id=1CX_YcgQ3WwNC1fXBpMfwMXFPCqsd9Lbp": {"num_bytes": 7309099, "checksum": "fa97cf91a62ae82a0af6da88f2ddf8e06eb4e3b90f7971d8e0c516436518fae3"}}, "download_size": 756785627, "post_processing_size": null, "dataset_size": 696696877, "size_in_bytes": 1453482504}} \ No newline at end of file +{"default": {"description": "\nMulti-News, consists of news articles and human-written summaries\nof these articles from the site newser.com.\nEach summary is professionally written by editors and\nincludes links to the original articles cited.\n\nThere are two features:\n - document: text of news articles seperated by special token \"|||||\".\n - summary: news summary.\n", "citation": "\n@misc{alex2019multinews,\n title={Multi-News: a Large-Scale Multi-Document Summarization Dataset and Abstractive Hierarchical Model},\n author={Alexander R. Fabbri and Irene Li and Tianwei She and Suyi Li and Dragomir R. Radev},\n year={2019},\n eprint={1906.01749},\n archivePrefix={arXiv},\n primaryClass={cs.CL}\n}\n", "homepage": "https://github.com/Alex-Fabbri/Multi-News", "license": "For non-commercial research and educational purposes only", "features": {"document": {"dtype": "string", "id": null, "_type": "Value"}, "summary": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": {"input": "document", "output": "summary"}, "task_templates": null, "builder_name": "multi_news", "config_name": "default", "version": {"version_str": "1.0.0", "description": null, "major": 1, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 558392265, "num_examples": 44972, "dataset_name": "multi_news"}, "validation": {"name": "validation", "num_bytes": 68272432, "num_examples": 5622, "dataset_name": "multi_news"}, "test": {"name": "test", "num_bytes": 70032124, "num_examples": 5622, "dataset_name": "multi_news"}}, "download_checksums": {"https://huggingface.co/datasets/multi_news/resolve/main/data/train.src.cleaned": {"num_bytes": 547512283, "checksum": "627781c8ce55d528fcdacd495db45583a915e2d24b7983b0a5a6693ede933bb1"}, "https://huggingface.co/datasets/multi_news/resolve/main/data/train.tgt": {"num_bytes": 58793912, "checksum": "e9e82b8f413b0f1ed4eb7c883f93bb744f829c218c1608b6ba7615d687d07121"}, "https://huggingface.co/datasets/multi_news/resolve/main/data/val.src.cleaned": {"num_bytes": 66875522, "checksum": "f0a43902da366eea2b882e39ddd4c0975ad44aba6b61095a2ea90362e9e2bb65"}, "https://huggingface.co/datasets/multi_news/resolve/main/data/val.tgt": {"num_bytes": 7295302, "checksum": "bb08a078e0cb2b8ca9cc0fe3bfbe9d4098dee706bd00eb97449155e41b880157"}, "https://huggingface.co/datasets/multi_news/resolve/main/data/test.src.cleaned": {"num_bytes": 68999509, "checksum": "138d3ac2dc899cbcd2e3745aaa94d1c1db55fb7058d9df4ba3ef2dac05a3a186"}, "https://huggingface.co/datasets/multi_news/resolve/main/data/test.tgt": {"num_bytes": 7309099, "checksum": "fa97cf91a62ae82a0af6da88f2ddf8e06eb4e3b90f7971d8e0c516436518fae3"}}, "download_size": 756785627, "post_processing_size": null, "dataset_size": 696696821, "size_in_bytes": 1453482448}} \ No newline at end of file diff --git a/datasets/multi_news/dummy/1.0.0/dummy_data.zip b/datasets/multi_news/dummy/1.0.0/dummy_data.zip index 8ab08e3755b..1c4cfcce265 100644 Binary files a/datasets/multi_news/dummy/1.0.0/dummy_data.zip and b/datasets/multi_news/dummy/1.0.0/dummy_data.zip differ diff --git a/datasets/multi_news/multi_news.py b/datasets/multi_news/multi_news.py index d5646ac424e..b55a079ef1d 100644 --- a/datasets/multi_news/multi_news.py +++ b/datasets/multi_news/multi_news.py @@ -19,6 +19,10 @@ import datasets +_HOMEPAGE = "https://github.com/Alex-Fabbri/Multi-News" + +_LICENSE = "For non-commercial research and educational purposes only" + _CITATION = """ @misc{alex2019multinews, title={Multi-News: a Large-Scale Multi-Document Summarization Dataset and Abstractive Hierarchical Model}, @@ -41,18 +45,19 @@ - summary: news summary. """ +_REPO = "https://huggingface.co/datasets/multi_news/resolve/main/data" _URLs = { "train": [ - "https://drive.google.com/uc?export=download&id=1wHAWDOwOoQWSj7HYpyJ3Aeud8WhhaJ7P", - "https://drive.google.com/uc?export=download&id=1QVgswwhVTkd3VLCzajK6eVkcrSWEK6kq", + f"{_REPO}/train.src.cleaned", + f"{_REPO}/train.tgt", ], "val": [ - "https://drive.google.com/uc?export=download&id=1p_u9_jpz3Zbj0EL05QFX6wvJAahmOn6h", - "https://drive.google.com/uc?export=download&id=1Y1lBbBU5Q0aJMqLhYEOdEtTqQ85XnRRM", + f"{_REPO}/val.src.cleaned", + f"{_REPO}/val.tgt", ], "test": [ - "https://drive.google.com/uc?export=download&id=1-n_6fj-1nM7sWtBSNkQCSfl5Rb3zPVfr", - "https://drive.google.com/uc?export=download&id=1CX_YcgQ3WwNC1fXBpMfwMXFPCqsd9Lbp", + f"{_REPO}/test.src.cleaned", + f"{_REPO}/test.tgt", ], } @@ -70,7 +75,8 @@ def _info(self): description=_DESCRIPTION, features=datasets.Features({_DOCUMENT: datasets.Value("string"), _SUMMARY: datasets.Value("string")}), supervised_keys=(_DOCUMENT, _SUMMARY), - homepage="https://github.com/Alex-Fabbri/Multi-News", + homepage=_HOMEPAGE, + license=_LICENSE, citation=_CITATION, )