Host pn_summary data on the Hub instead of Google Drive (#4586)

albertvillanova · web-flow · commit 6d2a970f2544 · 2022-06-28T16:42:03.000+02:00
* Replace data URL with Hub one

* Update metadata JSON

* Update documentation card

* Update dummy data
diff --git a/datasets/pn_summary/README.md b/datasets/pn_summary/README.md
@@ -53,7 +53,6 @@ pretty_name: Persian News Summary (PnSummary)
 
 ## Dataset Description
 
-- **Homepage:** https://github.com/hooshvare/pn-summary/
 - **Repository:** https://github.com/hooshvare/pn-summary/
 - **Paper:** https://arxiv.org/abs/2012.11204
 - **Leaderboard:** [More Information Needed]
diff --git a/datasets/pn_summary/dataset_infos.json b/datasets/pn_summary/dataset_infos.json
@@ -1 +1 @@
-{"1.0.0": {"description": "A well-structured summarization dataset for the Persian language consists of 93,207 records. It is prepared for Abstractive/Extractive tasks (like cnn_dailymail for English). It can also be used in other scopes like Text Generation, Title Generation, and News Category Classification.\nIt is imperative to consider that the newlines were replaced with the `[n]` symbol. Please interpret them into normal newlines (for ex. `t.replace(\"[n]\", \"\n\")`) and then use them for your purposes.\n", "citation": "@article{pnSummary, title={Leveraging ParsBERT and Pretrained mT5 for Persian Abstractive Text Summarization},\nauthor={Mehrdad Farahani, Mohammad Gharachorloo, Mohammad Manthouri},\nyear={2020},\neprint={2012.11204},\narchivePrefix={arXiv},\nprimaryClass={cs.CL}\n}\n", "homepage": "https://github.com/hooshvare/pn-summary", "license": "", "features": {"id": {"dtype": "string", "id": null, "_type": "Value"}, "title": {"dtype": "string", "id": null, "_type": "Value"}, "article": {"dtype": "string", "id": null, "_type": "Value"}, "summary": {"dtype": "string", "id": null, "_type": "Value"}, "category": {"num_classes": 18, "names": ["Economy", "Roads-Urban", "Banking-Insurance", "Agriculture", "International", "Oil-Energy", "Industry", "Transportation", "Science-Technology", "Local", "Sports", "Politics", "Art-Culture", "Society", "Health", "Research", "Education-University", "Tourism"], "names_file": null, "id": null, "_type": "ClassLabel"}, "categories": {"dtype": "string", "id": null, "_type": "Value"}, "network": {"num_classes": 6, "names": ["Tahlilbazaar", "Imna", "Shana", "Mehr", "Irna", "Khabaronline"], "names_file": null, "id": null, "_type": "ClassLabel"}, "link": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "builder_name": "pn_summary", "config_name": "1.0.0", "version": {"version_str": "1.0.0", "description": null, "major": 1, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 309436709, "num_examples": 82022, "dataset_name": "pn_summary"}, "validation": {"name": "validation", "num_bytes": 21311841, "num_examples": 5592, "dataset_name": "pn_summary"}, "test": {"name": "test", "num_bytes": 20936844, "num_examples": 5593, "dataset_name": "pn_summary"}}, "download_checksums": {"https://drive.google.com/u/0/uc?id=16OgJ_OrfzUF_i3ftLjFn9kpcyoi7UJeO&export=download": {"num_bytes": 89591141, "checksum": "49aa6a5fdb11244714f9bbe69517f2079ab934c9c565e272a977fbd8d2d404f7"}}, "download_size": 89591141, "post_processing_size": null, "dataset_size": 351685394, "size_in_bytes": 441276535}}
+{"1.0.0": {"description": "A well-structured summarization dataset for the Persian language consists of 93,207 records. It is prepared for Abstractive/Extractive tasks (like cnn_dailymail for English). It can also be used in other scopes like Text Generation, Title Generation, and News Category Classification.\nIt is imperative to consider that the newlines were replaced with the `[n]` symbol. Please interpret them into normal newlines (for ex. `t.replace(\"[n]\", \"\n\")`) and then use them for your purposes.\n", "citation": "@article{pnSummary, title={Leveraging ParsBERT and Pretrained mT5 for Persian Abstractive Text Summarization},\nauthor={Mehrdad Farahani, Mohammad Gharachorloo, Mohammad Manthouri},\nyear={2020},\neprint={2012.11204},\narchivePrefix={arXiv},\nprimaryClass={cs.CL}\n}\n", "homepage": "https://github.com/hooshvare/pn-summary", "license": "", "features": {"id": {"dtype": "string", "id": null, "_type": "Value"}, "title": {"dtype": "string", "id": null, "_type": "Value"}, "article": {"dtype": "string", "id": null, "_type": "Value"}, "summary": {"dtype": "string", "id": null, "_type": "Value"}, "category": {"num_classes": 18, "names": ["Economy", "Roads-Urban", "Banking-Insurance", "Agriculture", "International", "Oil-Energy", "Industry", "Transportation", "Science-Technology", "Local", "Sports", "Politics", "Art-Culture", "Society", "Health", "Research", "Education-University", "Tourism"], "id": null, "_type": "ClassLabel"}, "categories": {"dtype": "string", "id": null, "_type": "Value"}, "network": {"num_classes": 6, "names": ["Tahlilbazaar", "Imna", "Shana", "Mehr", "Irna", "Khabaronline"], "id": null, "_type": "ClassLabel"}, "link": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "pn_summary", "config_name": "1.0.0", "version": {"version_str": "1.0.0", "description": null, "major": 1, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 309436493, "num_examples": 82022, "dataset_name": "pn_summary"}, "validation": {"name": "validation", "num_bytes": 21311817, "num_examples": 5592, "dataset_name": "pn_summary"}, "test": {"name": "test", "num_bytes": 20936820, "num_examples": 5593, "dataset_name": "pn_summary"}}, "download_checksums": {"https://huggingface.co/datasets/pn_summary/resolve/main/data/pn_summary.zip": {"num_bytes": 89591141, "checksum": "49aa6a5fdb11244714f9bbe69517f2079ab934c9c565e272a977fbd8d2d404f7"}}, "download_size": 89591141, "post_processing_size": null, "dataset_size": 351685130, "size_in_bytes": 441276271}}
diff --git a/datasets/pn_summary/dummy/1.0.0/1.0.0/dummy_data.zip b/datasets/pn_summary/dummy/1.0.0/1.0.0/dummy_data.zip
diff --git a/datasets/pn_summary/pn_summary.py b/datasets/pn_summary/pn_summary.py
@@ -39,9 +39,10 @@
 _HOMEPAGE = "https://github.com/hooshvare/pn-summary"
 _LICENSE = "MIT License"
 
+_REPO = "https://huggingface.co/datasets/pn_summary/resolve/main/data"
 _URLs = {
     "1.0.0": {
-        "data": "https://drive.google.com/u/0/uc?id=16OgJ_OrfzUF_i3ftLjFn9kpcyoi7UJeO&export=download",
+        "data": f"{_REPO}/pn_summary.zip",
         "features": [
             {"name": "id", "type": datasets.Value("string")},
             {"name": "title", "type": datasets.Value("string")},

Original file line number	Diff line number	Diff line change
`@@ -1 +1 @@`
`1`		-{"1.0.0": {"description": "A well-structured summarization dataset for the Persian language consists of 93,207 records. It is prepared for Abstractive/Extractive tasks (like cnn_dailymail for English). It can also be used in other scopes like Text Generation, Title Generation, and News Category Classification.\nIt is imperative to consider that the newlines were replaced with the `[n]` symbol. Please interpret them into normal newlines (for ex. `t.replace(\"[n]\", \"\n\")`) and then use them for your purposes.\n", "citation": "@article{pnSummary, title={Leveraging ParsBERT and Pretrained mT5 for Persian Abstractive Text Summarization},\nauthor={Mehrdad Farahani, Mohammad Gharachorloo, Mohammad Manthouri},\nyear={2020},\neprint={2012.11204},\narchivePrefix={arXiv},\nprimaryClass={cs.CL}\n}\n", "homepage": "https://github.com/hooshvare/pn-summary", "license": "", "features": {"id": {"dtype": "string", "id": null, "_type": "Value"}, "title": {"dtype": "string", "id": null, "_type": "Value"}, "article": {"dtype": "string", "id": null, "_type": "Value"}, "summary": {"dtype": "string", "id": null, "_type": "Value"}, "category": {"num_classes": 18, "names": ["Economy", "Roads-Urban", "Banking-Insurance", "Agriculture", "International", "Oil-Energy", "Industry", "Transportation", "Science-Technology", "Local", "Sports", "Politics", "Art-Culture", "Society", "Health", "Research", "Education-University", "Tourism"], "names_file": null, "id": null, "_type": "ClassLabel"}, "categories": {"dtype": "string", "id": null, "_type": "Value"}, "network": {"num_classes": 6, "names": ["Tahlilbazaar", "Imna", "Shana", "Mehr", "Irna", "Khabaronline"], "names_file": null, "id": null, "_type": "ClassLabel"}, "link": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "builder_name": "pn_summary", "config_name": "1.0.0", "version": {"version_str": "1.0.0", "description": null, "major": 1, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 309436709, "num_examples": 82022, "dataset_name": "pn_summary"}, "validation": {"name": "validation", "num_bytes": 21311841, "num_examples": 5592, "dataset_name": "pn_summary"}, "test": {"name": "test", "num_bytes": 20936844, "num_examples": 5593, "dataset_name": "pn_summary"}}, "download_checksums": {"https://drive.google.com/u/0/uc?id=16OgJ_OrfzUF_i3ftLjFn9kpcyoi7UJeO&export=download": {"num_bytes": 89591141, "checksum": "49aa6a5fdb11244714f9bbe69517f2079ab934c9c565e272a977fbd8d2d404f7"}}, "download_size": 89591141, "post_processing_size": null, "dataset_size": 351685394, "size_in_bytes": 441276535}}
	`1`	+{"1.0.0": {"description": "A well-structured summarization dataset for the Persian language consists of 93,207 records. It is prepared for Abstractive/Extractive tasks (like cnn_dailymail for English). It can also be used in other scopes like Text Generation, Title Generation, and News Category Classification.\nIt is imperative to consider that the newlines were replaced with the `[n]` symbol. Please interpret them into normal newlines (for ex. `t.replace(\"[n]\", \"\n\")`) and then use them for your purposes.\n", "citation": "@article{pnSummary, title={Leveraging ParsBERT and Pretrained mT5 for Persian Abstractive Text Summarization},\nauthor={Mehrdad Farahani, Mohammad Gharachorloo, Mohammad Manthouri},\nyear={2020},\neprint={2012.11204},\narchivePrefix={arXiv},\nprimaryClass={cs.CL}\n}\n", "homepage": "https://github.com/hooshvare/pn-summary", "license": "", "features": {"id": {"dtype": "string", "id": null, "_type": "Value"}, "title": {"dtype": "string", "id": null, "_type": "Value"}, "article": {"dtype": "string", "id": null, "_type": "Value"}, "summary": {"dtype": "string", "id": null, "_type": "Value"}, "category": {"num_classes": 18, "names": ["Economy", "Roads-Urban", "Banking-Insurance", "Agriculture", "International", "Oil-Energy", "Industry", "Transportation", "Science-Technology", "Local", "Sports", "Politics", "Art-Culture", "Society", "Health", "Research", "Education-University", "Tourism"], "id": null, "_type": "ClassLabel"}, "categories": {"dtype": "string", "id": null, "_type": "Value"}, "network": {"num_classes": 6, "names": ["Tahlilbazaar", "Imna", "Shana", "Mehr", "Irna", "Khabaronline"], "id": null, "_type": "ClassLabel"}, "link": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "pn_summary", "config_name": "1.0.0", "version": {"version_str": "1.0.0", "description": null, "major": 1, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 309436493, "num_examples": 82022, "dataset_name": "pn_summary"}, "validation": {"name": "validation", "num_bytes": 21311817, "num_examples": 5592, "dataset_name": "pn_summary"}, "test": {"name": "test", "num_bytes": 20936820, "num_examples": 5593, "dataset_name": "pn_summary"}}, "download_checksums": {"https://huggingface.co/datasets/pn_summary/resolve/main/data/pn_summary.zip": {"num_bytes": 89591141, "checksum": "49aa6a5fdb11244714f9bbe69517f2079ab934c9c565e272a977fbd8d2d404f7"}}, "download_size": 89591141, "post_processing_size": null, "dataset_size": 351685130, "size_in_bytes": 441276271}}