Skip to content

Commit 6d2a970

Browse files
Host pn_summary data on the Hub instead of Google Drive (#4586)
* Replace data URL with Hub one
* Update metadata JSON
* Update documentation card
* Update dummy data
1 parent dad0ef8 commit 6d2a970

File tree

4 files changed

+3
-3
lines changed

4 files changed

+3
-3
lines changed

datasets/pn_summary/README.md

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -53,7 +53,6 @@ pretty_name: Persian News Summary (PnSummary)
53 53

54 54
## Dataset Description
55 55

56-
- **Homepage:** https://github.com/hooshvare/pn-summary/
57 56
- **Repository:** https://github.com/hooshvare/pn-summary/
58 57
- **Paper:** https://arxiv.org/abs/2012.11204
59 58
- **Leaderboard:** [More Information Needed]
Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
{"1.0.0": {"description": "A well-structured summarization dataset for the Persian language consists of 93,207 records. It is prepared for Abstractive/Extractive tasks (like cnn_dailymail for English). It can also be used in other scopes like Text Generation, Title Generation, and News Category Classification.\nIt is imperative to consider that the newlines were replaced with the `[n]` symbol. Please interpret them into normal newlines (for ex. `t.replace(\"[n]\", \"\n\")`) and then use them for your purposes.\n", "citation": "@article{pnSummary, title={Leveraging ParsBERT and Pretrained mT5 for Persian Abstractive Text Summarization},\nauthor={Mehrdad Farahani, Mohammad Gharachorloo, Mohammad Manthouri},\nyear={2020},\neprint={2012.11204},\narchivePrefix={arXiv},\nprimaryClass={cs.CL}\n}\n", "homepage": "https://github.com/hooshvare/pn-summary", "license": "", "features": {"id": {"dtype": "string", "id": null, "_type": "Value"}, "title": {"dtype": "string", "id": null, "_type": "Value"}, "article": {"dtype": "string", "id": null, "_type": "Value"}, "summary": {"dtype": "string", "id": null, "_type": "Value"}, "category": {"num_classes": 18, "names": ["Economy", "Roads-Urban", "Banking-Insurance", "Agriculture", "International", "Oil-Energy", "Industry", "Transportation", "Science-Technology", "Local", "Sports", "Politics", "Art-Culture", "Society", "Health", "Research", "Education-University", "Tourism"], "names_file": null, "id": null, "_type": "ClassLabel"}, "categories": {"dtype": "string", "id": null, "_type": "Value"}, "network": {"num_classes": 6, "names": ["Tahlilbazaar", "Imna", "Shana", "Mehr", "Irna", "Khabaronline"], "names_file": null, "id": null, "_type": "ClassLabel"}, "link": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "builder_name": "pn_summary", "config_name": "1.0.0", "version": {"version_str": "1.0.0", "description": null, "major": 1, "minor": 0, "patch": 0}, "splits": {"train": {"name": 
"train", "num_bytes": 309436709, "num_examples": 82022, "dataset_name": "pn_summary"}, "validation": {"name": "validation", "num_bytes": 21311841, "num_examples": 5592, "dataset_name": "pn_summary"}, "test": {"name": "test", "num_bytes": 20936844, "num_examples": 5593, "dataset_name": "pn_summary"}}, "download_checksums": {"https://drive.google.com/u/0/uc?id=16OgJ_OrfzUF_i3ftLjFn9kpcyoi7UJeO&export=download": {"num_bytes": 89591141, "checksum": "49aa6a5fdb11244714f9bbe69517f2079ab934c9c565e272a977fbd8d2d404f7"}}, "download_size": 89591141, "post_processing_size": null, "dataset_size": 351685394, "size_in_bytes": 441276535}}
1+
{"1.0.0": {"description": "A well-structured summarization dataset for the Persian language consists of 93,207 records. It is prepared for Abstractive/Extractive tasks (like cnn_dailymail for English). It can also be used in other scopes like Text Generation, Title Generation, and News Category Classification.\nIt is imperative to consider that the newlines were replaced with the `[n]` symbol. Please interpret them into normal newlines (for ex. `t.replace(\"[n]\", \"\n\")`) and then use them for your purposes.\n", "citation": "@article{pnSummary, title={Leveraging ParsBERT and Pretrained mT5 for Persian Abstractive Text Summarization},\nauthor={Mehrdad Farahani, Mohammad Gharachorloo, Mohammad Manthouri},\nyear={2020},\neprint={2012.11204},\narchivePrefix={arXiv},\nprimaryClass={cs.CL}\n}\n", "homepage": "https://github.com/hooshvare/pn-summary", "license": "", "features": {"id": {"dtype": "string", "id": null, "_type": "Value"}, "title": {"dtype": "string", "id": null, "_type": "Value"}, "article": {"dtype": "string", "id": null, "_type": "Value"}, "summary": {"dtype": "string", "id": null, "_type": "Value"}, "category": {"num_classes": 18, "names": ["Economy", "Roads-Urban", "Banking-Insurance", "Agriculture", "International", "Oil-Energy", "Industry", "Transportation", "Science-Technology", "Local", "Sports", "Politics", "Art-Culture", "Society", "Health", "Research", "Education-University", "Tourism"], "id": null, "_type": "ClassLabel"}, "categories": {"dtype": "string", "id": null, "_type": "Value"}, "network": {"num_classes": 6, "names": ["Tahlilbazaar", "Imna", "Shana", "Mehr", "Irna", "Khabaronline"], "id": null, "_type": "ClassLabel"}, "link": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "pn_summary", "config_name": "1.0.0", "version": {"version_str": "1.0.0", "description": null, "major": 1, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", 
"num_bytes": 309436493, "num_examples": 82022, "dataset_name": "pn_summary"}, "validation": {"name": "validation", "num_bytes": 21311817, "num_examples": 5592, "dataset_name": "pn_summary"}, "test": {"name": "test", "num_bytes": 20936820, "num_examples": 5593, "dataset_name": "pn_summary"}}, "download_checksums": {"https://huggingface.co/datasets/pn_summary/resolve/main/data/pn_summary.zip": {"num_bytes": 89591141, "checksum": "49aa6a5fdb11244714f9bbe69517f2079ab934c9c565e272a977fbd8d2d404f7"}}, "download_size": 89591141, "post_processing_size": null, "dataset_size": 351685130, "size_in_bytes": 441276271}}
-478 Bytes
Binary file not shown.

datasets/pn_summary/pn_summary.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -39,9 +39,10 @@
39 39
_HOMEPAGE = "https://github.com/hooshvare/pn-summary"
40 40
_LICENSE = "MIT License"
41 41

42+
_REPO = "https://huggingface.co/datasets/pn_summary/resolve/main/data"
42 43
_URLs = {
43 44
"1.0.0": {
44-
"data": "https://drive.google.com/u/0/uc?id=16OgJ_OrfzUF_i3ftLjFn9kpcyoi7UJeO&export=download",
45+
"data": f"{_REPO}/pn_summary.zip",
45 46
"features": [
46 47
{"name": "id", "type": datasets.Value("string")},
47 48
{"name": "title", "type": datasets.Value("string")},

0 commit comments

Comments
 (0)