diff --git a/datasets/makhzan/README.md b/datasets/makhzan/README.md index ab84102a366..e5490ab3bcc 100644 --- a/datasets/makhzan/README.md +++ b/datasets/makhzan/README.md @@ -222,7 +222,12 @@ Zeerak Ahmed ### Citation Information -No citation information. +``` +@misc{makhzan, +title={Maḵẖzan}, +howpublished = "\url{https://github.com/zeerakahmed/makhzan/}", +} +``` ### Contributions diff --git a/datasets/makhzan/dataset_infos.json b/datasets/makhzan/dataset_infos.json index 88aba9e3651..1583d578c98 100644 --- a/datasets/makhzan/dataset_infos.json +++ b/datasets/makhzan/dataset_infos.json @@ -1 +1 @@ -{"default": {"description": "An Urdu text corpus for machine learning, natural language processing and linguistic analysis.\n", "citation": "@InProceedings{huggingface:dataset,\ntitle = {A great new dataset},\nauthors={huggingface, Inc.\n},\nyear={2020}\n}\n", "homepage": "https://matnsaz.net/en/makhzan", "license": "All files in the /text directory are covered under standard copyright. Each piece of text has been included in this repository with explicity permission of respective copyright holders, who are identified in the tag for each file. You are free to use this text for analysis, research and development, but you are not allowed to redistribute or republish this text. Some cases where a less restrictive license could apply to files in the /text directory are presented below. In some cases copyright free text has been digitally reproduced through the hard work of our collaborators. In such cases we have credited the appropriate people where possible in a notes field in the file's metadata, and we strongly encourage you to contact them before redistributing this text in any form. 
Where a separate license is provided along with the text, we have provided corresponding data in the publication field in a file's metadata.", "features": {"file_id": {"dtype": "string", "id": null, "_type": "Value"}, "metadata": {"dtype": "string", "id": null, "_type": "Value"}, "title": {"dtype": "string", "id": null, "_type": "Value"}, "num-words": {"dtype": "int64", "id": null, "_type": "Value"}, "contains-non-urdu-languages": {"dtype": "string", "id": null, "_type": "Value"}, "document_body": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "builder_name": "makhzan", "config_name": "default", "version": {"version_str": "1.0.0", "description": null, "major": 1, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 35637310, "num_examples": 5522, "dataset_name": "makhzan"}}, "download_checksums": {"https://github.com/zeerakahmed/makhzan/archive/99db56552d6781dcd184bdd3466bce15fd0a1ec0.zip": {"num_bytes": 15187763, "checksum": "d50d5d168d20dae7a2b4da3e51c60be1fb0a15588dfbeb4b9cf31e87fe45d54e"}}, "download_size": 15187763, "post_processing_size": null, "dataset_size": 35637310, "size_in_bytes": 50825073}} +{"default": {"description": "An Urdu text corpus for machine learning, natural language processing and linguistic analysis.\n", "citation": "\\\n@misc{makhzan,\ntitle={Ma\u1e35\u1e96zan},\nhowpublished = \"\\url{https://github.com/zeerakahmed/makhzan/}\",\n}\n", "homepage": "https://matnsaz.net/en/makhzan", "license": "All files in the /text directory are covered under standard copyright. Each piece of text has been included in this repository with explicity permission of respective copyright holders, who are identified in the tag for each file. You are free to use this text for analysis, research and development, but you are not allowed to redistribute or republish this text. Some cases where a less restrictive license could apply to files in the /text directory are presented below. 
In some cases copyright free text has been digitally reproduced through the hard work of our collaborators. In such cases we have credited the appropriate people where possible in a notes field in the file's metadata, and we strongly encourage you to contact them before redistributing this text in any form. Where a separate license is provided along with the text, we have provided corresponding data in the publication field in a file's metadata.", "features": {"file_id": {"dtype": "string", "id": null, "_type": "Value"}, "metadata": {"dtype": "string", "id": null, "_type": "Value"}, "title": {"dtype": "string", "id": null, "_type": "Value"}, "num-words": {"dtype": "int64", "id": null, "_type": "Value"}, "contains-non-urdu-languages": {"dtype": "string", "id": null, "_type": "Value"}, "document_body": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "makhzan", "config_name": "default", "version": {"version_str": "1.0.0", "description": null, "major": 1, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 35637310, "num_examples": 5522, "dataset_name": "makhzan"}}, "download_checksums": {"https://github.com/zeerakahmed/makhzan/archive/99db56552d6781dcd184bdd3466bce15fd0a1ec0.zip": {"num_bytes": 15187763, "checksum": "d50d5d168d20dae7a2b4da3e51c60be1fb0a15588dfbeb4b9cf31e87fe45d54e"}}, "download_size": 15187763, "post_processing_size": null, "dataset_size": 35637310, "size_in_bytes": 50825073}} \ No newline at end of file diff --git a/datasets/makhzan/makhzan.py b/datasets/makhzan/makhzan.py index 43a2e8f6d4d..37cf7bdcc0d 100644 --- a/datasets/makhzan/makhzan.py +++ b/datasets/makhzan/makhzan.py @@ -22,12 +22,10 @@ import datasets -_CITATION = """\ -@InProceedings{huggingface:dataset, -title = {A great new dataset}, -authors={huggingface, Inc. 
-}, -year={2020} +_CITATION = """\ +@misc{makhzan, +title={Maḵẖzan}, +howpublished = "\\url{https://github.com/zeerakahmed/makhzan/}", } """