huggingface · albertvillanova · Aug 25, 2022 · Aug 25, 2022 · Aug 25, 2022 · Aug 25, 2022
diff --git a/datasets/makhzan/README.md b/datasets/makhzan/README.md
@@ -222,7 +222,12 @@ Zeerak Ahmed
 
 ### Citation Information
 
-No citation information.
+```
+@misc{makhzan,
+title={Maḵẖzan},
+howpublished = "\url{https://github.com/zeerakahmed/makhzan/}",
+}
+```
 
 ### Contributions
 

diff --git a/datasets/makhzan/dataset_infos.json b/datasets/makhzan/dataset_infos.json
@@ -1 +1 @@
-{"default": {"description": "An Urdu text corpus for machine learning, natural language processing and linguistic analysis.\n", "citation": "@InProceedings{huggingface:dataset,\ntitle = {A great new dataset},\nauthors={huggingface, Inc.\n},\nyear={2020}\n}\n", "homepage": "https://matnsaz.net/en/makhzan", "license": "All files in the /text directory are covered under standard copyright. Each piece of text has been included in this repository with explicity permission of respective copyright holders, who are identified in the <meta> tag for each file. You are free to use this text for analysis, research and development, but you are not allowed to redistribute or republish this text. Some cases where a less restrictive license could apply to files in the /text directory are presented below. In some cases copyright free text has been digitally reproduced through the hard work of our collaborators. In such cases we have credited the appropriate people where possible in a notes field in the file's metadata, and we strongly encourage you to contact them before redistributing this text in any form. 
Where a separate license is provided along with the text, we have provided corresponding data in the publication field in a file's metadata.", "features": {"file_id": {"dtype": "string", "id": null, "_type": "Value"}, "metadata": {"dtype": "string", "id": null, "_type": "Value"}, "title": {"dtype": "string", "id": null, "_type": "Value"}, "num-words": {"dtype": "int64", "id": null, "_type": "Value"}, "contains-non-urdu-languages": {"dtype": "string", "id": null, "_type": "Value"}, "document_body": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "builder_name": "makhzan", "config_name": "default", "version": {"version_str": "1.0.0", "description": null, "major": 1, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 35637310, "num_examples": 5522, "dataset_name": "makhzan"}}, "download_checksums": {"https://github.com/zeerakahmed/makhzan/archive/99db56552d6781dcd184bdd3466bce15fd0a1ec0.zip": {"num_bytes": 15187763, "checksum": "d50d5d168d20dae7a2b4da3e51c60be1fb0a15588dfbeb4b9cf31e87fe45d54e"}}, "download_size": 15187763, "post_processing_size": null, "dataset_size": 35637310, "size_in_bytes": 50825073}}
+{"default": {"description": "An Urdu text corpus for machine learning, natural language processing and linguistic analysis.\n", "citation": "@misc{makhzan,\ntitle={Ma\u1e35\u1e96zan},\nhowpublished = \"\\url{https://github.com/zeerakahmed/makhzan/}\",\n}\n", "homepage": "https://matnsaz.net/en/makhzan", "license": "All files in the /text directory are covered under standard copyright. Each piece of text has been included in this repository with explicity permission of respective copyright holders, who are identified in the <meta> tag for each file. You are free to use this text for analysis, research and development, but you are not allowed to redistribute or republish this text. Some cases where a less restrictive license could apply to files in the /text directory are presented below. In some cases copyright free text has been digitally reproduced through the hard work of our collaborators. In such cases we have credited the appropriate people where possible in a notes field in the file's metadata, and we strongly encourage you to contact them before redistributing this text in any form. 
Where a separate license is provided along with the text, we have provided corresponding data in the publication field in a file's metadata.", "features": {"file_id": {"dtype": "string", "id": null, "_type": "Value"}, "metadata": {"dtype": "string", "id": null, "_type": "Value"}, "title": {"dtype": "string", "id": null, "_type": "Value"}, "num-words": {"dtype": "int64", "id": null, "_type": "Value"}, "contains-non-urdu-languages": {"dtype": "string", "id": null, "_type": "Value"}, "document_body": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "makhzan", "config_name": "default", "version": {"version_str": "1.0.0", "description": null, "major": 1, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 35637310, "num_examples": 5522, "dataset_name": "makhzan"}}, "download_checksums": {"https://github.com/zeerakahmed/makhzan/archive/99db56552d6781dcd184bdd3466bce15fd0a1ec0.zip": {"num_bytes": 15187763, "checksum": "d50d5d168d20dae7a2b4da3e51c60be1fb0a15588dfbeb4b9cf31e87fe45d54e"}}, "download_size": 15187763, "post_processing_size": null, "dataset_size": 35637310, "size_in_bytes": 50825073}}
diff --git a/datasets/makhzan/makhzan.py b/datasets/makhzan/makhzan.py
@@ -22,12 +22,10 @@
 import datasets
 
 
-_CITATION = """\
-@InProceedings{huggingface:dataset,
-title = {A great new dataset},
-authors={huggingface, Inc.
-},
-year={2020}
+_CITATION = """\
+@misc{makhzan,
+title={Maḵẖzan},
+howpublished = "\\url{https://github.com/zeerakahmed/makhzan/}",
 }
 """
Original file line number	Diff line number	Diff line change
		@@ -1 +1 @@
		{"default": {"description": "An Urdu text corpus for machine learning, natural language processing and linguistic analysis.\n", "citation": "@InProceedings{huggingface:dataset,\ntitle = {A great new dataset},\nauthors={huggingface, Inc.\n},\nyear={2020}\n}\n", "homepage": "https://matnsaz.net/en/makhzan", "license": "All files in the /text directory are covered under standard copyright. Each piece of text has been included in this repository with explicity permission of respective copyright holders, who are identified in the <meta> tag for each file. You are free to use this text for analysis, research and development, but you are not allowed to redistribute or republish this text. Some cases where a less restrictive license could apply to files in the /text directory are presented below. In some cases copyright free text has been digitally reproduced through the hard work of our collaborators. In such cases we have credited the appropriate people where possible in a notes field in the file's metadata, and we strongly encourage you to contact them before redistributing this text in any form. 
Where a separate license is provided along with the text, we have provided corresponding data in the publication field in a file's metadata.", "features": {"file_id": {"dtype": "string", "id": null, "_type": "Value"}, "metadata": {"dtype": "string", "id": null, "_type": "Value"}, "title": {"dtype": "string", "id": null, "_type": "Value"}, "num-words": {"dtype": "int64", "id": null, "_type": "Value"}, "contains-non-urdu-languages": {"dtype": "string", "id": null, "_type": "Value"}, "document_body": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "builder_name": "makhzan", "config_name": "default", "version": {"version_str": "1.0.0", "description": null, "major": 1, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 35637310, "num_examples": 5522, "dataset_name": "makhzan"}}, "download_checksums": {"https://github.com/zeerakahmed/makhzan/archive/99db56552d6781dcd184bdd3466bce15fd0a1ec0.zip": {"num_bytes": 15187763, "checksum": "d50d5d168d20dae7a2b4da3e51c60be1fb0a15588dfbeb4b9cf31e87fe45d54e"}}, "download_size": 15187763, "post_processing_size": null, "dataset_size": 35637310, "size_in_bytes": 50825073}}
		{"default": {"description": "An Urdu text corpus for machine learning, natural language processing and linguistic analysis.\n", "citation": "@misc{makhzan,\ntitle={Ma\u1e35\u1e96zan},\nhowpublished = \"\\url{https://github.com/zeerakahmed/makhzan/}\",\n}\n", "homepage": "https://matnsaz.net/en/makhzan", "license": "All files in the /text directory are covered under standard copyright. Each piece of text has been included in this repository with explicity permission of respective copyright holders, who are identified in the <meta> tag for each file. You are free to use this text for analysis, research and development, but you are not allowed to redistribute or republish this text. Some cases where a less restrictive license could apply to files in the /text directory are presented below. In some cases copyright free text has been digitally reproduced through the hard work of our collaborators. In such cases we have credited the appropriate people where possible in a notes field in the file's metadata, and we strongly encourage you to contact them before redistributing this text in any form. 
Where a separate license is provided along with the text, we have provided corresponding data in the publication field in a file's metadata.", "features": {"file_id": {"dtype": "string", "id": null, "_type": "Value"}, "metadata": {"dtype": "string", "id": null, "_type": "Value"}, "title": {"dtype": "string", "id": null, "_type": "Value"}, "num-words": {"dtype": "int64", "id": null, "_type": "Value"}, "contains-non-urdu-languages": {"dtype": "string", "id": null, "_type": "Value"}, "document_body": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "makhzan", "config_name": "default", "version": {"version_str": "1.0.0", "description": null, "major": 1, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 35637310, "num_examples": 5522, "dataset_name": "makhzan"}}, "download_checksums": {"https://github.com/zeerakahmed/makhzan/archive/99db56552d6781dcd184bdd3466bce15fd0a1ec0.zip": {"num_bytes": 15187763, "checksum": "d50d5d168d20dae7a2b4da3e51c60be1fb0a15588dfbeb4b9cf31e87fe45d54e"}}, "download_size": 15187763, "post_processing_size": null, "dataset_size": 35637310, "size_in_bytes": 50825073}}