diff --git a/datasets/arabic_billion_words/README.md b/datasets/arabic_billion_words/README.md index e78f77b9a14..ec4fe6ab3af 100644 --- a/datasets/arabic_billion_words/README.md +++ b/datasets/arabic_billion_words/README.md @@ -6,7 +6,7 @@ language_creators: languages: - ar licenses: -- unkown +- unknown multilinguality: - monolingual size_categories: @@ -37,6 +37,7 @@ task_categories: task_ids: - language-modeling paperswithcode_id: null +pretty_name: Arabic Billion Words --- # Dataset Card for Arabic Billion Words Corpus @@ -92,7 +93,18 @@ Arabic ### Data Instances -[More Information Needed] +This is an example of the "Almasryalyoum" configuration subset: +```python +{ + "url": "http://today.almasryalyoum.com/printerfriendly.aspx?ArticleID=61300", + "head_line": "رئيس وزراء المجر: عنصرية جماهير أوجبيست جلبت العار للبلاد", + "date": "19/5/2007", + "text": """قال متحدث باسم الحكومة المجرية: إن رئيس الوزراء فيرنك جيوركساني رحب بقرار اتحاد كرة القدم المجري بخصم ثلاث نقاط من نادي أوجبيست بسبب السلوك العنصري الذي صدر من جماهيره. +وعاقب الاتحاد المجري فريق أوجبيست بعد أن سخرت جماهيره من إبراهيم سيديبي مهاجم فريق ديبرينسين الأسود أثناء مباراة الفريقين أوائل مايو الجاري. +يذكر أن الاتحاد فرض أيضا غرامة مالية قدرها 20 ألف دولار علي أوجبيست في عام 2005 بعد أن رددت جماهيره شعارات معادية للسامية خلال مباراة بالدوري المجري. +وأوضح جيوركساني في خطاب إلي إيستفان كيستليكي رئيس الاتحاد المجري لكرة القدم، أن هذا السلوك العنصري من الجماهير «جلب العار لكرة القدم وللمجر». يذكر أن المجر بها مجموعة من مشجعي كرة القدم المشاغبين «الهوليجانز»، وشارك الكثير منهم في أعمال شغب معادية للحكومة في العام الماضي.""", +} +``` ### Data Fields @@ -104,7 +116,20 @@ The data fields are: ### Data Splits -[More Information Needed] +There is only one "training" split for all configuration subsets, containing the following number of examples: + +| | Number of examples | +|:---------------|-------------------:| +| Alittihad | 11551 | +| Almasryalyoum | 3170 | +| Almustaqbal | 12627 | +| Alqabas | 16449 | +| Echoroukonline | 34931 | +| Ryiadh | 13112 | +| Sabanews | 23036 | +| SaudiYoum | 73922 | +| Techreen | 37491 | +| Youm7 | 33460 | ## Dataset Creation diff --git a/datasets/arabic_billion_words/arabic_billion_words.py b/datasets/arabic_billion_words/arabic_billion_words.py index 5b0a671450f..c655505a4cd 100644 --- a/datasets/arabic_billion_words/arabic_billion_words.py +++ b/datasets/arabic_billion_words/arabic_billion_words.py @@ -157,14 +157,14 @@ def _generate_examples(self, filepath): pattern = f"<{data_tag}(.*?)" data = re.finditer(r"" + pattern, current_multi_line, re.MULTILINE | re.DOTALL) text, url, head_line, date = ["", "", "", ""] - for _, record in enumerate(data): + for record in data: try: text = self._clean_text(self._extract_tags(record, "Text")) url = self._extract_tags(record, "URL") head_line = self._clean_text(self._extract_tags(record, "Headline")) date = self._extract_tags(record, "Dateline") - except ValueError: - pass + except IndexError: + continue yield str(_idx), {"url": url, "head_line": head_line, "date": date, "text": text} _idx += 1 current_multi_line = "" diff --git a/datasets/arabic_billion_words/dataset_infos.json b/datasets/arabic_billion_words/dataset_infos.json index d4aae05c463..8abd0e1d3cd 100644 --- a/datasets/arabic_billion_words/dataset_infos.json +++ b/datasets/arabic_billion_words/dataset_infos.json @@ -1 +1 @@ -{"Alittihad": {"description": "Abu El-Khair Corpus is an Arabic text corpus, that includes more than five million newspaper articles.\nIt contains over a billion and a half words in total, out of which, there are about three million unique words.\nThe corpus is encoded with two types of encoding, namely: UTF-8, and Windows CP-1256.\nAlso it was marked with two mark-up languages, namely: SGML, and XML.\n", "citation": "@article{el20161,\n title={1.5 billion words arabic corpus},\n author={El-Khair, Ibrahim Abu},\n journal={arXiv preprint arXiv:1611.04033},\n year={2016}\n}\n", "homepage": "http://abuelkhair.net/index.php/en/arabic/abu-el-khair-corpus", "license": "", "features": {"url": {"dtype": "string", "id": null, "_type": "Value"}, "head_line": {"dtype": "string", "id": null, "_type": "Value"}, "date": {"dtype": "string", "id": null, "_type": "Value"}, "text": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "builder_name": "arabic_billion_words ", "config_name": "Alittihad", "version": {"version_str": "1.1.0", "description": null, "major": 1, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 21619825, "num_examples": 11551, "dataset_name": "arabic_billion_words "}}, "download_checksums": {"http://abuelkhair.net/corpus/Alittihad_XML_utf_8.rar": {"num_bytes": 348259999, "checksum": "6dd90f7ca98699e924e0ea423dc9f4f648c645379f8bffe15eeb97af00fd6fc0"}}, "download_size": 348259999, "post_processing_size": null, "dataset_size": 21619825, "size_in_bytes": 369879824}, "Almasryalyoum": {"description": "Abu El-Khair Corpus is an Arabic text corpus, that includes more than five million newspaper articles.\nIt contains over a billion and a half words in total, out of which, there are about three million unique words.\nThe corpus is encoded with two types of encoding, namely: UTF-8, and Windows CP-1256.\nAlso it was marked with two mark-up languages, namely: SGML, and XML.\n", "citation": "@article{el20161,\n title={1.5 billion words arabic corpus},\n author={El-Khair, Ibrahim Abu},\n journal={arXiv preprint arXiv:1611.04033},\n year={2016}\n}\n", "homepage": "http://abuelkhair.net/index.php/en/arabic/abu-el-khair-corpus", "license": "", "features": {"url": {"dtype": "string", "id": null, "_type": "Value"}, "head_line": {"dtype": "string", "id": null, "_type": "Value"}, "date": {"dtype": "string", "id": null, "_type": "Value"}, "text": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "builder_name": "arabic_billion_words ", "config_name": "Almasryalyoum", "version": {"version_str": "1.1.0", "description": null, "major": 1, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 4214673, "num_examples": 3170, "dataset_name": "arabic_billion_words "}}, "download_checksums": {"http://abuelkhair.net/corpus/Almasryalyoum_XML_utf_8.rar": {"num_bytes": 242604438, "checksum": "f88d24179fa97df8d179242cb564301be2c7a4ecd36a027815b8ce1563059e7a"}}, "download_size": 242604438, "post_processing_size": null, "dataset_size": 4214673, "size_in_bytes": 246819111}, "Almustaqbal": {"description": "Abu El-Khair Corpus is an Arabic text corpus, that includes more than five million newspaper articles.\nIt contains over a billion and a half words in total, out of which, there are about three million unique words.\nThe corpus is encoded with two types of encoding, namely: UTF-8, and Windows CP-1256.\nAlso it was marked with two mark-up languages, namely: SGML, and XML.\n", "citation": "@article{el20161,\n title={1.5 billion words arabic corpus},\n author={El-Khair, Ibrahim Abu},\n journal={arXiv preprint arXiv:1611.04033},\n year={2016}\n}\n", "homepage": "http://abuelkhair.net/index.php/en/arabic/abu-el-khair-corpus", "license": "", "features": {"url": {"dtype": "string", "id": null, "_type": "Value"}, "head_line": {"dtype": "string", "id": null, "_type": "Value"}, "date": {"dtype": "string", "id": null, "_type": "Value"}, "text": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "builder_name": "arabic_billion_words ", "config_name": "Almustaqbal", "version": {"version_str": "1.1.0", "description": null, "major": 1, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 11657265, "num_examples": 12627, "dataset_name": "arabic_billion_words "}}, "download_checksums": {"http://abuelkhair.net/corpus/Almustaqbal_XML_utf_8.rar": {"num_bytes": 350826797, "checksum": "dff3361ad821f3bd3912cd7282db5c15a34919312b9bc7d708a8b30782c7fc36"}}, "download_size": 350826797, "post_processing_size": null, "dataset_size": 11657265, "size_in_bytes": 362484062}, "Alqabas": {"description": "Abu El-Khair Corpus is an Arabic text corpus, that includes more than five million newspaper articles.\nIt contains over a billion and a half words in total, out of which, there are about three million unique words.\nThe corpus is encoded with two types of encoding, namely: UTF-8, and Windows CP-1256.\nAlso it was marked with two mark-up languages, namely: SGML, and XML.\n", "citation": "@article{el20161,\n title={1.5 billion words arabic corpus},\n author={El-Khair, Ibrahim Abu},\n journal={arXiv preprint arXiv:1611.04033},\n year={2016}\n}\n", "homepage": "http://abuelkhair.net/index.php/en/arabic/abu-el-khair-corpus", "license": "", "features": {"url": {"dtype": "string", "id": null, "_type": "Value"}, "head_line": {"dtype": "string", "id": null, "_type": "Value"}, "date": {"dtype": "string", "id": null, "_type": "Value"}, "text": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "builder_name": "arabic_billion_words ", "config_name": "Alqabas", "version": {"version_str": "1.1.0", "description": null, "major": 1, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 15035398, "num_examples": 16449, "dataset_name": "arabic_billion_words "}}, "download_checksums": {"http://abuelkhair.net/corpus/Alqabas_XML_utf_8.rar": {"num_bytes": 595274646, "checksum": "e5ea70add534220a8caf8d230959f134f49a822ce3612adb4f1bb537dc3cc6b4"}}, "download_size": 595274646, "post_processing_size": null, "dataset_size": 15035398, "size_in_bytes": 610310044}, "Echoroukonline": {"description": "Abu El-Khair Corpus is an Arabic text corpus, that includes more than five million newspaper articles.\nIt contains over a billion and a half words in total, out of which, there are about three million unique words.\nThe corpus is encoded with two types of encoding, namely: UTF-8, and Windows CP-1256.\nAlso it was marked with two mark-up languages, namely: SGML, and XML.\n", "citation": "@article{el20161,\n title={1.5 billion words arabic corpus},\n author={El-Khair, Ibrahim Abu},\n journal={arXiv preprint arXiv:1611.04033},\n year={2016}\n}\n", "homepage": "http://abuelkhair.net/index.php/en/arabic/abu-el-khair-corpus", "license": "", "features": {"url": {"dtype": "string", "id": null, "_type": "Value"}, "head_line": {"dtype": "string", "id": null, "_type": "Value"}, "date": {"dtype": "string", "id": null, "_type": "Value"}, "text": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "builder_name": "arabic_billion_words ", "config_name": "Echoroukonline", "version": {"version_str": "1.1.0", "description": null, "major": 1, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 116272124, "num_examples": 34931, "dataset_name": "arabic_billion_words "}}, "download_checksums": {"http://abuelkhair.net/corpus/Echoroukonline_XML_utf_8.rar": {"num_bytes": 108184378, "checksum": "8f3e85bd99caeb9c5c4922edcd18720fc3700fd6751febfa7ee72e05a584a270"}}, "download_size": 108184378, "post_processing_size": null, "dataset_size": 116272124, "size_in_bytes": 224456502}, "Ryiadh": {"description": "Abu El-Khair Corpus is an Arabic text corpus, that includes more than five million newspaper articles.\nIt contains over a billion and a half words in total, out of which, there are about three million unique words.\nThe corpus is encoded with two types of encoding, namely: UTF-8, and Windows CP-1256.\nAlso it was marked with two mark-up languages, namely: SGML, and XML.\n", "citation": "@article{el20161,\n title={1.5 billion words arabic corpus},\n author={El-Khair, Ibrahim Abu},\n journal={arXiv preprint arXiv:1611.04033},\n year={2016}\n}\n", "homepage": "http://abuelkhair.net/index.php/en/arabic/abu-el-khair-corpus", "license": "", "features": {"url": {"dtype": "string", "id": null, "_type": "Value"}, "head_line": {"dtype": "string", "id": null, "_type": "Value"}, "date": {"dtype": "string", "id": null, "_type": "Value"}, "text": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "builder_name": "arabic_billion_words ", "config_name": "Ryiadh", "version": {"version_str": "1.1.0", "description": null, "major": 1, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 18938097, "num_examples": 13112, "dataset_name": "arabic_billion_words "}}, "download_checksums": {"http://abuelkhair.net/corpus/Ryiadh_XML_utf_8.rar": {"num_bytes": 691264971, "checksum": "c934867e53cb57d45ff99a8b5cfa991ae255a1ecb20e79309a41af2aa3e45c15"}}, "download_size": 691264971, "post_processing_size": null, "dataset_size": 18938097, "size_in_bytes": 710203068}, "Sabanews": {"description": "Abu El-Khair Corpus is an Arabic text corpus, that includes more than five million newspaper articles.\nIt contains over a billion and a half words in total, out of which, there are about three million unique words.\nThe corpus is encoded with two types of encoding, namely: UTF-8, and Windows CP-1256.\nAlso it was marked with two mark-up languages, namely: SGML, and XML.\n", "citation": "@article{el20161,\n title={1.5 billion words arabic corpus},\n author={El-Khair, Ibrahim Abu},\n journal={arXiv preprint arXiv:1611.04033},\n year={2016}\n}\n", "homepage": "http://abuelkhair.net/index.php/en/arabic/abu-el-khair-corpus", "license": "", "features": {"url": {"dtype": "string", "id": null, "_type": "Value"}, "head_line": {"dtype": "string", "id": null, "_type": "Value"}, "date": {"dtype": "string", "id": null, "_type": "Value"}, "text": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "builder_name": "arabic_billion_words ", "config_name": "Sabanews", "version": {"version_str": "1.1.0", "description": null, "major": 1, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 49592924, "num_examples": 23036, "dataset_name": "arabic_billion_words "}}, "download_checksums": {"http://abuelkhair.net/corpus/Sabanews_XML_utf_8.rar": {"num_bytes": 38214558, "checksum": "c9b2f1ac8ed2a5e89ab9a6bcd82a0d825569b813b53cd83419968782e9946dbe"}}, "download_size": 38214558, "post_processing_size": null, "dataset_size": 49592924, "size_in_bytes": 87807482}, "SaudiYoum": {"description": "Abu El-Khair Corpus is an Arabic text corpus, that includes more than five million newspaper articles.\nIt contains over a billion and a half words in total, out of which, there are about three million unique words.\nThe corpus is encoded with two types of encoding, namely: UTF-8, and Windows CP-1256.\nAlso it was marked with two mark-up languages, namely: SGML, and XML.\n", "citation": "@article{el20161,\n title={1.5 billion words arabic corpus},\n author={El-Khair, Ibrahim Abu},\n journal={arXiv preprint arXiv:1611.04033},\n year={2016}\n}\n", "homepage": "http://abuelkhair.net/index.php/en/arabic/abu-el-khair-corpus", "license": "", "features": {"url": {"dtype": "string", "id": null, "_type": "Value"}, "head_line": {"dtype": "string", "id": null, "_type": "Value"}, "date": {"dtype": "string", "id": null, "_type": "Value"}, "text": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "builder_name": "arabic_billion_words ", "config_name": "SaudiYoum", "version": {"version_str": "1.1.0", "description": null, "major": 1, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 175560593, "num_examples": 73922, "dataset_name": "arabic_billion_words "}}, "download_checksums": {"http://abuelkhair.net/corpus/SaudiYoum_XML_utf_8.rar": {"num_bytes": 605537923, "checksum": "d4cbb5554acb03fb7ce271a0b708c1bc6bcf31593ae8c670bed7f8c22335a915"}}, "download_size": 605537923, "post_processing_size": null, "dataset_size": 175560593, "size_in_bytes": 781098516}, "Techreen": {"description": "Abu El-Khair Corpus is an Arabic text corpus, that includes more than five million newspaper articles.\nIt contains over a billion and a half words in total, out of which, there are about three million unique words.\nThe corpus is encoded with two types of encoding, namely: UTF-8, and Windows CP-1256.\nAlso it was marked with two mark-up languages, namely: SGML, and XML.\n", "citation": "@article{el20161,\n title={1.5 billion words arabic corpus},\n author={El-Khair, Ibrahim Abu},\n journal={arXiv preprint arXiv:1611.04033},\n year={2016}\n}\n", "homepage": "http://abuelkhair.net/index.php/en/arabic/abu-el-khair-corpus", "license": "", "features": {"url": {"dtype": "string", "id": null, "_type": "Value"}, "head_line": {"dtype": "string", "id": null, "_type": "Value"}, "date": {"dtype": "string", "id": null, "_type": "Value"}, "text": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "builder_name": "arabic_billion_words ", "config_name": "Techreen", "version": {"version_str": "1.1.0", "description": null, "major": 1, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 127556261, "num_examples": 37491, "dataset_name": "arabic_billion_words "}}, "download_checksums": {"http://abuelkhair.net/corpus/Techreen_XML_utf_8.rar": {"num_bytes": 252976781, "checksum": "5e4ab520399069fd38d9d80f4429fc05efaae51a912e1467becfc2686e424770"}}, "download_size": 252976781, "post_processing_size": null, "dataset_size": 127556261, "size_in_bytes": 380533042}, "Youm7": {"description": "Abu El-Khair Corpus is an Arabic text corpus, that includes more than five million newspaper articles.\nIt contains over a billion and a half words in total, out of which, there are about three million unique words.\nThe corpus is encoded with two types of encoding, namely: UTF-8, and Windows CP-1256.\nAlso it was marked with two mark-up languages, namely: SGML, and XML.\n", "citation": "@article{el20161,\n title={1.5 billion words arabic corpus},\n author={El-Khair, Ibrahim Abu},\n journal={arXiv preprint arXiv:1611.04033},\n year={2016}\n}\n", "homepage": "http://abuelkhair.net/index.php/en/arabic/abu-el-khair-corpus", "license": "", "features": {"url": {"dtype": "string", "id": null, "_type": "Value"}, "head_line": {"dtype": "string", "id": null, "_type": "Value"}, "date": {"dtype": "string", "id": null, "_type": "Value"}, "text": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "builder_name": "arabic_billion_words ", "config_name": "Youm7", "version": {"version_str": "1.1.0", "description": null, "major": 1, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 36991165, "num_examples": 33460, "dataset_name": "arabic_billion_words "}}, "download_checksums": {"http://abuelkhair.net/corpus/Youm7_XML_utf_8.rar": {"num_bytes": 617708074, "checksum": "cd81aa0b3d74e5d9a07377369ea473d8a7bd51cb5826e9809d700de2ddeffe23"}}, "download_size": 617708074, "post_processing_size": null, "dataset_size": 36991165, "size_in_bytes": 654699239}} \ No newline at end of file +{"Alittihad": {"description": "Abu El-Khair Corpus is an Arabic text corpus, that includes more than five million newspaper articles.\nIt contains over a billion and a half words in total, out of which, there are about three million unique words.\nThe corpus is encoded with two types of encoding, namely: UTF-8, and Windows CP-1256.\nAlso it was marked with two mark-up languages, namely: SGML, and XML.\n", "citation": "@article{el20161,\n title={1.5 billion words arabic corpus},\n author={El-Khair, Ibrahim Abu},\n journal={arXiv preprint arXiv:1611.04033},\n year={2016}\n}\n", "homepage": "http://abuelkhair.net/index.php/en/arabic/abu-el-khair-corpus", "license": "", "features": {"url": {"dtype": "string", "id": null, "_type": "Value"}, "head_line": {"dtype": "string", "id": null, "_type": "Value"}, "date": {"dtype": "string", "id": null, "_type": "Value"}, "text": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "arabic_billion_words", "config_name": "Alittihad", "version": {"version_str": "1.1.0", "description": null, "major": 1, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 21619825, "num_examples": 11551, "dataset_name": "arabic_billion_words"}}, "download_checksums": {"http://abuelkhair.net/corpus/Alittihad_XML_utf_8.rar": {"num_bytes": 348259999, "checksum": "6dd90f7ca98699e924e0ea423dc9f4f648c645379f8bffe15eeb97af00fd6fc0"}}, "download_size": 348259999, "post_processing_size": null, "dataset_size": 21619825, "size_in_bytes": 369879824}, "Almasryalyoum": {"description": "Abu El-Khair Corpus is an Arabic text corpus, that includes more than five million newspaper articles.\nIt contains over a billion and a half words in total, out of which, there are about three million unique words.\nThe corpus is encoded with two types of encoding, namely: UTF-8, and Windows CP-1256.\nAlso it was marked with two mark-up languages, namely: SGML, and XML.\n", "citation": "@article{el20161,\n title={1.5 billion words arabic corpus},\n author={El-Khair, Ibrahim Abu},\n journal={arXiv preprint arXiv:1611.04033},\n year={2016}\n}\n", "homepage": "http://abuelkhair.net/index.php/en/arabic/abu-el-khair-corpus", "license": "", "features": {"url": {"dtype": "string", "id": null, "_type": "Value"}, "head_line": {"dtype": "string", "id": null, "_type": "Value"}, "date": {"dtype": "string", "id": null, "_type": "Value"}, "text": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "arabic_billion_words", "config_name": "Almasryalyoum", "version": {"version_str": "1.1.0", "description": null, "major": 1, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 4214673, "num_examples": 3170, "dataset_name": "arabic_billion_words"}}, "download_checksums": {"http://abuelkhair.net/corpus/Almasryalyoum_XML_utf_8.rar": {"num_bytes": 242604438, "checksum": "f88d24179fa97df8d179242cb564301be2c7a4ecd36a027815b8ce1563059e7a"}}, "download_size": 242604438, "post_processing_size": null, "dataset_size": 4214673, "size_in_bytes": 246819111}, "Almustaqbal": {"description": "Abu El-Khair Corpus is an Arabic text corpus, that includes more than five million newspaper articles.\nIt contains over a billion and a half words in total, out of which, there are about three million unique words.\nThe corpus is encoded with two types of encoding, namely: UTF-8, and Windows CP-1256.\nAlso it was marked with two mark-up languages, namely: SGML, and XML.\n", "citation": "@article{el20161,\n title={1.5 billion words arabic corpus},\n author={El-Khair, Ibrahim Abu},\n journal={arXiv preprint arXiv:1611.04033},\n year={2016}\n}\n", "homepage": "http://abuelkhair.net/index.php/en/arabic/abu-el-khair-corpus", "license": "", "features": {"url": {"dtype": "string", "id": null, "_type": "Value"}, "head_line": {"dtype": "string", "id": null, "_type": "Value"}, "date": {"dtype": "string", "id": null, "_type": "Value"}, "text": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "arabic_billion_words", "config_name": "Almustaqbal", "version": {"version_str": "1.1.0", "description": null, "major": 1, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 11657265, "num_examples": 12627, "dataset_name": "arabic_billion_words"}}, "download_checksums": {"http://abuelkhair.net/corpus/Almustaqbal_XML_utf_8.rar": {"num_bytes": 350826797, "checksum": "dff3361ad821f3bd3912cd7282db5c15a34919312b9bc7d708a8b30782c7fc36"}}, "download_size": 350826797, "post_processing_size": null, "dataset_size": 11657265, "size_in_bytes": 362484062}, "Alqabas": {"description": "Abu El-Khair Corpus is an Arabic text corpus, that includes more than five million newspaper articles.\nIt contains over a billion and a half words in total, out of which, there are about three million unique words.\nThe corpus is encoded with two types of encoding, namely: UTF-8, and Windows CP-1256.\nAlso it was marked with two mark-up languages, namely: SGML, and XML.\n", "citation": "@article{el20161,\n title={1.5 billion words arabic corpus},\n author={El-Khair, Ibrahim Abu},\n journal={arXiv preprint arXiv:1611.04033},\n year={2016}\n}\n", "homepage": "http://abuelkhair.net/index.php/en/arabic/abu-el-khair-corpus", "license": "", "features": {"url": {"dtype": "string", "id": null, "_type": "Value"}, "head_line": {"dtype": "string", "id": null, "_type": "Value"}, "date": {"dtype": "string", "id": null, "_type": "Value"}, "text": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "arabic_billion_words", "config_name": "Alqabas", "version": {"version_str": "1.1.0", "description": null, "major": 1, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 15035398, "num_examples": 16449, "dataset_name": "arabic_billion_words"}}, "download_checksums": {"http://abuelkhair.net/corpus/Alqabas_XML_utf_8.rar": {"num_bytes": 595274646, "checksum": "e5ea70add534220a8caf8d230959f134f49a822ce3612adb4f1bb537dc3cc6b4"}}, "download_size": 595274646, "post_processing_size": null, "dataset_size": 15035398, "size_in_bytes": 610310044}, "Echoroukonline": {"description": "Abu El-Khair Corpus is an Arabic text corpus, that includes more than five million newspaper articles.\nIt contains over a billion and a half words in total, out of which, there are about three million unique words.\nThe corpus is encoded with two types of encoding, namely: UTF-8, and Windows CP-1256.\nAlso it was marked with two mark-up languages, namely: SGML, and XML.\n", "citation": "@article{el20161,\n title={1.5 billion words arabic corpus},\n author={El-Khair, Ibrahim Abu},\n journal={arXiv preprint arXiv:1611.04033},\n year={2016}\n}\n", "homepage": "http://abuelkhair.net/index.php/en/arabic/abu-el-khair-corpus", "license": "", "features": {"url": {"dtype": "string", "id": null, "_type": "Value"}, "head_line": {"dtype": "string", "id": null, "_type": "Value"}, "date": {"dtype": "string", "id": null, "_type": "Value"}, "text": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "arabic_billion_words", "config_name": "Echoroukonline", "version": {"version_str": "1.1.0", "description": null, "major": 1, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 116272124, "num_examples": 34931, "dataset_name": "arabic_billion_words"}}, "download_checksums": {"http://abuelkhair.net/corpus/Echoroukonline_XML_utf_8.rar": {"num_bytes": 108184378, "checksum": "8f3e85bd99caeb9c5c4922edcd18720fc3700fd6751febfa7ee72e05a584a270"}}, "download_size": 108184378, "post_processing_size": null, "dataset_size": 116272124, "size_in_bytes": 224456502}, "Ryiadh": {"description": "Abu El-Khair Corpus is an Arabic text corpus, that includes more than five million newspaper articles.\nIt contains over a billion and a half words in total, out of which, there are about three million unique words.\nThe corpus is encoded with two types of encoding, namely: UTF-8, and Windows CP-1256.\nAlso it was marked with two mark-up languages, namely: SGML, and XML.\n", "citation": "@article{el20161,\n title={1.5 billion words arabic corpus},\n author={El-Khair, Ibrahim Abu},\n journal={arXiv preprint arXiv:1611.04033},\n year={2016}\n}\n", "homepage": "http://abuelkhair.net/index.php/en/arabic/abu-el-khair-corpus", "license": "", "features": {"url": {"dtype": "string", "id": null, "_type": "Value"}, "head_line": {"dtype": "string", "id": null, "_type": "Value"}, "date": {"dtype": "string", "id": null, "_type": "Value"}, "text": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "arabic_billion_words", "config_name": "Ryiadh", "version": {"version_str": "1.1.0", "description": null, "major": 1, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 18938097, "num_examples": 13112, "dataset_name": "arabic_billion_words"}}, "download_checksums": {"http://abuelkhair.net/corpus/Ryiadh_XML_utf_8.rar": {"num_bytes": 691264971, "checksum": "c934867e53cb57d45ff99a8b5cfa991ae255a1ecb20e79309a41af2aa3e45c15"}}, "download_size": 691264971, "post_processing_size": null, "dataset_size": 18938097, "size_in_bytes": 710203068}, "Sabanews": {"description": "Abu El-Khair Corpus is an Arabic text corpus, that includes more than five million newspaper articles.\nIt contains over a billion and a half words in total, out of which, there are about three million unique words.\nThe corpus is encoded with two types of encoding, namely: UTF-8, and Windows CP-1256.\nAlso it was marked with two mark-up languages, namely: SGML, and XML.\n", "citation": "@article{el20161,\n title={1.5 billion words arabic corpus},\n author={El-Khair, Ibrahim Abu},\n journal={arXiv preprint arXiv:1611.04033},\n year={2016}\n}\n", "homepage": "http://abuelkhair.net/index.php/en/arabic/abu-el-khair-corpus", "license": "", "features": {"url": {"dtype": "string", "id": null, "_type": "Value"}, "head_line": {"dtype": "string", "id": null, "_type": "Value"}, "date": {"dtype": "string", "id": null, "_type": "Value"}, "text": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "arabic_billion_words", "config_name": "Sabanews", "version": {"version_str": "1.1.0", "description": null, "major": 1, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 49592924, "num_examples": 23036, "dataset_name": "arabic_billion_words"}}, "download_checksums": {"http://abuelkhair.net/corpus/Sabanews_XML_utf_8.rar": {"num_bytes": 38214558, "checksum": "c9b2f1ac8ed2a5e89ab9a6bcd82a0d825569b813b53cd83419968782e9946dbe"}}, "download_size": 38214558, "post_processing_size": null, "dataset_size": 49592924, "size_in_bytes": 87807482}, "SaudiYoum": {"description": "Abu El-Khair Corpus is an Arabic text corpus, that includes more than five million newspaper articles.\nIt contains over a billion and a half words in total, out of which, there are about three million unique words.\nThe corpus is encoded with two types of encoding, namely: UTF-8, and Windows CP-1256.\nAlso it was marked with two mark-up languages, namely: SGML, and XML.\n", "citation": "@article{el20161,\n title={1.5 billion words arabic corpus},\n author={El-Khair, Ibrahim Abu},\n journal={arXiv preprint arXiv:1611.04033},\n year={2016}\n}\n", "homepage": "http://abuelkhair.net/index.php/en/arabic/abu-el-khair-corpus", "license": "", "features": {"url": {"dtype": "string", "id": null, "_type": "Value"}, "head_line": {"dtype": "string", "id": null, "_type": "Value"}, "date": {"dtype": "string", "id": null, "_type": "Value"}, "text": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "arabic_billion_words", "config_name": "SaudiYoum", "version": {"version_str": "1.1.0", "description": null, "major": 1, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 175560593, "num_examples": 73922, "dataset_name": "arabic_billion_words"}}, "download_checksums": {"http://abuelkhair.net/corpus/SaudiYoum_XML_utf_8.rar": {"num_bytes": 605537923, "checksum": "d4cbb5554acb03fb7ce271a0b708c1bc6bcf31593ae8c670bed7f8c22335a915"}}, "download_size": 605537923, "post_processing_size": null, "dataset_size": 175560593, "size_in_bytes": 781098516}, "Techreen": {"description": "Abu El-Khair Corpus is an Arabic text corpus, that includes more than five million newspaper articles.\nIt contains over a billion and a half words in total, out of which, there are about three million unique words.\nThe corpus is encoded with two types of encoding, namely: UTF-8, and Windows CP-1256.\nAlso it was marked with two mark-up languages, namely: SGML, and XML.\n", "citation": "@article{el20161,\n title={1.5 billion words arabic corpus},\n author={El-Khair, Ibrahim Abu},\n journal={arXiv preprint arXiv:1611.04033},\n year={2016}\n}\n", "homepage": "http://abuelkhair.net/index.php/en/arabic/abu-el-khair-corpus", "license": "", "features": {"url": {"dtype": "string", "id": null, "_type": "Value"}, "head_line": {"dtype": "string", "id": null, "_type": "Value"}, "date": {"dtype": "string", "id": null, "_type": "Value"}, "text": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "arabic_billion_words", "config_name": "Techreen", "version": {"version_str": "1.1.0", "description": null, "major": 1, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 127556261, "num_examples": 37491, "dataset_name": "arabic_billion_words"}}, "download_checksums": {"http://abuelkhair.net/corpus/Techreen_XML_utf_8.rar": {"num_bytes": 252976781, "checksum": "5e4ab520399069fd38d9d80f4429fc05efaae51a912e1467becfc2686e424770"}}, "download_size": 252976781, "post_processing_size": null, "dataset_size": 127556261, "size_in_bytes": 380533042}, "Youm7": {"description": "Abu El-Khair Corpus is an Arabic text corpus, that includes more than five million newspaper articles.\nIt contains over a billion and a half words in total, out of which, there are about three million unique words.\nThe corpus is encoded with two types of encoding, namely: UTF-8, and Windows CP-1256.\nAlso it was marked with two mark-up languages, namely: SGML, and XML.\n", "citation": "@article{el20161,\n title={1.5 billion words arabic corpus},\n author={El-Khair, Ibrahim Abu},\n journal={arXiv preprint arXiv:1611.04033},\n year={2016}\n}\n", "homepage": "http://abuelkhair.net/index.php/en/arabic/abu-el-khair-corpus", "license": "", "features": {"url": {"dtype": "string", "id": null, "_type": "Value"}, "head_line": {"dtype": "string", "id": null, "_type": "Value"}, "date": {"dtype": "string", "id": null, "_type": "Value"}, "text": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "arabic_billion_words", "config_name": "Youm7", "version": {"version_str": "1.1.0", "description": null, "major": 1, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 36991165, "num_examples": 33460, "dataset_name": "arabic_billion_words"}}, "download_checksums": {"http://abuelkhair.net/corpus/Youm7_XML_utf_8.rar": {"num_bytes": 617708074, "checksum": "cd81aa0b3d74e5d9a07377369ea473d8a7bd51cb5826e9809d700de2ddeffe23"}}, "download_size": 617708074, "post_processing_size": null, "dataset_size": 36991165, "size_in_bytes": 654699239}} \ No newline at end of file