diff --git a/datasets/books3/README.md b/datasets/the_pile_books3/README.md
@@ -20,10 +20,10 @@ task_ids:
 - language-modeling
 ---
 
-# Dataset Card for books3
+# Dataset Card for the_pile_books3
 
 ## Table of Contents
-- [Dataset Card for books3](#dataset-card-for-books3)
+- [Dataset Card for the_pile_books3](#dataset-card-for-the_pile_books3)
 - [Table of Contents](#table-of-contents)
 - [Dataset Description](#dataset-description)
 - [Dataset Summary](#dataset-summary)
diff --git a/datasets/books3/dataset_infos.json b/datasets/the_pile_books3/dataset_infos.json
@@ -1 +1 @@
-{"plain_text": {"description": "Shawn Presser's work. All of bibliotik in plain .txt form, aka 197,000 books processed in exactly \nthe same way as did for bookcorpusopen (a.k.a. books1). seems to be similar to OpenAI's mysterious \n\"books2\" dataset referenced in their papers. Unfortunately OpenAI will not give details, so we know\nvery little about any differences. People suspect it's \"all of libgen\", but it's purely conjecture.\n", "citation": "@article{pile,\n    title={The {P}ile: An 800GB Dataset of Diverse Text for Language Modeling},\n    author={Gao, Leo and Biderman, Stella and Black, Sid and Golding, Laurence and Hoppe, Travis and Foster, Charles and Phang, Jason and He, Horace and Thite, Anish and Nabeshima, Noa and Presser, Shawn and Leahy, Connor},\n    journal={arXiv preprint arXiv:2101.00027},\n    year={2020}\n}\n", "homepage": "https://github.com/soskek/bookcorpus/issues/27#issuecomment-716104208", "license": "", "features": {"title": {"dtype": "string", "id": null, "_type": "Value"}, "text": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "books3", "config_name": "plain_text", "version": {"version_str": "1.0.0", "description": "", "major": 1, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 108395639965, "num_examples": 196640, "dataset_name": "books3"}}, "download_checksums": {"https://the-eye.eu/public/AI/pile_preliminary_components/books3.tar.gz": {"num_bytes": 39516981435, "checksum": "016b90fa6b8507328b6a90d13b0f68c2b87dfd281b35e449a1d466fd9eebc14a"}}, "download_size": 39516981435, "post_processing_size": null, "dataset_size": 108395639965, "size_in_bytes": 147912621400}}
+{"plain_text": {"description": "Shawn Presser's work. All of bibliotik in plain .txt form, aka 197,000 books processed in exactly \nthe same way as did for bookcorpusopen (a.k.a. books1). seems to be similar to OpenAI's mysterious \n\"books2\" dataset referenced in their papers. Unfortunately OpenAI will not give details, so we know\nvery little about any differences. People suspect it's \"all of libgen\", but it's purely conjecture.\n", "citation": "@article{pile,\n    title={The {P}ile: An 800GB Dataset of Diverse Text for Language Modeling},\n    author={Gao, Leo and Biderman, Stella and Black, Sid and Golding, Laurence and Hoppe, Travis and Foster, Charles and Phang, Jason and He, Horace and Thite, Anish and Nabeshima, Noa and Presser, Shawn and Leahy, Connor},\n    journal={arXiv preprint arXiv:2101.00027},\n    year={2020}\n}\n", "homepage": "https://github.com/soskek/bookcorpus/issues/27#issuecomment-716104208", "license": "", "features": {"title": {"dtype": "string", "id": null, "_type": "Value"}, "text": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "the_pile_books3", "config_name": "plain_text", "version": {"version_str": "1.0.0", "description": "", "major": 1, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 108395639965, "num_examples": 196640, "dataset_name": "the_pile_books3"}}, "download_checksums": {"https://the-eye.eu/public/AI/pile_preliminary_components/books3.tar.gz": {"num_bytes": 39516981435, "checksum": "016b90fa6b8507328b6a90d13b0f68c2b87dfd281b35e449a1d466fd9eebc14a"}}, "download_size": 39516981435, "post_processing_size": null, "dataset_size": 108395639965, "size_in_bytes": 147912621400}}
diff --git a/datasets/books3/books3.py b/datasets/the_pile_books3/the_pile_books3.py
@@ -1,5 +1,5 @@
 # coding=utf-8
-# Copyright 2020 The TensorFlow Datasets Authors and the HuggingFace Datasets Authors.
+# Copyright 2021 The TensorFlow Datasets Authors and the HuggingFace Datasets Authors.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -43,17 +43,17 @@
 
 
 class Books3Config(datasets.BuilderConfig):
-    """BuilderConfig for Books3."""
+    """BuilderConfig for ThePileBooks3."""
 
     def __init__(self, **kwargs):
-        """BuilderConfig for Books3.
+        """BuilderConfig for ThePileBooks3.
         Args:
           **kwargs: keyword arguments forwarded to super.
         """
         super(Books3Config, self).__init__(version=datasets.Version("1.0.0", ""), **kwargs)
 
 
-class Books3(datasets.GeneratorBasedBuilder):
+class ThePileBooks3(datasets.GeneratorBasedBuilder):
     """Books3 dataset."""
 
     BUILDER_CONFIGS = [
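With the builder class renamed, the dataset is addressed by its new name from here on. A minimal usage sketch, assuming a `datasets` release that ships the renamed script (note this triggers the full ~39 GB archive download):

```python
from datasets import load_dataset

# The builder now registers as "the_pile_books3"; the old "books3" name
# no longer resolves once this rename lands.
books3 = load_dataset("the_pile_books3", split="train")

# Two string features, matching dataset_infos.json above.
print(books3.column_names)  # ['title', 'text']
```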
diff --git a/datasets/openwebtext2/README.md b/datasets/the_pile_openwebtext2/README.md
@@ -22,10 +22,10 @@ task_ids:
 - text-scoring-other-rating
 ---
 
-# Dataset Card for openwebtext2
+# Dataset Card for the_pile_openwebtext2
 
 ## Table of Contents
-- [Dataset Card for openwebtext2](#dataset-card-for-openwebtext2)
+- [Dataset Card for the_pile_openwebtext2](#dataset-card-for-the_pile_openwebtext2)
 - [Table of Contents](#table-of-contents)
 - [Dataset Description](#dataset-description)
 - [Dataset Summary](#dataset-summary)
diff --git a/datasets/openwebtext2/dataset_infos.json b/datasets/the_pile_openwebtext2/dataset_infos.json
@@ -1 +1 @@
-{"plain_text": {"description": "OpenWebText2 is an enhanced version of the original OpenWebTextCorpus covering all Reddit submissions from 2005 up until April 2020, with further months becoming available after the corresponding PushShift dump files are released.\n", "citation": "@article{pile,\n    title={The {P}ile: An 800GB Dataset of Diverse Text for Language Modeling},\n    author={Gao, Leo and Biderman, Stella and Black, Sid and Golding, Laurence and Hoppe, Travis and Foster, Charles and Phang, Jason and He, Horace and Thite, Anish and Nabeshima, Noa and Presser, Shawn and Leahy, Connor},\n    journal={arXiv preprint arXiv:2101.00027},\n    year={2020}\n}\n", "homepage": "https://openwebtext2.readthedocs.io/en/latest/", "license": "", "features": {"title": {"dtype": "string", "id": null, "_type": "Value"}, "text": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "openwebtext2", "config_name": "plain_text", "version": {"version_str": "1.0.0", "description": null, "major": 1, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 68571017395, "num_examples": 17103059, "dataset_name": "openwebtext2"}}, "download_checksums": {"https://the-eye.eu/public/AI/pile_preliminary_components/openwebtext2.jsonl.zst.tar": {"num_bytes": 29344276480, "checksum": "9043d1b93c35ff1a38a17e16c73c009d4617dcaab6da15adc0faf4779739a027"}}, "download_size": 29344276480, "post_processing_size": null, "dataset_size": 68571017395, "size_in_bytes": 97915293875}}
+{"plain_text": {"description": "OpenWebText2 is an enhanced version of the original OpenWebTextCorpus covering all Reddit submissions from 2005 up until April 2020, with further months becoming available after the corresponding PushShift dump files are released.\n", "citation": "@article{pile,\n    title={The {P}ile: An 800GB Dataset of Diverse Text for Language Modeling},\n    author={Gao, Leo and Biderman, Stella and Black, Sid and Golding, Laurence and Hoppe, Travis and Foster, Charles and Phang, Jason and He, Horace and Thite, Anish and Nabeshima, Noa and Presser, Shawn and Leahy, Connor},\n    journal={arXiv preprint arXiv:2101.00027},\n    year={2020}\n}\n", "homepage": "https://openwebtext2.readthedocs.io/en/latest/", "license": "", "features": {"title": {"dtype": "string", "id": null, "_type": "Value"}, "text": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "the_pile_openwebtext2", "config_name": "plain_text", "version": {"version_str": "1.0.0", "description": null, "major": 1, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 68571017395, "num_examples": 17103059, "dataset_name": "the_pile_openwebtext2"}}, "download_checksums": {"https://the-eye.eu/public/AI/pile_preliminary_components/openwebtext2.jsonl.zst.tar": {"num_bytes": 29344276480, "checksum": "9043d1b93c35ff1a38a17e16c73c009d4617dcaab6da15adc0faf4779739a027"}}, "download_size": 29344276480, "post_processing_size": null, "dataset_size": 68571017395, "size_in_bytes": 97915293875}}
diff --git a/datasets/stack_exchange/README.md b/datasets/the_pile_stack_exchange/README.md
@@ -23,7 +23,7 @@ task_ids:
 # Dataset Card for Stack Exchange
 
 ## Table of Contents
-- [Dataset Card for Stack Exchange](#dataset-card-for-stack-exchange)
+- [Dataset Card for Stack Exchange](#dataset-card-for-the_pile_stack_exchange)
 - [Table of Contents](#table-of-contents)
 - [Dataset Description](#dataset-description)
 - [Dataset Summary](#dataset-summary)
diff --git a/datasets/stack_exchange/dataset_infos.json b/datasets/the_pile_stack_exchange/dataset_infos.json
@@ -1 +1 @@
-{"plain_text": {"description": "This dataset is part of EleutherAI/The Pile dataset and is a dataset for Language Models from processing stackexchange data dump, which is an anonymized dump of all user-contributed content on the Stack Exchange network.\n", "citation": "@article{pile,\n    title={The {P}ile: An 800GB Dataset of Diverse Text for Language Modeling},\n    author={Gao, Leo and Biderman, Stella and Black, Sid and Golding, Laurence and Hoppe, Travis and Foster, Charles and Phang, Jason and He, Horace and Thite, Anish and Nabeshima, Noa and Presser, Shawn and Leahy, Connor},\n    journal={arXiv preprint arXiv:2101.00027},\n    year={2020}\n}\n", "homepage": "https://github.com/EleutherAI/stackexchange-dataset", "license": "", "features": {"domain": {"dtype": "string", "id": null, "_type": "Value"}, "text": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "stack_exchange", "config_name": "plain_text", "version": {"version_str": "1.0.0", "description": null, "major": 1, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 11075434609, "num_examples": 5096117, "dataset_name": "stack_exchange"}}, "download_checksums": {"https://the-eye.eu/public/AI/pile_preliminary_components/stackexchange_dataset.tar": {"num_bytes": 36802959360, "checksum": "f64f31d20db8d8692c1a019314a14974b4911a34ffef126feaf42da88860c666"}}, "download_size": 36802959360, "post_processing_size": null, "dataset_size": 11075434609, "size_in_bytes": 47878393969}}
+{"plain_text": {"description": "This dataset is part of EleutherAI/The Pile dataset and is a dataset for Language Models from processing stackexchange data dump, which is an anonymized dump of all user-contributed content on the Stack Exchange network.\n", "citation": "@article{pile,\n    title={The {P}ile: An 800GB Dataset of Diverse Text for Language Modeling},\n    author={Gao, Leo and Biderman, Stella and Black, Sid and Golding, Laurence and Hoppe, Travis and Foster, Charles and Phang, Jason and He, Horace and Thite, Anish and Nabeshima, Noa and Presser, Shawn and Leahy, Connor},\n    journal={arXiv preprint arXiv:2101.00027},\n    year={2020}\n}\n", "homepage": "https://github.com/EleutherAI/stackexchange-dataset", "license": "", "features": {"domain": {"dtype": "string", "id": null, "_type": "Value"}, "text": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "the_pile_stack_exchange", "config_name": "plain_text", "version": {"version_str": "1.0.0", "description": null, "major": 1, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 11075434609, "num_examples": 5096117, "dataset_name": "the_pile_stack_exchange"}}, "download_checksums": {"https://the-eye.eu/public/AI/pile_preliminary_components/stackexchange_dataset.tar": {"num_bytes": 36802959360, "checksum": "f64f31d20db8d8692c1a019314a14974b4911a34ffef126feaf42da88860c666"}}, "download_size": 36802959360, "post_processing_size": null, "dataset_size": 11075434609, "size_in_bytes": 47878393969}}
diff --git a/datasets/stack_exchange/stack_exchange.py b/datasets/the_pile_stack_exchange/the_pile_stack_exchange.py
@@ -1,5 +1,5 @@
 # coding=utf-8
-# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
+# Copyright 2021 The HuggingFace Datasets Authors and the current dataset script contributor.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -37,7 +37,7 @@
 _URL = "https://the-eye.eu/public/AI/pile_preliminary_components/stackexchange_dataset.tar"
 
 
-class StackExchange(datasets.GeneratorBasedBuilder):
+class ThePileStackExchange(datasets.GeneratorBasedBuilder):
     """The StackExchange dataset."""
 
     BUILDER_CONFIGS = [
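As with books3, the Stack Exchange builder now answers to its prefixed name. A sketch of loading it and checking the two declared features, again assuming a release that includes this script (and a ~37 GB download):

```python
from datasets import load_dataset

# Load under the new name; dataset_infos.json declares 'domain' and 'text'.
stack_exchange = load_dataset("the_pile_stack_exchange", split="train")

example = stack_exchange[0]
print(sorted(example))  # ['domain', 'text']
```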