huggingface · lhoestq · Aug 18, 2021 · Aug 14, 2021 · Aug 14, 2021 · Aug 14, 2021
diff --git a/datasets/books3/README.md b/datasets/books3/README.md
@@ -0,0 +1,169 @@
+---
+annotations_creators:
+- no-annotation
+language_creators:
+- found
+languages:
+- en
+licenses:
+- mit
+multilinguality:
+- monolingual
+pretty_name: Books3
+size_categories:
+- unknown
+source_datasets:
+- original
+task_categories:
+- sequence-modeling
+task_ids:
+- language-modeling
+---
+
+# Dataset Card for books3
+
+## Table of Contents
+- [Dataset Card for books3](#dataset-card-for-books3)
+  - [Table of Contents](#table-of-contents)
+  - [Dataset Description](#dataset-description)
+    - [Dataset Summary](#dataset-summary)
+    - [Supported Tasks and Leaderboards](#supported-tasks-and-leaderboards)
+    - [Languages](#languages)
+  - [Dataset Structure](#dataset-structure)
+    - [Data Instances](#data-instances)
+    - [Data Fields](#data-fields)
+    - [Data Splits](#data-splits)
+  - [Dataset Creation](#dataset-creation)
+    - [Curation Rationale](#curation-rationale)
+    - [Source Data](#source-data)
+      - [Initial Data Collection and Normalization](#initial-data-collection-and-normalization)
+      - [Who are the source language producers?](#who-are-the-source-language-producers)
+    - [Annotations](#annotations)
+      - [Annotation process](#annotation-process)
+      - [Who are the annotators?](#who-are-the-annotators)
+    - [Personal and Sensitive Information](#personal-and-sensitive-information)
+  - [Considerations for Using the Data](#considerations-for-using-the-data)
+    - [Social Impact of Dataset](#social-impact-of-dataset)
+    - [Discussion of Biases](#discussion-of-biases)
+    - [Other Known Limitations](#other-known-limitations)
+  - [Additional Information](#additional-information)
+    - [Dataset Curators](#dataset-curators)
+    - [Licensing Information](#licensing-information)
+    - [Citation Information](#citation-information)
+    - [Contributions](#contributions)
+
+## Dataset Description
+
+- **Homepage:** https://github.com/soskek/bookcorpus/issues/27#issuecomment-716104208
+- **Repository:** [Needs More Information]
+- **Paper:** https://arxiv.org/abs/2101.00027
+- **Leaderboard:** [Needs More Information]
+- **Point of Contact:** [Needs More Information]
+
+### Dataset Summary
+
+This dataset is Shawn Presser's work and is part of EleutherAI's The Pile dataset.
+
+This dataset contains all of bibliotik in plain .txt form, aka 197,000 books processed in exactly the same way as was done for bookcorpusopen (a.k.a. books1). It seems to be similar to OpenAI's mysterious "books2" dataset referenced in their papers. Unfortunately, OpenAI will not give details, so we know very little about any differences. People suspect it's "all of libgen", but that is purely conjecture.
+
+|metric|size|
+|------|----|
+|download_size|36.8 GiB|
+|dataset_size|100.9 GiB|
+
+### Supported Tasks and Leaderboards
+
+- `lm`
+
+### Languages
+
+- `en`
+
+## Dataset Structure
+
+### Data Instances
+
+```
+{'title': '07 LEGO Ninjago - The Search For Zane (Scholastic) - Kate Howard (retail)',
+'text': '\n\nTITLE PAGE\n\nFROM THE JOURNAL OF SENSEI GARMADON\n\nCHAPTER 1\n\nCHAPTER 2\n\nCHAPTER 3\n\nCHAPTER 4\n\nCHAPTER 5\n\nCHAPTER 6\n\nCHAPTER 7\n\nCHAPTER 8\n\nCHAPTER 9\n\nCOPYRIGHT\n\nThroughout Ninjago™, five ninja are well-known for their speed, strength, and — of course — the elemental powers that help them protect our world from evil. But there are others who possess some of the same powers as the ninja. Others who may not always use their powers for good.\n\nBefore now, the ninja believed they were special. They di.......'}
+```
+
+### Data Fields
+
+- `title`
+- `text`
+
+### Data Splits
+
+|split|num examples|
+|-----|------------|
+|train|196640|
+
+## Dataset Creation
+
+### Curation Rationale
+
+[Needs More Information]
+
+### Source Data
+
+#### Initial Data Collection and Normalization
+
+[Needs More Information]
+
+#### Who are the source language producers?
+
+[Needs More Information]
+
+### Annotations
+
+#### Annotation process
+
+[Needs More Information]
+
+#### Who are the annotators?
+
+[Needs More Information]
+
+### Personal and Sensitive Information
+
+[Needs More Information]
+
+## Considerations for Using the Data
+
+### Social Impact of Dataset
+
+[Needs More Information]
+
+### Discussion of Biases
+
+[Needs More Information]
+
+### Other Known Limitations
+
+[Needs More Information]
+
+## Additional Information
+
+### Dataset Curators
+
+[Needs More Information]
+
+### Licensing Information
+
+MIT
+
+### Citation Information
+
+```
+@article{pile,
+    title={The {P}ile: An 800GB Dataset of Diverse Text for Language Modeling},
+    author={Gao, Leo and Biderman, Stella and Black, Sid and Golding, Laurence and Hoppe, Travis and Foster, Charles and Phang, Jason and He, Horace and Thite, Anish and Nabeshima, Noa and Presser, Shawn and Leahy, Connor},
+    journal={arXiv preprint arXiv:2101.00027},
+    year={2020}
+}
+```
+
+### Contributions
+
+[@shawwn](https://github.com/shawwn) created this dataset.
+[@richarddwang](https://github.com/richarddwang) added this dataset.
diff --git a/datasets/books3/books3.py b/datasets/books3/books3.py
@@ -0,0 +1,98 @@
+# coding=utf-8
+# Copyright 2020 The TensorFlow Datasets Authors and the HuggingFace Datasets Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Lint as: python3
+"""The Books3 dataset based on Shawn Presser's work https://github.com/soskek/bookcorpus/issues/27 """
+
+
+from pathlib import Path
+
+import datasets
+
+
+_DESCRIPTION = """\
+This dataset is Shawn Presser's work and is part of EleutherAi/The Pile dataset. \
+This dataset contains all of bibliotik in plain .txt form, aka 197,000 books processed in exactly \
+the same way as did for bookcorpusopen (a.k.a. books1). seems to be similar to OpenAI's mysterious \
+"books2" dataset referenced in their papers. Unfortunately OpenAI will not give details, so we know \
+very little about any differences. People suspect it's "all of libgen", but it's purely conjecture.
+"""  # summary text passed as `description` to DatasetInfo in Books3._info
+
+_CITATION = """\
+@article{pile,
+    title={The {P}ile: An 800GB Dataset of Diverse Text for Language Modeling},
+    author={Gao, Leo and Biderman, Stella and Black, Sid and Golding, Laurence and Hoppe, Travis and Foster, Charles and Phang, Jason and He, Horace and Thite, Anish and Nabeshima, Noa and Presser, Shawn and Leahy, Connor},
+    journal={arXiv preprint arXiv:2101.00027},
+    year={2020}
+}
+"""  # BibTeX entry for The Pile paper, passed as `citation` to DatasetInfo
+_PROJECT_URL = "https://github.com/soskek/bookcorpus/issues/27#issuecomment-716104208"  # dataset homepage
+_DOWNLOAD_URL = "https://the-eye.eu/public/AI/pile_preliminary_components/books3.tar.gz"  # source archive
+
+
+class Books3Config(datasets.BuilderConfig):
+    """BuilderConfig for Books3."""
+
+    def __init__(self, **kwargs):
+        """BuilderConfig for Books3.
+        Args:
+        **kwargs: keyword arguments forwarded to super.
+        """
+        super(Books3Config, self).__init__(version=datasets.Version("1.0.0", ""), **kwargs)
+
+
+class Books3(datasets.GeneratorBasedBuilder):
+    """Books3 dataset."""
+
+    BUILDER_CONFIGS = [
+        Books3Config(
+            name="plain_text",
+            description="Plain text",
+        )
+    ]
+    # Every example is a whole book thus big, adjust writer_batch_size to avoid OOM at the cost of writing speed
+    DEFAULT_WRITER_BATCH_SIZE = 500
+
+    def _info(self):
+        return datasets.DatasetInfo(
+            description=_DESCRIPTION,
+            features=datasets.Features(
+                {
+                    "title": datasets.Value("string"),
+                    "text": datasets.Value("string"),
+                }
+            ),
+            supervised_keys=None,
+            homepage=_PROJECT_URL,
+            citation=_CITATION,
+        )
+
+    def _split_generators(self, dl_manager):
+        extracted_dir = dl_manager.download_and_extract(_DOWNLOAD_URL)
+        leaf_dirs = Path(extracted_dir).glob("**/Bibliotik/*")
+        leaf_dirs = sorted(leaf_dirs)
+
+        return [
+            datasets.SplitGenerator(name=datasets.Split.TRAIN, gen_kwargs={"leaf_dirs": leaf_dirs}),
+        ]
+
+    def _generate_examples(self, leaf_dirs):
+        _id = 0
+        for leaf_dir in leaf_dirs:
+            for path in sorted(leaf_dir.glob("**/*.epub.txt")):
+                with path.open(mode="r", encoding="utf-8") as f:
+                    entry = {"title": path.name, "text": f.read()}
+                yield _id, entry
+                _id += 1
diff --git a/datasets/books3/dataset_infos.json b/datasets/books3/dataset_infos.json
@@ -0,0 +1 @@
+{"plain_text": {"description": "Shawn Presser's work. All of bibliotik in plain .txt form, aka 197,000 books processed in exactly \nthe same way as did for bookcorpusopen (a.k.a. books1). seems to be similar to OpenAI's mysterious \n\"books2\" dataset referenced in their papers. Unfortunately OpenAI will not give details, so we know\nvery little about any differences. People suspect it's \"all of libgen\", but it's purely conjecture.\n", "citation": "@article{pile,\n    title={The {P}ile: An 800GB Dataset of Diverse Text for Language Modeling},\n    author={Gao, Leo and Biderman, Stella and Black, Sid and Golding, Laurence and Hoppe, Travis and Foster, Charles and Phang, Jason and He, Horace and Thite, Anish and Nabeshima, Noa and Presser, Shawn and Leahy, Connor},\n    journal={arXiv preprint arXiv:2101.00027},\n    year={2020}\n}\n", "homepage": "https://github.com/soskek/bookcorpus/issues/27#issuecomment-716104208", "license": "", "features": {"title": {"dtype": "string", "id": null, "_type": "Value"}, "text": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "books3", "config_name": "plain_text", "version": {"version_str": "1.0.0", "description": "", "major": 1, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 108395639965, "num_examples": 196640, "dataset_name": "books3"}}, "download_checksums": {"https://the-eye.eu/public/AI/pile_preliminary_components/books3.tar.gz": {"num_bytes": 39516981435, "checksum": "016b90fa6b8507328b6a90d13b0f68c2b87dfd281b35e449a1d466fd9eebc14a"}}, "download_size": 39516981435, "post_processing_size": null, "dataset_size": 108395639965, "size_in_bytes": 147912621400}}
diff --git a/datasets/books3/dummy/plain_text/1.0.0/dummy_data.zip b/datasets/books3/dummy/plain_text/1.0.0/dummy_data.zip
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1 @@
		{"plain_text": {"description": "Shawn Presser's work. All of bibliotik in plain .txt form, aka 197,000 books processed in exactly \nthe same way as did for bookcorpusopen (a.k.a. books1). seems to be similar to OpenAI's mysterious \n\"books2\" dataset referenced in their papers. Unfortunately OpenAI will not give details, so we know\nvery little about any differences. People suspect it's \"all of libgen\", but it's purely conjecture.\n", "citation": "@article{pile,\n title={The {P}ile: An 800GB Dataset of Diverse Text for Language Modeling},\n author={Gao, Leo and Biderman, Stella and Black, Sid and Golding, Laurence and Hoppe, Travis and Foster, Charles and Phang, Jason and He, Horace and Thite, Anish and Nabeshima, Noa and Presser, Shawn and Leahy, Connor},\n journal={arXiv preprint arXiv:2101.00027},\n year={2020}\n}\n", "homepage": "https://github.com/soskek/bookcorpus/issues/27#issuecomment-716104208", "license": "", "features": {"title": {"dtype": "string", "id": null, "_type": "Value"}, "text": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "books3", "config_name": "plain_text", "version": {"version_str": "1.0.0", "description": "", "major": 1, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 108395639965, "num_examples": 196640, "dataset_name": "books3"}}, "download_checksums": {"https://the-eye.eu/public/AI/pile_preliminary_components/books3.tar.gz": {"num_bytes": 39516981435, "checksum": "016b90fa6b8507328b6a90d13b0f68c2b87dfd281b35e449a1d466fd9eebc14a"}}, "download_size": 39516981435, "post_processing_size": null, "dataset_size": 108395639965, "size_in_bytes": 147912621400}}