diff --git a/datasets/books3/README.md b/datasets/the_pile_books3/README.md
@@ -20,10 +20,10 @@ task_ids:
 - language-modeling
 ---
 
-# Dataset Card for books3
+# Dataset Card for the_pile_books3
 
 ## Table of Contents
-- [Dataset Card for books3](#dataset-card-for-books3)
+- [Dataset Card for the_pile_books3](#dataset-card-for-the_pile_books3)
 - [Table of Contents](#table-of-contents)
 - [Dataset Description](#dataset-description)
 - [Dataset Summary](#dataset-summary)
diff --git a/datasets/books3/dataset_infos.json b/datasets/the_pile_books3/dataset_infos.json
@@ -1 +1 @@
-{"plain_text": {"description": "Shawn Presser's work. All of bibliotik in plain .txt form, aka 197,000 books processed in exactly \nthe same way as did for bookcorpusopen (a.k.a. books1). seems to be similar to OpenAI's mysterious \n\"books2\" dataset referenced in their papers. Unfortunately OpenAI will not give details, so we know\nvery little about any differences. People suspect it's \"all of libgen\", but it's purely conjecture.\n", "citation": "@article{pile,\n    title={The {P}ile: An 800GB Dataset of Diverse Text for Language Modeling},\n    author={Gao, Leo and Biderman, Stella and Black, Sid and Golding, Laurence and Hoppe, Travis and Foster, Charles and Phang, Jason and He, Horace and Thite, Anish and Nabeshima, Noa and Presser, Shawn and Leahy, Connor},\n    journal={arXiv preprint arXiv:2101.00027},\n    year={2020}\n}\n", "homepage": "https://github.com/soskek/bookcorpus/issues/27#issuecomment-716104208", "license": "", "features": {"title": {"dtype": "string", "id": null, "_type": "Value"}, "text": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "books3", "config_name": "plain_text", "version": {"version_str": "1.0.0", "description": "", "major": 1, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 108395639965, "num_examples": 196640, "dataset_name": "books3"}}, "download_checksums": {"https://the-eye.eu/public/AI/pile_preliminary_components/books3.tar.gz": {"num_bytes": 39516981435, "checksum": "016b90fa6b8507328b6a90d13b0f68c2b87dfd281b35e449a1d466fd9eebc14a"}}, "download_size": 39516981435, "post_processing_size": null, "dataset_size": 108395639965, "size_in_bytes": 147912621400}}
+{"plain_text": {"description": "Shawn Presser's work. All of bibliotik in plain .txt form, aka 197,000 books processed in exactly \nthe same way as did for bookcorpusopen (a.k.a. books1). seems to be similar to OpenAI's mysterious \n\"books2\" dataset referenced in their papers. Unfortunately OpenAI will not give details, so we know\nvery little about any differences. People suspect it's \"all of libgen\", but it's purely conjecture.\n", "citation": "@article{pile,\n    title={The {P}ile: An 800GB Dataset of Diverse Text for Language Modeling},\n    author={Gao, Leo and Biderman, Stella and Black, Sid and Golding, Laurence and Hoppe, Travis and Foster, Charles and Phang, Jason and He, Horace and Thite, Anish and Nabeshima, Noa and Presser, Shawn and Leahy, Connor},\n    journal={arXiv preprint arXiv:2101.00027},\n    year={2020}\n}\n", "homepage": "https://github.com/soskek/bookcorpus/issues/27#issuecomment-716104208", "license": "", "features": {"title": {"dtype": "string", "id": null, "_type": "Value"}, "text": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "the_pile_books3", "config_name": "plain_text", "version": {"version_str": "1.0.0", "description": "", "major": 1, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 108395639965, "num_examples": 196640, "dataset_name": "the_pile_books3"}}, "download_checksums": {"https://the-eye.eu/public/AI/pile_preliminary_components/books3.tar.gz": {"num_bytes": 39516981435, "checksum": "016b90fa6b8507328b6a90d13b0f68c2b87dfd281b35e449a1d466fd9eebc14a"}}, "download_size": 39516981435, "post_processing_size": null, "dataset_size": 108395639965, "size_in_bytes": 147912621400}}
diff --git a/datasets/books3/books3.py b/datasets/the_pile_books3/the_pile_books3.py
@@ -1,5 +1,5 @@
 # coding=utf-8
-# Copyright 2020 The TensorFlow Datasets Authors and the HuggingFace Datasets Authors.
+# Copyright 2021 The TensorFlow Datasets Authors and the HuggingFace Datasets Authors.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -43,17 +43,17 @@
 
 
 class Books3Config(datasets.BuilderConfig):
-    """BuilderConfig for Books3."""
+    """BuilderConfig for ThePileBooks3."""
 
     def __init__(self, **kwargs):
-        """BuilderConfig for Books3.
+        """BuilderConfig for ThePileBooks3.
         Args:
           **kwargs: keyword arguments forwarded to super.
         """
         super(Books3Config, self).__init__(version=datasets.Version("1.0.0", ""), **kwargs)
 
 
-class Books3(datasets.GeneratorBasedBuilder):
+class ThePileBooks3(datasets.GeneratorBasedBuilder):
     """Books3 dataset."""
 
     BUILDER_CONFIGS = [
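With the builder class renamed, the dataset is addressed by its new name from here on. A minimal usage sketch, assuming a `datasets` release that ships the renamed script (note this triggers the full ~39 GB archive download):

```python
from datasets import load_dataset

# The builder now registers as "the_pile_books3"; the old "books3" name
# no longer resolves once this rename lands.
books3 = load_dataset("the_pile_books3", split="train")

# Two string features, matching dataset_infos.json above.
print(books3.column_names)  # ['title', 'text']
```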
diff --git a/datasets/openwebtext2/README.md b/datasets/the_pile_openwebtext2/README.md
@@ -22,10 +22,10 @@ task_ids:
 - text-scoring-other-rating
 ---
 
-# Dataset Card for openwebtext2
+# Dataset Card for the_pile_openwebtext2
 
 ## Table of Contents
-- [Dataset Card for openwebtext2](#dataset-card-for-openwebtext2)
+- [Dataset Card for the_pile_openwebtext2](#dataset-card-for-the_pile_openwebtext2)
 - [Table of Contents](#table-of-contents)
 - [Dataset Description](#dataset-description)
 - [Dataset Summary](#dataset-summary)
diff --git a/datasets/openwebtext2/dataset_infos.json b/datasets/the_pile_openwebtext2/dataset_infos.json
@@ -1 +1 @@
-{"plain_text": {"description": "OpenWebText2 is an enhanced version of the original OpenWebTextCorpus covering all Reddit submissions from 2005 up until April 2020, with further months becoming available after the corresponding PushShift dump files are released.\n", "citation": "@article{pile,\n    title={The {P}ile: An 800GB Dataset of Diverse Text for Language Modeling},\n    author={Gao, Leo and Biderman, Stella and Black, Sid and Golding, Laurence and Hoppe, Travis and Foster, Charles and Phang, Jason and He, Horace and Thite, Anish and Nabeshima, Noa and Presser, Shawn and Leahy, Connor},\n    journal={arXiv preprint arXiv:2101.00027},\n    year={2020}\n}\n", "homepage": "https://openwebtext2.readthedocs.io/en/latest/", "license": "", "features": {"title": {"dtype": "string", "id": null, "_type": "Value"}, "text": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "openwebtext2", "config_name": "plain_text", "version": {"version_str": "1.0.0", "description": null, "major": 1, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 68571017395, "num_examples": 17103059, "dataset_name": "openwebtext2"}}, "download_checksums": {"https://the-eye.eu/public/AI/pile_preliminary_components/openwebtext2.jsonl.zst.tar": {"num_bytes": 29344276480, "checksum": "9043d1b93c35ff1a38a17e16c73c009d4617dcaab6da15adc0faf4779739a027"}}, "download_size": 29344276480, "post_processing_size": null, "dataset_size": 68571017395, "size_in_bytes": 97915293875}}
+{"plain_text": {"description": "OpenWebText2 is an enhanced version of the original OpenWebTextCorpus covering all Reddit submissions from 2005 up until April 2020, with further months becoming available after the corresponding PushShift dump files are released.\n", "citation": "@article{pile,\n    title={The {P}ile: An 800GB Dataset of Diverse Text for Language Modeling},\n    author={Gao, Leo and Biderman, Stella and Black, Sid and Golding, Laurence and Hoppe, Travis and Foster, Charles and Phang, Jason and He, Horace and Thite, Anish and Nabeshima, Noa and Presser, Shawn and Leahy, Connor},\n    journal={arXiv preprint arXiv:2101.00027},\n    year={2020}\n}\n", "homepage": "https://openwebtext2.readthedocs.io/en/latest/", "license": "", "features": {"title": {"dtype": "string", "id": null, "_type": "Value"}, "text": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "the_pile_openwebtext2", "config_name": "plain_text", "version": {"version_str": "1.0.0", "description": null, "major": 1, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 68571017395, "num_examples": 17103059, "dataset_name": "the_pile_openwebtext2"}}, "download_checksums": {"https://the-eye.eu/public/AI/pile_preliminary_components/openwebtext2.jsonl.zst.tar": {"num_bytes": 29344276480, "checksum": "9043d1b93c35ff1a38a17e16c73c009d4617dcaab6da15adc0faf4779739a027"}}, "download_size": 29344276480, "post_processing_size": null, "dataset_size": 68571017395, "size_in_bytes": 97915293875}}
diff --git a/datasets/stack_exchange/README.md b/datasets/the_pile_stack_exchange/README.md
@@ -23,7 +23,7 @@ task_ids:
 # Dataset Card for Stack Exchange
 
 ## Table of Contents
-- [Dataset Card for Stack Exchange](#dataset-card-for-stack-exchange)
+- [Dataset Card for Stack Exchange](#dataset-card-for-the_pile_stack_exchange)
 - [Table of Contents](#table-of-contents)
 - [Dataset Description](#dataset-description)
 - [Dataset Summary](#dataset-summary)
diff --git a/datasets/stack_exchange/dataset_infos.json b/datasets/the_pile_stack_exchange/dataset_infos.json
@@ -1 +1 @@
-{"plain_text": {"description": "This dataset is part of EleutherAI/The Pile dataset and is a dataset for Language Models from processing stackexchange data dump, which is an anonymized dump of all user-contributed content on the Stack Exchange network.\n", "citation": "@article{pile,\n    title={The {P}ile: An 800GB Dataset of Diverse Text for Language Modeling},\n    author={Gao, Leo and Biderman, Stella and Black, Sid and Golding, Laurence and Hoppe, Travis and Foster, Charles and Phang, Jason and He, Horace and Thite, Anish and Nabeshima, Noa and Presser, Shawn and Leahy, Connor},\n    journal={arXiv preprint arXiv:2101.00027},\n    year={2020}\n}\n", "homepage": "https://github.com/EleutherAI/stackexchange-dataset", "license": "", "features": {"domain": {"dtype": "string", "id": null, "_type": "Value"}, "text": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "stack_exchange", "config_name": "plain_text", "version": {"version_str": "1.0.0", "description": null, "major": 1, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 11075434609, "num_examples": 5096117, "dataset_name": "stack_exchange"}}, "download_checksums": {"https://the-eye.eu/public/AI/pile_preliminary_components/stackexchange_dataset.tar": {"num_bytes": 36802959360, "checksum": "f64f31d20db8d8692c1a019314a14974b4911a34ffef126feaf42da88860c666"}}, "download_size": 36802959360, "post_processing_size": null, "dataset_size": 11075434609, "size_in_bytes": 47878393969}}
+{"plain_text": {"description": "This dataset is part of EleutherAI/The Pile dataset and is a dataset for Language Models from processing stackexchange data dump, which is an anonymized dump of all user-contributed content on the Stack Exchange network.\n", "citation": "@article{pile,\n    title={The {P}ile: An 800GB Dataset of Diverse Text for Language Modeling},\n    author={Gao, Leo and Biderman, Stella and Black, Sid and Golding, Laurence and Hoppe, Travis and Foster, Charles and Phang, Jason and He, Horace and Thite, Anish and Nabeshima, Noa and Presser, Shawn and Leahy, Connor},\n    journal={arXiv preprint arXiv:2101.00027},\n    year={2020}\n}\n", "homepage": "https://github.com/EleutherAI/stackexchange-dataset", "license": "", "features": {"domain": {"dtype": "string", "id": null, "_type": "Value"}, "text": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "the_pile_stack_exchange", "config_name": "plain_text", "version": {"version_str": "1.0.0", "description": null, "major": 1, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 11075434609, "num_examples": 5096117, "dataset_name": "the_pile_stack_exchange"}}, "download_checksums": {"https://the-eye.eu/public/AI/pile_preliminary_components/stackexchange_dataset.tar": {"num_bytes": 36802959360, "checksum": "f64f31d20db8d8692c1a019314a14974b4911a34ffef126feaf42da88860c666"}}, "download_size": 36802959360, "post_processing_size": null, "dataset_size": 11075434609, "size_in_bytes": 47878393969}}
diff --git a/datasets/stack_exchange/stack_exchange.py b/datasets/the_pile_stack_exchange/the_pile_stack_exchange.py
@@ -1,5 +1,5 @@
 # coding=utf-8
-# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
+# Copyright 2021 The HuggingFace Datasets Authors and the current dataset script contributor.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -37,7 +37,7 @@
 _URL = "https://the-eye.eu/public/AI/pile_preliminary_components/stackexchange_dataset.tar"
 
 
-class StackExchange(datasets.GeneratorBasedBuilder):
+class ThePileStackExchange(datasets.GeneratorBasedBuilder):
     """The StackExchange dataset."""
 
     BUILDER_CONFIGS = [
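As with books3, the Stack Exchange builder now answers to its prefixed name. A sketch of loading it and checking the two declared features, again assuming a release that includes this script (and a ~37 GB download):

```python
from datasets import load_dataset

# Load under the new name; dataset_infos.json declares 'domain' and 'text'.
stack_exchange = load_dataset("the_pile_stack_exchange", split="train")

example = stack_exchange[0]
print(sorted(example))  # ['domain', 'text']
```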