163 changes: 163 additions & 0 deletions datasets/enwik8/README.md
@@ -0,0 +1,163 @@
---
annotations_creators:
- no-annotation
language_creators:
- found
languages:
- en
licenses:
- mit
multilinguality:
- monolingual
pretty_name: enwik8
size_categories:
- 10K<n<100K
source_datasets:
- original
task_categories:
- fill-mask
- text-generation
task_ids:
- language-modeling
- masked-language-modeling
---

# Dataset Card for enwik8

## Table of Contents
- [Dataset Description](#dataset-description)
- [Dataset Summary](#dataset-summary)
- [Supported Tasks](#supported-tasks-and-leaderboards)
- [Languages](#languages)
- [Dataset Structure](#dataset-structure)
- [Data Instances](#data-instances)
- [Data Fields](#data-fields)
  - [Data Splits](#data-splits)
- [Dataset Creation](#dataset-creation)
- [Curation Rationale](#curation-rationale)
- [Source Data](#source-data)
- [Annotations](#annotations)
- [Personal and Sensitive Information](#personal-and-sensitive-information)
- [Considerations for Using the Data](#considerations-for-using-the-data)
- [Social Impact of Dataset](#social-impact-of-dataset)
- [Discussion of Biases](#discussion-of-biases)
- [Other Known Limitations](#other-known-limitations)
- [Additional Information](#additional-information)
- [Dataset Curators](#dataset-curators)
- [Licensing Information](#licensing-information)
- [Citation Information](#citation-information)

## Dataset Description

- **Homepage:** https://cs.fit.edu/~mmahoney/compression/textdata.html
- **Repository:** [Needs More Information]
- **Paper:** [Needs More Information]
- **Leaderboard:** [Needs More Information]
- **Point of Contact:** [Needs More Information]

### Dataset Summary

The enwik8 dataset is based on Wikipedia and is typically used to benchmark a model's ability to compress data. It consists of the first 10^8 bytes of an English Wikipedia XML dump from 2006.

### Supported Tasks and Leaderboards

[Needs More Information]

### Languages

The dataset is in English (`en`).

## Dataset Structure

### Data Instances

- **Size of downloaded dataset files:** 33.39 MB
- **Size of generated dataset files:** 99.47 MB
- **Total size:** 132.86 MB

```
{
    "text": "In [[Denmark]], the [[Freetown Christiania]] was created in downtown [[Copenhagen]]...."
}
```

### Data Fields

The data fields are the same among all sets.

#### enwik8

- `text`: a `string` feature.

#### enwik8-raw

- `text`: a `string` feature.

### Data Splits

| dataset | train |
| --- | --- |
| enwik8 | 1128024 |
| enwik8-raw | 1 |
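The gap between the two row counts comes from how each config parses the same file: `enwik8` emits one example per line, while `enwik8-raw` emits the entire file as a single example. A minimal sketch of the two modes, using a toy string in place of the real ~100 MB dump (the variable names here are illustrative, not part of the dataset):

```python
# Toy string standing in for the Wikipedia dump; "\n\n" includes a blank line.
toy_dump = "In [[Denmark]], the [[Freetown Christiania]]\n\nwas created in [[Copenhagen]].\n"

# "enwik8"-style parsing: one example per line, with whitespace-only
# lines kept as empty strings (so row counts match the raw line count).
line_examples = [line if line.strip() else "" for line in toy_dump.splitlines()]

# "enwik8-raw"-style parsing: the whole file is a single example.
raw_examples = [toy_dump]

print(len(line_examples))  # 3 examples, one per line (including the blank one)
print(len(raw_examples))   # 1 example
```

The same logic applied to the full dump yields the 1128024-row `enwik8` split versus the single-row `enwik8-raw` split shown above.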

## Dataset Creation

### Curation Rationale

[Needs More Information]

### Source Data

#### Initial Data Collection and Normalization

[Needs More Information]

#### Who are the source language producers?

[Needs More Information]

### Annotations

#### Annotation process

[Needs More Information]

#### Who are the annotators?

[Needs More Information]

### Personal and Sensitive Information

[Needs More Information]

## Considerations for Using the Data

### Social Impact of Dataset

[Needs More Information]

### Discussion of Biases

[Needs More Information]

### Other Known Limitations

[Needs More Information]

## Additional Information

### Dataset Curators

[Needs More Information]

### Licensing Information

[Needs More Information]

### Citation Information

The dataset is not part of a publication and therefore cannot be cited.

### Contributions

Thanks to [@HallerPatrick](https://github.com/HallerPatrick) for adding this dataset.
1 change: 1 addition & 0 deletions datasets/enwik8/dataset_infos.json
@@ -0,0 +1 @@
{"enwik8": {"description": "The dataset is based on the Hutter Prize (http://prize.hutter1.net) and contains the first 10^8 byte of Wikipedia\n", "citation": "", "homepage": "https://cs.fit.edu/~mmahoney/compression/textdata.html", "license": "", "features": {"text": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "enwik8", "config_name": "enwik8", "version": {"version_str": "1.1.0", "description": null, "major": 1, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 104299244, "num_examples": 1128024, "dataset_name": "enwik8"}}, "download_checksums": {"http://cs.fit.edu/~mmahoney/compression/enwik8.zip": {"num_bytes": 35012219, "checksum": "9591b88a79ef28eeef58b6213ffbbc1b793db83d67b7d451061829b38e0dcc69"}}, "download_size": 35012219, "post_processing_size": null, "dataset_size": 104299244, "size_in_bytes": 139311463}, "enwik8-raw": {"description": "The dataset is based on the Hutter Prize (http://prize.hutter1.net) and contains the first 10^8 byte of Wikipedia\n", "citation": "", "homepage": "https://cs.fit.edu/~mmahoney/compression/textdata.html", "license": "", "features": {"text": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "enwik8", "config_name": "enwik8-raw", "version": {"version_str": "1.1.0", "description": null, "major": 1, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 100000004, "num_examples": 1, "dataset_name": "enwik8"}}, "download_checksums": {"http://cs.fit.edu/~mmahoney/compression/enwik8.zip": {"num_bytes": 35012219, "checksum": "9591b88a79ef28eeef58b6213ffbbc1b793db83d67b7d451061829b38e0dcc69"}}, "download_size": 35012219, "post_processing_size": null, "dataset_size": 100000004, "size_in_bytes": 135012223}}
Binary file added datasets/enwik8/dummy/enwik8/1.1.0/dummy_data.zip
Binary file not shown.
95 changes: 95 additions & 0 deletions datasets/enwik8/enwik8.py
@@ -0,0 +1,95 @@
# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os

import datasets


_CITATION = ""

# You can copy an official description
_DESCRIPTION = """\
The dataset is based on the Hutter Prize (http://prize.hutter1.net) and contains the first 10^8 bytes of Wikipedia
"""

_HOMEPAGE = "https://cs.fit.edu/~mmahoney/compression/textdata.html"

_LICENSE = ""

# The HuggingFace Datasets library doesn't host the datasets but only points to the original files.
# This can be an arbitrary nested dict/list of URLs (see below in `_split_generators` method)
_URLS = {"source": "http://cs.fit.edu/~mmahoney/compression/enwik8.zip"}


class Enwik8(datasets.GeneratorBasedBuilder):

VERSION = datasets.Version("1.1.0")

BUILDER_CONFIGS = [
datasets.BuilderConfig(
name="enwik8",
version=VERSION,
            description="This config splits the content into one example per line",
),
datasets.BuilderConfig(
name="enwik8-raw",
version=VERSION,
            description="This config keeps the content as a single raw string",
),
]

DEFAULT_CONFIG_NAME = "enwik8"

def _info(self):

return datasets.DatasetInfo(
description=_DESCRIPTION,
features=datasets.Features(
{
"text": datasets.Value("string"),
}
),
homepage=_HOMEPAGE,
license=_LICENSE,
citation=_CITATION,
)

def _split_generators(self, dl_manager):

urls = _URLS["source"]
data_dir = dl_manager.download_and_extract(urls)

return [
datasets.SplitGenerator(
name=datasets.Split.TRAIN,
gen_kwargs={
"filepath": os.path.join(data_dir, "enwik8"),
"split": "train",
},
)
]

# method parameters are unpacked from `gen_kwargs` as given in `_split_generators`
def _generate_examples(self, filepath, split):
with open(filepath, encoding="utf-8") as f:
if self.config.name.endswith("raw"):
yield 0, {"text": f.read()}
else:
lines = f.readlines()
for key, line in enumerate(lines):
if line.strip():
yield key, {"text": line}
else:
yield key, {"text": ""}
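The line-splitting branch of `_generate_examples` can be sanity-checked in isolation. The sketch below mirrors that branch in a standalone function (`generate_line_examples` is a hypothetical helper written here for illustration, not part of the script) and runs it against a small temporary file:

```python
import os
import tempfile

def generate_line_examples(filepath):
    # Mirrors the non-raw branch above: one (key, {"text": ...}) pair
    # per line, with whitespace-only lines mapped to empty strings.
    with open(filepath, encoding="utf-8") as f:
        for key, line in enumerate(f):
            yield key, {"text": line if line.strip() else ""}

# Write a three-line sample file (the middle line is blank).
with tempfile.NamedTemporaryFile("w", suffix=".txt", delete=False, encoding="utf-8") as tmp:
    tmp.write("first line\n\nthird line\n")
    path = tmp.name

examples = [ex for _, ex in generate_line_examples(path)]
os.remove(path)

print(len(examples))           # 3
print(repr(examples[0]["text"]))  # 'first line\n' (trailing newline is kept)
print(repr(examples[1]["text"]))  # '' (blank line collapsed to empty string)
```

Note that non-blank lines keep their trailing newline, matching what the builder yields for the real dump.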