Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 0 additions & 2 deletions datasets/banking77/README.md
Original file line number Diff line number Diff line change
@@ -1,8 +1,6 @@
---
annotations_creators:
- expert-generated
extended:
- original
language_creators:
- expert-generated
languages:
Expand Down
6 changes: 3 additions & 3 deletions datasets/c4/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -70,11 +70,11 @@ This is the version prepared by AllenAI, hosted at this address: https://hugging
It comes in four variants:

- `en`: 305GB in JSON format
- `en.noblocklist`: 380GB in JSON format
- `en.noclean`: 2.3TB in JSON format
- `en_noblocklist`: 380GB in JSON format
- `en_noclean`: 2.3TB in JSON format
- `realnewslike`: 15GB in JSON format

The `en.noblocklist` variant is exactly the same as the `en` variant, except we turned off the so-called "badwords filter", which removes all documents that contain words from the lists at https://github.com/LDNOOBW/List-of-Dirty-Naughty-Obscene-and-Otherwise-Bad-Words.
The `en_noblocklist` variant is exactly the same as the `en` variant, except we turned off the so-called "badwords filter", which removes all documents that contain words from the lists at https://github.com/LDNOOBW/List-of-Dirty-Naughty-Obscene-and-Otherwise-Bad-Words.

### Supported Tasks and Leaderboards

Expand Down
8 changes: 4 additions & 4 deletions datasets/c4/c4.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,13 +31,13 @@

_URL = "https://github.com/allenai/allennlp/discussions/5056"

_VARIANTS = ["en", "realnewslike", "en.noblocklist", "en.noclean"]
_VARIANTS = ["en", "realnewslike", "en_noblocklist", "en_noclean"]

_N_SHARDS_PER_SPLIT = {
"en": {"train": 1024, "validation": 8},
"realnewslike": {"train": 512, "validation": 1},
"en.noblocklist": {"train": 1024, "validation": 8},
"en.noclean": {"train": 7168, "validation": 64},
"en_noblocklist": {"train": 1024, "validation": 8},
"en_noclean": {"train": 7168, "validation": 64},
}

_DATA_URL = "https://huggingface.co/datasets/allenai/c4/resolve/1ddc917116b730e1859edef32896ec5c16be51d0/{name}/c4-{split}.{index:05d}-of-{n_shards:05d}.json.gz"
Expand Down Expand Up @@ -68,7 +68,7 @@ def _split_generators(self, dl_manager):
for split in ["train", "validation"]:
n_shards = _N_SHARDS_PER_SPLIT[self.config.name][split]
data_urls[split] = [
_DATA_URL.format(name=self.config.name, split=split, index=index, n_shards=n_shards)
_DATA_URL.format(name=self.config.name.replace("_", "."), split=split, index=index, n_shards=n_shards)
for index in range(n_shards)
]
train_downloaded_files = dl_manager.download(data_urls["train"])
Expand Down
2 changes: 1 addition & 1 deletion datasets/c4/dataset_infos.json

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion datasets/cnn_dailymail/cnn_dailymail.py
Original file line number Diff line number Diff line change
Expand Up @@ -193,7 +193,7 @@ class CnnDailymail(datasets.GeneratorBasedBuilder):
"""CNN/DailyMail non-anonymized summarization dataset."""

BUILDER_CONFIGS = [
CnnDailymailConfig(name=str(version), description="Plain text", version=version)
CnnDailymailConfig(name=str(version).replace(".", "_"), description="Plain text", version=version)
for version in _SUPPORTED_VERSIONS
]

Expand Down
2 changes: 1 addition & 1 deletion datasets/cnn_dailymail/dataset_infos.json

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion datasets/common_gen/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ paperswithcode_id: commongen
pretty_name: CommonGen
task_ids:
- text2text-generation-other-concepts-to-text
tasks_categories:
task_categories:
- text2text-generation
---

Expand Down
8 changes: 4 additions & 4 deletions datasets/cos_e/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@ inference in a novel Commonsense Auto-Generated Explanation (CAGE) framework.

### Data Instances

#### v1.0
#### v1_0

- **Size of downloaded dataset files:** 4.10 MB
- **Size of the generated dataset:** 2.23 MB
Expand All @@ -77,7 +77,7 @@ An example of 'train' looks as follows.
}
```

#### v1.11
#### v1_11

- **Size of downloaded dataset files:** 6.23 MB
- **Size of the generated dataset:** 2.91 MB
Expand All @@ -99,15 +99,15 @@ An example of 'train' looks as follows.

The data fields are the same among all splits.

#### v1.0
#### v1_0
- `id`: a `string` feature.
- `question`: a `string` feature.
- `choices`: a `list` of `string` features.
- `answer`: a `string` feature.
- `abstractive_explanation`: a `string` feature.
- `extractive_explanation`: a `string` feature.

#### v1.11
#### v1_11
- `id`: a `string` feature.
- `question`: a `string` feature.
- `choices`: a `list` of `string` features.
Expand Down
14 changes: 7 additions & 7 deletions datasets/cos_e/cos_e.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,9 +58,9 @@ def _download_and_index_cqa(dl_manager, name):

downloaded_files = dl_manager.download_and_extract(
{
"cqa_train": _CQA_V1_11_URL_TRAIN if name == "v1.11" else _CQA_V1_0_URL_TRAIN,
"cqa_dev": _CQA_V1_11_URL_DEV if name == "v1.11" else _CQA_V1_0_URL_DEV,
"cqa_test": _CQA_V1_11_URL_TEST if name == "v1.11" else _CQA_V1_0_URL_TEST,
"cqa_train": _CQA_V1_11_URL_TRAIN if name == "v1_11" else _CQA_V1_0_URL_TRAIN,
"cqa_dev": _CQA_V1_11_URL_DEV if name == "v1_11" else _CQA_V1_0_URL_DEV,
"cqa_test": _CQA_V1_11_URL_TEST if name == "v1_11" else _CQA_V1_0_URL_TEST,
}
)

Expand Down Expand Up @@ -110,12 +110,12 @@ class CosE(datasets.GeneratorBasedBuilder):

BUILDER_CONFIGS = [
CosEConfig(
name="v1.0",
name="v1_0",
description="cos-e version 1.0",
version=datasets.Version("1.0.0", ""),
),
CosEConfig(
name="v1.11",
name="v1_11",
description="cos-e version 1.11",
version=datasets.Version("1.11.0", ""),
),
Expand Down Expand Up @@ -146,15 +146,15 @@ def _split_generators(self, dl_manager):
# want to _create_ the Cos-E dataset from scratch.
cqa_indexed = _download_and_index_cqa(dl_manager, self.config.name)

if self.config.name == "v1.11":
if self.config.name == "v1_11":
files = dl_manager.download_and_extract(
{
"dev": [_COS_E_URL + "v1.11/cose_dev_v1.11_processed.jsonl"],
"train": [_COS_E_URL + "v1.11/cose_train_v1.11_processed.jsonl"],
}
)

elif self.config.name == "v1.0":
elif self.config.name == "v1_0":
files = dl_manager.download_and_extract(
{
"dev": [_COS_E_URL + "v1.0/cose_dev_v1.0_processed.jsonl"],
Expand Down
2 changes: 1 addition & 1 deletion datasets/cos_e/dataset_infos.json
Original file line number Diff line number Diff line change
@@ -1 +1 @@
{"v1.0": {"description": "\nCommon Sense Explanations (CoS-E) allows for training language models to\nautomatically generate explanations that can be used during training and\ninference in a novel Commonsense Auto-Generated Explanation (CAGE) framework.\n", "citation": "\n@inproceedings{rajani2019explain,\n title = \"Explain Yourself! Leveraging Language models for Commonsense Reasoning\",\n author = \"Rajani, Nazneen Fatema and\n McCann, Bryan and\n Xiong, Caiming and\n Socher, Richard\",\n year=\"2019\",\n booktitle = \"Proceedings of the 2019 Conference of the Association for Computational Linguistics (ACL2019)\",\n url =\"https://arxiv.org/abs/1906.02361\"\n}\n", "homepage": "https://github.com/salesforce/cos-e", "license": "", "features": {"id": {"dtype": "string", "id": null, "_type": "Value"}, "question": {"dtype": "string", "id": null, "_type": "Value"}, "choices": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "answer": {"dtype": "string", "id": null, "_type": "Value"}, "abstractive_explanation": {"dtype": "string", "id": null, "_type": "Value"}, "extractive_explanation": {"dtype": "string", "id": null, "_type": "Value"}}, "supervised_keys": null, "builder_name": "cos_e", "config_name": "v1.0", "version": {"version_str": "1.0.0", "description": "", "datasets_version_to_prepare": null, "major": 1, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 2077517, "num_examples": 7610, "dataset_name": "cos_e"}, "validation": {"name": "validation", "num_bytes": 261887, "num_examples": 950, "dataset_name": "cos_e"}}, "download_checksums": {"https://raw.githubusercontent.com/salesforce/cos-e/master/data/v1.0/train_rand_split.jsonl": {"num_bytes": 2160200, "checksum": "1989ce97e24d8572113d6a18f44e0f11ee9d206fb9bf9a1133937645583e697e"}, "https://raw.githubusercontent.com/salesforce/cos-e/master/data/v1.0/dev_rand_split.jsonl": {"num_bytes": 268531, "checksum": "790dd2a8492e7f3b51ded04116de603115b7acaded32ea84f6a7101f9d571ac1"}, "https://raw.githubusercontent.com/salesforce/cos-e/master/data/v1.0/test_rand_split_no_answers.jsonl": {"num_bytes": 250752, "checksum": "b9c3d1319667ea1569be6f7b3ed0546bd8222d2f3a759f928307343a0282e190"}, "https://raw.githubusercontent.com/salesforce/cos-e/master/data/v1.0/cose_dev_v1.0_processed.jsonl": {"num_bytes": 182444, "checksum": "ab7b8ac91bca1a6ba798816af6aca703a739f576c919360ddc376d9d3046be53"}, "https://raw.githubusercontent.com/salesforce/cos-e/master/data/v1.0/cose_train_v1.0_processed.jsonl": {"num_bytes": 1433393, "checksum": "df9f83ac4891f38e0771470858d5f1c4b5bb08fee5c53f38f9df9b3d3675ea74"}}, "download_size": 4295320, "dataset_size": 2339404, "size_in_bytes": 6634724}, "v1.11": {"description": "\nCommon Sense Explanations (CoS-E) allows for training language models to\nautomatically generate explanations that can be used during training and\ninference in a novel Commonsense Auto-Generated Explanation (CAGE) framework.\n", "citation": "\n@inproceedings{rajani2019explain,\n title = \"Explain Yourself! Leveraging Language models for Commonsense Reasoning\",\n author = \"Rajani, Nazneen Fatema and\n McCann, Bryan and\n Xiong, Caiming and\n Socher, Richard\",\n year=\"2019\",\n booktitle = \"Proceedings of the 2019 Conference of the Association for Computational Linguistics (ACL2019)\",\n url =\"https://arxiv.org/abs/1906.02361\"\n}\n", "homepage": "https://github.com/salesforce/cos-e", "license": "", "features": {"id": {"dtype": "string", "id": null, "_type": "Value"}, "question": {"dtype": "string", "id": null, "_type": "Value"}, "choices": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "answer": {"dtype": "string", "id": null, "_type": "Value"}, "abstractive_explanation": {"dtype": "string", "id": null, "_type": "Value"}, "extractive_explanation": {"dtype": "string", "id": null, "_type": "Value"}}, "supervised_keys": null, "builder_name": "cos_e", "config_name": "v1.11", "version": {"version_str": "1.11.0", "description": "", "datasets_version_to_prepare": null, "major": 1, "minor": 11, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 2717420, "num_examples": 9741, "dataset_name": "cos_e"}, "validation": {"name": "validation", "num_bytes": 331760, "num_examples": 1221, "dataset_name": "cos_e"}}, "download_checksums": {"https://s3.amazonaws.com/commensenseqa/train_rand_split.jsonl": {"num_bytes": 3785890, "checksum": "58ffa3c8472410e24b8c43f423d89c8a003d8284698a6ed7874355dedd09a2fb"}, "https://s3.amazonaws.com/commensenseqa/dev_rand_split.jsonl": {"num_bytes": 471653, "checksum": "3210497fdaae614ac085d9eb873dd7f4d49b6f965a93adadc803e1229fd8a02a"}, "https://s3.amazonaws.com/commensenseqa/test_rand_split_no_answers.jsonl": {"num_bytes": 423148, "checksum": "b426896d71a9cd064cf01cfaf6e920817c51701ef66028883ac1af2e73ad5f29"}, "https://raw.githubusercontent.com/salesforce/cos-e/master/data/v1.11/cose_dev_v1.11_processed.jsonl": {"num_bytes": 200867, "checksum": "a8367c94901ba249e48bcec76eaff9e7b91cec0f0e4d94879975d7d1b952bc41"}, "https://raw.githubusercontent.com/salesforce/cos-e/master/data/v1.11/cose_train_v1.11_processed.jsonl": {"num_bytes": 1653976, "checksum": "4c0ccfd34243cf7af62b441643437769663edcb980b991487f766b97a547e9bd"}}, "download_size": 6535534, "dataset_size": 3049180, "size_in_bytes": 9584714}}
{"v1_0": {"description": "\nCommon Sense Explanations (CoS-E) allows for training language models to\nautomatically generate explanations that can be used during training and\ninference in a novel Commonsense Auto-Generated Explanation (CAGE) framework.\n", "citation": "\n@inproceedings{rajani2019explain,\n title = \"Explain Yourself! Leveraging Language models for Commonsense Reasoning\",\n author = \"Rajani, Nazneen Fatema and\n McCann, Bryan and\n Xiong, Caiming and\n Socher, Richard\",\n year=\"2019\",\n booktitle = \"Proceedings of the 2019 Conference of the Association for Computational Linguistics (ACL2019)\",\n url =\"https://arxiv.org/abs/1906.02361\"\n}\n", "homepage": "https://github.com/salesforce/cos-e", "license": "", "features": {"id": {"dtype": "string", "id": null, "_type": "Value"}, "question": {"dtype": "string", "id": null, "_type": "Value"}, "choices": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "answer": {"dtype": "string", "id": null, "_type": "Value"}, "abstractive_explanation": {"dtype": "string", "id": null, "_type": "Value"}, "extractive_explanation": {"dtype": "string", "id": null, "_type": "Value"}}, "supervised_keys": null, "builder_name": "cos_e", "config_name": "v1_0", "version": {"version_str": "1.0.0", "description": "", "datasets_version_to_prepare": null, "major": 1, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 2077517, "num_examples": 7610, "dataset_name": "cos_e"}, "validation": {"name": "validation", "num_bytes": 261887, "num_examples": 950, "dataset_name": "cos_e"}}, "download_checksums": {"https://raw.githubusercontent.com/salesforce/cos-e/master/data/v1.0/train_rand_split.jsonl": {"num_bytes": 2160200, "checksum": "1989ce97e24d8572113d6a18f44e0f11ee9d206fb9bf9a1133937645583e697e"}, "https://raw.githubusercontent.com/salesforce/cos-e/master/data/v1.0/dev_rand_split.jsonl": {"num_bytes": 268531, "checksum": "790dd2a8492e7f3b51ded04116de603115b7acaded32ea84f6a7101f9d571ac1"}, "https://raw.githubusercontent.com/salesforce/cos-e/master/data/v1.0/test_rand_split_no_answers.jsonl": {"num_bytes": 250752, "checksum": "b9c3d1319667ea1569be6f7b3ed0546bd8222d2f3a759f928307343a0282e190"}, "https://raw.githubusercontent.com/salesforce/cos-e/master/data/v1.0/cose_dev_v1.0_processed.jsonl": {"num_bytes": 182444, "checksum": "ab7b8ac91bca1a6ba798816af6aca703a739f576c919360ddc376d9d3046be53"}, "https://raw.githubusercontent.com/salesforce/cos-e/master/data/v1.0/cose_train_v1.0_processed.jsonl": {"num_bytes": 1433393, "checksum": "df9f83ac4891f38e0771470858d5f1c4b5bb08fee5c53f38f9df9b3d3675ea74"}}, "download_size": 4295320, "dataset_size": 2339404, "size_in_bytes": 6634724}, "v1_11": {"description": "\nCommon Sense Explanations (CoS-E) allows for training language models to\nautomatically generate explanations that can be used during training and\ninference in a novel Commonsense Auto-Generated Explanation (CAGE) framework.\n", "citation": "\n@inproceedings{rajani2019explain,\n title = \"Explain Yourself! Leveraging Language models for Commonsense Reasoning\",\n author = \"Rajani, Nazneen Fatema and\n McCann, Bryan and\n Xiong, Caiming and\n Socher, Richard\",\n year=\"2019\",\n booktitle = \"Proceedings of the 2019 Conference of the Association for Computational Linguistics (ACL2019)\",\n url =\"https://arxiv.org/abs/1906.02361\"\n}\n", "homepage": "https://github.com/salesforce/cos-e", "license": "", "features": {"id": {"dtype": "string", "id": null, "_type": "Value"}, "question": {"dtype": "string", "id": null, "_type": "Value"}, "choices": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "answer": {"dtype": "string", "id": null, "_type": "Value"}, "abstractive_explanation": {"dtype": "string", "id": null, "_type": "Value"}, "extractive_explanation": {"dtype": "string", "id": null, "_type": "Value"}}, "supervised_keys": null, "builder_name": "cos_e", "config_name": "v1_11", "version": {"version_str": "1.11.0", "description": "", "datasets_version_to_prepare": null, "major": 1, "minor": 11, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 2717420, "num_examples": 9741, "dataset_name": "cos_e"}, "validation": {"name": "validation", "num_bytes": 331760, "num_examples": 1221, "dataset_name": "cos_e"}}, "download_checksums": {"https://s3.amazonaws.com/commensenseqa/train_rand_split.jsonl": {"num_bytes": 3785890, "checksum": "58ffa3c8472410e24b8c43f423d89c8a003d8284698a6ed7874355dedd09a2fb"}, "https://s3.amazonaws.com/commensenseqa/dev_rand_split.jsonl": {"num_bytes": 471653, "checksum": "3210497fdaae614ac085d9eb873dd7f4d49b6f965a93adadc803e1229fd8a02a"}, "https://s3.amazonaws.com/commensenseqa/test_rand_split_no_answers.jsonl": {"num_bytes": 423148, "checksum": "b426896d71a9cd064cf01cfaf6e920817c51701ef66028883ac1af2e73ad5f29"}, "https://raw.githubusercontent.com/salesforce/cos-e/master/data/v1.11/cose_dev_v1.11_processed.jsonl": {"num_bytes": 200867, "checksum": "a8367c94901ba249e48bcec76eaff9e7b91cec0f0e4d94879975d7d1b952bc41"}, "https://raw.githubusercontent.com/salesforce/cos-e/master/data/v1.11/cose_train_v1.11_processed.jsonl": {"num_bytes": 1653976, "checksum": "4c0ccfd34243cf7af62b441643437769663edcb980b991487f766b97a547e9bd"}}, "download_size": 6535534, "dataset_size": 3049180, "size_in_bytes": 9584714}}
4 changes: 2 additions & 2 deletions datasets/evidence_infer_treatment/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -1275,8 +1275,8 @@ We have recently collected additional data for this task (https://arxiv.org/abs/

| name | train | validation | test |
|------|------:|-----------:|-----:|
| 1.1 | 1931 | 248 | 240 |
| 2.0 | 2690 | 340 | 334 |
| 1_1 | 1931 | 248 | 240 |
| 2_0 | 2690 | 340 | 334 |

## Dataset Creation

Expand Down
Loading