38 commits
a62535f
wip
lhoestq Aug 31, 2022
2e47856
fix Features yaml
lhoestq Sep 2, 2022
cb2c650
splits to yaml
lhoestq Sep 2, 2022
192f23c
add _to_yaml_list
lhoestq Sep 2, 2022
e91eca6
style
lhoestq Sep 2, 2022
c250545
example: conll2000
lhoestq Sep 2, 2022
2d53abe
example: crime_and_punish
lhoestq Sep 2, 2022
e0bb069
add pyyaml dependency
lhoestq Sep 2, 2022
216bb7e
remove unused imports
lhoestq Sep 2, 2022
887d514
remove validation tests
lhoestq Sep 2, 2022
5e583ef
style
lhoestq Sep 2, 2022
247e3cf
allow dataset_infos to be struct or list in YAML
lhoestq Sep 2, 2022
0418808
fix test
lhoestq Sep 2, 2022
4e8912e
style
lhoestq Sep 2, 2022
52322cc
update "datasets-cli test" + remove "version"
lhoestq Sep 5, 2022
9ef750d
remove config definitions in conll2000 and crime_and_punish
lhoestq Sep 5, 2022
fce2cbb
remove versions for conll2000 and crime_and_punish
lhoestq Sep 5, 2022
9adee79
move conll2000 and cap dummy data
lhoestq Sep 5, 2022
8794104
Merge branch 'main' into dataset_infos-in-yaml
lhoestq Sep 5, 2022
c52f40f
fix test
lhoestq Sep 5, 2022
3d066ad
add tests
lhoestq Sep 7, 2022
92fed44
comments and tests
lhoestq Sep 7, 2022
3ca5026
more test
lhoestq Sep 7, 2022
a53cc05
don't mention the dataset_infos.json file in docs
lhoestq Sep 7, 2022
25a617c
Merge branch 'main' into dataset_infos-in-yaml
lhoestq Sep 9, 2022
08b64ae
Merge branch 'main' into dataset_infos-in-yaml
lhoestq Sep 12, 2022
3c940ca
nit in docs
lhoestq Sep 12, 2022
adfbc76
Merge branch 'main' into dataset_infos-in-yaml
lhoestq Sep 22, 2022
5d7249e
docs
lhoestq Sep 22, 2022
7740a65
dataset_infos -> dataset_info
lhoestq Sep 22, 2022
360c5ae
again
lhoestq Sep 22, 2022
1e5fc45
use id2label in class_label
lhoestq Sep 22, 2022
da414e6
update conll2000
lhoestq Sep 22, 2022
a1df085
fix utf-8 yaml dump
lhoestq Sep 23, 2022
040102f
create all YAML dataset_info
lhoestq Sep 23, 2022
0129d93
Merge branch 'main' into create-all-yaml-dataset_info
lhoestq Oct 3, 2022
d9b673b
update
lhoestq Oct 3, 2022
9f4e74e
fix update
lhoestq Oct 3, 2022
4 changes: 1 addition & 3 deletions ADD_NEW_DATASET.md
@@ -166,8 +166,7 @@ Sometimes you need to use several *configurations* and/or *splits* (usually at l

#### Tests (optional)

To check that your dataset works correctly and to create its `dataset_info` metadata in the dataset card, run the command:

To check that your dataset works correctly and to create its `dataset_infos` metadata in the dataset card, run the command:

```bash
datasets-cli test datasets/<your-dataset-folder> --save_info --all_configs
```
@@ -238,7 +237,6 @@ Now that your dataset script runs and creates a dataset with the format you expec

This first command should create a `README.md` file containing the metadata if this file doesn't exist already, or add the metadata to an existing `README.md` file in your dataset folder.
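As an illustration of what that metadata looks like once written, here is a minimal sketch of reading the front matter back. It assumes PyYAML is installed, a single-config card (so `dataset_info` is a mapping rather than a list), and uses the `acronym_identification` path purely as an example.

```python
# Illustrative sketch: read back the YAML front matter that
# `datasets-cli test ... --save_info` writes into the dataset card.
import yaml  # PyYAML, added as a dependency in this PR

with open("datasets/acronym_identification/README.md", encoding="utf-8") as f:
    card = f.read()

# The metadata sits between the first two `---` markers of the card.
front_matter = card.split("---")[1]
metadata = yaml.safe_load(front_matter)

# For single-config datasets, dataset_info is a mapping with features, splits and sizes.
info = metadata["dataset_info"]
for split in info["splits"]:
    print(split["name"], split["num_examples"], split["num_bytes"])
print("download_size:", info["download_size"], "dataset_size:", info["dataset_size"])
```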


You have now finished the coding part, congratulations! 🎉 You are Awesome! 😎

Note: You can use the CLI tool from the root of the repository with the following command:
29 changes: 28 additions & 1 deletion datasets/acronym_identification/README.md
@@ -28,6 +28,33 @@ train-eval-index:
  col_mapping:
    tokens: tokens
    labels: tags
dataset_info:
  features:
  - name: id
    dtype: string
  - name: tokens
    sequence: string
  - name: labels
    sequence:
      class_label:
        names:
          0: B-long
          1: B-short
          2: I-long
          3: I-short
          4: O
  splits:
  - name: test
    num_bytes: 987728
    num_examples: 1750
  - name: train
    num_bytes: 7792803
    num_examples: 14006
  - name: validation
    num_bytes: 952705
    num_examples: 1717
  download_size: 8556464
  dataset_size: 9733236
---

# Dataset Card for Acronym Identification Dataset
@@ -212,4 +239,4 @@ The dataset provided for this shared task is licensed under CC BY-NC-SA 4.0 inte

### Contributions

Thanks to [@abhishekkrthakur](https://github.com/abhishekkrthakur) for adding this dataset.
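The same metadata is also reachable through the library itself; a rough sketch, assuming `acronym_identification` resolves locally or on the Hub:

```python
from datasets import load_dataset_builder

# Inspect the dataset's metadata without downloading the data itself.
builder = load_dataset_builder("acronym_identification")
print(builder.info.features)  # id, tokens, and labels (ClassLabel with the 5 BIO tags above)
print(builder.info.splits)    # expected: train/validation/test with the sizes listed above
```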
2 changes: 1 addition & 1 deletion datasets/acronym_identification/dataset_infos.json
@@ -1 +1 @@
{"default": {"description": "Acronym identification training and development sets for the acronym identification task at SDU@AAAI-21.\n", "citation": "@inproceedings{veyseh-et-al-2020-what,\n title={{What Does This Acronym Mean? Introducing a New Dataset for Acronym Identification and Disambiguation}},\n author={Amir Pouran Ben Veyseh and Franck Dernoncourt and Quan Hung Tran and Thien Huu Nguyen},\n year={2020},\n booktitle={Proceedings of COLING},\n link={https://arxiv.org/pdf/2010.14678v1.pdf}\n}\n", "homepage": "https://github.com/amirveyseh/AAAI-21-SDU-shared-task-1-AI", "license": "", "features": {"id": {"dtype": "string", "id": null, "_type": "Value"}, "tokens": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "labels": {"feature": {"num_classes": 5, "names": ["B-long", "B-short", "I-long", "I-short", "O"], "names_file": null, "id": null, "_type": "ClassLabel"}, "length": -1, "id": null, "_type": "Sequence"}}, "post_processed": null, "supervised_keys": null, "builder_name": "acronym_identification", "config_name": "default", "version": {"version_str": "1.0.0", "description": null, "major": 1, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 7792803, "num_examples": 14006, "dataset_name": "acronym_identification"}, "validation": {"name": "validation", "num_bytes": 952705, "num_examples": 1717, "dataset_name": "acronym_identification"}, "test": {"name": "test", "num_bytes": 987728, "num_examples": 1750, "dataset_name": "acronym_identification"}}, "download_checksums": {"https://raw.githubusercontent.com/amirveyseh/AAAI-21-SDU-shared-task-1-AI/master/dataset/train.json": {"num_bytes": 7134043, "checksum": "2a48182187235167e8cbfa71e13c5c9882c4cabdefd2148edace2a50ccd8bbcd"}, "https://raw.githubusercontent.com/amirveyseh/AAAI-21-SDU-shared-task-1-AI/master/dataset/dev.json": {"num_bytes": 873517, "checksum": "950000511ddab850170c85ae99c7ceb775e8bed6846482e06e47a8f99b16f8c2"}, "https://raw.githubusercontent.com/amirveyseh/AAAI-21-SDU-shared-task-1-AI/master/dataset/test.json": {"num_bytes": 548904, "checksum": "5a37584eaa56ac23ffef23de7109d07bac6a19928eda96184348d89a01c82671"}}, "download_size": 8556464, "post_processing_size": null, "dataset_size": 9733236, "size_in_bytes": 18289700}}
{"default": {"description": "Acronym identification training and development sets for the acronym identification task at SDU@AAAI-21.\n", "citation": "@inproceedings{veyseh-et-al-2020-what,\n title={{What Does This Acronym Mean? Introducing a New Dataset for Acronym Identification and Disambiguation}},\n author={Amir Pouran Ben Veyseh and Franck Dernoncourt and Quan Hung Tran and Thien Huu Nguyen},\n year={2020},\n booktitle={Proceedings of COLING},\n link={https://arxiv.org/pdf/2010.14678v1.pdf}\n}\n", "homepage": "https://github.com/amirveyseh/AAAI-21-SDU-shared-task-1-AI", "license": "", "features": {"id": {"dtype": "string", "_type": "Value"}, "tokens": {"feature": {"dtype": "string", "_type": "Value"}, "_type": "Sequence"}, "labels": {"feature": {"names": ["B-long", "B-short", "I-long", "I-short", "O"], "_type": "ClassLabel"}, "_type": "Sequence"}}, "builder_name": "acronym_identification", "config_name": "default", "version": {"version_str": "1.0.0", "major": 1, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 7792803, "num_examples": 14006, "dataset_name": "acronym_identification"}, "validation": {"name": "validation", "num_bytes": 952705, "num_examples": 1717, "dataset_name": "acronym_identification"}, "test": {"name": "test", "num_bytes": 987728, "num_examples": 1750, "dataset_name": "acronym_identification"}}, "download_checksums": {"https://raw.githubusercontent.com/amirveyseh/AAAI-21-SDU-shared-task-1-AI/master/dataset/train.json": {"num_bytes": 7134043, "checksum": "2a48182187235167e8cbfa71e13c5c9882c4cabdefd2148edace2a50ccd8bbcd"}, "https://raw.githubusercontent.com/amirveyseh/AAAI-21-SDU-shared-task-1-AI/master/dataset/dev.json": {"num_bytes": 873517, "checksum": "950000511ddab850170c85ae99c7ceb775e8bed6846482e06e47a8f99b16f8c2"}, "https://raw.githubusercontent.com/amirveyseh/AAAI-21-SDU-shared-task-1-AI/master/dataset/test.json": {"num_bytes": 548904, "checksum": "5a37584eaa56ac23ffef23de7109d07bac6a19928eda96184348d89a01c82671"}}, "download_size": 8556464, "dataset_size": 9733236, "size_in_bytes": 18289700}}
151 changes: 112 additions & 39 deletions datasets/ade_corpus_v2/README.md
@@ -33,48 +33,121 @@ train-eval-index:
    text: text
    label: target
  metrics:
  - type: accuracy
    name: Accuracy
  - type: f1
    name: F1 macro
    args:
      average: macro
  - type: f1
    name: F1 micro
    args:
      average: micro
  - type: f1
    name: F1 weighted
    args:
      average: weighted
  - type: precision
    name: Precision macro
    args:
      average: macro
  - type: precision
    name: Precision micro
    args:
      average: micro
  - type: precision
    name: Precision weighted
    args:
      average: weighted
  - type: recall
    name: Recall macro
    args:
      average: macro
  - type: recall
    name: Recall micro
    args:
      average: micro
  - type: recall
    name: Recall weighted
    args:
      average: weighted
configs:
- Ade_corpus_v2_classification
- Ade_corpus_v2_drug_ade_relation
- Ade_corpus_v2_drug_dosage_relation
dataset_info:
- config_name: Ade_corpus_v2_classification
  features:
  - name: text
    dtype: string
  - name: label
    dtype:
      class_label:
        names:
          0: Not-Related
          1: Related
  splits:
  - name: train
    num_bytes: 3403711
    num_examples: 23516
  download_size: 3791162
  dataset_size: 3403711
- config_name: Ade_corpus_v2_drug_ade_relation
  features:
  - name: text
    dtype: string
  - name: drug
    dtype: string
  - name: effect
    dtype: string
  - name: indexes
    struct:
    - name: drug
      sequence:
      - name: start_char
        dtype: int32
      - name: end_char
        dtype: int32
    - name: effect
      sequence:
      - name: start_char
        dtype: int32
      - name: end_char
        dtype: int32
  splits:
  - name: train
    num_bytes: 1546021
    num_examples: 6821
  download_size: 3791162
  dataset_size: 1546021
- config_name: Ade_corpus_v2_drug_dosage_relation
  features:
  - name: text
    dtype: string
  - name: drug
    dtype: string
  - name: dosage
    dtype: string
  - name: indexes
    struct:
    - name: drug
      sequence:
      - name: start_char
        dtype: int32
      - name: end_char
        dtype: int32
    - name: dosage
      sequence:
      - name: start_char
        dtype: int32
      - name: end_char
        dtype: int32
  splits:
  - name: train
    num_bytes: 64725
    num_examples: 279
  download_size: 3791162
  dataset_size: 64725
---
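Each entry under `dataset_info` above corresponds to one loadable configuration; a usage sketch, assuming the dataset can be downloaded:

```python
from datasets import load_dataset

# Pick one of the configs declared in the card's front matter.
ds = load_dataset("ade_corpus_v2", "Ade_corpus_v2_classification", split="train")
print(ds.features["label"].names)  # expected: ['Not-Related', 'Related']
print(len(ds))                     # expected: 23516, matching num_examples above
```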

# Dataset Card for Adverse Drug Reaction Data v2
@@ -293,4 +366,4 @@ abstract = "A significant amount of information about drug-related safety issues

### Contributions

Thanks to [@Nilanshrajput](https://github.com/Nilanshrajput), [@lhoestq](https://github.com/lhoestq) for adding this dataset.