huggingface
diff --git a/‎.circleci/config.yml‎
Lines changed: 0 additions & 107 deletions b/‎.circleci/config.yml‎
Lines changed: 0 additions & 107 deletions
diff --git a/‎.circleci/deploy.sh‎
Lines changed: 0 additions & 81 deletions b/‎.circleci/deploy.sh‎
Lines changed: 0 additions & 81 deletions
diff --git a/‎.github/workflows/ci.yml‎
Lines changed: 79 additions & 0 deletions b/‎.github/workflows/ci.yml‎
Lines changed: 79 additions & 0 deletions
diff --git a/‎.github/workflows/test-audio.yml‎
Lines changed: 0 additions & 30 deletions b/‎.github/workflows/test-audio.yml‎
Lines changed: 0 additions & 30 deletions
diff --git a/‎ADD_NEW_DATASET.md‎
Lines changed: 2 additions & 0 deletions b/‎ADD_NEW_DATASET.md‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎datasets/aeslc/README.md‎
Lines changed: 7 additions & 0 deletions b/‎datasets/aeslc/README.md‎
Lines changed: 7 additions & 0 deletions
diff --git a/‎datasets/conll2003/README.md‎
Lines changed: 1 addition & 1 deletion b/‎datasets/conll2003/README.md‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎datasets/conll2003/conll2003.py‎
Lines changed: 8 additions & 7 deletions b/‎datasets/conll2003/conll2003.py‎
Lines changed: 8 additions & 7 deletions
diff --git a/‎datasets/conll2003/dataset_infos.json‎
Lines changed: 1 addition & 1 deletion b/‎datasets/conll2003/dataset_infos.json‎
Lines changed: 1 addition & 1 deletion
@@ -0,0 +1,79 @@
+name: CI
+
+on:
+  pull_request:
+    branches:
+      - main
+  push:
+    branches:
+      - main
+
+env:
+  HF_SCRIPTS_VERSION: main
+  HF_ALLOW_CODE_EVAL: 1
+
+jobs:
+
+  check_code_quality:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v3
+      - name: Set up Python
+        uses: actions/setup-python@v4
+        with:
+          python-version: "3.6"
+      - name: Install dependencies
+        run: |
+          python -m pip install --upgrade pip
+          pip install .[quality]
+      - name: Check quality
+        run: |
+          black --check --line-length 119 --target-version py36 tests src benchmarks datasets metrics
+          isort --check-only tests src benchmarks datasets metrics
+          flake8 tests src benchmarks datasets metrics
+
+  test:
+    needs: check_code_quality
+    strategy:
+      fail-fast: false
+      matrix:
+        os: [ubuntu-latest, windows-latest]
+        pyarrow_version: [latest, 6.0.1]
+    runs-on: ${{ matrix.os }}
+    steps:
+      - name: Install OS dependencies
+        if: ${{ matrix.os == 'ubuntu-latest' }}
+        run: |
+          sudo apt-get -y update
+          sudo apt-get -y install libsndfile1 sox
+      - uses: actions/checkout@v3
+        with:
+          fetch-depth: 0
+      - name: Set up Python 3.6
+        if: ${{ matrix.os == 'ubuntu-latest' }}
+        uses: actions/setup-python@v4
+        with:
+          python-version: 3.6
+      - name: Set up Python 3.7
+        if: ${{ matrix.os == 'windows-latest' }}
+        uses: actions/setup-python@v4
+        with:
+          python-version: 3.7
+      - name: Upgrade pip
+        run: python -m pip install --upgrade pip
+      - name: Pin setuptools-scm
+        if: ${{ matrix.os == 'ubuntu-latest' }}
+        run: echo "installing pinned version of setuptools-scm to fix seqeval installation on 3.6" && pip install "setuptools-scm==6.4.2"
+      - name: Install dependencies
+        run: |
+          pip install .[tests]
+          pip install -r additional-tests-requirements.txt --no-deps
+      - name: Install latest PyArrow
+        if: ${{ matrix.pyarrow_version == 'latest' }}
+        run: pip install pyarrow --upgrade
+      - name: Install PyArrow ${{ matrix.pyarrow_version }}
+        if: ${{ matrix.pyarrow_version != 'latest' }}
+        run: pip install pyarrow==${{ matrix.pyarrow_version }}
+      - name: Test with pytest
+        run: |
+          python -m pytest -n 2 --dist loadfile -sv ./tests/
@@ -134,6 +134,8 @@ You can also start (or copy any part) from one of the datasets of reference list
 - multilingual: [xquad](https://github.com/huggingface/datasets/blob/main/datasets/xquad/xquad.py) (original data are in json)
 - multitask: [matinf](https://github.com/huggingface/datasets/blob/main/datasets/matinf/matinf.py) (original data need to be downloaded by the user because it requires authentication)
 - speech recognition: [librispeech_asr](https://github.com/huggingface/datasets/blob/main/datasets/librispeech_asr/librispeech_asr.py) (original data is in .flac format)
+- image classification: [beans](https://github.com/huggingface/datasets/blob/main/datasets/beans/beans.py) (original data are in .jpg format)
+- object detection: [wider_face](https://github.com/huggingface/datasets/blob/main/datasets/wider_face/wider_face.py) (image files are in .jpg format and metadata come from text files)
 
 While you are developing the dataset script you can list test it by opening a python interpreter and running the script (the script is dynamically updated each time you modify it):
 
 
@@ -3,6 +3,13 @@ language:
 - en
 paperswithcode_id: aeslc
 pretty_name: AESLC
+task_categories:
+- summarization
+task_ids:
+- summarization-other-email-headline-generation
+- summarization-other-conversations-summarization
+- summarization-other-multi-document-summarization
+- summarization-other-aspect-based-summarization
 ---
 
 # Dataset Card for "aeslc"
 
@@ -17,7 +17,7 @@ task_categories:
 - token-classification
 task_ids:
 - named-entity-recognition
-- part-of-speech-tagging
+- part-of-speech
 paperswithcode_id: conll-2003
 pretty_name: CoNLL-2003
 train-eval-index:
 
@@ -234,10 +234,11 @@ def _generate_examples(self, filepath):
                     chunk_tags.append(splits[2])
                     ner_tags.append(splits[3].rstrip())
             # last example
-            yield guid, {
-                "id": str(guid),
-                "tokens": tokens,
-                "pos_tags": pos_tags,
-                "chunk_tags": chunk_tags,
-                "ner_tags": ner_tags,
-            }
+            if tokens:
+                yield guid, {
+                    "id": str(guid),
+                    "tokens": tokens,
+                    "pos_tags": pos_tags,
+                    "chunk_tags": chunk_tags,
+                    "ner_tags": ner_tags,
+                }
@@ -1 +1 @@
-{"conll2003": {"description": "The shared task of CoNLL-2003 concerns language-independent named entity recognition. We will concentrate on\nfour types of named entities: persons, locations, organizations and names of miscellaneous entities that do\nnot belong to the previous three groups.\n\nThe CoNLL-2003 shared task data files contain four columns separated by a single space. Each word has been put on\na separate line and there is an empty line after each sentence. The first item on each line is a word, the second\na part-of-speech (POS) tag, the third a syntactic chunk tag and the fourth the named entity tag. The chunk tags\nand the named entity tags have the format I-TYPE which means that the word is inside a phrase of type TYPE. Only\nif two phrases of the same type immediately follow each other, the first word of the second phrase will have tag\nB-TYPE to show that it starts a new phrase. A word with tag O is not part of a phrase. Note the dataset uses IOB2\ntagging scheme, whereas the original dataset uses IOB1.\n\nFor more details see https://www.clips.uantwerpen.be/conll2003/ner/ and https://www.aclweb.org/anthology/W03-0419\n", "citation": "@inproceedings{tjong-kim-sang-de-meulder-2003-introduction,\n    title = \"Introduction to the {C}o{NLL}-2003 Shared Task: Language-Independent Named Entity Recognition\",\n    author = \"Tjong Kim Sang, Erik F.  and\n      De Meulder, Fien\",\n    booktitle = \"Proceedings of the Seventh Conference on Natural Language Learning at {HLT}-{NAACL} 2003\",\n    year = \"2003\",\n    url = \"https://www.aclweb.org/anthology/W03-0419\",\n    pages = \"142--147\",\n}\n", "homepage": "https://www.aclweb.org/anthology/W03-0419/", "license": "", "features": {"id": {"dtype": "string", "id": null, "_type": "Value"}, "tokens": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "pos_tags": {"feature": {"num_classes": 47, "names": ["\"", "''", "#", "$", "(", ")", ",", ".", ":", "``", "CC", "CD", "DT", "EX", "FW", "IN", "JJ", "JJR", "JJS", "LS", "MD", "NN", "NNP", "NNPS", "NNS", "NN|SYM", "PDT", "POS", "PRP", "PRP$", "RB", "RBR", "RBS", "RP", "SYM", "TO", "UH", "VB", "VBD", "VBG", "VBN", "VBP", "VBZ", "WDT", "WP", "WP$", "WRB"], "names_file": null, "id": null, "_type": "ClassLabel"}, "length": -1, "id": null, "_type": "Sequence"}, "chunk_tags": {"feature": {"num_classes": 23, "names": ["O", "B-ADJP", "I-ADJP", "B-ADVP", "I-ADVP", "B-CONJP", "I-CONJP", "B-INTJ", "I-INTJ", "B-LST", "I-LST", "B-NP", "I-NP", "B-PP", "I-PP", "B-PRT", "I-PRT", "B-SBAR", "I-SBAR", "B-UCP", "I-UCP", "B-VP", "I-VP"], "names_file": null, "id": null, "_type": "ClassLabel"}, "length": -1, "id": null, "_type": "Sequence"}, "ner_tags": {"feature": {"num_classes": 9, "names": ["O", "B-PER", "I-PER", "B-ORG", "I-ORG", "B-LOC", "I-LOC", "B-MISC", "I-MISC"], "names_file": null, "id": null, "_type": "ClassLabel"}, "length": -1, "id": null, "_type": "Sequence"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "conll2003", "config_name": "conll2003", "version": {"version_str": "1.0.0", "description": null, "major": 1, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 6931418, "num_examples": 14042, "dataset_name": "conll2003"}, "validation": {"name": "validation", "num_bytes": 1739271, "num_examples": 3251, "dataset_name": "conll2003"}, "test": {"name": "test", "num_bytes": 1582102, "num_examples": 3454, "dataset_name": "conll2003"}}, "download_checksums": {"https://data.deepai.org/conll2003.zip": {"num_bytes": 982975, "checksum": "96a104d174ddae7558bab603f19382c5fe02ff1da5c077a7f3ce2ced1578a2c3"}}, "download_size": 982975, "post_processing_size": null, "dataset_size": 10252791, "size_in_bytes": 11235766}}
+{"conll2003": {"description": "The shared task of CoNLL-2003 concerns language-independent named entity recognition. We will concentrate on\nfour types of named entities: persons, locations, organizations and names of miscellaneous entities that do\nnot belong to the previous three groups.\n\nThe CoNLL-2003 shared task data files contain four columns separated by a single space. Each word has been put on\na separate line and there is an empty line after each sentence. The first item on each line is a word, the second\na part-of-speech (POS) tag, the third a syntactic chunk tag and the fourth the named entity tag. The chunk tags\nand the named entity tags have the format I-TYPE which means that the word is inside a phrase of type TYPE. Only\nif two phrases of the same type immediately follow each other, the first word of the second phrase will have tag\nB-TYPE to show that it starts a new phrase. A word with tag O is not part of a phrase. Note the dataset uses IOB2\ntagging scheme, whereas the original dataset uses IOB1.\n\nFor more details see https://www.clips.uantwerpen.be/conll2003/ner/ and https://www.aclweb.org/anthology/W03-0419\n", "citation": "@inproceedings{tjong-kim-sang-de-meulder-2003-introduction,\n    title = \"Introduction to the {C}o{NLL}-2003 Shared Task: Language-Independent Named Entity Recognition\",\n    author = \"Tjong Kim Sang, Erik F.  and\n      De Meulder, Fien\",\n    booktitle = \"Proceedings of the Seventh Conference on Natural Language Learning at {HLT}-{NAACL} 2003\",\n    year = \"2003\",\n    url = \"https://www.aclweb.org/anthology/W03-0419\",\n    pages = \"142--147\",\n}\n", "homepage": "https://www.aclweb.org/anthology/W03-0419/", "license": "", "features": {"id": {"dtype": "string", "id": null, "_type": "Value"}, "tokens": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "pos_tags": {"feature": {"num_classes": 47, "names": ["\"", "''", "#", "$", "(", ")", ",", ".", ":", "``", "CC", "CD", "DT", "EX", "FW", "IN", "JJ", "JJR", "JJS", "LS", "MD", "NN", "NNP", "NNPS", "NNS", "NN|SYM", "PDT", "POS", "PRP", "PRP$", "RB", "RBR", "RBS", "RP", "SYM", "TO", "UH", "VB", "VBD", "VBG", "VBN", "VBP", "VBZ", "WDT", "WP", "WP$", "WRB"], "id": null, "_type": "ClassLabel"}, "length": -1, "id": null, "_type": "Sequence"}, "chunk_tags": {"feature": {"num_classes": 23, "names": ["O", "B-ADJP", "I-ADJP", "B-ADVP", "I-ADVP", "B-CONJP", "I-CONJP", "B-INTJ", "I-INTJ", "B-LST", "I-LST", "B-NP", "I-NP", "B-PP", "I-PP", "B-PRT", "I-PRT", "B-SBAR", "I-SBAR", "B-UCP", "I-UCP", "B-VP", "I-VP"], "id": null, "_type": "ClassLabel"}, "length": -1, "id": null, "_type": "Sequence"}, "ner_tags": {"feature": {"num_classes": 9, "names": ["O", "B-PER", "I-PER", "B-ORG", "I-ORG", "B-LOC", "I-LOC", "B-MISC", "I-MISC"], "id": null, "_type": "ClassLabel"}, "length": -1, "id": null, "_type": "Sequence"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "conll2003", "config_name": "conll2003", "version": {"version_str": "1.0.0", "description": null, "major": 1, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 6931345, "num_examples": 14041, "dataset_name": "conll2003"}, "validation": {"name": "validation", "num_bytes": 1739223, "num_examples": 3250, "dataset_name": "conll2003"}, "test": {"name": "test", "num_bytes": 1582054, "num_examples": 3453, "dataset_name": "conll2003"}}, "download_checksums": {"https://data.deepai.org/conll2003.zip": {"num_bytes": 982975, "checksum": "96a104d174ddae7558bab603f19382c5fe02ff1da5c077a7f3ce2ced1578a2c3"}}, "download_size": 982975, "post_processing_size": null, "dataset_size": 10252622, "size_in_bytes": 11235597}}
Original file line number	Diff line number	Diff line change
`@@ -1 +1 @@`
`1`		-{"conll2003": {"description": "The shared task of CoNLL-2003 concerns language-independent named entity recognition. We will concentrate on\nfour types of named entities: persons, locations, organizations and names of miscellaneous entities that do\nnot belong to the previous three groups.\n\nThe CoNLL-2003 shared task data files contain four columns separated by a single space. Each word has been put on\na separate line and there is an empty line after each sentence. The first item on each line is a word, the second\na part-of-speech (POS) tag, the third a syntactic chunk tag and the fourth the named entity tag. The chunk tags\nand the named entity tags have the format I-TYPE which means that the word is inside a phrase of type TYPE. Only\nif two phrases of the same type immediately follow each other, the first word of the second phrase will have tag\nB-TYPE to show that it starts a new phrase. A word with tag O is not part of a phrase. Note the dataset uses IOB2\ntagging scheme, whereas the original dataset uses IOB1.\n\nFor more details see https://www.clips.uantwerpen.be/conll2003/ner/ and https://www.aclweb.org/anthology/W03-0419\n", "citation": "@inproceedings{tjong-kim-sang-de-meulder-2003-introduction,\n title = \"Introduction to the {C}o{NLL}-2003 Shared Task: Language-Independent Named Entity Recognition\",\n author = \"Tjong Kim Sang, Erik F. and\n De Meulder, Fien\",\n booktitle = \"Proceedings of the Seventh Conference on Natural Language Learning at {HLT}-{NAACL} 2003\",\n year = \"2003\",\n url = \"https://www.aclweb.org/anthology/W03-0419\",\n pages = \"142--147\",\n}\n", "homepage": "https://www.aclweb.org/anthology/W03-0419/", "license": "", "features": {"id": {"dtype": "string", "id": null, "_type": "Value"}, "tokens": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "pos_tags": {"feature": {"num_classes": 47, "names": ["\"", "''", "#", "$", "(", ")", ",", ".", ":", "``", "CC", "CD", "DT", "EX", "FW", "IN", "JJ", "JJR", "JJS", "LS", "MD", "NN", "NNP", "NNPS", "NNS", "NN\|SYM", "PDT", "POS", "PRP", "PRP$", "RB", "RBR", "RBS", "RP", "SYM", "TO", "UH", "VB", "VBD", "VBG", "VBN", "VBP", "VBZ", "WDT", "WP", "WP$", "WRB"], "names_file": null, "id": null, "_type": "ClassLabel"}, "length": -1, "id": null, "_type": "Sequence"}, "chunk_tags": {"feature": {"num_classes": 23, "names": ["O", "B-ADJP", "I-ADJP", "B-ADVP", "I-ADVP", "B-CONJP", "I-CONJP", "B-INTJ", "I-INTJ", "B-LST", "I-LST", "B-NP", "I-NP", "B-PP", "I-PP", "B-PRT", "I-PRT", "B-SBAR", "I-SBAR", "B-UCP", "I-UCP", "B-VP", "I-VP"], "names_file": null, "id": null, "_type": "ClassLabel"}, "length": -1, "id": null, "_type": "Sequence"}, "ner_tags": {"feature": {"num_classes": 9, "names": ["O", "B-PER", "I-PER", "B-ORG", "I-ORG", "B-LOC", "I-LOC", "B-MISC", "I-MISC"], "names_file": null, "id": null, "_type": "ClassLabel"}, "length": -1, "id": null, "_type": "Sequence"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "conll2003", "config_name": "conll2003", "version": {"version_str": "1.0.0", "description": null, "major": 1, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 6931418, "num_examples": 14042, "dataset_name": "conll2003"}, "validation": {"name": "validation", "num_bytes": 1739271, "num_examples": 3251, "dataset_name": "conll2003"}, "test": {"name": "test", "num_bytes": 1582102, "num_examples": 3454, "dataset_name": "conll2003"}}, "download_checksums": {"https://data.deepai.org/conll2003.zip": {"num_bytes": 982975, "checksum": "96a104d174ddae7558bab603f19382c5fe02ff1da5c077a7f3ce2ced1578a2c3"}}, "download_size": 982975, "post_processing_size": null, "dataset_size": 10252791, "size_in_bytes": 11235766}}
	`1`	+{"conll2003": {"description": "The shared task of CoNLL-2003 concerns language-independent named entity recognition. We will concentrate on\nfour types of named entities: persons, locations, organizations and names of miscellaneous entities that do\nnot belong to the previous three groups.\n\nThe CoNLL-2003 shared task data files contain four columns separated by a single space. Each word has been put on\na separate line and there is an empty line after each sentence. The first item on each line is a word, the second\na part-of-speech (POS) tag, the third a syntactic chunk tag and the fourth the named entity tag. The chunk tags\nand the named entity tags have the format I-TYPE which means that the word is inside a phrase of type TYPE. Only\nif two phrases of the same type immediately follow each other, the first word of the second phrase will have tag\nB-TYPE to show that it starts a new phrase. A word with tag O is not part of a phrase. Note the dataset uses IOB2\ntagging scheme, whereas the original dataset uses IOB1.\n\nFor more details see https://www.clips.uantwerpen.be/conll2003/ner/ and https://www.aclweb.org/anthology/W03-0419\n", "citation": "@inproceedings{tjong-kim-sang-de-meulder-2003-introduction,\n title = \"Introduction to the {C}o{NLL}-2003 Shared Task: Language-Independent Named Entity Recognition\",\n author = \"Tjong Kim Sang, Erik F. and\n De Meulder, Fien\",\n booktitle = \"Proceedings of the Seventh Conference on Natural Language Learning at {HLT}-{NAACL} 2003\",\n year = \"2003\",\n url = \"https://www.aclweb.org/anthology/W03-0419\",\n pages = \"142--147\",\n}\n", "homepage": "https://www.aclweb.org/anthology/W03-0419/", "license": "", "features": {"id": {"dtype": "string", "id": null, "_type": "Value"}, "tokens": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "pos_tags": {"feature": {"num_classes": 47, "names": ["\"", "''", "#", "$", "(", ")", ",", ".", ":", "``", "CC", "CD", "DT", "EX", "FW", "IN", "JJ", "JJR", "JJS", "LS", "MD", "NN", "NNP", "NNPS", "NNS", "NN\|SYM", "PDT", "POS", "PRP", "PRP$", "RB", "RBR", "RBS", "RP", "SYM", "TO", "UH", "VB", "VBD", "VBG", "VBN", "VBP", "VBZ", "WDT", "WP", "WP$", "WRB"], "id": null, "_type": "ClassLabel"}, "length": -1, "id": null, "_type": "Sequence"}, "chunk_tags": {"feature": {"num_classes": 23, "names": ["O", "B-ADJP", "I-ADJP", "B-ADVP", "I-ADVP", "B-CONJP", "I-CONJP", "B-INTJ", "I-INTJ", "B-LST", "I-LST", "B-NP", "I-NP", "B-PP", "I-PP", "B-PRT", "I-PRT", "B-SBAR", "I-SBAR", "B-UCP", "I-UCP", "B-VP", "I-VP"], "id": null, "_type": "ClassLabel"}, "length": -1, "id": null, "_type": "Sequence"}, "ner_tags": {"feature": {"num_classes": 9, "names": ["O", "B-PER", "I-PER", "B-ORG", "I-ORG", "B-LOC", "I-LOC", "B-MISC", "I-MISC"], "id": null, "_type": "ClassLabel"}, "length": -1, "id": null, "_type": "Sequence"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "conll2003", "config_name": "conll2003", "version": {"version_str": "1.0.0", "description": null, "major": 1, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 6931345, "num_examples": 14041, "dataset_name": "conll2003"}, "validation": {"name": "validation", "num_bytes": 1739223, "num_examples": 3250, "dataset_name": "conll2003"}, "test": {"name": "test", "num_bytes": 1582054, "num_examples": 3453, "dataset_name": "conll2003"}}, "download_checksums": {"https://data.deepai.org/conll2003.zip": {"num_bytes": 982975, "checksum": "96a104d174ddae7558bab603f19382c5fe02ff1da5c077a7f3ce2ced1578a2c3"}}, "download_size": 982975, "post_processing_size": null, "dataset_size": 10252622, "size_in_bytes": 11235597}}