Skip to content

Commit 61a18f8

Browse files
committed
Merge branch 'main' into only-match-separated-split-names
2 parents ee15ede + 4fb3ed0 commit 61a18f8

35 files changed

+962
-744
lines changed

.circleci/config.yml

Lines changed: 0 additions & 107 deletions
This file was deleted.

.circleci/deploy.sh

Lines changed: 0 additions & 81 deletions
This file was deleted.

.github/workflows/ci.yml

Lines changed: 79 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,79 @@
1+
name: CI
2+
3+
on:
4+
pull_request:
5+
branches:
6+
- main
7+
push:
8+
branches:
9+
- main
10+
11+
env:
12+
HF_SCRIPTS_VERSION: main
13+
HF_ALLOW_CODE_EVAL: 1
14+
15+
jobs:
16+
17+
check_code_quality:
18+
runs-on: ubuntu-latest
19+
steps:
20+
- uses: actions/checkout@v3
21+
- name: Set up Python
22+
uses: actions/setup-python@v4
23+
with:
24+
python-version: "3.6"
25+
- name: Install dependencies
26+
run: |
27+
python -m pip install --upgrade pip
28+
pip install .[quality]
29+
- name: Check quality
30+
run: |
31+
black --check --line-length 119 --target-version py36 tests src benchmarks datasets metrics
32+
isort --check-only tests src benchmarks datasets metrics
33+
flake8 tests src benchmarks datasets metrics
34+
35+
test:
36+
needs: check_code_quality
37+
strategy:
38+
fail-fast: false
39+
matrix:
40+
os: [ubuntu-latest, windows-latest]
41+
pyarrow_version: [latest, 6.0.1]
42+
runs-on: ${{ matrix.os }}
43+
steps:
44+
- name: Install OS dependencies
45+
if: ${{ matrix.os == 'ubuntu-latest' }}
46+
run: |
47+
sudo apt-get -y update
48+
sudo apt-get -y install libsndfile1 sox
49+
- uses: actions/checkout@v3
50+
with:
51+
fetch-depth: 0
52+
- name: Set up Python 3.6
53+
if: ${{ matrix.os == 'ubuntu-latest' }}
54+
uses: actions/setup-python@v4
55+
with:
56+
python-version: 3.6
57+
- name: Set up Python 3.7
58+
if: ${{ matrix.os == 'windows-latest' }}
59+
uses: actions/setup-python@v4
60+
with:
61+
python-version: 3.7
62+
- name: Upgrade pip
63+
run: python -m pip install --upgrade pip
64+
- name: Pin setuptools-scm
65+
if: ${{ matrix.os == 'ubuntu-latest' }}
66+
run: echo "installing pinned version of setuptools-scm to fix seqeval installation on 3.6" && pip install "setuptools-scm==6.4.2"
67+
- name: Install dependencies
68+
run: |
69+
pip install .[tests]
70+
pip install -r additional-tests-requirements.txt --no-deps
71+
- name: Install latest PyArrow
72+
if: ${{ matrix.pyarrow_version == 'latest' }}
73+
run: pip install pyarrow --upgrade
74+
- name: Install PyArrow ${{ matrix.pyarrow_version }}
75+
if: ${{ matrix.pyarrow_version != 'latest' }}
76+
run: pip install pyarrow==${{ matrix.pyarrow_version }}
77+
- name: Test with pytest
78+
run: |
79+
python -m pytest -n 2 --dist loadfile -sv ./tests/

.github/workflows/test-audio.yml

Lines changed: 0 additions & 30 deletions
This file was deleted.

ADD_NEW_DATASET.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -134,6 +134,8 @@ You can also start (or copy any part) from one of the datasets of reference list
134134
- multilingual: [xquad](https://github.com/huggingface/datasets/blob/main/datasets/xquad/xquad.py) (original data are in json)
135135
- multitask: [matinf](https://github.com/huggingface/datasets/blob/main/datasets/matinf/matinf.py) (original data need to be downloaded by the user because it requires authentication)
136136
- speech recognition: [librispeech_asr](https://github.com/huggingface/datasets/blob/main/datasets/librispeech_asr/librispeech_asr.py) (original data is in .flac format)
137+
- image classification: [beans](https://github.com/huggingface/datasets/blob/main/datasets/beans/beans.py) (original data are in .jpg format)
138+
- object detection: [wider_face](https://github.com/huggingface/datasets/blob/main/datasets/wider_face/wider_face.py) (image files are in .jpg format and metadata come from text files)
137139

138140
While you are developing the dataset script, you can test it by opening a Python interpreter and running the script (the script is dynamically updated each time you modify it):
139141

datasets/aeslc/README.md

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,13 @@ language:
33
- en
44
paperswithcode_id: aeslc
55
pretty_name: AESLC
6+
task_categories:
7+
- summarization
8+
task_ids:
9+
- summarization-other-email-headline-generation
10+
- summarization-other-conversations-summarization
11+
- summarization-other-multi-document-summarization
12+
- summarization-other-aspect-based-summarization
613
---
714

815
# Dataset Card for "aeslc"

datasets/conll2003/README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@ task_categories:
1717
- token-classification
1818
task_ids:
1919
- named-entity-recognition
20-
- part-of-speech-tagging
20+
- part-of-speech
2121
paperswithcode_id: conll-2003
2222
pretty_name: CoNLL-2003
2323
train-eval-index:

datasets/conll2003/conll2003.py

Lines changed: 8 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -234,10 +234,11 @@ def _generate_examples(self, filepath):
234234
chunk_tags.append(splits[2])
235235
ner_tags.append(splits[3].rstrip())
236236
# last example
237-
yield guid, {
238-
"id": str(guid),
239-
"tokens": tokens,
240-
"pos_tags": pos_tags,
241-
"chunk_tags": chunk_tags,
242-
"ner_tags": ner_tags,
243-
}
237+
if tokens:
238+
yield guid, {
239+
"id": str(guid),
240+
"tokens": tokens,
241+
"pos_tags": pos_tags,
242+
"chunk_tags": chunk_tags,
243+
"ner_tags": ner_tags,
244+
}
Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
{"conll2003": {"description": "The shared task of CoNLL-2003 concerns language-independent named entity recognition. We will concentrate on\nfour types of named entities: persons, locations, organizations and names of miscellaneous entities that do\nnot belong to the previous three groups.\n\nThe CoNLL-2003 shared task data files contain four columns separated by a single space. Each word has been put on\na separate line and there is an empty line after each sentence. The first item on each line is a word, the second\na part-of-speech (POS) tag, the third a syntactic chunk tag and the fourth the named entity tag. The chunk tags\nand the named entity tags have the format I-TYPE which means that the word is inside a phrase of type TYPE. Only\nif two phrases of the same type immediately follow each other, the first word of the second phrase will have tag\nB-TYPE to show that it starts a new phrase. A word with tag O is not part of a phrase. Note the dataset uses IOB2\ntagging scheme, whereas the original dataset uses IOB1.\n\nFor more details see https://www.clips.uantwerpen.be/conll2003/ner/ and https://www.aclweb.org/anthology/W03-0419\n", "citation": "@inproceedings{tjong-kim-sang-de-meulder-2003-introduction,\n title = \"Introduction to the {C}o{NLL}-2003 Shared Task: Language-Independent Named Entity Recognition\",\n author = \"Tjong Kim Sang, Erik F. 
and\n De Meulder, Fien\",\n booktitle = \"Proceedings of the Seventh Conference on Natural Language Learning at {HLT}-{NAACL} 2003\",\n year = \"2003\",\n url = \"https://www.aclweb.org/anthology/W03-0419\",\n pages = \"142--147\",\n}\n", "homepage": "https://www.aclweb.org/anthology/W03-0419/", "license": "", "features": {"id": {"dtype": "string", "id": null, "_type": "Value"}, "tokens": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "pos_tags": {"feature": {"num_classes": 47, "names": ["\"", "''", "#", "$", "(", ")", ",", ".", ":", "``", "CC", "CD", "DT", "EX", "FW", "IN", "JJ", "JJR", "JJS", "LS", "MD", "NN", "NNP", "NNPS", "NNS", "NN|SYM", "PDT", "POS", "PRP", "PRP$", "RB", "RBR", "RBS", "RP", "SYM", "TO", "UH", "VB", "VBD", "VBG", "VBN", "VBP", "VBZ", "WDT", "WP", "WP$", "WRB"], "names_file": null, "id": null, "_type": "ClassLabel"}, "length": -1, "id": null, "_type": "Sequence"}, "chunk_tags": {"feature": {"num_classes": 23, "names": ["O", "B-ADJP", "I-ADJP", "B-ADVP", "I-ADVP", "B-CONJP", "I-CONJP", "B-INTJ", "I-INTJ", "B-LST", "I-LST", "B-NP", "I-NP", "B-PP", "I-PP", "B-PRT", "I-PRT", "B-SBAR", "I-SBAR", "B-UCP", "I-UCP", "B-VP", "I-VP"], "names_file": null, "id": null, "_type": "ClassLabel"}, "length": -1, "id": null, "_type": "Sequence"}, "ner_tags": {"feature": {"num_classes": 9, "names": ["O", "B-PER", "I-PER", "B-ORG", "I-ORG", "B-LOC", "I-LOC", "B-MISC", "I-MISC"], "names_file": null, "id": null, "_type": "ClassLabel"}, "length": -1, "id": null, "_type": "Sequence"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "conll2003", "config_name": "conll2003", "version": {"version_str": "1.0.0", "description": null, "major": 1, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 6931418, "num_examples": 14042, "dataset_name": "conll2003"}, "validation": {"name": "validation", "num_bytes": 1739271, "num_examples": 3251, 
"dataset_name": "conll2003"}, "test": {"name": "test", "num_bytes": 1582102, "num_examples": 3454, "dataset_name": "conll2003"}}, "download_checksums": {"https://data.deepai.org/conll2003.zip": {"num_bytes": 982975, "checksum": "96a104d174ddae7558bab603f19382c5fe02ff1da5c077a7f3ce2ced1578a2c3"}}, "download_size": 982975, "post_processing_size": null, "dataset_size": 10252791, "size_in_bytes": 11235766}}
1+
{"conll2003": {"description": "The shared task of CoNLL-2003 concerns language-independent named entity recognition. We will concentrate on\nfour types of named entities: persons, locations, organizations and names of miscellaneous entities that do\nnot belong to the previous three groups.\n\nThe CoNLL-2003 shared task data files contain four columns separated by a single space. Each word has been put on\na separate line and there is an empty line after each sentence. The first item on each line is a word, the second\na part-of-speech (POS) tag, the third a syntactic chunk tag and the fourth the named entity tag. The chunk tags\nand the named entity tags have the format I-TYPE which means that the word is inside a phrase of type TYPE. Only\nif two phrases of the same type immediately follow each other, the first word of the second phrase will have tag\nB-TYPE to show that it starts a new phrase. A word with tag O is not part of a phrase. Note the dataset uses IOB2\ntagging scheme, whereas the original dataset uses IOB1.\n\nFor more details see https://www.clips.uantwerpen.be/conll2003/ner/ and https://www.aclweb.org/anthology/W03-0419\n", "citation": "@inproceedings{tjong-kim-sang-de-meulder-2003-introduction,\n title = \"Introduction to the {C}o{NLL}-2003 Shared Task: Language-Independent Named Entity Recognition\",\n author = \"Tjong Kim Sang, Erik F. 
and\n De Meulder, Fien\",\n booktitle = \"Proceedings of the Seventh Conference on Natural Language Learning at {HLT}-{NAACL} 2003\",\n year = \"2003\",\n url = \"https://www.aclweb.org/anthology/W03-0419\",\n pages = \"142--147\",\n}\n", "homepage": "https://www.aclweb.org/anthology/W03-0419/", "license": "", "features": {"id": {"dtype": "string", "id": null, "_type": "Value"}, "tokens": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "pos_tags": {"feature": {"num_classes": 47, "names": ["\"", "''", "#", "$", "(", ")", ",", ".", ":", "``", "CC", "CD", "DT", "EX", "FW", "IN", "JJ", "JJR", "JJS", "LS", "MD", "NN", "NNP", "NNPS", "NNS", "NN|SYM", "PDT", "POS", "PRP", "PRP$", "RB", "RBR", "RBS", "RP", "SYM", "TO", "UH", "VB", "VBD", "VBG", "VBN", "VBP", "VBZ", "WDT", "WP", "WP$", "WRB"], "id": null, "_type": "ClassLabel"}, "length": -1, "id": null, "_type": "Sequence"}, "chunk_tags": {"feature": {"num_classes": 23, "names": ["O", "B-ADJP", "I-ADJP", "B-ADVP", "I-ADVP", "B-CONJP", "I-CONJP", "B-INTJ", "I-INTJ", "B-LST", "I-LST", "B-NP", "I-NP", "B-PP", "I-PP", "B-PRT", "I-PRT", "B-SBAR", "I-SBAR", "B-UCP", "I-UCP", "B-VP", "I-VP"], "id": null, "_type": "ClassLabel"}, "length": -1, "id": null, "_type": "Sequence"}, "ner_tags": {"feature": {"num_classes": 9, "names": ["O", "B-PER", "I-PER", "B-ORG", "I-ORG", "B-LOC", "I-LOC", "B-MISC", "I-MISC"], "id": null, "_type": "ClassLabel"}, "length": -1, "id": null, "_type": "Sequence"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "conll2003", "config_name": "conll2003", "version": {"version_str": "1.0.0", "description": null, "major": 1, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 6931345, "num_examples": 14041, "dataset_name": "conll2003"}, "validation": {"name": "validation", "num_bytes": 1739223, "num_examples": 3250, "dataset_name": "conll2003"}, "test": {"name": "test", "num_bytes": 
1582054, "num_examples": 3453, "dataset_name": "conll2003"}}, "download_checksums": {"https://data.deepai.org/conll2003.zip": {"num_bytes": 982975, "checksum": "96a104d174ddae7558bab603f19382c5fe02ff1da5c077a7f3ce2ced1578a2c3"}}, "download_size": 982975, "post_processing_size": null, "dataset_size": 10252622, "size_in_bytes": 11235597}}

0 commit comments

Comments
 (0)