
Commit 713f83c

Merge branch 'main' into dl-and-pp-as-parquet

2 parents b480549 + 6c398c1

33 files changed: +585 / -460 lines

datasets/crd3/README.md

Lines changed: 4 additions & 15 deletions
````diff
@@ -55,9 +55,6 @@ paperswithcode_id: crd3
 - **Repository:** [CRD3 repository](https://github.com/RevanthRameshkumar/CRD3)
 - **Paper:** [Storytelling with Dialogue: A Critical Role Dungeons and Dragons Dataset](https://www.aclweb.org/anthology/2020.acl-main.459/)
 - **Point of Contact:** [More Information Needed](https://github.com/huggingface/datasets/blob/master/CONTRIBUTING.md#how-to-contribute-to-the-dataset-cards)
-- **Size of downloaded dataset files:** 279.93 MB
-- **Size of the generated dataset:** 4020.33 MB
-- **Total amount of disk used:** 4300.25 MB

 ### Dataset Summary

@@ -69,6 +66,7 @@ collaboration and spoken interaction. For each dialogue, there are a large numbe
 and semantic ties to the previous dialogues.

 ### Supported Tasks and Leaderboards
+
 `summarization`: The dataset can be used to train a model for abstractive summarization. A [fast abstractive summarization-RL](https://github.com/ChenRocks/fast_abs_rl) model was presented as a baseline, which achieves ROUGE-L-F1 of 25.18.

 ### Languages
@@ -79,13 +77,8 @@ The text in the dataset is in English, as spoken by actors on The Critical Role

 ### Data Instances

-#### default
-
-- **Size of downloaded dataset files:** 279.93 MB
-- **Size of the generated dataset:** 4020.33 MB
-- **Total amount of disk used:** 4300.25 MB
-
 An example of 'train' looks as follows.
+
 ```
 {
     "alignment_score": 3.679936647415161,
@@ -105,7 +98,6 @@ An example of 'train' looks as follows.

 The data fields are the same among all splits.

-#### default
 - `chunk`: a `string` feature.
 - `chunk_id`: a `int32` feature.
 - `turn_start`: a `int32` feature.
@@ -120,7 +112,7 @@ The data fields are the same among all splits.

 | name | train |validation| test |
 |-------|------:|---------:|------:|
-|default|26,232| 3,470|4,541|
+|default|38,969| 6,327|7,500|

 ## Dataset Creation

@@ -180,19 +172,16 @@ This work is licensed under a [Creative Commons Attribution-ShareAlike 4.0 Inter

 ### Citation Information

-```
-
+```bibtex
 @inproceedings{
 title = {Storytelling with Dialogue: A Critical Role Dungeons and Dragons Dataset},
 author = {Rameshkumar, Revanth and Bailey, Peter},
 year = {2020},
 publisher = {Association for Computational Linguistics},
 conference = {ACL}
 }
-
 ```

-
 ### Contributions

 Thanks to [@thomwolf](https://github.com/thomwolf), [@lhoestq](https://github.com/lhoestq), [@mariamabarham](https://github.com/mariamabarham), [@lewtun](https://github.com/lewtun) for adding this dataset.
````
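A quick way to sanity-check the updated split table is to load the dataset and count rows (a sketch; assumes network access and that the Hub copy of `crd3` matches this commit):

```py
from datasets import load_dataset

ds = load_dataset("crd3")
print({split: ds[split].num_rows for split in ds})
# per the updated card: {'train': 38969, 'validation': 6327, 'test': 7500}
```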

datasets/crd3/crd3.py

Lines changed: 11 additions & 9 deletions
```diff
@@ -45,11 +45,11 @@
 and semantic ties to the previous dialogues.
 """

-_URL = "https://github.com/RevanthRameshkumar/CRD3/archive/master.zip"
+_URL = "https://huggingface.co/datasets/crd3/resolve/72bffe55b4d5bf19b530d3e417447b3384ba3673/data/aligned%20data.zip"


 def get_train_test_dev_files(files, test_split, train_split, dev_split):
-    test_files = dev_files = train_files = []
+    test_files, dev_files, train_files = [], [], []
     for file in files:
         filename = os.path.split(file)[1].split("_")[0]
         if filename in test_split:
```
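The one-line change in `get_train_test_dev_files` fixes a real bug, not just style: chained assignment binds all three names to the *same* list object, so appending to any split polluted the other two. A minimal sketch of the difference (the filename is hypothetical):

```py
# Buggy: one shared list under three names
test_files = dev_files = train_files = []
test_files.append("C1E001")                    # hypothetical filename
assert test_files is dev_files is train_files  # all the same object
assert dev_files == ["C1E001"]                 # dev/train see the append too

# Fixed: tuple unpacking creates three independent lists
test_files, dev_files, train_files = [], [], []
test_files.append("C1E001")
assert dev_files == [] and train_files == []
```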
```diff
@@ -88,20 +88,22 @@ def _info(self):
         )

     def _split_generators(self, dl_manager):
-        path = dl_manager.download_and_extract(_URL)
-        test_file = os.path.join(path, "CRD3-master", "data", "aligned data", "test_files")
-        train_file = os.path.join(path, "CRD3-master", "data", "aligned data", "train_files")
-        dev_file = os.path.join(path, "CRD3-master", "data", "aligned data", "val_files")
+        root = dl_manager.download_and_extract(_URL)
+        path = os.path.join(root, "aligned data")
+
+        test_file = os.path.join(path, "test_files")
+        train_file = os.path.join(path, "train_files")
+        dev_file = os.path.join(path, "val_files")
         with open(test_file, encoding="utf-8") as f:
             test_splits = [file.replace("\n", "") for file in f.readlines()]

         with open(train_file, encoding="utf-8") as f:
             train_splits = [file.replace("\n", "") for file in f.readlines()]
         with open(dev_file, encoding="utf-8") as f:
             dev_splits = [file.replace("\n", "") for file in f.readlines()]
-        c2 = "CRD3-master/data/aligned data/c=2"
-        c3 = "CRD3-master/data/aligned data/c=3"
-        c4 = "CRD3-master/data/aligned data/c=4"
+        c2 = "c=2"
+        c3 = "c=3"
+        c4 = "c=4"
         files = [os.path.join(path, c2, file) for file in sorted(os.listdir(os.path.join(path, c2)))]
         files.extend([os.path.join(path, c3, file) for file in sorted(os.listdir(os.path.join(path, c3)))])
         files.extend([os.path.join(path, c4, file) for file in sorted(os.listdir(os.path.join(path, c4)))])
```
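Net effect of the `_URL` swap (see the checksum change in `dataset_infos.json` below): the loader now fetches a ~118 MB zip containing just the `aligned data` folder from the Hub instead of the ~294 MB GitHub archive of the whole repository, which is also why the `c=2`/`c=3`/`c=4` paths lose their `CRD3-master/data/` prefix. The user-facing call is unchanged:

```py
from datasets import load_dataset

# Same call as before; only the download source and size differ.
ds = load_dataset("crd3")
```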

datasets/crd3/dataset_infos.json

Lines changed: 1 addition & 1 deletion
```diff
@@ -1 +1 @@
-{"default": {"description": "\nStorytelling with Dialogue: A Critical Role Dungeons and Dragons Dataset.\nCritical Role is an unscripted, live-streamed show where a fixed group of people play Dungeons and Dragons, an open-ended role-playing game.\nThe dataset is collected from 159 Critical Role episodes transcribed to text dialogues, consisting of 398,682 turns. It also includes corresponding\nabstractive summaries collected from the Fandom wiki. The dataset is linguistically unique in that the narratives are generated entirely through player\ncollaboration and spoken interaction. For each dialogue, there are a large number of turns, multiple abstractive summaries with varying levels of detail,\nand semantic ties to the previous dialogues.\n", "citation": "\n@inproceedings{\ntitle = {Storytelling with Dialogue: A Critical Role Dungeons and Dragons Dataset},\nauthor = {Rameshkumar, Revanth and Bailey, Peter},\nyear = {2020},\npublisher = {Association for Computational Linguistics},\nconference = {ACL}\n}\n ", "homepage": "https://github.com/RevanthRameshkumar/CRD3", "license": "", "features": {"chunk": {"dtype": "string", "id": null, "_type": "Value"}, "chunk_id": {"dtype": "int32", "id": null, "_type": "Value"}, "turn_start": {"dtype": "int32", "id": null, "_type": "Value"}, "turn_end": {"dtype": "int32", "id": null, "_type": "Value"}, "alignment_score": {"dtype": "float32", "id": null, "_type": "Value"}, "turns": {"feature": {"names": {"dtype": "string", "id": null, "_type": "Value"}, "utterances": {"dtype": "string", "id": null, "_type": "Value"}, "number": {"dtype": "int32", "id": null, "_type": "Value"}}, "length": -1, "id": null, "_type": "Sequence"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "crd3", "config_name": "default", "version": {"version_str": "0.0.0", "description": null, "major": 0, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 318560673, "num_examples": 52796, "dataset_name": "crd3"}, "test": {"name": "test", "num_bytes": 318560673, "num_examples": 52796, "dataset_name": "crd3"}, "validation": {"name": "validation", "num_bytes": 318560673, "num_examples": 52796, "dataset_name": "crd3"}}, "download_checksums": {"https://github.com/RevanthRameshkumar/CRD3/archive/master.zip": {"num_bytes": 294222220, "checksum": "c77a937394f265735ba54b32a7a051f77a97d264c74b0535dee77ef9791815b5"}}, "download_size": 294222220, "post_processing_size": null, "dataset_size": 955682019, "size_in_bytes": 1249904239}}
+{"default": {"description": "\nStorytelling with Dialogue: A Critical Role Dungeons and Dragons Dataset.\nCritical Role is an unscripted, live-streamed show where a fixed group of people play Dungeons and Dragons, an open-ended role-playing game.\nThe dataset is collected from 159 Critical Role episodes transcribed to text dialogues, consisting of 398,682 turns. It also includes corresponding\nabstractive summaries collected from the Fandom wiki. The dataset is linguistically unique in that the narratives are generated entirely through player\ncollaboration and spoken interaction. For each dialogue, there are a large number of turns, multiple abstractive summaries with varying levels of detail,\nand semantic ties to the previous dialogues.\n", "citation": "\n@inproceedings{\ntitle = {Storytelling with Dialogue: A Critical Role Dungeons and Dragons Dataset},\nauthor = {Rameshkumar, Revanth and Bailey, Peter},\nyear = {2020},\npublisher = {Association for Computational Linguistics},\nconference = {ACL}\n}\n ", "homepage": "https://github.com/RevanthRameshkumar/CRD3", "license": "", "features": {"chunk": {"dtype": "string", "id": null, "_type": "Value"}, "chunk_id": {"dtype": "int32", "id": null, "_type": "Value"}, "turn_start": {"dtype": "int32", "id": null, "_type": "Value"}, "turn_end": {"dtype": "int32", "id": null, "_type": "Value"}, "alignment_score": {"dtype": "float32", "id": null, "_type": "Value"}, "turns": [{"names": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "utterances": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "number": {"dtype": "int32", "id": null, "_type": "Value"}}]}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "crd3", "config_name": "default", "version": {"version_str": "0.0.0", "description": null, "major": 0, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 236605152, "num_examples": 38969, "dataset_name": "crd3"}, "test": {"name": "test", "num_bytes": 40269203, "num_examples": 7500, "dataset_name": "crd3"}, "validation": {"name": "validation", "num_bytes": 41543528, "num_examples": 6327, "dataset_name": "crd3"}}, "download_checksums": {"https://huggingface.co/datasets/crd3/resolve/72bffe55b4d5bf19b530d3e417447b3384ba3673/data/aligned%20data.zip": {"num_bytes": 117519820, "checksum": "c66bd9f7848bcd514a35c154edd2fc874f1a3076876d8bd7208bf3caf4b7fb0b"}}, "download_size": 117519820, "post_processing_size": null, "dataset_size": 318417883, "size_in_bytes": 435937703}}
```
Binary file (1.02 KB) not shown.
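The substantive change in the metadata above is the type of `turns`: it goes from a `Sequence` of scalar string fields to a list of structs whose `names` and `utterances` are themselves sequences of strings. Declared with the `datasets` feature syntax, the new shape reads roughly like this (a sketch reconstructed from the JSON, not code from this commit):

```py
from datasets import Features, Sequence, Value

features = Features({
    "chunk": Value("string"),
    "chunk_id": Value("int32"),
    "turn_start": Value("int32"),
    "turn_end": Value("int32"),
    "alignment_score": Value("float32"),
    # a list of turn structs; each turn holds parallel lists of names/utterances
    "turns": [
        {
            "names": Sequence(Value("string")),
            "utterances": Sequence(Value("string")),
            "number": Value("int32"),
        }
    ],
})
```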

docs/source/image_process.mdx

Lines changed: 165 additions & 1 deletion
```diff
@@ -39,7 +39,11 @@ Both parameter values default to 1000, which can be expensive if you are storing

 ## Data augmentation

-🤗 Datasets can apply data augmentations from any library or package to your dataset. This guide will use the transforms from [torchvision](https://pytorch.org/vision/stable/transforms.html).
+🤗 Datasets can apply data augmentations from any library or package to your dataset.
+
+### Image Classification
+
+First let's see how you can transform image classification datasets. This guide will use the transforms from [torchvision](https://pytorch.org/vision/stable/transforms.html).

 <Tip>

```
````diff
@@ -88,3 +92,163 @@ Now you can take a look at the augmented image by indexing into the `pixel_value
 <img class="block dark:hidden" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/image_process_jitter.png">
 <img class="hidden dark:block" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/image_process_jitter.png"/>
 </div>
+
+### Object Detection
+
+Object detection models identify something in an image, and object detection datasets are used for applications such as autonomous driving and detecting natural hazards like wildfire. This guide will show you how to apply transformations to an object detection dataset following the [tutorial](https://albumentations.ai/docs/examples/example_bboxes/) from [Albumentations](https://albumentations.ai/docs/).
+
+To run these examples, make sure you have up-to-date versions of `albumentations` and `cv2` installed:
+
+```
+pip install -U albumentations opencv-python
+```
+
+In this example, you'll use the [`cppe-5`](https://huggingface.co/datasets/cppe-5) dataset for identifying medical personal protective equipment (PPE) in the context of the COVID-19 pandemic.
+
+Load the dataset and take a look at an example:
+
+```py
+>>> from datasets import load_dataset
+
+>>> ds = load_dataset("cppe-5")
+>>> example = ds['train'][0]
+>>> example
+{'height': 663,
+ 'image': <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=943x663 at 0x7FC3DC756250>,
+ 'image_id': 15,
+ 'objects': {'area': [3796, 1596, 152768, 81002],
+  'bbox': [[302.0, 109.0, 73.0, 52.0],
+   [810.0, 100.0, 57.0, 28.0],
+   [160.0, 31.0, 248.0, 616.0],
+   [741.0, 68.0, 202.0, 401.0]],
+  'category': [4, 4, 0, 0],
+  'id': [114, 115, 116, 117]},
+ 'width': 943}
+```
+
+The dataset has the following fields:
+
+- `image`: PIL.Image.Image object containing the image.
+- `image_id`: The image ID.
+- `height`: The image height.
+- `width`: The image width.
+- `objects`: A dictionary containing bounding box metadata for the objects in the image:
+  - `id`: The annotation id.
+  - `area`: The area of the bounding box.
+  - `bbox`: The object's bounding box (in the [coco](https://albumentations.ai/docs/getting_started/bounding_boxes_augmentation/#coco) format).
+  - `category`: The object's category, with possible values including `Coverall (0)`, `Face_Shield (1)`, `Gloves (2)`, `Goggles (3)` and `Mask (4)`.
+
+You can visualize the `bboxes` on the image using some internal torch utilities. To do that, you will need to reference the [`~datasets.ClassLabel`] feature associated with the category IDs so you can look up the string labels:
+
+```py
+>>> import torch
+>>> from torchvision.ops import box_convert
+>>> from torchvision.utils import draw_bounding_boxes
+>>> from torchvision.transforms.functional import pil_to_tensor, to_pil_image
+
+>>> categories = ds['train'].features['objects'].feature['category']
+
+>>> boxes_xywh = torch.tensor(example['objects']['bbox'])
+>>> boxes_xyxy = box_convert(boxes_xywh, 'xywh', 'xyxy')
+>>> labels = [categories.int2str(x) for x in example['objects']['category']]
+>>> to_pil_image(
+...     draw_bounding_boxes(
+...         pil_to_tensor(example['image']),
+...         boxes_xyxy,
+...         colors="red",
+...         labels=labels,
+...     )
+... )
+```
+
+<div class="flex justify-center">
+    <img src="https://huggingface.co/datasets/nateraw/documentation-images/resolve/main/visualize_detection_example.png">
+</div>
+
+With `albumentations`, you can apply transforms that will affect the image while also updating the `bboxes` accordingly. In this case, the image is resized to (480, 480), flipped horizontally, and brightened.
+
+`albumentations` expects the image to be in BGR format, not RGB, so you'll have to convert the image before applying the transform.
+
+```py
+>>> import albumentations as A
+>>> import numpy as np
+
+>>> transform = A.Compose([
+...     A.Resize(480, 480),
+...     A.HorizontalFlip(p=1.0),
+...     A.RandomBrightnessContrast(p=1.0),
+... ], bbox_params=A.BboxParams(format='coco', label_fields=['category']))
+
+>>> # RGB PIL Image -> BGR Numpy array
+>>> image = np.flip(np.array(example['image']), -1)
+>>> out = transform(
+...     image=image,
+...     bboxes=example['objects']['bbox'],
+...     category=example['objects']['category'],
+... )
+```
+
+Now when you visualize the result, the image should be flipped, but the `bboxes` should still be in the right places.
+
+```py
+>>> image = torch.tensor(out['image']).flip(-1).permute(2, 0, 1)
+>>> boxes_xywh = torch.stack([torch.tensor(x) for x in out['bboxes']])
+>>> boxes_xyxy = box_convert(boxes_xywh, 'xywh', 'xyxy')
+>>> labels = [categories.int2str(x) for x in out['category']]
+>>> to_pil_image(
+...     draw_bounding_boxes(
+...         image,
+...         boxes_xyxy,
+...         colors='red',
+...         labels=labels
+...     )
+... )
+```
+
+<div class="flex justify-center">
+    <img src="https://huggingface.co/datasets/nateraw/documentation-images/resolve/main/visualize_detection_example_transformed.png">
+</div>
+
+Create a function to apply the transform to a batch of examples:
+
+```py
+>>> def transforms(examples):
+...     images, bboxes, categories = [], [], []
+...     for image, objects in zip(examples['image'], examples['objects']):
+...         image = np.array(image.convert("RGB"))[:, :, ::-1]
+...         out = transform(
+...             image=image,
+...             bboxes=objects['bbox'],
+...             category=objects['category']
+...         )
+...         images.append(torch.tensor(out['image']).flip(-1).permute(2, 0, 1))
+...         bboxes.append(torch.tensor(out['bboxes']))
+...         categories.append(out['category'])
+...     return {'image': images, 'bbox': bboxes, 'category': categories}
+```
+
+Use the [`~Dataset.set_transform`] function to apply the transform on-the-fly which consumes less disk space. The randomness of data augmentation may return a different image if you access the same example twice. It is especially useful when training a model for several epochs.
+
+```py
+>>> ds['train'].set_transform(transforms)
+```
+
+You can verify the transform works by visualizing the 10th example:
+
+```py
+>>> example = ds['train'][10]
+>>> to_pil_image(
+...     draw_bounding_boxes(
+...         example['image'],
+...         box_convert(example['bbox'], 'xywh', 'xyxy'),
+...         colors='red',
+...         labels=[categories.int2str(x) for x in example['category']]
+...     )
+... )
+```
+
+<div class="flex justify-center">
+    <img src="https://huggingface.co/datasets/nateraw/documentation-images/resolve/main/visualize_detection_example_transformed_2.png">
+</div>
````
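One aside on the new docs: `set_transform` attaches the transform to that dataset object in place. If you prefer not to mutate `ds['train']`, `with_transform` does the same on-the-fly formatting but returns a new dataset (same `transforms` function as above):

```py
ds_train = ds["train"].with_transform(transforms)
```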

setup.py

Lines changed: 1 addition & 1 deletion
```diff
@@ -151,7 +151,7 @@
     "bert_score>=0.3.6",
     "jiwer",
     "mauve-text",
-    "rouge_score",
+    "rouge_score<0.0.7",
     "sacrebleu",
     "sacremoses",
     "scikit-learn",
```

src/datasets/arrow_dataset.py

Lines changed: 7 additions & 3 deletions
```diff
@@ -27,7 +27,6 @@
 from collections import Counter, UserDict
 from collections.abc import Mapping
 from copy import deepcopy
-from dataclasses import asdict
 from functools import partial, wraps
 from io import BytesIO
 from math import ceil, floor
@@ -95,7 +94,7 @@
 from .utils._hf_hub_fixes import create_repo
 from .utils.file_utils import _retry, cached_path, estimate_dataset_size, hf_hub_url
 from .utils.info_utils import is_small_dataset
-from .utils.py_utils import convert_file_size_to_int, unique_values
+from .utils.py_utils import asdict, convert_file_size_to_int, unique_values
 from .utils.stratify import stratified_shuffle_split_generate_indices
 from .utils.tf_utils import minimal_tf_collate_fn
 from .utils.typing import PathLike
@@ -825,8 +824,13 @@ def from_pandas(
         info = DatasetInfo()
         info.features = features
         table = InMemoryTable.from_pandas(
-            df=df, preserve_index=preserve_index, schema=features.arrow_schema if features is not None else None
+            df=df,
+            preserve_index=preserve_index,
         )
+        if features is not None:
+            # more expensive cast than InMemoryTable.from_pandas(..., schema=features.arrow_schema)
+            # needed to support the str to Audio conversion for instance
+            table = table.cast(features.arrow_schema)
         return cls(table, info=info, split=split)

     @classmethod
```
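The `from_pandas` change trades the cheaper schema-at-construction path for an explicit `table.cast(...)`, which, per the inline comment, supports conversions that pyarrow alone can't do when building from a DataFrame, such as string paths to `Audio`. A minimal sketch of the use case this unlocks (paths and sampling rate are hypothetical):

```py
import pandas as pd
from datasets import Audio, Dataset, Features

df = pd.DataFrame({"audio": ["clips/a.wav", "clips/b.wav"]})  # hypothetical paths
features = Features({"audio": Audio(sampling_rate=16_000)})

# Building the table first and casting afterwards lets the str column
# be converted to the Audio feature type.
ds = Dataset.from_pandas(df, features=features)
```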

src/datasets/arrow_writer.py

Lines changed: 1 addition & 2 deletions
```diff
@@ -16,7 +16,6 @@
 import json
 import os
 import sys
-from dataclasses import asdict
 from typing import Any, Dict, Iterable, List, Optional, Tuple, Union

 import fsspec
@@ -42,7 +41,7 @@
 from .table import array_cast, cast_array_to_feature, table_cast
 from .utils import logging
 from .utils.file_utils import hash_url_to_filename
-from .utils.py_utils import first_non_null_value
+from .utils.py_utils import asdict, first_non_null_value


 logger = logging.get_logger(__name__)
```
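Both this file and `arrow_dataset.py` swap `dataclasses.asdict` for the `asdict` that `datasets` ships in `utils.py_utils`. The diff doesn't say why, but one stdlib limitation is easy to demonstrate: `dataclasses.asdict` only accepts a dataclass instance at the top level, which is awkward for nested containers of dataclasses (illustration only; the class name is made up):

```py
from dataclasses import asdict, dataclass

@dataclass
class SplitInfo:  # hypothetical stand-in
    name: str

print(asdict(SplitInfo("train")))          # {'name': 'train'} -- fine

try:
    asdict({"train": SplitInfo("train")})  # plain dict at the top level
except TypeError as err:
    print(err)  # asdict() should be called on dataclass instances
```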
