Skip to content
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 13 additions & 0 deletions src/datasets/features/audio.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,19 @@ class Audio:
channels.
decode (:obj:`bool`, default ``True``): Whether to decode the audio data. If `False`,
returns the underlying dictionary in the format {"path": audio_path, "bytes": audio_bytes}.

Example:

```py
>>> from datasets import load_dataset, Audio
>>> ds = load_dataset("PolyAI/minds14", name="en-US", split="train")
>>> ds = ds.cast_column("audio", Audio(sampling_rate=16000))
>>> ds[0]["audio"]
{'array': array([ 2.3443763e-05, 2.1729663e-04, 2.2145823e-04, ...,
3.8356509e-05, -7.3497440e-06, -2.1754686e-05], dtype=float32),
'path': '/root/.cache/huggingface/datasets/downloads/extracted/f14948e0e84be638dd7943ac36518a4cf3324e8b7aa331c5ab11541518e9368c/en-US~JOINT_ACCOUNT/602ba55abb1e6d0fbce92065.wav',
'sampling_rate': 16000}
```
"""

sampling_rate: Optional[int] = None
Expand Down
136 changes: 134 additions & 2 deletions src/datasets/features/features.py
Original file line number Diff line number Diff line change
Expand Up @@ -429,6 +429,15 @@ class Value:
large_binary
string
large_string

Example:

```py
>>> from datasets import Features
>>> features = Features({'stars': Value(dtype='int32')})
>>> features
{'stars': Value(dtype='int32', id=None)}
```
"""

dtype: str
Expand Down Expand Up @@ -476,6 +485,20 @@ def encode_example(self, value):

@dataclass
class Array2D(_ArrayXD):
"""Create a two-dimensional array.

Args:
shape (`tuple`): The size of each dimension.
dtype (`str`): The value of the data type.

Example:

```py
>>> from datasets import Features
>>> features = Features({'x': Array2D(shape=(1, 3), dtype='int32')})
```
"""

shape: tuple
dtype: str
id: Optional[str] = None
Expand All @@ -485,6 +508,20 @@ class Array2D(_ArrayXD):

@dataclass
class Array3D(_ArrayXD):
"""Create a three-dimensional array.

Args:
shape (`tuple`): The size of each dimension.
dtype (`str`): The value of the data type.

Example:

```py
>>> from datasets import Features
>>> features = Features({'x': Array3D(shape=(1, 2, 3), dtype='int32')})
```
"""

shape: tuple
dtype: str
id: Optional[str] = None
Expand All @@ -494,6 +531,20 @@ class Array3D(_ArrayXD):

@dataclass
class Array4D(_ArrayXD):
"""Create a four-dimensional array.

Args:
shape (`tuple`): The size of each dimension.
dtype (`str`): The value of the data type.

Example:

```py
>>> from datasets import Features
>>> features = Features({'x': Array4D(shape=(1, 2, 2, 3), dtype='int32')})
```
"""

shape: tuple
dtype: str
id: Optional[str] = None
Expand All @@ -503,6 +554,20 @@ class Array4D(_ArrayXD):

@dataclass
class Array5D(_ArrayXD):
"""Create a five-dimensional array.

Args:
shape (`tuple`): The size of each dimension.
dtype (`str`): The value of the data type.

Example:

```py
>>> from datasets import Features
>>> features = Features({'x': Array5D(shape=(1, 2, 2, 3, 3), dtype='int32')})
```
"""

shape: tuple
dtype: str
id: Optional[str] = None
Expand Down Expand Up @@ -792,6 +857,15 @@ class ClassLabel:
names (:obj:`list` of :obj:`str`, optional): String names for the integer classes.
The order in which the names are provided is kept.
names_file (:obj:`str`, optional): Path to a file with names for the integer classes, one per line.

Example:

```py
>>> from datasets import Features
>>> features = Features({'label': ClassLabel(num_classes=3, names=['bad', 'ok', 'good'])})
>>> features
{'label': ClassLabel(num_classes=3, names=['bad', 'ok', 'good'], id=None)}
```
"""

num_classes: int = None
Expand Down Expand Up @@ -835,7 +909,17 @@ def __call__(self):
return self.pa_type

def str2int(self, values: Union[str, Iterable]):
"""Conversion class name string => integer."""
"""Conversion class name string => integer.

Example:

```py
>>> from datasets import load_dataset
>>> ds = load_dataset("rotten_tomatoes", split="train")
>>> ds.features["label"].str2int('neg')
0
```
"""
if not isinstance(values, str) and not isinstance(values, Iterable):
raise ValueError(
f"Values {values} should be a string or an Iterable (list, numpy array, pytorch, tensorflow tensors)"
Expand Down Expand Up @@ -864,7 +948,17 @@ def str2int(self, values: Union[str, Iterable]):
return output if return_list else output[0]

def int2str(self, values: Union[int, Iterable]):
"""Conversion integer => class name string."""
"""Conversion integer => class name string.

Example:

```py
>>> from datasets import load_dataset
>>> ds = load_dataset("rotten_tomatoes", split="train")
>>> ds.features["label"].int2str(0)
'neg'
```
"""
if not isinstance(values, int) and not isinstance(values, Iterable):
raise ValueError(
f"Values {values} should be an integer or an Iterable (list, numpy array, pytorch, tensorflow tensors)"
Expand Down Expand Up @@ -911,6 +1005,19 @@ def _load_names_from_file(names_filepath):
class Sequence:
"""Construct a list of feature from a single type or a dict of types.
Mostly here for compatiblity with tfds.

Args:
feature: A list of features of a single type or a dictionary of types.
length (`int`): Length of the sequence.

Example:

```py
>>> from datasets import Features, Sequence, Value, ClassLabel
>>> features = Features({'post': Sequence(feature={'text': Value(dtype='string'), 'upvotes': Value(dtype='int32'), 'label': ClassLabel(num_classes=2, names=['hot', 'cold'])})})
>>> features
{'post': Sequence(feature={'text': Value(dtype='string', id=None), 'upvotes': Value(dtype='int32', id=None), 'label': ClassLabel(num_classes=2, names=['hot', 'cold'], id=None)}, length=-1, id=None)}
```
"""

feature: Any
Expand Down Expand Up @@ -1453,6 +1560,17 @@ def copy(self) -> "Features":

Returns:
:class:`Features`

Example:

```py
>>> from datasets import load_dataset
>>> ds = load_dataset("rotten_tomatoes", split="train")
>>> copy_of_features = ds.features.copy()
>>> copy_of_features
{'label': ClassLabel(num_classes=2, names=['neg', 'pos'], id=None),
'text': Value(dtype='string', id=None)}
```
"""
return copy.deepcopy(self)

Expand Down Expand Up @@ -1527,6 +1645,20 @@ def flatten(self, max_depth=16) -> "Features":

Returns:
Features: the flattened features

Example:

```py
>>> from datasets import load_dataset
>>> ds = load_dataset("squad", split="train")
>>> ds.features.flatten()
{'answers.answer_start': Sequence(feature=Value(dtype='int32', id=None), length=-1, id=None),
'answers.text': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None),
'context': Value(dtype='string', id=None),
'id': Value(dtype='string', id=None),
'question': Value(dtype='string', id=None),
'title': Value(dtype='string', id=None)}
```
"""
for depth in range(1, max_depth):
no_change = True
Expand Down
10 changes: 10 additions & 0 deletions src/datasets/features/image.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,16 @@ class Image:
Args:
decode (:obj:`bool`, default ``True``): Whether to decode the image data. If `False`,
returns the underlying dictionary in the format {"path": image_path, "bytes": image_bytes}.

Examples:

```py
>>> from datasets import load_dataset, Image
>>> ds = load_dataset("beans", split="train")
>>> ds = ds.cast_column('image', Image(decode=False))
>>> ds[0]["image"]
{'bytes': None,
'path': '/root/.cache/huggingface/datasets/downloads/extracted/b0a21163f78769a2cf11f58dfc767fb458fc7cea5c05dccc0144a2c0f0bc1292/train/healthy/healthy_train.85.jpg'}
```
"""

decode: bool = True
Expand Down
15 changes: 15 additions & 0 deletions src/datasets/info.py
Original file line number Diff line number Diff line change
Expand Up @@ -353,6 +353,14 @@ def write_to_directory(self, metric_info_dir, pretty_print=False):
"""Write `MetricInfo` as JSON to `metric_info_dir`.
Also save the license separately in LICENCE.
If `pretty_print` is True, the JSON will be pretty-printed with the indent level of 4.

Example:

```py
>>> from datasets import load_metric
>>> metric = load_metric("accuracy")
>>> metric.info.write_to_directory("/path/to/directory/")
```
"""
with open(os.path.join(metric_info_dir, config.METRIC_INFO_FILENAME), "w", encoding="utf-8") as f:
json.dump(asdict(self), f, indent=4 if pretty_print else None)
Expand All @@ -368,6 +376,13 @@ def from_directory(cls, metric_info_dir) -> "MetricInfo":
Args:
metric_info_dir: `str` The directory containing the metadata file. This
should be the root directory of a specific dataset version.

Example:

```py
>>> from datasets import MetricInfo
>>> metric_info = MetricInfo.from_directory("/path/to/directory/")
```
"""
logger.info(f"Loading Metric info from {metric_info_dir}")
if not metric_info_dir:
Expand Down
24 changes: 24 additions & 0 deletions src/datasets/metric.py
Original file line number Diff line number Diff line change
Expand Up @@ -401,6 +401,14 @@ def compute(self, *, predictions=None, references=None, **kwargs) -> Optional[di

- Dictionary with the metrics if this metric is run on the main process (``process_id == 0``).
- None if the metric is not run on the main process (``process_id != 0``).

Example:

```py
>>> from datasets import load_metric
>>> metric = load_metric("accuracy")
>>> accuracy = metric.compute(predictions=model_prediction, references=labels)
```
"""
all_kwargs = {"predictions": predictions, "references": references, **kwargs}
if predictions is None and references is None:
Expand Down Expand Up @@ -454,6 +462,14 @@ def add_batch(self, *, predictions=None, references=None, **kwargs):
Args:
predictions (list/array/tensor, optional): Predictions.
references (list/array/tensor, optional): References.

Example:

```py
>>> from datasets import load_metric
>>> metric = load_metric("accuracy")
>>> metric.add_batch(predictions=model_prediction, references=labels)
```
"""
bad_inputs = [input_name for input_name in kwargs if input_name not in self.features]
if bad_inputs:
Expand Down Expand Up @@ -493,6 +509,14 @@ def add(self, *, prediction=None, reference=None, **kwargs):
Args:
prediction (list/array/tensor, optional): Predictions.
reference (list/array/tensor, optional): References.

Example:

```py
>>> from datasets import load_metric
>>> metric = load_metric("accuracy")
>>> metric.add(prediction=model_prediction, reference=label)
```
"""
bad_inputs = [input_name for input_name in kwargs if input_name not in self.features]
if bad_inputs:
Expand Down