Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 9 additions & 9 deletions docs/source/about_dataset_features.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -10,10 +10,10 @@ Let's have a look at the features of the MRPC dataset from the GLUE benchmark:
>>> from datasets import load_dataset
>>> dataset = load_dataset('nyu-mll/glue', 'mrpc', split='train')
>>> dataset.features
{'idx': Value(dtype='int32'),
{'idx': Value('int32'),
'label': ClassLabel(names=['not_equivalent', 'equivalent']),
'sentence1': Value(dtype='string'),
'sentence2': Value(dtype='string'),
'sentence1': Value('string'),
'sentence2': Value('string'),
}
```

Expand All @@ -38,12 +38,12 @@ If your data type contains a list of objects, then you want to use the [`List`]
>>> from datasets import load_dataset
>>> dataset = load_dataset('rajpurkar/squad', split='train')
>>> dataset.features
{'id': Value(dtype='string'),
'title': Value(dtype='string'),
'context': Value(dtype='string'),
'question': Value(dtype='string'),
'answers': {'text': List(feature=Value(dtype='string'), length=-1),
'answer_start': List(feature=Value(dtype='int32'), length=-1)}}
{'id': Value('string'),
'title': Value('string'),
'context': Value('string'),
'question': Value('string'),
'answers': {'text': List(Value('string')),
'answer_start': List(Value('int32'))}}
```

The `answers` field is constructed using the dict of features and contains two subfields, `text` and `answer_start`, which are lists of `string` and `int32`, respectively.
Expand Down
2 changes: 1 addition & 1 deletion docs/source/load_hub.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ Movie Review Dataset. This is a dataset containing 5,331 positive and 5,331 n
# Inspect dataset features
>>> ds_builder.info.features
{'label': ClassLabel(names=['neg', 'pos']),
'text': Value(dtype='string')}
'text': Value('string')}
```

If you're happy with the dataset, then load it with [`load_dataset`]:
Expand Down
2 changes: 1 addition & 1 deletion docs/source/loading.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -417,6 +417,6 @@ Now when you look at your dataset features, you can see it uses the custom label

```py
>>> dataset['train'].features
{'text': Value(dtype='string'),
{'text': Value('string'),
'label': ClassLabel(names=['sadness', 'joy', 'love', 'anger', 'fear', 'surprise'])}
```
24 changes: 12 additions & 12 deletions docs/source/process.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -223,21 +223,21 @@ The [`~Dataset.cast`] function transforms the feature type of one or more column

```py
>>> dataset.features
{'sentence1': Value(dtype='string'),
'sentence2': Value(dtype='string'),
{'sentence1': Value('string'),
'sentence2': Value('string'),
'label': ClassLabel(names=['not_equivalent', 'equivalent']),
'idx': Value(dtype='int32')}
'idx': Value('int32')}

>>> from datasets import ClassLabel, Value
>>> new_features = dataset.features.copy()
>>> new_features["label"] = ClassLabel(names=["negative", "positive"])
>>> new_features["idx"] = Value("int64")
>>> dataset = dataset.cast(new_features)
>>> dataset.features
{'sentence1': Value(dtype='string'),
'sentence2': Value(dtype='string'),
{'sentence1': Value('string'),
'sentence2': Value('string'),
'label': ClassLabel(names=['negative', 'positive']),
'idx': Value(dtype='int64')}
'idx': Value('int64')}
```

<Tip>
Expand Down Expand Up @@ -265,12 +265,12 @@ Sometimes a column can be a nested structure of several types. Take a look at th
>>> from datasets import load_dataset
>>> dataset = load_dataset("rajpurkar/squad", split="train")
>>> dataset.features
{'id': Value(dtype='string'),
'title': Value(dtype='string'),
'context': Value(dtype='string'),
'question': Value(dtype='string'),
'answers': {'text': List(feature=Value(dtype='string'), length=-1),
'answer_start': List(feature=Value(dtype='int32'), length=-1)}}
{'id': Value('string'),
'title': Value('string'),
'context': Value('string'),
'question': Value('string'),
'answers': {'text': List(Value('string')),
'answer_start': List(Value('int32'))}}
```

The `answers` field contains two subfields: `text` and `answer_start`. Use the [`~Dataset.flatten`] function to extract the subfields into their own separate columns:
Expand Down
12 changes: 6 additions & 6 deletions docs/source/stream.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -241,21 +241,21 @@ When you need to remove one or more columns, give [`IterableDataset.remove_colum
>>> from datasets import load_dataset
>>> dataset = load_dataset('nyu-mll/glue', 'mrpc', split='train', streaming=True)
>>> dataset.features
{'sentence1': Value(dtype='string'),
'sentence2': Value(dtype='string'),
{'sentence1': Value('string'),
'sentence2': Value('string'),
'label': ClassLabel(names=['not_equivalent', 'equivalent']),
'idx': Value(dtype='int32')}
'idx': Value('int32')}

>>> from datasets import ClassLabel, Value
>>> new_features = dataset.features.copy()
>>> new_features["label"] = ClassLabel(names=['negative', 'positive'])
>>> new_features["idx"] = Value('int64')
>>> dataset = dataset.cast(new_features)
>>> dataset.features
{'sentence1': Value(dtype='string'),
'sentence2': Value(dtype='string'),
{'sentence1': Value('string'),
'sentence2': Value('string'),
'label': ClassLabel(names=['negative', 'positive']),
'idx': Value(dtype='int64')}
'idx': Value('int64')}
```

<Tip>
Expand Down
30 changes: 15 additions & 15 deletions src/datasets/arrow_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -1957,14 +1957,14 @@ def class_encode_column(self, column: str, include_nulls: bool = False) -> "Data
>>> from datasets import load_dataset
>>> ds = load_dataset("boolq", split="validation")
>>> ds.features
{'answer': Value(dtype='bool'),
'passage': Value(dtype='string'),
'question': Value(dtype='string')}
{'answer': Value('bool'),
'passage': Value('string'),
'question': Value('string')}
>>> ds = ds.class_encode_column('answer')
>>> ds.features
{'answer': ClassLabel(num_classes=2, names=['False', 'True']),
'passage': Value(dtype='string'),
'question': Value(dtype='string')}
'passage': Value('string'),
'question': Value('string')}
```
"""
# Sanity checks
Expand Down Expand Up @@ -2035,12 +2035,12 @@ def flatten(self, new_fingerprint: Optional[str] = None, max_depth=16) -> "Datas
>>> from datasets import load_dataset
>>> ds = load_dataset("rajpurkar/squad", split="train")
>>> ds.features
{'id': Value(dtype='string'),
'title': Value(dtype='string'),
'context': Value(dtype='string'),
'question': Value(dtype='string'),
'answers': {'text': List(feature=Value(dtype='string'), length=-1),
'answer_start': List(feature=Value(dtype='int32'), length=-1)}}
{'id': Value('string'),
'title': Value('string'),
'context': Value('string'),
'question': Value('string'),
'answers': {'text': List(Value('string')),
'answer_start': List(Value('int32'))}}
>>> ds.flatten()
Dataset({
features: ['id', 'title', 'context', 'question', 'answers.text', 'answers.answer_start'],
Expand Down Expand Up @@ -2109,14 +2109,14 @@ def cast(
>>> ds = load_dataset("cornell-movie-review-data/rotten_tomatoes", split="validation")
>>> ds.features
{'label': ClassLabel(names=['neg', 'pos']),
'text': Value(dtype='string')}
'text': Value('string')}
>>> new_features = ds.features.copy()
>>> new_features['label'] = ClassLabel(names=['bad', 'good'])
>>> new_features['text'] = Value('large_string')
>>> ds = ds.cast(new_features)
>>> ds.features
{'label': ClassLabel(names=['bad', 'good']),
'text': Value(dtype='large_string')}
'text': Value('large_string')}
```
"""
if sorted(features) != sorted(self._data.column_names):
Expand Down Expand Up @@ -2168,11 +2168,11 @@ def cast_column(self, column: str, feature: FeatureType, new_fingerprint: Option
>>> ds = load_dataset("cornell-movie-review-data/rotten_tomatoes", split="validation")
>>> ds.features
{'label': ClassLabel(names=['neg', 'pos']),
'text': Value(dtype='string')}
'text': Value('string')}
>>> ds = ds.cast_column('label', ClassLabel(names=['bad', 'good']))
>>> ds.features
{'label': ClassLabel(names=['bad', 'good']),
'text': Value(dtype='string')}
'text': Value('string')}
```
"""
feature = _fix_for_backward_compatible_features(feature)
Expand Down
4 changes: 2 additions & 2 deletions src/datasets/builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -513,7 +513,7 @@ def get_all_exported_dataset_infos(cls) -> DatasetInfosDict:
>>> from datasets import load_dataset_builder
>>> ds_builder = load_dataset_builder('vivos')
>>> ds_builder.get_all_exported_dataset_infos()
{'default': DatasetInfo(description='', citation='', homepage='', license='', features={'speaker_id': Value(dtype='string'), 'path': Value(dtype='string'), 'audio': Audio(sampling_rate=16000, mono=True, decode=True, id=None), 'sentence': Value(dtype='string')}, post_processed=None, supervised_keys=None, builder_name=None, dataset_name=None, config_name='default', version=None, splits={'train': SplitInfo(name='train', num_bytes=1722002133, num_examples=11660, shard_lengths=None, dataset_name=None), 'test': SplitInfo(name='test', num_bytes=86120227, num_examples=760, shard_lengths=None, dataset_name=None)}, download_checksums=None, download_size=1475540500, post_processing_size=None, dataset_size=1808122360, size_in_bytes=None)}
{'default': DatasetInfo(description='', citation='', homepage='', license='', features={'speaker_id': Value('string'), 'path': Value('string'), 'audio': Audio(sampling_rate=16000, mono=True, decode=True, id=None), 'sentence': Value('string')}, post_processed=None, supervised_keys=None, builder_name=None, dataset_name=None, config_name='default', version=None, splits={'train': SplitInfo(name='train', num_bytes=1722002133, num_examples=11660, shard_lengths=None, dataset_name=None), 'test': SplitInfo(name='test', num_bytes=86120227, num_examples=760, shard_lengths=None, dataset_name=None)}, download_checksums=None, download_size=1475540500, post_processing_size=None, dataset_size=1808122360, size_in_bytes=None)}
```
"""
return DatasetInfosDict.from_directory(cls.get_imported_module_dir())
Expand All @@ -527,7 +527,7 @@ def get_exported_dataset_info(self) -> DatasetInfo:
>>> from datasets import load_dataset_builder
>>> ds_builder = load_dataset_builder('cornell-movie-review-data/rotten_tomatoes')
>>> ds_builder.get_exported_dataset_info()
DatasetInfo(description='', citation='', homepage='', license='', features={'speaker_id': Value(dtype='string'), 'path': Value(dtype='string'), 'audio': Audio(sampling_rate=16000, mono=True, decode=True, id=None), 'sentence': Value(dtype='string')}, post_processed=None, supervised_keys=None, builder_name=None, dataset_name=None, config_name='default', version=None, splits={'train': SplitInfo(name='train', num_bytes=1722002133, num_examples=11660, shard_lengths=None, dataset_name=None), 'test': SplitInfo(name='test', num_bytes=86120227, num_examples=760, shard_lengths=None, dataset_name=None)}, download_checksums=None, download_size=1475540500, post_processing_size=None, dataset_size=1808122360, size_in_bytes=None)
DatasetInfo(description='', citation='', homepage='', license='', features={'speaker_id': Value('string'), 'path': Value('string'), 'audio': Audio(sampling_rate=16000, mono=True, decode=True, id=None), 'sentence': Value('string')}, post_processed=None, supervised_keys=None, builder_name=None, dataset_name=None, config_name='default', version=None, splits={'train': SplitInfo(name='train', num_bytes=1722002133, num_examples=11660, shard_lengths=None, dataset_name=None), 'test': SplitInfo(name='test', num_bytes=86120227, num_examples=760, shard_lengths=None, dataset_name=None)}, download_checksums=None, download_size=1475540500, post_processing_size=None, dataset_size=1808122360, size_in_bytes=None)
```
"""
return self.get_all_exported_dataset_infos().get(self.config.name, DatasetInfo())
Expand Down
38 changes: 19 additions & 19 deletions src/datasets/dataset_dict.py
Original file line number Diff line number Diff line change
Expand Up @@ -201,12 +201,12 @@ def flatten(self, max_depth=16) -> "DatasetDict":
>>> from datasets import load_dataset
>>> ds = load_dataset("rajpurkar/squad")
>>> ds["train"].features
{'id': Value(dtype='string'),
'title': Value(dtype='string'),
'context': Value(dtype='string'),
'question': Value(dtype='string'),
'answers.text': List(feature=Value(dtype='string'), length=-1),
'answers.answer_start': List(feature=Value(dtype='int32'), length=-1)}
{'id': Value('string'),
'title': Value('string'),
'context': Value('string'),
'question': Value('string'),
'answers.text': List(Value('string')),
'answers.answer_start': List(Value('int32'))}
>>> ds.flatten()
DatasetDict({
train: Dataset({
Expand Down Expand Up @@ -290,14 +290,14 @@ def cast(self, features: Features) -> "DatasetDict":
>>> ds = load_dataset("cornell-movie-review-data/rotten_tomatoes")
>>> ds["train"].features
{'label': ClassLabel(names=['neg', 'pos']),
'text': Value(dtype='string')}
'text': Value('string')}
>>> new_features = ds["train"].features.copy()
>>> new_features['label'] = ClassLabel(names=['bad', 'good'])
>>> new_features['text'] = Value('large_string')
>>> ds = ds.cast(new_features)
>>> ds["train"].features
{'label': ClassLabel(names=['bad', 'good']),
'text': Value(dtype='large_string')}
'text': Value('large_string')}
```
"""
self._check_values_type()
Expand All @@ -322,11 +322,11 @@ def cast_column(self, column: str, feature) -> "DatasetDict":
>>> ds = load_dataset("cornell-movie-review-data/rotten_tomatoes")
>>> ds["train"].features
{'label': ClassLabel(names=['neg', 'pos']),
'text': Value(dtype='string')}
'text': Value('string')}
>>> ds = ds.cast_column('label', ClassLabel(names=['bad', 'good']))
>>> ds["train"].features
{'label': ClassLabel(names=['bad', 'good']),
'text': Value(dtype='string')}
'text': Value('string')}
```
"""
self._check_values_type()
Expand Down Expand Up @@ -513,14 +513,14 @@ def class_encode_column(self, column: str, include_nulls: bool = False) -> "Data
>>> from datasets import load_dataset
>>> ds = load_dataset("boolq")
>>> ds["train"].features
{'answer': Value(dtype='bool'),
'passage': Value(dtype='string'),
'question': Value(dtype='string')}
{'answer': Value('bool'),
'passage': Value('string'),
'question': Value('string')}
>>> ds = ds.class_encode_column("answer")
>>> ds["train"].features
{'answer': ClassLabel(num_classes=2, names=['False', 'True']),
'passage': Value(dtype='string'),
'question': Value(dtype='string')}
'passage': Value('string'),
'question': Value('string')}
```
"""
self._check_values_type()
Expand Down Expand Up @@ -2381,11 +2381,11 @@ def cast_column(self, column: str, feature: FeatureType) -> "IterableDatasetDict
>>> ds = load_dataset("cornell-movie-review-data/rotten_tomatoes", streaming=True)
>>> ds["train"].features
{'label': ClassLabel(names=['neg', 'pos']),
'text': Value(dtype='string')}
'text': Value('string')}
>>> ds = ds.cast_column('label', ClassLabel(names=['bad', 'good']))
>>> ds["train"].features
{'label': ClassLabel(names=['bad', 'good']),
'text': Value(dtype='string')}
'text': Value('string')}
```
"""
return IterableDatasetDict(
Expand Down Expand Up @@ -2417,14 +2417,14 @@ def cast(
>>> ds = load_dataset("cornell-movie-review-data/rotten_tomatoes", streaming=True)
>>> ds["train"].features
{'label': ClassLabel(names=['neg', 'pos']),
'text': Value(dtype='string')}
'text': Value('string')}
>>> new_features = ds["train"].features.copy()
>>> new_features['label'] = ClassLabel(names=['bad', 'good'])
>>> new_features['text'] = Value('large_string')
>>> ds = ds.cast(new_features)
>>> ds["train"].features
{'label': ClassLabel(names=['bad', 'good']),
'text': Value(dtype='large_string')}
'text': Value('large_string')}
```
"""
return IterableDatasetDict({k: dataset.cast(features=features) for k, dataset in self.items()})
Expand Down
Loading
Loading