diff --git a/docs/source/about_dataset_features.mdx b/docs/source/about_dataset_features.mdx index 6df80ce72a1..5cfac6739e3 100644 --- a/docs/source/about_dataset_features.mdx +++ b/docs/source/about_dataset_features.mdx @@ -10,10 +10,10 @@ Let's have a look at the features of the MRPC dataset from the GLUE benchmark: >>> from datasets import load_dataset >>> dataset = load_dataset('nyu-mll/glue', 'mrpc', split='train') >>> dataset.features -{'idx': Value(dtype='int32'), +{'idx': Value('int32'), 'label': ClassLabel(names=['not_equivalent', 'equivalent']), - 'sentence1': Value(dtype='string'), - 'sentence2': Value(dtype='string'), + 'sentence1': Value('string'), + 'sentence2': Value('string'), } ``` @@ -38,12 +38,12 @@ If your data type contains a list of objects, then you want to use the [`List`] >>> from datasets import load_dataset >>> dataset = load_dataset('rajpurkar/squad', split='train') >>> dataset.features -{'id': Value(dtype='string'), - 'title': Value(dtype='string'), - 'context': Value(dtype='string'), - 'question': Value(dtype='string'), - 'answers': {'text': List(feature=Value(dtype='string'), length=-1), - 'answer_start': List(feature=Value(dtype='int32'), length=-1)}} +{'id': Value('string'), + 'title': Value('string'), + 'context': Value('string'), + 'question': Value('string'), + 'answers': {'text': List(Value('string')), + 'answer_start': List(Value('int32'))}} ``` The `answers` field is constructed using the dict of features because and contains two subfields, `text` and `answer_start`, which are lists of `string` and `int32`, respectively. diff --git a/docs/source/load_hub.mdx b/docs/source/load_hub.mdx index d557cbc526c..49777e763a4 100644 --- a/docs/source/load_hub.mdx +++ b/docs/source/load_hub.mdx @@ -21,7 +21,7 @@ Movie Review Dataset. 
This is a dataset of containing 5,331 positive and 5,331 n # Inspect dataset features >>> ds_builder.info.features {'label': ClassLabel(names=['neg', 'pos']), - 'text': Value(dtype='string')} + 'text': Value('string')} ``` If you're happy with the dataset, then load it with [`load_dataset`]: diff --git a/docs/source/loading.mdx b/docs/source/loading.mdx index cc1710cb8b9..23cd9aa5de0 100644 --- a/docs/source/loading.mdx +++ b/docs/source/loading.mdx @@ -417,6 +417,6 @@ Now when you look at your dataset features, you can see it uses the custom label ```py >>> dataset['train'].features -{'text': Value(dtype='string'), +{'text': Value('string'), 'label': ClassLabel(names=['sadness', 'joy', 'love', 'anger', 'fear', 'surprise'])} ``` diff --git a/docs/source/process.mdx b/docs/source/process.mdx index ec86f41dab2..39b9fd94077 100644 --- a/docs/source/process.mdx +++ b/docs/source/process.mdx @@ -223,10 +223,10 @@ The [`~Dataset.cast`] function transforms the feature type of one or more column ```py >>> dataset.features -{'sentence1': Value(dtype='string'), -'sentence2': Value(dtype='string'), +{'sentence1': Value('string'), +'sentence2': Value('string'), 'label': ClassLabel(names=['not_equivalent', 'equivalent']), -'idx': Value(dtype='int32')} +'idx': Value('int32')} >>> from datasets import ClassLabel, Value >>> new_features = dataset.features.copy() @@ -234,10 +234,10 @@ The [`~Dataset.cast`] function transforms the feature type of one or more column >>> new_features["idx"] = Value("int64") >>> dataset = dataset.cast(new_features) >>> dataset.features -{'sentence1': Value(dtype='string'), -'sentence2': Value(dtype='string'), +{'sentence1': Value('string'), +'sentence2': Value('string'), 'label': ClassLabel(names=['negative', 'positive']), -'idx': Value(dtype='int64')} +'idx': Value('int64')} ``` @@ -265,12 +265,12 @@ Sometimes a column can be a nested structure of several types. 
Take a look at th >>> from datasets import load_dataset >>> dataset = load_dataset("rajpurkar/squad", split="train") >>> dataset.features -{'id': Value(dtype='string'), - 'title': Value(dtype='string'), - 'context': Value(dtype='string'), - 'question': Value(dtype='string'), - 'answers': {'text': List(feature=Value(dtype='string'), length=-1), - 'answer_start': List(feature=Value(dtype='int32'), length=-1)}} +{'id': Value('string'), + 'title': Value('string'), + 'context': Value('string'), + 'question': Value('string'), + 'answers': {'text': List(Value('string')), + 'answer_start': List(Value('int32'))}} ``` The `answers` field contains two subfields: `text` and `answer_start`. Use the [`~Dataset.flatten`] function to extract the subfields into their own separate columns: diff --git a/docs/source/stream.mdx b/docs/source/stream.mdx index a0f6a596e91..914dffd69f6 100644 --- a/docs/source/stream.mdx +++ b/docs/source/stream.mdx @@ -241,10 +241,10 @@ When you need to remove one or more columns, give [`IterableDataset.remove_colum >>> from datasets import load_dataset >>> dataset = load_dataset('nyu-mll/glue', 'mrpc', split='train', streaming=True) >>> dataset.features -{'sentence1': Value(dtype='string'), -'sentence2': Value(dtype='string'), +{'sentence1': Value('string'), +'sentence2': Value('string'), 'label': ClassLabel(names=['not_equivalent', 'equivalent']), -'idx': Value(dtype='int32')} +'idx': Value('int32')} >>> from datasets import ClassLabel, Value >>> new_features = dataset.features.copy() @@ -252,10 +252,10 @@ When you need to remove one or more columns, give [`IterableDataset.remove_colum >>> new_features["idx"] = Value('int64') >>> dataset = dataset.cast(new_features) >>> dataset.features -{'sentence1': Value(dtype='string'), -'sentence2': Value(dtype='string'), +{'sentence1': Value('string'), +'sentence2': Value('string'), 'label': ClassLabel(names=['negative', 'positive']), -'idx': Value(dtype='int64')} +'idx': Value('int64')} ``` diff --git 
a/src/datasets/arrow_dataset.py b/src/datasets/arrow_dataset.py index cfdf6f2a444..e499fb6f9ba 100644 --- a/src/datasets/arrow_dataset.py +++ b/src/datasets/arrow_dataset.py @@ -1957,14 +1957,14 @@ def class_encode_column(self, column: str, include_nulls: bool = False) -> "Data >>> from datasets import load_dataset >>> ds = load_dataset("boolq", split="validation") >>> ds.features - {'answer': Value(dtype='bool'), - 'passage': Value(dtype='string'), - 'question': Value(dtype='string')} + {'answer': Value('bool'), + 'passage': Value('string'), + 'question': Value('string')} >>> ds = ds.class_encode_column('answer') >>> ds.features {'answer': ClassLabel(num_classes=2, names=['False', 'True']), - 'passage': Value(dtype='string'), - 'question': Value(dtype='string')} + 'passage': Value('string'), + 'question': Value('string')} ``` """ # Sanity checks @@ -2035,12 +2035,12 @@ def flatten(self, new_fingerprint: Optional[str] = None, max_depth=16) -> "Datas >>> from datasets import load_dataset >>> ds = load_dataset("rajpurkar/squad", split="train") >>> ds.features - {'id': Value(dtype='string'), - 'title': Value(dtype='string'), - 'context': Value(dtype='string'), - 'question': Value(dtype='string'), - 'answers': {'text': List(feature=Value(dtype='string'), length=-1), - 'answer_start': List(feature=Value(dtype='int32'), length=-1)}} + {'id': Value('string'), + 'title': Value('string'), + 'context': Value('string'), + 'question': Value('string'), + 'answers': {'text': List(Value('string')), + 'answer_start': List(Value('int32'))}} >>> ds.flatten() Dataset({ features: ['id', 'title', 'context', 'question', 'answers.text', 'answers.answer_start'], @@ -2109,14 +2109,14 @@ def cast( >>> ds = load_dataset("cornell-movie-review-data/rotten_tomatoes", split="validation") >>> ds.features {'label': ClassLabel(names=['neg', 'pos']), - 'text': Value(dtype='string')} + 'text': Value('string')} >>> new_features = ds.features.copy() >>> new_features['label'] = ClassLabel(names=['bad', 
'good']) >>> new_features['text'] = Value('large_string') >>> ds = ds.cast(new_features) >>> ds.features {'label': ClassLabel(names=['bad', 'good']), - 'text': Value(dtype='large_string')} + 'text': Value('large_string')} ``` """ if sorted(features) != sorted(self._data.column_names): @@ -2168,11 +2168,11 @@ def cast_column(self, column: str, feature: FeatureType, new_fingerprint: Option >>> ds = load_dataset("cornell-movie-review-data/rotten_tomatoes", split="validation") >>> ds.features {'label': ClassLabel(names=['neg', 'pos']), - 'text': Value(dtype='string')} + 'text': Value('string')} >>> ds = ds.cast_column('label', ClassLabel(names=['bad', 'good'])) >>> ds.features {'label': ClassLabel(names=['bad', 'good']), - 'text': Value(dtype='string')} + 'text': Value('string')} ``` """ feature = _fix_for_backward_compatible_features(feature) diff --git a/src/datasets/builder.py b/src/datasets/builder.py index 118f77a354a..fab71d31059 100644 --- a/src/datasets/builder.py +++ b/src/datasets/builder.py @@ -513,7 +513,7 @@ def get_all_exported_dataset_infos(cls) -> DatasetInfosDict: >>> from datasets import load_dataset_builder >>> ds_builder = load_dataset_builder('vivos') >>> ds_builder.get_all_exported_dataset_infos() - {'default': DatasetInfo(description='', citation='', homepage='', license='', features={'speaker_id': Value(dtype='string'), 'path': Value(dtype='string'), 'audio': Audio(sampling_rate=16000, mono=True, decode=True, id=None), 'sentence': Value(dtype='string')}, post_processed=None, supervised_keys=None, builder_name=None, dataset_name=None, config_name='default', version=None, splits={'train': SplitInfo(name='train', num_bytes=1722002133, num_examples=11660, shard_lengths=None, dataset_name=None), 'test': SplitInfo(name='test', num_bytes=86120227, num_examples=760, shard_lengths=None, dataset_name=None)}, download_checksums=None, download_size=1475540500, post_processing_size=None, dataset_size=1808122360, size_in_bytes=None)} + {'default': 
DatasetInfo(description='', citation='', homepage='', license='', features={'speaker_id': Value('string'), 'path': Value('string'), 'audio': Audio(sampling_rate=16000, mono=True, decode=True, id=None), 'sentence': Value('string')}, post_processed=None, supervised_keys=None, builder_name=None, dataset_name=None, config_name='default', version=None, splits={'train': SplitInfo(name='train', num_bytes=1722002133, num_examples=11660, shard_lengths=None, dataset_name=None), 'test': SplitInfo(name='test', num_bytes=86120227, num_examples=760, shard_lengths=None, dataset_name=None)}, download_checksums=None, download_size=1475540500, post_processing_size=None, dataset_size=1808122360, size_in_bytes=None)} ``` """ return DatasetInfosDict.from_directory(cls.get_imported_module_dir()) @@ -527,7 +527,7 @@ def get_exported_dataset_info(self) -> DatasetInfo: >>> from datasets import load_dataset_builder >>> ds_builder = load_dataset_builder('cornell-movie-review-data/rotten_tomatoes') >>> ds_builder.get_exported_dataset_info() - DatasetInfo(description='', citation='', homepage='', license='', features={'speaker_id': Value(dtype='string'), 'path': Value(dtype='string'), 'audio': Audio(sampling_rate=16000, mono=True, decode=True, id=None), 'sentence': Value(dtype='string')}, post_processed=None, supervised_keys=None, builder_name=None, dataset_name=None, config_name='default', version=None, splits={'train': SplitInfo(name='train', num_bytes=1722002133, num_examples=11660, shard_lengths=None, dataset_name=None), 'test': SplitInfo(name='test', num_bytes=86120227, num_examples=760, shard_lengths=None, dataset_name=None)}, download_checksums=None, download_size=1475540500, post_processing_size=None, dataset_size=1808122360, size_in_bytes=None) + DatasetInfo(description='', citation='', homepage='', license='', features={'speaker_id': Value('string'), 'path': Value('string'), 'audio': Audio(sampling_rate=16000, mono=True, decode=True, id=None), 'sentence': Value('string')}, 
post_processed=None, supervised_keys=None, builder_name=None, dataset_name=None, config_name='default', version=None, splits={'train': SplitInfo(name='train', num_bytes=1722002133, num_examples=11660, shard_lengths=None, dataset_name=None), 'test': SplitInfo(name='test', num_bytes=86120227, num_examples=760, shard_lengths=None, dataset_name=None)}, download_checksums=None, download_size=1475540500, post_processing_size=None, dataset_size=1808122360, size_in_bytes=None) ``` """ return self.get_all_exported_dataset_infos().get(self.config.name, DatasetInfo()) diff --git a/src/datasets/dataset_dict.py b/src/datasets/dataset_dict.py index 9d8c67ae0e2..5f6248a498f 100644 --- a/src/datasets/dataset_dict.py +++ b/src/datasets/dataset_dict.py @@ -201,12 +201,12 @@ def flatten(self, max_depth=16) -> "DatasetDict": >>> from datasets import load_dataset >>> ds = load_dataset("rajpurkar/squad") >>> ds["train"].features - {'id': Value(dtype='string'), - 'title': Value(dtype='string'), - 'context': Value(dtype='string'), - 'question': Value(dtype='string'), - 'answers.text': List(feature=Value(dtype='string'), length=-1), - 'answers.answer_start': List(feature=Value(dtype='int32'), length=-1)} + {'id': Value('string'), + 'title': Value('string'), + 'context': Value('string'), + 'question': Value('string'), + 'answers.text': List(Value('string')), + 'answers.answer_start': List(Value('int32'))} >>> ds.flatten() DatasetDict({ train: Dataset({ @@ -290,14 +290,14 @@ def cast(self, features: Features) -> "DatasetDict": >>> ds = load_dataset("cornell-movie-review-data/rotten_tomatoes") >>> ds["train"].features {'label': ClassLabel(names=['neg', 'pos']), - 'text': Value(dtype='string')} + 'text': Value('string')} >>> new_features = ds["train"].features.copy() >>> new_features['label'] = ClassLabel(names=['bad', 'good']) >>> new_features['text'] = Value('large_string') >>> ds = ds.cast(new_features) >>> ds["train"].features {'label': ClassLabel(names=['bad', 'good']), - 'text': 
Value(dtype='large_string')} + 'text': Value('large_string')} ``` """ self._check_values_type() @@ -322,11 +322,11 @@ def cast_column(self, column: str, feature) -> "DatasetDict": >>> ds = load_dataset("cornell-movie-review-data/rotten_tomatoes") >>> ds["train"].features {'label': ClassLabel(names=['neg', 'pos']), - 'text': Value(dtype='string')} + 'text': Value('string')} >>> ds = ds.cast_column('label', ClassLabel(names=['bad', 'good'])) >>> ds["train"].features {'label': ClassLabel(names=['bad', 'good']), - 'text': Value(dtype='string')} + 'text': Value('string')} ``` """ self._check_values_type() @@ -513,14 +513,14 @@ def class_encode_column(self, column: str, include_nulls: bool = False) -> "Data >>> from datasets import load_dataset >>> ds = load_dataset("boolq") >>> ds["train"].features - {'answer': Value(dtype='bool'), - 'passage': Value(dtype='string'), - 'question': Value(dtype='string')} + {'answer': Value('bool'), + 'passage': Value('string'), + 'question': Value('string')} >>> ds = ds.class_encode_column("answer") >>> ds["train"].features {'answer': ClassLabel(num_classes=2, names=['False', 'True']), - 'passage': Value(dtype='string'), - 'question': Value(dtype='string')} + 'passage': Value('string'), + 'question': Value('string')} ``` """ self._check_values_type() @@ -2381,11 +2381,11 @@ def cast_column(self, column: str, feature: FeatureType) -> "IterableDatasetDict >>> ds = load_dataset("cornell-movie-review-data/rotten_tomatoes", streaming=True) >>> ds["train"].features {'label': ClassLabel(names=['neg', 'pos']), - 'text': Value(dtype='string')} + 'text': Value('string')} >>> ds = ds.cast_column('label', ClassLabel(names=['bad', 'good'])) >>> ds["train"].features {'label': ClassLabel(names=['bad', 'good']), - 'text': Value(dtype='string')} + 'text': Value('string')} ``` """ return IterableDatasetDict( @@ -2417,14 +2417,14 @@ def cast( >>> ds = load_dataset("cornell-movie-review-data/rotten_tomatoes", streaming=True) >>> ds["train"].features 
{'label': ClassLabel(names=['neg', 'pos']), - 'text': Value(dtype='string')} + 'text': Value('string')} >>> new_features = ds["train"].features.copy() >>> new_features['label'] = ClassLabel(names=['bad', 'good']) >>> new_features['text'] = Value('large_string') >>> ds = ds.cast(new_features) >>> ds["train"].features {'label': ClassLabel(names=['bad', 'good']), - 'text': Value(dtype='large_string')} + 'text': Value('large_string')} ``` """ return IterableDatasetDict({k: dataset.cast(features=features) for k, dataset in self.items()}) diff --git a/src/datasets/features/features.py b/src/datasets/features/features.py index ca5f51abdf9..7a94a0d8c73 100644 --- a/src/datasets/features/features.py +++ b/src/datasets/features/features.py @@ -517,9 +517,9 @@ class Value: ```py >>> from datasets import Features - >>> features = Features({'stars': Value(dtype='int32')}) + >>> features = Features({'stars': Value('int32')}) >>> features - {'stars': Value(dtype='int32')} + {'stars': Value('int32')} ``` """ @@ -1160,7 +1160,7 @@ def _load_names_from_file(names_filepath): return [name.strip() for name in f.read().split("\n") if name.strip()] # Filter empty names -def Sequence(feature, length=-1): +def Sequence(feature, length=-1): """ A `Sequence` is a utility that automatically converts internal dictionary feature into a dictionary of lists. 
This behavior is implemented to have a compatibility layer with the TensorFlow Datasets library but may be @@ -1458,10 +1458,10 @@ def generate_from_dict(obj: Any): if class_type == LargeList: feature = obj.pop("feature") - return LargeList(feature=generate_from_dict(feature), **obj) + return LargeList(generate_from_dict(feature), **obj) if class_type == List: feature = obj.pop("feature") - return List(feature=generate_from_dict(feature), **obj) + return List(generate_from_dict(feature), **obj) if class_type == Sequence: # backward compatibility, this translates to a List or a dict feature = obj.pop("feature") return Sequence(feature=generate_from_dict(feature), **obj) @@ -1483,11 +1483,11 @@ def generate_from_arrow_type(pa_type: pa.DataType) -> FeatureType: if isinstance(pa_type, pa.StructType): return {field.name: generate_from_arrow_type(field.type) for field in pa_type} elif isinstance(pa_type, pa.FixedSizeListType): - return List(feature=generate_from_arrow_type(pa_type.value_type), length=pa_type.list_size) + return List(generate_from_arrow_type(pa_type.value_type), length=pa_type.list_size) elif isinstance(pa_type, pa.ListType): - return List(feature=generate_from_arrow_type(pa_type.value_type)) + return List(generate_from_arrow_type(pa_type.value_type)) elif isinstance(pa_type, pa.LargeListType): - return LargeList(feature=generate_from_arrow_type(pa_type.value_type)) + return LargeList(generate_from_arrow_type(pa_type.value_type)) elif isinstance(pa_type, _ArrayXDExtensionType): array_feature = [None, None, Array2D, Array3D, Array4D, Array5D][pa_type.ndims] return array_feature(shape=pa_type.shape, dtype=pa_type.value_type) @@ -1849,7 +1849,7 @@ def from_dict(cls, dic) -> "Features": Example:: >>> Features.from_dict({'_type': {'dtype': 'string', 'id': None, '_type': 'Value'}}) - {'_type': Value(dtype='string')} + {'_type': Value('string')} """ obj = generate_from_dict(dic) return cls(**obj) @@ -2147,7 +2147,7 @@ def copy(self) -> "Features": >>> 
copy_of_features = ds.features.copy() >>> copy_of_features {'label': ClassLabel(names=['neg', 'pos']), - 'text': Value(dtype='string')} + 'text': Value('string')} ``` """ return copy.deepcopy(self) @@ -2175,7 +2175,7 @@ def reorder_fields_as(self, other: "Features") -> "Features": >>> assert f1.type != f2.type >>> # re-ordering keeps the base structure (here List is defined at the root level), but makes the fields order match >>> f1.reorder_fields_as(f2) - {'root': List(feature={'b': Value(dtype='string'), 'a': Value(dtype='string')}, length=-1)} + {'root': List({'b': Value('string'), 'a': Value('string')})} >>> assert f1.reorder_fields_as(f2).type == f2.type """ @@ -2223,12 +2223,12 @@ def flatten(self, max_depth=16) -> "Features": >>> from datasets import load_dataset >>> ds = load_dataset("rajpurkar/squad", split="train") >>> ds.features.flatten() - {'answers.answer_start': List(feature=Value(dtype='int32'), length=-1, id=None), - 'answers.text': List(feature=Value(dtype='string'), length=-1, id=None), - 'context': Value(dtype='string'), - 'id': Value(dtype='string'), - 'question': Value(dtype='string'), - 'title': Value(dtype='string')} + {'answers.answer_start': List(Value('int32')), + 'answers.text': List(Value('string')), + 'context': Value('string'), + 'id': Value('string'), + 'question': Value('string'), + 'title': Value('string')} ``` """ for depth in range(1, max_depth): diff --git a/src/datasets/iterable_dataset.py b/src/datasets/iterable_dataset.py index a5797bfa54c..1550b8f2db2 100644 --- a/src/datasets/iterable_dataset.py +++ b/src/datasets/iterable_dataset.py @@ -3233,19 +3233,19 @@ def cast_column(self, column: str, feature: FeatureType) -> "IterableDataset": >>> ds = load_dataset("PolyAI/minds14", name="en-US", split="train", streaming=True) >>> ds.features {'audio': Audio(sampling_rate=8000, mono=True, decode=True, id=None), - 'english_transcription': Value(dtype='string'), + 'english_transcription': Value('string'), 
'intent_class': ClassLabel(num_classes=14, names=['abroad', 'address', 'app_error', 'atm_limit', 'balance', 'business_loan', 'card_issues', 'cash_deposit', 'direct_debit', 'freeze', 'high_value_payment', 'joint_account', 'latest_transactions', 'pay_bill']), 'lang_id': ClassLabel(num_classes=14, names=['cs-CZ', 'de-DE', 'en-AU', 'en-GB', 'en-US', 'es-ES', 'fr-FR', 'it-IT', 'ko-KR', 'nl-NL', 'pl-PL', 'pt-PT', 'ru-RU', 'zh-CN']), - 'path': Value(dtype='string'), - 'transcription': Value(dtype='string')} + 'path': Value('string'), + 'transcription': Value('string')} >>> ds = ds.cast_column("audio", Audio(sampling_rate=16000)) >>> ds.features {'audio': Audio(sampling_rate=16000, mono=True, decode=True, id=None), - 'english_transcription': Value(dtype='string'), + 'english_transcription': Value('string'), 'intent_class': ClassLabel(num_classes=14, names=['abroad', 'address', 'app_error', 'atm_limit', 'balance', 'business_loan', 'card_issues', 'cash_deposit', 'direct_debit', 'freeze', 'high_value_payment', 'joint_account', 'latest_transactions', 'pay_bill']), 'lang_id': ClassLabel(num_classes=14, names=['cs-CZ', 'de-DE', 'en-AU', 'en-GB', 'en-US', 'es-ES', 'fr-FR', 'it-IT', 'ko-KR', 'nl-NL', 'pl-PL', 'pt-PT', 'ru-RU', 'zh-CN']), - 'path': Value(dtype='string'), - 'transcription': Value(dtype='string')} + 'path': Value('string'), + 'transcription': Value('string')} ``` """ feature = _fix_for_backward_compatible_features(feature) @@ -3285,14 +3285,14 @@ def cast( >>> ds = load_dataset("cornell-movie-review-data/rotten_tomatoes", split="train", streaming=True) >>> ds.features {'label': ClassLabel(names=['neg', 'pos']), - 'text': Value(dtype='string')} + 'text': Value('string')} >>> new_features = ds.features.copy() >>> new_features["label"] = ClassLabel(names=["bad", "good"]) >>> new_features["text"] = Value("large_string") >>> ds = ds.cast(new_features) >>> ds.features {'label': ClassLabel(names=['bad', 'good']), - 'text': Value(dtype='large_string')} + 'text': 
Value('large_string')} ``` """ features = _fix_for_backward_compatible_features(features) @@ -3348,7 +3348,7 @@ def decode(self, enable: bool = True, num_threads: int = 0) -> "IterableDataset" >>> ds = ds.decode(False) >>> ds.features {'image': Image(mode=None, decode=False, id=None), - 'text': Value(dtype='string')} + 'text': Value('string')} >>> next(iter(ds)) { 'image': { diff --git a/src/datasets/load.py b/src/datasets/load.py index c540e511473..bc2b0e679b6 100644 --- a/src/datasets/load.py +++ b/src/datasets/load.py @@ -1117,7 +1117,7 @@ def load_dataset_builder( >>> ds_builder = load_dataset_builder('cornell-movie-review-data/rotten_tomatoes') >>> ds_builder.info.features {'label': ClassLabel(names=['neg', 'pos']), - 'text': Value(dtype='string')} + 'text': Value('string')} ``` """ download_mode = DownloadMode(download_mode or DownloadMode.REUSE_DATASET_IF_EXISTS) diff --git a/tests/test_arrow_dataset.py b/tests/test_arrow_dataset.py index 317bf6273b2..4289a99eee1 100644 --- a/tests/test_arrow_dataset.py +++ b/tests/test_arrow_dataset.py @@ -149,7 +149,7 @@ def _create_dummy_dataset( { "col_1": Array2D(shape=(2, 2), dtype="bool"), "col_2": Array3D(shape=(2, 2, 2), dtype="string"), - "col_3": List(feature=Value("int64")), + "col_3": List(Value("int64")), } ) dset = Dataset.from_dict(data, features=features) @@ -205,7 +205,7 @@ def test_dummy_dataset(self, in_memory): { "col_1": Array2D(shape=(2, 2), dtype="bool"), "col_2": Array3D(shape=(2, 2, 2), dtype="string"), - "col_3": List(feature=Value("int64")), + "col_3": List(Value("int64")), } ), )