From 484a51382d8f7a0f14c629be07709d57e6a0bba2 Mon Sep 17 00:00:00 2001 From: Quentin Lhoest Date: Mon, 23 Jun 2025 22:30:52 +0200 Subject: [PATCH 1/9] no more sequence --- src/datasets/arrow_dataset.py | 15 +- src/datasets/dataset_dict.py | 11 +- src/datasets/features/__init__.py | 3 +- src/datasets/features/features.py | 212 ++++++------------ src/datasets/features/translation.py | 6 +- .../folder_based_builder.py | 8 +- src/datasets/table.py | 68 ++---- tests/commands/test_test.py | 10 +- tests/features/test_array_xd.py | 4 +- tests/features/test_audio.py | 6 +- tests/features/test_features.py | 168 +++++++------- tests/features/test_image.py | 6 +- tests/fixtures/files.py | 14 +- tests/io/test_parquet.py | 4 +- tests/packaged_modules/test_webdataset.py | 4 +- tests/test_arrow_dataset.py | 90 ++++---- tests/test_dataset_dict.py | 8 +- tests/test_dataset_list.py | 4 +- tests/test_table.py | 80 +++---- 19 files changed, 294 insertions(+), 427 deletions(-) diff --git a/src/datasets/arrow_dataset.py b/src/datasets/arrow_dataset.py index 37170a51bfa..d0169ea5b3b 100644 --- a/src/datasets/arrow_dataset.py +++ b/src/datasets/arrow_dataset.py @@ -76,7 +76,7 @@ from .arrow_writer import ArrowWriter, OptimizedTypedSequence from .data_files import sanitize_patterns from .download.streaming_download_manager import xgetsize -from .features import Audio, ClassLabel, Features, Image, Sequence, Value, Video +from .features import Audio, ClassLabel, Features, Image, List, Value, Video from .features.features import ( FeatureType, _align_features, @@ -2028,11 +2028,12 @@ def flatten(self, new_fingerprint: Optional[str] = None, max_depth=16) -> "Datas >>> from datasets import load_dataset >>> ds = load_dataset("rajpurkar/squad", split="train") >>> ds.features - {'answers': Sequence(feature={'text': Value(dtype='string', id=None), 'answer_start': Value(dtype='int32', id=None)}, length=-1, id=None), - 'context': Value(dtype='string', id=None), - 'id': Value(dtype='string', id=None), 
- 'question': Value(dtype='string', id=None), - 'title': Value(dtype='string', id=None)} + {'id': Value(dtype='string'), + 'title': Value(dtype='string'), + 'context': Value(dtype='string'), + 'question': Value(dtype='string'), + 'answers': {'text': List(feature=Value(dtype='string'), length=-1), + 'answer_start': List(feature=Value(dtype='int32'), length=-1)}} >>> ds.flatten() Dataset({ features: ['id', 'title', 'context', 'question', 'answers.text', 'answers.answer_start'], @@ -6350,7 +6351,7 @@ def process_label_ids(batch): features[label_column] = ( ClassLabel(num_classes=len(label_names), names=label_names) if isinstance(label_feature, ClassLabel) - else Sequence(ClassLabel(num_classes=len(label_names), names=label_names)) + else List(ClassLabel(num_classes=len(label_names), names=label_names)) ) return self.map(process_label_ids, features=features, batched=True, desc="Aligning the labels") diff --git a/src/datasets/dataset_dict.py b/src/datasets/dataset_dict.py index 8f259c93c72..4d79f95620e 100644 --- a/src/datasets/dataset_dict.py +++ b/src/datasets/dataset_dict.py @@ -201,11 +201,12 @@ def flatten(self, max_depth=16) -> "DatasetDict": >>> from datasets import load_dataset >>> ds = load_dataset("rajpurkar/squad") >>> ds["train"].features - {'answers': Sequence(feature={'text': Value(dtype='string', id=None), 'answer_start': Value(dtype='int32', id=None)}, length=-1, id=None), - 'context': Value(dtype='string', id=None), - 'id': Value(dtype='string', id=None), - 'question': Value(dtype='string', id=None), - 'title': Value(dtype='string', id=None)} + {'id': Value(dtype='string'), + 'title': Value(dtype='string'), + 'context': Value(dtype='string'), + 'question': Value(dtype='string'), + 'answers.text': List(feature=Value(dtype='string'), length=-1), + 'answers.answer_start': List(feature=Value(dtype='int32'), length=-1)} >>> ds.flatten() DatasetDict({ train: Dataset({ diff --git a/src/datasets/features/__init__.py b/src/datasets/features/__init__.py index 
95bb1cf1080..36133ce5e5a 100644 --- a/src/datasets/features/__init__.py +++ b/src/datasets/features/__init__.py @@ -7,6 +7,7 @@ "ClassLabel", "Features", "LargeList", + "List", "Sequence", "Value", "Image", @@ -16,7 +17,7 @@ "Pdf", ] from .audio import Audio -from .features import Array2D, Array3D, Array4D, Array5D, ClassLabel, Features, LargeList, Sequence, Value +from .features import Array2D, Array3D, Array4D, Array5D, ClassLabel, Features, LargeList, List, Sequence, Value from .image import Image from .pdf import Pdf from .translation import Translation, TranslationVariableLanguages diff --git a/src/datasets/features/features.py b/src/datasets/features/features.py index e69947fa61b..99222d8c32a 100644 --- a/src/datasets/features/features.py +++ b/src/datasets/features/features.py @@ -1157,34 +1157,30 @@ def _load_names_from_file(names_filepath): return [name.strip() for name in f.read().split("\n") if name.strip()] # Filter empty names +def Sequence(feature, length=-1): + if isinstance(feature, dict): + return {key: List(value, length=length) for key, value in feature.items()} + else: + return List(feature, length=length) + + @dataclass -class Sequence: - """Construct a list of feature from a single type or a dict of types. - Mostly here for compatiblity with tfds. +class List: + """Feature type for large list data composed of child feature data type. + + It is backed by `pyarrow.ListType`, which uses 32-bit offsets or a fixed length. Args: feature ([`FeatureType`]): - A list of features of a single type or a dictionary of types. - length (`int`): - Length of the sequence. 
- - Example: - - ```py - >>> from datasets import Features, Sequence, Value, ClassLabel - >>> features = Features({'post': Sequence(feature={'text': Value(dtype='string'), 'upvotes': Value(dtype='int32'), 'label': ClassLabel(num_classes=2, names=['hot', 'cold'])})}) - >>> features - {'post': Sequence(feature={'text': Value(dtype='string', id=None), 'upvotes': Value(dtype='int32', id=None), 'label': ClassLabel(names=['hot', 'cold'], id=None)}, length=-1, id=None)} - ``` + Child feature data type of each item within the large list. """ feature: Any length: int = -1 id: Optional[str] = field(default=None, repr=False) # Automatically constructed - dtype: ClassVar[str] = "list" pa_type: ClassVar[Any] = None - _type: str = field(default="Sequence", init=False, repr=False) + _type: str = field(default="List", init=False, repr=False) @dataclass @@ -1214,7 +1210,7 @@ class LargeList: Translation, TranslationVariableLanguages, LargeList, - Sequence, + List, Array2D, Array3D, Array4D, @@ -1233,7 +1229,7 @@ def _check_non_null_non_empty_recursive(obj, schema: Optional[FeatureType] = Non """ if obj is None: return False - elif isinstance(obj, (list, tuple)) and (schema is None or isinstance(schema, (list, tuple, LargeList, Sequence))): + elif isinstance(obj, (list, tuple)) and (schema is None or isinstance(schema, (list, tuple, LargeList, List))): if len(obj) > 0: if schema is None: pass @@ -1273,14 +1269,9 @@ def get_nested_type(schema: FeatureType) -> pa.DataType: elif isinstance(schema, LargeList): value_type = get_nested_type(schema.feature) return pa.large_list(value_type) - elif isinstance(schema, Sequence): + elif isinstance(schema, List): value_type = get_nested_type(schema.feature) - # We allow to reverse list of dict => dict of list for compatibility with tfds - if isinstance(schema.feature, dict): - data_type = pa.struct({f.name: pa.list_(f.type, schema.length) for f in value_type}) - else: - data_type = pa.list_(value_type, schema.length) - return data_type + return 
pa.list_(value_type, schema.length) # Other objects are callable which returns their data type (ClassLabel, Array2D, Translation, Arrow datatype creation methods) return schema() @@ -1317,7 +1308,7 @@ def encode_nested_example(schema, obj, level=0): if encode_nested_example(sub_schema, first_elmt, level=level + 1) != first_elmt: return [encode_nested_example(sub_schema, o, level=level + 1) for o in obj] return list(obj) - elif isinstance(schema, LargeList): + elif isinstance(schema, (LargeList, List)): if obj is None: return None else: @@ -1329,42 +1320,6 @@ def encode_nested_example(schema, obj, level=0): if encode_nested_example(sub_schema, first_elmt, level=level + 1) != first_elmt: return [encode_nested_example(sub_schema, o, level=level + 1) for o in obj] return list(obj) - elif isinstance(schema, Sequence): - if obj is None: - return None - # We allow to reverse list of dict => dict of list for compatibility with tfds - if isinstance(schema.feature, dict): - # dict of list to fill - list_dict = {} - if isinstance(obj, (list, tuple)): - # obj is a list of dict - for k in schema.feature: - list_dict[k] = [encode_nested_example(schema.feature[k], o.get(k), level=level + 1) for o in obj] - return list_dict - else: - # obj is a single dict - for k in schema.feature: - list_dict[k] = ( - [encode_nested_example(schema.feature[k], o, level=level + 1) for o in obj[k]] - if k in obj - else None - ) - return list_dict - # schema.feature is not a dict - if isinstance(obj, str): # don't interpret a string as a list - raise ValueError(f"Got a string but expected a list instead: '{obj}'") - else: - if len(obj) > 0: - for first_elmt in obj: - if _check_non_null_non_empty_recursive(first_elmt, schema.feature): - break - # be careful when comparing tensors here - if ( - not (isinstance(first_elmt, list) or np.isscalar(first_elmt)) - or encode_nested_example(schema.feature, first_elmt, level=level + 1) != first_elmt - ): - return [encode_nested_example(schema.feature, o, 
level=level + 1) for o in obj] - return list(obj) # Object with special encoding: # ClassLabel will convert from string to int, TranslationVariableLanguages does some checks elif hasattr(schema, "encode_example"): @@ -1399,7 +1354,7 @@ def decode_nested_example(schema, obj, token_per_repo_id: Optional[dict[str, Uni if decode_nested_example(sub_schema, first_elmt) != first_elmt: return [decode_nested_example(sub_schema, o) for o in obj] return list(obj) - elif isinstance(schema, LargeList): + elif isinstance(schema, (LargeList, List)): if obj is None: return None else: @@ -1411,12 +1366,6 @@ def decode_nested_example(schema, obj, token_per_repo_id: Optional[dict[str, Uni if decode_nested_example(sub_schema, first_elmt) != first_elmt: return [decode_nested_example(sub_schema, o) for o in obj] return list(obj) - elif isinstance(schema, Sequence): - # We allow to reverse list of dict => dict of list for compatibility with tfds - if isinstance(schema.feature, dict): - return {k: decode_nested_example([schema.feature[k]], obj[k]) for k in schema.feature} - else: - return decode_nested_example([schema.feature], obj) # Object with special decoding: elif hasattr(schema, "decode_example") and getattr(schema, "decode", True): # we pass the token to read and decode files from private repositories in streaming mode @@ -1430,7 +1379,7 @@ def decode_nested_example(schema, obj, token_per_repo_id: Optional[dict[str, Uni Translation.__name__: Translation, TranslationVariableLanguages.__name__: TranslationVariableLanguages, LargeList.__name__: LargeList, - Sequence.__name__: Sequence, + List.__name__: List, Array2D.__name__: Array2D, Array3D.__name__: Array3D, Array4D.__name__: Array4D, @@ -1485,7 +1434,10 @@ def generate_from_dict(obj: Any): if class_type == LargeList: feature = obj.pop("feature") return LargeList(feature=generate_from_dict(feature), **obj) - if class_type == Sequence: + if class_type == List: + feature = obj.pop("feature") + return 
List(feature=generate_from_dict(feature), **obj) + if class_type == Sequence: # backward compatibility, this translates to a List or a dict feature = obj.pop("feature") return Sequence(feature=generate_from_dict(feature), **obj) @@ -1506,15 +1458,11 @@ def generate_from_arrow_type(pa_type: pa.DataType) -> FeatureType: if isinstance(pa_type, pa.StructType): return {field.name: generate_from_arrow_type(field.type) for field in pa_type} elif isinstance(pa_type, pa.FixedSizeListType): - return Sequence(feature=generate_from_arrow_type(pa_type.value_type), length=pa_type.list_size) + return List(feature=generate_from_arrow_type(pa_type.value_type), length=pa_type.list_size) elif isinstance(pa_type, pa.ListType): - feature = generate_from_arrow_type(pa_type.value_type) - if isinstance(feature, (dict, tuple, list)): - return [feature] - return Sequence(feature=feature) + return List(feature=generate_from_arrow_type(pa_type.value_type)) elif isinstance(pa_type, pa.LargeListType): - feature = generate_from_arrow_type(pa_type.value_type) - return LargeList(feature=feature) + return LargeList(feature=generate_from_arrow_type(pa_type.value_type)) elif isinstance(pa_type, _ArrayXDExtensionType): array_feature = [None, None, Array2D, Array3D, Array4D, Array5D][pa_type.ndims] return array_feature(shape=pa_type.shape, dtype=pa_type.value_type) @@ -1596,7 +1544,7 @@ def to_pyarrow_listarray(data: Any, pa_type: _ArrayXDExtensionType) -> pa.Array: """Convert to PyArrow ListArray. Args: - data (Any): Sequence, iterable, np.ndarray or pd.Series. + data (Any): List, iterable, np.ndarray or pd.Series. pa_type (_ArrayXDExtensionType): Any of the ArrayNDExtensionType. 
Returns: @@ -1624,8 +1572,8 @@ def _visit(feature: FeatureType, func: Callable[[FeatureType], Optional[FeatureT out = func([_visit(feature[0], func)]) elif isinstance(feature, LargeList): out = func(LargeList(_visit(feature.feature, func))) - elif isinstance(feature, Sequence): - out = func(Sequence(_visit(feature.feature, func), length=feature.length)) + elif isinstance(feature, List): + out = func(List(_visit(feature.feature, func), length=feature.length)) else: out = func(feature) return feature if out is None else out @@ -1653,19 +1601,12 @@ def _visit_with_path( Returns: `FeatureType`: the visited feature. """ - if isinstance(feature, Sequence) and isinstance(feature.feature, dict): - feature = {k: [f] for k, f in feature.feature.items()} - # ^ Sequence of dicts is special, it must be converted to a dict of lists (see https://huggingface.co/docs/datasets/v2.16.1/en/package_reference/main_classes#datasets.Features) if isinstance(feature, Features): out = func(Features({k: _visit_with_path(f, func, visit_path + [k]) for k, f in feature.items()}), visit_path) elif isinstance(feature, dict): out = func({k: _visit_with_path(f, func, visit_path + [k]) for k, f in feature.items()}, visit_path) - elif isinstance(feature, (list, tuple)): - out = func([_visit_with_path(feature[0], func, visit_path + [0])], visit_path) - elif isinstance(feature, Sequence): - out = func( - Sequence(_visit_with_path(feature.feature, func, visit_path + [0]), length=feature.length), visit_path - ) + elif isinstance(feature, List): + out = func(List(_visit_with_path(feature.feature, func, visit_path + [0]), length=feature.length), visit_path) elif isinstance(feature, LargeList): out = func(LargeList(_visit_with_path(feature.feature, func, visit_path + [0])), visit_path) else: @@ -1689,7 +1630,7 @@ def require_decoding(feature: FeatureType, ignore_decode_attribute: bool = False return require_decoding(feature[0]) elif isinstance(feature, LargeList): return require_decoding(feature.feature) - 
elif isinstance(feature, Sequence): + elif isinstance(feature, List): return require_decoding(feature.feature) else: return hasattr(feature, "decode_example") and ( @@ -1707,11 +1648,9 @@ def require_storage_cast(feature: FeatureType) -> bool: """ if isinstance(feature, dict): return any(require_storage_cast(f) for f in feature.values()) - elif isinstance(feature, (list, tuple)): - return require_storage_cast(feature[0]) elif isinstance(feature, LargeList): return require_storage_cast(feature.feature) - elif isinstance(feature, Sequence): + elif isinstance(feature, List): return require_storage_cast(feature.feature) else: return hasattr(feature, "cast_storage") @@ -1727,11 +1666,9 @@ def require_storage_embed(feature: FeatureType) -> bool: """ if isinstance(feature, dict): return any(require_storage_cast(f) for f in feature.values()) - elif isinstance(feature, (list, tuple)): - return require_storage_cast(feature[0]) elif isinstance(feature, LargeList): return require_storage_cast(feature.feature) - elif isinstance(feature, Sequence): + elif isinstance(feature, List): return require_storage_cast(feature.feature) else: return hasattr(feature, "embed_storage") @@ -1771,14 +1708,14 @@ class Features(dict): will be stored as integers in the dataset. - Python `dict` specifies a composite feature containing a mapping of sub-fields to sub-features. It's possible to have nested fields of nested fields in an arbitrary manner. - - Python `list`, [`LargeList`] or [`Sequence`] specifies a composite feature containing a sequence of + - [`List`] or [`LargeList`] specifies a composite feature containing a sequence of sub-features, all of the same feature type. - A [`Sequence`] with an internal dictionary feature will be automatically converted into a dictionary of + A `Sequence` is deprecated and automatically converts internal dictionary feature into a dictionary of lists. 
This behavior is implemented to have a compatibility layer with the TensorFlow Datasets library but may be - un-wanted in some cases. If you don't want this behavior, you can use a Python `list` or a [`LargeList`] + un-wanted in some cases. If you don't want this behavior, you can use a [`List`] or a [`LargeList`] instead of the [`Sequence`]. @@ -1944,9 +1881,9 @@ def to_yaml_inner(obj: Union[dict, list]) -> dict: if _type == "LargeList": _feature = obj.pop("feature") return simplify({"large_list": to_yaml_inner(_feature), **obj}) - elif _type == "Sequence": + elif _type == "List": _feature = obj.pop("feature") - return simplify({"sequence": to_yaml_inner(_feature), **obj}) + return simplify({"list": to_yaml_inner(_feature), **obj}) elif _type == "Value": return obj elif _type and not obj: @@ -2013,13 +1950,20 @@ def from_yaml_inner(obj: Union[dict, list]) -> Union[dict, list]: return {} _type = next(iter(obj)) if _type == "large_list": - _feature = unsimplify(obj).pop(_type) - return {"feature": from_yaml_inner(_feature), **obj, "_type": "LargeList"} + _feature = from_yaml_inner(unsimplify(obj).pop(_type)) + return {"feature": _feature, **obj, "_type": "LargeList"} if _type == "sequence": - _feature = unsimplify(obj).pop(_type) - return {"feature": from_yaml_inner(_feature), **obj, "_type": "Sequence"} + _feature = from_yaml_inner(unsimplify(obj).pop(_type)) + if isinstance(_feature, dict): + return { + name: {"feature": _subfeature, **obj, "_type": "List"} + for name, _subfeature in _feature.items() + } + else: + return {"feature": _feature, **obj, "_type": "List"} if _type == "list": - return [from_yaml_inner(unsimplify(obj)[_type])] + _feature = from_yaml_inner(unsimplify(obj).pop(_type)) + return {"feature": _feature, **obj, "_type": "List"} if _type == "struct": return from_yaml_inner(obj["struct"]) elif _type == "dtype": @@ -2203,37 +2147,20 @@ def reorder_fields_as(self, other: "Features") -> "Features": Example:: - >>> from datasets import Features, 
Sequence, Value + >>> from datasets import Features, List, Value >>> # let's say we have two features with a different order of nested fields (for a and b for example) - >>> f1 = Features({"root": Sequence({"a": Value("string"), "b": Value("string")})}) - >>> f2 = Features({"root": {"b": Sequence(Value("string")), "a": Sequence(Value("string"))}}) + >>> f1 = Features({"root": {"a": Value("string"), "b": Value("string")}}) + >>> f2 = Features({"root": {"b": Value("string"), "a": Value("string")}}) >>> assert f1.type != f2.type - >>> # re-ordering keeps the base structure (here Sequence is defined at the root level), but makes the fields order match + >>> # re-ordering keeps the base structure (here List is defined at the root level), but makes the fields order match >>> f1.reorder_fields_as(f2) - {'root': Sequence(feature={'b': Value(dtype='string', id=None), 'a': Value(dtype='string', id=None)}, length=-1, id=None)} + {'root': List(feature={'b': Value(dtype='string'), 'a': Value(dtype='string')}, length=-1)} >>> assert f1.reorder_fields_as(f2).type == f2.type """ def recursive_reorder(source, target, stack=""): stack_position = " at " + stack[1:] if stack else "" - if isinstance(target, Sequence): - target = target.feature - if isinstance(target, dict): - target = {k: [v] for k, v in target.items()} - else: - target = [target] - if isinstance(source, Sequence): - sequence_kwargs = vars(source).copy() - source = sequence_kwargs.pop("feature") - if isinstance(source, dict): - source = {k: [v] for k, v in source.items()} - reordered = recursive_reorder(source, target, stack) - return Sequence({k: v[0] for k, v in reordered.items()}, **sequence_kwargs) - else: - source = [source] - reordered = recursive_reorder(source, target, stack) - return Sequence(reordered[0], **sequence_kwargs) - elif isinstance(source, dict): + if isinstance(source, dict): if not isinstance(target, dict): raise ValueError(f"Type mismatch: between {source} and {target}" + stack_position) if 
sorted(source) != sorted(target): @@ -2244,16 +2171,14 @@ def recursive_reorder(source, target, stack=""): ) raise ValueError(message) return {key: recursive_reorder(source[key], target[key], stack + f".{key}") for key in target} - elif isinstance(source, list): - if not isinstance(target, list): + elif isinstance(source, List): + if not isinstance(target, List): raise ValueError(f"Type mismatch: between {source} and {target}" + stack_position) - if len(source) != len(target): - raise ValueError(f"Length mismatch: between {source} and {target}" + stack_position) - return [recursive_reorder(source[i], target[i], stack + ".") for i in range(len(target))] + return List(recursive_reorder(source.feature, target.feature, stack + "."), length=source.length) elif isinstance(source, LargeList): if not isinstance(target, LargeList): raise ValueError(f"Type mismatch: between {source} and {target}" + stack_position) - return LargeList(recursive_reorder(source.feature, target.feature, stack)) + return LargeList(recursive_reorder(source.feature, target.feature, stack + ".")) else: return source @@ -2277,8 +2202,8 @@ def flatten(self, max_depth=16) -> "Features": >>> from datasets import load_dataset >>> ds = load_dataset("rajpurkar/squad", split="train") >>> ds.features.flatten() - {'answers.answer_start': Sequence(feature=Value(dtype='int32', id=None), length=-1, id=None), - 'answers.text': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None), + {'answers.answer_start': List(feature=Value(dtype='int32', id=None), length=-1, id=None), + 'answers.text': List(feature=Value(dtype='string', id=None), length=-1, id=None), 'context': Value(dtype='string', id=None), 'id': Value(dtype='string', id=None), 'question': Value(dtype='string', id=None), @@ -2293,15 +2218,6 @@ def flatten(self, max_depth=16) -> "Features": no_change = False flattened.update({f"{column_name}.{k}": v for k, v in subfeature.items()}) del flattened[column_name] - elif isinstance(subfeature, 
Sequence) and isinstance(subfeature.feature, dict): - no_change = False - flattened.update( - { - f"{column_name}.{k}": Sequence(v) if not isinstance(v, dict) else [v] - for k, v in subfeature.feature.items() - } - ) - del flattened[column_name] elif hasattr(subfeature, "flatten") and subfeature.flatten() != subfeature: no_change = False flattened.update({f"{column_name}.{k}": v for k, v in subfeature.flatten().items()}) diff --git a/src/datasets/features/translation.py b/src/datasets/features/translation.py index bb91a7cfb7b..9bee3aa2c11 100644 --- a/src/datasets/features/translation.py +++ b/src/datasets/features/translation.py @@ -121,9 +121,9 @@ def encode_example(self, translation_dict): def flatten(self) -> Union["FeatureType", dict[str, "FeatureType"]]: """Flatten the TranslationVariableLanguages feature into a dictionary.""" - from .features import Sequence, Value + from .features import List, Value return { - "language": Sequence(Value("string")), - "translation": Sequence(Value("string")), + "language": List(Value("string")), + "translation": List(Value("string")), } diff --git a/src/datasets/packaged_modules/folder_based_builder/folder_based_builder.py b/src/datasets/packaged_modules/folder_based_builder/folder_based_builder.py index 50788eecff0..f1ec638ab3b 100644 --- a/src/datasets/packaged_modules/folder_based_builder/folder_based_builder.py +++ b/src/datasets/packaged_modules/folder_based_builder/folder_based_builder.py @@ -212,11 +212,11 @@ def _set_feature(feature): key = key[: -len("_file_name")] or self.BASE_COLUMN_NAME out[key] = self.BASE_FEATURE() feature_not_found = False - elif (key == "file_names" or key.endswith("_file_names")) and feature[ - key - ] == datasets.Sequence(datasets.Value("string")): + elif (key == "file_names" or key.endswith("_file_names")) and feature[key] == datasets.List( + datasets.Value("string") + ): key = key[: -len("_file_names")] or (self.BASE_COLUMN_NAME + "s") - out[key] = datasets.Sequence(self.BASE_FEATURE()) + 
out[key] = datasets.List(self.BASE_FEATURE()) feature_not_found = False elif (key == "file_names" or key.endswith("_file_names")) and feature[key] == [ datasets.Value("string") diff --git a/src/datasets/table.py b/src/datasets/table.py index 2e616a84688..8d301afea7a 100644 --- a/src/datasets/table.py +++ b/src/datasets/table.py @@ -8,7 +8,6 @@ import numpy as np import pyarrow as pa import pyarrow.compute as pc -import pyarrow.types from .utils.logging import get_logger @@ -1982,7 +1981,7 @@ def cast_array_to_feature( Returns: array (`pyarrow.Array`): the casted array """ - from .features.features import LargeList, Sequence, get_nested_type + from .features.features import LargeList, List, get_nested_type _c = partial( cast_array_to_feature, @@ -1995,12 +1994,8 @@ def cast_array_to_feature( if hasattr(feature, "cast_storage"): return feature.cast_storage(array) - elif pa.types.is_struct(array.type): - # feature must be a dict or Sequence(subfeatures_dict) - if isinstance(feature, Sequence) and isinstance(feature.feature, dict): - sequence_kwargs = vars(feature).copy() - feature = sequence_kwargs.pop("feature") - feature = {name: Sequence(subfeature, **sequence_kwargs) for name, subfeature in feature.items()} + if pa.types.is_struct(array.type): + # feature must be a dict if isinstance(feature, dict) and (array_fields := {field.name for field in array.type}) <= set(feature): null_array = pa.array([None] * len(array)) arrays = [ @@ -2009,17 +2004,8 @@ def cast_array_to_feature( ] return pa.StructArray.from_arrays(arrays, names=list(feature), mask=array.is_null()) elif pa.types.is_list(array.type) or pa.types.is_large_list(array.type): - # feature must be either [subfeature] or LargeList(subfeature) or Sequence(subfeature) - if isinstance(feature, list): - casted_array_values = _c(array.values, feature[0]) - if pa.types.is_list(array.type) and casted_array_values.type == array.values.type: - # Both array and feature have equal list type and values (within the list) 
type - return array - else: - # Merge offsets with the null bitmap to avoid the "Null bitmap with offsets slice not supported" ArrowNotImplementedError - array_offsets = _combine_list_array_offsets_with_mask(array) - return pa.ListArray.from_arrays(array_offsets, casted_array_values) - elif isinstance(feature, LargeList): + # feature must be either List(subfeature) or LargeList(subfeature) + if isinstance(feature, LargeList): casted_array_values = _c(array.values, feature.feature) if pa.types.is_large_list(array.type) and casted_array_values.type == array.values.type: # Both array and feature have equal large_list type and values (within the list) type @@ -2028,7 +2014,7 @@ def cast_array_to_feature( # Merge offsets with the null bitmap to avoid the "Null bitmap with offsets slice not supported" ArrowNotImplementedError array_offsets = _combine_list_array_offsets_with_mask(array) return pa.LargeListArray.from_arrays(array_offsets, casted_array_values) - elif isinstance(feature, Sequence): + elif isinstance(feature, List): if feature.length > -1: if _are_list_values_of_length(array, feature.length): if array.null_count > 0: @@ -2072,16 +2058,13 @@ def cast_array_to_feature( array_offsets = _combine_list_array_offsets_with_mask(array) return pa.ListArray.from_arrays(array_offsets, casted_array_values) elif pa.types.is_fixed_size_list(array.type): - # feature must be either [subfeature] or Sequence(subfeature) - if isinstance(feature, list): - array_offsets = (np.arange(len(array) + 1) + array.offset) * array.type.list_size - return pa.ListArray.from_arrays(array_offsets, _c(array.values, feature[0]), mask=array.is_null()) - elif isinstance(feature, LargeList): + # feature must be List(subfeature) + if isinstance(feature, LargeList): array_offsets = (np.arange(len(array) + 1) + array.offset) * array.type.list_size return pa.LargeListArray.from_arrays( array_offsets, _c(array.values, feature.feature), mask=array.is_null() ) - elif isinstance(feature, Sequence): + elif 
isinstance(feature, List): if feature.length > -1: if feature.length == array.type.list_size: array_values = array.values[ @@ -2099,7 +2082,7 @@ def cast_array_to_feature( allow_primitive_to_str=allow_primitive_to_str, allow_decimal_to_str=allow_decimal_to_str, ) - elif not isinstance(feature, (Sequence, dict, list, tuple)): + elif not isinstance(feature, (List, LargeList, dict)): return array_cast( array, feature(), @@ -2131,7 +2114,7 @@ def embed_array_storage(array: pa.Array, feature: "FeatureType", token_per_repo_ Returns: array (`pyarrow.Array`): the casted array """ - from .features import Sequence + from .features import LargeList, List _e = partial(embed_array_storage, token_per_repo_id=token_per_repo_id) @@ -2140,21 +2123,15 @@ def embed_array_storage(array: pa.Array, feature: "FeatureType", token_per_repo_ if hasattr(feature, "embed_storage"): return feature.embed_storage(array, token_per_repo_id=token_per_repo_id) elif pa.types.is_struct(array.type): - # feature must be a dict or Sequence(subfeatures_dict) - if isinstance(feature, Sequence) and isinstance(feature.feature, dict): - feature = { - name: Sequence(subfeature, length=feature.length) for name, subfeature in feature.feature.items() - } + # feature must be a dict if isinstance(feature, dict): arrays = [_e(array.field(name), subfeature) for name, subfeature in feature.items()] return pa.StructArray.from_arrays(arrays, names=list(feature), mask=array.is_null()) elif pa.types.is_list(array.type): - # feature must be either [subfeature] or Sequence(subfeature) + # feature must be either List(subfeature) # Merge offsets with the null bitmap to avoid the "Null bitmap with offsets slice not supported" ArrowNotImplementedError array_offsets = _combine_list_array_offsets_with_mask(array) - if isinstance(feature, list): - return pa.ListArray.from_arrays(array_offsets, _e(array.values, feature[0])) - if isinstance(feature, Sequence) and feature.length == -1: + if isinstance(feature, List) and feature.length 
== -1: return pa.ListArray.from_arrays(array_offsets, _e(array.values, feature.feature)) elif pa.types.is_large_list(array.type): # feature must be LargeList(subfeature) @@ -2162,14 +2139,14 @@ def embed_array_storage(array: pa.Array, feature: "FeatureType", token_per_repo_ array_offsets = _combine_list_array_offsets_with_mask(array) return pa.LargeListArray.from_arrays(array_offsets, _e(array.values, feature.feature)) elif pa.types.is_fixed_size_list(array.type): - # feature must be Sequence(subfeature) - if isinstance(feature, Sequence) and feature.length > -1: + # feature must be List(subfeature) + if isinstance(feature, List) and feature.length > -1: array_values = array.values[ array.offset * array.type.list_size : (array.offset + len(array)) * array.type.list_size ] embedded_array_values = _e(array_values, feature.feature) return pa.FixedSizeListArray.from_arrays(embedded_array_values, feature.length, mask=array.is_null()) - if not isinstance(feature, (Sequence, dict, list, tuple)): + if not isinstance(feature, (List, LargeList, dict)): return array raise TypeError(f"Couldn't embed array of type\n{_short_str(array.type)}\nwith\n{_short_str(feature)}") @@ -2350,7 +2327,7 @@ def table_visitor(table: pa.Table, function: Callable[[pa.Array], None]): function (`Callable[[pa.Array], None]`): Function to apply to each array. 
""" - from .features import Features, Sequence + from .features import Features, LargeList, List features = Features.from_arrow_schema(table.schema) @@ -2363,17 +2340,10 @@ def _visit(array, feature): array = array.storage function(array, feature) if pa.types.is_struct(array.type) and not hasattr(feature, "cast_storage"): - if isinstance(feature, Sequence) and isinstance(feature.feature, dict): - feature = { - name: Sequence(subfeature, length=feature.length) - for name, subfeature in feature.feature.items() - } for name, subfeature in feature.items(): _visit(array.field(name), subfeature) elif pa.types.is_list(array.type): - if isinstance(feature, list): - _visit(array.values, feature[0]) - elif isinstance(feature, Sequence): + if isinstance(feature, (LargeList, List)): _visit(array.values, feature.feature) for name, feature in features.items(): diff --git a/tests/commands/test_test.py b/tests/commands/test_test.py index 396f7e84611..f8935a8c025 100644 --- a/tests/commands/test_test.py +++ b/tests/commands/test_test.py @@ -3,7 +3,7 @@ import pytest -from datasets import ClassLabel, Features, Sequence, Value +from datasets import ClassLabel, Features, Value from datasets.commands.test import TestCommand from datasets.info import DatasetInfo, DatasetInfosDict @@ -43,12 +43,12 @@ def test_test_command(dataset_dir): "default": DatasetInfo( features=Features( { - "tokens": Sequence(Value("string")), - "ner_tags": Sequence( + "tokens": List(Value("string")), + "ner_tags": List( ClassLabel(names=["O", "B-PER", "I-PER", "B-ORG", "I-ORG", "B-LOC", "I-LOC"]) ), - "langs": Sequence(Value("string")), - "spans": Sequence(Value("string")), + "langs": List(Value("string")), + "spans": List(Value("string")), } ), splits=[ diff --git a/tests/features/test_array_xd.py b/tests/features/test_array_xd.py index 8eb9e4e0242..4b30387386a 100644 --- a/tests/features/test_array_xd.py +++ b/tests/features/test_array_xd.py @@ -421,11 +421,11 @@ def test_array_xd_with_np(seq_type, dtype, 
shape, feature_class): data = np.zeros(shape, dtype=dtype) expected = data.tolist() if seq_type == "sequence": - feature = datasets.Sequence(feature) + feature = datasets.List(feature) data = [data] expected = [expected] elif seq_type == "sequence_of_sequence": - feature = datasets.Sequence(datasets.Sequence(feature)) + feature = datasets.List(datasets.List(feature)) data = [[data]] expected = [[expected]] ds = datasets.Dataset.from_dict({"col": [data]}, features=datasets.Features({"col": feature})) diff --git a/tests/features/test_audio.py b/tests/features/test_audio.py index 38999e64b4e..f959458777c 100644 --- a/tests/features/test_audio.py +++ b/tests/features/test_audio.py @@ -7,7 +7,7 @@ import pytest from datasets import Column, Dataset, concatenate_datasets, load_dataset -from datasets.features import Audio, Features, Sequence, Value +from datasets.features import Audio, Features, Value from ..utils import require_sndfile, require_torchcodec @@ -54,7 +54,7 @@ def test_audio_feature_type_to_arrow(): assert features.arrow_schema == pa.schema({"audio": Audio().pa_type}) features = Features({"struct_containing_an_audio": {"audio": Audio()}}) assert features.arrow_schema == pa.schema({"struct_containing_an_audio": pa.struct({"audio": Audio().pa_type})}) - features = Features({"sequence_of_audios": Sequence(Audio())}) + features = Features({"sequence_of_audios": List(Audio())}) assert features.arrow_schema == pa.schema({"sequence_of_audios": pa.list_(Audio().pa_type)}) @@ -375,7 +375,7 @@ def test_dataset_with_audio_feature_with_none(): # nested tests data = {"audio": [[None]]} - features = Features({"audio": Sequence(Audio())}) + features = Features({"audio": List(Audio())}) dset = Dataset.from_dict(data, features=features) item = dset[0] assert item.keys() == {"audio"} diff --git a/tests/features/test_features.py b/tests/features/test_features.py index 3e2d36cc77a..c9d98c9f001 100644 --- a/tests/features/test_features.py +++ b/tests/features/test_features.py @@ 
-53,7 +53,7 @@ def test_from_arrow_schema_simple(self): def test_from_arrow_schema_with_sequence(self): data = {"a": [{"b": {"c": ["text"]}}] * 10, "foo": [1] * 10} - original_features = Features({"a": {"b": Sequence({"c": Value("string")})}, "foo": Value("int64")}) + original_features = Features({"a": {"b": {"c": List(Value("string"))}}, "foo": Value("int64")}) dset = Dataset.from_dict(data, features=original_features) new_features = dset.features new_dset = Dataset.from_dict(data, features=new_features) @@ -145,41 +145,41 @@ def test_reorder_fields_as(self): "title": Value("string"), "url": Value("string"), "html": Value("string"), - "tokens": Sequence({"token": Value("string"), "is_html": Value("bool")}), + "tokens": {"token": List(Value("string")), "is_html": List(Value("bool"))}, }, "question": { "text": Value("string"), - "tokens": Sequence(Value("string")), + "tokens": List(Value("string")), }, - "annotations": Sequence( - { - "id": Value("string"), - "long_answer": { + "annotations": { + "id": List(Value("string")), + "long_answer": List( + { "start_token": Value("int64"), "end_token": Value("int64"), "start_byte": Value("int64"), "end_byte": Value("int64"), - }, - "short_answers": Sequence( - { - "start_token": Value("int64"), - "end_token": Value("int64"), - "start_byte": Value("int64"), - "end_byte": Value("int64"), - "text": Value("string"), - } - ), - "yes_no_answer": ClassLabel(names=["NO", "YES"]), - } - ), + } + ), + "short_answers": List( + { + "start_token": List(Value("int64")), + "end_token": List(Value("int64")), + "start_byte": List(Value("int64")), + "end_byte": List(Value("int64")), + "text": List(Value("string")), + } + ), + "yes_no_answer": List(ClassLabel(names=["NO", "YES"])), + }, } ) - other = Features( # same but with [] instead of sequences, and with a shuffled fields order + other = Features( # same but with a shuffled fields order { "id": Value("string"), "document": { - "tokens": Sequence({"token": Value("string"), "is_html": 
Value("bool")}), + "tokens": {"token": List(Value("string")), "is_html": List(Value("bool"))}, "title": Value("string"), "url": Value("string"), "html": Value("string"), @@ -189,27 +189,25 @@ def test_reorder_fields_as(self): "tokens": [Value("string")], }, "annotations": { - "yes_no_answer": [ClassLabel(names=["NO", "YES"])], - "id": [Value("string")], - "long_answer": [ + "yes_no_answer": List(ClassLabel(names=["NO", "YES"])), + "id": List(Value("string")), + "long_answer": List( { "end_byte": Value("int64"), "start_token": Value("int64"), "end_token": Value("int64"), "start_byte": Value("int64"), } - ], - "short_answers": [ - Sequence( - { - "text": Value("string"), - "start_token": Value("int64"), - "end_token": Value("int64"), - "start_byte": Value("int64"), - "end_byte": Value("int64"), - } - ) - ], + ), + "short_answers": List( + { + "text": List(Value("string")), + "start_token": List(Value("int64")), + "end_token": List(Value("int64")), + "start_byte": List(Value("int64")), + "end_byte": List(Value("int64")), + } + ), }, } ) @@ -218,36 +216,36 @@ def test_reorder_fields_as(self): { "id": Value("string"), "document": { - "tokens": Sequence({"token": Value("string"), "is_html": Value("bool")}), + "tokens": {"token": List(Value("string")), "is_html": List(Value("bool"))}, "title": Value("string"), "url": Value("string"), "html": Value("string"), }, "question": { "text": Value("string"), - "tokens": Sequence(Value("string")), + "tokens": List(Value("string")), }, - "annotations": Sequence( - { - "yes_no_answer": ClassLabel(names=["NO", "YES"]), - "id": Value("string"), - "long_answer": { + "annotations": { + "yes_no_answer": List(ClassLabel(names=["NO", "YES"])), + "id": List(Value("string")), + "long_answer": List( + { "end_byte": Value("int64"), "start_token": Value("int64"), "end_token": Value("int64"), "start_byte": Value("int64"), - }, - "short_answers": Sequence( - { - "text": Value("string"), - "start_token": Value("int64"), - "end_token": 
Value("int64"), - "start_byte": Value("int64"), - "end_byte": Value("int64"), - } - ), - } - ), + } + ), + "short_answers": List( + { + "text": List(Value("string")), + "start_token": List(Value("int64")), + "end_token": List(Value("int64")), + "start_byte": List(Value("int64")), + "end_byte": List(Value("int64")), + } + ), + }, } ) @@ -265,7 +263,7 @@ def test_flatten(self): assert features == _features, "calling flatten shouldn't alter the current features" def test_flatten_with_sequence(self): - features = Features({"foo": Sequence({"bar": {"my_value": Value("int32")}})}) + features = Features({"foo": {"bar": List({"my_value": Value("int32")})}}) _features = features.copy() flattened_features = features.flatten() assert flattened_features == {"foo.bar": [{"my_value": Value("int32")}]} @@ -278,7 +276,7 @@ def assert_features_dicts_are_synced(features: Features): and features.keys() == features._column_requires_decoding.keys() ) - features = Features({"foo": Sequence({"bar": {"my_value": Value("int32")}})}) + features = Features({"foo": {"bar": List({"my_value": Value("int32")})}}) assert_features_dicts_are_synced(features) features["barfoo"] = Image() assert_features_dicts_are_synced(features) @@ -400,7 +398,7 @@ def test_class_label_to_and_from_dict(class_label_arg, tmp_path_factory): @pytest.mark.parametrize( "schema", - [[Audio()], LargeList(Audio()), Sequence(Audio())], + [[Audio()], LargeList(Audio()), List(Audio())], ) def test_decode_nested_example_with_list_types(schema, monkeypatch): mock_decode_example = MagicMock() @@ -413,7 +411,7 @@ def test_decode_nested_example_with_list_types(schema, monkeypatch): @pytest.mark.parametrize( "schema", - [[ClassLabel(names=["a", "b"])], LargeList(ClassLabel(names=["a", "b"])), Sequence(ClassLabel(names=["a", "b"]))], + [[ClassLabel(names=["a", "b"])], LargeList(ClassLabel(names=["a", "b"])), List(ClassLabel(names=["a", "b"]))], ) def test_encode_nested_example_with_list_types(schema): result = 
encode_nested_example(schema, ["b"]) @@ -422,7 +420,7 @@ def test_encode_nested_example_with_list_types(schema): @pytest.mark.parametrize("inner_type", [Value("int32"), {"subcolumn": Value("int32")}]) def test_encode_nested_example_sequence_with_none(inner_type): - schema = Sequence(inner_type) + schema = List(inner_type) obj = None result = encode_nested_example(schema, obj) assert result is None @@ -434,7 +432,7 @@ def test_encode_nested_example_sequence_with_none(inner_type): ({"col_1": ClassLabel(names=["a", "b"])}, {"col_1": "b"}, {"col_1": 1}), ({"col_1": [ClassLabel(names=["a", "b"])]}, {"col_1": ["b"]}, {"col_1": [1]}), ({"col_1": LargeList(ClassLabel(names=["a", "b"]))}, {"col_1": ["b"]}, {"col_1": [1]}), - ({"col_1": Sequence(ClassLabel(names=["a", "b"]))}, {"col_1": ["b"]}, {"col_1": [1]}), + ({"col_1": List(ClassLabel(names=["a", "b"]))}, {"col_1": ["b"]}, {"col_1": [1]}), ], ) def test_encode_example(features_dict, example, expected_encoded_example): @@ -446,7 +444,7 @@ def test_encode_example(features_dict, example, expected_encoded_example): def test_encode_batch_with_example_with_empty_first_elem(): features = Features( { - "x": Sequence(Sequence(ClassLabel(names=["a", "b"]))), + "x": List(List(ClassLabel(names=["a", "b"]))), } ) encoded_batch = features.encode_batch( @@ -497,7 +495,7 @@ def test_dataset_feature_with_none(feature): # nested tests data = {"col": [[None]]} - features = Features({"col": Sequence(feature)}) + features = Features({"col": List(feature)}) dset = Dataset.from_dict(data, features=features) item = dset[0] assert item.keys() == {"col"} @@ -668,21 +666,17 @@ def test_dont_iterate_over_each_element_in_a_list(self, mocked_cast): Features({"foo": {}}), Features({"foo": {"bar": Value("int32")}}), Features({"foo": {"bar1": Value("int32"), "bar2": Value("float64")}}), - Features({"foo": Sequence(Value("int32"))}), - Features({"foo": Sequence({})}), - Features({"foo": Sequence({"bar": Value("int32")})}), - Features({"foo": 
[Value("int32")]}), - Features({"foo": [{"bar": Value("int32")}]}), + Features({"foo": List(Value("int32"))}), + Features({"foo": {"bar": List(Value("int32"))}}), + Features({"foo": List({"bar": Value("int32")})}), Features({"foo": LargeList(Value("int32"))}), Features({"foo": LargeList({"bar": Value("int32")})}), ] NESTED_CUSTOM_FEATURES = [ Features({"foo": {"bar": ClassLabel(names=["negative", "positive"])}}), - Features({"foo": Sequence(ClassLabel(names=["negative", "positive"]))}), - Features({"foo": Sequence({"bar": ClassLabel(names=["negative", "positive"])})}), - Features({"foo": [ClassLabel(names=["negative", "positive"])]}), - Features({"foo": [{"bar": ClassLabel(names=["negative", "positive"])}]}), + Features({"foo": List(ClassLabel(names=["negative", "positive"]))}), + Features({"foo": List({"bar": ClassLabel(names=["negative", "positive"])})}), Features({"foo": LargeList(ClassLabel(names=["negative", "positive"]))}), Features({"foo": LargeList({"bar": ClassLabel(names=["negative", "positive"])})}), ] @@ -709,7 +703,7 @@ def test_features_to_yaml_list(features: Features): [ ({"col": [{"sub_col": Value("int32")}]}, {"col": [{"sub_col": Value("int32")}]}), ({"col": LargeList({"sub_col": Value("int32")})}, {"col": LargeList({"sub_col": Value("int32")})}), - ({"col": Sequence({"sub_col": Value("int32")})}, {"col.sub_col": Sequence(Value("int32"))}), + ({"col": {"sub_col": List(Value("int32"))}}, {"col.sub_col": List(Value("int32"))}), ], ) def test_features_flatten_with_list_types(features_dict, expected_features_dict): @@ -731,7 +725,7 @@ def test_features_flatten_with_list_types(features_dict, expected_features_dict) ), ( {"col": {"feature": {"dtype": "int32", "_type": "Value"}, "_type": "Sequence"}}, - {"col": Sequence(Value("int32"))}, + {"col": List(Value("int32"))}, ), ( {"col": [{"sub_col": {"dtype": "int32", "_type": "Value"}}]}, @@ -743,7 +737,7 @@ def test_features_flatten_with_list_types(features_dict, expected_features_dict) ), ( {"col": 
{"feature": {"sub_col": {"dtype": "int32", "_type": "Value"}}, "_type": "Sequence"}}, - {"col": Sequence({"sub_col": Value("int32")})}, + {"col": {"sub_col": List(Value("int32"))}}, ), ], ) @@ -765,7 +759,7 @@ def test_features_from_dict_with_list_types(deserialized_features_dict, expected ), ( {"feature": {"dtype": "int32", "_type": "Value"}, "_type": "Sequence"}, - Sequence(Value("int32")), + List(Value("int32")), ), ( [{"sub_col": {"dtype": "int32", "_type": "Value"}}], @@ -777,7 +771,7 @@ def test_features_from_dict_with_list_types(deserialized_features_dict, expected ), ( {"feature": {"sub_col": {"dtype": "int32", "_type": "Value"}}, "_type": "Sequence"}, - Sequence({"sub_col": Value("int32")}), + {"sub_col": List(Value("int32"))}, ), ], ) @@ -882,7 +876,7 @@ def test_features_from_arrow_schema_list_data_type(list_dtype, scalar_dtype): [ ([Value("int64")], [Value("int64")]), (LargeList(Value("int64")), LargeList(Value("int64"))), - (Sequence(Value("int64")), Sequence(Value("int64"))), + (List(Value("int64")), List(Value("int64"))), ( [{"sub_col_1": Value("int64"), "sub_col_2": Value("int64")}], [{"sub_col_2": Value("int64"), "sub_col_1": Value("int64")}], @@ -892,8 +886,8 @@ def test_features_from_arrow_schema_list_data_type(list_dtype, scalar_dtype): LargeList({"sub_col_2": Value("int64"), "sub_col_1": Value("int64")}), ), ( - Sequence({"sub_col_1": Value("int64"), "sub_col_2": Value("int64")}), - Sequence({"sub_col_2": Value("int64"), "sub_col_1": Value("int64")}), + {"sub_col_1": List(Value("int64")), "sub_col_2": List(Value("int64"))}, + {"sub_col_2": List(Value("int64")), "sub_col_1": List(Value("int64"))}, ), ], ) @@ -953,7 +947,7 @@ def test_generate_from_arrow_type_with_arrow_nested_data_type( @pytest.mark.parametrize( "schema", - [[ClassLabel(names=["a", "b"])], LargeList(ClassLabel(names=["a", "b"])), Sequence(ClassLabel(names=["a", "b"]))], + [[ClassLabel(names=["a", "b"])], LargeList(ClassLabel(names=["a", "b"])), List(ClassLabel(names=["a", 
"b"]))], ) def test_check_non_null_non_empty_recursive_with_list_types(schema): assert _check_non_null_non_empty_recursive([], schema) is False @@ -964,31 +958,31 @@ def test_check_non_null_non_empty_recursive_with_list_types(schema): [ [[ClassLabel(names=["a", "b"])]], LargeList(LargeList(ClassLabel(names=["a", "b"]))), - Sequence(Sequence(ClassLabel(names=["a", "b"]))), + List(List(ClassLabel(names=["a", "b"]))), ], ) def test_check_non_null_non_empty_recursive_with_nested_list_types(schema): assert _check_non_null_non_empty_recursive([[]], schema) is False -@pytest.mark.parametrize("feature", [[Audio()], LargeList(Audio()), Sequence(Audio())]) +@pytest.mark.parametrize("feature", [[Audio()], LargeList(Audio()), List(Audio())]) def test_require_decoding_with_list_types(feature): assert require_decoding(feature) -@pytest.mark.parametrize("feature", [[Audio()], LargeList(Audio()), Sequence(Audio())]) +@pytest.mark.parametrize("feature", [[Audio()], LargeList(Audio()), List(Audio())]) def test_require_storage_cast_with_list_types(feature): assert require_storage_cast(feature) -@pytest.mark.parametrize("feature", [[Audio()], LargeList(Audio()), Sequence(Audio())]) +@pytest.mark.parametrize("feature", [[Audio()], LargeList(Audio()), List(Audio())]) def test_require_storage_embed_with_list_types(feature): assert require_storage_embed(feature) @pytest.mark.parametrize( "feature, expected", - [([Value("int32")], [1]), (LargeList(Value("int32")), LargeList(1)), (Sequence(Value("int32")), Sequence(1))], + [([Value("int32")], [1]), (LargeList(Value("int32")), LargeList(1)), (List(Value("int32")), List(1))], ) def test_visit_with_list_types(feature, expected): def func(x): diff --git a/tests/features/test_image.py b/tests/features/test_image.py index d639bf84ac5..57559b57a1e 100644 --- a/tests/features/test_image.py +++ b/tests/features/test_image.py @@ -9,7 +9,7 @@ import pyarrow as pa import pytest -from datasets import Column, Dataset, Features, Image, Sequence, Value, 
concatenate_datasets, load_dataset +from datasets import Column, Dataset, Features, Image, Value, concatenate_datasets, load_dataset from datasets.features.image import encode_np_array, image_to_bytes from ..utils import require_pil @@ -45,7 +45,7 @@ def test_image_feature_type_to_arrow(): assert features.arrow_schema == pa.schema({"image": Image().pa_type}) features = Features({"struct_containing_an_image": {"image": Image()}}) assert features.arrow_schema == pa.schema({"struct_containing_an_image": pa.struct({"image": Image().pa_type})}) - features = Features({"sequence_of_images": Sequence(Image())}) + features = Features({"sequence_of_images": List(Image())}) assert features.arrow_schema == pa.schema({"sequence_of_images": pa.list_(Image().pa_type)}) @@ -276,7 +276,7 @@ def test_dataset_with_image_feature_with_none(): # nested tests data = {"images": [[None]]} - features = Features({"images": Sequence(Image())}) + features = Features({"images": List(Image())}) dset = Dataset.from_dict(data, features=features) item = dset[0] assert item.keys() == {"images"} diff --git a/tests/fixtures/files.py b/tests/fixtures/files.py index 25b1448ae46..b1c6f7768a2 100644 --- a/tests/fixtures/files.py +++ b/tests/fixtures/files.py @@ -24,14 +24,12 @@ def dataset(): n = 10 features = datasets.Features( { - "tokens": datasets.Sequence(datasets.Value("string")), - "labels": datasets.Sequence(datasets.ClassLabel(names=["negative", "positive"])), - "answers": datasets.Sequence( - { - "text": datasets.Value("string"), - "answer_start": datasets.Value("int32"), - } - ), + "tokens": datasets.List(datasets.Value("string")), + "labels": datasets.List(datasets.ClassLabel(names=["negative", "positive"])), + "answers": { + "text": datasets.List(datasets.Value("string")), + "answer_start": datasets.List(datasets.Value("int32")), + }, "id": datasets.Value("int64"), } ) diff --git a/tests/io/test_parquet.py b/tests/io/test_parquet.py index cdc55c9e18e..5062b88a60c 100644 --- 
a/tests/io/test_parquet.py +++ b/tests/io/test_parquet.py @@ -2,7 +2,7 @@ import pyarrow.parquet as pq import pytest -from datasets import Audio, Dataset, DatasetDict, Features, IterableDatasetDict, NamedSplit, Sequence, Value, config +from datasets import Audio, Dataset, DatasetDict, Features, IterableDatasetDict, NamedSplit, Value, config from datasets.features.image import Image from datasets.info import DatasetInfo from datasets.io.parquet import ParquetDatasetReader, ParquetDatasetWriter, get_writer_batch_size @@ -219,7 +219,7 @@ def test_dataset_to_parquet_keeps_features(shared_datadir, tmp_path): [ (Features({"foo": Value("int32")}), None), (Features({"image": Image(), "foo": Value("int32")}), config.PARQUET_ROW_GROUP_SIZE_FOR_IMAGE_DATASETS), - (Features({"nested": Sequence(Audio())}), config.PARQUET_ROW_GROUP_SIZE_FOR_AUDIO_DATASETS), + (Features({"nested": List(Audio())}), config.PARQUET_ROW_GROUP_SIZE_FOR_AUDIO_DATASETS), ], ) def test_get_writer_batch_size(feature, expected): diff --git a/tests/packaged_modules/test_webdataset.py b/tests/packaged_modules/test_webdataset.py index 12aa6275382..d65915fe6ea 100644 --- a/tests/packaged_modules/test_webdataset.py +++ b/tests/packaged_modules/test_webdataset.py @@ -3,7 +3,7 @@ import pytest -from datasets import Audio, DownloadManager, Features, Image, Sequence, Value +from datasets import Audio, DownloadManager, Features, Image, List, Value from datasets.packaged_modules.webdataset.webdataset import WebDataset from ..utils import ( @@ -245,7 +245,7 @@ def test_tensor_webdataset(tensor_wds_file): "__key__": Value("string"), "__url__": Value("string"), "json": {"text": Value("string")}, - "pth": Sequence(Value("float32")), + "pth": List(Value("float32")), } ) assert len(split_generators) == 1 diff --git a/tests/test_arrow_dataset.py b/tests/test_arrow_dataset.py index 8e365462197..317bf6273b2 100644 --- a/tests/test_arrow_dataset.py +++ b/tests/test_arrow_dataset.py @@ -34,7 +34,7 @@ Features, Image, LargeList, 
- Sequence, + List, Translation, TranslationVariableLanguages, Value, @@ -143,13 +143,13 @@ def _create_dummy_dataset( data = { "col_1": [[[True, False], [False, True]]] * 4, # 2D "col_2": [[[["a", "b"], ["c", "d"]], [["e", "f"], ["g", "h"]]]] * 4, # 3D array - "col_3": [[3, 2, 1, 0]] * 4, # Sequence + "col_3": [[3, 2, 1, 0]] * 4, # List } features = Features( { "col_1": Array2D(shape=(2, 2), dtype="bool"), "col_2": Array3D(shape=(2, 2, 2), dtype="string"), - "col_3": Sequence(feature=Value("int64")), + "col_3": List(feature=Value("int64")), } ) dset = Dataset.from_dict(data, features=features) @@ -205,7 +205,7 @@ def test_dummy_dataset(self, in_memory): { "col_1": Array2D(shape=(2, 2), dtype="bool"), "col_2": Array3D(shape=(2, 2, 2), dtype="string"), - "col_3": Sequence(feature=Value("int64")), + "col_3": List(feature=Value("int64")), } ), ) @@ -913,7 +913,7 @@ def test_flatten(self, in_memory): with tempfile.TemporaryDirectory() as tmp_dir: with Dataset.from_dict( {"a": [{"b": {"c": ["text"]}}] * 10, "foo": [1] * 10}, - features=Features({"a": {"b": Sequence({"c": Value("string")})}, "foo": Value("int64")}), + features=Features({"a": {"b": {"c": List(Value("string"))}}, "foo": Value("int64")}), ) as dset: with self._to(in_memory, tmp_dir, dset) as dset: fingerprint = dset._fingerprint @@ -921,7 +921,7 @@ def test_flatten(self, in_memory): self.assertListEqual(sorted(dset.column_names), ["a.b.c", "foo"]) self.assertListEqual(sorted(dset.features.keys()), ["a.b.c", "foo"]) self.assertDictEqual( - dset.features, Features({"a.b.c": Sequence(Value("string")), "foo": Value("int64")}) + dset.features, Features({"a.b.c": List(Value("string")), "foo": Value("int64")}) ) self.assertNotEqual(dset._fingerprint, fingerprint) assert_arrow_metadata_are_synced_with_dataset_features(dset) @@ -962,8 +962,8 @@ def test_flatten(self, in_memory): dset.features, Features( { - "a.language": Sequence(Value("string")), - "a.translation": Sequence(Value("string")), + "a.language": 
List(Value("string")), + "a.translation": List(Value("string")), "foo": Value("int64"), } ), @@ -1729,7 +1729,7 @@ def func(example): self.assertEqual(len(dset_test), 30) self.assertDictEqual( dset_test.features, - Features({"filename": Value("string"), "tensor": Sequence(Value("float32"))}), + Features({"filename": Value("string"), "tensor": List(Value("float32"))}), ) self.assertListEqual(dset_test[0]["tensor"], [1, 2, 3]) @@ -1746,7 +1746,7 @@ def func(example): self.assertEqual(len(dset_test), 30) self.assertDictEqual( dset_test.features, - Features({"filename": Value("string"), "tensor": Sequence(Value("float32"))}), + Features({"filename": Value("string"), "tensor": List(Value("float32"))}), ) self.assertListEqual(dset_test[0]["tensor"], [1, 2, 3]) @@ -1763,7 +1763,7 @@ def func(example): self.assertEqual(len(dset_test), 30) self.assertDictEqual( dset_test.features, - Features({"filename": Value("string"), "tensor": Sequence(Value("float32"))}), + Features({"filename": Value("string"), "tensor": List(Value("float32"))}), ) self.assertListEqual(dset_test[0]["tensor"], [1, 2, 3]) @@ -1777,7 +1777,7 @@ def func(example): self.assertEqual(len(dset_test), 30) self.assertDictEqual( dset_test.features, - Features({"filename": Value("string"), "tensor": Sequence(Value("float64"))}), + Features({"filename": Value("string"), "tensor": List(Value("float64"))}), ) self.assertListEqual(dset_test[0]["tensor"], [1, 2, 3]) @@ -1795,7 +1795,7 @@ def func(batch): self.assertEqual(len(dset_test), 30) self.assertDictEqual( dset_test.features, - Features({"filename": Value("string"), "tensor": Sequence(Value("float32"))}), + Features({"filename": Value("string"), "tensor": List(Value("float32"))}), ) self.assertListEqual(dset_test[0]["tensor"], [1, 2, 3]) @@ -2019,8 +2019,8 @@ def test_filter_caching(self, in_memory): def test_keep_features_after_transform_specified(self, in_memory): features = Features( { - "tokens": Sequence(Value("string")), - "labels": 
Sequence(ClassLabel(names=["negative", "positive"])), + "tokens": List(Value("string")), + "labels": List(ClassLabel(names=["negative", "positive"])), } ) @@ -2040,8 +2040,8 @@ def invert_labels(x): def test_keep_features_after_transform_unspecified(self, in_memory): features = Features( { - "tokens": Sequence(Value("string")), - "labels": Sequence(ClassLabel(names=["negative", "positive"])), + "tokens": List(Value("string")), + "labels": List(ClassLabel(names=["negative", "positive"])), } ) @@ -2061,8 +2061,8 @@ def invert_labels(x): def test_keep_features_after_transform_to_file(self, in_memory): features = Features( { - "tokens": Sequence(Value("string")), - "labels": Sequence(ClassLabel(names=["negative", "positive"])), + "tokens": List(Value("string")), + "labels": List(ClassLabel(names=["negative", "positive"])), } ) @@ -2083,8 +2083,8 @@ def invert_labels(x): def test_keep_features_after_transform_to_memory(self, in_memory): features = Features( { - "tokens": Sequence(Value("string")), - "labels": Sequence(ClassLabel(names=["negative", "positive"])), + "tokens": List(Value("string")), + "labels": List(ClassLabel(names=["negative", "positive"])), } ) @@ -2103,8 +2103,8 @@ def invert_labels(x): def test_keep_features_after_loading_from_cache(self, in_memory): features = Features( { - "tokens": Sequence(Value("string")), - "labels": Sequence(ClassLabel(names=["negative", "positive"])), + "tokens": List(Value("string")), + "labels": List(ClassLabel(names=["negative", "positive"])), } ) @@ -2129,8 +2129,8 @@ def invert_labels(x): def test_keep_features_with_new_features(self, in_memory): features = Features( { - "tokens": Sequence(Value("string")), - "labels": Sequence(ClassLabel(names=["negative", "positive"])), + "tokens": List(Value("string")), + "labels": List(ClassLabel(names=["negative", "positive"])), } ) @@ -2139,9 +2139,9 @@ def invert_labels(x): expected_features = Features( { - "tokens": Sequence(Value("string")), - "labels": 
Sequence(ClassLabel(names=["negative", "positive"])), - "labels2": Sequence(Value("int64")), + "tokens": List(Value("string")), + "labels": List(ClassLabel(names=["negative", "positive"])), + "labels2": List(Value("int64")), } ) @@ -2875,9 +2875,7 @@ def test_format_vectors(self, in_memory): for col in columns: self.assertIsInstance(dset[0][col], (str, list)) self.assertIsInstance(dset[:2][col], list) - self.assertDictEqual( - dset.features, Features({"filename": Value("string"), "vec": Sequence(Value("float64"))}) - ) + self.assertDictEqual(dset.features, Features({"filename": Value("string"), "vec": List(Value("float64"))})) dset.set_format("tensorflow") self.assertIsNotNone(dset[0]) @@ -2930,9 +2928,7 @@ def test_format_ragged_vectors(self, in_memory): for col in columns: self.assertIsInstance(dset[0][col], (str, list)) self.assertIsInstance(dset[:2][col], list) - self.assertDictEqual( - dset.features, Features({"filename": Value("string"), "vec": Sequence(Value("float64"))}) - ) + self.assertDictEqual(dset.features, Features({"filename": Value("string"), "vec": List(Value("float64"))})) dset.set_format("tensorflow") self.assertIsNotNone(dset[0]) @@ -2986,7 +2982,7 @@ def test_format_nested(self, in_memory): dset.map(lambda ex: {"nested": [{"foo": np.ones(3)}] * len(ex["filename"])}, batched=True) as dset, ): self.assertDictEqual( - dset.features, Features({"filename": Value("string"), "nested": {"foo": Sequence(Value("float64"))}}) + dset.features, Features({"filename": Value("string"), "nested": {"foo": List(Value("float64"))}}) ) dset.set_format("tensorflow") @@ -3293,7 +3289,7 @@ def test_from_pandas(self): self.assertListEqual(list(dset.features.keys()), ["col_1", "col_2"]) self.assertDictEqual(dset.features, Features({"col_1": Value("int64"), "col_2": Value("string")})) - features = Features({"col_1": Sequence(Value("string")), "col_2": Value("string")}) + features = Features({"col_1": List(Value("string")), "col_2": Value("string")}) 
self.assertRaises(TypeError, Dataset.from_pandas, df, features=features) @require_polars @@ -3322,7 +3318,7 @@ def test_from_polars(self): self.assertListEqual(list(dset.features.keys()), ["col_1", "col_2"]) self.assertDictEqual(dset.features, Features({"col_1": Value("int64"), "col_2": Value("large_string")})) - features = Features({"col_1": Sequence(Value("string")), "col_2": Value("large_string")}) + features = Features({"col_1": List(Value("string")), "col_2": Value("large_string")}) self.assertRaises(TypeError, Dataset.from_polars, df, features=features) def test_from_dict(self): @@ -3417,8 +3413,8 @@ def test_tf_string_encoding(self): def test_cast_with_sliced_list(): - old_features = Features({"foo": Sequence(Value("int64"))}) - new_features = Features({"foo": Sequence(Value("int32"))}) + old_features = Features({"foo": List(Value("int64"))}) + new_features = Features({"foo": List(Value("int32"))}) dataset = Dataset.from_dict({"foo": [[i] * (i % 3) for i in range(20)]}, features=old_features) casted_dataset = dataset.cast(new_features, batch_size=2) # small batch size to slice the ListArray assert dataset["foo"] == casted_dataset["foo"] @@ -4263,14 +4259,12 @@ def test_dataset_to_json(dataset, tmp_path): { "features": Features( { - "tokens": Sequence(Value("string")), - "labels": Sequence(Value("int16")), - "answers": Sequence( - { - "text": Value("string"), - "answer_start": Value("int32"), - } - ), + "tokens": List(Value("string")), + "labels": List(Value("int16")), + "answers": { + "text": List(Value("string")), + "answer_start": List(Value("int32")), + }, "id": Value("int32"), } ) @@ -4436,7 +4430,7 @@ def test_dataset_format_with_unformatted_image(): ds = Dataset.from_dict( {"a": [np.arange(4 * 4 * 3).reshape(4, 4, 3)] * 10, "b": [[0, 1]] * 10}, - Features({"a": Image(), "b": Sequence(Value("int64"))}), + Features({"a": Image(), "b": List(Value("int64"))}), ) ds.set_format("np", columns=["b"], output_all_columns=True) assert isinstance(ds[0]["a"], 
PIL.Image.Image) diff --git a/tests/test_dataset_dict.py b/tests/test_dataset_dict.py index 651f2ec822e..afc0ae8b417 100644 --- a/tests/test_dataset_dict.py +++ b/tests/test_dataset_dict.py @@ -9,7 +9,7 @@ from datasets import load_from_disk from datasets.arrow_dataset import Dataset from datasets.dataset_dict import DatasetDict, IterableDatasetDict -from datasets.features import ClassLabel, Features, Sequence, Value +from datasets.features import ClassLabel, Features, List, Value from datasets.iterable_dataset import IterableDataset from datasets.splits import NamedSplit @@ -71,15 +71,13 @@ def _create_dummy_iterable_dataset_dict(self, multiple_columns=False) -> Iterabl def test_flatten(self): dset_split = Dataset.from_dict( {"a": [{"b": {"c": ["text"]}}] * 10, "foo": [1] * 10}, - features=Features({"a": {"b": Sequence({"c": Value("string")})}, "foo": Value("int64")}), + features=Features({"a": {"b": {"c": List(Value("string"))}}, "foo": Value("int64")}), ) dset = DatasetDict({"train": dset_split, "test": dset_split}) dset = dset.flatten() self.assertDictEqual(dset.column_names, {"train": ["a.b.c", "foo"], "test": ["a.b.c", "foo"]}) self.assertListEqual(sorted(dset["train"].features.keys()), ["a.b.c", "foo"]) - self.assertDictEqual( - dset["train"].features, Features({"a.b.c": Sequence(Value("string")), "foo": Value("int64")}) - ) + self.assertDictEqual(dset["train"].features, Features({"a.b.c": List(Value("string")), "foo": Value("int64")})) del dset def test_set_format_numpy(self): diff --git a/tests/test_dataset_list.py b/tests/test_dataset_list.py index 1004ae3cd68..642eb2c5736 100644 --- a/tests/test_dataset_list.py +++ b/tests/test_dataset_list.py @@ -1,6 +1,6 @@ from unittest import TestCase -from datasets import Sequence, Value +from datasets import List, Value from datasets.arrow_dataset import Dataset @@ -39,7 +39,7 @@ def test_uneven_records(self): # checks what happens with missing columns def test_variable_list_records(self): # checks if the type can 
be inferred from the second record list_records = [{"col_1": []}, {"col_1": [1, 2]}] dset = Dataset.from_list(list_records) - self.assertEqual(dset.info.features["col_1"], Sequence(Value("int64"))) + self.assertEqual(dset.info.features["col_1"], List(Value("int64"))) def test_create_empty(self): dset = Dataset.from_list([]) diff --git a/tests/test_table.py b/tests/test_table.py index 3d3db09e5d6..7ca740e8fde 100644 --- a/tests/test_table.py +++ b/tests/test_table.py @@ -9,7 +9,7 @@ import pyarrow as pa import pytest -from datasets.features import Array2D, ClassLabel, Features, Image, LargeList, Sequence, Value +from datasets.features import Array2D, ClassLabel, Features, Image, LargeList, Value from datasets.features.features import Array2DExtensionType, get_nested_type from datasets.table import ( ConcatenationTable, @@ -1105,42 +1105,36 @@ def test_indexed_table_mixin(): def test_cast_integer_array_to_features(): arr = pa.array([[0, 1]]) - assert cast_array_to_feature(arr, Sequence(Value("string"))).type == pa.list_(pa.string()) - assert cast_array_to_feature(arr, Sequence(Value("string")), allow_decimal_to_str=False).type == pa.list_( - pa.string() - ) + assert cast_array_to_feature(arr, List(Value("string"))).type == pa.list_(pa.string()) + assert cast_array_to_feature(arr, List(Value("string")), allow_decimal_to_str=False).type == pa.list_(pa.string()) with pytest.raises(TypeError): - cast_array_to_feature(arr, Sequence(Value("string")), allow_primitive_to_str=False) + cast_array_to_feature(arr, List(Value("string")), allow_primitive_to_str=False) def test_cast_float_array_to_features(): arr = pa.array([[0.0, 1.0]]) - assert cast_array_to_feature(arr, Sequence(Value("string"))).type == pa.list_(pa.string()) - assert cast_array_to_feature(arr, Sequence(Value("string")), allow_decimal_to_str=False).type == pa.list_( - pa.string() - ) + assert cast_array_to_feature(arr, List(Value("string"))).type == pa.list_(pa.string()) + assert cast_array_to_feature(arr, 
List(Value("string")), allow_decimal_to_str=False).type == pa.list_(pa.string()) with pytest.raises(TypeError): - cast_array_to_feature(arr, Sequence(Value("string")), allow_primitive_to_str=False) + cast_array_to_feature(arr, List(Value("string")), allow_primitive_to_str=False) def test_cast_boolean_array_to_features(): arr = pa.array([[False, True]]) - assert cast_array_to_feature(arr, Sequence(Value("string"))).type == pa.list_(pa.string()) - assert cast_array_to_feature(arr, Sequence(Value("string")), allow_decimal_to_str=False).type == pa.list_( - pa.string() - ) + assert cast_array_to_feature(arr, List(Value("string"))).type == pa.list_(pa.string()) + assert cast_array_to_feature(arr, List(Value("string")), allow_decimal_to_str=False).type == pa.list_(pa.string()) with pytest.raises(TypeError): - cast_array_to_feature(arr, Sequence(Value("string")), allow_primitive_to_str=False) + cast_array_to_feature(arr, List(Value("string")), allow_primitive_to_str=False) def test_cast_decimal_array_to_features(): arr = pa.array([[Decimal(0), Decimal(1)]]) - assert cast_array_to_feature(arr, Sequence(Value("string"))).type == pa.list_(pa.string()) - assert cast_array_to_feature(arr, Sequence(Value("string")), allow_primitive_to_str=False).type == pa.list_( + assert cast_array_to_feature(arr, List(Value("string"))).type == pa.list_(pa.string()) + assert cast_array_to_feature(arr, List(Value("string")), allow_primitive_to_str=False).type == pa.list_( pa.string() ) with pytest.raises(TypeError): - cast_array_to_feature(arr, Sequence(Value("string")), allow_decimal_to_str=False) + cast_array_to_feature(arr, List(Value("string")), allow_decimal_to_str=False) @pytest.mark.parametrize( @@ -1160,7 +1154,7 @@ def test_cast_array_to_feature_with_struct_with_missing_fields(array_list, expec def test_cast_array_to_features_nested(): arr = pa.array([[{"foo": [0]}]]) - assert cast_array_to_feature(arr, [{"foo": Sequence(Value("string"))}]).type == pa.list_( + assert 
cast_array_to_feature(arr, [{"foo": List(Value("string"))}]).type == pa.list_( pa.struct({"foo": pa.list_(pa.string())}) ) @@ -1187,12 +1181,12 @@ def test_cast_array_to_features_nested_with_nulls(): def test_cast_array_to_features_to_null_type(): # same type arr = pa.array([[None, None]]) - assert cast_array_to_feature(arr, Sequence(Value("null"))).type == pa.list_(pa.null()) + assert cast_array_to_feature(arr, List(Value("null"))).type == pa.list_(pa.null()) # different type arr = pa.array([[None, 1]]) with pytest.raises(TypeError): - cast_array_to_feature(arr, Sequence(Value("null"))) + cast_array_to_feature(arr, List(Value("null"))) def test_cast_array_to_features_array_xd(): @@ -1207,26 +1201,26 @@ def test_cast_array_to_features_array_xd(): def test_cast_array_to_features_sequence_classlabel(): arr = pa.array([[], [1], [0, 1]], pa.list_(pa.int64())) - assert cast_array_to_feature(arr, Sequence(ClassLabel(names=["foo", "bar"]))).type == pa.list_(pa.int64()) + assert cast_array_to_feature(arr, List(ClassLabel(names=["foo", "bar"]))).type == pa.list_(pa.int64()) arr = pa.array([[], ["bar"], ["foo", "bar"]], pa.list_(pa.string())) - assert cast_array_to_feature(arr, Sequence(ClassLabel(names=["foo", "bar"]))).type == pa.list_(pa.int64()) + assert cast_array_to_feature(arr, List(ClassLabel(names=["foo", "bar"]))).type == pa.list_(pa.int64()) # Test empty arrays arr = pa.array([[], []], pa.list_(pa.int64())) - assert cast_array_to_feature(arr, Sequence(ClassLabel(names=["foo", "bar"]))).type == pa.list_(pa.int64()) + assert cast_array_to_feature(arr, List(ClassLabel(names=["foo", "bar"]))).type == pa.list_(pa.int64()) arr = pa.array([[], []], pa.list_(pa.string())) - assert cast_array_to_feature(arr, Sequence(ClassLabel(names=["foo", "bar"]))).type == pa.list_(pa.int64()) + assert cast_array_to_feature(arr, List(ClassLabel(names=["foo", "bar"]))).type == pa.list_(pa.int64()) # Test invalid class labels arr = pa.array([[2]], pa.list_(pa.int64())) with 
pytest.raises(ValueError): - assert cast_array_to_feature(arr, Sequence(ClassLabel(names=["foo", "bar"]))) + assert cast_array_to_feature(arr, List(ClassLabel(names=["foo", "bar"]))) arr = pa.array([["baz"]], pa.list_(pa.string())) with pytest.raises(ValueError): - assert cast_array_to_feature(arr, Sequence(ClassLabel(names=["foo", "bar"]))) + assert cast_array_to_feature(arr, List(ClassLabel(names=["foo", "bar"]))) @pytest.mark.parametrize( @@ -1240,14 +1234,14 @@ def test_cast_array_to_features_sequence_classlabel(): def test_cast_fixed_size_list_array_to_features_sequence(arr, slice, target_value_feature): arr = arr if slice is None else arr[slice] # Fixed size list - casted_array = cast_array_to_feature(arr, Sequence(target_value_feature, length=arr.type.list_size)) - assert casted_array.type == get_nested_type(Sequence(target_value_feature, length=arr.type.list_size)) + casted_array = cast_array_to_feature(arr, List(target_value_feature, length=arr.type.list_size)) + assert casted_array.type == get_nested_type(List(target_value_feature, length=arr.type.list_size)) assert casted_array.to_pylist() == arr.to_pylist() with pytest.raises(TypeError): - cast_array_to_feature(arr, Sequence(target_value_feature, length=arr.type.list_size + 1)) + cast_array_to_feature(arr, List(target_value_feature, length=arr.type.list_size + 1)) # Variable size list - casted_array = cast_array_to_feature(arr, Sequence(target_value_feature)) - assert casted_array.type == get_nested_type(Sequence(target_value_feature)) + casted_array = cast_array_to_feature(arr, List(target_value_feature)) + assert casted_array.type == get_nested_type(List(target_value_feature)) assert casted_array.to_pylist() == arr.to_pylist() casted_array = cast_array_to_feature(arr, [target_value_feature]) assert casted_array.type == get_nested_type([target_value_feature]) @@ -1265,16 +1259,16 @@ def test_cast_fixed_size_list_array_to_features_sequence(arr, slice, target_valu def 
test_cast_list_array_to_features_sequence(arr, slice, target_value_feature): arr = arr if slice is None else arr[slice] # Variable size list - casted_array = cast_array_to_feature(arr, Sequence(target_value_feature)) - assert casted_array.type == get_nested_type(Sequence(target_value_feature)) + casted_array = cast_array_to_feature(arr, List(target_value_feature)) + assert casted_array.type == get_nested_type(List(target_value_feature)) assert casted_array.to_pylist() == arr.to_pylist() casted_array = cast_array_to_feature(arr, [target_value_feature]) assert casted_array.type == get_nested_type([target_value_feature]) assert casted_array.to_pylist() == arr.to_pylist() # Fixed size list list_size = arr.value_lengths().drop_null()[0].as_py() if arr.value_lengths().drop_null() else 2 - casted_array = cast_array_to_feature(arr, Sequence(target_value_feature, length=list_size)) - assert casted_array.type == get_nested_type(Sequence(target_value_feature, length=list_size)) + casted_array = cast_array_to_feature(arr, List(target_value_feature, length=list_size)) + assert casted_array.type == get_nested_type(List(target_value_feature, length=list_size)) assert casted_array.to_pylist() == arr.to_pylist() @@ -1303,7 +1297,7 @@ def test_cast_array_to_feature_with_list_array_and_sequence_feature( array_type = pa.struct({"col_1": array_type}) sequence_feature = {"col_1": sequence_feature} expected_array_type = pa.struct({"col_1": expected_array_type}) - feature = Sequence(sequence_feature) + feature = List(sequence_feature) array = pa.array([array_data], type=array_type) cast_array = cast_array_to_feature(array, feature) assert cast_array.type == expected_array_type @@ -1337,12 +1331,12 @@ def test_cast_array_xd_to_features_sequence(): arr = Array2DExtensionType(shape=(2, 3), dtype="int64").wrap_array(pa.array(arr, pa.list_(pa.list_(pa.int64())))) arr = pa.ListArray.from_arrays([0, None, 4, 8], arr) # Variable size list - casted_array = cast_array_to_feature(arr, 
Sequence(Array2D(shape=(2, 3), dtype="int32"))) - assert casted_array.type == get_nested_type(Sequence(Array2D(shape=(2, 3), dtype="int32"))) + casted_array = cast_array_to_feature(arr, List(Array2D(shape=(2, 3), dtype="int32"))) + assert casted_array.type == get_nested_type(List(Array2D(shape=(2, 3), dtype="int32"))) assert casted_array.to_pylist() == arr.to_pylist() # Fixed size list - casted_array = cast_array_to_feature(arr, Sequence(Array2D(shape=(2, 3), dtype="int32"), length=4)) - assert casted_array.type == get_nested_type(Sequence(Array2D(shape=(2, 3), dtype="int32"), length=4)) + casted_array = cast_array_to_feature(arr, List(Array2D(shape=(2, 3), dtype="int32"), length=4)) + assert casted_array.type == get_nested_type(List(Array2D(shape=(2, 3), dtype="int32"), length=4)) assert casted_array.to_pylist() == arr.to_pylist() @@ -1380,7 +1374,7 @@ def test_embed_array_storage_nested(image_file): ), ( pa.array([[{"path": "image_path"}]], type=pa.list_(Image.pa_type)), - Sequence(Image()), + List(Image()), pa.types.is_list, ), ], From dc33788f577bbc9362e41296025b952c249e0f8a Mon Sep 17 00:00:00 2001 From: Quentin Lhoest Date: Mon, 23 Jun 2025 22:34:17 +0200 Subject: [PATCH 2/9] docs --- docs/source/about_dataset_features.mdx | 15 ++++++++------- docs/source/package_reference/main_classes.mdx | 2 ++ docs/source/process.mdx | 11 ++++++----- src/datasets/features/features.py | 1 + 4 files changed, 17 insertions(+), 12 deletions(-) diff --git a/docs/source/about_dataset_features.mdx b/docs/source/about_dataset_features.mdx index d575e28065d..6df80ce72a1 100644 --- a/docs/source/about_dataset_features.mdx +++ b/docs/source/about_dataset_features.mdx @@ -32,20 +32,21 @@ Refer to [`Value`] for a full list of supported data types. The [`ClassLabel`] feature informs 🤗 Datasets the `label` column contains two classes. The classes are labeled `not_equivalent` and `equivalent`. Labels are stored as integers in the dataset. 
When you retrieve the labels, [`ClassLabel.int2str`] and [`ClassLabel.str2int`] carries out the conversion from integer value to label name, and vice versa. -If your data type contains a list of objects, then you want to use the [`Sequence`] feature. Remember the SQuAD dataset? +If your data type contains a list of objects, then you want to use the [`List`] feature. Remember the SQuAD dataset? ```py >>> from datasets import load_dataset >>> dataset = load_dataset('rajpurkar/squad', split='train') >>> dataset.features -{'answers': Sequence(feature={'text': Value(dtype='string'), 'answer_start': Value(dtype='int32')}, length=-1), -'context': Value(dtype='string'), -'id': Value(dtype='string'), -'question': Value(dtype='string'), -'title': Value(dtype='string')} +{'id': Value(dtype='string'), + 'title': Value(dtype='string'), + 'context': Value(dtype='string'), + 'question': Value(dtype='string'), + 'answers': {'text': List(feature=Value(dtype='string'), length=-1), + 'answer_start': List(feature=Value(dtype='int32'), length=-1)}} ``` -The `answers` field is constructed using the [`Sequence`] feature because it contains two subfields, `text` and `answer_start`, which are lists of `string` and `int32`, respectively. +The `answers` field is constructed using a dict of features because it contains two subfields, `text` and `answer_start`, which are lists of `string` and `int32`, respectively. 
diff --git a/docs/source/package_reference/main_classes.mdx b/docs/source/package_reference/main_classes.mdx index 41ec8051f44..299dd765d13 100644 --- a/docs/source/package_reference/main_classes.mdx +++ b/docs/source/package_reference/main_classes.mdx @@ -235,6 +235,8 @@ Dictionary with split names as keys ('train', 'test' for example), and `Iterable [[autodoc]] datasets.LargeList +[[autodoc]] datasets.List + [[autodoc]] datasets.Sequence ### Translation diff --git a/docs/source/process.mdx b/docs/source/process.mdx index bdc7e33caf5..ec86f41dab2 100644 --- a/docs/source/process.mdx +++ b/docs/source/process.mdx @@ -265,11 +265,12 @@ Sometimes a column can be a nested structure of several types. Take a look at th >>> from datasets import load_dataset >>> dataset = load_dataset("rajpurkar/squad", split="train") >>> dataset.features -{'answers': Sequence(feature={'text': Value(dtype='string'), 'answer_start': Value(dtype='int32')}, length=-1), -'context': Value(dtype='string'), -'id': Value(dtype='string'), -'question': Value(dtype='string'), -'title': Value(dtype='string')} +{'id': Value(dtype='string'), + 'title': Value(dtype='string'), + 'context': Value(dtype='string'), + 'question': Value(dtype='string'), + 'answers': {'text': List(feature=Value(dtype='string'), length=-1), + 'answer_start': List(feature=Value(dtype='int32'), length=-1)}} ``` The `answers` field contains two subfields: `text` and `answer_start`. 
Use the [`~Dataset.flatten`] function to extract the subfields into their own separate columns: diff --git a/src/datasets/features/features.py b/src/datasets/features/features.py index 99222d8c32a..148face482d 100644 --- a/src/datasets/features/features.py +++ b/src/datasets/features/features.py @@ -1158,6 +1158,7 @@ def _load_names_from_file(names_filepath): def Sequence(feature, length=-1): + """deprecated, please use List instead""" if isinstance(feature, dict): return {key: List(value, length=length) for key, value in feature.items()} else: From 2d90521b157fb531039c2e1c0ab25a1d39dab2b5 Mon Sep 17 00:00:00 2001 From: Quentin Lhoest Date: Tue, 24 Jun 2025 16:44:26 +0200 Subject: [PATCH 3/9] fix tests --- src/datasets/features/features.py | 21 +++---------- tests/features/test_features.py | 51 +++++++++++++------------------ tests/io/test_parquet.py | 2 +- tests/test_builder.py | 20 ++++++------ tests/test_iterable_dataset.py | 3 +- tests/test_table.py | 43 ++++++++++++++------------ 6 files changed, 62 insertions(+), 78 deletions(-) diff --git a/src/datasets/features/features.py b/src/datasets/features/features.py index 148face482d..97af496614f 100644 --- a/src/datasets/features/features.py +++ b/src/datasets/features/features.py @@ -1294,21 +1294,6 @@ def encode_nested_example(schema, obj, level=0): if obj is not None else None ) - - elif isinstance(schema, (list, tuple)): - sub_schema = schema[0] - if obj is None: - return None - elif isinstance(obj, np.ndarray): - return encode_nested_example(schema, obj.tolist()) - else: - if len(obj) > 0: - for first_elmt in obj: - if _check_non_null_non_empty_recursive(first_elmt, sub_schema): - break - if encode_nested_example(sub_schema, first_elmt, level=level + 1) != first_elmt: - return [encode_nested_example(sub_schema, o, level=level + 1) for o in obj] - return list(obj) elif isinstance(schema, (LargeList, List)): if obj is None: return None @@ -1318,7 +1303,11 @@ def encode_nested_example(schema, obj, level=0): for 
first_elmt in obj: if _check_non_null_non_empty_recursive(first_elmt, sub_schema): break - if encode_nested_example(sub_schema, first_elmt, level=level + 1) != first_elmt: + try: + changed = bool(encode_nested_example(sub_schema, first_elmt, level=level + 1) != first_elmt) + except ValueError: # can happen when comparing arrays + changed = False + if changed: return [encode_nested_example(sub_schema, o, level=level + 1) for o in obj] return list(obj) # Object with special encoding: diff --git a/tests/features/test_features.py b/tests/features/test_features.py index c9d98c9f001..1ead81f8e3d 100644 --- a/tests/features/test_features.py +++ b/tests/features/test_features.py @@ -1,5 +1,4 @@ import datetime -from typing import List, Tuple from unittest import TestCase from unittest.mock import MagicMock, patch @@ -10,7 +9,7 @@ from datasets import Array2D from datasets.arrow_dataset import Column, Dataset -from datasets.features import Audio, ClassLabel, Features, Image, LargeList, Sequence, Value +from datasets.features import Audio, ClassLabel, Features, Image, LargeList, List, Sequence, Value from datasets.features.features import ( _align_features, _arrow_to_datasets_dtype, @@ -186,7 +185,7 @@ def test_reorder_fields_as(self): }, "question": { "text": Value("string"), - "tokens": [Value("string")], + "tokens": List(Value("string")), }, "annotations": { "yes_no_answer": List(ClassLabel(names=["NO", "YES"])), @@ -266,7 +265,7 @@ def test_flatten_with_sequence(self): features = Features({"foo": {"bar": List({"my_value": Value("int32")})}}) _features = features.copy() flattened_features = features.flatten() - assert flattened_features == {"foo.bar": [{"my_value": Value("int32")}]} + assert flattened_features == {"foo.bar": List({"my_value": Value("int32")})} assert features == _features, "calling flatten shouldn't alter the current features" def test_features_dicts_are_synced(self): @@ -411,7 +410,7 @@ def test_decode_nested_example_with_list_types(schema, monkeypatch): 
@pytest.mark.parametrize( "schema", - [[ClassLabel(names=["a", "b"])], LargeList(ClassLabel(names=["a", "b"])), List(ClassLabel(names=["a", "b"]))], + [List(ClassLabel(names=["a", "b"])), LargeList(ClassLabel(names=["a", "b"]))], ) def test_encode_nested_example_with_list_types(schema): result = encode_nested_example(schema, ["b"]) @@ -430,7 +429,7 @@ def test_encode_nested_example_sequence_with_none(inner_type): "features_dict, example, expected_encoded_example", [ ({"col_1": ClassLabel(names=["a", "b"])}, {"col_1": "b"}, {"col_1": 1}), - ({"col_1": [ClassLabel(names=["a", "b"])]}, {"col_1": ["b"]}, {"col_1": [1]}), + ({"col_1": List(ClassLabel(names=["a", "b"]))}, {"col_1": ["b"]}, {"col_1": [1]}), ({"col_1": LargeList(ClassLabel(names=["a", "b"]))}, {"col_1": ["b"]}, {"col_1": [1]}), ({"col_1": List(ClassLabel(names=["a", "b"]))}, {"col_1": ["b"]}, {"col_1": [1]}), ], @@ -716,20 +715,16 @@ def test_features_flatten_with_list_types(features_dict, expected_features_dict) "deserialized_features_dict, expected_features_dict", [ ( - {"col": [{"dtype": "int32", "_type": "Value"}]}, - {"col": [Value("int32")]}, + {"col": {"feature": {"dtype": "int32", "_type": "Value"}, "_type": "List"}}, + {"col": List(Value("int32"))}, ), ( {"col": {"feature": {"dtype": "int32", "_type": "Value"}, "_type": "LargeList"}}, {"col": LargeList(Value("int32"))}, ), ( - {"col": {"feature": {"dtype": "int32", "_type": "Value"}, "_type": "Sequence"}}, - {"col": List(Value("int32"))}, - ), - ( - {"col": [{"sub_col": {"dtype": "int32", "_type": "Value"}}]}, - {"col": [{"sub_col": Value("int32")}]}, + {"col": {"feature": {"sub_col": {"dtype": "int32", "_type": "Value"}}, "_type": "List"}}, + {"col": List({"sub_col": Value("int32")})}, ), ( {"col": {"feature": {"sub_col": {"dtype": "int32", "_type": "Value"}}, "_type": "LargeList"}}, @@ -749,28 +744,24 @@ def test_features_from_dict_with_list_types(deserialized_features_dict, expected @pytest.mark.parametrize( "deserialized_feature_dict, 
expected_feature", [ - ( - [{"dtype": "int32", "_type": "Value"}], - [Value("int32")], - ), ( {"feature": {"dtype": "int32", "_type": "Value"}, "_type": "LargeList"}, LargeList(Value("int32")), ), ( - {"feature": {"dtype": "int32", "_type": "Value"}, "_type": "Sequence"}, + {"feature": {"dtype": "int32", "_type": "Value"}, "_type": "List"}, List(Value("int32")), ), ( - [{"sub_col": {"dtype": "int32", "_type": "Value"}}], - [{"sub_col": Value("int32")}], + {"feature": {"sub_col": {"dtype": "int32", "_type": "Value"}}, "_type": "List"}, + List({"sub_col": Value("int32")}), ), ( {"feature": {"sub_col": {"dtype": "int32", "_type": "Value"}}, "_type": "LargeList"}, LargeList({"sub_col": Value("int32")}), ), ( - {"feature": {"sub_col": {"dtype": "int32", "_type": "Value"}}, "_type": "Sequence"}, + {"sub_col": {"feature": {"dtype": "int32", "_type": "Value"}, "_type": "List"}}, {"sub_col": List(Value("int32"))}, ), ], @@ -852,7 +843,7 @@ def test_features_to_arrow_schema(features: Features): @pytest.mark.parametrize("features", NESTED_COMPARISON) -def test_features_alignment(features: Tuple[List[Features], Features]): +def test_features_alignment(features: tuple[list[Features], list[Features]]): inputs, expected = features _check_if_features_can_be_aligned(inputs) # Check that we can align, will raise otherwise. 
assert _align_features(inputs) == expected @@ -874,12 +865,12 @@ def test_features_from_arrow_schema_list_data_type(list_dtype, scalar_dtype): @pytest.mark.parametrize( "feature, other_feature", [ - ([Value("int64")], [Value("int64")]), + (List(Value("int64")), List(Value("int64"))), (LargeList(Value("int64")), LargeList(Value("int64"))), (List(Value("int64")), List(Value("int64"))), ( - [{"sub_col_1": Value("int64"), "sub_col_2": Value("int64")}], - [{"sub_col_2": Value("int64"), "sub_col_1": Value("int64")}], + List({"sub_col_1": Value("int64"), "sub_col_2": Value("int64")}), + List({"sub_col_2": Value("int64"), "sub_col_1": Value("int64")}), ), ( LargeList({"sub_col_1": Value("int64"), "sub_col_2": Value("int64")}), @@ -965,24 +956,24 @@ def test_check_non_null_non_empty_recursive_with_nested_list_types(schema): assert _check_non_null_non_empty_recursive([[]], schema) is False -@pytest.mark.parametrize("feature", [[Audio()], LargeList(Audio()), List(Audio())]) +@pytest.mark.parametrize("feature", [LargeList(Audio()), List(Audio())]) def test_require_decoding_with_list_types(feature): assert require_decoding(feature) -@pytest.mark.parametrize("feature", [[Audio()], LargeList(Audio()), List(Audio())]) +@pytest.mark.parametrize("feature", [LargeList(Audio()), List(Audio())]) def test_require_storage_cast_with_list_types(feature): assert require_storage_cast(feature) -@pytest.mark.parametrize("feature", [[Audio()], LargeList(Audio()), List(Audio())]) +@pytest.mark.parametrize("feature", [LargeList(Audio()), List(Audio())]) def test_require_storage_embed_with_list_types(feature): assert require_storage_embed(feature) @pytest.mark.parametrize( "feature, expected", - [([Value("int32")], [1]), (LargeList(Value("int32")), LargeList(1)), (List(Value("int32")), List(1))], + [(List(Value("int32")), List(1)), (LargeList(Value("int32")), LargeList(1)), (List(Value("int32")), List(1))], ) def test_visit_with_list_types(feature, expected): def func(x): diff --git 
a/tests/io/test_parquet.py b/tests/io/test_parquet.py index 5062b88a60c..c01781972f5 100644 --- a/tests/io/test_parquet.py +++ b/tests/io/test_parquet.py @@ -2,7 +2,7 @@ import pyarrow.parquet as pq import pytest -from datasets import Audio, Dataset, DatasetDict, Features, IterableDatasetDict, NamedSplit, Value, config +from datasets import Audio, Dataset, DatasetDict, Features, IterableDatasetDict, List, NamedSplit, Value, config from datasets.features.image import Image from datasets.info import DatasetInfo from datasets.io.parquet import ParquetDatasetReader, ParquetDatasetWriter, get_writer_batch_size diff --git a/tests/test_builder.py b/tests/test_builder.py index c87bf030edc..4c8c949b6c8 100644 --- a/tests/test_builder.py +++ b/tests/test_builder.py @@ -26,7 +26,7 @@ from datasets.data_files import DataFilesList from datasets.dataset_dict import DatasetDict, IterableDatasetDict from datasets.download.download_manager import DownloadMode -from datasets.features import Features, Value +from datasets.features import Features, List, Value from datasets.info import DatasetInfo, PostProcessedInfo from datasets.iterable_dataset import IterableDataset from datasets.load import configure_builder_class @@ -346,7 +346,7 @@ def _post_processing_resources(self, split): with tempfile.TemporaryDirectory() as tmp_dir: builder = DummyBuilder(cache_dir=tmp_dir) builder.info.post_processed = PostProcessedInfo( - features=Features({"text": Value("string"), "tokens": [Value("string")]}) + features=Features({"text": Value("string"), "tokens": List(Value("string"))}) ) builder._post_process = types.MethodType(_post_process, builder) builder._post_processing_resources = types.MethodType(_post_processing_resources, builder) @@ -366,7 +366,7 @@ def _post_processing_resources(self, split): with ArrowWriter( path=os.path.join(builder.cache_dir, f"tokenized_dataset-{split}.arrow"), - features=Features({"text": Value("string"), "tokens": [Value("string")]}), + features=Features({"text": 
Value("string"), "tokens": List(Value("string"))}), ) as writer: writer.write_batch({"text": ["foo"] * 10, "tokens": [list("foo")] * 10}) writer.finalize() @@ -377,10 +377,10 @@ def _post_processing_resources(self, split): self.assertEqual(len(dsets["train"]), 10) self.assertEqual(len(dsets["test"]), 10) self.assertDictEqual( - dsets["train"].features, Features({"text": Value("string"), "tokens": [Value("string")]}) + dsets["train"].features, Features({"text": Value("string"), "tokens": List(Value("string"))}) ) self.assertDictEqual( - dsets["test"].features, Features({"text": Value("string"), "tokens": [Value("string")]}) + dsets["test"].features, Features({"text": Value("string"), "tokens": List(Value("string"))}) ) self.assertListEqual(dsets["train"].column_names, ["text", "tokens"]) self.assertListEqual(dsets["test"].column_names, ["text", "tokens"]) @@ -390,7 +390,7 @@ def _post_processing_resources(self, split): self.assertIsInstance(dset, Dataset) self.assertEqual(dset.split, "train") self.assertEqual(len(dset), 10) - self.assertDictEqual(dset.features, Features({"text": Value("string"), "tokens": [Value("string")]})) + self.assertDictEqual(dset.features, Features({"text": Value("string"), "tokens": List(Value("string"))})) self.assertListEqual(dset.column_names, ["text", "tokens"]) self.assertGreater(builder.info.post_processing_size, 0) self.assertGreater( @@ -402,7 +402,7 @@ def _post_processing_resources(self, split): self.assertIsInstance(dset, Dataset) self.assertEqual(dset.split, "train+test[:30%]") self.assertEqual(len(dset), 13) - self.assertDictEqual(dset.features, Features({"text": Value("string"), "tokens": [Value("string")]})) + self.assertDictEqual(dset.features, Features({"text": Value("string"), "tokens": List(Value("string"))})) self.assertListEqual(dset.column_names, ["text", "tokens"]) del dset @@ -410,7 +410,7 @@ def _post_processing_resources(self, split): self.assertIsInstance(dset, Dataset) self.assertEqual(dset.split, "train+test") 
self.assertEqual(len(dset), 20) - self.assertDictEqual(dset.features, Features({"text": Value("string"), "tokens": [Value("string")]})) + self.assertDictEqual(dset.features, Features({"text": Value("string"), "tokens": List(Value("string"))})) self.assertListEqual(dset.column_names, ["text", "tokens"]) del dset @@ -555,7 +555,7 @@ def _post_processing_resources(self, split): with tempfile.TemporaryDirectory() as tmp_dir: builder = DummyBuilder(cache_dir=tmp_dir) builder.info.post_processed = PostProcessedInfo( - features=Features({"text": Value("string"), "tokens": [Value("string")]}) + features=Features({"text": Value("string"), "tokens": List(Value("string"))}) ) builder._post_process = types.MethodType(_post_process, builder) builder._post_processing_resources = types.MethodType(_post_processing_resources, builder) @@ -570,7 +570,7 @@ def _post_processing_resources(self, split): self.assertDictEqual(builder.info.features, Features({"text": Value("string")})) self.assertDictEqual( builder.info.post_processed.features, - Features({"text": Value("string"), "tokens": [Value("string")]}), + Features({"text": Value("string"), "tokens": List(Value("string"))}), ) self.assertEqual(builder.info.splits["train"].num_examples, 100) self.assertTrue( diff --git a/tests/test_iterable_dataset.py b/tests/test_iterable_dataset.py index 855903fd8c2..1bca866bdf8 100644 --- a/tests/test_iterable_dataset.py +++ b/tests/test_iterable_dataset.py @@ -19,6 +19,7 @@ ClassLabel, Features, Image, + List, Value, ) from datasets.formatting import Formatter, get_format_type_from_alias @@ -1766,7 +1767,7 @@ def test_iterable_dataset_features_cast_to_python(): { "id": Value("int64"), "timestamp": Value("timestamp[us]"), - "array": [Value("int64")], + "array": List(Value("int64")), } ) dataset = IterableDataset(ex_iterable, info=DatasetInfo(features=features)) diff --git a/tests/test_table.py b/tests/test_table.py index 7ca740e8fde..b7bf75cc803 100644 --- a/tests/test_table.py +++ 
b/tests/test_table.py @@ -2,14 +2,14 @@ import pickle from decimal import Decimal from functools import partial -from typing import List, Union +from typing import Union from unittest.mock import MagicMock import numpy as np import pyarrow as pa import pytest -from datasets.features import Array2D, ClassLabel, Features, Image, LargeList, Value +from datasets.features import Array2D, ClassLabel, Features, Image, LargeList, List, Value from datasets.features.features import Array2DExtensionType, get_nested_type from datasets.table import ( ConcatenationTable, @@ -40,7 +40,7 @@ def in_memory_pa_table(arrow_file) -> pa.Table: return pa.ipc.open_stream(arrow_file).read_all() -def _to_testing_blocks(table: TableBlock) -> List[List[TableBlock]]: +def _to_testing_blocks(table: TableBlock) -> list[list[TableBlock]]: assert len(table) > 2 blocks = [ [table.slice(0, 2)], @@ -1049,7 +1049,7 @@ def test_concat_tables(arrow_file, in_memory_pa_table): assert isinstance(concatenated_table.blocks[0][2], InMemoryTable) -def _interpolation_search_ground_truth(arr: List[int], x: int) -> Union[int, IndexError]: +def _interpolation_search_ground_truth(arr: list[int], x: int) -> Union[int, IndexError]: for i in range(len(arr) - 1): if arr[i] <= x < arr[i + 1]: return i @@ -1154,7 +1154,7 @@ def test_cast_array_to_feature_with_struct_with_missing_fields(array_list, expec def test_cast_array_to_features_nested(): arr = pa.array([[{"foo": [0]}]]) - assert cast_array_to_feature(arr, [{"foo": List(Value("string"))}]).type == pa.list_( + assert cast_array_to_feature(arr, List({"foo": List(Value("string"))})).type == pa.list_( pa.struct({"foo": pa.list_(pa.string())}) ) @@ -1168,12 +1168,12 @@ def test_cast_array_to_features_to_nested_with_no_fields(): def test_cast_array_to_features_nested_with_nulls(): # same type arr = pa.array([{"foo": [None, [0]]}], pa.struct({"foo": pa.list_(pa.list_(pa.int64()))})) - casted_array = cast_array_to_feature(arr, {"foo": [[Value("int64")]]}) + casted_array = 
cast_array_to_feature(arr, {"foo": List(List(Value("int64")))}) assert casted_array.type == pa.struct({"foo": pa.list_(pa.list_(pa.int64()))}) assert casted_array.to_pylist() == arr.to_pylist() # different type arr = pa.array([{"foo": [None, [0]]}], pa.struct({"foo": pa.list_(pa.list_(pa.int64()))})) - casted_array = cast_array_to_feature(arr, {"foo": [[Value("int32")]]}) + casted_array = cast_array_to_feature(arr, {"foo": List(List(Value("int32")))}) assert casted_array.type == pa.struct({"foo": pa.list_(pa.list_(pa.int32()))}) assert casted_array.to_pylist() == [{"foo": [None, [0]]}] @@ -1243,8 +1243,8 @@ def test_cast_fixed_size_list_array_to_features_sequence(arr, slice, target_valu casted_array = cast_array_to_feature(arr, List(target_value_feature)) assert casted_array.type == get_nested_type(List(target_value_feature)) assert casted_array.to_pylist() == arr.to_pylist() - casted_array = cast_array_to_feature(arr, [target_value_feature]) - assert casted_array.type == get_nested_type([target_value_feature]) + casted_array = cast_array_to_feature(arr, List(target_value_feature)) + assert casted_array.type == get_nested_type(List(target_value_feature)) assert casted_array.to_pylist() == arr.to_pylist() @@ -1262,8 +1262,8 @@ def test_cast_list_array_to_features_sequence(arr, slice, target_value_feature): casted_array = cast_array_to_feature(arr, List(target_value_feature)) assert casted_array.type == get_nested_type(List(target_value_feature)) assert casted_array.to_pylist() == arr.to_pylist() - casted_array = cast_array_to_feature(arr, [target_value_feature]) - assert casted_array.type == get_nested_type([target_value_feature]) + casted_array = cast_array_to_feature(arr, List(target_value_feature)) + assert casted_array.type == get_nested_type(List(target_value_feature)) assert casted_array.to_pylist() == arr.to_pylist() # Fixed size list list_size = arr.value_lengths().drop_null()[0].as_py() if arr.value_lengths().drop_null() else 2 @@ -1278,6 +1278,11 @@ def 
test_cast_list_array_to_features_sequence(arr, slice, target_value_feature): def test_cast_array_to_feature_with_list_array_and_sequence_feature( list_within_struct, from_list_type, sequence_feature_dtype ): + list_feature = { + "list": List, + "fixed_size_list": partial(List, length=2), + "large_list": LargeList, + } list_type = { "list": pa.list_, "fixed_size_list": partial(pa.list_, list_size=2), @@ -1290,14 +1295,17 @@ def test_cast_array_to_feature_with_list_array_and_sequence_feature( to_type = "list" array_data = [0, 1] array_type = list_type[from_list_type](pa.int64()) - sequence_feature = Value(sequence_feature_dtype) - expected_array_type = list_type[to_type](primitive_type[sequence_feature_dtype]) + sequence_feature = list_feature[from_list_type](Value(sequence_feature_dtype)) + expected_array_type = list_type[from_list_type](primitive_type[sequence_feature_dtype]) if list_within_struct: array_data = {"col_1": array_data} array_type = pa.struct({"col_1": array_type}) sequence_feature = {"col_1": sequence_feature} expected_array_type = pa.struct({"col_1": expected_array_type}) - feature = List(sequence_feature) + array_data = [array_data] * 2 + array_type = list_type[from_list_type](array_type) + feature = list_feature[to_type](sequence_feature) + expected_array_type = list_type[to_type](expected_array_type) array = pa.array([array_data], type=array_type) cast_array = cast_array_to_feature(array, feature) assert cast_array.type == expected_array_type @@ -1364,7 +1372,7 @@ def test_embed_array_storage_nested(image_file): [ ( pa.array([[{"path": "image_path"}]], type=pa.list_(Image.pa_type)), - [Image()], + List(Image()), pa.types.is_list, ), ( @@ -1372,11 +1380,6 @@ def test_embed_array_storage_nested(image_file): LargeList(Image()), pa.types.is_large_list, ), - ( - pa.array([[{"path": "image_path"}]], type=pa.list_(Image.pa_type)), - List(Image()), - pa.types.is_list, - ), ], ) def test_embed_array_storage_with_list_types(array, feature, 
expected_embedded_array_type, monkeypatch): From 52e04d520864f31be761fc2ed5548230f43eeefc Mon Sep 17 00:00:00 2001 From: Quentin Lhoest Date: Tue, 24 Jun 2025 20:19:32 +0200 Subject: [PATCH 4/9] fix tests and add backward compatibility utilities --- src/datasets/arrow_dataset.py | 6 ++++ src/datasets/features/features.py | 52 ++++++++++++++++++++++--------- src/datasets/iterable_dataset.py | 5 +++ src/datasets/load.py | 12 +++---- tests/features/test_audio.py | 2 +- tests/features/test_image.py | 4 +-- tests/utils.py | 6 ++-- 7 files changed, 59 insertions(+), 28 deletions(-) diff --git a/src/datasets/arrow_dataset.py b/src/datasets/arrow_dataset.py index d0169ea5b3b..f5d4d5878c5 100644 --- a/src/datasets/arrow_dataset.py +++ b/src/datasets/arrow_dataset.py @@ -81,6 +81,7 @@ FeatureType, _align_features, _check_if_features_can_be_aligned, + _fix_for_backward_compatible_features, generate_from_arrow_type, pandas_types_mapper, require_decoding, @@ -2118,6 +2119,7 @@ def cast( f"as the columns in the dataset: {self._data.column_names}" ) + features = _fix_for_backward_compatible_features(features) schema = features.arrow_schema format = self.format dataset = self.with_format("arrow") @@ -2167,6 +2169,7 @@ def cast_column(self, column: str, feature: FeatureType, new_fingerprint: Option 'text': Value(dtype='string', id=None)} ``` """ + feature = _fix_for_backward_compatible_features(feature) if hasattr(feature, "decode_example"): dataset = copy.deepcopy(self) dataset._info.features[column] = feature @@ -3083,6 +3086,9 @@ def map( if fn_kwargs is None: fn_kwargs = {} + if features is not None: + features = _fix_for_backward_compatible_features(features) + if num_proc is not None and num_proc > len(self): num_proc = len(self) logger.warning( diff --git a/src/datasets/features/features.py b/src/datasets/features/features.py index 97af496614f..3e329e791f7 100644 --- a/src/datasets/features/features.py +++ b/src/datasets/features/features.py @@ -1158,7 +1158,24 @@ def 
_load_names_from_file(names_filepath): def Sequence(feature, length=-1): - """deprecated, please use List instead""" + """ + A `Sequence` is a utility that automatically converts internal dictionary feature into a dictionary of + lists. This behavior is implemented to have a compatibility layer with the TensorFlow Datasets library but may be + un-wanted in some cases. If you don't want this behavior, you can use a [`List`] or a [`LargeList`] + instead of the [`Sequence`]. + + Args: + feature ([`FeatureType`]): + Child feature data type of each item within the large list. + length (optional `int`, default to -1): + Length of the list if it is fixed. + Defaults to -1 which means an arbitrary length. + + Returns: + [`List`] of the specified feature, except `dict` of sub-features + which are converted to `dict` of lists of sub-features for compatibility with TFDS. + + """ if isinstance(feature, dict): return {key: List(value, length=length) for key, value in feature.items()} else: @@ -1174,6 +1191,9 @@ class List: Args: feature ([`FeatureType`]): Child feature data type of each item within the large list. + length (optional `int`, default to -1): + Length of the list if it is fixed. + Defaults to -1 which means an arbitrary length. """ feature: Any @@ -1558,8 +1578,6 @@ def _visit(feature: FeatureType, func: Callable[[FeatureType], Optional[FeatureT out = func(Features({k: _visit(f, func) for k, f in feature.items()})) elif isinstance(feature, dict): out = func({k: _visit(f, func) for k, f in feature.items()}) - elif isinstance(feature, (list, tuple)): - out = func([_visit(feature[0], func)]) elif isinstance(feature, LargeList): out = func(LargeList(_visit(feature.feature, func))) elif isinstance(feature, List): @@ -1700,22 +1718,19 @@ class Features(dict): It's possible to have nested fields of nested fields in an arbitrary manner. - [`List`] or [`LargeList`] specifies a composite feature containing a sequence of sub-features, all of the same feature type. 
- - - - A `Sequence` is deprecated and automatically converts internal dictionary feature into a dictionary of - lists. This behavior is implemented to have a compatibility layer with the TensorFlow Datasets library but may be - un-wanted in some cases. If you don't want this behavior, you can use a [`List`] or a [`LargeList`] - instead of the [`Sequence`]. - - - - [`Array2D`], [`Array3D`], [`Array4D`] or [`Array5D`] feature for multidimensional arrays. - [`Audio`] feature to store the absolute path to an audio file or a dictionary with the relative path - to an audio file ("path" key) and its bytes content ("bytes" key). This feature extracts the audio data. + to an audio file ("path" key) and its bytes content ("bytes" key). + This feature loads the audio lazily with a decoder. - [`Image`] feature to store the absolute path to an image file, an `np.ndarray` object, a `PIL.Image.Image` object or a dictionary with the relative path to an image file ("path" key) and its bytes content ("bytes" key). This feature extracts the image data. + - [`Video`] feature to store the absolute path to a video file, a `torchcodec.decoders.VideoDecoder` object + or a dictionary with the relative path to a video file ("path" key) and its bytes content ("bytes" key). + This feature loads the video lazily with a decoder. + - [`Pdf`] feature to store the absolute path to a PDF file, a `pdfplumber.pdf.PDF` object + or a dictionary with the relative path to a PDF file ("path" key) and its bytes content ("bytes" key). + This feature loads the PDF lazily with a PDF reader. - [`Translation`] or [`TranslationVariableLanguages`] feature specific to Machine Translation. """ @@ -2252,3 +2267,12 @@ def _check_if_features_can_be_aligned(features_list: list[Features]): raise ValueError( f'The features can\'t be aligned because the key {k} of features {features} has unexpected type - {v} (expected either {name2feature[k]} or Value("null").' 
) + + +def _fix_for_backward_compatible_features(feature: Any) -> FeatureType: + def _fix_old_list(feature): + if isinstance(feature, list): + return List(_fix_for_backward_compatible_features(feature[0])) + return feature + + return _visit(feature, _fix_old_list) diff --git a/src/datasets/iterable_dataset.py b/src/datasets/iterable_dataset.py index c8ccdea7c1e..8031c140ef4 100644 --- a/src/datasets/iterable_dataset.py +++ b/src/datasets/iterable_dataset.py @@ -37,6 +37,7 @@ Value, _align_features, _check_if_features_can_be_aligned, + _fix_for_backward_compatible_features, _visit, cast_to_python_objects, require_decoding, @@ -2661,6 +2662,8 @@ def map( function = identity_func if fn_kwargs is None: fn_kwargs = {} + if features is not None: + features = _fix_for_backward_compatible_features(features) ex_iterable = self._ex_iterable # no need to apply features if ex_iterable is typed and if there was no cast_column() @@ -3244,6 +3247,7 @@ def cast_column(self, column: str, feature: FeatureType) -> "IterableDataset": 'transcription': Value(dtype='string', id=None)} ``` """ + feature = _fix_for_backward_compatible_features(feature) info = self._info.copy() info.features[column] = feature return IterableDataset( @@ -3290,6 +3294,7 @@ def cast( 'text': Value(dtype='large_string', id=None)} ``` """ + features = _fix_for_backward_compatible_features(features) info = self._info.copy() info.features = features return IterableDataset( diff --git a/src/datasets/load.py b/src/datasets/load.py index 36e4840f5bb..6c51dabd0b3 100644 --- a/src/datasets/load.py +++ b/src/datasets/load.py @@ -23,7 +23,6 @@ import posixpath from collections import Counter from collections.abc import Mapping, Sequence -from contextlib import nullcontext from dataclasses import dataclass, field from pathlib import Path from typing import Any, Optional, Union @@ -60,6 +59,7 @@ from .download.streaming_download_manager import StreamingDownloadManager, xbasename, xglob, xjoin from .exceptions import 
DataFilesNotFoundError, DatasetNotFoundError from .features import Features +from .features.features import _fix_for_backward_compatible_features from .fingerprint import Hasher from .info import DatasetInfo, DatasetInfosDict from .iterable_dataset import IterableDataset @@ -171,12 +171,7 @@ def import_main_class(module_path) -> Optional[type[DatasetBuilder]]: def get_dataset_builder_class( dataset_module: "DatasetModule", dataset_name: Optional[str] = None ) -> type[DatasetBuilder]: - with ( - lock_importable_file(dataset_module.importable_file_path) - if dataset_module.importable_file_path - else nullcontext() - ): - builder_cls = import_main_class(dataset_module.module_path) + builder_cls = import_main_class(dataset_module.module_path) if dataset_module.builder_configs_parameters.builder_configs: dataset_name = dataset_name or dataset_module.builder_kwargs.get("dataset_name") if dataset_name is None: @@ -388,7 +383,6 @@ class DatasetModule: builder_kwargs: dict builder_configs_parameters: BuilderConfigsParameters = field(default_factory=BuilderConfigsParameters) dataset_infos: Optional[DatasetInfosDict] = None - importable_file_path: Optional[str] = None class _DatasetModuleFactory: @@ -1133,6 +1127,8 @@ def load_dataset_builder( if storage_options is not None: download_config = download_config.copy() if download_config else DownloadConfig() download_config.storage_options.update(storage_options) + if features is not None: + features = _fix_for_backward_compatible_features(features) dataset_module = dataset_module_factory( path, revision=revision, diff --git a/tests/features/test_audio.py b/tests/features/test_audio.py index f959458777c..dae082429ed 100644 --- a/tests/features/test_audio.py +++ b/tests/features/test_audio.py @@ -7,7 +7,7 @@ import pytest from datasets import Column, Dataset, concatenate_datasets, load_dataset -from datasets.features import Audio, Features, Value +from datasets.features import Audio, Features, List, Value from ..utils import 
require_sndfile, require_torchcodec diff --git a/tests/features/test_image.py b/tests/features/test_image.py index 57559b57a1e..0b6774330b5 100644 --- a/tests/features/test_image.py +++ b/tests/features/test_image.py @@ -9,7 +9,7 @@ import pyarrow as pa import pytest -from datasets import Column, Dataset, Features, Image, Value, concatenate_datasets, load_dataset +from datasets import Column, Dataset, Features, Image, List, Value, concatenate_datasets, load_dataset from datasets.features.image import encode_np_array, image_to_bytes from ..utils import require_pil @@ -336,7 +336,7 @@ def test_dataset_concatenate_image_features(shared_datadir): def test_dataset_concatenate_nested_image_features(shared_datadir): # we use a different data structure between 1 and 2 to make sure they are compatible with each other image_path = str(shared_datadir / "test_image_rgb.jpg") - features = Features({"list_of_structs_of_images": [{"image": Image()}]}) + features = Features({"list_of_structs_of_images": List({"image": Image()})}) data1 = {"list_of_structs_of_images": [[{"image": image_path}]]} dset1 = Dataset.from_dict(data1, features=features) data2 = {"list_of_structs_of_images": [[{"image": {"bytes": open(image_path, "rb").read()}}]]} diff --git a/tests/utils.py b/tests/utils.py index 66341e70220..be159bae21f 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -191,13 +191,13 @@ def require_torchvision(test_case): def require_torchcodec(test_case): """ - Decorator marking a test that requires torchvision. + Decorator marking a test that requires torchcodec. - These tests are skipped when torchvision isn't installed. + These tests are skipped when torchcodec isn't installed. 
""" if not config.TORCHCODEC_AVAILABLE: - test_case = unittest.skip("test requires torchvision")(test_case) + test_case = unittest.skip("test requires torchcodec")(test_case) return test_case From 31c778066c8fe282623dd20cb9939aad8c9a2d9b Mon Sep 17 00:00:00 2001 From: Quentin Lhoest Date: Tue, 24 Jun 2025 20:45:36 +0200 Subject: [PATCH 5/9] fix tests --- src/datasets/arrow_dataset.py | 6 ++++++ src/datasets/features/features.py | 7 ++++--- src/datasets/search.py | 4 ++-- tests/commands/test_test.py | 2 +- tests/test_inspect.py | 4 ++-- tests/test_upstream_hub.py | 3 ++- 6 files changed, 17 insertions(+), 9 deletions(-) diff --git a/src/datasets/arrow_dataset.py b/src/datasets/arrow_dataset.py index f5d4d5878c5..cd5b5e52d1b 100644 --- a/src/datasets/arrow_dataset.py +++ b/src/datasets/arrow_dataset.py @@ -898,6 +898,8 @@ def from_pandas( f"Features specified in `features` and `info.features` can't be different:\n{features}\n{info.features}" ) features = features if features is not None else info.features if info is not None else None + if features is not None: + features = _fix_for_backward_compatible_features(features) if info is None: info = DatasetInfo() info.features = features @@ -943,6 +945,8 @@ def from_polars( f"Features specified in `features` and `info.features` can't be different:\n{features}\n{info.features}" ) features = features if features is not None else info.features if info is not None else None + if features is not None: + features = _fix_for_backward_compatible_features(features) if info is None: info = DatasetInfo() info.features = features @@ -988,6 +992,8 @@ def from_dict( f"Features specified in `features` and `info.features` can't be different:\n{features}\n{info.features}" ) features = features if features is not None else info.features if info is not None else None + if features is not None: + features = _fix_for_backward_compatible_features(features) arrow_typed_mapping = {} for col, data in mapping.items(): if isinstance(data, 
(pa.Array, pa.ChunkedArray)): diff --git a/src/datasets/features/features.py b/src/datasets/features/features.py index 3e329e791f7..5aa9c722267 100644 --- a/src/datasets/features/features.py +++ b/src/datasets/features/features.py @@ -1957,14 +1957,15 @@ def from_yaml_inner(obj: Union[dict, list]) -> Union[dict, list]: if _type == "large_list": _feature = from_yaml_inner(unsimplify(obj).pop(_type)) return {"feature": _feature, **obj, "_type": "LargeList"} - if _type == "sequence": - _feature = from_yaml_inner(unsimplify(obj).pop(_type)) - if isinstance(_feature, dict): + if _type == "sequence": # backward compatibility + if isinstance(obj[_type], list): + _feature = from_yaml_inner(unsimplify(obj).pop(_type)) return { name: {"feature": _subfeature, **obj, "_type": "List"} for name, _subfeature in _feature.items() } else: + _feature = from_yaml_inner(unsimplify(obj).pop(_type)) return {"feature": _feature, **obj, "_type": "List"} if _type == "list": _feature = from_yaml_inner(unsimplify(obj).pop(_type)) diff --git a/src/datasets/search.py b/src/datasets/search.py index 07ec6c93bc1..cb994d24e41 100644 --- a/src/datasets/search.py +++ b/src/datasets/search.py @@ -7,7 +7,7 @@ import fsspec import numpy as np -from .features import Sequence +from .features import List from .utils import logging from .utils import tqdm as hf_tqdm @@ -266,7 +266,7 @@ def add_vectors( """ import faiss # noqa: F811 - if column and not isinstance(vectors.features[column], Sequence): + if column and not isinstance(vectors.features[column], List): raise ValueError( f"Wrong feature type for column '{column}'. 
Expected 1d array, got {vectors.features[column]}" ) diff --git a/tests/commands/test_test.py b/tests/commands/test_test.py index f8935a8c025..9c47cdefc8d 100644 --- a/tests/commands/test_test.py +++ b/tests/commands/test_test.py @@ -3,7 +3,7 @@ import pytest -from datasets import ClassLabel, Features, Value +from datasets import ClassLabel, Features, List, Value from datasets.commands.test import TestCommand from datasets.info import DatasetInfo, DatasetInfosDict diff --git a/tests/test_inspect.py b/tests/test_inspect.py index fd227670253..9855efc3159 100644 --- a/tests/test_inspect.py +++ b/tests/test_inspect.py @@ -52,7 +52,7 @@ def test_get_dataset_config_info_raises(path, config_name, expected_exception): @pytest.mark.parametrize( "path, expected", [ - ("acronym_identification", ["default"]), + ("amirveyseh/acronym_identification", ["default"]), ("rajpurkar/squad", ["plain_text"]), ("dalle-mini/wit", ["default"]), ("hf-internal-testing/librispeech_asr_dummy", ["clean"]), @@ -69,7 +69,7 @@ def test_get_dataset_config_names(path, expected): @pytest.mark.parametrize( "path, expected", [ - ("acronym_identification", "default"), + ("amirveyseh/acronym_identification", "default"), ("rajpurkar/squad", "plain_text"), ("dalle-mini/wit", "default"), ("hf-internal-testing/librispeech_asr_dummy", "clean"), diff --git a/tests/test_upstream_hub.py b/tests/test_upstream_hub.py index b118f174264..d549a22f2b8 100644 --- a/tests/test_upstream_hub.py +++ b/tests/test_upstream_hub.py @@ -23,6 +23,7 @@ Features, Image, IterableDatasetDict, + List, Value, load_dataset, load_dataset_builder, @@ -441,7 +442,7 @@ def test_push_dataset_to_hub_custom_features_image(self, temporary_repo): def test_push_dataset_to_hub_custom_features_image_list(self, temporary_repo): image_path = os.path.join(os.path.dirname(__file__), "features", "data", "test_image_rgb.jpg") data = {"x": [[image_path], [image_path, image_path]], "y": [0, -1]} - features = Features({"x": [Image()], "y": Value("int32")}) 
+ features = Features({"x": List(Image()), "y": Value("int32")}) ds = Dataset.from_dict(data, features=features) for embed_external_files in [True, False]: From c8441c599ea2ac68d8fcaf1dd7a421c732ba3df6 Mon Sep 17 00:00:00 2001 From: Quentin Lhoest Date: Wed, 25 Jun 2025 15:17:53 +0200 Subject: [PATCH 6/9] last fix --- tests/test_table.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_table.py b/tests/test_table.py index b7bf75cc803..e5cebb793de 100644 --- a/tests/test_table.py +++ b/tests/test_table.py @@ -1358,7 +1358,7 @@ def test_embed_array_storage(image_file): def test_embed_array_storage_nested(image_file): array = pa.array([[{"bytes": None, "path": image_file}]], type=pa.list_(Image.pa_type)) - embedded_images_array = embed_array_storage(array, [Image()]) + embedded_images_array = embed_array_storage(array, List(Image())) assert isinstance(embedded_images_array.to_pylist()[0][0]["path"], str) assert isinstance(embedded_images_array.to_pylist()[0][0]["bytes"], bytes) array = pa.array([{"foo": {"bytes": None, "path": image_file}}], type=pa.struct({"foo": Image.pa_type})) From dcd1f6ee1880fa7a294f9b34af569334fd28021a Mon Sep 17 00:00:00 2001 From: Quentin Lhoest Date: Wed, 25 Jun 2025 15:20:51 +0200 Subject: [PATCH 7/9] last --- src/datasets/iterable_dataset.py | 3 ++- tests/features/test_features.py | 2 +- tests/test_upstream_hub.py | 2 +- 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/src/datasets/iterable_dataset.py b/src/datasets/iterable_dataset.py index 8031c140ef4..c70381542c3 100644 --- a/src/datasets/iterable_dataset.py +++ b/src/datasets/iterable_dataset.py @@ -34,6 +34,7 @@ from .features import Features from .features.features import ( FeatureType, + List, Value, _align_features, _check_if_features_can_be_aligned, @@ -3450,7 +3451,7 @@ def batch_fn(unbatched): return {k: [v] for k, v in unbatched.items()} if self.features: - features = Features({col: [feature] for col, feature in self.features.items()}) + 
features = Features({col: List(feature) for col, feature in self.features.items()}) else: features = None return self.map( diff --git a/tests/features/test_features.py b/tests/features/test_features.py index 1ead81f8e3d..9b0d924c631 100644 --- a/tests/features/test_features.py +++ b/tests/features/test_features.py @@ -397,7 +397,7 @@ def test_class_label_to_and_from_dict(class_label_arg, tmp_path_factory): @pytest.mark.parametrize( "schema", - [[Audio()], LargeList(Audio()), List(Audio())], + [LargeList(Audio()), List(Audio())], ) def test_decode_nested_example_with_list_types(schema, monkeypatch): mock_decode_example = MagicMock() diff --git a/tests/test_upstream_hub.py b/tests/test_upstream_hub.py index d549a22f2b8..fd35308fe1f 100644 --- a/tests/test_upstream_hub.py +++ b/tests/test_upstream_hub.py @@ -454,7 +454,7 @@ def test_push_dataset_to_hub_custom_features_image_list(self, temporary_repo): assert list(ds.features.keys()) == list(hub_ds.features.keys()) assert ds.features == hub_ds.features assert ds[:] == hub_ds[:] - hub_ds = hub_ds.cast_column("x", [Image(decode=False)]) + hub_ds = hub_ds.cast_column("x", List(Image(decode=False))) elem = hub_ds[0]["x"][0] path, bytes_ = elem["path"], elem["bytes"] assert isinstance(path, str) From c4f3c2a6681d3104a3cd43ed8212b505a5d6117d Mon Sep 17 00:00:00 2001 From: Quentin Lhoest Date: Wed, 25 Jun 2025 15:58:15 +0200 Subject: [PATCH 8/9] fix docstrings --- src/datasets/arrow_dataset.py | 18 +++++++++--------- src/datasets/builder.py | 4 ++-- src/datasets/dataset_dict.py | 26 +++++++++++++------------- src/datasets/features/features.py | 18 +++++++++--------- src/datasets/iterable_dataset.py | 18 +++++++++--------- src/datasets/load.py | 2 +- 6 files changed, 43 insertions(+), 43 deletions(-) diff --git a/src/datasets/arrow_dataset.py b/src/datasets/arrow_dataset.py index cd5b5e52d1b..e3c4f5c50cf 100644 --- a/src/datasets/arrow_dataset.py +++ b/src/datasets/arrow_dataset.py @@ -1957,14 +1957,14 @@ def 
class_encode_column(self, column: str, include_nulls: bool = False) -> "Data >>> from datasets import load_dataset >>> ds = load_dataset("boolq", split="validation") >>> ds.features - {'answer': Value(dtype='bool', id=None), - 'passage': Value(dtype='string', id=None), - 'question': Value(dtype='string', id=None)} + {'answer': Value(dtype='bool'), + 'passage': Value(dtype='string'), + 'question': Value(dtype='string')} >>> ds = ds.class_encode_column('answer') >>> ds.features {'answer': ClassLabel(num_classes=2, names=['False', 'True'], id=None), - 'passage': Value(dtype='string', id=None), - 'question': Value(dtype='string', id=None)} + 'passage': Value(dtype='string'), + 'question': Value(dtype='string')} ``` """ # Sanity checks @@ -2109,14 +2109,14 @@ def cast( >>> ds = load_dataset("cornell-movie-review-data/rotten_tomatoes", split="validation") >>> ds.features {'label': ClassLabel(names=['neg', 'pos'], id=None), - 'text': Value(dtype='string', id=None)} + 'text': Value(dtype='string')} >>> new_features = ds.features.copy() >>> new_features['label'] = ClassLabel(names=['bad', 'good']) >>> new_features['text'] = Value('large_string') >>> ds = ds.cast(new_features) >>> ds.features {'label': ClassLabel(names=['bad', 'good'], id=None), - 'text': Value(dtype='large_string', id=None)} + 'text': Value(dtype='large_string')} ``` """ if sorted(features) != sorted(self._data.column_names): @@ -2168,11 +2168,11 @@ def cast_column(self, column: str, feature: FeatureType, new_fingerprint: Option >>> ds = load_dataset("cornell-movie-review-data/rotten_tomatoes", split="validation") >>> ds.features {'label': ClassLabel(names=['neg', 'pos'], id=None), - 'text': Value(dtype='string', id=None)} + 'text': Value(dtype='string')} >>> ds = ds.cast_column('label', ClassLabel(names=['bad', 'good'])) >>> ds.features {'label': ClassLabel(names=['bad', 'good'], id=None), - 'text': Value(dtype='string', id=None)} + 'text': Value(dtype='string')} ``` """ feature = 
_fix_for_backward_compatible_features(feature) diff --git a/src/datasets/builder.py b/src/datasets/builder.py index de79c71995e..118f77a354a 100644 --- a/src/datasets/builder.py +++ b/src/datasets/builder.py @@ -513,7 +513,7 @@ def get_all_exported_dataset_infos(cls) -> DatasetInfosDict: >>> from datasets import load_dataset_builder >>> ds_builder = load_dataset_builder('vivos') >>> ds_builder.get_all_exported_dataset_infos() - {'default': DatasetInfo(description='', citation='', homepage='', license='', features={'speaker_id': Value(dtype='string', id=None), 'path': Value(dtype='string', id=None), 'audio': Audio(sampling_rate=16000, mono=True, decode=True, id=None), 'sentence': Value(dtype='string', id=None)}, post_processed=None, supervised_keys=None, builder_name=None, dataset_name=None, config_name='default', version=None, splits={'train': SplitInfo(name='train', num_bytes=1722002133, num_examples=11660, shard_lengths=None, dataset_name=None), 'test': SplitInfo(name='test', num_bytes=86120227, num_examples=760, shard_lengths=None, dataset_name=None)}, download_checksums=None, download_size=1475540500, post_processing_size=None, dataset_size=1808122360, size_in_bytes=None)} + {'default': DatasetInfo(description='', citation='', homepage='', license='', features={'speaker_id': Value(dtype='string'), 'path': Value(dtype='string'), 'audio': Audio(sampling_rate=16000, mono=True, decode=True, id=None), 'sentence': Value(dtype='string')}, post_processed=None, supervised_keys=None, builder_name=None, dataset_name=None, config_name='default', version=None, splits={'train': SplitInfo(name='train', num_bytes=1722002133, num_examples=11660, shard_lengths=None, dataset_name=None), 'test': SplitInfo(name='test', num_bytes=86120227, num_examples=760, shard_lengths=None, dataset_name=None)}, download_checksums=None, download_size=1475540500, post_processing_size=None, dataset_size=1808122360, size_in_bytes=None)} ``` """ return 
DatasetInfosDict.from_directory(cls.get_imported_module_dir()) @@ -527,7 +527,7 @@ def get_exported_dataset_info(self) -> DatasetInfo: >>> from datasets import load_dataset_builder >>> ds_builder = load_dataset_builder('cornell-movie-review-data/rotten_tomatoes') >>> ds_builder.get_exported_dataset_info() - DatasetInfo(description='', citation='', homepage='', license='', features={'speaker_id': Value(dtype='string', id=None), 'path': Value(dtype='string', id=None), 'audio': Audio(sampling_rate=16000, mono=True, decode=True, id=None), 'sentence': Value(dtype='string', id=None)}, post_processed=None, supervised_keys=None, builder_name=None, dataset_name=None, config_name='default', version=None, splits={'train': SplitInfo(name='train', num_bytes=1722002133, num_examples=11660, shard_lengths=None, dataset_name=None), 'test': SplitInfo(name='test', num_bytes=86120227, num_examples=760, shard_lengths=None, dataset_name=None)}, download_checksums=None, download_size=1475540500, post_processing_size=None, dataset_size=1808122360, size_in_bytes=None) + DatasetInfo(description='', citation='', homepage='', license='', features={'speaker_id': Value(dtype='string'), 'path': Value(dtype='string'), 'audio': Audio(sampling_rate=16000, mono=True, decode=True, id=None), 'sentence': Value(dtype='string')}, post_processed=None, supervised_keys=None, builder_name=None, dataset_name=None, config_name='default', version=None, splits={'train': SplitInfo(name='train', num_bytes=1722002133, num_examples=11660, shard_lengths=None, dataset_name=None), 'test': SplitInfo(name='test', num_bytes=86120227, num_examples=760, shard_lengths=None, dataset_name=None)}, download_checksums=None, download_size=1475540500, post_processing_size=None, dataset_size=1808122360, size_in_bytes=None) ``` """ return self.get_all_exported_dataset_infos().get(self.config.name, DatasetInfo()) diff --git a/src/datasets/dataset_dict.py b/src/datasets/dataset_dict.py index 4d79f95620e..1b523ea10da 100644 --- 
a/src/datasets/dataset_dict.py +++ b/src/datasets/dataset_dict.py @@ -290,14 +290,14 @@ def cast(self, features: Features) -> "DatasetDict": >>> ds = load_dataset("cornell-movie-review-data/rotten_tomatoes") >>> ds["train"].features {'label': ClassLabel(names=['neg', 'pos'], id=None), - 'text': Value(dtype='string', id=None)} + 'text': Value(dtype='string')} >>> new_features = ds["train"].features.copy() >>> new_features['label'] = ClassLabel(names=['bad', 'good']) >>> new_features['text'] = Value('large_string') >>> ds = ds.cast(new_features) >>> ds["train"].features {'label': ClassLabel(names=['bad', 'good'], id=None), - 'text': Value(dtype='large_string', id=None)} + 'text': Value(dtype='large_string')} ``` """ self._check_values_type() @@ -322,11 +322,11 @@ def cast_column(self, column: str, feature) -> "DatasetDict": >>> ds = load_dataset("cornell-movie-review-data/rotten_tomatoes") >>> ds["train"].features {'label': ClassLabel(names=['neg', 'pos'], id=None), - 'text': Value(dtype='string', id=None)} + 'text': Value(dtype='string')} >>> ds = ds.cast_column('label', ClassLabel(names=['bad', 'good'])) >>> ds["train"].features {'label': ClassLabel(names=['bad', 'good'], id=None), - 'text': Value(dtype='string', id=None)} + 'text': Value(dtype='string')} ``` """ self._check_values_type() @@ -513,14 +513,14 @@ def class_encode_column(self, column: str, include_nulls: bool = False) -> "Data >>> from datasets import load_dataset >>> ds = load_dataset("boolq") >>> ds["train"].features - {'answer': Value(dtype='bool', id=None), - 'passage': Value(dtype='string', id=None), - 'question': Value(dtype='string', id=None)} + {'answer': Value(dtype='bool'), + 'passage': Value(dtype='string'), + 'question': Value(dtype='string')} >>> ds = ds.class_encode_column("answer") >>> ds["train"].features {'answer': ClassLabel(num_classes=2, names=['False', 'True'], id=None), - 'passage': Value(dtype='string', id=None), - 'question': Value(dtype='string', id=None)} + 'passage': 
Value(dtype='string'), + 'question': Value(dtype='string')} ``` """ self._check_values_type() @@ -2381,11 +2381,11 @@ def cast_column(self, column: str, feature: FeatureType) -> "IterableDatasetDict >>> ds = load_dataset("cornell-movie-review-data/rotten_tomatoes", streaming=True) >>> ds["train"].features {'label': ClassLabel(names=['neg', 'pos'], id=None), - 'text': Value(dtype='string', id=None)} + 'text': Value(dtype='string')} >>> ds = ds.cast_column('label', ClassLabel(names=['bad', 'good'])) >>> ds["train"].features {'label': ClassLabel(names=['bad', 'good'], id=None), - 'text': Value(dtype='string', id=None)} + 'text': Value(dtype='string')} ``` """ return IterableDatasetDict( @@ -2417,14 +2417,14 @@ def cast( >>> ds = load_dataset("cornell-movie-review-data/rotten_tomatoes", streaming=True) >>> ds["train"].features {'label': ClassLabel(names=['neg', 'pos'], id=None), - 'text': Value(dtype='string', id=None)} + 'text': Value(dtype='string')} >>> new_features = ds["train"].features.copy() >>> new_features['label'] = ClassLabel(names=['bad', 'good']) >>> new_features['text'] = Value('large_string') >>> ds = ds.cast(new_features) >>> ds["train"].features {'label': ClassLabel(names=['bad', 'good'], id=None), - 'text': Value(dtype='large_string', id=None)} + 'text': Value(dtype='large_string')} ``` """ return IterableDatasetDict({k: dataset.cast(features=features) for k, dataset in self.items()}) diff --git a/src/datasets/features/features.py b/src/datasets/features/features.py index 5aa9c722267..20dcb8878af 100644 --- a/src/datasets/features/features.py +++ b/src/datasets/features/features.py @@ -519,7 +519,7 @@ class Value: >>> from datasets import Features >>> features = Features({'stars': Value(dtype='int32')}) >>> features - {'stars': Value(dtype='int32', id=None)} + {'stars': Value(dtype='int32')} ``` """ @@ -1834,7 +1834,7 @@ def from_dict(cls, dic) -> "Features": Example:: >>> Features.from_dict({'_type': {'dtype': 'string', 'id': None, '_type': 
'Value'}}) - {'_type': Value(dtype='string', id=None)} + {'_type': Value(dtype='string')} """ obj = generate_from_dict(dic) return cls(**obj) @@ -2132,7 +2132,7 @@ def copy(self) -> "Features": >>> copy_of_features = ds.features.copy() >>> copy_of_features {'label': ClassLabel(names=['neg', 'pos'], id=None), - 'text': Value(dtype='string', id=None)} + 'text': Value(dtype='string')} ``` """ return copy.deepcopy(self) @@ -2208,12 +2208,12 @@ def flatten(self, max_depth=16) -> "Features": >>> from datasets import load_dataset >>> ds = load_dataset("rajpurkar/squad", split="train") >>> ds.features.flatten() - {'answers.answer_start': List(feature=Value(dtype='int32', id=None), length=-1, id=None), - 'answers.text': List(feature=Value(dtype='string', id=None), length=-1, id=None), - 'context': Value(dtype='string', id=None), - 'id': Value(dtype='string', id=None), - 'question': Value(dtype='string', id=None), - 'title': Value(dtype='string', id=None)} + {'answers.answer_start': List(feature=Value(dtype='int32'), length=-1, id=None), + 'answers.text': List(feature=Value(dtype='string'), length=-1, id=None), + 'context': Value(dtype='string'), + 'id': Value(dtype='string'), + 'question': Value(dtype='string'), + 'title': Value(dtype='string')} ``` """ for depth in range(1, max_depth): diff --git a/src/datasets/iterable_dataset.py b/src/datasets/iterable_dataset.py index c70381542c3..4bd9f80d47d 100644 --- a/src/datasets/iterable_dataset.py +++ b/src/datasets/iterable_dataset.py @@ -3233,19 +3233,19 @@ def cast_column(self, column: str, feature: FeatureType) -> "IterableDataset": >>> ds = load_dataset("PolyAI/minds14", name="en-US", split="train", streaming=True) >>> ds.features {'audio': Audio(sampling_rate=8000, mono=True, decode=True, id=None), - 'english_transcription': Value(dtype='string', id=None), + 'english_transcription': Value(dtype='string'), 'intent_class': ClassLabel(num_classes=14, names=['abroad', 'address', 'app_error', 'atm_limit', 'balance', 
'business_loan', 'card_issues', 'cash_deposit', 'direct_debit', 'freeze', 'high_value_payment', 'joint_account', 'latest_transactions', 'pay_bill'], id=None), 'lang_id': ClassLabel(num_classes=14, names=['cs-CZ', 'de-DE', 'en-AU', 'en-GB', 'en-US', 'es-ES', 'fr-FR', 'it-IT', 'ko-KR', 'nl-NL', 'pl-PL', 'pt-PT', 'ru-RU', 'zh-CN'], id=None), - 'path': Value(dtype='string', id=None), - 'transcription': Value(dtype='string', id=None)} + 'path': Value(dtype='string'), + 'transcription': Value(dtype='string')} >>> ds = ds.cast_column("audio", Audio(sampling_rate=16000)) >>> ds.features {'audio': Audio(sampling_rate=16000, mono=True, decode=True, id=None), - 'english_transcription': Value(dtype='string', id=None), + 'english_transcription': Value(dtype='string'), 'intent_class': ClassLabel(num_classes=14, names=['abroad', 'address', 'app_error', 'atm_limit', 'balance', 'business_loan', 'card_issues', 'cash_deposit', 'direct_debit', 'freeze', 'high_value_payment', 'joint_account', 'latest_transactions', 'pay_bill'], id=None), 'lang_id': ClassLabel(num_classes=14, names=['cs-CZ', 'de-DE', 'en-AU', 'en-GB', 'en-US', 'es-ES', 'fr-FR', 'it-IT', 'ko-KR', 'nl-NL', 'pl-PL', 'pt-PT', 'ru-RU', 'zh-CN'], id=None), - 'path': Value(dtype='string', id=None), - 'transcription': Value(dtype='string', id=None)} + 'path': Value(dtype='string'), + 'transcription': Value(dtype='string')} ``` """ feature = _fix_for_backward_compatible_features(feature) @@ -3285,14 +3285,14 @@ def cast( >>> ds = load_dataset("cornell-movie-review-data/rotten_tomatoes", split="train", streaming=True) >>> ds.features {'label': ClassLabel(names=['neg', 'pos'], id=None), - 'text': Value(dtype='string', id=None)} + 'text': Value(dtype='string')} >>> new_features = ds.features.copy() >>> new_features["label"] = ClassLabel(names=["bad", "good"]) >>> new_features["text"] = Value("large_string") >>> ds = ds.cast(new_features) >>> ds.features {'label': ClassLabel(names=['bad', 'good'], id=None), - 'text': 
Value(dtype='large_string', id=None)} + 'text': Value(dtype='large_string')} ``` """ features = _fix_for_backward_compatible_features(features) @@ -3348,7 +3348,7 @@ def decode(self, enable: bool = True, num_threads: int = 0) -> "IterableDataset" >>> ds = ds.decode(False) >>> ds.features {'image': Image(mode=None, decode=False, id=None), - 'text': Value(dtype='string', id=None)} + 'text': Value(dtype='string')} >>> next(iter(ds)) { 'image': { diff --git a/src/datasets/load.py b/src/datasets/load.py index 5743c7d8276..8d1d49dc2ee 100644 --- a/src/datasets/load.py +++ b/src/datasets/load.py @@ -1117,7 +1117,7 @@ def load_dataset_builder( >>> ds_builder = load_dataset_builder('cornell-movie-review-data/rotten_tomatoes') >>> ds_builder.info.features {'label': ClassLabel(names=['neg', 'pos'], id=None), - 'text': Value(dtype='string', id=None)} + 'text': Value(dtype='string')} ``` """ download_mode = DownloadMode(download_mode or DownloadMode.REUSE_DATASET_IF_EXISTS) From 5b359cf03370f3517c69dcfa1cfcbdfef37cb314 Mon Sep 17 00:00:00 2001 From: Quentin Lhoest Date: Wed, 25 Jun 2025 15:58:47 +0200 Subject: [PATCH 9/9] again --- src/datasets/arrow_dataset.py | 10 +++++----- src/datasets/dataset_dict.py | 18 +++++++++--------- src/datasets/features/features.py | 4 ++-- src/datasets/iterable_dataset.py | 12 ++++++------ src/datasets/load.py | 2 +- 5 files changed, 23 insertions(+), 23 deletions(-) diff --git a/src/datasets/arrow_dataset.py b/src/datasets/arrow_dataset.py index e3c4f5c50cf..cfdf6f2a444 100644 --- a/src/datasets/arrow_dataset.py +++ b/src/datasets/arrow_dataset.py @@ -1962,7 +1962,7 @@ def class_encode_column(self, column: str, include_nulls: bool = False) -> "Data 'question': Value(dtype='string')} >>> ds = ds.class_encode_column('answer') >>> ds.features - {'answer': ClassLabel(num_classes=2, names=['False', 'True'], id=None), + {'answer': ClassLabel(num_classes=2, names=['False', 'True']), 'passage': Value(dtype='string'), 'question': Value(dtype='string')} 
``` @@ -2108,14 +2108,14 @@ def cast( >>> from datasets import load_dataset, ClassLabel, Value >>> ds = load_dataset("cornell-movie-review-data/rotten_tomatoes", split="validation") >>> ds.features - {'label': ClassLabel(names=['neg', 'pos'], id=None), + {'label': ClassLabel(names=['neg', 'pos']), 'text': Value(dtype='string')} >>> new_features = ds.features.copy() >>> new_features['label'] = ClassLabel(names=['bad', 'good']) >>> new_features['text'] = Value('large_string') >>> ds = ds.cast(new_features) >>> ds.features - {'label': ClassLabel(names=['bad', 'good'], id=None), + {'label': ClassLabel(names=['bad', 'good']), 'text': Value(dtype='large_string')} ``` """ @@ -2167,11 +2167,11 @@ def cast_column(self, column: str, feature: FeatureType, new_fingerprint: Option >>> from datasets import load_dataset, ClassLabel >>> ds = load_dataset("cornell-movie-review-data/rotten_tomatoes", split="validation") >>> ds.features - {'label': ClassLabel(names=['neg', 'pos'], id=None), + {'label': ClassLabel(names=['neg', 'pos']), 'text': Value(dtype='string')} >>> ds = ds.cast_column('label', ClassLabel(names=['bad', 'good'])) >>> ds.features - {'label': ClassLabel(names=['bad', 'good'], id=None), + {'label': ClassLabel(names=['bad', 'good']), 'text': Value(dtype='string')} ``` """ diff --git a/src/datasets/dataset_dict.py b/src/datasets/dataset_dict.py index 1b523ea10da..9d8c67ae0e2 100644 --- a/src/datasets/dataset_dict.py +++ b/src/datasets/dataset_dict.py @@ -289,14 +289,14 @@ def cast(self, features: Features) -> "DatasetDict": >>> from datasets import load_dataset, ClassLabel, Value >>> ds = load_dataset("cornell-movie-review-data/rotten_tomatoes") >>> ds["train"].features - {'label': ClassLabel(names=['neg', 'pos'], id=None), + {'label': ClassLabel(names=['neg', 'pos']), 'text': Value(dtype='string')} >>> new_features = ds["train"].features.copy() >>> new_features['label'] = ClassLabel(names=['bad', 'good']) >>> new_features['text'] = Value('large_string') >>> ds = 
ds.cast(new_features) >>> ds["train"].features - {'label': ClassLabel(names=['bad', 'good'], id=None), + {'label': ClassLabel(names=['bad', 'good']), 'text': Value(dtype='large_string')} ``` """ @@ -321,11 +321,11 @@ def cast_column(self, column: str, feature) -> "DatasetDict": >>> from datasets import load_dataset, ClassLabel >>> ds = load_dataset("cornell-movie-review-data/rotten_tomatoes") >>> ds["train"].features - {'label': ClassLabel(names=['neg', 'pos'], id=None), + {'label': ClassLabel(names=['neg', 'pos']), 'text': Value(dtype='string')} >>> ds = ds.cast_column('label', ClassLabel(names=['bad', 'good'])) >>> ds["train"].features - {'label': ClassLabel(names=['bad', 'good'], id=None), + {'label': ClassLabel(names=['bad', 'good']), 'text': Value(dtype='string')} ``` """ @@ -518,7 +518,7 @@ def class_encode_column(self, column: str, include_nulls: bool = False) -> "Data 'question': Value(dtype='string')} >>> ds = ds.class_encode_column("answer") >>> ds["train"].features - {'answer': ClassLabel(num_classes=2, names=['False', 'True'], id=None), + {'answer': ClassLabel(num_classes=2, names=['False', 'True']), 'passage': Value(dtype='string'), 'question': Value(dtype='string')} ``` @@ -2380,11 +2380,11 @@ def cast_column(self, column: str, feature: FeatureType) -> "IterableDatasetDict >>> from datasets import load_dataset, ClassLabel >>> ds = load_dataset("cornell-movie-review-data/rotten_tomatoes", streaming=True) >>> ds["train"].features - {'label': ClassLabel(names=['neg', 'pos'], id=None), + {'label': ClassLabel(names=['neg', 'pos']), 'text': Value(dtype='string')} >>> ds = ds.cast_column('label', ClassLabel(names=['bad', 'good'])) >>> ds["train"].features - {'label': ClassLabel(names=['bad', 'good'], id=None), + {'label': ClassLabel(names=['bad', 'good']), 'text': Value(dtype='string')} ``` """ @@ -2416,14 +2416,14 @@ def cast( >>> from datasets import load_dataset >>> ds = load_dataset("cornell-movie-review-data/rotten_tomatoes", streaming=True) >>> 
ds["train"].features - {'label': ClassLabel(names=['neg', 'pos'], id=None), + {'label': ClassLabel(names=['neg', 'pos']), 'text': Value(dtype='string')} >>> new_features = ds["train"].features.copy() >>> new_features['label'] = ClassLabel(names=['bad', 'good']) >>> new_features['text'] = Value('large_string') >>> ds = ds.cast(new_features) >>> ds["train"].features - {'label': ClassLabel(names=['bad', 'good'], id=None), + {'label': ClassLabel(names=['bad', 'good']), 'text': Value(dtype='large_string')} ``` """ diff --git a/src/datasets/features/features.py b/src/datasets/features/features.py index 20dcb8878af..676521e6990 100644 --- a/src/datasets/features/features.py +++ b/src/datasets/features/features.py @@ -987,7 +987,7 @@ class ClassLabel: >>> from datasets import Features, ClassLabel >>> features = Features({'label': ClassLabel(num_classes=3, names=['bad', 'ok', 'good'])}) >>> features - {'label': ClassLabel(names=['bad', 'ok', 'good'], id=None)} + {'label': ClassLabel(names=['bad', 'ok', 'good'])} ``` """ @@ -2131,7 +2131,7 @@ def copy(self) -> "Features": >>> ds = load_dataset("cornell-movie-review-data/rotten_tomatoes", split="train") >>> copy_of_features = ds.features.copy() >>> copy_of_features - {'label': ClassLabel(names=['neg', 'pos'], id=None), + {'label': ClassLabel(names=['neg', 'pos']), 'text': Value(dtype='string')} ``` """ diff --git a/src/datasets/iterable_dataset.py b/src/datasets/iterable_dataset.py index 4bd9f80d47d..a5797bfa54c 100644 --- a/src/datasets/iterable_dataset.py +++ b/src/datasets/iterable_dataset.py @@ -3234,16 +3234,16 @@ def cast_column(self, column: str, feature: FeatureType) -> "IterableDataset": >>> ds.features {'audio': Audio(sampling_rate=8000, mono=True, decode=True, id=None), 'english_transcription': Value(dtype='string'), - 'intent_class': ClassLabel(num_classes=14, names=['abroad', 'address', 'app_error', 'atm_limit', 'balance', 'business_loan', 'card_issues', 'cash_deposit', 'direct_debit', 'freeze', 
'high_value_payment', 'joint_account', 'latest_transactions', 'pay_bill'], id=None), - 'lang_id': ClassLabel(num_classes=14, names=['cs-CZ', 'de-DE', 'en-AU', 'en-GB', 'en-US', 'es-ES', 'fr-FR', 'it-IT', 'ko-KR', 'nl-NL', 'pl-PL', 'pt-PT', 'ru-RU', 'zh-CN'], id=None), + 'intent_class': ClassLabel(num_classes=14, names=['abroad', 'address', 'app_error', 'atm_limit', 'balance', 'business_loan', 'card_issues', 'cash_deposit', 'direct_debit', 'freeze', 'high_value_payment', 'joint_account', 'latest_transactions', 'pay_bill']), + 'lang_id': ClassLabel(num_classes=14, names=['cs-CZ', 'de-DE', 'en-AU', 'en-GB', 'en-US', 'es-ES', 'fr-FR', 'it-IT', 'ko-KR', 'nl-NL', 'pl-PL', 'pt-PT', 'ru-RU', 'zh-CN']), 'path': Value(dtype='string'), 'transcription': Value(dtype='string')} >>> ds = ds.cast_column("audio", Audio(sampling_rate=16000)) >>> ds.features {'audio': Audio(sampling_rate=16000, mono=True, decode=True, id=None), 'english_transcription': Value(dtype='string'), - 'intent_class': ClassLabel(num_classes=14, names=['abroad', 'address', 'app_error', 'atm_limit', 'balance', 'business_loan', 'card_issues', 'cash_deposit', 'direct_debit', 'freeze', 'high_value_payment', 'joint_account', 'latest_transactions', 'pay_bill'], id=None), - 'lang_id': ClassLabel(num_classes=14, names=['cs-CZ', 'de-DE', 'en-AU', 'en-GB', 'en-US', 'es-ES', 'fr-FR', 'it-IT', 'ko-KR', 'nl-NL', 'pl-PL', 'pt-PT', 'ru-RU', 'zh-CN'], id=None), + 'intent_class': ClassLabel(num_classes=14, names=['abroad', 'address', 'app_error', 'atm_limit', 'balance', 'business_loan', 'card_issues', 'cash_deposit', 'direct_debit', 'freeze', 'high_value_payment', 'joint_account', 'latest_transactions', 'pay_bill']), + 'lang_id': ClassLabel(num_classes=14, names=['cs-CZ', 'de-DE', 'en-AU', 'en-GB', 'en-US', 'es-ES', 'fr-FR', 'it-IT', 'ko-KR', 'nl-NL', 'pl-PL', 'pt-PT', 'ru-RU', 'zh-CN']), 'path': Value(dtype='string'), 'transcription': Value(dtype='string')} ``` @@ -3284,14 +3284,14 @@ def cast( >>> from datasets import 
load_dataset, ClassLabel, Value >>> ds = load_dataset("cornell-movie-review-data/rotten_tomatoes", split="train", streaming=True) >>> ds.features - {'label': ClassLabel(names=['neg', 'pos'], id=None), + {'label': ClassLabel(names=['neg', 'pos']), 'text': Value(dtype='string')} >>> new_features = ds.features.copy() >>> new_features["label"] = ClassLabel(names=["bad", "good"]) >>> new_features["text"] = Value("large_string") >>> ds = ds.cast(new_features) >>> ds.features - {'label': ClassLabel(names=['bad', 'good'], id=None), + {'label': ClassLabel(names=['bad', 'good']), 'text': Value(dtype='large_string')} ``` """ diff --git a/src/datasets/load.py b/src/datasets/load.py index 8d1d49dc2ee..c540e511473 100644 --- a/src/datasets/load.py +++ b/src/datasets/load.py @@ -1116,7 +1116,7 @@ def load_dataset_builder( >>> from datasets import load_dataset_builder >>> ds_builder = load_dataset_builder('cornell-movie-review-data/rotten_tomatoes') >>> ds_builder.info.features - {'label': ClassLabel(names=['neg', 'pos'], id=None), + {'label': ClassLabel(names=['neg', 'pos']), 'text': Value(dtype='string')} ``` """