From 860f4e030d15615fb36226fdaf20fc8f50d08522 Mon Sep 17 00:00:00 2001
From: Quentin Lhoest
Date: Fri, 28 May 2021 16:28:26 +0200
Subject: [PATCH 1/2] fix save_to_disk nested features order in dataset_info.json

---
 src/datasets/arrow_dataset.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/src/datasets/arrow_dataset.py b/src/datasets/arrow_dataset.py
index 0ea5f0ba42d..610ae6bde40 100644
--- a/src/datasets/arrow_dataset.py
+++ b/src/datasets/arrow_dataset.py
@@ -638,7 +638,9 @@ def save_to_disk(self, dataset_path: str, fs=None):
         with fs.open(
             Path(dataset_path, config.DATASET_INFO_FILENAME).as_posix(), "w", encoding="utf-8"
         ) as dataset_info_file:
-            json.dump(dataset_info, dataset_info_file, indent=2, sort_keys=True)
+            # Sort only the first level of keys, or we might shuffle fields of nested features if we use sort_keys=True
+            sorted_keys_dataset_info = {key: dataset_info[key] for key in sorted(dataset_info)}
+            json.dump(sorted_keys_dataset_info, dataset_info_file, indent=2)
         logger.info("Dataset saved in {}".format(dataset_path))

     @staticmethod
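The fix works because json.dump(..., sort_keys=True) sorts keys recursively, at every nesting level, which is exactly what shuffled the fields of nested features; sorting only the first level keeps dataset_info.json diff-stable without touching feature order. A minimal, self-contained sketch of the difference (the dataset_info dict below is a simplified stand-in, not the real DatasetInfo payload):

    import json

    # Stand-in for the dict written to dataset_info.json: the nested
    # "features" entry declares its fields in a meaningful order ("a", "x", "c").
    dataset_info = {
        "features": {"nested": {"a": "int64", "x": "int64", "c": "int64"}},
        "builder_name": "dummy",
    }

    # sort_keys=True sorts every level, so the nested fields come back
    # as "a", "c", "x" and no longer match the dataset's schema order:
    print(json.dumps(dataset_info, sort_keys=True))

    # The patch sorts only the top-level keys and leaves nested dicts intact:
    sorted_keys_dataset_info = {key: dataset_info[key] for key in sorted(dataset_info)}
    print(json.dumps(sorted_keys_dataset_info, indent=2))
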
From 09c040eeb918e04476c3d250ff1376d8e2a3f323 Mon Sep 17 00:00:00 2001
From: Quentin Lhoest
Date: Fri, 28 May 2021 17:03:14 +0200
Subject: [PATCH 2/2] add test

---
 tests/test_arrow_dataset.py | 55 ++++++++++++++++++++++++-------------
 1 file changed, 36 insertions(+), 19 deletions(-)

diff --git a/tests/test_arrow_dataset.py b/tests/test_arrow_dataset.py
index 0bd295dc9d3..b55fa52fcd5 100644
--- a/tests/test_arrow_dataset.py
+++ b/tests/test_arrow_dataset.py
@@ -83,25 +83,29 @@ def inject_fixtures(self, caplog):
         self._caplog = caplog

     def _create_dummy_dataset(
-        self, in_memory: bool, tmp_dir: str, multiple_columns=False, array_features=False
+        self, in_memory: bool, tmp_dir: str, multiple_columns=False, array_features=False, nested_features=False
     ) -> Dataset:
+        assert int(multiple_columns) + int(array_features) + int(nested_features) < 2
         if multiple_columns:
-            if array_features:
-                data = {
-                    "col_1": [[[True, False], [False, True]]] * 4,  # 2D
-                    "col_2": [[[["a", "b"], ["c", "d"]], [["e", "f"], ["g", "h"]]]] * 4,  # 3D array
-                    "col_3": [[3, 2, 1, 0]] * 4,  # Sequence
+            data = {"col_1": [3, 2, 1, 0], "col_2": ["a", "b", "c", "d"], "col_3": [False, True, False, True]}
+            dset = Dataset.from_dict(data)
+        elif array_features:
+            data = {
+                "col_1": [[[True, False], [False, True]]] * 4,  # 2D
+                "col_2": [[[["a", "b"], ["c", "d"]], [["e", "f"], ["g", "h"]]]] * 4,  # 3D array
+                "col_3": [[3, 2, 1, 0]] * 4,  # Sequence
+            }
+            features = Features(
+                {
+                    "col_1": Array2D(shape=(2, 2), dtype="bool"),
+                    "col_2": Array3D(shape=(2, 2, 2), dtype="string"),
+                    "col_3": Sequence(feature=Value("int64")),
                 }
-                features = Features(
-                    {
-                        "col_1": Array2D(shape=(2, 2), dtype="bool"),
-                        "col_2": Array3D(shape=(2, 2, 2), dtype="string"),
-                        "col_3": Sequence(feature=Value("int64")),
-                    }
-                )
-            else:
-                data = {"col_1": [3, 2, 1, 0], "col_2": ["a", "b", "c", "d"], "col_3": [False, True, False, True]}
-                features = None
+            )
+            dset = Dataset.from_dict(data, features=features)
+        elif nested_features:
+            data = {"nested": [{"a": i, "x": i * 10, "c": i * 100} for i in range(1, 11)]}
+            features = Features({"nested": {"a": Value("int64"), "x": Value("int64"), "c": Value("int64")}})
             dset = Dataset.from_dict(data, features=features)
         else:
             dset = Dataset.from_dict({"filename": ["my_name-train" + "_" + str(x) for x in np.arange(30).tolist()]})
@@ -139,7 +143,7 @@ def test_dummy_dataset(self, in_memory):
             self.assertEqual(dset["col_1"][0], 3)

         with tempfile.TemporaryDirectory() as tmp_dir:
-            with self._create_dummy_dataset(in_memory, tmp_dir, multiple_columns=True, array_features=True) as dset:
+            with self._create_dummy_dataset(in_memory, tmp_dir, array_features=True) as dset:
                 self.assertDictEqual(
                     dset.features,
                     Features(
@@ -249,6 +253,19 @@ def test_dummy_dataset_serialize(self, in_memory):
                 self.assertEqual(dset[0]["filename"], "my_name-train_0")
                 self.assertEqual(dset["filename"][0], "my_name-train_0")

+            with self._create_dummy_dataset(in_memory, tmp_dir, nested_features=True) as dset:
+                with assert_arrow_memory_doesnt_increase():
+                    dset.save_to_disk(dataset_path)
+
+            with Dataset.load_from_disk(dataset_path) as dset:
+                self.assertEqual(len(dset), 10)
+                self.assertDictEqual(
+                    dset.features,
+                    Features({"nested": {"a": Value("int64"), "x": Value("int64"), "c": Value("int64")}}),
+                )
+                self.assertDictEqual(dset[0]["nested"], {"a": 1, "c": 100, "x": 10})
+                self.assertDictEqual(dset["nested"][0], {"a": 1, "c": 100, "x": 10})
+
     def test_dummy_dataset_load_from_disk(self, in_memory):

         with tempfile.TemporaryDirectory() as tmp_dir:
@@ -453,7 +470,7 @@ def test_class_encode_column(self, in_memory):
             assert_arrow_metadata_are_synced_with_dataset_features(casted_dset)

         # Test raises if feature is an array / sequence
-        with self._create_dummy_dataset(in_memory, tmp_dir, multiple_columns=True, array_features=True) as dset:
+        with self._create_dummy_dataset(in_memory, tmp_dir, array_features=True) as dset:
             for column in dset.column_names:
                 with self.assertRaises(ValueError):
                     dset.class_encode_column(column)
@@ -1508,7 +1525,7 @@ def test_to_csv(self, in_memory):
             self.assertListEqual(list(csv_dset.columns), list(dset.column_names))

         # With array features
-        with self._create_dummy_dataset(in_memory, tmp_dir, multiple_columns=True, array_features=True) as dset:
+        with self._create_dummy_dataset(in_memory, tmp_dir, array_features=True) as dset:
             file_path = os.path.join(tmp_dir, "test_path.csv")
             bytes_written = dset.to_csv(path_or_buf=file_path)
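For reference, the behavior the new test locks in, as a minimal sketch against the public datasets API (same data and feature order as the test above; assumes a datasets version that includes this patch):

    import tempfile

    from datasets import Dataset, Features, Value

    # Nested fields declared in deliberately non-alphabetical order: "a", "x", "c".
    features = Features({"nested": {"a": Value("int64"), "x": Value("int64"), "c": Value("int64")}})
    data = {"nested": [{"a": i, "x": i * 10, "c": i * 100} for i in range(1, 11)]}
    dset = Dataset.from_dict(data, features=features)

    with tempfile.TemporaryDirectory() as tmp_dir:
        dset.save_to_disk(tmp_dir)
        reloaded = Dataset.load_from_disk(tmp_dir)
        # Before the fix, dataset_info.json stored the nested fields
        # alphabetically ("a", "c", "x"), so the reloaded features could
        # disagree with the declared order; now the round trip preserves it.
        assert reloaded.features == features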