From e7aa4a0c6018b65aeb3ac67faa14a732d18ec8ae Mon Sep 17 00:00:00 2001 From: Albert Villanova del Moral <8515462+albertvillanova@users.noreply.github.com> Date: Mon, 7 Jun 2021 18:16:37 +0200 Subject: [PATCH 01/12] Test Dataset from JSON with unsorted column names --- tests/conftest.py | 14 ++++++++++++++ tests/io/test_json.py | 24 +++++++++++++++++++++++- 2 files changed, 37 insertions(+), 1 deletion(-) diff --git a/tests/conftest.py b/tests/conftest.py index 4f110012a69..a4b2e4e6669 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -130,6 +130,11 @@ def xml_file(tmp_path_factory): "col_3": [0.0, 1.0, 2.0, 3.0], } +DATA_312 = [ + {"col_3": 0.0, "col_1": "0", "col_2": 0}, + {"col_3": 1.0, "col_1": "1", "col_2": 1}, +] + @pytest.fixture(scope="session") def dataset_dict(): @@ -182,6 +187,15 @@ def jsonl_path(tmp_path_factory): return path +@pytest.fixture(scope="session") +def jsonl_312_path(tmp_path_factory): + path = str(tmp_path_factory.mktemp("data") / "dataset_312.jsonl") + with open(path, "w") as f: + for item in DATA_312: + f.write(json.dumps(item)) + return path + + @pytest.fixture(scope="session") def text_path(tmp_path_factory): data = ["0", "1", "2", "3"] diff --git a/tests/io/test_json.py b/tests/io/test_json.py index 024c98da025..5b347d943f8 100644 --- a/tests/io/test_json.py +++ b/tests/io/test_json.py @@ -39,7 +39,6 @@ def test_dataset_from_json_keep_in_memory(keep_in_memory, jsonl_path, tmp_path): ) def test_dataset_from_json_features(features, jsonl_path, tmp_path): cache_dir = tmp_path / "cache" - # CSV file loses col_1 string dtype information: default now is "int64" instead of "string" default_expected_features = {"col_1": "string", "col_2": "int64", "col_3": "float64"} expected_features = features.copy() if features else default_expected_features features = ( @@ -49,6 +48,29 @@ def test_dataset_from_json_features(features, jsonl_path, tmp_path): _check_json_dataset(dataset, expected_features) +@pytest.mark.parametrize( + "features", + [ + None, + {"col_3": "float64", "col_1": "string", "col_2": "int64"}, + ], +) +def test_dataset_from_json_with_unsorted_column_names(features, jsonl_312_path, tmp_path): + cache_dir = tmp_path / "cache" + default_expected_features = {"col_3": "float64", "col_1": "string", "col_2": "int64"} + expected_features = features.copy() if features else default_expected_features + features = ( + Features({feature: Value(dtype) for feature, dtype in features.items()}) if features is not None else None + ) + dataset = JsonDatasetReader(jsonl_312_path, features=features, cache_dir=cache_dir).read() + assert isinstance(dataset, Dataset) + assert dataset.num_rows == 2 + assert dataset.num_columns == 3 + assert dataset.column_names == ["col_3", "col_1", "col_2"] + for feature, expected_dtype in expected_features.items(): + assert dataset.features[feature].dtype == expected_dtype + + @pytest.mark.parametrize("split", [None, NamedSplit("train"), "train", "test"]) def test_dataset_from_json_split(split, jsonl_path, tmp_path): cache_dir = tmp_path / "cache" From 03bdf6069ff92a8501f37169f6003aacef9d1219 Mon Sep 17 00:00:00 2001 From: Albert Villanova del Moral <8515462+albertvillanova@users.noreply.github.com> Date: Mon, 7 Jun 2021 18:21:58 +0200 Subject: [PATCH 02/12] Fix Features type order --- src/datasets/features.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/datasets/features.py b/src/datasets/features.py index bb18cd80d96..581b09a8826 100644 --- a/src/datasets/features.py +++ b/src/datasets/features.py @@ -817,8 
+817,8 @@ def get_nested_type(schema: FeatureType) -> pa.DataType: # Nested structures: we allow dict, list/tuples, sequences if isinstance(schema, Features): return pa.struct( - {key: get_nested_type(schema[key]) for key in sorted(schema)} - ) # sort to make the order of columns deterministic + {key: get_nested_type(schema[key]) for key in schema} + ) # Features is subclass of dict, and dict order is deterministic since Python 3.7 elif isinstance(schema, dict): return pa.struct( {key: get_nested_type(schema[key]) for key in schema} From 646bf9de93a3320118332892c23e571a34f26388 Mon Sep 17 00:00:00 2001 From: Albert Villanova del Moral <8515462+albertvillanova@users.noreply.github.com> Date: Mon, 7 Jun 2021 19:30:06 +0200 Subject: [PATCH 03/12] Fix Dataset.add_item --- src/datasets/arrow_dataset.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/datasets/arrow_dataset.py b/src/datasets/arrow_dataset.py index a78e97b5467..f00d392d79c 100644 --- a/src/datasets/arrow_dataset.py +++ b/src/datasets/arrow_dataset.py @@ -3077,7 +3077,7 @@ def add_item(self, item: dict, new_fingerprint: str): Returns: :class:`Dataset` """ - item_table = InMemoryTable.from_pydict({k: [v] for k, v in item.items()}) + item_table = InMemoryTable.from_pydict({k: [item[k]] for k in self.features.keys() if k in item}) # Cast item schema = pa.schema(self.features.type) item_table = item_table.cast(schema) From 15356ed1bcd51f9984b9a238f8b4389598693b91 Mon Sep 17 00:00:00 2001 From: Albert Villanova del Moral <8515462+albertvillanova@users.noreply.github.com> Date: Mon, 7 Jun 2021 20:10:42 +0200 Subject: [PATCH 04/12] Fix ArrowWriter so that column order is not changed --- src/datasets/arrow_writer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/datasets/arrow_writer.py b/src/datasets/arrow_writer.py index 24f860baee3..e4be7b4a398 100644 --- a/src/datasets/arrow_writer.py +++ b/src/datasets/arrow_writer.py @@ -272,7 +272,7 @@ def write_examples_on_file(self): return # Since current_examples contains (example, key) tuples - cols = sorted(self.current_examples[0][0].keys()) + cols = self.current_examples[0][0].keys() schema = None if self.pa_writer is None and self.update_features else self._schema try_schema = self._schema if self.pa_writer is None and self.update_features else None From e29d8a812e021e5d5c78f7a8c0efc542da92efb5 Mon Sep 17 00:00:00 2001 From: Albert Villanova del Moral <8515462+albertvillanova@users.noreply.github.com> Date: Mon, 7 Jun 2021 20:11:03 +0200 Subject: [PATCH 05/12] Fix minor typo --- src/datasets/features.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/datasets/features.py b/src/datasets/features.py index 581b09a8826..ded1b656dd2 100644 --- a/src/datasets/features.py +++ b/src/datasets/features.py @@ -829,7 +829,7 @@ def get_nested_type(schema: FeatureType) -> pa.DataType: return pa.list_(value_type) elif isinstance(schema, Sequence): value_type = get_nested_type(schema.feature) - # We allow to reverse list of dict => dict of list for compatiblity with tfds + # We allow to reverse list of dict => dict of list for compatibility with tfds if isinstance(value_type, pa.StructType): return pa.struct({f.name: pa.list_(f.type, schema.length) for f in value_type}) return pa.list_(value_type, schema.length) From 947eb1f7a2d3b298e819392305684c8176e6ca7f Mon Sep 17 00:00:00 2001 From: Albert Villanova del Moral <8515462+albertvillanova@users.noreply.github.com> Date: Mon, 7 Jun 2021 20:22:20 +0200 Subject: [PATCH 06/12] Fix 
test_dataset_add_column --- tests/test_arrow_dataset.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/tests/test_arrow_dataset.py b/tests/test_arrow_dataset.py index 9b36a5c2da2..2758ad25e5b 100644 --- a/tests/test_arrow_dataset.py +++ b/tests/test_arrow_dataset.py @@ -2393,7 +2393,11 @@ def test_dataset_add_column(column, expected_dtype, in_memory, transform, datase original_dataset: Dataset = getattr(original_dataset, transform_name)(*args, **kwargs) dataset = original_dataset.add_column(column_name, column) assert dataset.data.shape == (4, 4) - expected_features = {"col_1": "string", "col_2": "int64", "col_3": "float64", column_name: expected_dtype} + expected_features = {"col_1": "string", "col_2": "int64", "col_3": "float64"} + # Sort expected features as in the original dataset + expected_features = {feature: expected_features[feature] for feature in original_dataset.features} + # Add new column feature + expected_features[column_name] = expected_dtype assert dataset.data.column_names == list(expected_features.keys()) for feature, expected_dtype in expected_features.items(): assert dataset.features[feature].dtype == expected_dtype From 7325f321c9df923b2c0b522415700cc9faca9f07 Mon Sep 17 00:00:00 2001 From: Quentin Lhoest <42851186+lhoestq@users.noreply.github.com> Date: Tue, 8 Jun 2021 15:48:05 +0200 Subject: [PATCH 07/12] use the order of the schema to order the arrays used to build the table --- src/datasets/arrow_writer.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/datasets/arrow_writer.py b/src/datasets/arrow_writer.py index e4be7b4a398..6501ca50dd9 100644 --- a/src/datasets/arrow_writer.py +++ b/src/datasets/arrow_writer.py @@ -276,7 +276,7 @@ def write_examples_on_file(self): schema = None if self.pa_writer is None and self.update_features else self._schema try_schema = self._schema if self.pa_writer is None and self.update_features else None - arrays = [] + arrays = {} inferred_types = [] for col in cols: col_type = schema.field(col).type if schema is not None else None @@ -293,9 +293,10 @@ def write_examples_on_file(self): type(pa_array) ) ) - arrays.append(pa_array) + arrays[col] = pa_array inferred_types.append(inferred_type) - schema = pa.schema(zip(cols, inferred_types)) if self.pa_writer is None else self._schema + schema = pa.schema(sorted(zip(cols, inferred_types))) if self.pa_writer is None else self._schema + arrays = [arrays[col] for col in schema.names] table = pa.Table.from_arrays(arrays, schema=schema) self.write_table(table) self.current_examples = [] From cdad6bb5041d3fa7ae2bcc8956b56ccab2f88874 Mon Sep 17 00:00:00 2001 From: Albert Villanova del Moral <8515462+albertvillanova@users.noreply.github.com> Date: Tue, 8 Jun 2021 19:52:35 +0200 Subject: [PATCH 08/12] Revert "use the order of the schema to order the arrays used to build the table" This reverts commit 7325f321c9df923b2c0b522415700cc9faca9f07. 
--- src/datasets/arrow_writer.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/src/datasets/arrow_writer.py b/src/datasets/arrow_writer.py index 6501ca50dd9..e4be7b4a398 100644 --- a/src/datasets/arrow_writer.py +++ b/src/datasets/arrow_writer.py @@ -276,7 +276,7 @@ def write_examples_on_file(self): schema = None if self.pa_writer is None and self.update_features else self._schema try_schema = self._schema if self.pa_writer is None and self.update_features else None - arrays = {} + arrays = [] inferred_types = [] for col in cols: col_type = schema.field(col).type if schema is not None else None @@ -293,10 +293,9 @@ def write_examples_on_file(self): type(pa_array) ) ) - arrays[col] = pa_array + arrays.append(pa_array) inferred_types.append(inferred_type) - schema = pa.schema(sorted(zip(cols, inferred_types))) if self.pa_writer is None else self._schema - arrays = [arrays[col] for col in schema.names] + schema = pa.schema(zip(cols, inferred_types)) if self.pa_writer is None else self._schema table = pa.Table.from_arrays(arrays, schema=schema) self.write_table(table) self.current_examples = [] From 9556a6aead456d98a6272b9e6db069970edb9303 Mon Sep 17 00:00:00 2001 From: Albert Villanova del Moral <8515462+albertvillanova@users.noreply.github.com> Date: Wed, 9 Jun 2021 14:05:28 +0200 Subject: [PATCH 09/12] Preserve schema names order in ArrowWriter --- src/datasets/arrow_writer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/datasets/arrow_writer.py b/src/datasets/arrow_writer.py index e4be7b4a398..040c409cfad 100644 --- a/src/datasets/arrow_writer.py +++ b/src/datasets/arrow_writer.py @@ -272,7 +272,7 @@ def write_examples_on_file(self): return # Since current_examples contains (example, key) tuples - cols = self.current_examples[0][0].keys() + cols = [col for col in self._schema.names if col in self.current_examples[0][0]] schema = None if self.pa_writer is None and self.update_features else self._schema try_schema = self._schema if self.pa_writer is None and self.update_features else None From 07bdba14f328d83da9530db4978b82d55cdf928f Mon Sep 17 00:00:00 2001 From: Albert Villanova del Moral <8515462+albertvillanova@users.noreply.github.com> Date: Wed, 9 Jun 2021 14:38:07 +0200 Subject: [PATCH 10/12] Fix cols when no schema --- src/datasets/arrow_writer.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/datasets/arrow_writer.py b/src/datasets/arrow_writer.py index 040c409cfad..ae01854f141 100644 --- a/src/datasets/arrow_writer.py +++ b/src/datasets/arrow_writer.py @@ -272,7 +272,11 @@ def write_examples_on_file(self): return # Since current_examples contains (example, key) tuples - cols = [col for col in self._schema.names if col in self.current_examples[0][0]] + cols = ( + [col for col in self._schema.names if col in self.current_examples[0][0]] + if self._schema + else self.current_examples[0][0].keys() + ) schema = None if self.pa_writer is None and self.update_features else self._schema try_schema = self._schema if self.pa_writer is None and self.update_features else None From 9c2b253eed7c00101f9df3052cdd1d27e458c7cf Mon Sep 17 00:00:00 2001 From: Albert Villanova del Moral <8515462+albertvillanova@users.noreply.github.com> Date: Tue, 15 Jun 2021 11:32:41 +0200 Subject: [PATCH 11/12] Fix ArrowWriter so that new columns not yet in schema are also written --- src/datasets/arrow_writer.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/datasets/arrow_writer.py b/src/datasets/arrow_writer.py index 
ae01854f141..3cbff589c5c 100644 --- a/src/datasets/arrow_writer.py +++ b/src/datasets/arrow_writer.py @@ -274,6 +274,7 @@ def write_examples_on_file(self): # Since current_examples contains (example, key) tuples cols = ( [col for col in self._schema.names if col in self.current_examples[0][0]] + + [col for col in self.current_examples[0][0].keys() if col not in self._schema.names] if self._schema else self.current_examples[0][0].keys() ) From 3acf0e0a8c75f801a8539602760c5faadb512ce6 Mon Sep 17 00:00:00 2001 From: Albert Villanova del Moral <8515462+albertvillanova@users.noreply.github.com> Date: Tue, 15 Jun 2021 16:13:48 +0200 Subject: [PATCH 12/12] Address suggested minor fix --- src/datasets/features.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/datasets/features.py b/src/datasets/features.py index ded1b656dd2..cd3e7d6634b 100644 --- a/src/datasets/features.py +++ b/src/datasets/features.py @@ -818,7 +818,7 @@ def get_nested_type(schema: FeatureType) -> pa.DataType: if isinstance(schema, Features): return pa.struct( {key: get_nested_type(schema[key]) for key in schema} - ) # Features is subclass of dict, and dict order is deterministic since Python 3.7 + ) # Features is subclass of dict, and dict order is deterministic since Python 3.6 elif isinstance(schema, dict): return pa.struct( {key: get_nested_type(schema[key]) for key in schema}
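
Taken together, these patches make `datasets` keep the insertion order of columns (relying on `dict` ordering) instead of sorting them alphabetically. As a usage-level sketch of what the new test added in PATCH 01/12 asserts — assuming the `datasets` library is installed and using its public `Dataset.from_json` API; the temporary file below is illustrative, not part of the patches:

    # Sketch: a JSON Lines file whose keys are not alphabetically sorted
    # should keep its original column order once these patches are applied.
    import json
    import tempfile

    from datasets import Dataset

    rows = [
        {"col_3": 0.0, "col_1": "0", "col_2": 0},
        {"col_3": 1.0, "col_1": "1", "col_2": 1},
    ]

    # Write the rows to a temporary .jsonl file (illustrative path).
    with tempfile.NamedTemporaryFile("w", suffix=".jsonl", delete=False) as f:
        for row in rows:
            f.write(json.dumps(row) + "\n")
        path = f.name

    ds = Dataset.from_json(path)
    # With the patches applied, insertion order is preserved:
    print(ds.column_names)  # expected: ['col_3', 'col_1', 'col_2']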