src/datasets/arrow_dataset.py (2 changes: 1 addition & 1 deletion)
@@ -3077,7 +3077,7 @@ def add_item(self, item: dict, new_fingerprint: str):
Returns:
:class:`Dataset`
"""
- item_table = InMemoryTable.from_pydict({k: [v] for k, v in item.items()})
+ item_table = InMemoryTable.from_pydict({k: [item[k]] for k in self.features.keys() if k in item})
# Cast item
schema = pa.schema(self.features.type)
item_table = item_table.cast(schema)
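Note: `add_item` used to build the single-row table in whatever key order the incoming dict happened to have; it now follows the order of `self.features`, so the cast to the dataset schema just below lines up column for column, and keys outside the features are dropped by the comprehension. A minimal sketch of the reordering rule (hypothetical column names, not library code):

# Sketch: reorder an item's columns to match the dataset's feature order.
features_order = ["col_1", "col_2", "col_3"]        # stands in for self.features.keys()
item = {"col_3": 2.0, "col_1": "2", "extra": None}  # arbitrary key order, one unknown key

# Old: {k: [v] for k, v in item.items()} kept the item's own key order.
# New: the feature order wins and unknown keys are dropped.
item_columns = {k: [item[k]] for k in features_order if k in item}
assert list(item_columns) == ["col_1", "col_3"]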
src/datasets/arrow_writer.py (7 changes: 6 additions & 1 deletion)
@@ -272,7 +272,12 @@ def write_examples_on_file(self):
return

# Since current_examples contains (example, key) tuples
cols = sorted(self.current_examples[0][0].keys())
cols = (
[col for col in self._schema.names if col in self.current_examples[0][0]]
+ [col for col in self.current_examples[0][0].keys() if col not in self._schema.names]
if self._schema
else self.current_examples[0][0].keys()
)

schema = None if self.pa_writer is None and self.update_features else self._schema
try_schema = self._schema if self.pa_writer is None and self.update_features else None
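Note: rather than sorting the first buffered example's keys alphabetically, the writer now emits schema columns first, in schema order, and appends any columns the schema does not know about (falling back to the example's own key order when there is no schema yet). A minimal sketch of the new ordering rule (hypothetical values):

# Sketch: schema order first, then extra columns in example order.
schema_names = ["col_1", "col_2"]                   # stands in for self._schema.names
example = {"col_3": 0.0, "col_2": 0, "col_1": "0"}  # first buffered example

cols = [c for c in schema_names if c in example] + [c for c in example if c not in schema_names]
assert cols == ["col_1", "col_2", "col_3"]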
src/datasets/features.py (6 changes: 3 additions & 3 deletions)
@@ -817,8 +817,8 @@ def get_nested_type(schema: FeatureType) -> pa.DataType:
# Nested structures: we allow dict, list/tuples, sequences
if isinstance(schema, Features):
return pa.struct(
- {key: get_nested_type(schema[key]) for key in sorted(schema)}
- ) # sort to make the order of columns deterministic
+ {key: get_nested_type(schema[key]) for key in schema}
+ ) # Features is subclass of dict, and dict order is deterministic since Python 3.6
elif isinstance(schema, dict):
return pa.struct(
{key: get_nested_type(schema[key]) for key in schema}
@@ -829,7 +829,7 @@ def get_nested_type(schema: FeatureType) -> pa.DataType:
return pa.list_(value_type)
elif isinstance(schema, Sequence):
value_type = get_nested_type(schema.feature)
- # We allow to reverse list of dict => dict of list for compatiblity with tfds
+ # We allow to reverse list of dict => dict of list for compatibility with tfds
if isinstance(value_type, pa.StructType):
return pa.struct({f.name: pa.list_(f.type, schema.length) for f in value_type})
return pa.list_(value_type, schema.length)
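Note: `dict` preserves insertion order in CPython 3.6 as an implementation detail and by language guarantee since Python 3.7, so sorting is no longer needed for a deterministic struct: the field order now simply mirrors the `Features` mapping. A quick check of the behavior this relies on (assumes pyarrow is installed):

import pyarrow as pa

# pa.struct keeps the mapping's insertion order as the field order.
struct_type = pa.struct({"col_3": pa.float64(), "col_1": pa.string(), "col_2": pa.int64()})
assert [f.name for f in struct_type] == ["col_3", "col_1", "col_2"]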
tests/conftest.py (14 changes: 14 additions & 0 deletions)
@@ -130,6 +130,11 @@ def xml_file(tmp_path_factory):
"col_3": [0.0, 1.0, 2.0, 3.0],
}

+ DATA_312 = [
+     {"col_3": 0.0, "col_1": "0", "col_2": 0},
+     {"col_3": 1.0, "col_1": "1", "col_2": 1},
+ ]


@pytest.fixture(scope="session")
def dataset_dict():
@@ -182,6 +187,15 @@ def jsonl_path(tmp_path_factory):
return path


+ @pytest.fixture(scope="session")
+ def jsonl_312_path(tmp_path_factory):
+     path = str(tmp_path_factory.mktemp("data") / "dataset_312.jsonl")
+     with open(path, "w") as f:
+         for item in DATA_312:
+             f.write(json.dumps(item) + "\n")
+     return path


@pytest.fixture(scope="session")
def text_path(tmp_path_factory):
data = ["0", "1", "2", "3"]
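For reference, `json.dumps` keeps each dict's insertion order (it only sorts keys with `sort_keys=True`), so the fixture produces a two-line JSONL file whose keys are deliberately out of alphabetical order:

{"col_3": 0.0, "col_1": "0", "col_2": 0}
{"col_3": 1.0, "col_1": "1", "col_2": 1}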
tests/io/test_json.py (24 changes: 23 additions & 1 deletion)
@@ -39,7 +39,6 @@ def test_dataset_from_json_keep_in_memory(keep_in_memory, jsonl_path, tmp_path):
)
def test_dataset_from_json_features(features, jsonl_path, tmp_path):
cache_dir = tmp_path / "cache"
- # CSV file loses col_1 string dtype information: default now is "int64" instead of "string"
default_expected_features = {"col_1": "string", "col_2": "int64", "col_3": "float64"}
expected_features = features.copy() if features else default_expected_features
features = (
@@ -49,6 +48,29 @@ def test_dataset_from_json_features(features, jsonl_path, tmp_path):
_check_json_dataset(dataset, expected_features)


+ @pytest.mark.parametrize(
+     "features",
+     [
+         None,
+         {"col_3": "float64", "col_1": "string", "col_2": "int64"},
+     ],
+ )
+ def test_dataset_from_json_with_unsorted_column_names(features, jsonl_312_path, tmp_path):
+     cache_dir = tmp_path / "cache"
+     default_expected_features = {"col_3": "float64", "col_1": "string", "col_2": "int64"}
+     expected_features = features.copy() if features else default_expected_features
+     features = (
+         Features({feature: Value(dtype) for feature, dtype in features.items()}) if features is not None else None
+     )
+     dataset = JsonDatasetReader(jsonl_312_path, features=features, cache_dir=cache_dir).read()
+     assert isinstance(dataset, Dataset)
+     assert dataset.num_rows == 2
+     assert dataset.num_columns == 3
+     assert dataset.column_names == ["col_3", "col_1", "col_2"]
+     for feature, expected_dtype in expected_features.items():
+         assert dataset.features[feature].dtype == expected_dtype


@pytest.mark.parametrize("split", [None, NamedSplit("train"), "train", "test"])
def test_dataset_from_json_split(split, jsonl_path, tmp_path):
cache_dir = tmp_path / "cache"
tests/test_arrow_dataset.py (6 changes: 5 additions & 1 deletion)
@@ -2393,7 +2393,11 @@ def test_dataset_add_column(column, expected_dtype, in_memory, transform, datase
original_dataset: Dataset = getattr(original_dataset, transform_name)(*args, **kwargs)
dataset = original_dataset.add_column(column_name, column)
assert dataset.data.shape == (4, 4)
expected_features = {"col_1": "string", "col_2": "int64", "col_3": "float64", column_name: expected_dtype}
expected_features = {"col_1": "string", "col_2": "int64", "col_3": "float64"}
# Sort expected features as in the original dataset
expected_features = {feature: expected_features[feature] for feature in original_dataset.features}
# Add new column feature
expected_features[column_name] = expected_dtype
assert dataset.data.column_names == list(expected_features.keys())
for feature, expected_dtype in expected_features.items():
assert dataset.features[feature].dtype == expected_dtype
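Taken together, the tests pin the new contract: column order follows the data file or the `Features` definition instead of being sorted alphabetically. A hedged end-to-end sketch (hypothetical file name, assumes a datasets build that includes this change):

from datasets import load_dataset

ds = load_dataset("json", data_files="dataset_312.jsonl", split="train")
print(ds.column_names)  # expected: ['col_3', 'col_1', 'col_2'], i.e. file order, not sorted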