huggingface · lhoestq · Jun 16, 2021 · Jun 15, 2021 · Jun 15, 2021 · Jun 15, 2021
diff --git a/src/datasets/packaged_modules/json/json.py b/src/datasets/packaged_modules/json/json.py
@@ -91,7 +91,10 @@ def _generate_tables(self, files):
                         f"This JSON file contain the following fields: {str(list(dataset.keys()))}. "
                         f"Select the correct one and provide it as `field='XXX'` to the dataset loading method. "
                     )
-            if self.config.schema:
+            if self.config.features:
                 # Cast allows str <-> int/float, while parse_option explicit_schema does NOT
-                pa_table = pa_table.cast(self.config.schema)
+                # Before casting, rearrange JSON field names to match passed features schema field names order
+                pa_table = pa.Table.from_arrays(
+                    [pa_table[name] for name in self.config.features], schema=self.config.schema
+                )
             yield i, pa_table
diff --git a/tests/io/test_json.py b/tests/io/test_json.py
@@ -71,6 +71,23 @@ def test_dataset_from_json_with_unsorted_column_names(features, jsonl_312_path,
         assert dataset.features[feature].dtype == expected_dtype
 
 
+def test_dataset_from_json_with_mismatched_features(jsonl_312_path, tmp_path):
+    """Check that the loaded columns follow the order of the requested features, not the file's."""
+    # jsonl_312_path features are {"col_3": "float64", "col_1": "string", "col_2": "int64"}
+    expected_features = {"col_2": "int64", "col_3": "float64", "col_1": "string"}
+    features = Features({feature: Value(dtype) for feature, dtype in expected_features.items()})
+    cache_dir = tmp_path / "cache"
+    dataset = JsonDatasetReader(jsonl_312_path, features=features, cache_dir=cache_dir).read()
+    assert isinstance(dataset, Dataset)
+    assert dataset.num_rows == 2
+    assert dataset.num_columns == 3
+    # The requested order must win over the order in which the fields appear in the file
+    assert dataset.column_names == ["col_2", "col_3", "col_1"]
+    # ...and every column must carry the dtype that was requested for it
+    for feature, expected_dtype in expected_features.items():
+        assert dataset.features[feature].dtype == expected_dtype
+
+
 @pytest.mark.parametrize("split", [None, NamedSplit("train"), "train", "test"])
 def test_dataset_from_json_split(split, jsonl_path, tmp_path):
     cache_dir = tmp_path / "cache"