Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 5 additions & 2 deletions src/datasets/packaged_modules/json/json.py
Original file line number Diff line number Diff line change
Expand Up @@ -91,7 +91,10 @@ def _generate_tables(self, files):
f"This JSON file contain the following fields: {str(list(dataset.keys()))}. "
f"Select the correct one and provide it as `field='XXX'` to the dataset loading method. "
)
if self.config.schema:
if self.config.features:
# Cast allows str <-> int/float, while parse_option explicit_schema does NOT
pa_table = pa_table.cast(self.config.schema)
# Before casting, rearrange JSON field names to match passed features schema field names order
pa_table = pa.Table.from_arrays(
[pa_table[name] for name in self.config.features], schema=self.config.schema
)
yield i, pa_table
17 changes: 17 additions & 0 deletions tests/io/test_json.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,23 @@ def test_dataset_from_json_with_unsorted_column_names(features, jsonl_312_path,
assert dataset.features[feature].dtype == expected_dtype


def test_dataset_from_json_with_mismatched_features(jsonl_312_path, tmp_path):
# jsonl_312_path features are {"col_3": "float64", "col_1": "string", "col_2": "int64"}
features = {"col_2": "int64", "col_3": "float64", "col_1": "string"}
expected_features = features.copy()
features = (
Features({feature: Value(dtype) for feature, dtype in features.items()}) if features is not None else None
)
cache_dir = tmp_path / "cache"
dataset = JsonDatasetReader(jsonl_312_path, features=features, cache_dir=cache_dir).read()
assert isinstance(dataset, Dataset)
assert dataset.num_rows == 2
assert dataset.num_columns == 3
assert dataset.column_names == ["col_2", "col_3", "col_1"]
for feature, expected_dtype in expected_features.items():
assert dataset.features[feature].dtype == expected_dtype


@pytest.mark.parametrize("split", [None, NamedSplit("train"), "train", "test"])
def test_dataset_from_json_split(split, jsonl_path, tmp_path):
cache_dir = tmp_path / "cache"
Expand Down