Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion src/datasets/features.py
Original file line number Diff line number Diff line change
Expand Up @@ -604,7 +604,7 @@ def str2int(self, values: Union[str, Iterable]):
if self._str2int:
# strip key if not in dict
if value not in self._str2int:
value = value.strip()
value = str(value).strip()
output.append(self._str2int[str(value)])
else:
# No names provided, try to integerize
Expand Down
6 changes: 6 additions & 0 deletions src/datasets/packaged_modules/json/json.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,12 @@ def _generate_tables(self, files):
f"Select the correct one and provide it as `field='XXX'` to the dataset loading method. "
)
if self.config.features:
# Encode column if ClassLabel
for i, col in enumerate(self.config.features.keys()):
if isinstance(self.config.features[col], datasets.ClassLabel):
pa_table = pa_table.set_column(
i, self.config.schema.field(col), [self.config.features[col].str2int(pa_table[col])]
)
# Cast allows str <-> int/float, while parse_option explicit_schema does NOT
# Before casting, rearrange JSON field names to match passed features schema field names order
pa_table = pa.Table.from_arrays(
Expand Down
16 changes: 16 additions & 0 deletions tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -139,6 +139,13 @@ def xml_file(tmp_path_factory):
{"col_3": 1.0, "col_1": "1", "col_2": 1},
]

# Rows whose "col_1" holds string labels ("s0".."s3"); written out by the
# jsonl_str_path fixture so tests can exercise ClassLabel string->int encoding.
DATA_STR = [
    {"col_1": "s0", "col_2": 0, "col_3": 0.0},
    {"col_1": "s1", "col_2": 1, "col_3": 1.0},
    {"col_1": "s2", "col_2": 2, "col_3": 2.0},
    {"col_1": "s3", "col_2": 3, "col_3": 3.0},
]


@pytest.fixture(scope="session")
def dataset_dict():
Expand Down Expand Up @@ -200,6 +207,15 @@ def jsonl_312_path(tmp_path_factory):
return path


@pytest.fixture(scope="session")
def jsonl_str_path(tmp_path_factory):
    """Write DATA_STR to a JSON Lines file and return its path (session-scoped).

    Returns:
        str: path to a ``dataset-str.jsonl`` file with one JSON object per line.
    """
    path = str(tmp_path_factory.mktemp("data") / "dataset-str.jsonl")
    with open(path, "w") as f:
        for item in DATA_STR:
            # JSON Lines requires a newline after every record; without it the
            # file is one long line of concatenated objects, not valid JSONL.
            f.write(json.dumps(item) + "\n")
    return path


@pytest.fixture(scope="session")
def text_path(tmp_path_factory):
data = ["0", "1", "2", "3"]
Expand Down
9 changes: 9 additions & 0 deletions tests/test_arrow_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -2577,6 +2577,15 @@ def test_dataset_from_json_features(features, jsonl_path, tmp_path):
_check_json_dataset(dataset, expected_features)


def test_dataset_from_json_with_class_label_feature(jsonl_str_path, tmp_path):
    """A ClassLabel column in the requested features must come back int-encoded."""
    label_names = ["s0", "s1", "s2", "s3"]
    features = Features(
        {
            "col_1": ClassLabel(names=label_names),
            "col_2": Value("int64"),
            "col_3": Value("float64"),
        }
    )
    cache_dir = tmp_path / "cache"
    dataset = Dataset.from_json(jsonl_str_path, features=features, cache_dir=cache_dir)
    # The string labels were encoded to their integer ids during loading.
    assert dataset.features["col_1"].dtype == "int64"


@pytest.mark.parametrize("split", [None, NamedSplit("train"), "train", "test"])
def test_dataset_from_json_split(split, jsonl_path, tmp_path):
cache_dir = tmp_path / "cache"
Expand Down