Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion src/datasets/features.py
Original file line number Diff line number Diff line change
Expand Up @@ -604,7 +604,7 @@ def str2int(self, values: Union[str, Iterable]):
if self._str2int:
# strip key if not in dict
if value not in self._str2int:
value = value.strip()
value = str(value).strip()
output.append(self._str2int[str(value)])
else:
# No names provided, try to integerize
Expand Down
6 changes: 6 additions & 0 deletions src/datasets/packaged_modules/json/json.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,12 @@ def _generate_tables(self, files):
f"Select the correct one and provide it as `field='XXX'` to the dataset loading method. "
)
if self.config.features:
# Encode column if ClassLabel
for i, col in enumerate(self.config.features.keys()):
if isinstance(self.config.features[col], datasets.ClassLabel):
pa_table = pa_table.set_column(
i, self.config.schema.field(col), [self.config.features[col].str2int(pa_table[col])]
)
# Cast allows str <-> int/float, while parse_option explicit_schema does NOT
# Before casting, rearrange JSON field names to match passed features schema field names order
pa_table = pa.Table.from_arrays(
Expand Down
16 changes: 16 additions & 0 deletions tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -139,6 +139,13 @@ def xml_file(tmp_path_factory):
{"col_3": 1.0, "col_1": "1", "col_2": 1},
]

# Rows whose "col_1" holds string labels ("s0".."s3"); written out by the
# jsonl_str_path fixture so tests can exercise ClassLabel string->int encoding.
DATA_STR = [
    {"col_1": "s0", "col_2": 0, "col_3": 0.0},
    {"col_1": "s1", "col_2": 1, "col_3": 1.0},
    {"col_1": "s2", "col_2": 2, "col_3": 2.0},
    {"col_1": "s3", "col_2": 3, "col_3": 3.0},
]


@pytest.fixture(scope="session")
def dataset_dict():
Expand Down Expand Up @@ -200,6 +207,15 @@ def jsonl_312_path(tmp_path_factory):
return path


@pytest.fixture(scope="session")
def jsonl_str_path(tmp_path_factory):
    """Write DATA_STR to a JSON Lines file and return its path (session-scoped).

    Returns:
        str: path to a ``dataset-str.jsonl`` file with one JSON object per line.
    """
    path = str(tmp_path_factory.mktemp("data") / "dataset-str.jsonl")
    with open(path, "w") as f:
        for item in DATA_STR:
            # JSON Lines requires a newline after every record; without it the
            # file is one long line of concatenated objects, not valid JSONL.
            f.write(json.dumps(item) + "\n")
    return path


@pytest.fixture(scope="session")
def text_path(tmp_path_factory):
data = ["0", "1", "2", "3"]
Expand Down
9 changes: 9 additions & 0 deletions tests/test_arrow_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -2577,6 +2577,15 @@ def test_dataset_from_json_features(features, jsonl_path, tmp_path):
_check_json_dataset(dataset, expected_features)


def test_dataset_from_json_with_class_label_feature(jsonl_str_path, tmp_path):
    """A ClassLabel column in the requested features must come back int-encoded."""
    label_names = ["s0", "s1", "s2", "s3"]
    features = Features(
        {
            "col_1": ClassLabel(names=label_names),
            "col_2": Value("int64"),
            "col_3": Value("float64"),
        }
    )
    cache_dir = tmp_path / "cache"
    dataset = Dataset.from_json(jsonl_str_path, features=features, cache_dir=cache_dir)
    # The string labels were encoded to their integer ids during loading.
    assert dataset.features["col_1"].dtype == "int64"


@pytest.mark.parametrize("split", [None, NamedSplit("train"), "train", "test"])
def test_dataset_from_json_split(split, jsonl_path, tmp_path):
cache_dir = tmp_path / "cache"
Expand Down