Skip to content

Commit 67b6909

Browse files
Fix NonMatchingSplitsSizesError/ExpectedMoreSplits in no-code Hub datasets when passing data_dir/data_files (#6925)
* Do not use exported dataset infos in some cases
* Add regression tests
1 parent 18cebaf commit 67b6909

File tree

2 files changed

+21
-1
lines changed

2 files changed

+21
-1
lines changed

src/datasets/load.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1235,7 +1235,12 @@ def get_module(self) -> DatasetModule:
                 pass
             metadata_configs = MetadataConfigs.from_dataset_card_data(dataset_card_data)
             dataset_infos = DatasetInfosDict.from_dataset_card_data(dataset_card_data)
-            if config.USE_PARQUET_EXPORT:  # maybe don't use the infos from the parquet export
+            # Use the infos from the parquet export except in some cases:
+            if self.data_dir or self.data_files or (self.revision and self.revision != "main"):
+                use_exported_dataset_infos = False
+            else:
+                use_exported_dataset_infos = True
+            if config.USE_PARQUET_EXPORT and use_exported_dataset_infos:
                 try:
                     exported_dataset_infos = _dataset_viewer.get_exported_dataset_infos(
                         dataset=self.name, revision=self.revision, token=self.download_config.token

tests/test_load.py

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1267,6 +1267,21 @@ def test_load_dataset_cached_local_script(dataset_loading_script_dir, data_dir,
     assert f"Dataset '{SAMPLE_DATASET_NAME_THAT_DOESNT_EXIST}' doesn't exist on the Hub" in str(exc_info.value)


+@pytest.mark.integration
+@pytest.mark.parametrize(
+    "kwargs, expected_train_num_rows, expected_test_num_rows",
+    [
+        ({}, 2, 2),
+        ({"data_dir": "data1"}, 1, 1),  # GH-6918: NonMatchingSplitsSizesError
+        ({"data_files": "data1/train.txt"}, 1, None),  # GH-6939: ExpectedMoreSplits
+    ],
+)
+def test_load_dataset_without_script_from_hub(kwargs, expected_train_num_rows, expected_test_num_rows):
+    dataset = load_dataset(SAMPLE_DATASET_IDENTIFIER3, **kwargs)
+    assert dataset["train"].num_rows == expected_train_num_rows
+    assert (dataset["test"].num_rows == expected_test_num_rows) if expected_test_num_rows else ("test" not in dataset)
+
+
 @pytest.mark.integration
 @pytest.mark.parametrize("stream_from_cache, ", [False, True])
 def test_load_dataset_cached_from_hub(stream_from_cache, caplog):

0 commit comments

Comments (0)