diff --git a/src/datasets/builder.py b/src/datasets/builder.py index 233ebe67140..7f10ece15e1 100644 --- a/src/datasets/builder.py +++ b/src/datasets/builder.py @@ -99,8 +99,8 @@ def create_config_id(self, config_kwargs: dict, custom_features: Optional[Featur """ The config id is used to build the cache directory. By default it is equal to the config name. - However the name of a config is not sufficent to have a unique identifier for the dataset being generated since - it doesn't take into account: + However the name of a config is not sufficient to have a unique identifier for the dataset being generated + since it doesn't take into account: - the config kwargs that can be used to overwrite attributes - the custom features used to write the dataset - the data_files for json/text/csv/pandas datasets @@ -150,8 +150,11 @@ def create_config_id(self, config_kwargs: dict, custom_features: Optional[Featur for key in sorted(data_files.keys()): m.update(key) for data_file in data_files[key]: - m.update(os.path.abspath(data_file)) - m.update(str(os.path.getmtime(data_file))) + if is_remote_url(data_file): + m.update(data_file) + else: + m.update(os.path.abspath(data_file)) + m.update(str(os.path.getmtime(data_file))) suffix = m.hexdigest() if custom_features is not None: diff --git a/tests/test_load.py b/tests/test_load.py index 69e744f396b..803a7134d9b 100644 --- a/tests/test_load.py +++ b/tests/test_load.py @@ -335,3 +335,13 @@ def test_load_from_disk_with_default_in_memory( with assert_arrow_memory_increases() if expected_in_memory else assert_arrow_memory_doesnt_increase(): _ = load_from_disk(dataset_path) + + +def test_remote_data_files(): + repo_id = "albertvillanova/tests-raw-jsonl" + filename = "wikiann-bn-validation.jsonl" + data_files = f"https://huggingface.co/datasets/{repo_id}/resolve/main/{filename}" + ds = load_dataset("json", split="train", data_files=data_files, streaming=True) + assert isinstance(ds, IterableDataset) + ds_item = next(iter(ds)) + assert ds_item.keys() == {"langs", "ner_tags", "spans", "tokens"}