src/datasets/builder.py (7 additions, 4 deletions)
@@ -99,8 +99,8 @@ def create_config_id(self, config_kwargs: dict, custom_features: Optional[Featur
"""
The config id is used to build the cache directory.
By default it is equal to the config name.
However the name of a config is not sufficent to have a unique identifier for the dataset being generated since
it doesn't take into account:
However the name of a config is not sufficient to have a unique identifier for the dataset being generated
since it doesn't take into account:
- the config kwargs that can be used to overwrite attributes
- the custom features used to write the dataset
- the data_files for json/text/csv/pandas datasets
@@ -150,8 +150,11 @@ def create_config_id(self, config_kwargs: dict, custom_features: Optional[Featur
             for key in sorted(data_files.keys()):
                 m.update(key)
                 for data_file in data_files[key]:
-                    m.update(os.path.abspath(data_file))
-                    m.update(str(os.path.getmtime(data_file)))
+                    if is_remote_url(data_file):
+                        m.update(data_file)
+                    else:
+                        m.update(os.path.abspath(data_file))
+                        m.update(str(os.path.getmtime(data_file)))
             suffix = m.hexdigest()
 
         if custom_features is not None:
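To make the effect of this hunk concrete, here is a minimal, self-contained sketch of the cache-suffix computation. It substitutes hashlib for datasets' internal Hasher, and _is_remote_url and config_id_suffix are illustrative stand-ins, not the library's API:

import hashlib
import os
from urllib.parse import urlparse

def _is_remote_url(path: str) -> bool:
    # Hypothetical stand-in for datasets' is_remote_url: treat any path
    # carrying a URL scheme such as http(s) or s3 as remote.
    return urlparse(path).scheme in ("http", "https", "s3", "gs")

def config_id_suffix(data_files: dict) -> str:
    # Mirrors the loop in the hunk above: a remote file contributes only
    # its URL to the hash (it has no local mtime), while a local file
    # contributes its absolute path plus its modification time, so the
    # cache is invalidated when a local file changes on disk.
    m = hashlib.sha256()
    for key in sorted(data_files.keys()):
        m.update(key.encode("utf-8"))
        for data_file in data_files[key]:
            if _is_remote_url(data_file):
                m.update(data_file.encode("utf-8"))
            else:
                m.update(os.path.abspath(data_file).encode("utf-8"))
                m.update(str(os.path.getmtime(data_file)).encode("utf-8"))
    return m.hexdigest()

Before this change, os.path.getmtime was called on every data file, which raises an OSError for a URL; routing remote paths through the URL-only branch is what lets remote data_files participate in the config id.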
tests/test_load.py (10 additions, 0 deletions)
@@ -335,3 +335,13 @@ def test_load_from_disk_with_default_in_memory(

     with assert_arrow_memory_increases() if expected_in_memory else assert_arrow_memory_doesnt_increase():
         _ = load_from_disk(dataset_path)
+
+
+def test_remote_data_files():
+    repo_id = "albertvillanova/tests-raw-jsonl"
+    filename = "wikiann-bn-validation.jsonl"
+    data_files = f"https://huggingface.co/datasets/{repo_id}/resolve/main/{filename}"
+    ds = load_dataset("json", split="train", data_files=data_files, streaming=True)
+    assert isinstance(ds, IterableDataset)
+    ds_item = next(iter(ds))
+    assert ds_item.keys() == {"langs", "ner_tags", "spans", "tokens"}
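The new test exercises the streaming path. As a quick usage sketch, the same remote file can also be loaded without streaming=True, in which case the builder.py change above gives the download a stable cache directory (this assumes the public test file stays available):

from datasets import load_dataset

url = "https://huggingface.co/datasets/albertvillanova/tests-raw-jsonl/resolve/main/wikiann-bn-validation.jsonl"
# Non-streaming load: the remote URL (rather than a nonexistent local
# mtime) is hashed into the config id, so repeated calls resolve to the
# same cache directory.
ds = load_dataset("json", split="train", data_files=url)
print(sorted(ds[0]))  # expect: langs, ner_tags, spans, tokens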