25 changes: 22 additions & 3 deletions src/datasets/utils/streaming_download_manager.py
@@ -15,6 +15,7 @@

logger = get_logger(__name__)
BASE_KNOWN_EXTENSIONS = ["txt", "csv", "json", "jsonl", "tsv", "conll", "conllu", "parquet", "pkl", "pickle", "xml"]
COMPRESSION_KNOWN_EXTENSIONS = ["bz2", "lz4", "xz", "zst"]


def xjoin(a, *p):
@@ -63,6 +64,19 @@ def read_with_retries(*args, **kwargs):
return out

file_obj.read = read_with_retries
return file_obj


def _add_retries_to_fsspec_open_file(fsspec_open_file):
open_ = fsspec_open_file.open

def open_with_retries():
file_obj = open_()
_add_retries_to_file_obj_read_method(file_obj)
return file_obj

fsspec_open_file.open = open_with_retries
return fsspec_open_file


def xopen(file, mode="r", *args, **kwargs):
@@ -74,8 +88,13 @@ def xopen(file, mode="r", *args, **kwargs):
"""
if fsspec.get_fs_token_paths(file)[0].protocol == "https":
kwargs["headers"] = get_authentication_headers_for_url(file, use_auth_token=kwargs.pop("use_auth_token", None))
file_obj = fsspec.open(file, mode=mode, *args, **kwargs).open()
_add_retries_to_file_obj_read_method(file_obj)
compression = fsspec.core.get_compression(file, "infer")
if not compression or compression in ["gzip", "zip"]:
file_obj = fsspec.open(file, mode=mode, *args, **kwargs).open()
file_obj = _add_retries_to_file_obj_read_method(file_obj)
else:
file_obj = fsspec.open(file, mode=mode, compression=compression, *args, **kwargs)
file_obj = _add_retries_to_fsspec_open_file(file_obj)
Comment on lines +91 to +97
Member

xopen is an extension of open to make it work with remote files.

Here you change its behavior for compressed files: you automatically uncompress them. Therefore, if you try to open a compressed file and then use gzip (or any other compression tool) to uncompress it, it won't work, since it's already uncompressed.

I think we should revert this change and explicitly use some tool in the dataset scripts to uncompress the files, as we do in standard Python. Otherwise we may end up with code that works in streaming mode but not in standard mode, and vice versa.

Let me know what you think @albertvillanova

Member Author

No, fsspec.open (even when passed the compression parameter) does not uncompress the file immediately: it returns an OpenFile instance which, when used within a context manager, returns a file object wrapped with a decompressor instance that decompresses on the fly... ;)
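
For illustration, here is a minimal sketch of the lazy behavior described above (the file name is just an example, not one from this PR):

```python
import fsspec

# Building the OpenFile reads and decompresses nothing yet.
open_file = fsspec.open("sample.jsonl.gz", mode="rt", compression="infer")

# Entering the context (or calling .open()) yields a file object wrapped with a
# gzip decompressor, so data is decompressed on the fly as it is read.
with open_file as f:
    first_line = f.readline()
```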

Member Author

And yes, in the end, the result (after having called dl_manager.download_and_extract) will be an uncompressed file, whether streaming or not. That is the objective! 😉

Member Author

The issue is: how do you make StreamingDownloadManager.extract() pass the parameter compression=compression to fsspec.open(urlpath, compression=compression) if they can communicate only through the urlpath parameter?

Because of this, I always pass compression="infer", which assumes that all dataset scripts have called .extract (or .download_and_extract) before calling fsspec.open. This assumption is sensible and will work for all dataset scripts, except for oscar (as you told me yesterday), because you changed oscar to use a call like gzip.open(open()).
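
A minimal sketch of the extension-based inference this relies on (URLs are illustrative; expected return values are shown as comments):

```python
from fsspec.core import get_compression

# The codec is derived purely from the path suffix, so the urlpath alone
# carries all the information xopen needs.
get_compression("https://host/data/sample.jsonl.gz", "infer")  # -> "gzip"
get_compression("https://host/data/sample.jsonl", "infer")     # -> None
```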

return file_obj


@@ -130,7 +149,7 @@ def _extract(self, urlpath):

def _get_extraction_protocol(self, urlpath) -> Optional[str]:
path = urlpath.split("::")[0]
if path.split(".")[-1] in BASE_KNOWN_EXTENSIONS:
if path.split(".")[-1] in BASE_KNOWN_EXTENSIONS + COMPRESSION_KNOWN_EXTENSIONS:
return None
elif path.endswith(".gz") and not path.endswith(".tar.gz"):
return "gzip"
18 changes: 18 additions & 0 deletions tests/test_load.py
@@ -247,6 +247,24 @@ def test_load_dataset_streaming_gz_json(jsonl_gz_path):
assert ds_item == {"col_1": "0", "col_2": 0, "col_3": 0.0}


@require_streaming
@pytest.mark.parametrize(
"path", ["sample.jsonl", "sample.jsonl.gz", "sample.tar", "sample.jsonl.xz", "sample.zip", "sample.jsonl.zst"]
)
def test_load_dataset_streaming_compressed_files(path):
repo_id = "albertvillanova/datasets-tests-compression"
data_files = f"https://huggingface.co/datasets/{repo_id}/resolve/main/{path}"
Member
@lhoestq Aug 16, 2021

This is a really nice feature @albertvillanova!

I think the glob logic has to be moved into a data files resolution module, as done in #2662:

def _resolve_data_files_locally_or_by_urls(

The current implementation may not be robust enough to handle path manipulations by users on compressed files.

Member Author

I have not touched the glob logic in this PR though... 🤔

ds = load_dataset("json", split="train", data_files=data_files, streaming=True)
assert isinstance(ds, IterableDataset)
ds_item = next(iter(ds))
assert ds_item == {
"tokens": ["Ministeri", "de", "Justícia", "d'Espanya"],
"ner_tags": [1, 2, 2, 2],
"langs": ["ca", "ca", "ca", "ca"],
"spans": ["PER: Ministeri de Justícia d'Espanya"],
}


def test_loading_from_the_datasets_hub():
with tempfile.TemporaryDirectory() as tmp_dir:
dataset = load_dataset(SAMPLE_DATASET_IDENTIFIER, cache_dir=tmp_dir)