Skip to content

Commit 98159c4

Browse files
Support streaming compressed files (#2786)
* Pass compression to stream zstd file
* Implement custom readline for io.RawIOBase like
* Fix readline in json module for io.RawIOBase
* Simplify custom readline
* Test load dataset streaming compressed files
* Test xz files
* Support streaming xz compressed files
* Support streaming bz2 and lz4 compressed files
* Fix style in test
* Fix test
* Test zip files
* Test tar files
* Test gzip files
* Implement _add_retries_to_fsspec_open_file
* Add retries to fsspec OpenFile
* Make _add_retries_to_file_obj_read_method return
* Refactor _add_retries_to_fsspec_open_file
1 parent c9fca18 commit 98159c4

2 files changed

Lines changed: 40 additions & 3 deletions

File tree

src/datasets/utils/streaming_download_manager.py

Lines changed: 22 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515

1616
logger = get_logger(__name__)
1717
BASE_KNOWN_EXTENSIONS = ["txt", "csv", "json", "jsonl", "tsv", "conll", "conllu", "parquet", "pkl", "pickle", "xml"]
18+
COMPRESSION_KNOWN_EXTENSIONS = ["bz2", "lz4", "xz", "zst"]
1819

1920

2021
def xjoin(a, *p):
@@ -63,6 +64,19 @@ def read_with_retries(*args, **kwargs):
6364
return out
6465

6566
file_obj.read = read_with_retries
67+
return file_obj
68+
69+
70+
def _add_retries_to_fsspec_open_file(fsspec_open_file):
71+
open_ = fsspec_open_file.open
72+
73+
def open_with_retries():
74+
file_obj = open_()
75+
_add_retries_to_file_obj_read_method(file_obj)
76+
return file_obj
77+
78+
fsspec_open_file.open = open_with_retries
79+
return fsspec_open_file
6680

6781

6882
def xopen(file, mode="r", *args, **kwargs):
@@ -74,8 +88,13 @@ def xopen(file, mode="r", *args, **kwargs):
7488
"""
7589
if fsspec.get_fs_token_paths(file)[0].protocol == "https":
7690
kwargs["headers"] = get_authentication_headers_for_url(file, use_auth_token=kwargs.pop("use_auth_token", None))
77-
file_obj = fsspec.open(file, mode=mode, *args, **kwargs).open()
78-
_add_retries_to_file_obj_read_method(file_obj)
91+
compression = fsspec.core.get_compression(file, "infer")
92+
if not compression or compression in ["gzip", "zip"]:
93+
file_obj = fsspec.open(file, mode=mode, *args, **kwargs).open()
94+
file_obj = _add_retries_to_file_obj_read_method(file_obj)
95+
else:
96+
file_obj = fsspec.open(file, mode=mode, compression=compression, *args, **kwargs)
97+
file_obj = _add_retries_to_fsspec_open_file(file_obj)
7998
return file_obj
8099

81100

@@ -130,7 +149,7 @@ def _extract(self, urlpath):
130149

131150
def _get_extraction_protocol(self, urlpath) -> Optional[str]:
132151
path = urlpath.split("::")[0]
133-
if path.split(".")[-1] in BASE_KNOWN_EXTENSIONS:
152+
if path.split(".")[-1] in BASE_KNOWN_EXTENSIONS + COMPRESSION_KNOWN_EXTENSIONS:
134153
return None
135154
elif path.endswith(".gz") and not path.endswith(".tar.gz"):
136155
return "gzip"

tests/test_load.py

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -247,6 +247,24 @@ def test_load_dataset_streaming_gz_json(jsonl_gz_path):
247247
assert ds_item == {"col_1": "0", "col_2": 0, "col_3": 0.0}
248248

249249

250+
@require_streaming
251+
@pytest.mark.parametrize(
252+
"path", ["sample.jsonl", "sample.jsonl.gz", "sample.tar", "sample.jsonl.xz", "sample.zip", "sample.jsonl.zst"]
253+
)
254+
def test_load_dataset_streaming_compressed_files(path):
255+
repo_id = "albertvillanova/datasets-tests-compression"
256+
data_files = f"https://huggingface.co/datasets/{repo_id}/resolve/main/{path}"
257+
ds = load_dataset("json", split="train", data_files=data_files, streaming=True)
258+
assert isinstance(ds, IterableDataset)
259+
ds_item = next(iter(ds))
260+
assert ds_item == {
261+
"tokens": ["Ministeri", "de", "Justícia", "d'Espanya"],
262+
"ner_tags": [1, 2, 2, 2],
263+
"langs": ["ca", "ca", "ca", "ca"],
264+
"spans": ["PER: Ministeri de Justícia d'Espanya"],
265+
}
266+
267+
250268
def test_loading_from_the_datasets_hub():
251269
with tempfile.TemporaryDirectory() as tmp_dir:
252270
dataset = load_dataset(SAMPLE_DATASET_IDENTIFIER, cache_dir=tmp_dir)

0 commit comments

Comments (0)