Merge pull request #87 from huggingface/master

JayantGoel001 · web-flow · commit 241aad80706c · 2021-09-09T01:51:55.000+05:30
Update: timit_asr - make the dataset streamable (huggingface#2835)
diff --git a/datasets/timit_asr/timit_asr.py b/datasets/timit_asr/timit_asr.py
@@ -121,7 +121,7 @@ def _generate_examples(self, data_info_csv):
         data_path = os.path.join(os.path.dirname(data_info_csv).strip(), "data")
 
         # Read the data info to extract rows mentioning about non-converted audio only
-        data_info = pd.read_csv(data_info_csv, encoding="utf8")
+        data_info = pd.read_csv(open(data_info_csv, encoding="utf8"))
         # making sure that the columns having no information about the file paths are removed
         data_info.dropna(subset=["path_from_data_dir"], inplace=True)
 
diff --git a/docs/source/dataset_streaming.rst b/docs/source/dataset_streaming.rst
@@ -263,7 +263,7 @@ Dataset script compatibility
 Now that you are aware of how dataset streaming works, you can make sure your dataset script work in streaming mode:
 
 1. make sure you use ``open`` to open the data files: it is extended to work with remote files
-2. if you have to deal with archives like ZIP files, make sure you use ``os.path.join`` to navigate in the archive
+2. if you have to deal with archives like ZIP files, make sure you use ``os.path.join`` and ``os.path.dirname`` to navigate in the archive
 
 Currently a few python functions or classes are not supported for dataset streaming:
 
diff --git a/src/datasets/streaming.py b/src/datasets/streaming.py
@@ -5,7 +5,7 @@
 
 from .utils.logging import get_logger
 from .utils.patching import patch_submodule
-from .utils.streaming_download_manager import xjoin, xopen, xpathjoin, xpathopen
+from .utils.streaming_download_manager import xdirname, xjoin, xopen, xpathjoin, xpathopen
 
 
 logger = get_logger(__name__)
@@ -38,6 +38,7 @@ def extend_module_for_streaming(module_path, use_auth_token: Optional[Union[str,
         patch_submodule(module, "open", xopen).start()
     # allow to navigate in remote zip files
     patch_submodule(module, "os.path.join", xjoin).start()
+    patch_submodule(module, "os.path.dirname", xdirname).start()
     if hasattr(module, "Path"):
         patch.object(module.Path, "joinpath", xpathjoin).start()
         patch.object(module.Path, "__truediv__", xpathjoin).start()
diff --git a/src/datasets/utils/streaming_download_manager.py b/src/datasets/utils/streaming_download_manager.py
@@ -57,6 +57,37 @@ def xjoin(a, *p):
     return "::".join([a] + b)
 
 
+def xdirname(a, *p):
+    """
+    This function extends os.path.dirname to support the "::" hop separator. It supports both paths and urls.
+
+    A shorthand, particularly useful where you have multiple hops, is to “chain” the URLs with the special separator "::".
+    This is used to access files inside a zip file over http for example.
+
+    Let's say you have a zip file at https://host.com/archive.zip, and you want to access the file inside the zip file at /folder1/file.txt.
+    Then you can just chain the url this way:
+
+        zip://folder1/file.txt::https://host.com/archive.zip
+
+    The xdirname function allows you to apply the dirname on the first path of the chain.
+
+    Example::
+
+        >>> xdirname("zip://folder1/file.txt::https://host.com/archive.zip")
+        zip://folder1::https://host.com/archive.zip
+    """
+    a, *b = a.split("::")
+    if is_local_path(a):
+        a = os.path.dirname(Path(a).as_posix())
+    else:
+        a = posixpath.dirname(a)
+    # if we end up at the root of the protocol, we get for example a = 'http:'
+    # so we have to fix it by adding the '//' that was removed:
+    if a.endswith(":"):
+        a += "//"
+    return "::".join([a] + b)
+
+
 def _as_posix(path: Path):
     """Extend :meth:`pathlib.PurePath.as_posix` to fix missing slash after protocol.
 
diff --git a/tests/test_streaming_download_manager.py b/tests/test_streaming_download_manager.py
@@ -1,4 +1,5 @@
 import os
+import re
 from pathlib import Path
 
 import pytest
@@ -21,6 +22,28 @@
 TEST_URL_CONTENT = "foo\nbar\nfoobar"
 
 
+def _readd_double_slash_removed_by_path(path_as_posix: str) -> str:
+    """Path(...) on an url path like zip://file.txt::http://host.com/data.zip
+    converts the :// to :/
+    This function readds the ://
+
+    It handles cases like:
+
+    - https://host.com/data.zip
+    - C://data.zip
+    - zip://file.txt::https://host.com/data.zip
+    - zip://file.txt::/Users/username/data.zip
+    - zip://file.txt::C://data.zip
+
+    Args:
+        path_as_posix (str): output of Path(...).as_posix()
+
+    Returns:
+        str: the url path with :// instead of :/
+    """
+    return re.sub("([A-z]:/)([A-z:])", r"\g<1>/\g<2>", path_as_posix)
+
+
 @pytest.mark.parametrize(
     "input_path, expected_path",
     [("zip:/test.txt::/Users/username/bar.zip", "zip://test.txt::/Users/username/bar.zip")],
@@ -32,6 +55,7 @@ def test_as_posix(input_path, expected_path):
 @pytest.mark.parametrize(
     "input_path, paths_to_join, expected_path",
     [
+        (str(Path(__file__).resolve().parent), (Path(__file__).name,), str(Path(__file__).resolve())),
         ("https://host.com/archive.zip", ("file.txt",), "https://host.com/archive.zip/file.txt"),
         (
             "zip://::https://host.com/archive.zip",
@@ -57,11 +81,35 @@ def test_as_posix(input_path, expected_path):
 )
 def test_xjoin(input_path, paths_to_join, expected_path):
     output_path = xjoin(input_path, *paths_to_join)
-    assert output_path == expected_path
+    output_path = _readd_double_slash_removed_by_path(Path(output_path).as_posix())
+    assert output_path == _readd_double_slash_removed_by_path(Path(expected_path).as_posix())
     output_path = xpathjoin(Path(input_path), *paths_to_join)
     assert output_path == Path(expected_path)
 
 
+@pytest.mark.parametrize(
+    "input_path, expected_path",
+    [
+        (str(Path(__file__).resolve()), str(Path(__file__).resolve().parent)),
+        ("https://host.com/archive.zip", "https://host.com"),
+        (
+            "zip://file.txt::https://host.com/archive.zip",
+            "zip://::https://host.com/archive.zip",
+        ),
+        (
+            "zip://folder/file.txt::https://host.com/archive.zip",
+            "zip://folder::https://host.com/archive.zip",
+        ),
+    ],
+)
+def test_xdirname(input_path, expected_path):
+    from datasets.utils.streaming_download_manager import xdirname
+
+    output_path = xdirname(input_path)
+    output_path = _readd_double_slash_removed_by_path(Path(output_path).as_posix())
+    assert output_path == _readd_double_slash_removed_by_path(Path(expected_path).as_posix())
+
+
 def test_xopen_local(text_path):
     with xopen(text_path, encoding="utf-8") as f, open(text_path, encoding="utf-8") as expected_file:
         assert list(f) == list(expected_file)
@@ -99,7 +147,8 @@ def test_streaming_dl_manager_download_and_extract_no_extraction(urlpath):
 def test_streaming_dl_manager_extract(text_gz_path, text_path):
     dl_manager = StreamingDownloadManager()
     output_path = dl_manager.extract(text_gz_path)
-    path = os.path.basename(text_gz_path).rstrip(".gz")
+    path = os.path.basename(text_gz_path)
+    path = path[: path.rindex(".")]
     assert output_path == f"gzip://{path}::{text_gz_path}"
     fsspec_open_file = xopen(output_path, encoding="utf-8")
     with fsspec_open_file as f, open(text_path, encoding="utf-8") as expected_file:
@@ -109,7 +158,8 @@ def test_streaming_dl_manager_extract(text_gz_path, text_path):
 def test_streaming_dl_manager_download_and_extract_with_extraction(text_gz_path, text_path):
     dl_manager = StreamingDownloadManager()
     output_path = dl_manager.download_and_extract(text_gz_path)
-    path = os.path.basename(text_gz_path).rstrip(".gz")
+    path = os.path.basename(text_gz_path)
+    path = path[: path.rindex(".")]
     assert output_path == f"gzip://{path}::{text_gz_path}"
     fsspec_open_file = xopen(output_path, encoding="utf-8")
     with fsspec_open_file as f, open(text_path, encoding="utf-8") as expected_file: