Skip to content

Commit 241aad8

Browse files
Merge pull request #87 from huggingface/master
Update: timit_asr - make the dataset streamable (huggingface#2835)
2 parents 47e51ba + 9a2dff6 commit 241aad8

File tree

5 files changed

+88
-6
lines changed

5 files changed

+88
-6
lines changed

datasets/timit_asr/timit_asr.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -121,7 +121,7 @@ def _generate_examples(self, data_info_csv):
121121
data_path = os.path.join(os.path.dirname(data_info_csv).strip(), "data")
122122

123123
# Read the data info to extract rows mentioning about non-converted audio only
124-
data_info = pd.read_csv(data_info_csv, encoding="utf8")
124+
data_info = pd.read_csv(open(data_info_csv, encoding="utf8"))
125125
# making sure that the columns having no information about the file paths are removed
126126
data_info.dropna(subset=["path_from_data_dir"], inplace=True)
127127

docs/source/dataset_streaming.rst

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -263,7 +263,7 @@ Dataset script compatibility
263263
Now that you are aware of how dataset streaming works, you can make sure your dataset script works in streaming mode:
264264

265265
1. make sure you use ``open`` to open the data files: it is extended to work with remote files
266-
2. if you have to deal with archives like ZIP files, make sure you use ``os.path.join`` to navigate in the archive
266+
2. if you have to deal with archives like ZIP files, make sure you use ``os.path.join`` and ``os.path.dirname`` to navigate in the archive
267267

268268
Currently a few python functions or classes are not supported for dataset streaming:
269269

src/datasets/streaming.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55

66
from .utils.logging import get_logger
77
from .utils.patching import patch_submodule
8-
from .utils.streaming_download_manager import xjoin, xopen, xpathjoin, xpathopen
8+
from .utils.streaming_download_manager import xdirname, xjoin, xopen, xpathjoin, xpathopen
99

1010

1111
logger = get_logger(__name__)
@@ -38,6 +38,7 @@ def extend_module_for_streaming(module_path, use_auth_token: Optional[Union[str,
3838
patch_submodule(module, "open", xopen).start()
3939
# allow to navigate in remote zip files
4040
patch_submodule(module, "os.path.join", xjoin).start()
41+
patch_submodule(module, "os.path.dirname", xdirname).start()
4142
if hasattr(module, "Path"):
4243
patch.object(module.Path, "joinpath", xpathjoin).start()
4344
patch.object(module.Path, "__truediv__", xpathjoin).start()

src/datasets/utils/streaming_download_manager.py

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,37 @@ def xjoin(a, *p):
5757
return "::".join([a] + b)
5858

5959

60+
def xdirname(a, *p):
    """
    Variant of os.path.dirname that understands the "::" hop separator.

    Chained URLs such as ``zip://folder1/file.txt::https://host.com/archive.zip``
    describe a file inside an archive (first hop) that itself lives at another
    location (remaining hops). Only the first hop is reduced to its parent
    directory; every following hop is returned untouched. Both local paths and
    urls are accepted for the first hop.

    Args:
        a: the path or chained url whose first hop should be reduced to its parent.
        *p: accepted but never read by this function.

    Returns:
        The chained path with dirname applied to its first hop.

    Example::

        >>> xdirname("zip://folder1/file.txt::https://host.com/archive.zip")
        'zip://folder1::https://host.com/archive.zip'
    """
    first_hop, *other_hops = a.split("::")
    if is_local_path(first_hop):
        parent = os.path.dirname(Path(first_hop).as_posix())
    else:
        parent = posixpath.dirname(first_hop)
    # Reaching the root of a protocol leaves a bare scheme such as 'http:',
    # because dirname strips the trailing slashes; put the '//' back.
    if parent.endswith(":"):
        parent = parent + "//"
    return "::".join([parent, *other_hops])
89+
90+
6091
def _as_posix(path: Path):
6192
"""Extend :meth:`pathlib.PurePath.as_posix` to fix missing slash after protocol.
6293

tests/test_streaming_download_manager.py

Lines changed: 53 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
import os
2+
import re
23
from pathlib import Path
34

45
import pytest
@@ -21,6 +22,28 @@
2122
TEST_URL_CONTENT = "foo\nbar\nfoobar"
2223

2324

25+
def _readd_double_slash_removed_by_path(path_as_posix: str) -> str:
    """Restore the ``://`` that ``Path(...).as_posix()`` collapses to ``:/``.

    Path(...) on an url path like zip://file.txt::http://host.com/data.zip
    converts every :// to :/ ; this function re-adds the missing slash.

    It handles cases like:

    - https://host.com/data.zip
    - C://data.zip
    - zip://file.txt::https://host.com/data.zip
    - zip://file.txt::/Users/username/data.zip
    - zip://file.txt::C://data.zip

    Args:
        path_as_posix (str): output of Path(...).as_posix()

    Returns:
        str: the url path with :// instead of :/
    """
    # The character before ":/" must be an ASCII letter (end of a scheme or a
    # drive letter). The range [A-z] must not be used here: it also covers the
    # punctuation between "Z" and "a" ("[", "\", "]", "^", "_", "`").
    # The character after ":/" only has to be something other than "/", so an
    # already-doubled slash is left alone and a trailing ":/" is not touched.
    return re.sub(r"([A-Za-z]:/)([^/])", r"\g<1>/\g<2>", path_as_posix)
45+
46+
2447
@pytest.mark.parametrize(
2548
"input_path, expected_path",
2649
[("zip:/test.txt::/Users/username/bar.zip", "zip://test.txt::/Users/username/bar.zip")],
@@ -32,6 +55,7 @@ def test_as_posix(input_path, expected_path):
3255
@pytest.mark.parametrize(
3356
"input_path, paths_to_join, expected_path",
3457
[
58+
(str(Path(__file__).resolve().parent), (Path(__file__).name,), str(Path(__file__).resolve())),
3559
("https://host.com/archive.zip", ("file.txt",), "https://host.com/archive.zip/file.txt"),
3660
(
3761
"zip://::https://host.com/archive.zip",
@@ -57,11 +81,35 @@ def test_as_posix(input_path, expected_path):
5781
)
5882
def test_xjoin(input_path, paths_to_join, expected_path):
5983
output_path = xjoin(input_path, *paths_to_join)
60-
assert output_path == expected_path
84+
output_path = _readd_double_slash_removed_by_path(Path(output_path).as_posix())
85+
assert output_path == _readd_double_slash_removed_by_path(Path(expected_path).as_posix())
6186
output_path = xpathjoin(Path(input_path), *paths_to_join)
6287
assert output_path == Path(expected_path)
6388

6489

90+
@pytest.mark.parametrize(
    "input_path, expected_path",
    [
        (str(Path(__file__).resolve()), str(Path(__file__).resolve().parent)),
        ("https://host.com/archive.zip", "https://host.com"),
        (
            "zip://file.txt::https://host.com/archive.zip",
            "zip://::https://host.com/archive.zip",
        ),
        (
            "zip://folder/file.txt::https://host.com/archive.zip",
            "zip://folder::https://host.com/archive.zip",
        ),
    ],
)
def test_xdirname(input_path, expected_path):
    """Check that xdirname returns the parent of the first hop for local paths, urls and chained urls."""
    from datasets.utils.streaming_download_manager import xdirname

    def normalize(path):
        # Pass both sides through Path(...).as_posix() and then restore the
        # "://" that this round trip collapses, so they are compared in the same form.
        return _readd_double_slash_removed_by_path(Path(path).as_posix())

    assert normalize(xdirname(input_path)) == normalize(expected_path)
111+
112+
65113
def test_xopen_local(text_path):
66114
with xopen(text_path, encoding="utf-8") as f, open(text_path, encoding="utf-8") as expected_file:
67115
assert list(f) == list(expected_file)
@@ -99,7 +147,8 @@ def test_streaming_dl_manager_download_and_extract_no_extraction(urlpath):
99147
def test_streaming_dl_manager_extract(text_gz_path, text_path):
100148
dl_manager = StreamingDownloadManager()
101149
output_path = dl_manager.extract(text_gz_path)
102-
path = os.path.basename(text_gz_path).rstrip(".gz")
150+
path = os.path.basename(text_gz_path)
151+
path = path[: path.rindex(".")]
103152
assert output_path == f"gzip://{path}::{text_gz_path}"
104153
fsspec_open_file = xopen(output_path, encoding="utf-8")
105154
with fsspec_open_file as f, open(text_path, encoding="utf-8") as expected_file:
@@ -109,7 +158,8 @@ def test_streaming_dl_manager_extract(text_gz_path, text_path):
109158
def test_streaming_dl_manager_download_and_extract_with_extraction(text_gz_path, text_path):
110159
dl_manager = StreamingDownloadManager()
111160
output_path = dl_manager.download_and_extract(text_gz_path)
112-
path = os.path.basename(text_gz_path).rstrip(".gz")
161+
path = os.path.basename(text_gz_path)
162+
path = path[: path.rindex(".")]
113163
assert output_path == f"gzip://{path}::{text_gz_path}"
114164
fsspec_open_file = xopen(output_path, encoding="utf-8")
115165
with fsspec_open_file as f, open(text_path, encoding="utf-8") as expected_file:

0 commit comments

Comments
 (0)