Commit d1d3c06

Improve default patterns resolution (#6704)
* Separate filename and dirname patterns
* Nit
* Faster local files resolution
* Style
* Use context manager
* Replace `fsspec.get_fs_token_paths` with `url_to_fs`
* Fix
* Remove context manager
1 parent a02997d commit d1d3c06
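
For context, a minimal sketch of the API difference this commit relies on (not part of the diff; the path is illustrative):

```python
import fsspec
from fsspec.core import url_to_fs

# Old helper: returns (filesystem, token, paths), where `paths` is a list,
# because get_fs_token_paths also tokenizes the URL and expands glob patterns.
fs, token, paths = fsspec.get_fs_token_paths("data/train.csv", storage_options={})

# New helper: returns just (filesystem, path), skipping tokenization and glob
# expansion, which is cheaper when resolving a single local or remote path.
fs, path = url_to_fs("data/train.csv")
```

This is why the call sites below change from unpacking three values (`fs, _, [p] = ...`) to two (`fs, p = ...`), and why `storage_options` is now splatted as keyword arguments (`**(storage_options or {})`).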

File tree

13 files changed, +79 -80 lines changed


src/datasets/arrow_dataset.py

Lines changed: 3 additions & 2 deletions
@@ -59,6 +59,7 @@
 import pandas as pd
 import pyarrow as pa
 import pyarrow.compute as pc
+from fsspec.core import url_to_fs
 from huggingface_hub import CommitInfo, CommitOperationAdd, CommitOperationDelete, DatasetCard, DatasetCardData, HfApi
 from multiprocess import Pool
 from tqdm.contrib.concurrent import thread_map
@@ -1504,7 +1505,7 @@ def save_to_disk(
         num_shards = num_shards if num_shards is not None else num_proc

         fs: fsspec.AbstractFileSystem
-        fs, _, _ = fsspec.get_fs_token_paths(dataset_path, storage_options=storage_options)
+        fs, _ = url_to_fs(dataset_path, **(storage_options or {}))

         if not is_remote_filesystem(fs):
             parent_cache_files_paths = {
@@ -1694,7 +1695,7 @@ def load_from_disk(
             storage_options = fs.storage_options

         fs: fsspec.AbstractFileSystem
-        fs, _, [dataset_path] = fsspec.get_fs_token_paths(dataset_path, storage_options=storage_options)
+        fs, dataset_path = url_to_fs(dataset_path, **(storage_options or {}))

         dest_dataset_path = dataset_path
         dataset_dict_json_path = posixpath.join(dest_dataset_path, config.DATASETDICT_JSON_FILENAME)

src/datasets/arrow_writer.py

Lines changed: 6 additions & 9 deletions
@@ -24,6 +24,7 @@
 import numpy as np
 import pyarrow as pa
 import pyarrow.parquet as pq
+from fsspec.core import url_to_fs

 from . import config
 from .features import Features, Image, Value
@@ -327,14 +328,10 @@ def __init__(
         self._disable_nullable = disable_nullable

         if stream is None:
-            fs_token_paths = fsspec.get_fs_token_paths(path, storage_options=storage_options)
-            self._fs: fsspec.AbstractFileSystem = fs_token_paths[0]
-            self._path = (
-                fs_token_paths[2][0]
-                if not is_remote_filesystem(self._fs)
-                else self._fs.unstrip_protocol(fs_token_paths[2][0])
-            )
-            self.stream = self._fs.open(fs_token_paths[2][0], "wb")
+            fs, path = url_to_fs(path, **(storage_options or {}))
+            self._fs: fsspec.AbstractFileSystem = fs
+            self._path = path if not is_remote_filesystem(self._fs) else self._fs.unstrip_protocol(path)
+            self.stream = self._fs.open(path, "wb")
             self._closable_stream = True
         else:
             self._fs = None
@@ -681,7 +678,7 @@ def finalize(self, metrics_query_result: dict):
         """

         # Beam FileSystems require the system's path separator in the older versions
-        fs, _, [parquet_path] = fsspec.get_fs_token_paths(self._parquet_path)
+        fs, parquet_path = url_to_fs(self._parquet_path)
         parquet_path = str(Path(parquet_path)) if not is_remote_filesystem(fs) else fs.unstrip_protocol(parquet_path)

         shards = fs.glob(parquet_path + "*.parquet")
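
A side note on the `unstrip_protocol` branch kept above: `url_to_fs` returns the path with its protocol stripped, and `unstrip_protocol()` re-attaches it so remote writers keep a fully qualified URL. A tiny sketch (the path is illustrative):

```python
from fsspec.core import url_to_fs

fs, path = url_to_fs("file:///tmp/data.parquet")
print(path)                       # '/tmp/data.parquet': the protocol is stripped
print(fs.unstrip_protocol(path))  # 'file:///tmp/data.parquet': re-qualified
```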

src/datasets/builder.py

Lines changed: 2 additions & 1 deletion
@@ -34,6 +34,7 @@

 import fsspec
 import pyarrow as pa
+from fsspec.core import url_to_fs
 from multiprocess import Pool
 from tqdm.contrib.concurrent import thread_map

@@ -883,7 +884,7 @@ def download_and_prepare(

         output_dir = output_dir if output_dir is not None else self._cache_dir
         # output_dir can be a remote bucket on GCS or S3 (when using BeamBasedBuilder for distributed data processing)
-        fs, _, [output_dir] = fsspec.get_fs_token_paths(output_dir, storage_options=storage_options)
+        fs, output_dir = url_to_fs(output_dir, **(storage_options or {}))
         self._fs = fs
         self._output_dir = output_dir if not is_remote_filesystem(self._fs) else self._fs.unstrip_protocol(output_dir)

src/datasets/data_files.py

Lines changed: 38 additions & 17 deletions
@@ -6,7 +6,7 @@
 from typing import Callable, Dict, List, Optional, Set, Tuple, Union

 import huggingface_hub
-from fsspec import get_fs_token_paths
+from fsspec.core import url_to_fs
 from fsspec.implementations.http import HTTPFileSystem
 from huggingface_hub import HfFileSystem
 from packaging import version
@@ -46,36 +46,57 @@ class EmptyDatasetError(FileNotFoundError):
 }
 NON_WORDS_CHARS = "-._ 0-9"
 if config.FSSPEC_VERSION < version.parse("2023.9.0"):
-    KEYWORDS_IN_PATH_NAME_BASE_PATTERNS = ["{keyword}[{sep}/]**", "**[{sep}/]{keyword}[{sep}/]**"]
+    KEYWORDS_IN_FILENAME_BASE_PATTERNS = ["**[{sep}/]{keyword}[{sep}]*", "{keyword}[{sep}]*"]
+    KEYWORDS_IN_DIR_NAME_BASE_PATTERNS = [
+        "{keyword}/**",
+        "{keyword}[{sep}]*/**",
+        "**[{sep}/]{keyword}/**",
+        "**[{sep}/]{keyword}[{sep}]*/**",
+    ]
 elif config.FSSPEC_VERSION < version.parse("2023.12.0"):
-    KEYWORDS_IN_PATH_NAME_BASE_PATTERNS = ["{keyword}[{sep}/]**", "**/*[{sep}/]{keyword}[{sep}/]**"]
+    KEYWORDS_IN_FILENAME_BASE_PATTERNS = ["**/*[{sep}/]{keyword}[{sep}]*", "{keyword}[{sep}]*"]
+    KEYWORDS_IN_DIR_NAME_BASE_PATTERNS = [
+        "{keyword}/**/*",
+        "{keyword}[{sep}]*/**/*",
+        "**/*[{sep}/]{keyword}/**/*",
+        "**/*[{sep}/]{keyword}[{sep}]*/**/*",
+    ]
 else:
-    KEYWORDS_IN_PATH_NAME_BASE_PATTERNS = [
-        "**/{keyword}[{sep}]*",
+    KEYWORDS_IN_FILENAME_BASE_PATTERNS = ["**/{keyword}[{sep}]*", "**/*[{sep}]{keyword}[{sep}]*"]
+    KEYWORDS_IN_DIR_NAME_BASE_PATTERNS = [
         "**/{keyword}/**",
-        "**/*[{sep}]{keyword}[{sep}]*",
-        "**/*[{sep}]{keyword}[{sep}]*/**",
         "**/{keyword}[{sep}]*/**",
         "**/*[{sep}]{keyword}/**",
+        "**/*[{sep}]{keyword}[{sep}]*/**",
     ]

 DEFAULT_SPLITS = [Split.TRAIN, Split.VALIDATION, Split.TEST]
-DEFAULT_PATTERNS_SPLIT_IN_PATH_NAME = {
+DEFAULT_PATTERNS_SPLIT_IN_FILENAME = {
     split: [
         pattern.format(keyword=keyword, sep=NON_WORDS_CHARS)
         for keyword in SPLIT_KEYWORDS[split]
-        for pattern in KEYWORDS_IN_PATH_NAME_BASE_PATTERNS
+        for pattern in KEYWORDS_IN_FILENAME_BASE_PATTERNS
     ]
     for split in DEFAULT_SPLITS
 }
+DEFAULT_PATTERNS_SPLIT_IN_DIR_NAME = {
+    split: [
+        pattern.format(keyword=keyword, sep=NON_WORDS_CHARS)
+        for keyword in SPLIT_KEYWORDS[split]
+        for pattern in KEYWORDS_IN_DIR_NAME_BASE_PATTERNS
+    ]
+    for split in DEFAULT_SPLITS
+}
+

 DEFAULT_PATTERNS_ALL = {
     Split.TRAIN: ["**"],
 }

 ALL_SPLIT_PATTERNS = [SPLIT_PATTERN_SHARDED]
 ALL_DEFAULT_PATTERNS = [
-    DEFAULT_PATTERNS_SPLIT_IN_PATH_NAME,
+    DEFAULT_PATTERNS_SPLIT_IN_DIR_NAME,
+    DEFAULT_PATTERNS_SPLIT_IN_FILENAME,
     DEFAULT_PATTERNS_ALL,
 ]
 if config.FSSPEC_VERSION < version.parse("2023.9.0"):
@@ -351,7 +372,7 @@ def resolve_pattern(
     else:
         base_path = ""
     pattern, storage_options = _prepare_path_and_storage_options(pattern, download_config=download_config)
-    fs, _, _ = get_fs_token_paths(pattern, storage_options=storage_options)
+    fs, *_ = url_to_fs(pattern, **storage_options)
     fs_base_path = base_path.split("::")[0].split("://")[-1] or fs.root_marker
     fs_pattern = pattern.split("::")[0].split("://")[-1]
     files_to_ignore = set(FILES_TO_IGNORE) - {xbasename(pattern)}
@@ -409,7 +430,7 @@ def get_data_patterns(base_path: str, download_config: Optional[DownloadConfig]

         Output:

-            {"train": ["**"]}
+            {'train': ['**']}

         Input:
@@ -435,8 +456,8 @@ def get_data_patterns(base_path: str, download_config: Optional[DownloadConfig]

         Output:

-            {'train': ['train[-._ 0-9/]**', '**/*[-._ 0-9/]train[-._ 0-9/]**', 'training[-._ 0-9/]**', '**/*[-._ 0-9/]training[-._ 0-9/]**'],
-             'test': ['test[-._ 0-9/]**', '**/*[-._ 0-9/]test[-._ 0-9/]**', 'testing[-._ 0-9/]**', '**/*[-._ 0-9/]testing[-._ 0-9/]**', ...]}
+            {'train': ['**/train[-._ 0-9]*', '**/*[-._ 0-9]train[-._ 0-9]*', '**/training[-._ 0-9]*', '**/*[-._ 0-9]training[-._ 0-9]*'],
+             'test': ['**/test[-._ 0-9]*', '**/*[-._ 0-9]test[-._ 0-9]*', '**/testing[-._ 0-9]*', '**/*[-._ 0-9]testing[-._ 0-9]*', ...]}

         Input:
@@ -454,8 +475,8 @@ def get_data_patterns(base_path: str, download_config: Optional[DownloadConfig]

         Output:

-            {'train': ['train[-._ 0-9/]**', '**/*[-._ 0-9/]train[-._ 0-9/]**', 'training[-._ 0-9/]**', '**/*[-._ 0-9/]training[-._ 0-9/]**'],
-             'test': ['test[-._ 0-9/]**', '**/*[-._ 0-9/]test[-._ 0-9/]**', 'testing[-._ 0-9/]**', '**/*[-._ 0-9/]testing[-._ 0-9/]**', ...]}
+            {'train': ['**/train/**', '**/train[-._ 0-9]*/**', '**/*[-._ 0-9]train/**', '**/*[-._ 0-9]train[-._ 0-9]*/**', ...],
+             'test': ['**/test/**', '**/test[-._ 0-9]*/**', '**/*[-._ 0-9]test/**', '**/*[-._ 0-9]test[-._ 0-9]*/**', ...]}

         Input:
@@ -504,7 +525,7 @@ def _get_single_origin_metadata(
     download_config: Optional[DownloadConfig] = None,
 ) -> Tuple[str]:
     data_file, storage_options = _prepare_path_and_storage_options(data_file, download_config=download_config)
-    fs, _, _ = get_fs_token_paths(data_file, storage_options=storage_options)
+    fs, *_ = url_to_fs(data_file, **storage_options)
     if isinstance(fs, HfFileSystem):
         resolved_path = fs.resolve_path(data_file)
         return (resolved_path.repo_id, resolved_path.revision)
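
The split into `KEYWORDS_IN_FILENAME_BASE_PATTERNS` and `KEYWORDS_IN_DIR_NAME_BASE_PATTERNS` means a split keyword can now be detected either in a file name or in a directory name, via separate pattern sets. A rough, runnable illustration with an in-memory filesystem (the file layout is made up, and it assumes fsspec >= 2023.12 glob semantics):

```python
import fsspec

fs = fsspec.filesystem("memory")
for p in ["data/train-00000-of-00002.parquet", "train/shard_0.csv", "notes/readme.md"]:
    fs.pipe_file(p, b"")

# Filename-style pattern: the keyword starts the file name and is followed
# by a non-word character (one of "-._ 0-9").
print(fs.glob("**/train[-._ 0-9]*"))  # e.g. ['/data/train-00000-of-00002.parquet']

# Dirname-style pattern: the keyword is itself a directory name.
print(fs.glob("**/train/**"))  # e.g. ['/train/shard_0.csv']
```

`ALL_DEFAULT_PATTERNS` lists the dirname patterns before the filename patterns, presumably so that directory-based layouts are tried first when both could match.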

src/datasets/dataset_dict.py

Lines changed: 3 additions & 2 deletions
@@ -12,6 +12,7 @@

 import fsspec
 import numpy as np
+from fsspec.core import url_to_fs
 from huggingface_hub import (
     CommitInfo,
     CommitOperationAdd,
@@ -1280,7 +1281,7 @@ def save_to_disk(
             storage_options = fs.storage_options

         fs: fsspec.AbstractFileSystem
-        fs, _, _ = fsspec.get_fs_token_paths(dataset_dict_path, storage_options=storage_options)
+        fs, _ = url_to_fs(dataset_dict_path, **(storage_options or {}))

         if num_shards is None:
             num_shards = {k: None for k in self}
@@ -1354,7 +1355,7 @@ def load_from_disk(
             storage_options = fs.storage_options

         fs: fsspec.AbstractFileSystem
-        fs, _, [dataset_dict_path] = fsspec.get_fs_token_paths(dataset_dict_path, storage_options=storage_options)
+        fs, dataset_dict_path = url_to_fs(dataset_dict_path, **(storage_options or {}))

         dataset_dict_json_path = posixpath.join(dataset_dict_path, config.DATASETDICT_JSON_FILENAME)
         dataset_state_json_path = posixpath.join(dataset_dict_path, config.DATASET_STATE_JSON_FILENAME)

src/datasets/download/streaming_download_manager.py

Lines changed: 9 additions & 16 deletions
@@ -16,6 +16,7 @@

 import fsspec
 from aiohttp.client_exceptions import ClientError
+from fsspec.core import url_to_fs
 from huggingface_hub.utils import EntryNotFoundError
 from packaging import version

@@ -159,7 +160,7 @@ def xexists(urlpath: str, download_config: Optional[DownloadConfig] = None):
     else:
         urlpath, storage_options = _prepare_path_and_storage_options(urlpath, download_config=download_config)
         main_hop, *rest_hops = urlpath.split("::")
-        fs, *_ = fsspec.get_fs_token_paths(urlpath, storage_options=storage_options)
+        fs, *_ = url_to_fs(urlpath, **storage_options)
         return fs.exists(main_hop)


@@ -259,7 +260,7 @@ def xisfile(path, download_config: Optional[DownloadConfig] = None) -> bool:
     else:
         path, storage_options = _prepare_path_and_storage_options(path, download_config=download_config)
         main_hop, *rest_hops = path.split("::")
-        fs, *_ = fsspec.get_fs_token_paths(path, storage_options=storage_options)
+        fs, *_ = url_to_fs(path, **storage_options)
         return fs.isfile(main_hop)


@@ -279,7 +280,7 @@ def xgetsize(path, download_config: Optional[DownloadConfig] = None) -> int:
     else:
         path, storage_options = _prepare_path_and_storage_options(path, download_config=download_config)
         main_hop, *rest_hops = path.split("::")
-        fs, *_ = fsspec.get_fs_token_paths(path, storage_options=storage_options)
+        fs, *_ = url_to_fs(path, **storage_options)
         try:
             size = fs.size(main_hop)
         except EntryNotFoundError:
@@ -307,7 +308,7 @@ def xisdir(path, download_config: Optional[DownloadConfig] = None) -> bool:
     else:
         path, storage_options = _prepare_path_and_storage_options(path, download_config=download_config)
         main_hop, *rest_hops = path.split("::")
-        fs, *_ = fsspec.get_fs_token_paths(path, storage_options=storage_options)
+        fs, *_ = url_to_fs(path, **storage_options)
         inner_path = main_hop.split("://")[-1]
         if not inner_path.strip("/"):
             return True
@@ -546,7 +547,7 @@ def xlistdir(path: str, download_config: Optional[DownloadConfig] = None) -> List[str]:
         # globbing inside a zip in a private repo requires authentication
         path, storage_options = _prepare_path_and_storage_options(path, download_config=download_config)
         main_hop, *rest_hops = path.split("::")
-        fs, *_ = fsspec.get_fs_token_paths(path, storage_options=storage_options)
+        fs, *_ = url_to_fs(path, **storage_options)
         inner_path = main_hop.split("://")[-1]
         if inner_path.strip("/") and not fs.isdir(inner_path):
             raise FileNotFoundError(f"Directory doesn't exist: {path}")
@@ -573,11 +574,7 @@ def xglob(urlpath, *, recursive=False, download_config: Optional[DownloadConfig] = None):
         # globbing inside a zip in a private repo requires authentication
         urlpath, storage_options = _prepare_path_and_storage_options(urlpath, download_config=download_config)
         main_hop, *rest_hops = urlpath.split("::")
-        fs, *_ = fsspec.get_fs_token_paths(urlpath, storage_options=storage_options)
-        # - If there's no "*" in the pattern, get_fs_token_paths() doesn't do any pattern matching
-        #   so to be able to glob patterns like "[0-9]", we have to call `fs.glob`.
-        # - Also "*" in get_fs_token_paths() only matches files: we have to call `fs.glob` to match directories.
-        # - If there is "**" in the pattern, `fs.glob` must be called anyway.
+        fs, *_ = url_to_fs(urlpath, **storage_options)
         inner_path = main_hop.split("://")[1]
         globbed_paths = fs.glob(inner_path)
         protocol = fs.protocol if isinstance(fs.protocol, str) else fs.protocol[-1]
@@ -603,7 +600,7 @@ def xwalk(urlpath, download_config: Optional[DownloadConfig] = None, **kwargs):
         # walking inside a zip in a private repo requires authentication
         urlpath, storage_options = _prepare_path_and_storage_options(urlpath, download_config=download_config)
         main_hop, *rest_hops = urlpath.split("::")
-        fs, *_ = fsspec.get_fs_token_paths(urlpath, storage_options=storage_options)
+        fs, *_ = url_to_fs(urlpath, **storage_options)
         inner_path = main_hop.split("://")[-1]
         if inner_path.strip("/") and not fs.isdir(inner_path):
             return []
@@ -659,11 +656,7 @@ def glob(self, pattern, download_config: Optional[DownloadConfig] = None):
             posix_path = "::".join([main_hop, urlpath, *rest_hops[1:]])
         else:
             storage_options = None
-        fs, *_ = fsspec.get_fs_token_paths(xjoin(posix_path, pattern), storage_options=storage_options)
-        # - If there's no "*" in the pattern, get_fs_token_paths() doesn't do any pattern matching
-        #   so to be able to glob patterns like "[0-9]", we have to call `fs.glob`.
-        # - Also "*" in get_fs_token_paths() only matches files: we have to call `fs.glob` to match directories.
-        # - If there is "**" in the pattern, `fs.glob` must be called anyway.
+        fs, *_ = url_to_fs(xjoin(posix_path, pattern), **(storage_options or {}))
         globbed_paths = fs.glob(xjoin(main_hop, pattern))
         for globbed_path in globbed_paths:
             yield type(self)("::".join([f"{fs.protocol}://{globbed_path}"] + rest_hops))
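
Every `x*` helper above follows the same shape: split the chained URL on `::`, build the filesystem for the full chain with `url_to_fs`, and query only the first hop. A small runnable sketch of that convention (it writes a throwaway archive under /tmp):

```python
import zipfile
from fsspec.core import url_to_fs

# Create a local archive to stand in for a remote one.
with zipfile.ZipFile("/tmp/archive.zip", "w") as zf:
    zf.writestr("train.csv", "a,b\n1,2\n")

urlpath = "zip://train.csv::/tmp/archive.zip"  # hop 1 :: hop 2
main_hop, *rest_hops = urlpath.split("::")     # "zip://train.csv", ["/tmp/archive.zip"]
fs, *_ = url_to_fs(urlpath)                    # a ZipFileSystem over the local file
print(fs.exists(main_hop))                     # True: train.csv is inside the archive
```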

src/datasets/filesystems/__init__.py

Lines changed: 0 additions & 17 deletions
@@ -1,6 +1,5 @@
 import importlib
 import shutil
-import threading
 import warnings
 from typing import List

@@ -68,19 +67,3 @@ def rename(fs: fsspec.AbstractFileSystem, src: str, dst: str):
         shutil.move(fs._strip_protocol(src), fs._strip_protocol(dst))
     else:
         fs.mv(src, dst, recursive=True)
-
-
-def _reset_fsspec_lock() -> None:
-    """
-    Clear reference to the loop and thread.
-    This is necessary otherwise HTTPFileSystem hangs in the ML training loop.
-    Only required for fsspec >= 0.9.0
-    See https://github.com/fsspec/gcsfs/issues/379
-    """
-    if hasattr(fsspec.asyn, "reset_lock"):
-        # for future fsspec>2022.05.0
-        fsspec.asyn.reset_lock()
-    else:
-        fsspec.asyn.iothread[0] = None
-        fsspec.asyn.loop[0] = None
-        fsspec.asyn.lock = threading.Lock()

src/datasets/info.py

Lines changed: 3 additions & 2 deletions
@@ -39,6 +39,7 @@
 from typing import ClassVar, Dict, List, Optional, Union

 import fsspec
+from fsspec.core import url_to_fs
 from huggingface_hub import DatasetCard, DatasetCardData

 from . import config
@@ -251,7 +252,7 @@ def write_to_directory(
             storage_options = fs.storage_options

         fs: fsspec.AbstractFileSystem
-        fs, _, _ = fsspec.get_fs_token_paths(dataset_info_dir, storage_options=storage_options)
+        fs, *_ = url_to_fs(dataset_info_dir, **(storage_options or {}))
         with fs.open(posixpath.join(dataset_info_dir, config.DATASET_INFO_FILENAME), "wb") as f:
             self._dump_info(f, pretty_print=pretty_print)
         if self.license:
@@ -347,7 +348,7 @@ def from_directory(
             storage_options = fs.storage_options

         fs: fsspec.AbstractFileSystem
-        fs, _, _ = fsspec.get_fs_token_paths(dataset_info_dir, storage_options=storage_options)
+        fs, *_ = url_to_fs(dataset_info_dir, **(storage_options or {}))
         logger.info(f"Loading Dataset info from {dataset_info_dir}")
         if not dataset_info_dir:
             raise ValueError("Calling DatasetInfo.from_directory() with undefined dataset_info_dir.")

src/datasets/iterable_dataset.py

Lines changed: 4 additions & 3 deletions
@@ -9,14 +9,14 @@
 from itertools import cycle, islice
 from typing import Any, Callable, Dict, Iterable, Iterator, List, Optional, Tuple, Union

+import fsspec.asyn
 import numpy as np
 import pyarrow as pa

 from . import config
 from .arrow_dataset import Dataset, DatasetInfoMixin
 from .features import Features
 from .features.features import FeatureType, _align_features, _check_if_features_can_be_aligned, cast_to_python_objects
-from .filesystems import _reset_fsspec_lock
 from .formatting import PythonFormatter, TensorFormatter, get_format_type_from_alias, get_formatter
 from .info import DatasetInfo
 from .splits import NamedSplit
@@ -1257,8 +1257,9 @@ def n_shards(self) -> int:

     def _iter_pytorch(self):
         ex_iterable = self._prepare_ex_iterable_for_iteration()
-        # fix for fsspec when using multiprocess
-        _reset_fsspec_lock()
+        # Fix for fsspec when using multiprocess to avoid hanging in the ML training loop. (only required for fsspec >= 0.9.0)
+        # See https://github.com/fsspec/gcsfs/issues/379
+        fsspec.asyn.reset_lock()
         # check if there aren't too many workers
         import torch.utils.data
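
Inlining `fsspec.asyn.reset_lock()` here drops the old `hasattr` fallback from `_reset_fsspec_lock`, which is safe for the fsspec versions `datasets` now requires. Conceptually it plays the same role as a DataLoader worker-init hook; a hedged sketch (`worker_init_fn` is an illustrative name, not part of this commit):

```python
import fsspec.asyn

def worker_init_fn(worker_id: int) -> None:
    # Forked DataLoader workers inherit the parent's fsspec event-loop and lock
    # state; clearing it prevents HTTPFileSystem from hanging mid-training.
    # See https://github.com/fsspec/gcsfs/issues/379
    fsspec.asyn.reset_lock()
```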
