From 7fa24f7f1afc55aba5b571fce8e3ad4c47208bb0 Mon Sep 17 00:00:00 2001 From: mariosasko Date: Fri, 1 Mar 2024 00:03:07 +0100 Subject: [PATCH 1/8] Separate filename and dirname patterns --- src/datasets/data_files.py | 50 +++++++++++++++++++++++++++----------- tests/test_data_files.py | 2 ++ 2 files changed, 38 insertions(+), 14 deletions(-) diff --git a/src/datasets/data_files.py b/src/datasets/data_files.py index 752145413db..4a28811efdc 100644 --- a/src/datasets/data_files.py +++ b/src/datasets/data_files.py @@ -46,28 +46,48 @@ class EmptyDatasetError(FileNotFoundError): } NON_WORDS_CHARS = "-._ 0-9" if config.FSSPEC_VERSION < version.parse("2023.9.0"): - KEYWORDS_IN_PATH_NAME_BASE_PATTERNS = ["{keyword}[{sep}/]**", "**[{sep}/]{keyword}[{sep}/]**"] + KEYWORDS_IN_FILENAME_BASE_PATTERNS = ["**[{sep}/]{keyword}[{sep}]*", "{keyword}[{sep}]*"] + KEYWORDS_IN_DIR_NAME_BASE_PATTERNS = [ + "{keyword}/**", + "{keyword}[{sep}]*/**", + "**[{sep}/]{keyword}/**", + "**[{sep}/]{keyword}[{sep}]*/**", + ] elif config.FSSPEC_VERSION < version.parse("2023.12.0"): - KEYWORDS_IN_PATH_NAME_BASE_PATTERNS = ["{keyword}[{sep}/]**", "**/*[{sep}/]{keyword}[{sep}/]**"] + KEYWORDS_IN_FILENAME_BASE_PATTERNS = ["**/*[{sep}/]{keyword}[{sep}]*", "{keyword}[{sep}]*"] + KEYWORDS_IN_DIR_NAME_BASE_PATTERNS = [ + "{keyword}/**/*", + "{keyword}[{sep}]*/**/*", + "**/*[{sep}/]{keyword}/**/*", + "**/*[{sep}/]{keyword}[{sep}]*/**/*", + ] else: - KEYWORDS_IN_PATH_NAME_BASE_PATTERNS = [ - "**/{keyword}[{sep}]*", + KEYWORDS_IN_FILENAME_BASE_PATTERNS = ["**/{keyword}[{sep}]*", "**/*[{sep}]{keyword}[{sep}]*"] + KEYWORDS_IN_DIR_NAME_BASE_PATTERNS = [ "**/{keyword}/**", - "**/*[{sep}]{keyword}[{sep}]*", - "**/*[{sep}]{keyword}[{sep}]*/**", "**/{keyword}[{sep}]*/**", "**/*[{sep}]{keyword}/**", + "**/*[{sep}]{keyword}[{sep}]*/**", ] DEFAULT_SPLITS = [Split.TRAIN, Split.VALIDATION, Split.TEST] -DEFAULT_PATTERNS_SPLIT_IN_PATH_NAME = { +DEFAULT_PATTERNS_SPLIT_IN_FILENAME = { split: [ pattern.format(keyword=keyword, 
sep=NON_WORDS_CHARS) for keyword in SPLIT_KEYWORDS[split] - for pattern in KEYWORDS_IN_PATH_NAME_BASE_PATTERNS + for pattern in KEYWORDS_IN_FILENAME_BASE_PATTERNS ] for split in DEFAULT_SPLITS } +DEFAULT_PATTERNS_SPLIT_IN_DIR_NAME = { + split: [ + pattern.format(keyword=keyword, sep=NON_WORDS_CHARS) + for keyword in SPLIT_KEYWORDS[split] + for pattern in KEYWORDS_IN_DIR_NAME_BASE_PATTERNS + ] + for split in DEFAULT_SPLITS +} + DEFAULT_PATTERNS_ALL = { Split.TRAIN: ["**"], @@ -75,7 +95,8 @@ class EmptyDatasetError(FileNotFoundError): ALL_SPLIT_PATTERNS = [SPLIT_PATTERN_SHARDED] ALL_DEFAULT_PATTERNS = [ - DEFAULT_PATTERNS_SPLIT_IN_PATH_NAME, + DEFAULT_PATTERNS_SPLIT_IN_DIR_NAME, + DEFAULT_PATTERNS_SPLIT_IN_FILENAME, DEFAULT_PATTERNS_ALL, ] if config.FSSPEC_VERSION < version.parse("2023.9.0"): @@ -409,7 +430,7 @@ def get_data_patterns(base_path: str, download_config: Optional[DownloadConfig] Output: - {"train": ["**"]} + {'train': ['**']} Input: @@ -435,8 +456,8 @@ def get_data_patterns(base_path: str, download_config: Optional[DownloadConfig] Output: - {'train': ['train[-._ 0-9/]**', '**/*[-._ 0-9/]train[-._ 0-9/]**', 'training[-._ 0-9/]**', '**/*[-._ 0-9/]training[-._ 0-9/]**'], - 'test': ['test[-._ 0-9/]**', '**/*[-._ 0-9/]test[-._ 0-9/]**', 'testing[-._ 0-9/]**', '**/*[-._ 0-9/]testing[-._ 0-9/]**', ...]} + {'train': ['**/train[-._ 0-9]*', '**/*[-._ 0-9]train[-._ 0-9]*', '**/training[-._ 0-9]*', '**/*[-._ 0-9]training[-._ 0-9]*'], + 'test': ['**/test[-._ 0-9]*', '**/*[-._ 0-9]test[-._ 0-9]*', '**/testing[-._ 0-9]*', '**/*[-._ 0-9]testing[-._ 0-9]*', ...]} Input: @@ -454,8 +475,8 @@ def get_data_patterns(base_path: str, download_config: Optional[DownloadConfig] Output: - {'train': ['train[-._ 0-9/]**', '**/*[-._ 0-9/]train[-._ 0-9/]**', 'training[-._ 0-9/]**', '**/*[-._ 0-9/]training[-._ 0-9/]**'], - 'test': ['test[-._ 0-9/]**', '**/*[-._ 0-9/]test[-._ 0-9/]**', 'testing[-._ 0-9/]**', '**/*[-._ 0-9/]testing[-._ 0-9/]**', ...]} + {'train': ['**/train/**', 
'**/train[-._ 0-9]*/**', '**/*[-._ 0-9]train/**', '**/*[-._ 0-9]train[-._ 0-9]*/**', ...], + 'test': ['**/test/**', '**/test[-._ 0-9]*/**', '**/*[-._ 0-9]test/**', '**/*[-._ 0-9]test[-._ 0-9]*/**', ...]} Input: @@ -480,6 +501,7 @@ def get_data_patterns(base_path: str, download_config: Optional[DownloadConfig] """ resolver = partial(resolve_pattern, base_path=base_path, download_config=download_config) try: + # import pdb; pdb.set_trace() return _get_data_files_patterns(resolver) except FileNotFoundError: raise EmptyDatasetError(f"The directory at {base_path} doesn't contain any data files") from None diff --git a/tests/test_data_files.py b/tests/test_data_files.py index 0b4ef65ce2a..dc59bc5c322 100644 --- a/tests/test_data_files.py +++ b/tests/test_data_files.py @@ -635,6 +635,8 @@ def ls(self, path, detail=True, refresh=True, **kwargs): {"test": "my test file.txt"}, {"test": "my-test_file.txt"}, {"test": "test00001.txt"}, + # . case + {"test": "test/train.txt"}, ], ) def test_get_data_files_patterns(base_path, data_file_per_split): From feb87bdee45f8a4cd4d32d25ff7fefcd3943c9f9 Mon Sep 17 00:00:00 2001 From: mariosasko Date: Fri, 1 Mar 2024 17:17:42 +0100 Subject: [PATCH 2/8] Nit --- src/datasets/data_files.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/datasets/data_files.py b/src/datasets/data_files.py index 4a28811efdc..a9f96442ea7 100644 --- a/src/datasets/data_files.py +++ b/src/datasets/data_files.py @@ -501,7 +501,6 @@ def get_data_patterns(base_path: str, download_config: Optional[DownloadConfig] """ resolver = partial(resolve_pattern, base_path=base_path, download_config=download_config) try: - # import pdb; pdb.set_trace() return _get_data_files_patterns(resolver) except FileNotFoundError: raise EmptyDatasetError(f"The directory at {base_path} doesn't contain any data files") from None From 91a67890e72b36d562106e04341e4dd70ee9b211 Mon Sep 17 00:00:00 2001 From: mariosasko Date: Tue, 5 Mar 2024 18:05:38 +0100 Subject: [PATCH 3/8] Faster local 
files resolution --- src/datasets/data_files.py | 5 ++- src/datasets/filesystems/__init__.py | 55 +++++++++++++++++++++------- src/datasets/iterable_dataset.py | 7 ++-- src/datasets/utils/extract.py | 6 ++- 4 files changed, 53 insertions(+), 20 deletions(-) diff --git a/src/datasets/data_files.py b/src/datasets/data_files.py index a9f96442ea7..7808d4a2eaa 100644 --- a/src/datasets/data_files.py +++ b/src/datasets/data_files.py @@ -15,6 +15,7 @@ from . import config from .download import DownloadConfig from .download.streaming_download_manager import _prepare_path_and_storage_options, xbasename, xjoin +from .filesystems import CachedLocalFileSystem, get_fs_from_path, is_local_filesystem from .naming import _split_re from .splits import Split from .utils import logging @@ -372,7 +373,9 @@ def resolve_pattern( else: base_path = "" pattern, storage_options = _prepare_path_and_storage_options(pattern, download_config=download_config) - fs, _, _ = get_fs_token_paths(pattern, storage_options=storage_options) + fs = get_fs_from_path(pattern, storage_options=storage_options) + if is_local_filesystem(fs): + fs = CachedLocalFileSystem(**fs.storage_options) fs_base_path = base_path.split("::")[0].split("://")[-1] or fs.root_marker fs_pattern = pattern.split("::")[0].split("://")[-1] files_to_ignore = set(FILES_TO_IGNORE) - {xbasename(pattern)} diff --git a/src/datasets/filesystems/__init__.py b/src/datasets/filesystems/__init__.py index 15aefa5f42a..e1c420fdc01 100644 --- a/src/datasets/filesystems/__init__.py +++ b/src/datasets/filesystems/__init__.py @@ -1,14 +1,16 @@ import importlib import shutil -import threading import warnings -from typing import List +from functools import lru_cache +from typing import Dict, List, Optional import fsspec import fsspec.asyn +from fsspec.core import _un_chain, filesystem, stringify_path from fsspec.implementations.local import LocalFileSystem from ..utils.deprecation_utils import deprecated +from ..utils.typing import PathLike from . 
import compression @@ -48,6 +50,17 @@ def extract_path_from_uri(dataset_path: str) -> str: return dataset_path +def is_local_filesystem(fs: fsspec.AbstractFileSystem) -> bool: + """ + Checks if `fs` is a local filesystem. + + Args: + fs (`fsspec.spec.AbstractFileSystem`): + An abstract super-class for pythonic file-systems, e.g. `fsspec.filesystem(\'file\')` or [`datasets.filesystems.S3FileSystem`]. + """ + return isinstance(fs, LocalFileSystem) + + def is_remote_filesystem(fs: fsspec.AbstractFileSystem) -> bool: """ Checks if `fs` is a remote filesystem. @@ -70,17 +83,31 @@ def rename(fs: fsspec.AbstractFileSystem, src: str, dst: str): fs.mv(src, dst, recursive=True) -def _reset_fsspec_lock() -> None: +def get_fs_from_path(urlpath: PathLike, storage_options: Optional[Dict] = None) -> fsspec.AbstractFileSystem: """ - Clear reference to the loop and thread. - This is necessary otherwise HTTPFileSystem hangs in the ML training loop. - Only required for fsspec >= 0.9.0 - See https://github.com/fsspec/gcsfs/issues/379 + Get a filesystem object from a urlpath and storage options. 
""" - if hasattr(fsspec.asyn, "reset_lock"): - # for future fsspec>2022.05.0 - fsspec.asyn.reset_lock() - else: - fsspec.asyn.iothread[0] = None - fsspec.asyn.loop[0] = None - fsspec.asyn.lock = threading.Lock() + # Based on fsspec.get_fs_token_paths + # TODO: remove this function once a similar function is added to fsspec + urlpath = stringify_path(urlpath) + storage_options = storage_options or {} + chain = _un_chain(urlpath, storage_options or {}) + inkwargs = {} + # Reverse iterate the chain, creating a nested target_* structure + for i, ch in enumerate(reversed(chain)): + urls, nested_protocol, kw = ch + if i == len(chain) - 1: + inkwargs = dict(**kw, **inkwargs) + continue + inkwargs["target_options"] = dict(**kw, **inkwargs) + inkwargs["target_protocol"] = nested_protocol + inkwargs["fo"] = urls + _, protocol, _ = chain[0] + fs = filesystem(protocol, **inkwargs) + return fs + + +class CachedLocalFileSystem(LocalFileSystem): + @lru_cache(maxsize=1) + def find(self, *args, **kwargs): + return super().find(*args, **kwargs) diff --git a/src/datasets/iterable_dataset.py b/src/datasets/iterable_dataset.py index f508a0b5271..e5d9157f952 100644 --- a/src/datasets/iterable_dataset.py +++ b/src/datasets/iterable_dataset.py @@ -9,6 +9,7 @@ from itertools import cycle, islice from typing import Any, Callable, Dict, Iterable, Iterator, List, Optional, Tuple, Union +import fsspec.asyn import numpy as np import pyarrow as pa @@ -16,7 +17,6 @@ from .arrow_dataset import Dataset, DatasetInfoMixin from .features import Features from .features.features import FeatureType, _align_features, _check_if_features_can_be_aligned, cast_to_python_objects -from .filesystems import _reset_fsspec_lock from .formatting import PythonFormatter, TensorFormatter, get_format_type_from_alias, get_formatter from .info import DatasetInfo from .splits import NamedSplit @@ -1257,8 +1257,9 @@ def n_shards(self) -> int: def _iter_pytorch(self): ex_iterable = self._prepare_ex_iterable_for_iteration() - 
# fix for fsspec when using multiprocess - _reset_fsspec_lock() + # Fix for fsspec when using multiprocess to avoid hanging in the ML training loop. (only required for fsspec >= 0.9.0) + # See https://github.com/fsspec/gcsfs/issues/379 + fsspec.asyn.reset_lock() # check if there aren't too many workers import torch.utils.data diff --git a/src/datasets/utils/extract.py b/src/datasets/utils/extract.py index f516a990962..a620dadf0ac 100644 --- a/src/datasets/utils/extract.py +++ b/src/datasets/utils/extract.py @@ -52,11 +52,13 @@ def extract(self, input_path: str, force_extract: bool = False) -> str: class BaseExtractor(ABC): @classmethod @abstractmethod - def is_extractable(cls, path: Union[Path, str], **kwargs) -> bool: ... + def is_extractable(cls, path: Union[Path, str], **kwargs) -> bool: + ... @staticmethod @abstractmethod - def extract(input_path: Union[Path, str], output_path: Union[Path, str]) -> None: ... + def extract(input_path: Union[Path, str], output_path: Union[Path, str]) -> None: + ... class MagicNumberBaseExtractor(BaseExtractor, ABC): From 2d112bab882ceb30134676ce9477e912b7be4d05 Mon Sep 17 00:00:00 2001 From: mariosasko Date: Tue, 5 Mar 2024 18:07:35 +0100 Subject: [PATCH 4/8] Style --- src/datasets/utils/extract.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/src/datasets/utils/extract.py b/src/datasets/utils/extract.py index a620dadf0ac..f516a990962 100644 --- a/src/datasets/utils/extract.py +++ b/src/datasets/utils/extract.py @@ -52,13 +52,11 @@ def extract(self, input_path: str, force_extract: bool = False) -> str: class BaseExtractor(ABC): @classmethod @abstractmethod - def is_extractable(cls, path: Union[Path, str], **kwargs) -> bool: - ... + def is_extractable(cls, path: Union[Path, str], **kwargs) -> bool: ... @staticmethod @abstractmethod - def extract(input_path: Union[Path, str], output_path: Union[Path, str]) -> None: - ... 
+ def extract(input_path: Union[Path, str], output_path: Union[Path, str]) -> None: ... class MagicNumberBaseExtractor(BaseExtractor, ABC): From 74136d3de7c9b399652779cc673656ed3cfa309a Mon Sep 17 00:00:00 2001 From: mariosasko Date: Wed, 6 Mar 2024 00:24:19 +0100 Subject: [PATCH 5/8] Use context manager --- src/datasets/data_files.py | 57 ++++++++++++++++++++++------ src/datasets/filesystems/__init__.py | 35 +---------------- src/datasets/load.py | 51 +++++++++++++------------ 3 files changed, 73 insertions(+), 70 deletions(-) diff --git a/src/datasets/data_files.py b/src/datasets/data_files.py index 7808d4a2eaa..c71fe4b101b 100644 --- a/src/datasets/data_files.py +++ b/src/datasets/data_files.py @@ -1,13 +1,15 @@ import os import re -from functools import partial +from contextlib import contextmanager +from functools import lru_cache, partial from glob import has_magic from pathlib import Path, PurePath from typing import Callable, Dict, List, Optional, Set, Tuple, Union import huggingface_hub -from fsspec import get_fs_token_paths +from fsspec.core import url_to_fs from fsspec.implementations.http import HTTPFileSystem +from fsspec.implementations.local import LocalFileSystem from huggingface_hub import HfFileSystem from packaging import version from tqdm.contrib.concurrent import thread_map @@ -15,7 +17,7 @@ from . 
import config from .download import DownloadConfig from .download.streaming_download_manager import _prepare_path_and_storage_options, xbasename, xjoin -from .filesystems import CachedLocalFileSystem, get_fs_from_path, is_local_filesystem +from .filesystems import is_local_filesystem from .naming import _split_re from .splits import Split from .utils import logging @@ -38,6 +40,10 @@ class EmptyDatasetError(FileNotFoundError): pass +class DefaultPattern(str): + pass + + SPLIT_PATTERN_SHARDED = "data/{split}-[0-9][0-9][0-9][0-9][0-9]-of-[0-9][0-9][0-9][0-9][0-9]*.*" SPLIT_KEYWORDS = { @@ -74,7 +80,7 @@ class EmptyDatasetError(FileNotFoundError): DEFAULT_SPLITS = [Split.TRAIN, Split.VALIDATION, Split.TEST] DEFAULT_PATTERNS_SPLIT_IN_FILENAME = { split: [ - pattern.format(keyword=keyword, sep=NON_WORDS_CHARS) + DefaultPattern(pattern.format(keyword=keyword, sep=NON_WORDS_CHARS)) for keyword in SPLIT_KEYWORDS[split] for pattern in KEYWORDS_IN_FILENAME_BASE_PATTERNS ] @@ -82,7 +88,7 @@ class EmptyDatasetError(FileNotFoundError): } DEFAULT_PATTERNS_SPLIT_IN_DIR_NAME = { split: [ - pattern.format(keyword=keyword, sep=NON_WORDS_CHARS) + DefaultPattern(pattern.format(keyword=keyword, sep=NON_WORDS_CHARS)) for keyword in SPLIT_KEYWORDS[split] for pattern in KEYWORDS_IN_DIR_NAME_BASE_PATTERNS ] @@ -91,7 +97,7 @@ class EmptyDatasetError(FileNotFoundError): DEFAULT_PATTERNS_ALL = { - Split.TRAIN: ["**"], + Split.TRAIN: [DefaultPattern("**")], } ALL_SPLIT_PATTERNS = [SPLIT_PATTERN_SHARDED] @@ -109,8 +115,8 @@ class EmptyDatasetError(FileNotFoundError): ] # metadata file for ImageFolder and AudioFolder else: METADATA_PATTERNS = [ - "**/metadata.csv", - "**/metadata.jsonl", + DefaultPattern("**/metadata.csv"), + DefaultPattern("**/metadata.jsonl"), ] # metadata file for ImageFolder and AudioFolder WILDCARD_CHARACTERS = "*[]" FILES_TO_IGNORE = [ @@ -319,6 +325,34 @@ def _get_metadata_files_patterns(pattern_resolver: Callable[[str], List[str]]) - raise FileNotFoundError(f"Couldn't 
resolve pattern {pattern} with resolver {pattern_resolver}") +_IS_RESOLVE_PATTERNS_CACHED: bool = False + + +class _LocalFileSystemWithCachedFind(LocalFileSystem): + @lru_cache(maxsize=1) + def find(self, *args, **kwargs): + return super().find(*args, **kwargs) + + +def _url_to_fs(url, **kwargs): + fs, url_ = url_to_fs(url, **kwargs) + if isinstance(url, DefaultPattern) and is_local_filesystem(fs) and _IS_RESOLVE_PATTERNS_CACHED: + # the default patterns are similar to each other, so let's cache calls to avoid expensive disk access if possible + fs = _LocalFileSystemWithCachedFind(**fs.storage_options) + return fs, url_ + + +@contextmanager +def cached_pattern_resolution(): + global _IS_RESOLVE_PATTERNS_CACHED + _IS_RESOLVE_PATTERNS_CACHED = True + try: + yield + finally: + _IS_RESOLVE_PATTERNS_CACHED = False + _LocalFileSystemWithCachedFind.clear_instance_cache() + + def resolve_pattern( pattern: str, base_path: str, @@ -366,6 +400,7 @@ def resolve_pattern( Returns: List[str]: List of paths or URLs to the local or remote files that match the patterns. 
""" + pattern_type = type(pattern) # TODO: use UserString if is_relative_path(pattern): pattern = xjoin(base_path, pattern) elif is_local_path(pattern): @@ -373,9 +408,7 @@ def resolve_pattern( else: base_path = "" pattern, storage_options = _prepare_path_and_storage_options(pattern, download_config=download_config) - fs = get_fs_from_path(pattern, storage_options=storage_options) - if is_local_filesystem(fs): - fs = CachedLocalFileSystem(**fs.storage_options) + fs, _ = _url_to_fs(pattern_type(pattern), **storage_options) fs_base_path = base_path.split("::")[0].split("://")[-1] or fs.root_marker fs_pattern = pattern.split("::")[0].split("://")[-1] files_to_ignore = set(FILES_TO_IGNORE) - {xbasename(pattern)} @@ -528,7 +561,7 @@ def _get_single_origin_metadata( download_config: Optional[DownloadConfig] = None, ) -> Tuple[str]: data_file, storage_options = _prepare_path_and_storage_options(data_file, download_config=download_config) - fs, _, _ = get_fs_token_paths(data_file, storage_options=storage_options) + fs, _ = _url_to_fs(data_file, **storage_options) if isinstance(fs, HfFileSystem): resolved_path = fs.resolve_path(data_file) return (resolved_path.repo_id, resolved_path.revision) diff --git a/src/datasets/filesystems/__init__.py b/src/datasets/filesystems/__init__.py index e1c420fdc01..863dc2fa372 100644 --- a/src/datasets/filesystems/__init__.py +++ b/src/datasets/filesystems/__init__.py @@ -1,16 +1,13 @@ import importlib import shutil import warnings -from functools import lru_cache -from typing import Dict, List, Optional +from typing import List import fsspec import fsspec.asyn -from fsspec.core import _un_chain, filesystem, stringify_path from fsspec.implementations.local import LocalFileSystem from ..utils.deprecation_utils import deprecated -from ..utils.typing import PathLike from . 
import compression @@ -81,33 +78,3 @@ def rename(fs: fsspec.AbstractFileSystem, src: str, dst: str): shutil.move(fs._strip_protocol(src), fs._strip_protocol(dst)) else: fs.mv(src, dst, recursive=True) - - -def get_fs_from_path(urlpath: PathLike, storage_options: Optional[Dict] = None) -> fsspec.AbstractFileSystem: - """ - Get a filesystem object from a urlpath and storage options. - """ - # Based on fsspec.get_fs_token_paths - # TODO: remove this function once a similar function is added to fsspec - urlpath = stringify_path(urlpath) - storage_options = storage_options or {} - chain = _un_chain(urlpath, storage_options or {}) - inkwargs = {} - # Reverse iterate the chain, creating a nested target_* structure - for i, ch in enumerate(reversed(chain)): - urls, nested_protocol, kw = ch - if i == len(chain) - 1: - inkwargs = dict(**kw, **inkwargs) - continue - inkwargs["target_options"] = dict(**kw, **inkwargs) - inkwargs["target_protocol"] = nested_protocol - inkwargs["fo"] = urls - _, protocol, _ = chain[0] - fs = filesystem(protocol, **inkwargs) - return fs - - -class CachedLocalFileSystem(LocalFileSystem): - @lru_cache(maxsize=1) - def find(self, *args, **kwargs): - return super().find(*args, **kwargs) diff --git a/src/datasets/load.py b/src/datasets/load.py index 685e5d543c2..986521fb01c 100644 --- a/src/datasets/load.py +++ b/src/datasets/load.py @@ -46,6 +46,7 @@ DataFilesPatternsDict, DataFilesPatternsList, EmptyDatasetError, + cached_pattern_resolution, get_data_patterns, get_metadata_patterns, sanitize_patterns, @@ -1022,17 +1023,18 @@ def get_module(self) -> DatasetModule: # we need a set of data files to find which dataset builder to use # because we need to infer module name by files extensions base_path = Path(self.path, self.data_dir or "").expanduser().resolve().as_posix() - if self.data_files is not None: - patterns = sanitize_patterns(self.data_files) - elif metadata_configs and not self.data_dir and "data_files" in 
next(iter(metadata_configs.values())): - patterns = sanitize_patterns(next(iter(metadata_configs.values()))["data_files"]) - else: - patterns = get_data_patterns(base_path) - data_files = DataFilesDict.from_patterns( - patterns, - base_path=base_path, - allowed_extensions=ALL_ALLOWED_EXTENSIONS, - ) + with cached_pattern_resolution(): + if self.data_files is not None: + patterns = sanitize_patterns(self.data_files) + elif metadata_configs and not self.data_dir and "data_files" in next(iter(metadata_configs.values())): + patterns = sanitize_patterns(next(iter(metadata_configs.values()))["data_files"]) + else: + patterns = get_data_patterns(base_path) + data_files = DataFilesDict.from_patterns( + patterns, + base_path=base_path, + allowed_extensions=ALL_ALLOWED_EXTENSIONS, + ) module_name, default_builder_kwargs = infer_module_for_data_files( data_files=data_files, path=self.path, @@ -1041,19 +1043,20 @@ def get_module(self) -> DatasetModule: # Collect metadata files if the module supports them supports_metadata = module_name in _MODULE_SUPPORTS_METADATA if self.data_files is None and supports_metadata: - try: - metadata_patterns = get_metadata_patterns(base_path) - except FileNotFoundError: - metadata_patterns = None - if metadata_patterns is not None: - metadata_data_files_list = DataFilesList.from_patterns(metadata_patterns, base_path=base_path) - if metadata_data_files_list: - data_files = DataFilesDict( - { - split: data_files_list + metadata_data_files_list - for split, data_files_list in data_files.items() - } - ) + with cached_pattern_resolution(): + try: + metadata_patterns = get_metadata_patterns(base_path) + except FileNotFoundError: + metadata_patterns = None + if metadata_patterns is not None: + metadata_data_files_list = DataFilesList.from_patterns(metadata_patterns, base_path=base_path) + if metadata_data_files_list: + data_files = DataFilesDict( + { + split: data_files_list + metadata_data_files_list + for split, data_files_list in data_files.items() 
+ } + ) module_path, _ = _PACKAGED_DATASETS_MODULES[module_name] if metadata_configs: From 253c5ef1e29177e89406a3c2c9f8fb81d69bf784 Mon Sep 17 00:00:00 2001 From: mariosasko Date: Thu, 7 Mar 2024 18:13:34 +0100 Subject: [PATCH 6/8] Replace `fsspec.get_fs_token_paths` with `url_to_fs` --- src/datasets/arrow_dataset.py | 5 ++-- src/datasets/arrow_writer.py | 15 +++++------ src/datasets/builder.py | 3 ++- src/datasets/dataset_dict.py | 5 ++-- .../download/streaming_download_manager.py | 25 +++++++------------ src/datasets/info.py | 5 ++-- src/datasets/load.py | 3 ++- src/datasets/utils/file_utils.py | 12 +++------ tests/test_filesystem.py | 3 ++- 9 files changed, 34 insertions(+), 42 deletions(-) diff --git a/src/datasets/arrow_dataset.py b/src/datasets/arrow_dataset.py index 95126664561..d4a45e7207a 100644 --- a/src/datasets/arrow_dataset.py +++ b/src/datasets/arrow_dataset.py @@ -59,6 +59,7 @@ import pandas as pd import pyarrow as pa import pyarrow.compute as pc +from fsspec.core import url_to_fs from huggingface_hub import CommitInfo, CommitOperationAdd, CommitOperationDelete, DatasetCard, DatasetCardData, HfApi from multiprocess import Pool from tqdm.contrib.concurrent import thread_map @@ -1461,7 +1462,7 @@ def save_to_disk( num_shards = num_shards if num_shards is not None else num_proc fs: fsspec.AbstractFileSystem - fs, _, _ = fsspec.get_fs_token_paths(dataset_path, storage_options=storage_options) + fs, _ = url_to_fs(dataset_path, **(storage_options or {})) if not is_remote_filesystem(fs): parent_cache_files_paths = { @@ -1651,7 +1652,7 @@ def load_from_disk( storage_options = fs.storage_options fs: fsspec.AbstractFileSystem - fs, _, [dataset_path] = fsspec.get_fs_token_paths(dataset_path, storage_options=storage_options) + fs, dataset_path = url_to_fs(dataset_path, **(storage_options or {})) dest_dataset_path = dataset_path dataset_dict_json_path = posixpath.join(dest_dataset_path, config.DATASETDICT_JSON_FILENAME) diff --git a/src/datasets/arrow_writer.py 
b/src/datasets/arrow_writer.py index 3fed7e14e38..82e72a91ecc 100644 --- a/src/datasets/arrow_writer.py +++ b/src/datasets/arrow_writer.py @@ -24,6 +24,7 @@ import numpy as np import pyarrow as pa import pyarrow.parquet as pq +from fsspec.core import url_to_fs from . import config from .features import Features, Image, Value @@ -327,14 +328,10 @@ def __init__( self._disable_nullable = disable_nullable if stream is None: - fs_token_paths = fsspec.get_fs_token_paths(path, storage_options=storage_options) - self._fs: fsspec.AbstractFileSystem = fs_token_paths[0] - self._path = ( - fs_token_paths[2][0] - if not is_remote_filesystem(self._fs) - else self._fs.unstrip_protocol(fs_token_paths[2][0]) - ) - self.stream = self._fs.open(fs_token_paths[2][0], "wb") + fs, path = url_to_fs(path, **(storage_options or {})) + self._fs: fsspec.AbstractFileSystem = fs + self._path = path if not is_remote_filesystem(self._fs) else self._fs.unstrip_protocol(path) + self.stream = self._fs.open(path, "wb") self._closable_stream = True else: self._fs = None @@ -681,7 +678,7 @@ def finalize(self, metrics_query_result: dict): """ # Beam FileSystems require the system's path separator in the older versions - fs, _, [parquet_path] = fsspec.get_fs_token_paths(self._parquet_path) + fs, parquet_path = url_to_fs(self._parquet_path) parquet_path = str(Path(parquet_path)) if not is_remote_filesystem(fs) else fs.unstrip_protocol(parquet_path) shards = fs.glob(parquet_path + "*.parquet") diff --git a/src/datasets/builder.py b/src/datasets/builder.py index 8f4bc1962b6..f1a1b76743e 100644 --- a/src/datasets/builder.py +++ b/src/datasets/builder.py @@ -34,6 +34,7 @@ import fsspec import pyarrow as pa +from fsspec.core import url_to_fs from multiprocess import Pool from tqdm.contrib.concurrent import thread_map @@ -867,7 +868,7 @@ def download_and_prepare( output_dir = output_dir if output_dir is not None else self._cache_dir # output_dir can be a remote bucket on GCS or S3 (when using BeamBasedBuilder 
for distributed data processing) - fs, _, [output_dir] = fsspec.get_fs_token_paths(output_dir, storage_options=storage_options) + fs, output_dir = url_to_fs(output_dir, **(storage_options or {})) self._fs = fs self._output_dir = output_dir if not is_remote_filesystem(self._fs) else self._fs.unstrip_protocol(output_dir) diff --git a/src/datasets/dataset_dict.py b/src/datasets/dataset_dict.py index ab26dbbb83d..18c99dac1b8 100644 --- a/src/datasets/dataset_dict.py +++ b/src/datasets/dataset_dict.py @@ -12,6 +12,7 @@ import fsspec import numpy as np +from fsspec.core import url_to_fs from huggingface_hub import ( CommitInfo, CommitOperationAdd, @@ -1280,7 +1281,7 @@ def save_to_disk( storage_options = fs.storage_options fs: fsspec.AbstractFileSystem - fs, _, _ = fsspec.get_fs_token_paths(dataset_dict_path, storage_options=storage_options) + fs, _ = url_to_fs(dataset_dict_path, **(storage_options or {})) if num_shards is None: num_shards = {k: None for k in self} @@ -1354,7 +1355,7 @@ def load_from_disk( storage_options = fs.storage_options fs: fsspec.AbstractFileSystem - fs, _, [dataset_dict_path] = fsspec.get_fs_token_paths(dataset_dict_path, storage_options=storage_options) + fs, dataset_dict_path = url_to_fs(dataset_dict_path, **(storage_options or {})) dataset_dict_json_path = posixpath.join(dataset_dict_path, config.DATASETDICT_JSON_FILENAME) dataset_state_json_path = posixpath.join(dataset_dict_path, config.DATASET_STATE_JSON_FILENAME) diff --git a/src/datasets/download/streaming_download_manager.py b/src/datasets/download/streaming_download_manager.py index 0b347216a9c..819150756fe 100644 --- a/src/datasets/download/streaming_download_manager.py +++ b/src/datasets/download/streaming_download_manager.py @@ -16,6 +16,7 @@ import fsspec from aiohttp.client_exceptions import ClientError +from fsspec.core import url_to_fs from huggingface_hub.utils import EntryNotFoundError from packaging import version @@ -159,7 +160,7 @@ def xexists(urlpath: str, download_config: 
Optional[DownloadConfig] = None): else: urlpath, storage_options = _prepare_path_and_storage_options(urlpath, download_config=download_config) main_hop, *rest_hops = urlpath.split("::") - fs, *_ = fsspec.get_fs_token_paths(urlpath, storage_options=storage_options) + fs, *_ = url_to_fs(urlpath, **storage_options) return fs.exists(main_hop) @@ -259,7 +260,7 @@ def xisfile(path, download_config: Optional[DownloadConfig] = None) -> bool: else: path, storage_options = _prepare_path_and_storage_options(path, download_config=download_config) main_hop, *rest_hops = path.split("::") - fs, *_ = fsspec.get_fs_token_paths(path, storage_options=storage_options) + fs, *_ = url_to_fs(path, **storage_options) return fs.isfile(main_hop) @@ -279,7 +280,7 @@ def xgetsize(path, download_config: Optional[DownloadConfig] = None) -> int: else: path, storage_options = _prepare_path_and_storage_options(path, download_config=download_config) main_hop, *rest_hops = path.split("::") - fs, *_ = fsspec.get_fs_token_paths(path, storage_options=storage_options) + fs, *_ = url_to_fs(path, **storage_options) try: size = fs.size(main_hop) except EntryNotFoundError: @@ -307,7 +308,7 @@ def xisdir(path, download_config: Optional[DownloadConfig] = None) -> bool: else: path, storage_options = _prepare_path_and_storage_options(path, download_config=download_config) main_hop, *rest_hops = path.split("::") - fs, *_ = fsspec.get_fs_token_paths(path, storage_options=storage_options) + fs, *_ = url_to_fs(path, **storage_options) inner_path = main_hop.split("://")[-1] if not inner_path.strip("/"): return True @@ -546,7 +547,7 @@ def xlistdir(path: str, download_config: Optional[DownloadConfig] = None) -> Lis # globbing inside a zip in a private repo requires authentication path, storage_options = _prepare_path_and_storage_options(path, download_config=download_config) main_hop, *rest_hops = path.split("::") - fs, *_ = fsspec.get_fs_token_paths(path, storage_options=storage_options) + fs, *_ = 
url_to_fs(path, **storage_options) inner_path = main_hop.split("://")[-1] if inner_path.strip("/") and not fs.isdir(inner_path): raise FileNotFoundError(f"Directory doesn't exist: {path}") @@ -573,11 +574,7 @@ def xglob(urlpath, *, recursive=False, download_config: Optional[DownloadConfig] # globbing inside a zip in a private repo requires authentication urlpath, storage_options = _prepare_path_and_storage_options(urlpath, download_config=download_config) main_hop, *rest_hops = urlpath.split("::") - fs, *_ = fsspec.get_fs_token_paths(urlpath, storage_options=storage_options) - # - If there's no "*" in the pattern, get_fs_token_paths() doesn't do any pattern matching - # so to be able to glob patterns like "[0-9]", we have to call `fs.glob`. - # - Also "*" in get_fs_token_paths() only matches files: we have to call `fs.glob` to match directories. - # - If there is "**" in the pattern, `fs.glob` must be called anyway. + fs, *_ = url_to_fs(urlpath, **storage_options) inner_path = main_hop.split("://")[1] globbed_paths = fs.glob(inner_path) protocol = fs.protocol if isinstance(fs.protocol, str) else fs.protocol[-1] @@ -603,7 +600,7 @@ def xwalk(urlpath, download_config: Optional[DownloadConfig] = None, **kwargs): # walking inside a zip in a private repo requires authentication urlpath, storage_options = _prepare_path_and_storage_options(urlpath, download_config=download_config) main_hop, *rest_hops = urlpath.split("::") - fs, *_ = fsspec.get_fs_token_paths(urlpath, storage_options=storage_options) + fs, *_ = url_to_fs(urlpath, **storage_options) inner_path = main_hop.split("://")[-1] if inner_path.strip("/") and not fs.isdir(inner_path): return [] @@ -659,11 +656,7 @@ def glob(self, pattern, download_config: Optional[DownloadConfig] = None): posix_path = "::".join([main_hop, urlpath, *rest_hops[1:]]) else: storage_options = None - fs, *_ = fsspec.get_fs_token_paths(xjoin(posix_path, pattern), storage_options=storage_options) - # - If there's no "*" in the pattern, 
get_fs_token_paths() doesn't do any pattern matching - # so to be able to glob patterns like "[0-9]", we have to call `fs.glob`. - # - Also "*" in get_fs_token_paths() only matches files: we have to call `fs.glob` to match directories. - # - If there is "**" in the pattern, `fs.glob` must be called anyway. + fs, *_ = url_to_fs(xjoin(posix_path, pattern), **(storage_options or {})) globbed_paths = fs.glob(xjoin(main_hop, pattern)) for globbed_path in globbed_paths: yield type(self)("::".join([f"{fs.protocol}://{globbed_path}"] + rest_hops)) diff --git a/src/datasets/info.py b/src/datasets/info.py index 74e9a962a0c..557f5b77d3f 100644 --- a/src/datasets/info.py +++ b/src/datasets/info.py @@ -39,6 +39,7 @@ from typing import ClassVar, Dict, List, Optional, Union import fsspec +from fsspec.core import url_to_fs from huggingface_hub import DatasetCard, DatasetCardData from . import config @@ -251,7 +252,7 @@ def write_to_directory( storage_options = fs.storage_options fs: fsspec.AbstractFileSystem - fs, _, _ = fsspec.get_fs_token_paths(dataset_info_dir, storage_options=storage_options) + fs, *_ = url_to_fs(dataset_info_dir, **(storage_options or {})) with fs.open(posixpath.join(dataset_info_dir, config.DATASET_INFO_FILENAME), "wb") as f: self._dump_info(f, pretty_print=pretty_print) if self.license: @@ -347,7 +348,7 @@ def from_directory( storage_options = fs.storage_options fs: fsspec.AbstractFileSystem - fs, _, _ = fsspec.get_fs_token_paths(dataset_info_dir, storage_options=storage_options) + fs, *_ = url_to_fs(dataset_info_dir, **(storage_options or {})) logger.info(f"Loading Dataset info from {dataset_info_dir}") if not dataset_info_dir: raise ValueError("Calling DatasetInfo.from_directory() with undefined dataset_info_dir.") diff --git a/src/datasets/load.py b/src/datasets/load.py index 986521fb01c..9e61b47c34b 100644 --- a/src/datasets/load.py +++ b/src/datasets/load.py @@ -34,6 +34,7 @@ import fsspec import requests import yaml +from fsspec.core import url_to_fs 
from huggingface_hub import DatasetCard, DatasetCardData, HfApi, HfFileSystem from . import config @@ -2676,7 +2677,7 @@ def load_from_disk( storage_options = fs.storage_options fs: fsspec.AbstractFileSystem - fs, _, _ = fsspec.get_fs_token_paths(dataset_path, storage_options=storage_options) + fs, *_ = url_to_fs(dataset_path, **(storage_options or {})) if not fs.exists(dataset_path): raise FileNotFoundError(f"Directory {dataset_path} not found") if fs.isfile(posixpath.join(dataset_path, config.DATASET_INFO_FILENAME)) and fs.isfile( diff --git a/src/datasets/utils/file_utils.py b/src/datasets/utils/file_utils.py index b3532b96976..7c0e49ee3d5 100644 --- a/src/datasets/utils/file_utils.py +++ b/src/datasets/utils/file_utils.py @@ -26,7 +26,7 @@ import fsspec import huggingface_hub import requests -from fsspec.core import strip_protocol +from fsspec.core import strip_protocol, url_to_fs from fsspec.utils import can_be_local from huggingface_hub.utils import insecure_hashlib from packaging import version @@ -315,10 +315,8 @@ def _request_with_retry( def fsspec_head(url, storage_options=None): _raise_if_offline_mode_is_enabled(f"Tried to reach {url}") - fs, _, paths = fsspec.get_fs_token_paths(url, storage_options=storage_options) - if len(paths) > 1: - raise ValueError(f"HEAD can be called with at most one path but was called with {paths}") - return fs.info(paths[0]) + fs, path = url_to_fs(url, **(storage_options or {})) + return fs.info(path) def stack_multiprocessing_download_progress_bars(): @@ -335,9 +333,7 @@ def __init__(self, tqdm_kwargs=None, *args, **kwargs): def fsspec_get(url, temp_file, storage_options=None, desc=None): _raise_if_offline_mode_is_enabled(f"Tried to reach {url}") - fs, _, paths = fsspec.get_fs_token_paths(url, storage_options=storage_options) - if len(paths) > 1: - raise ValueError(f"GET can be called with at most one path but was called with {paths}") + fs, path = url_to_fs(url, **(storage_options or {})) callback = TqdmCallback( 
tqdm_kwargs={ "desc": desc or "Downloading", diff --git a/tests/test_filesystem.py b/tests/test_filesystem.py index 586312973ae..13d94b86881 100644 --- a/tests/test_filesystem.py +++ b/tests/test_filesystem.py @@ -4,6 +4,7 @@ import fsspec import pytest from fsspec import register_implementation +from fsspec.core import url_to_fs from fsspec.registry import _registry as _fsspec_registry from datasets.filesystems import COMPRESSION_FILESYSTEMS, extract_path_from_uri, is_remote_filesystem @@ -68,7 +69,7 @@ def test_fs_isfile(protocol, zip_jsonl_path, jsonl_gz_path): compressed_file_path = compressed_file_paths[protocol] member_file_path = "dataset.jsonl" path = f"{protocol}://{member_file_path}::{compressed_file_path}" - fs, *_ = fsspec.get_fs_token_paths(path) + fs, *_ = url_to_fs(path) assert fs.isfile(member_file_path) assert not fs.isfile("non_existing_" + member_file_path) From a654f28e1876fafe6aacfaa953052311f1c3e767 Mon Sep 17 00:00:00 2001 From: mariosasko Date: Thu, 7 Mar 2024 18:23:40 +0100 Subject: [PATCH 7/8] Fix --- src/datasets/utils/file_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/datasets/utils/file_utils.py b/src/datasets/utils/file_utils.py index 7c0e49ee3d5..7c823831676 100644 --- a/src/datasets/utils/file_utils.py +++ b/src/datasets/utils/file_utils.py @@ -345,7 +345,7 @@ def fsspec_get(url, temp_file, storage_options=None, desc=None): else None, } ) - fs.get_file(paths[0], temp_file.name, callback=callback) + fs.get_file(path, temp_file.name, callback=callback) def ftp_head(url, timeout=10.0): From c9d3c035287f275a4c6503473bbd73ba195baae5 Mon Sep 17 00:00:00 2001 From: mariosasko Date: Thu, 14 Mar 2024 02:08:53 +0100 Subject: [PATCH 8/8] Remove context manager --- src/datasets/data_files.py | 52 +++++----------------------- src/datasets/filesystems/__init__.py | 11 ------ src/datasets/load.py | 51 +++++++++++++-------------- 3 files changed, 32 insertions(+), 82 deletions(-) diff --git 
a/src/datasets/data_files.py b/src/datasets/data_files.py index c71fe4b101b..4a918595408 100644 --- a/src/datasets/data_files.py +++ b/src/datasets/data_files.py @@ -1,7 +1,6 @@ import os import re -from contextlib import contextmanager -from functools import lru_cache, partial +from functools import partial from glob import has_magic from pathlib import Path, PurePath from typing import Callable, Dict, List, Optional, Set, Tuple, Union @@ -9,7 +8,6 @@ import huggingface_hub from fsspec.core import url_to_fs from fsspec.implementations.http import HTTPFileSystem -from fsspec.implementations.local import LocalFileSystem from huggingface_hub import HfFileSystem from packaging import version from tqdm.contrib.concurrent import thread_map @@ -17,7 +15,6 @@ from . import config from .download import DownloadConfig from .download.streaming_download_manager import _prepare_path_and_storage_options, xbasename, xjoin -from .filesystems import is_local_filesystem from .naming import _split_re from .splits import Split from .utils import logging @@ -40,10 +37,6 @@ class EmptyDatasetError(FileNotFoundError): pass -class DefaultPattern(str): - pass - - SPLIT_PATTERN_SHARDED = "data/{split}-[0-9][0-9][0-9][0-9][0-9]-of-[0-9][0-9][0-9][0-9][0-9]*.*" SPLIT_KEYWORDS = { @@ -80,7 +73,7 @@ class DefaultPattern(str): DEFAULT_SPLITS = [Split.TRAIN, Split.VALIDATION, Split.TEST] DEFAULT_PATTERNS_SPLIT_IN_FILENAME = { split: [ - DefaultPattern(pattern.format(keyword=keyword, sep=NON_WORDS_CHARS)) + pattern.format(keyword=keyword, sep=NON_WORDS_CHARS) for keyword in SPLIT_KEYWORDS[split] for pattern in KEYWORDS_IN_FILENAME_BASE_PATTERNS ] @@ -88,7 +81,7 @@ class DefaultPattern(str): } DEFAULT_PATTERNS_SPLIT_IN_DIR_NAME = { split: [ - DefaultPattern(pattern.format(keyword=keyword, sep=NON_WORDS_CHARS)) + pattern.format(keyword=keyword, sep=NON_WORDS_CHARS) for keyword in SPLIT_KEYWORDS[split] for pattern in KEYWORDS_IN_DIR_NAME_BASE_PATTERNS ] @@ -97,7 +90,7 @@ class DefaultPattern(str): 
DEFAULT_PATTERNS_ALL = { - Split.TRAIN: [DefaultPattern("**")], + Split.TRAIN: ["**"], } ALL_SPLIT_PATTERNS = [SPLIT_PATTERN_SHARDED] @@ -115,8 +108,8 @@ class DefaultPattern(str): ] # metadata file for ImageFolder and AudioFolder else: METADATA_PATTERNS = [ - DefaultPattern("**/metadata.csv"), - DefaultPattern("**/metadata.jsonl"), + "**/metadata.csv", + "**/metadata.jsonl", ] # metadata file for ImageFolder and AudioFolder WILDCARD_CHARACTERS = "*[]" FILES_TO_IGNORE = [ @@ -325,34 +318,6 @@ def _get_metadata_files_patterns(pattern_resolver: Callable[[str], List[str]]) - raise FileNotFoundError(f"Couldn't resolve pattern {pattern} with resolver {pattern_resolver}") -_IS_RESOLVE_PATTERNS_CACHED: bool = False - - -class _LocalFileSystemWithCachedFind(LocalFileSystem): - @lru_cache(maxsize=1) - def find(self, *args, **kwargs): - return super().find(*args, **kwargs) - - -def _url_to_fs(url, **kwargs): - fs, url_ = url_to_fs(url, **kwargs) - if isinstance(url, DefaultPattern) and is_local_filesystem(fs) and _IS_RESOLVE_PATTERNS_CACHED: - # the default patterns are similar to each other, so let's cache calls to avoid expensive disk access if possible - fs = _LocalFileSystemWithCachedFind(**fs.storage_options) - return fs, url_ - - -@contextmanager -def cached_pattern_resolution(): - global _IS_RESOLVE_PATTERNS_CACHED - _IS_RESOLVE_PATTERNS_CACHED = True - try: - yield - finally: - _IS_RESOLVE_PATTERNS_CACHED = False - _LocalFileSystemWithCachedFind.clear_instance_cache() - - def resolve_pattern( pattern: str, base_path: str, @@ -400,7 +365,6 @@ def resolve_pattern( Returns: List[str]: List of paths or URLs to the local or remote files that match the patterns. 
""" - pattern_type = type(pattern) # TODO: use UserString if is_relative_path(pattern): pattern = xjoin(base_path, pattern) elif is_local_path(pattern): @@ -408,7 +372,7 @@ def resolve_pattern( else: base_path = "" pattern, storage_options = _prepare_path_and_storage_options(pattern, download_config=download_config) - fs, _ = _url_to_fs(pattern_type(pattern), **storage_options) + fs, *_ = url_to_fs(pattern, **storage_options) fs_base_path = base_path.split("::")[0].split("://")[-1] or fs.root_marker fs_pattern = pattern.split("::")[0].split("://")[-1] files_to_ignore = set(FILES_TO_IGNORE) - {xbasename(pattern)} @@ -561,7 +525,7 @@ def _get_single_origin_metadata( download_config: Optional[DownloadConfig] = None, ) -> Tuple[str]: data_file, storage_options = _prepare_path_and_storage_options(data_file, download_config=download_config) - fs, _ = _url_to_fs(data_file, **storage_options) + fs, *_ = url_to_fs(data_file, **storage_options) if isinstance(fs, HfFileSystem): resolved_path = fs.resolve_path(data_file) return (resolved_path.repo_id, resolved_path.revision) diff --git a/src/datasets/filesystems/__init__.py b/src/datasets/filesystems/__init__.py index 863dc2fa372..c2753e3d380 100644 --- a/src/datasets/filesystems/__init__.py +++ b/src/datasets/filesystems/__init__.py @@ -47,17 +47,6 @@ def extract_path_from_uri(dataset_path: str) -> str: return dataset_path -def is_local_filesystem(fs: fsspec.AbstractFileSystem) -> bool: - """ - Checks if `fs` is a local filesystem. - - Args: - fs (`fsspec.spec.AbstractFileSystem`): - An abstract super-class for pythonic file-systems, e.g. `fsspec.filesystem(\'file\')` or [`datasets.filesystems.S3FileSystem`]. - """ - return isinstance(fs, LocalFileSystem) - - def is_remote_filesystem(fs: fsspec.AbstractFileSystem) -> bool: """ Checks if `fs` is a remote filesystem. 
diff --git a/src/datasets/load.py b/src/datasets/load.py index 9e61b47c34b..bdd5195e4f6 100644 --- a/src/datasets/load.py +++ b/src/datasets/load.py @@ -47,7 +47,6 @@ DataFilesPatternsDict, DataFilesPatternsList, EmptyDatasetError, - cached_pattern_resolution, get_data_patterns, get_metadata_patterns, sanitize_patterns, @@ -1024,18 +1023,17 @@ def get_module(self) -> DatasetModule: # we need a set of data files to find which dataset builder to use # because we need to infer module name by files extensions base_path = Path(self.path, self.data_dir or "").expanduser().resolve().as_posix() - with cached_pattern_resolution(): - if self.data_files is not None: - patterns = sanitize_patterns(self.data_files) - elif metadata_configs and not self.data_dir and "data_files" in next(iter(metadata_configs.values())): - patterns = sanitize_patterns(next(iter(metadata_configs.values()))["data_files"]) - else: - patterns = get_data_patterns(base_path) - data_files = DataFilesDict.from_patterns( - patterns, - base_path=base_path, - allowed_extensions=ALL_ALLOWED_EXTENSIONS, - ) + if self.data_files is not None: + patterns = sanitize_patterns(self.data_files) + elif metadata_configs and not self.data_dir and "data_files" in next(iter(metadata_configs.values())): + patterns = sanitize_patterns(next(iter(metadata_configs.values()))["data_files"]) + else: + patterns = get_data_patterns(base_path) + data_files = DataFilesDict.from_patterns( + patterns, + base_path=base_path, + allowed_extensions=ALL_ALLOWED_EXTENSIONS, + ) module_name, default_builder_kwargs = infer_module_for_data_files( data_files=data_files, path=self.path, @@ -1044,20 +1042,19 @@ def get_module(self) -> DatasetModule: # Collect metadata files if the module supports them supports_metadata = module_name in _MODULE_SUPPORTS_METADATA if self.data_files is None and supports_metadata: - with cached_pattern_resolution(): - try: - metadata_patterns = get_metadata_patterns(base_path) - except FileNotFoundError: - 
metadata_patterns = None - if metadata_patterns is not None: - metadata_data_files_list = DataFilesList.from_patterns(metadata_patterns, base_path=base_path) - if metadata_data_files_list: - data_files = DataFilesDict( - { - split: data_files_list + metadata_data_files_list - for split, data_files_list in data_files.items() - } - ) + try: + metadata_patterns = get_metadata_patterns(base_path) + except FileNotFoundError: + metadata_patterns = None + if metadata_patterns is not None: + metadata_data_files_list = DataFilesList.from_patterns(metadata_patterns, base_path=base_path) + if metadata_data_files_list: + data_files = DataFilesDict( + { + split: data_files_list + metadata_data_files_list + for split, data_files_list in data_files.items() + } + ) module_path, _ = _PACKAGED_DATASETS_MODULES[module_name] if metadata_configs: