Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions src/datasets/arrow_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -669,6 +669,14 @@ def load_from_disk(dataset_path: str, fs=None, keep_in_memory: Optional[bool] =
- if `dataset_path` is a path of a dataset dict directory: a :class:`DatasetDict` with each split.
"""
# copies file from filesystem if it is remote filesystem to local filesystem and modifies dataset_path to temp directory containing local copies
fs = fsspec.filesystem("file") if fs is None else fs
dataset_dict_json_path = Path(dataset_path, config.DATASETDICT_JSON_FILENAME).as_posix()
dataset_info_path = Path(dataset_path, config.DATASET_INFO_FILENAME).as_posix()
if not fs.isfile(dataset_info_path) and fs.isfile(dataset_dict_json_path):
raise FileNotFoundError(
f"No such file or directory: '{dataset_info_path}'. Expected to load a Dataset object, but got a DatasetDict. Please use datasets.load_from_disk instead."
)

if is_remote_filesystem(fs):
src_dataset_path = extract_path_from_uri(dataset_path)
tmp_dir = tempfile.TemporaryDirectory()
Expand Down
1 change: 1 addition & 0 deletions src/datasets/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -157,6 +157,7 @@
# Well-known filenames written into / read from a saved dataset directory.
DATASETDICT_INFOS_FILENAME = "dataset_infos.json"  # per-config dataset infos (usage not visible in this view — confirm at call sites)
LICENSE_FILENAME = "LICENSE"  # dataset license text file
METRIC_INFO_FILENAME = "metric_info.json"  # presumably serialized metric info — confirm at call sites
# Marker file written by DatasetDict.save_to_disk; its presence (vs. a top-level
# dataset_info.json) is how load_from_disk distinguishes a DatasetDict directory
# from a single Dataset directory.
DATASETDICT_JSON_FILENAME = "dataset_dict.json"

# NOTE(review): inferred from the name — namespace for dynamically loaded
# dataset/metric modules; verify in the module-loading code, not shown here.
MODULE_NAME_FOR_DYNAMIC_MODULES = "datasets_modules"

Expand Down
13 changes: 9 additions & 4 deletions src/datasets/dataset_dict.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@

from datasets.utils.doc_utils import is_documented_by

from . import config
from .arrow_dataset import Dataset
from .features import Features
from .filesystems import extract_path_from_uri, is_remote_filesystem
Expand Down Expand Up @@ -673,7 +674,7 @@ def save_to_disk(self, dataset_dict_path: str, fs=None):

json.dump(
{"splits": list(self)},
fs.open(Path(dest_dataset_dict_path, "dataset_dict.json").as_posix(), "w", encoding="utf-8"),
fs.open(Path(dest_dataset_dict_path, config.DATASETDICT_JSON_FILENAME).as_posix(), "w", encoding="utf-8"),
)
for k, dataset in self.items():
dataset.save_to_disk(Path(dest_dataset_dict_path, k).as_posix(), fs)
Expand Down Expand Up @@ -706,9 +707,13 @@ def load_from_disk(dataset_dict_path: str, fs=None, keep_in_memory: Optional[boo
else:
fs = fsspec.filesystem("file")
dest_dataset_dict_path = dataset_dict_path
for k in json.load(
fs.open(Path(dest_dataset_dict_path, "dataset_dict.json").as_posix(), "r", encoding="utf-8")
)["splits"]:
dataset_dict_json_path = Path(dest_dataset_dict_path, config.DATASETDICT_JSON_FILENAME).as_posix()
dataset_info_path = Path(dest_dataset_dict_path, config.DATASET_INFO_FILENAME).as_posix()
if fs.isfile(dataset_info_path) and not fs.isfile(dataset_dict_json_path):
raise FileNotFoundError(
f"No such file or directory: '{dataset_dict_json_path}'. Expected to load a DatasetDict object, but got a Dataset. Please use datasets.load_from_disk instead."
)
for k in json.load(fs.open(dataset_dict_json_path, "r", encoding="utf-8"))["splits"]:
dataset_dict_split_path = (
dataset_dict_path.split("://")[0] + "://" + Path(dest_dataset_dict_path, k).as_posix()
if is_remote_filesystem(fs)
Expand Down
4 changes: 2 additions & 2 deletions src/datasets/load.py
Original file line number Diff line number Diff line change
Expand Up @@ -797,9 +797,9 @@ def load_from_disk(dataset_path: str, fs=None, keep_in_memory: Optional[bool] =

if not fs.exists(dest_dataset_path):
raise FileNotFoundError("Directory {} not found".format(dataset_path))
if fs.isfile(Path(dest_dataset_path, "dataset_info.json").as_posix()):
if fs.isfile(Path(dest_dataset_path, config.DATASET_INFO_FILENAME).as_posix()):
return Dataset.load_from_disk(dataset_path, fs, keep_in_memory=keep_in_memory)
elif fs.isfile(Path(dest_dataset_path, "dataset_dict.json").as_posix()):
elif fs.isfile(Path(dest_dataset_path, config.DATASETDICT_JSON_FILENAME).as_posix()):
return DatasetDict.load_from_disk(dataset_path, fs, keep_in_memory=keep_in_memory)
else:
raise FileNotFoundError(
Expand Down