Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 0 additions & 9 deletions src/datasets/builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -1050,15 +1050,6 @@ def _save_info(self):
with file_lock:
self.info.write_to_directory(self._output_dir, storage_options=self._fs.storage_options)

def _save_infos(self):
    """Write this builder's info, keyed by its config name, to the imported module directory.

    The write is guarded by a ``FileLock`` on ``<output_dir>_infos.lock`` when the
    builder's filesystem is not remote; on a remote filesystem no lock is taken
    (a no-op context manager is used instead).
    """
    if is_remote_filesystem(self._fs):
        # No local lock file is used for remote filesystems.
        guard = contextlib.nullcontext()
    else:
        guard = FileLock(self._output_dir + "_infos.lock")

    with guard:
        # Single-entry mapping: {config name -> dataset info}.
        infos_by_config = DatasetInfosDict(**{self.config.name: self.info})
        infos_by_config.write_to_directory(self.get_imported_module_dir())

def _make_split_generators_kwargs(self, prepare_split_kwargs):
"""Get kwargs for `self._split_generators()` from `prepare_split_kwargs`."""
del prepare_split_kwargs
Expand Down
36 changes: 8 additions & 28 deletions src/datasets/commands/test.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,13 +2,13 @@
import os
from argparse import ArgumentParser
from collections.abc import Generator
from pathlib import Path
from shutil import copyfile, rmtree
from shutil import rmtree

import datasets.config
from datasets.builder import DatasetBuilder
from datasets.commands import BaseDatasetsCLICommand
from datasets.download.download_manager import DownloadMode
from datasets.info import DatasetInfosDict
from datasets.load import dataset_module_factory, get_dataset_builder_class
from datasets.utils.info_utils import VerificationMode
from datasets.utils.logging import ERROR, get_logger
Expand Down Expand Up @@ -157,35 +157,15 @@ def get_builders() -> Generator[DatasetBuilder, None, None]:
num_proc=self._num_proc,
)
builder.as_dataset()
if self._save_infos:
builder._save_infos()

# If save_infos=True, the dataset card (README.md) is created next to the loaded module file.
# If save_infos=True, we create the dataset card (README.md)
# The dataset_infos are saved in the YAML part of the README.md

# Let's move it to the original directory of the dataset, to allow the user to
# upload them on HF at the same time afterwards.
# This is to allow the user to upload them on HF afterwards.
if self._save_infos:
dataset_readme_path = os.path.join(
builder_cls.get_imported_module_dir(), datasets.config.REPOCARD_FILENAME
)
name = Path(path).name + ".py"
combined_path = os.path.join(path, name)
if os.path.isfile(path):
dataset_dir = os.path.dirname(path)
elif os.path.isfile(combined_path):
dataset_dir = path
elif os.path.isdir(path): # for local directories containing only data files
dataset_dir = path
else: # in case of a remote dataset
dataset_dir = None
print(f"Dataset card saved at {dataset_readme_path}")

# Move dataset_info back to the user
if dataset_dir is not None:
user_dataset_readme_path = os.path.join(dataset_dir, datasets.config.REPOCARD_FILENAME)
copyfile(dataset_readme_path, user_dataset_readme_path)
print(f"Dataset card saved at {user_dataset_readme_path}")
save_infos_dir = os.path.basename(path) if not os.path.isdir(path) else path
os.makedirs(save_infos_dir, exist_ok=True)
DatasetInfosDict(**{builder.config.name: builder.info}).write_to_directory(save_infos_dir)
print(f"Dataset card saved at {os.path.join(save_infos_dir, datasets.config.REPOCARD_FILENAME)}")

# If clear_cache=True, the download folder and the dataset builder cache directory are deleted
if self._clear_cache:
Expand Down
4 changes: 0 additions & 4 deletions src/datasets/load.py
Original file line number Diff line number Diff line change
Expand Up @@ -1283,8 +1283,6 @@ def load_dataset(
Whether to copy the dataset in-memory. If `None`, the dataset
will not be copied in-memory unless explicitly enabled by setting `datasets.config.IN_MEMORY_MAX_SIZE` to
nonzero. See more details in the [improve performance](../cache#improve-performance) section.
save_infos (`bool`, defaults to `False`):
Save the dataset information (checksums/size/splits/...).
revision ([`Version`] or `str`, *optional*):
Version of the dataset to load.
As datasets have their own git repository on the Datasets Hub, the default version "main" corresponds to their "main" branch.
Expand Down Expand Up @@ -1428,8 +1426,6 @@ def load_dataset(
keep_in_memory if keep_in_memory is not None else is_small_dataset(builder_instance.info.dataset_size)
)
ds = builder_instance.as_dataset(split=split, verification_mode=verification_mode, in_memory=keep_in_memory)
if save_infos:
builder_instance._save_infos()

return ds

Expand Down
Loading