Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
37 changes: 3 additions & 34 deletions src/datasets/builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@
rename,
)
from .fingerprint import Hasher
from .info import DatasetInfo, DatasetInfosDict, PostProcessedInfo
from .info import DatasetInfo, PostProcessedInfo
from .iterable_dataset import ArrowExamplesIterable, ExamplesIterable, IterableDataset
from .keyhash import DuplicatedKeysError
from .naming import INVALID_WINDOWS_CHARACTERS_IN_PATH, camelcase_to_snakecase
Expand Down Expand Up @@ -349,9 +349,7 @@ def __init__(
# prepare info: DatasetInfo is a standardized dataclass shared across all datasets
# Prefill datasetinfo
if info is None:
# TODO FOR PACKAGED MODULES IT IMPORTS DATA FROM src/packaged_modules which doesn't make sense
info = self.get_exported_dataset_info()
info.update(self._info())
info = self._info()
info.builder_name = self.name
info.dataset_name = self.dataset_name
info.config_name = self.config.name
Expand Down Expand Up @@ -391,7 +389,7 @@ def __init__(
if os.path.exists(self._cache_dir): # check if data exist
if len(os.listdir(self._cache_dir)) > 0:
if os.path.exists(os.path.join(self._cache_dir, config.DATASET_INFO_FILENAME)):
logger.info("Overwrite dataset info from restored data version if exists.")
logger.debug("Overwrite dataset info from restored data version if exists.")
self.info = DatasetInfo.from_directory(self._cache_dir)
else: # dir exists but no data, remove the empty dir as data aren't available anymore
logger.warning(
Expand Down Expand Up @@ -503,35 +501,6 @@ def update_hash_with_config_parameters(hash: str, config_parameters: dict) -> st
if os.path.isdir(legacy_cache_dir):
return legacy_relative_data_dir

@classmethod
def get_all_exported_dataset_infos(cls) -> DatasetInfosDict:
    """Load the exported dataset infos for every configuration of this builder.

    The infos are read with `DatasetInfosDict.from_directory` from the directory
    returned by `cls.get_imported_module_dir()`.

    Returns:
        `DatasetInfosDict`: the exported infos keyed by configuration name; an empty
        dict if no exported infos exist.

    Example:

    ```py
    >>> from datasets import load_dataset_builder
    >>> ds_builder = load_dataset_builder('vivos')
    >>> ds_builder.get_all_exported_dataset_infos()
    {'default': DatasetInfo(description='', citation='', homepage='', license='', features={'speaker_id': Value('string'), 'path': Value('string'), 'audio': Audio(sampling_rate=16000, mono=True, decode=True, id=None), 'sentence': Value('string')}, post_processed=None, supervised_keys=None, builder_name=None, dataset_name=None, config_name='default', version=None, splits={'train': SplitInfo(name='train', num_bytes=1722002133, num_examples=11660, shard_lengths=None, dataset_name=None), 'test': SplitInfo(name='test', num_bytes=86120227, num_examples=760, shard_lengths=None, dataset_name=None)}, download_checksums=None, download_size=1475540500, post_processing_size=None, dataset_size=1808122360, size_in_bytes=None)}
    ```
    """
    module_dir = cls.get_imported_module_dir()
    return DatasetInfosDict.from_directory(module_dir)

def get_exported_dataset_info(self) -> DatasetInfo:
    """Return the exported `DatasetInfo` for this builder's configuration.

    Looks up `self.config.name` in the mapping returned by
    `get_all_exported_dataset_infos()`.

    Returns:
        `DatasetInfo`: the exported info stored under the current configuration
        name, or an empty `DatasetInfo` if there is no entry for it.

    Example:

    ```py
    >>> from datasets import load_dataset_builder
    >>> ds_builder = load_dataset_builder('cornell-movie-review-data/rotten_tomatoes')
    >>> ds_builder.get_exported_dataset_info()
    DatasetInfo(...)
    ```

    The printed fields depend on the metadata exported for the loaded dataset.
    """
    exported_infos = self.get_all_exported_dataset_infos()
    return exported_infos.get(self.config.name, DatasetInfo())

def _create_builder_config(
self, config_name=None, custom_features=None, **config_kwargs
) -> tuple[BuilderConfig, str]:
Expand Down
6 changes: 3 additions & 3 deletions src/datasets/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@
if TORCH_AVAILABLE:
try:
TORCH_VERSION = version.parse(importlib.metadata.version("torch"))
logger.info(f"PyTorch version {TORCH_VERSION} available.")
logger.debug(f"PyTorch version {TORCH_VERSION} available.")
except importlib.metadata.PackageNotFoundError:
pass
else:
Expand All @@ -63,7 +63,7 @@
if POLARS_AVAILABLE:
try:
POLARS_VERSION = version.parse(importlib.metadata.version("polars"))
logger.info(f"Polars version {POLARS_VERSION} available.")
logger.debug(f"Polars version {POLARS_VERSION} available.")
except importlib.metadata.PackageNotFoundError:
pass

Expand All @@ -74,7 +74,7 @@
if DUCKDB_AVAILABLE:
try:
DUCKDB_VERSION = version.parse(importlib.metadata.version("duckdb"))
logger.info(f"Duckdb version {DUCKDB_VERSION} available.")
logger.debug(f"Duckdb version {DUCKDB_VERSION} available.")
except importlib.metadata.PackageNotFoundError:
pass

Expand Down
16 changes: 10 additions & 6 deletions src/datasets/features/features.py
Original file line number Diff line number Diff line change
Expand Up @@ -1160,7 +1160,7 @@ def _load_names_from_file(names_filepath):
return [name.strip() for name in f.read().split("\n") if name.strip()] # Filter empty names


def Sequence(feature, length=-1):
class Sequence:
"""
A `Sequence` is a utility that automatically converts internal dictionary feature into a dictionary of
lists. This behavior is implemented to have a compatibility layer with the TensorFlow Datasets library but may be
Expand All @@ -1179,14 +1179,18 @@ def Sequence(feature, length=-1):
which are converted to `dict` of lists of sub-features for compatibility with TFDS.

"""
if isinstance(feature, dict):
return {key: List(value, length=length) for key, value in feature.items()}
else:
return List(feature, length=length)

def __new__(cls, feature=None, length=-1, **kwargs):
# useful to still get isinstance(Sequence(Value("int64")), Sequence)
if isinstance(feature, dict):
out = {key: List(value, length=length, **kwargs) for key, value in feature.items()}
else:
out = super().__new__(List)
return out


@dataclass(repr=False)
class List:
class List(Sequence):
"""Feature type for list data composed of child feature data type.

It is backed by `pyarrow.ListType`, which uses 32-bit offsets or a fixed length.
Expand Down
4 changes: 2 additions & 2 deletions src/datasets/info.py
Original file line number Diff line number Diff line change
Expand Up @@ -271,7 +271,7 @@ def from_directory(cls, dataset_info_dir: str, storage_options: Optional[dict] =
"""
fs: fsspec.AbstractFileSystem
fs, *_ = url_to_fs(dataset_info_dir, **(storage_options or {}))
logger.info(f"Loading Dataset info from {dataset_info_dir}")
logger.debug(f"Loading Dataset info from {dataset_info_dir}")
if not dataset_info_dir:
raise ValueError("Calling DatasetInfo.from_directory() with undefined dataset_info_dir.")
with fs.open(posixpath.join(dataset_info_dir, config.DATASET_INFO_FILENAME), "r", encoding="utf-8") as f:
Expand Down Expand Up @@ -352,7 +352,7 @@ def write_to_directory(self, dataset_infos_dir, overwrite=False, pretty_print=Fa

@classmethod
def from_directory(cls, dataset_infos_dir) -> "DatasetInfosDict":
logger.info(f"Loading Dataset Infos from {dataset_infos_dir}")
logger.debug(f"Loading Dataset Infos from {dataset_infos_dir}")
# Load the info from the YAML part of README.md
if os.path.exists(os.path.join(dataset_infos_dir, config.REPOCARD_FILENAME)):
dataset_card_data = DatasetCard.load(Path(dataset_infos_dir) / config.REPOCARD_FILENAME).data
Expand Down
Loading