Commit ba4d30c

Module namespace cleanup for v2.0 (#3875)
* Imports cleaning
* Small change
* Remove unused methods
* Small fix
* Additional fix
* Final fix
* Fix benchmark test
* Fix benchmark test #2
1 parent fcb2ee2 commit ba4d30c

36 files changed

Lines changed: 286 additions & 362 deletions

benchmarks/utils.py

Lines changed: 3 additions & 2 deletions
```diff
@@ -3,7 +3,8 @@
 import numpy as np
 
 import datasets
-from datasets.features import _ArrayXD
+from datasets.arrow_writer import ArrowWriter
+from datasets.features.features import _ArrayXD
 
 
 def get_duration(func):
@@ -46,7 +47,7 @@ def generate_examples(features: dict, num_examples=100, seq_shapes=None):
 def generate_example_dataset(dataset_path, features, num_examples=100, seq_shapes=None):
     dummy_data = generate_examples(features, num_examples=num_examples, seq_shapes=seq_shapes)
 
-    with datasets.ArrowWriter(features=features, path=dataset_path) as writer:
+    with ArrowWriter(features=features, path=dataset_path) as writer:
         for key, record in dummy_data:
             example = features.encode_example(record)
             writer.write(example)
```
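For code that consumed the old top-level alias, the practical effect is that `ArrowWriter` must now be imported from its defining module, exactly as the benchmark does above. A minimal sketch of the new import path (the file path and feature spec here are illustrative, not from the commit):

```python
# Sketch: writing one example with ArrowWriter after the cleanup.
# `datasets.ArrowWriter` is gone from the top-level namespace.
from datasets import Features, Value
from datasets.arrow_writer import ArrowWriter

features = Features({"text": Value("string")})
with ArrowWriter(features=features, path="/tmp/example.arrow") as writer:
    # encode_example converts a raw record into the writable form,
    # mirroring the benchmark loop above.
    writer.write(features.encode_example({"text": "hello"}))
```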

metrics/perplexity/perplexity.py

Lines changed: 2 additions & 2 deletions
```diff
@@ -17,7 +17,7 @@
 from transformers import AutoModelForCausalLM, AutoTokenizer
 
 import datasets
-from datasets.utils import tqdm
+from datasets import utils
 
 
 _CITATION = """\
@@ -113,7 +113,7 @@ def _compute(self, input_texts, model_id, stride=512, device=None):
 
         ppls = []
 
-        for text_index in tqdm(range(0, len(encoded_texts))):
+        for text_index in utils.tqdm_utils.tqdm(range(0, len(encoded_texts))):
             encoded_text = encoded_texts[text_index]
             special_tokens_mask = special_tokens_masks[text_index]
```
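The same relocation shows up throughout the commit: the library's tqdm wrapper is now addressed as `utils.tqdm_utils.tqdm` rather than being re-exported from `datasets.utils`. A minimal sketch of the pattern, assuming only what the hunk above shows:

```python
# Sketch: the wrapped tqdm is reached through the tqdm_utils submodule.
from datasets import utils

for text_index in utils.tqdm_utils.tqdm(range(100)):
    pass  # per-item work goes here
```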

src/datasets/__init__.py

Lines changed: 11 additions & 24 deletions
```diff
@@ -20,36 +20,26 @@
 __version__ = "1.18.5.dev0"
 
 import pyarrow
-from packaging import version as _version
-from pyarrow import total_allocated_bytes
+from packaging import version
 
 
-if _version.parse(pyarrow.__version__).major < 5:
+if version.parse(pyarrow.__version__).major < 5:
     raise ImportWarning(
         "To use `datasets`, the module `pyarrow>=5.0.0` is required, and the current version of `pyarrow` doesn't match this condition.\n"
         "If you are running this in a Google Colab, you should probably just restart the runtime to use the right version of `pyarrow`."
     )
 
+SCRIPTS_VERSION = "master" if version.parse(__version__).is_devrelease else __version__
+
+del pyarrow
+del version
+
 from .arrow_dataset import Dataset, concatenate_datasets
-from .arrow_reader import ArrowReader, ReadInstruction
-from .arrow_writer import ArrowWriter
+from .arrow_reader import ReadInstruction
 from .builder import ArrowBasedBuilder, BeamBasedBuilder, BuilderConfig, DatasetBuilder, GeneratorBasedBuilder
 from .combine import interleave_datasets
 from .dataset_dict import DatasetDict, IterableDatasetDict
-from .features import (
-    Array2D,
-    Array3D,
-    Array4D,
-    Array5D,
-    Audio,
-    ClassLabel,
-    Features,
-    Image,
-    Sequence,
-    Translation,
-    TranslationVariableLanguages,
-    Value,
-)
+from .features import *
 from .fingerprint import is_caching_enabled, set_caching_enabled
 from .info import DatasetInfo, MetricInfo
 from .inspect import (
@@ -63,8 +53,7 @@
     list_metrics,
 )
 from .iterable_dataset import IterableDataset
-from .keyhash import KeyHasher
-from .load import import_main_class, load_dataset, load_dataset_builder, load_from_disk, load_metric
+from .load import load_dataset, load_dataset_builder, load_from_disk, load_metric
 from .metric import Metric
 from .splits import (
     NamedSplit,
@@ -79,6 +68,4 @@
 )
 from .tasks import *
 from .utils import *
-
-
-SCRIPTS_VERSION = "master" if _version.parse(__version__).is_devrelease else __version__
+from .utils import logging
```
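The net effect of this hunk is a smaller top-level namespace: `ArrowWriter`, `KeyHasher`, `import_main_class`, and the `pyarrow`/`version` helper modules no longer leak out of `datasets`, while the public entry points are untouched and `SCRIPTS_VERSION` is computed before the helpers are deleted. A sketch of what this looks like from user code (attribute checks inferred from the diff above, not run against a build):

```python
# Sketch: public API unchanged, internals no longer exposed.
import datasets

datasets.load_dataset                 # still public
datasets.Dataset                      # still public
hasattr(datasets, "pyarrow")          # False: deleted after the version check
hasattr(datasets, "ArrowWriter")      # False: dropped from the top level
from datasets.arrow_writer import ArrowWriter  # module path still works
```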

src/datasets/arrow_dataset.py

Lines changed: 5 additions & 16 deletions
```diff
@@ -58,19 +58,8 @@
 from . import config, utils
 from .arrow_reader import ArrowReader
 from .arrow_writer import ArrowWriter, OptimizedTypedSequence
-from .features import (
-    Audio,
-    ClassLabel,
-    Features,
-    FeatureType,
-    Image,
-    Sequence,
-    Value,
-    _ArrayXD,
-    decode_nested_example,
-    pandas_types_mapper,
-    require_decoding,
-)
+from .features import Audio, ClassLabel, Features, Image, Sequence, Value
+from .features.features import FeatureType, _ArrayXD, decode_nested_example, pandas_types_mapper, require_decoding
 from .filesystems import extract_path_from_uri, is_remote_filesystem
 from .fingerprint import (
     fingerprint_transform,
@@ -2316,7 +2305,7 @@ def init_buffer_and_writer():
         pbar_total = (num_rows // batch_size) + 1 if num_rows % batch_size else num_rows // batch_size
         pbar_unit = "ex" if not batched else "ba"
         pbar_desc = (desc + " " if desc is not None else "") + "#" + str(rank) if rank is not None else desc
-        pbar = utils.tqdm(
+        pbar = utils.tqdm_utils.tqdm(
             pbar_iterable,
             total=pbar_total,
             disable=disable_tqdm,
@@ -3466,7 +3455,7 @@ def delete_file(file):
             api.delete_file(file, repo_id=repo_id, token=token, repo_type="dataset", revision=branch)
 
         if len(file_shards_to_delete):
-            for file in utils.tqdm(
+            for file in utils.tqdm_utils.tqdm(
                 file_shards_to_delete,
                 desc="Deleting unused files from dataset repository",
                 total=len(file_shards_to_delete),
@@ -3475,7 +3464,7 @@ def delete_file(file):
                 delete_file(file)
 
         uploaded_size = 0
-        for index, shard in utils.tqdm(
+        for index, shard in utils.tqdm_utils.tqdm(
            enumerate(shards),
            desc="Pushing dataset shards to the dataset hub",
            total=num_shards,
```
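The first hunk encodes the commit's central convention: public feature classes are imported from the `datasets.features` package, while private names (`_ArrayXD`, `decode_nested_example`, `pandas_types_mapper`, `require_decoding`) come from the `datasets.features.features` implementation module. A short illustrative sketch of the two styles:

```python
# Sketch: public surface vs. implementation module after the split.
from datasets.features import ClassLabel, Features, Value        # public API
from datasets.features.features import pandas_types_mapper       # internal helper

features = Features({"label": ClassLabel(names=["neg", "pos"])})
```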

src/datasets/arrow_reader.py

Lines changed: 2 additions & 3 deletions
```diff
@@ -26,11 +26,10 @@
 import pyarrow as pa
 import pyarrow.parquet as pq
 
-from datasets.utils.file_utils import DownloadConfig
-
 from .naming import _split_re, filename_for_dataset_split
 from .table import InMemoryTable, MemoryMappedTable, Table, concat_tables
-from .utils import cached_path, logging
+from .utils import logging
+from .utils.file_utils import DownloadConfig, cached_path
 
 
 if TYPE_CHECKING:
```

src/datasets/arrow_writer.py

Lines changed: 5 additions & 7 deletions
```diff
@@ -22,12 +22,10 @@
 import numpy as np
 import pyarrow as pa
 
-from datasets.features.features import FeatureType, Value
-
 from . import config, utils
-from .features import (
-    Features,
-    Image,
+from .features import Features, Image, Value
+from .features.features import (
+    FeatureType,
     _ArrayXDExtensionType,
     cast_to_python_objects,
     generate_from_arrow_type,
@@ -641,9 +639,9 @@ def parquet_to_arrow(sources, destination):
     stream = None if isinstance(destination, str) else destination
     disable = not utils.is_progress_bar_enabled()
     with ArrowWriter(path=destination, stream=stream) as writer:
-        for source in utils.tqdm(sources, unit="sources", disable=disable):
+        for source in utils.tqdm_utils.tqdm(sources, unit="sources", disable=disable):
            pf = pa.parquet.ParquetFile(source)
-            for i in utils.tqdm(range(pf.num_row_groups), unit="row_groups", leave=False, disable=disable):
+            for i in utils.tqdm_utils.tqdm(range(pf.num_row_groups), unit="row_groups", leave=False, disable=disable):
                df = pf.read_row_group(i).to_pandas()
                for col in df.columns:
                    df[col] = df[col].apply(json.loads)
```

src/datasets/builder.py

Lines changed: 24 additions & 17 deletions
```diff
@@ -27,9 +27,6 @@
 from functools import partial
 from typing import Dict, Mapping, Optional, Tuple, Union
 
-from datasets.features import Features
-from datasets.utils.mock_download_manager import MockDownloadManager
-
 from . import config, utils
 from .arrow_dataset import Dataset
 from .arrow_reader import (
@@ -42,16 +39,26 @@
 from .arrow_writer import ArrowWriter, BeamWriter
 from .data_files import DataFilesDict, sanitize_patterns
 from .dataset_dict import DatasetDict, IterableDatasetDict
+from .features import Features
 from .fingerprint import Hasher
 from .info import DatasetInfo, DatasetInfosDict, PostProcessedInfo
 from .iterable_dataset import ExamplesIterable, IterableDataset, _generate_examples_from_tables_wrapper
 from .naming import camelcase_to_snakecase, filename_prefix_for_split
 from .splits import Split, SplitDict, SplitGenerator
 from .utils import logging
 from .utils.download_manager import DownloadManager, DownloadMode
-from .utils.file_utils import DownloadConfig, is_remote_url
+from .utils.file_utils import DownloadConfig, cached_path, is_remote_url
 from .utils.filelock import FileLock
 from .utils.info_utils import get_size_checksum_dict, verify_checksums, verify_splits
+from .utils.mock_download_manager import MockDownloadManager
+from .utils.py_utils import (
+    classproperty,
+    has_sufficient_disk_space,
+    map_nested,
+    memoize,
+    size_str,
+    temporary_assignment,
+)
 from .utils.streaming_download_manager import StreamingDownloadManager
 
 
@@ -389,9 +396,9 @@ def _create_builder_config(self, name=None, custom_features=None, **config_kwarg
 
         return builder_config, config_id
 
-    @utils.classproperty
+    @classproperty
     @classmethod
-    @utils.memoize()
+    @memoize()
     def builder_configs(cls):
         """Pre-defined list of configurations for this builder class."""
         configs = {config.name: config for config in cls.BUILDER_CONFIGS}
@@ -537,9 +544,9 @@ def download_and_prepare(
             return
         logger.info(f"Generating dataset {self.name} ({self._cache_dir})")
         if not is_remote_url(self._cache_dir_root):  # if cache dir is local, check for available space
-            if not utils.has_sufficient_disk_space(self.info.size_in_bytes or 0, directory=self._cache_dir_root):
+            if not has_sufficient_disk_space(self.info.size_in_bytes or 0, directory=self._cache_dir_root):
                 raise OSError(
-                    f"Not enough disk space. Needed: {utils.size_str(self.info.size_in_bytes or 0)} (download: {utils.size_str(self.info.download_size or 0)}, generated: {utils.size_str(self.info.dataset_size or 0)}, post-processed: {utils.size_str(self.info.post_processing_size or 0)})"
+                    f"Not enough disk space. Needed: {size_str(self.info.size_in_bytes or 0)} (download: {size_str(self.info.download_size or 0)}, generated: {size_str(self.info.dataset_size or 0)}, post-processed: {size_str(self.info.post_processing_size or 0)})"
                 )
 
         @contextlib.contextmanager
@@ -565,9 +572,9 @@ def incomplete_dir(dirname):
         if self.info.size_in_bytes:
             print(
                 f"Downloading and preparing dataset {self.info.builder_name}/{self.info.config_name} "
-                f"(download: {utils.size_str(self.info.download_size)}, generated: {utils.size_str(self.info.dataset_size)}, "
-                f"post-processed: {utils.size_str(self.info.post_processing_size)}, "
-                f"total: {utils.size_str(self.info.size_in_bytes)}) to {self._cache_dir}..."
+                f"(download: {size_str(self.info.download_size)}, generated: {size_str(self.info.dataset_size)}, "
+                f"post-processed: {size_str(self.info.post_processing_size)}, "
+                f"total: {size_str(self.info.size_in_bytes)}) to {self._cache_dir}..."
             )
         else:
             print(
@@ -580,7 +587,7 @@ def incomplete_dir(dirname):
         with incomplete_dir(self._cache_dir) as tmp_data_dir:
             # Temporarily assign _cache_dir to tmp_data_dir to avoid having to forward
             # it to every sub function.
-            with utils.temporary_assignment(self, "_cache_dir", tmp_data_dir):
+            with temporary_assignment(self, "_cache_dir", tmp_data_dir):
                 # Try to download the already prepared dataset files
                 downloaded_from_gcs = False
                 if try_from_hf_gcs:
@@ -637,7 +644,7 @@ def _download_prepared_from_hf_gcs(self, download_config: DownloadConfig):
         if os.sep in resource_file_name:
             raise ValueError(f"Resources shouldn't be in a sub-directory: {resource_file_name}")
         try:
-            resource_path = utils.cached_path(remote_cache_dir + "/" + resource_file_name)
+            resource_path = cached_path(remote_cache_dir + "/" + resource_file_name)
             shutil.move(resource_path, os.path.join(self._cache_dir, resource_file_name))
         except ConnectionError:
             logger.info(f"Couldn't download resourse file {resource_file_name} from Hf google storage.")
@@ -761,7 +768,7 @@ def as_dataset(
             split = {s: s for s in self.info.splits}
 
         # Create a dataset for each of the given splits
-        datasets = utils.map_nested(
+        datasets = map_nested(
             partial(
                 self._build_single_dataset,
                 run_post_process=run_post_process,
@@ -903,7 +910,7 @@ def as_streaming_dataset(
             raise ValueError(f"Bad split: {split}. Available splits: {list(splits_generators)}")
 
         # Create a dataset for each of the given splits
-        datasets = utils.map_nested(
+        datasets = map_nested(
             self._as_streaming_dataset_single,
             splits_generator,
             map_tuple=True,
@@ -1074,7 +1081,7 @@ def _prepare_split(self, split_generator, check_duplicate_keys):
             check_duplicates=check_duplicate_keys,
         ) as writer:
             try:
-                for key, record in utils.tqdm(
+                for key, record in utils.tqdm_utils.tqdm(
                     generator,
                     unit=" examples",
                     total=split_info.num_examples,
@@ -1138,7 +1145,7 @@ def _prepare_split(self, split_generator):
 
         generator = self._generate_tables(**split_generator.gen_kwargs)
         with ArrowWriter(features=self.info.features, path=fpath) as writer:
-            for key, table in utils.tqdm(
+            for key, table in utils.tqdm_utils.tqdm(
                 generator, unit=" tables", leave=False, disable=True  # not utils.is_progress_bar_enabled()
             ):
                 writer.write_table(table)
```
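builder.py shows the other half of the convention: utility helpers are imported directly from `datasets.utils.py_utils` instead of being reached through the `utils` facade (`utils.map_nested`, `utils.size_str`, and so on). A brief sketch of the direct style, with made-up sample values:

```python
# Sketch: helpers imported from their defining submodule.
from datasets.utils.py_utils import map_nested, size_str

# map_nested applies a function through nested dicts and lists.
doubled = map_nested(lambda x: x * 2, {"train": [1, 2], "test": [3]})
# size_str formats a byte count as a human-readable string.
label = size_str(10_000_000)
```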

src/datasets/commands/dummy_data.py

Lines changed: 1 addition & 1 deletion
```diff
@@ -11,10 +11,10 @@
 from datasets import config
 from datasets.commands import BaseDatasetsCLICommand
 from datasets.load import dataset_module_factory, import_main_class
-from datasets.utils import MockDownloadManager
 from datasets.utils.download_manager import DownloadManager
 from datasets.utils.file_utils import DownloadConfig
 from datasets.utils.logging import get_logger, set_verbosity_warning
+from datasets.utils.mock_download_manager import MockDownloadManager
 from datasets.utils.py_utils import map_nested
```

src/datasets/data_files.py

Lines changed: 1 addition & 2 deletions
```diff
@@ -7,8 +7,7 @@
 from fsspec.implementations.local import LocalFileSystem
 from tqdm.contrib.concurrent import thread_map
 
-from datasets.filesystems.hffilesystem import HfFileSystem
-
+from .filesystems.hffilesystem import HfFileSystem
 from .splits import Split
 from .utils import logging
 from .utils.file_utils import hf_hub_url, is_remote_url, request_etag
```

src/datasets/features/__init__.py

Lines changed: 17 additions & 9 deletions
```diff
@@ -1,12 +1,20 @@
 # flake8: noqa
+
+__all__ = [
+    "Audio",
+    "Array2D",
+    "Array3D",
+    "Array4D",
+    "Array5D",
+    "ClassLabel",
+    "Features",
+    "Sequence",
+    "Value",
+    "Image",
+    "Translation",
+    "TranslationVariableLanguages",
+]
 from .audio import Audio
-from .features import *
-from .features import (
-    _ArrayXD,
-    _ArrayXDExtensionType,
-    _arrow_to_datasets_dtype,
-    _cast_to_python_objects,
-    _is_zero_copy_only,
-)
-from .image import Image, objects_to_list_of_image_dicts
+from .features import Array2D, Array3D, Array4D, Array5D, ClassLabel, Features, Sequence, Value
+from .image import Image
 from .translation import Translation, TranslationVariableLanguages
```
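The new `__all__` is what makes the `from .features import *` added to the top-level `__init__.py` safe: the star-import now exposes exactly the public feature types, and the private helpers (`_ArrayXD`, `_ArrayXDExtensionType`, ...) stay in the implementation module. A sketch of the resulting behavior, assuming this commit's layout:

```python
# Sketch: __all__ gates what the star-import exposes.
from datasets.features import *

feats = Features({"img": Image(), "label": ClassLabel(names=["cat", "dog"])})

# Private helpers are no longer re-exported; import them explicitly:
from datasets.features.features import _ArrayXD
```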
