Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 1 addition & 2 deletions metrics/perplexity/perplexity.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,8 +71,7 @@
>>> perplexity = datasets.load_metric("perplexity")
>>> input_texts = datasets.load_dataset("wikitext",
... "wikitext-2-raw-v1",
... split="test")["text"][:50] # doctest:+ELLIPSIS
[...]
... split="test")["text"][:50]
>>> input_texts = [s for s in input_texts if s!='']
>>> results = perplexity.compute(model_id='gpt2',
... input_texts=input_texts) # doctest:+ELLIPSIS
Expand Down
17 changes: 7 additions & 10 deletions src/datasets/arrow_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -1486,7 +1486,6 @@ def save_to_disk(
disable=not logging.is_progress_bar_enabled(),
unit=" examples",
total=len(self),
leave=False,
desc=f"Saving the dataset ({shards_done}/{num_shards} shards)",
)
kwargs_per_job = (
Expand Down Expand Up @@ -3070,15 +3069,14 @@ def load_processed_shard_from_cache(shard_kwargs):
transformed_dataset = None
try:
transformed_dataset = load_processed_shard_from_cache(dataset_kwargs)
logger.warning(f"Loading cached processed dataset at {dataset_kwargs['cache_file_name']}")
logger.info(f"Loading cached processed dataset at {dataset_kwargs['cache_file_name']}")
except NonExistentDatasetError:
pass
if transformed_dataset is None:
with logging.tqdm(
disable=not logging.is_progress_bar_enabled(),
unit=" examples",
total=pbar_total,
leave=False,
desc=desc or "Map",
) as pbar:
for rank, done, content in Dataset._map_single(**dataset_kwargs):
Expand Down Expand Up @@ -3171,7 +3169,6 @@ def format_new_fingerprint(new_fingerprint: str, rank: int) -> str:
disable=not logging.is_progress_bar_enabled(),
unit=" examples",
total=pbar_total,
leave=False,
desc=(desc or "Map") + f" (num_proc={num_proc})",
) as pbar:
for rank, done, content in iflatmap_unordered(
Expand All @@ -3187,7 +3184,7 @@ def format_new_fingerprint(new_fingerprint: str, rank: int) -> str:
for kwargs in kwargs_per_job:
del kwargs["shard"]
else:
logger.warning(f"Loading cached processed dataset at {format_cache_file_name(cache_file_name, '*')}")
logger.info(f"Loading cached processed dataset at {format_cache_file_name(cache_file_name, '*')}")
assert (
None not in transformed_shards
), f"Failed to retrieve results from map: result list {transformed_shards} still contains None - at least one worker failed to return its results"
Expand Down Expand Up @@ -4086,7 +4083,7 @@ def sort(
# we create a unique hash from the function, current dataset file and the mapping args
indices_cache_file_name = self._get_cache_file_path(new_fingerprint)
if os.path.exists(indices_cache_file_name) and load_from_cache_file:
logger.warning(f"Loading cached sorted indices for dataset at {indices_cache_file_name}")
logger.info(f"Loading cached sorted indices for dataset at {indices_cache_file_name}")
return self._new_dataset_with_indices(
fingerprint=new_fingerprint, indices_cache_file_name=indices_cache_file_name
)
Expand Down Expand Up @@ -4228,7 +4225,7 @@ def shuffle(
# we create a unique hash from the function, current dataset file and the mapping args
indices_cache_file_name = self._get_cache_file_path(new_fingerprint)
if os.path.exists(indices_cache_file_name) and load_from_cache_file:
logger.warning(f"Loading cached shuffled indices for dataset at {indices_cache_file_name}")
logger.info(f"Loading cached shuffled indices for dataset at {indices_cache_file_name}")
return self._new_dataset_with_indices(
fingerprint=new_fingerprint, indices_cache_file_name=indices_cache_file_name
)
Expand Down Expand Up @@ -4459,7 +4456,7 @@ def train_test_split(
and os.path.exists(test_indices_cache_file_name)
and load_from_cache_file
):
logger.warning(
logger.info(
f"Loading cached split indices for dataset at {train_indices_cache_file_name} and {test_indices_cache_file_name}"
)
return DatasetDict(
Expand Down Expand Up @@ -5273,7 +5270,7 @@ def path_in_repo(_index, shard):
first_shard = next(shards_iter)
first_shard_path_in_repo = path_in_repo(0, first_shard)
if first_shard_path_in_repo in data_files and num_shards < len(data_files):
logger.warning("Resuming upload of the dataset shards.")
logger.info("Resuming upload of the dataset shards.")

uploaded_size = 0
shards_path_in_repo = []
Expand Down Expand Up @@ -5450,7 +5447,7 @@ def push_to_hub(
repo_info = None
# update the total info to dump from existing info
if repo_info is not None:
logger.warning("Updating downloaded metadata with the new split.")
logger.info("Updating downloaded metadata with the new split.")
if repo_info.splits and list(repo_info.splits) != [split]:
if self._info.features != repo_info.features:
raise ValueError(
Expand Down
20 changes: 9 additions & 11 deletions src/datasets/builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -410,7 +410,7 @@ def __init__(
self.info = DatasetInfo.from_directory(self._cache_dir)
else: # dir exists but no data, remove the empty dir as data aren't available anymore
logger.warning(
f"Old caching folder {self._cache_dir} for dataset {self.name} exists but no data were found. Removing it. "
f"Old caching folder {self._cache_dir} for dataset {self.name} exists but no data were found. Removing it."
)
os.rmdir(self._cache_dir)

Expand Down Expand Up @@ -490,7 +490,7 @@ def _create_builder_config(
if config_name is None and self.BUILDER_CONFIGS and not config_kwargs:
if self.DEFAULT_CONFIG_NAME is not None:
builder_config = self.builder_configs.get(self.DEFAULT_CONFIG_NAME)
logger.warning(f"No config specified, defaulting to: {self.name}/{builder_config.name}")
logger.info(f"No config specified, defaulting to: {self.name}/{builder_config.name}")
else:
if len(self.BUILDER_CONFIGS) > 1:
example_of_usage = f"load_dataset('{self.name}', '{self.BUILDER_CONFIGS[0].name}')"
Expand Down Expand Up @@ -843,7 +843,7 @@ def download_and_prepare(
path_join = os.path.join if is_local else posixpath.join
data_exists = self._fs.exists(path_join(self._output_dir, config.DATASET_INFO_FILENAME))
if data_exists and download_mode == DownloadMode.REUSE_DATASET_IF_EXISTS:
logger.warning(f"Found cached dataset {self.name} ({self._output_dir})")
logger.info(f"Found cached dataset {self.name} ({self._output_dir})")
# We need to update the info in case some splits were added in the meantime
# for example when calling load_dataset from multiple workers.
self.info = self._load_info()
Expand Down Expand Up @@ -882,15 +882,15 @@ def incomplete_dir(dirname):
# information needed to cancel download/preparation if needed.
# This comes right before the progress bar.
if self.info.size_in_bytes:
print(
logger.info(
f"Downloading and preparing dataset {self.info.builder_name}/{self.info.config_name} "
f"(download: {size_str(self.info.download_size)}, generated: {size_str(self.info.dataset_size)}, "
f"post-processed: {size_str(self.info.post_processing_size)}, "
f"total: {size_str(self.info.size_in_bytes)}) to {self._output_dir}..."
)
else:
_dest = self._fs._strip_protocol(self._output_dir) if is_local else self._output_dir
print(
logger.info(
f"Downloading and preparing dataset {self.info.builder_name}/{self.info.config_name} to {_dest}..."
)

Expand Down Expand Up @@ -933,7 +933,7 @@ def incomplete_dir(dirname):
# Download post processing resources
self.download_post_processing_resources(dl_manager)

print(
logger.info(
f"Dataset {self.name} downloaded and prepared to {self._output_dir}. "
f"Subsequent calls will reuse this data."
)
Expand Down Expand Up @@ -1162,7 +1162,7 @@ def as_dataset(
),
split,
map_tuple=True,
disable_tqdm=not logging.is_progress_bar_enabled(),
disable_tqdm=True,
)
if isinstance(datasets, dict):
datasets = DatasetDict(datasets)
Expand Down Expand Up @@ -1490,7 +1490,7 @@ def _prepare_split(
)
num_proc = 1
elif num_proc is not None and num_input_shards < num_proc:
logger.info(
logger.warning(
f"Setting num_proc from {num_proc} to {num_input_shards} for the {split_info.name} split as it only contains {num_input_shards} shards."
)
num_proc = num_input_shards
Expand All @@ -1499,7 +1499,6 @@ def _prepare_split(
disable=not logging.is_progress_bar_enabled(),
unit=" examples",
total=split_info.num_examples,
leave=False,
desc=f"Generating {split_info.name} split",
)

Expand Down Expand Up @@ -1751,7 +1750,7 @@ def _prepare_split(
)
num_proc = 1
elif num_proc is not None and num_input_shards < num_proc:
logger.info(
logger.warning(
f"Setting num_proc from {num_proc} to {num_input_shards} for the {split_info.name} split as it only contains {num_input_shards} shards."
)
num_proc = num_input_shards
Expand All @@ -1760,7 +1759,6 @@ def _prepare_split(
disable=not logging.is_progress_bar_enabled(),
unit=" examples",
total=split_info.num_examples,
leave=False,
desc=f"Generating {split_info.name} split",
)

Expand Down
2 changes: 1 addition & 1 deletion src/datasets/dataset_dict.py
Original file line number Diff line number Diff line change
Expand Up @@ -1628,7 +1628,7 @@ def push_to_hub(
raise ValueError(f"Split name should match '{_split_re}' but got '{split}'.")

for split in self.keys():
logger.warning(f"Pushing split {split} to the Hub.")
logger.info(f"Pushing split {split} to the Hub.")
# The split=key needs to be removed before merging
repo_id, split, uploaded_size, dataset_nbytes, _, _ = self[split]._push_parquet_shards_to_hub(
repo_id,
Expand Down
6 changes: 3 additions & 3 deletions src/datasets/iterable_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -1233,7 +1233,7 @@ def _iter_pytorch(self):
f"Too many dataloader workers: {worker_info.num_workers} (max is dataset.n_shards={ex_iterable.n_shards}). "
f"Stopping {worker_info.num_workers - ex_iterable.n_shards} dataloader workers."
)
logger.warning(
logger.info(
f"To parallelize data loading, we give each process some shards (or data sources) to process. "
f"Therefore it's unnecessary to have a number of workers greater than dataset.n_shards={ex_iterable.n_shards}. "
f"To enable more parallelism, please split the dataset in more files than {ex_iterable.n_shards}."
Expand Down Expand Up @@ -1304,13 +1304,13 @@ def _prepare_ex_iterable_for_iteration(self) -> _BaseExamplesIterable:
if self._is_main_process():
n_shards_per_node = ex_iterable.n_shards // world_size
plural = "s" if n_shards_per_node > 1 else ""
logger.warning(
logger.info(
f"Assigning {n_shards_per_node} shard{plural} (or data source{plural}) of the dataset to each node."
)
ex_iterable = ex_iterable.shard_data_sources(rank, world_size)
else:
if self._is_main_process():
logger.warning(
logger.info(
f"Assigning 1 out of {world_size} examples of the dataset to each node. The others are skipped during the iteration."
)
logger.info(
Expand Down
1 change: 1 addition & 0 deletions src/datasets/utils/logging.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,7 @@ def _get_library_root_logger() -> logging.Logger:
def _configure_library_root_logger() -> None:
# Apply our default configuration to the library root logger.
library_root_logger = _get_library_root_logger()
library_root_logger.addHandler(logging.StreamHandler())
library_root_logger.setLevel(_get_default_logging_level())


Expand Down
14 changes: 7 additions & 7 deletions tests/test_arrow_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@
Summarization,
TextClassification,
)
from datasets.utils.logging import WARNING
from datasets.utils.logging import INFO, get_logger
from datasets.utils.py_utils import temp_seed

from .utils import (
Expand Down Expand Up @@ -1320,7 +1320,7 @@ def test_map_fn_kwargs(self, in_memory):
def test_map_caching(self, in_memory):
with tempfile.TemporaryDirectory() as tmp_dir:
self._caplog.clear()
with self._caplog.at_level(WARNING):
with self._caplog.at_level(INFO, logger=get_logger().name):
with self._create_dummy_dataset(in_memory, tmp_dir) as dset:
with patch(
"datasets.arrow_dataset.Dataset._map_single",
Expand All @@ -1338,7 +1338,7 @@ def test_map_caching(self, in_memory):

with tempfile.TemporaryDirectory() as tmp_dir:
self._caplog.clear()
with self._caplog.at_level(WARNING):
with self._caplog.at_level(INFO, logger=get_logger().name):
with self._create_dummy_dataset(in_memory, tmp_dir) as dset:
with dset.map(lambda x: {"foo": "bar"}) as dset_test1:
dset_test1_data_files = list(dset_test1.cache_files)
Expand All @@ -1349,7 +1349,7 @@ def test_map_caching(self, in_memory):

with tempfile.TemporaryDirectory() as tmp_dir:
self._caplog.clear()
with self._caplog.at_level(WARNING):
with self._caplog.at_level(INFO, logger=get_logger().name):
with self._create_dummy_dataset(in_memory, tmp_dir) as dset:
with patch(
"datasets.arrow_dataset.Pool",
Expand All @@ -1369,7 +1369,7 @@ def test_map_caching(self, in_memory):

with tempfile.TemporaryDirectory() as tmp_dir:
self._caplog.clear()
with self._caplog.at_level(WARNING):
with self._caplog.at_level(INFO, logger=get_logger().name):
with self._create_dummy_dataset(in_memory, tmp_dir) as dset:
with dset.map(lambda x: {"foo": "bar"}, num_proc=2) as dset_test1:
dset_test1_data_files = list(dset_test1.cache_files)
Expand All @@ -1382,7 +1382,7 @@ def test_map_caching(self, in_memory):
try:
self._caplog.clear()
with tempfile.TemporaryDirectory() as tmp_dir:
with self._caplog.at_level(WARNING):
with self._caplog.at_level(INFO, logger=get_logger().name):
with self._create_dummy_dataset(in_memory, tmp_dir) as dset:
datasets.disable_caching()
with dset.map(lambda x: {"foo": "bar"}) as dset_test1:
Expand Down Expand Up @@ -1733,7 +1733,7 @@ def test_filter_multiprocessing(self, in_memory):
def test_filter_caching(self, in_memory):
with tempfile.TemporaryDirectory() as tmp_dir:
self._caplog.clear()
with self._caplog.at_level(WARNING):
with self._caplog.at_level(INFO, logger=get_logger().name):
with self._create_dummy_dataset(in_memory, tmp_dir) as dset:
with dset.filter(lambda x, i: i < 5, with_indices=True) as dset_filter_first_five1:
dset_test1_data_files = list(dset_filter_first_five1.cache_files)
Expand Down
4 changes: 3 additions & 1 deletion tests/test_builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
from datasets.streaming import xjoin
from datasets.utils.file_utils import is_local_path
from datasets.utils.info_utils import VerificationMode
from datasets.utils.logging import INFO, get_logger

from .utils import (
assert_arrow_memory_doesnt_increase,
Expand Down Expand Up @@ -1060,7 +1061,8 @@ def test_builder_with_filesystem_download_and_prepare_reload(tmp_path, mockfs, c
DatasetInfo().write_to_directory("mock://my_dataset", storage_options=mockfs.storage_options)
mockfs.touch(f"my_dataset/{builder.name}-train.arrow")
caplog.clear()
builder.download_and_prepare("mock://my_dataset", storage_options=mockfs.storage_options)
with caplog.at_level(INFO, logger=get_logger().name):
builder.download_and_prepare("mock://my_dataset", storage_options=mockfs.storage_options)
assert "Found cached dataset" in caplog.text


Expand Down
4 changes: 3 additions & 1 deletion tests/test_load.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@
infer_module_for_data_files,
infer_module_for_data_files_in_archives,
)
from datasets.utils.logging import INFO, get_logger

from .utils import (
OfflineSimulationMode,
Expand Down Expand Up @@ -965,7 +966,8 @@ def test_load_dataset_then_move_then_reload(dataset_loading_script_dir, data_dir
del dataset
os.rename(cache_dir1, cache_dir2)
caplog.clear()
dataset = load_dataset(dataset_loading_script_dir, data_dir=data_dir, split="train", cache_dir=cache_dir2)
with caplog.at_level(INFO, logger=get_logger().name):
dataset = load_dataset(dataset_loading_script_dir, data_dir=data_dir, split="train", cache_dir=cache_dir2)
assert "Found cached dataset" in caplog.text
assert dataset._fingerprint == fingerprint1, "for the caching mechanism to work, fingerprint should stay the same"
dataset = load_dataset(dataset_loading_script_dir, data_dir=data_dir, split="test", cache_dir=cache_dir2)
Expand Down