From 7ebcdb10da461f82108b5ae971446b787fd03425 Mon Sep 17 00:00:00 2001 From: Quentin Lhoest Date: Mon, 14 Nov 2022 12:40:39 +0100 Subject: [PATCH 1/7] correctly pass the gen_kwargs in Generator builder --- src/datasets/packaged_modules/generator/generator.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/datasets/packaged_modules/generator/generator.py b/src/datasets/packaged_modules/generator/generator.py index c9b6711b737..1efa721b159 100644 --- a/src/datasets/packaged_modules/generator/generator.py +++ b/src/datasets/packaged_modules/generator/generator.py @@ -24,8 +24,8 @@ def _info(self): return datasets.DatasetInfo(features=self.config.features) def _split_generators(self, dl_manager): - return [datasets.SplitGenerator(name=datasets.Split.TRAIN, gen_kwargs={})] + return [datasets.SplitGenerator(name=datasets.Split.TRAIN, gen_kwargs=self.config.gen_kwargs)] - def _generate_examples(self): - for idx, ex in enumerate(self.config.generator(**self.config.gen_kwargs)): + def _generate_examples(self, **gen_kwargs): + for idx, ex in enumerate(self.config.generator(**gen_kwargs)): yield idx, ex From dc3cce96c101c5e60c9d78344dd42cde3721dc15 Mon Sep 17 00:00:00 2001 From: Quentin Lhoest Date: Mon, 14 Nov 2022 12:40:43 +0100 Subject: [PATCH 2/7] docs --- src/datasets/iterable_dataset.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/src/datasets/iterable_dataset.py b/src/datasets/iterable_dataset.py index 37a0db769db..0a17ad38009 100644 --- a/src/datasets/iterable_dataset.py +++ b/src/datasets/iterable_dataset.py @@ -764,6 +764,8 @@ def from_generator( generator (:obj:`Callable`): A generator function that `yields` examples. features (:class:`Features`, optional): Dataset features. gen_kwargs(:obj:`dict`, optional): Keyword arguments to be passed to the `generator` callable. + You can define a sharded iterable dataset by passing the list of shards in `gen_kwargs`. 
+ This can be used to improve shuffling and when iterating over the dataset with multiple workers. Returns: :class:`IterableDataset` @@ -777,6 +779,20 @@ def from_generator( ... >>> ds = IterableDataset.from_generator(gen) ``` + + ```py + >>> def gen(shards): + ... for shard in shards: + ... with open(shard) as f: + ... for line in f: + ... yield {"line": line} + ... + >>> shards = [f"data{i}.txt" for i in range(32)] + >>> ds = IterableDataset.from_generator(gen, gen_kwargs={"shards": shards}) + >>> ds = ds.shuffle(seed=42, buffer_size=10_000) # shuffles the shards order + uses a shuffle buffer + >>> from torch.utils.data import DataLoader + >>> dataloader = .DataLoader(ds.with_format("torch"), num_workers=4) # give each worker a subset of 32/4=8 shards + ``` """ from .io.generator import GeneratorDatasetInputStream From 70d4dc6c75372e713db3985cf9f46f8da86d39a8 Mon Sep 17 00:00:00 2001 From: Quentin Lhoest Date: Mon, 14 Nov 2022 12:40:46 +0100 Subject: [PATCH 3/7] tests --- tests/test_iterable_dataset.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/tests/test_iterable_dataset.py b/tests/test_iterable_dataset.py index 6c25e747630..82708a9308f 100644 --- a/tests/test_iterable_dataset.py +++ b/tests/test_iterable_dataset.py @@ -592,6 +592,19 @@ def gen(): assert list(dataset) == data +def test_iterable_dataset_from_generator_with_shards(): + + def gen(shard_names): + for shard_name in shard_names: + for i in range(10): + yield {"shard_name": shard_name, "i": i} + + shard_names = [f"data{shard_idx}.txt" for shard_idx in range(4)] + dataset = IterableDataset.from_generator(gen, gen_kwargs={"shard_names": shard_names}) + assert isinstance(dataset, IterableDataset) + assert dataset.n_shards == len(shard_names) + + @require_torch def test_iterable_dataset_factory_torch_integration(): import torch From 7ee40e94a1a8d20976789cbde11ccdc68771cc5b Mon Sep 17 00:00:00 2001 From: Quentin Lhoest Date: Mon, 14 Nov 2022 12:41:01 +0100 Subject: [PATCH 4/7] style 
--- tests/test_iterable_dataset.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/test_iterable_dataset.py b/tests/test_iterable_dataset.py index 82708a9308f..12c6065a90a 100644 --- a/tests/test_iterable_dataset.py +++ b/tests/test_iterable_dataset.py @@ -593,7 +593,6 @@ def gen(): def test_iterable_dataset_from_generator_with_shards(): - def gen(shard_names): for shard_name in shard_names: for i in range(10): From 0844b23ec59c0bdeba1132fc5a13e5b4acca3007 Mon Sep 17 00:00:00 2001 From: Quentin Lhoest Date: Mon, 14 Nov 2022 14:45:05 +0100 Subject: [PATCH 5/7] more docs --- docs/source/loading.mdx | 16 ++++++++++++++++ docs/source/package_reference/main_classes.mdx | 5 +++-- src/datasets/arrow_dataset.py | 12 ++++++++++++ 3 files changed, 31 insertions(+), 2 deletions(-) diff --git a/docs/source/loading.mdx b/docs/source/loading.mdx index ba3b5ded659..2c02b504d41 100644 --- a/docs/source/loading.mdx +++ b/docs/source/loading.mdx @@ -277,6 +277,22 @@ Create a dataset from a Python generator with [`~Dataset.from_generator`]: This approach supports loading data larger than available memory. +You can also define a sharded dataset by passing lists to `gen_kwargs`: + +```py +>>> def gen(shards): +... for shard in shards: +... with open(shard) as f: +... for line in f: +... yield {"line": line} +... 
+>>> shards = [f"data{i}.txt" for i in range(32)] +>>> ds = Dataset.from_generator(gen, gen_kwargs={"shards": shards}) +>>> ds = ds.shuffle(seed=42, buffer_size=10_000) # shuffles the shards order + uses a shuffle buffer +>>> from torch.utils.data import DataLoader +>>> dataloader = .DataLoader(ds.with_format("torch"), num_workers=4) # give each worker a subset of 32/4=8 shards +``` + ### Pandas DataFrame Load Pandas DataFrames with [`~Dataset.from_pandas`]: diff --git a/docs/source/package_reference/main_classes.mdx b/docs/source/package_reference/main_classes.mdx index 1b2b700ff71..d99dd9090b8 100644 --- a/docs/source/package_reference/main_classes.mdx +++ b/docs/source/package_reference/main_classes.mdx @@ -10,7 +10,7 @@ The base class [`Dataset`] implements a Dataset backed by an Apache Arrow table. [[autodoc]] datasets.Dataset - - add_column + - add_column - add_item - from_file - from_buffer @@ -151,7 +151,8 @@ It also has dataset transform methods like map or filter, to process all the spl The base class [`IterableDataset`] implements an iterable Dataset backed by python generators. [[autodoc]] datasets.IterableDataset - - remove_columns + - from_generator + - remove_columns - cast_column - cast - __iter__ diff --git a/src/datasets/arrow_dataset.py b/src/datasets/arrow_dataset.py index a916f402c16..9d33301070c 100644 --- a/src/datasets/arrow_dataset.py +++ b/src/datasets/arrow_dataset.py @@ -954,6 +954,7 @@ def from_generator( cache_dir (:obj:`str`, optional, default ``"~/.cache/huggingface/datasets"``): Directory to cache data. keep_in_memory (:obj:`bool`, default ``False``): Whether to copy the data in-memory. gen_kwargs(:obj:`dict`, optional): Keyword arguments to be passed to the `generator` callable. + You can define a sharded dataset by passing the list of shards in `gen_kwargs`. **kwargs (additional keyword arguments): Keyword arguments to be passed to :class:`GeneratorConfig`. Returns: @@ -968,6 +969,17 @@ def from_generator( ... 
>>> ds = Dataset.from_generator(gen) ``` + + ```py + >>> def gen(shards): + ... for shard in shards: + ... with open(shard) as f: + ... for line in f: + ... yield {"line": line} + ... + >>> shards = [f"data{i}.txt" for i in range(32)] + >>> ds = Dataset.from_generator(gen, gen_kwargs={"shards": shards}) + ``` """ from .io.generator import GeneratorDatasetInputStream From 91dfb5bac9bf9660df2c65584c0df596a76cdfa7 Mon Sep 17 00:00:00 2001 From: Quentin Lhoest Date: Mon, 14 Nov 2022 14:45:50 +0100 Subject: [PATCH 6/7] typo --- docs/source/loading.mdx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/loading.mdx b/docs/source/loading.mdx index 2c02b504d41..031011eb283 100644 --- a/docs/source/loading.mdx +++ b/docs/source/loading.mdx @@ -290,7 +290,7 @@ You can also define a sharded dataset by passing lists to `gen_kwargs`: >>> ds = Dataset.from_generator(gen, gen_kwargs={"shards": shards}) >>> ds = ds.shuffle(seed=42, buffer_size=10_000) # shuffles the shards order + uses a shuffle buffer >>> from torch.utils.data import DataLoader ->>> dataloader = .DataLoader(ds.with_format("torch"), num_workers=4) # give each worker a subset of 32/4=8 shards +>>> dataloader = DataLoader(ds.with_format("torch"), num_workers=4) # give each worker a subset of 32/4=8 shards ``` ### Pandas DataFrame From 4b6997a396482f47362f3ce08196f9c413b15f51 Mon Sep 17 00:00:00 2001 From: Quentin Lhoest Date: Mon, 14 Nov 2022 14:45:56 +0100 Subject: [PATCH 7/7] typo2 --- src/datasets/iterable_dataset.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/datasets/iterable_dataset.py b/src/datasets/iterable_dataset.py index 0a17ad38009..e2509573249 100644 --- a/src/datasets/iterable_dataset.py +++ b/src/datasets/iterable_dataset.py @@ -791,7 +791,7 @@ def from_generator( >>> ds = IterableDataset.from_generator(gen, gen_kwargs={"shards": shards}) >>> ds = ds.shuffle(seed=42, buffer_size=10_000) # shuffles the shards order + uses a shuffle buffer >>> from 
torch.utils.data import DataLoader - >>> dataloader = .DataLoader(ds.with_format("torch"), num_workers=4) # give each worker a subset of 32/4=8 shards + >>> dataloader = DataLoader(ds.with_format("torch"), num_workers=4) # give each worker a subset of 32/4=8 shards ``` """ from .io.generator import GeneratorDatasetInputStream