
Commit eeb5db2

dimapihtar authored and monica-sekoyan committed
add mmap_bin_files param (#14122)
Signed-off-by: dimapihtar <[email protected]>
1 parent 3581ab2 commit eeb5db2

1 file changed: +3 −0 lines

nemo/collections/llm/gpt/data/pre_training.py

Lines changed: 3 additions & 0 deletions
@@ -163,6 +163,7 @@ class PreTrainingDataModule(pl.LightningDataModule, IOMixin):
         init_consumed_samples: (Optional[int]): Number of samples already consumed at initialization.
         init_global_step: (Optional[int]): Starting global training step count, used for resuming training.
         output_log: (Optional[bool]): Whether to print logging/debug output during sampling.
+        mmap_bin_files: (Optional[bool]): Whether to mmap the .bin files or use file pointers.
         object_storage_cache_path: (Optional[str]): Path for caching indices for s3 or msc dataloading.
     """

@@ -193,6 +194,7 @@ def __init__(
         init_global_step: Optional[int] = 0,
         output_log: Optional[bool] = True,
         dataset_cls: Type[MegatronDataset] = GPTDataset,
+        mmap_bin_files: Optional[bool] = True,
         object_storage_cache_path: Optional[str] = None,
     ) -> None:
         super().__init__()
@@ -206,6 +208,7 @@ def __init__(
         validate_dataset_asset_accessibility(paths)

         build_kwargs = {}
+        build_kwargs["mmap_bin_files"] = mmap_bin_files
         if isinstance(paths, dict):
             if split is not None:
                 warnings.warn(
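For context, a minimal usage sketch of the new flag. Only mmap_bin_files is introduced by this commit; the paths value and the seq_length, micro_batch_size, and global_batch_size arguments below are illustrative assumptions, not part of this diff.

from nemo.collections.llm.gpt.data.pre_training import PreTrainingDataModule

# Disable mmap and read the .bin shards through file pointers instead,
# e.g. when memory-mapping performs poorly on the target filesystem.
data = PreTrainingDataModule(
    paths=["/data/my_corpus_text_document"],  # hypothetical .bin/.idx prefix
    seq_length=2048,             # assumed constructor args, not shown in this diff
    micro_batch_size=1,
    global_batch_size=8,
    mmap_bin_files=False,        # new parameter added by this commit (default: True)
)

As the third hunk shows, the flag is forwarded to the underlying dataset build through build_kwargs["mmap_bin_files"].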

Comments (0)