Commit a2576b8

Reduce default max writer_batch_size (#5163)
1 parent 18d0a21 commit a2576b8

File tree

2 files changed: +3 -3 lines changed

src/datasets/arrow_dataset.py

Lines changed: 2 additions & 2 deletions
@@ -1617,11 +1617,11 @@ def flatten(self, new_fingerprint: Optional[str] = None, max_depth=16) -> "Dataset":
     def cast(
         self,
         features: Features,
-        batch_size: Optional[int] = 10_000,
+        batch_size: Optional[int] = 1000,
         keep_in_memory: bool = False,
         load_from_cache_file: bool = True,
         cache_file_name: Optional[str] = None,
-        writer_batch_size: Optional[int] = 10_000,
+        writer_batch_size: Optional[int] = 1000,
         num_proc: Optional[int] = None,
     ) -> "Dataset":
         """

src/datasets/config.py

Lines changed: 1 addition & 1 deletion
@@ -169,7 +169,7 @@
 
 # Batch size constants. For more info, see:
 # https://github.com/apache/arrow/blob/master/docs/source/cpp/arrays.rst#size-limitations-and-recommendations)
-DEFAULT_MAX_BATCH_SIZE = 10_000
+DEFAULT_MAX_BATCH_SIZE = 1000
 
 # Size of the preloaded record batch in `Dataset.__iter__`
 ARROW_READER_BATCH_SIZE_IN_DATASET_ITER = 10
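
For reference, a hedged sketch of how downstream code can check the constant changed above; the module path datasets.config follows from the file location, and the printed values are just the before/after defaults shown in this diff.

# Sketch only: inspects the constant changed in this commit.
from datasets import config

print(config.DEFAULT_MAX_BATCH_SIZE)  # 1000 after this commit (was 10_000)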
