Skip to content

Commit cc84e85

Browse files
TextConfig: added "errors"
1 parent f09f781 commit cc84e85

File tree

1 file changed

+4
-1
lines changed
  • src/datasets/packaged_modules/text

1 file changed

+4
-1
lines changed

src/datasets/packaged_modules/text/text.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@ class TextConfig(datasets.BuilderConfig):
19 19

20 20
features: Optional[datasets.Features] = None
21 21
encoding: str = "utf-8"
22 +
errors: str = "strict"
22 23
chunksize: int = 10 << 20 # 10MB
23 24
keep_linebreaks: bool = False
24 25
sample_by: str = "line"
@@ -70,7 +71,9 @@ def _generate_tables(self, files):
70 71
pa_table_names = list(self.config.features) if self.config.features is not None else ["text"]
71 72
for file_idx, file in enumerate(itertools.chain.from_iterable(files)):
72 73
# open in text mode, by default translates universal newlines ("\n", "\r\n" and "\r") into "\n"
73 -
with open(file, encoding=self.config.encoding) as f:
74 +
with open(file,
75 +
encoding=self.config.encoding,
76 +
errors=self.config.errors) as f:
74 77
if self.config.sample_by == "line":
75 78
batch_idx = 0
76 79
while True:

0 commit comments

Comments
 (0)