diff --git a/src/datasets/packaged_modules/text/text.py b/src/datasets/packaged_modules/text/text.py index 74bc48d0d13..dadf9848195 100644 --- a/src/datasets/packaged_modules/text/text.py +++ b/src/datasets/packaged_modules/text/text.py @@ -19,6 +19,7 @@ class TextConfig(datasets.BuilderConfig): features: Optional[datasets.Features] = None encoding: str = "utf-8" + errors: Optional[str] = None chunksize: int = 10 << 20 # 10MB keep_linebreaks: bool = False sample_by: str = "line" @@ -70,7 +71,7 @@ def _generate_tables(self, files): pa_table_names = list(self.config.features) if self.config.features is not None else ["text"] for file_idx, file in enumerate(itertools.chain.from_iterable(files)): # open in text mode, by default translates universal newlines ("\n", "\r\n" and "\r") into "\n" - with open(file, encoding=self.config.encoding) as f: + with open(file, encoding=self.config.encoding, errors=self.config.errors) as f: if self.config.sample_by == "line": batch_idx = 0 while True: