diff --git a/src/datasets/packaged_modules/text/text.py b/src/datasets/packaged_modules/text/text.py
index 6fb1e0f3650..6921ab8617b 100644
--- a/src/datasets/packaged_modules/text/text.py
+++ b/src/datasets/packaged_modules/text/text.py
@@ -1,4 +1,5 @@
 import itertools
+import warnings
 from dataclasses import dataclass
 from io import StringIO
 from typing import Optional
@@ -19,11 +20,22 @@ class TextConfig(datasets.BuilderConfig):
 
     features: Optional[datasets.Features] = None
     encoding: str = "utf-8"
-    errors: Optional[str] = None
+    errors: Optional[str] = "deprecated"  # sentinel default; any other value means the legacy argument was passed
+    encoding_errors: Optional[str] = None
     chunksize: int = 10 << 20  # 10MB
     keep_linebreaks: bool = False
     sample_by: str = "line"
 
+    def __post_init__(self):
+        """Warn when the deprecated `errors` argument is used and copy its value into `encoding_errors`."""
+        if self.errors != "deprecated":
+            warnings.warn(
+                "'errors' was deprecated in favor of 'encoding_errors' in version 2.14.0 and will be removed in 3.0.0.\n"
+                f"You can remove this warning by passing 'encoding_errors={self.errors}' instead.",
+                FutureWarning,
+            )
+            self.encoding_errors = self.errors
+
 
 class Text(datasets.ArrowBasedBuilder):
     BUILDER_CONFIG_CLASS = TextConfig
@@ -71,7 +83,7 @@ def _generate_tables(self, files):
         pa_table_names = list(self.config.features) if self.config.features is not None else ["text"]
         for file_idx, file in enumerate(itertools.chain.from_iterable(files)):
             # open in text mode, by default translates universal newlines ("\n", "\r\n" and "\r") into "\n"
-            with open(file, encoding=self.config.encoding, errors=self.config.errors) as f:
+            with open(file, encoding=self.config.encoding, errors=self.config.encoding_errors) as f:
                 if self.config.sample_by == "line":
                     batch_idx = 0
                     while True: