diff --git a/libmultilabel/linear/preprocessor.py b/libmultilabel/linear/preprocessor.py index 20ca920a..8bb192f3 100644 --- a/libmultilabel/linear/preprocessor.py +++ b/libmultilabel/linear/preprocessor.py @@ -1,5 +1,6 @@ from __future__ import annotations +import csv import logging import re from array import array @@ -141,7 +142,7 @@ def _generate_label_mapping(self, labels, classes=None): def read_libmultilabel_format(path: str) -> 'dict[str,list[str]]': data = pd.read_csv(path, sep='\t', header=None, dtype=str, - on_bad_lines='skip').fillna('') + on_bad_lines='skip', quoting=csv.QUOTE_NONE).fillna('') if data.shape[1] == 2: data.columns = ['label', 'text'] data = data.reset_index() diff --git a/libmultilabel/nn/data_utils.py b/libmultilabel/nn/data_utils.py index e691d238..5c4c771b 100644 --- a/libmultilabel/nn/data_utils.py +++ b/libmultilabel/nn/data_utils.py @@ -1,3 +1,4 @@ +import csv import gc import logging import warnings @@ -136,7 +137,7 @@ def _load_raw_data(path, is_test=False, tokenize_text=True, remove_no_label_data """ logging.info(f'Load data from {path}.') data = pd.read_csv(path, sep='\t', header=None, - error_bad_lines=False, warn_bad_lines=True).fillna('') + error_bad_lines=False, warn_bad_lines=True, quoting=csv.QUOTE_NONE).fillna('') if data.shape[1] == 2: data.columns = ['label', 'text'] data = data.reset_index()