Skip to content
112 changes: 109 additions & 3 deletions nlptest/datahandler/datasource.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,14 +4,11 @@
import jsonlines
from abc import ABC, abstractmethod
from typing import Dict, List

from nlptest.utils.custom_types.sample import ToxicitySample

from .format import Formatter
from ..utils.custom_types import NEROutput, NERPrediction, NERSample, Sample, SequenceClassificationOutput, \
SequenceClassificationSample, SequenceLabel, QASample, SummarizationSample


class _IDataset(ABC):
"""Abstract base class for Dataset.

Expand Down Expand Up @@ -533,3 +530,112 @@ def export_data(self, data: List[Sample], output_path: str):
path to save the data to
"""
raise NotImplementedError()

class HuggingFaceDataset(_IDataset):
    """
    Dataset handler that loads data through the Hugging Face `datasets` library.

    Every row is converted into a SequenceClassificationSample carrying a single
    expected label, and samples are exported as a two-column CSV file.
    """

    # Candidate column names, in priority order, used to locate the input text
    # and the label inside a dataset row.
    COLUMN_NAMES = {
        'text-classification': {
            'text': ['text', 'sentences', 'sentence', 'sample'],
            'label': ['label', 'labels', 'class', 'classes']
        }
    }

    def __init__(self, dataset_name: str):
        """
        Initialize the HuggingFaceDataset class.

        Args:
            dataset_name (str):
                Name of the dataset to load.
        """
        self.dataset_name = dataset_name

    def load_data(self, feature_column: str = "text", target_column: str = "label", split: str = 'test', subset: str = None) -> List[Sample]:
        """
        Load the specified split of the dataset and convert it into samples.

        Args:
            feature_column (str):
                Name of the column holding the input text.
            target_column (str):
                Name of the column holding the label.
            split (str):
                Name of the split to load (e.g., train, validation, test).
            subset (str):
                Name of the dataset configuration. When empty or None, no
                configuration name is passed to `load_dataset`.

        Returns:
            List[Sample]:
                Loaded split as a list of Sample objects.

        Raises:
            ModuleNotFoundError:
                If the `datasets` package cannot be imported.
        """
        try:
            from datasets import load_dataset
        except ImportError as err:
            # Chain the original error so the real import failure stays visible.
            raise ModuleNotFoundError(
                "The 'datasets' package is not installed. Please install it using 'pip install datasets'."
            ) from err

        if subset:
            dataset = load_dataset(self.dataset_name, name=subset, split=split)
        else:
            dataset = load_dataset(self.dataset_name, split=split)

        if feature_column and target_column:
            # Alias the requested columns to 'text' / 'label' row by row instead
            # of running a full `dataset.map` pass; both names come first in
            # COLUMN_NAMES, so `_row_to_sample` resolves them identically.
            return [
                self._row_to_sample({'text': example[feature_column], 'label': example[target_column]})
                for example in dataset
            ]

        # No explicit column names: let `_row_to_sample` search COLUMN_NAMES.
        return [self._row_to_sample(example) for example in dataset]

    def export_data(self, data: List[Sample], output_path: str):
        """
        Exports the data as a CSV file and saves it to 'output_path'.

        The first row is a header built from the keys of
        COLUMN_NAMES['text-classification']; each following row holds one sample.

        Args:
            data (List[Sample]):
                Data to export.
            output_path (str):
                Path to save the data to.
        """
        # newline="" is required by the csv module so that the writer controls
        # line endings itself (otherwise some platforms emit blank rows).
        with open(output_path, "w", newline="") as file:
            csv_writer = csv.writer(file)
            csv_writer.writerow(list(self.COLUMN_NAMES['text-classification']))
            csv_writer.writerows(self._sample_to_row(sample) for sample in data)

    def _row_to_sample(self, data_row: Dict[str, str]) -> Sample:
        """
        Convert a row from the dataset into a Sample for text classification.

        The text and label columns are the first candidates from COLUMN_NAMES
        that are present in the row. When no candidate is present, an empty
        string is used for the corresponding value.

        Args:
            data_row (Dict[str, str]):
                Single row of the dataset.

        Returns:
            Sample:
                Row formatted into a Sample object.
        """
        candidates = self.COLUMN_NAMES['text-classification']
        input_column = next((col for col in candidates['text'] if col in data_row), None)
        output_column = next((col for col in candidates['label'] if col in data_row), None)

        # `.get(None, '')` falls back to '' when no known column name matched.
        original = data_row.get(input_column, '')
        label = SequenceLabel(label=data_row.get(output_column, ''), score=1)

        return SequenceClassificationSample(
            original=original,
            expected_results=SequenceClassificationOutput(predictions=[label])
        )

    def _sample_to_row(self, sample: Sample) -> List[str]:
        """
        Convert a Sample object into a row for exporting.

        Args:
            sample (Sample):
                Sample object to convert.

        Returns:
            List[str]:
                Two-element row: the original text and the label of the first
                expected prediction.
        """
        return [sample.original, sample.expected_results.predictions[0].label]
15 changes: 11 additions & 4 deletions nlptest/nlptest.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,13 +4,12 @@
from collections import defaultdict
from typing import Dict, List, Optional, Union, Any
import langchain

import pandas as pd
import yaml
from pkg_resources import resource_filename

from .augmentation import AugmentRobustness
from .datahandler.datasource import DataFactory
from .datahandler.datasource import DataFactory,HuggingFaceDataset
from .modelhandler import ModelFactory, LANGCHAIN_HUBS
from .transform import TestFactory

Expand Down Expand Up @@ -54,8 +53,8 @@ def __init__(
model: Union[str, Any],
task: str,
hub: Optional[str] = None,
data: Optional[str] = None,
config: Optional[Union[str, dict]] = None
data: Optional[Union[str, dict]] = None,
config: Optional[Union[str, dict]] = None,
):
"""
Initialize the Harness object.
Expand Down Expand Up @@ -98,6 +97,14 @@ def __init__(
self.is_default = True
logging.info("Default dataset '%s' successfully loaded.", (task, model, hub))

elif type(data) is dict and hub=="huggingface"and task=="text-classification":
self.data = HuggingFaceDataset(data['name']).load_data(
data.get('feature_column', 'text'),
data.get('target_column', 'label'),
data.get('split', 'test'),
data.get('subset', None)
) if data is not None else None

elif data is None and (task, model, hub) not in self.DEFAULTS_DATASET.keys():
raise ValueError("You haven't specified any value for the parameter 'data' and the configuration you "
"passed is not among the default ones. You need to either specify the parameter 'data' "
Expand Down
20 changes: 20 additions & 0 deletions tests/test_harness.py
Original file line number Diff line number Diff line change
Expand Up @@ -129,6 +129,26 @@ def test_load_text_classification(self):
self.assertEqual(tc_harness.data, loaded_tc_harness.data)
self.assertNotEqual(tc_harness.model, loaded_tc_harness.model)

def test_load_HF_data_text_classification(self):
    """Check that a harness built from a Hugging Face dataset survives save/load.

    The harness is created with a dict-style `data` argument, trimmed to its
    first 10 samples, run through `generate()`, saved, and loaded back. The
    reloaded harness must have the same config and data, while the model
    objects must not compare equal.
    """
    # Arguments shared by the constructor and by `Harness.load`.
    harness_kwargs = {
        "task": "text-classification",
        "model": "lvwerra/distilbert-imdb",
        "hub": "huggingface",
    }
    output_dir = "/tmp/saved_HF_data_text_classification_harness_test"

    original = Harness(data={"name": 'imdb'}, **harness_kwargs)
    # Keep only a handful of samples so the test stays small.
    original.data = original.data[:10]
    original.generate()
    original.save(output_dir)

    restored = Harness.load(save_dir=output_dir, **harness_kwargs)

    for attribute in ("_config", "data"):
        self.assertEqual(getattr(original, attribute), getattr(restored, attribute))
    self.assertNotEqual(original.model, restored.model)

class DefaultCodeBlocksTestCase(unittest.TestCase):
""""""
Expand Down