Pacific-AI-Corp · ArshaanNazir · Jun 14, 2023 · Jun 10, 2023 · Jun 10, 2023 · Jun 10, 2023
diff --git a/nlptest/datahandler/datasource.py b/nlptest/datahandler/datasource.py
@@ -4,14 +4,11 @@
 import jsonlines
 from abc import ABC, abstractmethod
 from typing import Dict, List
-
 from nlptest.utils.custom_types.sample import ToxicitySample
-
 from .format import Formatter
 from ..utils.custom_types import NEROutput, NERPrediction, NERSample, Sample, SequenceClassificationOutput, \
     SequenceClassificationSample, SequenceLabel, QASample, SummarizationSample
 
-
 class _IDataset(ABC):
     """Abstract base class for Dataset.
 
@@ -533,3 +530,112 @@ def export_data(self, data: List[Sample], output_path: str):
                 path to save the data to
         """
         raise NotImplementedError()
+
+class HuggingFaceDataset(_IDataset):
+    """
+    Dataset class that loads text-classification data using the Hugging Face `datasets` library.
+    """
+    # Accepted column-name aliases for each field; `_row_to_sample` uses the first one present.
+    COLUMN_NAMES = {
+        'text-classification': {
+            'text': ['text', 'sentences', 'sentence', 'sample'],
+            'label': ['label', 'labels', 'class', 'classes']
+        }
+    }
+
+    def __init__(self, dataset_name: str):
+        """
+        Initialize the HuggingFaceDataset class.
+
+        Args:
+            dataset_name (str):
+                Name of the dataset to load.
+        """
+        self.dataset_name = dataset_name
+
+    def load_data(self, feature_column: str = "text", target_column: str = "label", split: str = 'test', subset: str = None) -> List[Sample]:
+        """
+        Load the specified split from the dataset library.
+
+        Args:
+            feature_column (str):
+                Name of the column holding the input text.
+            target_column (str):
+                Name of the column holding the label.
+            split (str):
+                Name of the split to load (e.g., train, validation, test).
+            subset (str):
+                Name of the configuration; a falsy value requests no specific one.
+
+        Returns:
+            List[Sample]:
+                Loaded split as a list of Sample objects.
+
+        Raises:
+            ModuleNotFoundError: If the 'datasets' package is not installed.
+        """
+        try:
+            from datasets import load_dataset
+        except ImportError as err:
+            raise ModuleNotFoundError("The 'datasets' package is not installed. Please install it using 'pip install datasets'.") from err
+        # `name=None` is the `load_dataset` default, so one call covers both cases.
+        dataset = load_dataset(self.dataset_name, name=subset or None, split=split)
+
+        if feature_column and target_column:
+            # Copy the chosen columns into 'text'/'label', which `_row_to_sample` looks up first.
+            dataset = dataset.map(lambda example: {'text': example[feature_column], 'label': example[target_column]})
+
+        return [self._row_to_sample(example) for example in dataset]
+
+    def export_data(self, data: List[Sample], output_path: str):
+        """
+        Exports the data as CSV (header row, then one row per sample) to 'output_path'.
+
+        Args:
+            data (List[Sample]):
+                Data to export.
+            output_path (str):
+                Path to save the data to.
+        """
+        # newline="" lets the csv module control line endings (avoids blank rows on Windows).
+        with open(output_path, "w", newline="") as file:
+            csv_writer = csv.writer(file)
+            csv_writer.writerow(list(self.COLUMN_NAMES['text-classification'].keys()))
+            csv_writer.writerows(self._sample_to_row(sample) for sample in data)
+
+    def _row_to_sample(self, data_row: Dict[str, str]) -> Sample:
+        """
+        Convert a row from the dataset into a Sample for text classification.
+
+        Args:
+            data_row (Dict[str, str]):
+                Single row of the dataset.
+
+        Returns:
+            Sample:
+                Row formatted into a Sample object.
+        """
+        # First known alias present in the row; None (hence the '' fallback below) when none matches.
+        input_column = next((col for col in self.COLUMN_NAMES['text-classification']['text'] if col in data_row), None)
+        output_column = next((col for col in self.COLUMN_NAMES['text-classification']['label'] if col in data_row), None)
+
+        original = data_row.get(input_column, '')
+        label = SequenceLabel(label=data_row.get(output_column, ''), score=1)
+        return SequenceClassificationSample(
+            original=original,
+            expected_results=SequenceClassificationOutput(predictions=[label])
+        )
+
+    def _sample_to_row(self, sample: Sample) -> List[str]:
+        """
+        Convert a Sample object into a row for exporting.
+
+        Args:
+            sample (Sample):
+                Sample object to convert.
+
+        Returns:
+            List[str]:
+                Row as [original text, label of the first expected prediction].
+        """
+        return [sample.original, sample.expected_results.predictions[0].label]
diff --git a/nlptest/nlptest.py b/nlptest/nlptest.py
@@ -4,13 +4,12 @@
 from collections import defaultdict
 from typing import Dict, List, Optional, Union, Any
 import langchain
-
 import pandas as pd
 import yaml
 from pkg_resources import resource_filename
 
 from .augmentation import AugmentRobustness
-from .datahandler.datasource import DataFactory
+from .datahandler.datasource import DataFactory,HuggingFaceDataset
 from .modelhandler import ModelFactory, LANGCHAIN_HUBS
 from .transform import TestFactory
 
@@ -54,8 +53,8 @@ def __init__(
             model: Union[str, Any],
             task: str,
             hub: Optional[str] = None,
-            data: Optional[str] = None,
-            config: Optional[Union[str, dict]] = None
+            data: Optional[Union[str, dict]] = None,
+            config: Optional[Union[str, dict]] = None,            
     ):
         """
         Initialize the Harness object.
@@ -98,6 +97,14 @@ def __init__(
             self.is_default = True
             logging.info("Default dataset '%s' successfully loaded.", (task, model, hub))
 
+        elif type(data) is dict  and hub=="huggingface"and task=="text-classification":
+                self.data = HuggingFaceDataset(data['name']).load_data(
+                    data.get('feature_column', 'text'),
+                    data.get('target_column', 'label'),
+                    data.get('split', 'test'),
+                    data.get('subset', None)
+                ) if data is not None else None
+
         elif data is None and (task, model, hub) not in self.DEFAULTS_DATASET.keys():
             raise ValueError("You haven't specified any value for the parameter 'data' and the configuration you "
                              "passed is not among the default ones. You need to either specify the parameter 'data' "

diff --git a/tests/test_harness.py b/tests/test_harness.py
@@ -129,6 +129,26 @@ def test_load_text_classification(self):
         self.assertEqual(tc_harness.data, loaded_tc_harness.data)
         self.assertNotEqual(tc_harness.model, loaded_tc_harness.model)
 
+    def test_load_HF_data_text_classification(self):
+        """Save and reload a harness whose data comes from the Hugging Face 'imdb' dataset."""
+        save_dir = "/tmp/saved_HF_data_text_classification_harness_test"
+        model_name = "lvwerra/distilbert-imdb"
+        tc_harness = Harness(
+            task="text-classification", hub="huggingface", model=model_name, data={"name": "imdb"}
+        )
+        # Keep only the first ten samples before calling generate().
+        tc_harness.data = tc_harness.data[:10]
+        tc_harness.generate()
+        tc_harness.save(save_dir)
+
+        loaded_tc_harness = Harness.load(
+            save_dir=save_dir, task="text-classification", model=model_name, hub="huggingface"
+        )
+        # Config and data must match after the round trip; the model objects
+        # are asserted to differ.
+        self.assertEqual(tc_harness._config, loaded_tc_harness._config)
+        self.assertEqual(tc_harness.data, loaded_tc_harness.data)
+        self.assertNotEqual(tc_harness.model, loaded_tc_harness.model)
 
 class DefaultCodeBlocksTestCase(unittest.TestCase):
     """"""