diff --git a/langtest/langtest.py b/langtest/langtest.py
index b3f5b614e..9b02652f0 100644
--- a/langtest/langtest.py
+++ b/langtest/langtest.py
@@ -13,6 +13,7 @@
 from pkg_resources import resource_filename
+
 from .tasks import TaskManager
 from .augmentation import AugmentRobustness, TemplaticAugment
 from .datahandler.datasource import DataFactory
@@ -22,6 +23,7 @@
 from .transform.utils import RepresentationOperation
+from langtest.utils.benchmark_utils import Leaderboard, Summary
 from langtest.utils.lib_manager import try_import_lib
 from langtest.utils.custom_types.helpers import TestResultManager
 from langtest.utils.checkpoints import divide_into_batches, CheckpointManager
@@ -92,6 +94,7 @@ def __init__(
         model: Optional[Union[list, dict]] = None,
         data: Optional[Union[list, dict]] = None,
         config: Optional[Union[str, dict]] = None,
+        benchmarking: dict = None,
     ):
         """Initialize the Harness object.
@@ -111,6 +114,25 @@ def __init__(
         self.is_default = False
         self.__data_dict = data
         self.__is_multi_model = False
+        self.__model_info = model
+        self.__benchmarking = benchmarking
+
+        # resolve a plain list of dataset names into benchmark dataset configs
+        if isinstance(data, list) and all(isinstance(i, str) for i in data):
+            temp_data = []
+            for dataset in data:
+                if isinstance(task, dict):
+                    temp_task = task["category"]
+                else:
+                    temp_task = task
+                temp_data.append(
+                    config_utils.BenchmarkDatasets.get_dataset_dict(
+                        dataset_name=dataset, task=temp_task
+                    )
+                )
+
+            data = temp_data
+            self.__data_dict = data
 
         # reset classes to default state
         self.__reset_defaults()
@@ -446,6 +468,11 @@ def report(
             pd.DataFrame: DataFrame containing the results of the tests.
         """
+
+        # when benchmarking is enabled, log this run to the leaderboard summary
+        if self.__benchmarking:
+            self.__tracking()
+
         if self._generated_results is None:
             raise RuntimeError(Errors.E011)
@@ -1342,7 +1369,11 @@ def __single_dataset_generate(self, dataset: list):
             return testcases
 
         elif str(self.task) in ("question-answering", "summarization"):
-            if "bias" in tests.keys() and "bias" == self.__data_dict.get("split"):
+            if (
+                "bias" in tests.keys()
+                and isinstance(self.__data_dict, dict)
+                and "bias" == self.__data_dict.get("split")
+            ):
                 if self.__data_dict["data_source"] in ("BoolQ", "XSum"):
                     tests_to_filter = tests["bias"].keys()
                     testcases = DataFactory.filter_curated_bias(tests_to_filter, dataset)
@@ -1611,3 +1642,83 @@ def __reset_defaults(self):
         """Reset the default values."""
         model_response = TestResultManager()
         model_response.clear_data()
+
+    def __tracking(self, *args, **kwargs):
+        """Track benchmarking results and add them to the leaderboard summary."""
+        if self.__benchmarking:
+            df = self.generated_results()
+
+            path = os.path.expanduser(
+                self.__benchmarking.get("save_dir", "~/.langtest/leaderboard/")
+            )
+            summary = Summary(path)
+
+            # map each dataset name to its data-source dict
+            temp_dict = {}
+            if isinstance(self.__data_dict, dict):
+                temp_dict[self.__data_dict.get("data_source")] = self.__data_dict
+            else:
+                for i in self.__data_dict:
+                    temp_dict[i.get("data_source")] = i
+
+            # add the dataset_name column if the data is a single dataset
+            if isinstance(self.__data_dict, dict) and (not self.is_multi_dataset):
+                df["dataset_name"] = self.__data_dict.get("data_source", "-")
+
+            df["split"] = df["dataset_name"].apply(
+                lambda x: temp_dict[x].get("split", "-")
+            )
+            df["subset"] = df["dataset_name"].apply(
+                lambda x: temp_dict[x].get("subset", "-")
+            )
+
+            df["hub"] = self.__model_info.get("hub", "-")
+            if self.__model_info.get("hub", "-") == "lm-studio":
+                import requests as req
+
+                response = req.get(
+                    "http://localhost:1234/v1/models",
+                ).json()
+
+                model_name = response["data"][0]["id"]
+                df["model"] = model_name
+            else:
+                df["model"] = self.__model_info.get("model", "-")
+            df["task"] = str(self.task)
+            summary.add_report(df)
+
+    def get_leaderboard(
+        self,
+        indices=[],
+        columns=[],
+        category=False,
+        split_wise=False,
+        test_wise=False,
+        *args,
+        **kwargs,
+    ):
+        """Get the rank of the model on the leaderboard."""
+        default_path = os.path.expanduser("~/.langtest/leaderboard/")
+        save_dir = self.__benchmarking.get("save_dir") if self.__benchmarking else None
+
+        if save_dir and os.path.exists(os.path.expanduser(save_dir)):
+            path = os.path.expanduser(save_dir)
+        elif os.path.exists(default_path):
+            path = default_path
+        else:
+            raise FileNotFoundError(f"summary.csv does not exist in {default_path}")
+
+        leaderboard = Leaderboard(path)
+
+        if indices or columns:
+            return leaderboard.custom_wise(indices, columns)
+
+        if category:
+            return leaderboard.category_wise()
+
+        if test_wise:
+            return leaderboard.test_wise()
+
+        if split_wise:
+            return leaderboard.split_wise()
+
+        return leaderboard.default()
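For orientation, a minimal usage sketch of the benchmarking flow added above. The dataset names, model choice, and save_dir value are illustrative assumptions rather than part of this patch, and it assumes the usual langtest generate/run/report workflow.

# Hypothetical example: the dataset strings resolve through BenchmarkDatasets,
# and report() appends one row per test to <save_dir>/summary.csv.
from langtest import Harness

harness = Harness(
    task="question-answering",
    model={"model": "gpt-3.5-turbo-instruct", "hub": "openai"},
    data=["BoolQ", "MMLU"],
    benchmarking={"save_dir": "~/.langtest/leaderboard/"},
)
harness.generate()
harness.run()
harness.report()

# Pivoted score board: one row per model, one column per dataset, plus "Avg".
print(harness.get_leaderboard())
print(harness.get_leaderboard(split_wise=True))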
diff --git a/langtest/utils/benchmark_utils.py b/langtest/utils/benchmark_utils.py
new file mode 100644
index 000000000..2bbab53de
--- /dev/null
+++ b/langtest/utils/benchmark_utils.py
@@ -0,0 +1,330 @@
+import os
+from typing import TypeVar, Generic
+
+import numpy as np
+import pandas as pd
+
+
+class Leaderboard(Generic[TypeVar("T", bound="Leaderboard")]):
+    """
+    Leaderboard class to manage the ranking of the models.
+
+    Args:
+        path (str): The directory containing the summary file.
+    """
+
+    _instance = None
+
+    def __new__(cls, *args, **kwargs):
+        """
+        Singleton pattern to ensure only one instance of the class is created.
+        """
+        if not cls._instance:
+            cls._instance = super().__new__(cls)
+        return cls._instance
+
+    def __init__(
+        self,
+        path: str = os.path.expanduser("~/.langtest/leaderboard/"),
+        *args,
+        **kwargs,
+    ) -> None:
+        """
+        Initialize the Leaderboard class with the summary file.
+        """
+        self.summary = Summary(path, *args, **kwargs)
+
+    def default(self):
+        """
+        Get the score board for the models.
+        """
+        df = self.summary.summary_df
+        df = self.__drop_duplicates(df)
+        pvt_table = df.pivot_table(
+            index=["model"], columns="dataset_name", values="score"
+        )
+
+        # mean column
+        pvt_table.insert(0, "Avg", pvt_table.mean(axis=1))
+        pvt_table = pvt_table.sort_values(by=["model", "Avg"], ascending=[True, False])
+
+        # reset the index and fill the NaN values
+        pvt_table = pvt_table.rename_axis(None, axis=1).reset_index()
+        pvt_table = pvt_table.fillna("-")
+
+        return pvt_table
+
+    def split_wise(self):
+        """
+        Get the score board for the models by dataset split.
+        """
+        df = self.summary.summary_df
+        df = self.__drop_duplicates(df)
+        pvt_table = df.pivot_table(
+            index=["model", "split"],
+            columns=["dataset_name"],
+            values="score",
+        )
+
+        # mean column
+        pvt_table.insert(0, "Avg", pvt_table.mean(axis=1))
+        pvt_table = pvt_table.sort_values(by=["model", "Avg"], ascending=[True, False])
+
+        pvt_table = pvt_table.fillna("-")
+
+        return pvt_table
+
+    def test_wise(self):
+        """
+        Get the score board for the models by test type.
+        """
+        df = self.summary.summary_df
+        df = self.__drop_duplicates(df)
+        pvt_table = df.pivot_table(
+            index=["model", "test_type"], columns=["dataset_name"], values="score"
+        )
+
+        # mean column
+        pvt_table.insert(0, "Avg", pvt_table.mean(axis=1))
+        pvt_table = pvt_table.sort_values(by=["model", "Avg"], ascending=[True, False])
+
+        pvt_table = pvt_table.fillna("-")
+
+        return pvt_table
+
+    def category_wise(self):
+        """
+        Get the score board for the models by test category.
+        """
+        df = self.summary.summary_df
+        df = self.__drop_duplicates(df)
+        pvt_table = df.pivot_table(
+            index=["model", "category"], columns=["dataset_name"], values="score"
+        )
+        pvt_table.insert(0, "Avg", pvt_table.mean(axis=1))
+        pvt_table = pvt_table.sort_values(by=["model", "Avg"], ascending=[True, False])
+        pvt_table = pvt_table.fillna("-")
+        pvt_table = pvt_table.rename_axis(None, axis=1).reset_index()
+
+        return pvt_table
+
+    def custom_wise(self, indices: list, columns: list = []):
+        """
+        Get the score board for the models grouped by custom index and column fields.
+        """
+        df = self.summary.summary_df
+        df = self.__drop_duplicates(df)
+        pvt_table = df.pivot_table(
+            index=["model", *indices],
+            columns=["dataset_name", *columns],
+            values="score",
+            aggfunc="first",
+        )
+        pvt_table.insert(0, "Avg", pvt_table.mean(axis=1))
+        pvt_table = pvt_table.fillna("-")
+        pvt_table = pvt_table.sort_values(by=["model", "Avg"], ascending=[True, False])
+
+        return pvt_table
+
+    def __drop_duplicates(self, df: pd.DataFrame) -> pd.DataFrame:
+        """
+        Drop duplicate runs, keeping only the latest record for each test.
+        """
+        # arrange the dataframe by timestamp in descending order
+        df["timestamp"] = pd.to_datetime(df["timestamp"])
+        df = df.sort_values(by="timestamp", ascending=False)
+        df.reset_index(drop=True, inplace=True)
+
+        # keep the most recent record for each unique test combination
+        unique_records = df.drop_duplicates(
+            subset=[
+                "category",
+                "test_type",
+                "model",
+                "hub",
+                "dataset_name",
+                "split",
+                "subset",
+                "task",
+            ],
+            keep="first",
+        )
+        unique_records.reset_index(drop=True, inplace=True)
+
+        return unique_records
+
+    def __repr__(self) -> str:
+        return self.summary.summary_df.to_markdown()
+
+
+class Summary(Generic[TypeVar("T", bound="Summary")]):
+    """
+    Summary class to manage the summary report.
+    """
+
+    _instance = None
+
+    def __new__(cls, *args, **kwargs):
+        """
+        Singleton pattern to ensure only one instance of the class is created.
+        """
+        if not cls._instance:
+            cls._instance = super().__new__(cls)
+        return cls._instance
+
+    def __init__(self, path: str, *args, **kwargs) -> None:
+        """
+        Initialize the summary from the given save directory.
+        """
+        self.save_dir = path
+        self.file_path = os.path.join(path, "summary.csv")
+
+        self.summary_df: pd.DataFrame = self.load_data_from_file(
+            self.file_path, *args, **kwargs
+        )
+
+    def load_data_from_file(self, path: str, *args, **kwargs) -> pd.DataFrame:
+        """
+        Load the summary file, creating an empty one if it does not exist.
+        """
+        try:
+            if os.path.exists(path):
+                return self.__read_from_csv(path, *args, **kwargs)
+            else:
+                os.makedirs(os.path.dirname(path), exist_ok=True)
+                # Create a new file
+                df = pd.DataFrame(columns=self.__default_columns())
+                df.to_csv(path, index=False)
+                return df
+        except FileNotFoundError:
+            raise FileNotFoundError(f"File not found at {path}")
+
+    def __read_from_csv(self, path: str, *args, **kwargs) -> pd.DataFrame:
+        """
+        Read data from the csv file.
+        """
+        df = pd.read_csv(path)
+        return df
+
+    def __default_columns(self):
+        """
+        Default columns for the summary report.
+        """
+        cols = [
+            "timestamp",
+            "task",
+            "model",
+            "hub",
+            "category",
+            "test_type",
+            "dataset_name",
+            "split",
+            "subset",
+            "total_records",
+            "success_records",
+            "failure_records",
+            "score",
+        ]
+        return cols
+
+    def add_report(
+        self,
+        generated_results: pd.DataFrame,
+    ) -> None:
+        """
+        Add a new report to the summary.
+        """
+        from datetime import datetime
+
+        # Filter the dataframe for accuracy, fairness and representation
+        afr_df = self.__afr(generated_results)
+        not_afr_df = self.__not_afr(generated_results)
+
+        # concatenate the dataframes
+        temp_summary_df = pd.concat([afr_df, not_afr_df], axis=0)
+
+        timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
+        temp_summary_df["timestamp"] = timestamp
+
+        # insert the new rows into the summary dataframe
+        self.summary_df = pd.concat([self.summary_df, temp_summary_df], ignore_index=True)
+
+        # Save the summary to the file
+        self.save_summary()
+
+    def save_summary(self) -> None:
+        """
+        Save the summary to the file.
+        """
+        self.summary_df.to_csv(self.file_path, index=False)
+
+    def __afr(self, df: pd.DataFrame) -> pd.DataFrame:
+        """
+        Filter the dataframe for the accuracy, fairness and representation
+        categories, whose actual_result already holds a score.
+        """
+        df = df[df["category"].isin(["accuracy", "fairness", "representation"])]
+        df = df[self.__group_by_cols() + ["actual_result"]]
+        df = df.rename(columns={"actual_result": "score"})
+
+        return df
+
+    def __not_afr(self, df: pd.DataFrame) -> pd.DataFrame:
+        """
+        Aggregate pass/fail counts and a pass-rate score for all categories
+        other than accuracy, fairness and representation.
+        """
+        df = df[~df["category"].isin(["accuracy", "fairness", "representation"])]
+
+        grouped = df.groupby(self.__group_by_cols())
+
+        # aggregate per group
+        total_records = grouped.size().reset_index(name="total_records")
+        success_records = grouped["pass"].sum().reset_index(name="success_records")
+        score = grouped["pass"].mean().reset_index(name="score")
+        failure_records = grouped.apply(
+            lambda x: np.size(x["pass"]) - np.sum(x["pass"])
+        ).reset_index(name="failure_records")
+
+        # concatenate the dataframes
+        result = pd.concat(
+            [
+                success_records,
+                failure_records["failure_records"],
+                total_records["total_records"],
+                score["score"],
+            ],
+            axis=1,
+        )
+
+        return result
+
+    def __group_by_cols(self):
+        """
+        Columns used to group the generated results.
+        """
+        return [
+            "category",
+            "dataset_name",
+            "test_type",
+            "model",
+            "hub",
+            "split",
+            "subset",
+            "task",
+        ]
+
+    @property
+    def df(self) -> pd.DataFrame:
+        return self.summary_df
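Taken together, Summary persists one row per (model, dataset, test) combination and Leaderboard pivots that file into score tables. A rough sketch of using the two helpers directly, assuming the default ~/.langtest/leaderboard/ directory and that at least one report has already been added via Harness.report():

# Sketch: inspect the leaderboard outside of Harness.
import os
from langtest.utils.benchmark_utils import Leaderboard, Summary

path = os.path.expanduser("~/.langtest/leaderboard/")

summary = Summary(path)              # creates summary.csv with the default columns if missing
print(summary.df.tail())             # raw per-test rows, one block per timestamped run

board = Leaderboard(path)            # reads the same summary.csv (both classes are singletons)
print(board.default())               # model x dataset scores with an "Avg" column
print(board.custom_wise(indices=["split"], columns=["category"]))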
"question-answering", + "model": {"model": "gpt-3.5-turbo-instruct", "hub": "openai"}, + "data": {"data_source": "Fiqa", "split": "test-tiny"}, + }, + "HellaSwag": { + "task": "question-answering", + "model": {"model": "gpt-3.5-turbo-instruct", "hub": "openai"}, + "data": {"data_source": "HellaSwag", "split": "test-tiny"}, + }, + "Consumer-Contracts": { + "task": "question-answering", + "model": {"model": "gpt-3.5-turbo-instruct", "hub": "openai"}, + "data": {"data_source": "Consumer-Contracts", "split": "test"}, + }, + "Contracts": { + "task": "question-answering", + "model": {"model": "gpt-3.5-turbo-instruct", "hub": "openai"}, + "data": {"data_source": "Contracts", "split": "test"}, + }, + "Privacy-Policy": { + "task": "question-answering", + "model": {"model": "gpt-3.5-turbo-instruct", "hub": "openai"}, + "data": {"data_source": "Privacy-Policy", "split": "test"}, + }, + "LogiQA": { + "task": "question-answering", + "model": {"model": "gpt-3.5-turbo-instruct", "hub": "openai"}, + "data": {"data_source": "LogiQA", "split": "test-tiny"}, + }, + "MMLU": { + "task": "question-answering", + "model": {"model": "gpt-3.5-turbo-instruct", "hub": "openai"}, + "data": {"data_source": "MMLU", "split": "test-tiny"}, + }, + "NarrativeQA": { + "task": "question-answering", + "model": {"model": "gpt-3.5-turbo-instruct", "hub": "openai"}, + "data": {"data_source": "NarrativeQA", "split": "test-tiny"}, + }, + "NQ-open": { + "task": "question-answering", + "model": {"model": "gpt-3.5-turbo-instruct", "hub": "openai"}, + "data": {"data_source": "NQ-open", "split": "test-tiny"}, + }, + "OpenBookQA": { + "task": "question-answering", + "model": {"model": "gpt-3.5-turbo-instruct", "hub": "openai"}, + "data": {"data_source": "OpenBookQA", "split": "test-tiny"}, + }, + "PIQA": { + "task": "question-answering", + "model": {"model": "gpt-3.5-turbo-instruct", "hub": "openai"}, + "data": {"data_source": "PIQA", "split": "test-tiny"}, + }, + "Quac": { + "task": "question-answering", + "model": {"model": "gpt-3.5-turbo-instruct", "hub": "openai"}, + "data": {"data_source": "Quac", "split": "test-tiny"}, + }, + "SIQA": { + "task": "question-answering", + "model": {"model": "gpt-3.5-turbo-instruct", "hub": "openai"}, + "data": {"data_source": "SIQA", "split": "test-tiny"}, + }, + "TruthfulQA": { + "task": "question-answering", + "model": {"model": "gpt-3.5-turbo-instruct", "hub": "openai"}, + "data": {"data_source": "TruthfulQA", "split": "test-tiny"}, + }, + "XSum": { + "task": "summarization", + "model": {"model": "gpt-3.5-turbo-instruct", "hub": "openai"}, + "data": {"data_source": "XSum", "split": "test-tiny"}, + }, + "MultiLexSum": { + "task": "summarization", + "model": {"model": "gpt-3.5-turbo-instruct", "hub": "openai"}, + "data": {"data_source": "MultiLexSum", "split": "test-tiny"}, + }, + "MedMCQA": { + "task": "question-answering", + "model": {"model": "gpt-3.5-turbo-instruct", "hub": "openai"}, + "data": { + "data_source": "MedMCQA", + "subset": "MedMCQA-Test", + "split": "Radiology", + }, + }, + "MedQA": { + "task": "question-answering", + "model": {"model": "mistralai/Mistral-7B-Instruct-v0.1", "hub": "huggingface"}, + "data": {"data_source": "MedQA", "split": "test"}, + "config": { + "evaluation": { + "metric": "string_distance", + "distance": "jaro", + "threshold": 0.1, + }, + "tests": { + "defaults": {"min_pass_rate": 0.65}, + "robustness": { + "add_ocr_typo": {"min_pass_rate": 0.66}, + "dyslexia_word_swap": {"min_pass_rate": 0.60}, + }, + }, + }, + }, + "PubMedQA": { + "task": "question-answering", + 
"model": {"model": "j2-jumbo-instruct", "hub": "ai21"}, + "data": {"data_source": "PubMedQA", "split": "pqaa"}, + "config": { + "evaluation": { + "metric": "string_distance", + "distance": "jaro", + "threshold": 0.1, + }, + "tests": { + "defaults": {"min_pass_rate": 0.65}, + "robustness": { + "add_ocr_typo": {"min_pass_rate": 0.66}, + "dyslexia_word_swap": {"min_pass_rate": 0.60}, + }, + }, + }, + }, +} + + +class BenchmarkDatasets: + def __init__(self, task: str, dataset_name: str): + self.__dataset_name = dataset_name + self.__task = task + + @classmethod + def get_dataset_dict(cls, dataset_name: str, task: str) -> dict: + """Get the benchmark dataset configuration for the given dataset name.""" + dataset_config = BENCHMARK_DATASETS_DICT.get(dataset_name, None) + + if dataset_config is None: + raise ValueError( + f"Dataset {dataset_name} not found in the benchmark datasets list - {cls.get_datasets()}." + ) + + return dataset_config.get("data", None) + + @classmethod + def get_datasets(cls) -> List[str]: + """Get the list of benchmark datasets available in the library.""" + return list(BENCHMARK_DATASETS_DICT.keys()) + + @property + def dataset_name(self) -> str: + return self.__dataset_name + + @property + def task(self) -> str: + return self.__task diff --git a/tests/test_datasource.py b/tests/test_datasource.py index 55f2c0e29..d83054f7e 100644 --- a/tests/test_datasource.py +++ b/tests/test_datasource.py @@ -83,10 +83,10 @@ def test_load_raw_data(self, dataset, feature_col, target_col): ( HuggingFaceDataset( source_info={ - "data_source": "wikiann", - "subset": "fo", + "data_source": "tner/wikiann", + "subset": "ace", "feature_column": "tokens", - "target_column": "ner_tags", + "target_column": "tags", "split": "test", }, task=TaskManager("ner"),