diff --git a/langtest/langtest.py b/langtest/langtest.py
index b3f5b614e..9b02652f0 100644
--- a/langtest/langtest.py
+++ b/langtest/langtest.py
@@ -13,6 +13,7 @@
 from pkg_resources import resource_filename
+
 from .tasks import TaskManager
 from .augmentation import AugmentRobustness, TemplaticAugment
 from .datahandler.datasource import DataFactory
@@ -22,6 +23,7 @@
 from .transform.utils import RepresentationOperation
+from langtest.utils.benchmark_utils import Leaderboard, Summary
 from langtest.utils.lib_manager import try_import_lib
 from langtest.utils.custom_types.helpers import TestResultManager
 from langtest.utils.checkpoints import divide_into_batches, CheckpointManager
@@ -92,6 +94,7 @@ def __init__(
         model: Optional[Union[list, dict]] = None,
         data: Optional[Union[list, dict]] = None,
         config: Optional[Union[str, dict]] = None,
+        benchmarking: dict = None,
     ):
         """Initialize the Harness object.
@@ -111,6 +114,25 @@ def __init__(
         self.is_default = False
         self.__data_dict = data
         self.__is_multi_model = False
+        self.__model_info = model
+        self.__benchmarking = benchmarking
+
+        # resolve a plain list of dataset names into benchmark dataset configs
+        if isinstance(data, list) and all(isinstance(i, str) for i in data):
+            temp_data = []
+            for dataset in data:
+                if isinstance(task, dict):
+                    temp_task = task["category"]
+                else:
+                    temp_task = task
+                temp_data.append(
+                    config_utils.BenchmarkDatasets.get_dataset_dict(
+                        dataset_name=dataset, task=temp_task
+                    )
+                )
+
+            data = temp_data
+            self.__data_dict = data
 
         # reset classes to default state
         self.__reset_defaults()
@@ -446,6 +468,11 @@ def report(
             pd.DataFrame: DataFrame containing the results of the tests.
         """
+
+        # when benchmarking is enabled, log this run to the leaderboard summary
+        if self.__benchmarking:
+            self.__tracking()
+
         if self._generated_results is None:
             raise RuntimeError(Errors.E011)
@@ -1342,7 +1369,11 @@ def __single_dataset_generate(self, dataset: list):
             return testcases
 
         elif str(self.task) in ("question-answering", "summarization"):
-            if "bias" in tests.keys() and "bias" == self.__data_dict.get("split"):
+            if (
+                "bias" in tests.keys()
+                and isinstance(self.__data_dict, dict)
+                and "bias" == self.__data_dict.get("split")
+            ):
                 if self.__data_dict["data_source"] in ("BoolQ", "XSum"):
                     tests_to_filter = tests["bias"].keys()
                     testcases = DataFactory.filter_curated_bias(tests_to_filter, dataset)
@@ -1611,3 +1642,83 @@ def __reset_defaults(self):
         """Reset the default values."""
         model_response = TestResultManager()
         model_response.clear_data()
+
+    def __tracking(self, *args, **kwargs):
+        """Track benchmarking results and add them to the leaderboard summary."""
+        if self.__benchmarking:
+            df = self.generated_results()
+
+            path = os.path.expanduser(
+                self.__benchmarking.get("save_dir", "~/.langtest/leaderboard/")
+            )
+            summary = Summary(path)
+
+            # map each dataset name to its data-source dict
+            temp_dict = {}
+            if isinstance(self.__data_dict, dict):
+                temp_dict[self.__data_dict.get("data_source")] = self.__data_dict
+            else:
+                for i in self.__data_dict:
+                    temp_dict[i.get("data_source")] = i
+
+            # add the dataset_name column if the data is a single dataset
+            if isinstance(self.__data_dict, dict) and (not self.is_multi_dataset):
+                df["dataset_name"] = self.__data_dict.get("data_source", "-")
+
+            df["split"] = df["dataset_name"].apply(
+                lambda x: temp_dict[x].get("split", "-")
+            )
+            df["subset"] = df["dataset_name"].apply(
+                lambda x: temp_dict[x].get("subset", "-")
+            )
+
+            df["hub"] = self.__model_info.get("hub", "-")
+            if self.__model_info.get("hub", "-") == "lm-studio":
+                import requests as req
+
+                response = req.get(
+                    "http://localhost:1234/v1/models",
+                ).json()
+
+                model_name = response["data"][0]["id"]
+                df["model"] = model_name
+            else:
+                df["model"] = self.__model_info.get("model", "-")
+            df["task"] = str(self.task)
+            summary.add_report(df)
+
+    def get_leaderboard(
+        self,
+        indices=[],
+        columns=[],
+        category=False,
+        split_wise=False,
+        test_wise=False,
+        *args,
+        **kwargs,
+    ):
+        """Get the rank of the model on the leaderboard."""
+        default_path = os.path.expanduser("~/.langtest/leaderboard/")
+        save_dir = self.__benchmarking.get("save_dir") if self.__benchmarking else None
+
+        if save_dir and os.path.exists(os.path.expanduser(save_dir)):
+            path = os.path.expanduser(save_dir)
+        elif os.path.exists(default_path):
+            path = default_path
+        else:
+            raise FileNotFoundError(f"summary.csv does not exist in {default_path}")
+
+        leaderboard = Leaderboard(path)
+
+        if indices or columns:
+            return leaderboard.custom_wise(indices, columns)
+
+        if category:
+            return leaderboard.category_wise()
+
+        if test_wise:
+            return leaderboard.test_wise()
+
+        if split_wise:
+            return leaderboard.split_wise()
+
+        return leaderboard.default()
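For orientation, a minimal usage sketch of the benchmarking flow added above. The dataset names, model choice, and save_dir value are illustrative assumptions rather than part of this patch, and it assumes the usual langtest generate/run/report workflow.

# Hypothetical example: the dataset strings resolve through BenchmarkDatasets,
# and report() appends one row per test to <save_dir>/summary.csv.
from langtest import Harness

harness = Harness(
    task="question-answering",
    model={"model": "gpt-3.5-turbo-instruct", "hub": "openai"},
    data=["BoolQ", "MMLU"],
    benchmarking={"save_dir": "~/.langtest/leaderboard/"},
)
harness.generate()
harness.run()
harness.report()

# Pivoted score board: one row per model, one column per dataset, plus "Avg".
print(harness.get_leaderboard())
print(harness.get_leaderboard(split_wise=True))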
diff --git a/langtest/utils/benchmark_utils.py b/langtest/utils/benchmark_utils.py
new file mode 100644
index 000000000..2bbab53de
--- /dev/null
+++ b/langtest/utils/benchmark_utils.py
@@ -0,0 +1,330 @@
+import os
+from typing import TypeVar, Generic
+
+import numpy as np
+import pandas as pd
+
+
+class Leaderboard(Generic[TypeVar("T", bound="Leaderboard")]):
+    """
+    Leaderboard class to manage the ranking of the models.
+
+    Args:
+        path (str): The directory containing the summary file.
+    """
+
+    _instance = None
+
+    def __new__(cls, *args, **kwargs):
+        """
+        Singleton pattern to ensure only one instance of the class is created.
+        """
+        if not cls._instance:
+            cls._instance = super().__new__(cls)
+        return cls._instance
+
+    def __init__(
+        self,
+        path: str = os.path.expanduser("~/.langtest/leaderboard/"),
+        *args,
+        **kwargs,
+    ) -> None:
+        """
+        Initialize the Leaderboard class with the summary file.
+        """
+        self.summary = Summary(path, *args, **kwargs)
+
+    def default(self):
+        """
+        Get the score board for the models.
+        """
+        df = self.summary.summary_df
+        df = self.__drop_duplicates(df)
+        pvt_table = df.pivot_table(
+            index=["model"], columns="dataset_name", values="score"
+        )
+
+        # mean column
+        pvt_table.insert(0, "Avg", pvt_table.mean(axis=1))
+        pvt_table = pvt_table.sort_values(by=["model", "Avg"], ascending=[True, False])
+
+        # reset the index and fill the NaN values
+        pvt_table = pvt_table.rename_axis(None, axis=1).reset_index()
+        pvt_table = pvt_table.fillna("-")
+
+        return pvt_table
+
+    def split_wise(self):
+        """
+        Get the score board for the models by dataset split.
+        """
+        df = self.summary.summary_df
+        df = self.__drop_duplicates(df)
+        pvt_table = df.pivot_table(
+            index=["model", "split"],
+            columns=["dataset_name"],
+            values="score",
+        )
+
+        # mean column
+        pvt_table.insert(0, "Avg", pvt_table.mean(axis=1))
+        pvt_table = pvt_table.sort_values(by=["model", "Avg"], ascending=[True, False])
+
+        pvt_table = pvt_table.fillna("-")
+
+        return pvt_table
+
+    def test_wise(self):
+        """
+        Get the score board for the models by test type.
+        """
+        df = self.summary.summary_df
+        df = self.__drop_duplicates(df)
+        pvt_table = df.pivot_table(
+            index=["model", "test_type"], columns=["dataset_name"], values="score"
+        )
+
+        # mean column
+        pvt_table.insert(0, "Avg", pvt_table.mean(axis=1))
+        pvt_table = pvt_table.sort_values(by=["model", "Avg"], ascending=[True, False])
+
+        pvt_table = pvt_table.fillna("-")
+
+        return pvt_table
+
+    def category_wise(self):
+        """
+        Get the score board for the models by test category.
+        """
+        df = self.summary.summary_df
+        df = self.__drop_duplicates(df)
+        pvt_table = df.pivot_table(
+            index=["model", "category"], columns=["dataset_name"], values="score"
+        )
+        pvt_table.insert(0, "Avg", pvt_table.mean(axis=1))
+        pvt_table = pvt_table.sort_values(by=["model", "Avg"], ascending=[True, False])
+        pvt_table = pvt_table.fillna("-")
+        pvt_table = pvt_table.rename_axis(None, axis=1).reset_index()
+
+        return pvt_table
+
+    def custom_wise(self, indices: list, columns: list = []):
+        """
+        Get the score board for the models grouped by custom index and column fields.
+        """
+        df = self.summary.summary_df
+        df = self.__drop_duplicates(df)
+        pvt_table = df.pivot_table(
+            index=["model", *indices],
+            columns=["dataset_name", *columns],
+            values="score",
+            aggfunc="first",
+        )
+        pvt_table.insert(0, "Avg", pvt_table.mean(axis=1))
+        pvt_table = pvt_table.fillna("-")
+        pvt_table = pvt_table.sort_values(by=["model", "Avg"], ascending=[True, False])
+
+        return pvt_table
+
+    def __drop_duplicates(self, df: pd.DataFrame) -> pd.DataFrame:
+        """
+        Drop duplicate runs, keeping only the latest record for each test.
+        """
+        # arrange the dataframe by timestamp in descending order
+        df["timestamp"] = pd.to_datetime(df["timestamp"])
+        df = df.sort_values(by="timestamp", ascending=False)
+        df.reset_index(drop=True, inplace=True)
+
+        # keep the most recent record for each unique test combination
+        unique_records = df.drop_duplicates(
+            subset=[
+                "category",
+                "test_type",
+                "model",
+                "hub",
+                "dataset_name",
+                "split",
+                "subset",
+                "task",
+            ],
+            keep="first",
+        )
+        unique_records.reset_index(drop=True, inplace=True)
+
+        return unique_records
+
+    def __repr__(self) -> str:
+        return self.summary.summary_df.to_markdown()
+
+
+class Summary(Generic[TypeVar("T", bound="Summary")]):
+    """
+    Summary class to manage the summary report.
+    """
+
+    _instance = None
+
+    def __new__(cls, *args, **kwargs):
+        """
+        Singleton pattern to ensure only one instance of the class is created.
+        """
+        if not cls._instance:
+            cls._instance = super().__new__(cls)
+        return cls._instance
+
+    def __init__(self, path: str, *args, **kwargs) -> None:
+        """
+        Initialize the summary from the given save directory.
+        """
+        self.save_dir = path
+        self.file_path = os.path.join(path, "summary.csv")
+
+        self.summary_df: pd.DataFrame = self.load_data_from_file(
+            self.file_path, *args, **kwargs
+        )
+
+    def load_data_from_file(self, path: str, *args, **kwargs) -> pd.DataFrame:
+        """
+        Load the summary file, creating an empty one if it does not exist.
+        """
+        try:
+            if os.path.exists(path):
+                return self.__read_from_csv(path, *args, **kwargs)
+            else:
+                os.makedirs(os.path.dirname(path), exist_ok=True)
+                # Create a new file
+                df = pd.DataFrame(columns=self.__default_columns())
+                df.to_csv(path, index=False)
+                return df
+        except FileNotFoundError:
+            raise FileNotFoundError(f"File not found at {path}")
+
+    def __read_from_csv(self, path: str, *args, **kwargs) -> pd.DataFrame:
+        """
+        Read data from the csv file.
+        """
+        df = pd.read_csv(path)
+        return df
+
+    def __default_columns(self):
+        """
+        Default columns for the summary report.
+        """
+        cols = [
+            "timestamp",
+            "task",
+            "model",
+            "hub",
+            "category",
+            "test_type",
+            "dataset_name",
+            "split",
+            "subset",
+            "total_records",
+            "success_records",
+            "failure_records",
+            "score",
+        ]
+        return cols
+
+    def add_report(
+        self,
+        generated_results: pd.DataFrame,
+    ) -> None:
+        """
+        Add a new report to the summary.
+        """
+        from datetime import datetime
+
+        # Filter the dataframe for accuracy, fairness and representation
+        afr_df = self.__afr(generated_results)
+        not_afr_df = self.__not_afr(generated_results)
+
+        # concatenate the dataframes
+        temp_summary_df = pd.concat([afr_df, not_afr_df], axis=0)
+
+        timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
+        temp_summary_df["timestamp"] = timestamp
+
+        # insert the new rows into the summary dataframe
+        self.summary_df = pd.concat([self.summary_df, temp_summary_df], ignore_index=True)
+
+        # Save the summary to the file
+        self.save_summary()
+
+    def save_summary(self) -> None:
+        """
+        Save the summary to the file.
+        """
+        self.summary_df.to_csv(self.file_path, index=False)
+
+    def __afr(self, df: pd.DataFrame) -> pd.DataFrame:
+        """
+        Filter the dataframe for the accuracy, fairness and representation
+        categories, whose actual_result already holds a score.
+        """
+        df = df[df["category"].isin(["accuracy", "fairness", "representation"])]
+        df = df[self.__group_by_cols() + ["actual_result"]]
+        df = df.rename(columns={"actual_result": "score"})
+
+        return df
+
+    def __not_afr(self, df: pd.DataFrame) -> pd.DataFrame:
+        """
+        Aggregate pass/fail counts and a pass-rate score for all categories
+        other than accuracy, fairness and representation.
+        """
+        df = df[~df["category"].isin(["accuracy", "fairness", "representation"])]
+
+        grouped = df.groupby(self.__group_by_cols())
+
+        # aggregate per group
+        total_records = grouped.size().reset_index(name="total_records")
+        success_records = grouped["pass"].sum().reset_index(name="success_records")
+        score = grouped["pass"].mean().reset_index(name="score")
+        failure_records = grouped.apply(
+            lambda x: np.size(x["pass"]) - np.sum(x["pass"])
+        ).reset_index(name="failure_records")
+
+        # concatenate the dataframes
+        result = pd.concat(
+            [
+                success_records,
+                failure_records["failure_records"],
+                total_records["total_records"],
+                score["score"],
+            ],
+            axis=1,
+        )
+
+        return result
+
+    def __group_by_cols(self):
+        """
+        Columns used to group the generated results.
+        """
+        return [
+            "category",
+            "dataset_name",
+            "test_type",
+            "model",
+            "hub",
+            "split",
+            "subset",
+            "task",
+        ]
+
+    @property
+    def df(self) -> pd.DataFrame:
+        return self.summary_df
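Taken together, Summary persists one row per (model, dataset, test) combination and Leaderboard pivots that file into score tables. A rough sketch of using the two helpers directly, assuming the default ~/.langtest/leaderboard/ directory and that at least one report has already been added via Harness.report():

# Sketch: inspect the leaderboard outside of Harness.
import os
from langtest.utils.benchmark_utils import Leaderboard, Summary

path = os.path.expanduser("~/.langtest/leaderboard/")

summary = Summary(path)              # creates summary.csv with the default columns if missing
print(summary.df.tail())             # raw per-test rows, one block per timestamped run

board = Leaderboard(path)            # reads the same summary.csv (both classes are singletons)
print(board.default())               # model x dataset scores with an "Avg" column
print(board.custom_wise(indices=["split"], columns=["category"]))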
"question-answering", + "model": {"model": "gpt-3.5-turbo-instruct", "hub": "openai"}, + "data": {"data_source": "Fiqa", "split": "test-tiny"}, + }, + "HellaSwag": { + "task": "question-answering", + "model": {"model": "gpt-3.5-turbo-instruct", "hub": "openai"}, + "data": {"data_source": "HellaSwag", "split": "test-tiny"}, + }, + "Consumer-Contracts": { + "task": "question-answering", + "model": {"model": "gpt-3.5-turbo-instruct", "hub": "openai"}, + "data": {"data_source": "Consumer-Contracts", "split": "test"}, + }, + "Contracts": { + "task": "question-answering", + "model": {"model": "gpt-3.5-turbo-instruct", "hub": "openai"}, + "data": {"data_source": "Contracts", "split": "test"}, + }, + "Privacy-Policy": { + "task": "question-answering", + "model": {"model": "gpt-3.5-turbo-instruct", "hub": "openai"}, + "data": {"data_source": "Privacy-Policy", "split": "test"}, + }, + "LogiQA": { + "task": "question-answering", + "model": {"model": "gpt-3.5-turbo-instruct", "hub": "openai"}, + "data": {"data_source": "LogiQA", "split": "test-tiny"}, + }, + "MMLU": { + "task": "question-answering", + "model": {"model": "gpt-3.5-turbo-instruct", "hub": "openai"}, + "data": {"data_source": "MMLU", "split": "test-tiny"}, + }, + "NarrativeQA": { + "task": "question-answering", + "model": {"model": "gpt-3.5-turbo-instruct", "hub": "openai"}, + "data": {"data_source": "NarrativeQA", "split": "test-tiny"}, + }, + "NQ-open": { + "task": "question-answering", + "model": {"model": "gpt-3.5-turbo-instruct", "hub": "openai"}, + "data": {"data_source": "NQ-open", "split": "test-tiny"}, + }, + "OpenBookQA": { + "task": "question-answering", + "model": {"model": "gpt-3.5-turbo-instruct", "hub": "openai"}, + "data": {"data_source": "OpenBookQA", "split": "test-tiny"}, + }, + "PIQA": { + "task": "question-answering", + "model": {"model": "gpt-3.5-turbo-instruct", "hub": "openai"}, + "data": {"data_source": "PIQA", "split": "test-tiny"}, + }, + "Quac": { + "task": "question-answering", + "model": {"model": "gpt-3.5-turbo-instruct", "hub": "openai"}, + "data": {"data_source": "Quac", "split": "test-tiny"}, + }, + "SIQA": { + "task": "question-answering", + "model": {"model": "gpt-3.5-turbo-instruct", "hub": "openai"}, + "data": {"data_source": "SIQA", "split": "test-tiny"}, + }, + "TruthfulQA": { + "task": "question-answering", + "model": {"model": "gpt-3.5-turbo-instruct", "hub": "openai"}, + "data": {"data_source": "TruthfulQA", "split": "test-tiny"}, + }, + "XSum": { + "task": "summarization", + "model": {"model": "gpt-3.5-turbo-instruct", "hub": "openai"}, + "data": {"data_source": "XSum", "split": "test-tiny"}, + }, + "MultiLexSum": { + "task": "summarization", + "model": {"model": "gpt-3.5-turbo-instruct", "hub": "openai"}, + "data": {"data_source": "MultiLexSum", "split": "test-tiny"}, + }, + "MedMCQA": { + "task": "question-answering", + "model": {"model": "gpt-3.5-turbo-instruct", "hub": "openai"}, + "data": { + "data_source": "MedMCQA", + "subset": "MedMCQA-Test", + "split": "Radiology", + }, + }, + "MedQA": { + "task": "question-answering", + "model": {"model": "mistralai/Mistral-7B-Instruct-v0.1", "hub": "huggingface"}, + "data": {"data_source": "MedQA", "split": "test"}, + "config": { + "evaluation": { + "metric": "string_distance", + "distance": "jaro", + "threshold": 0.1, + }, + "tests": { + "defaults": {"min_pass_rate": 0.65}, + "robustness": { + "add_ocr_typo": {"min_pass_rate": 0.66}, + "dyslexia_word_swap": {"min_pass_rate": 0.60}, + }, + }, + }, + }, + "PubMedQA": { + "task": "question-answering", + 
"model": {"model": "j2-jumbo-instruct", "hub": "ai21"}, + "data": {"data_source": "PubMedQA", "split": "pqaa"}, + "config": { + "evaluation": { + "metric": "string_distance", + "distance": "jaro", + "threshold": 0.1, + }, + "tests": { + "defaults": {"min_pass_rate": 0.65}, + "robustness": { + "add_ocr_typo": {"min_pass_rate": 0.66}, + "dyslexia_word_swap": {"min_pass_rate": 0.60}, + }, + }, + }, + }, +} + + +class BenchmarkDatasets: + def __init__(self, task: str, dataset_name: str): + self.__dataset_name = dataset_name + self.__task = task + + @classmethod + def get_dataset_dict(cls, dataset_name: str, task: str) -> dict: + """Get the benchmark dataset configuration for the given dataset name.""" + dataset_config = BENCHMARK_DATASETS_DICT.get(dataset_name, None) + + if dataset_config is None: + raise ValueError( + f"Dataset {dataset_name} not found in the benchmark datasets list - {cls.get_datasets()}." + ) + + return dataset_config.get("data", None) + + @classmethod + def get_datasets(cls) -> List[str]: + """Get the list of benchmark datasets available in the library.""" + return list(BENCHMARK_DATASETS_DICT.keys()) + + @property + def dataset_name(self) -> str: + return self.__dataset_name + + @property + def task(self) -> str: + return self.__task diff --git a/tests/test_datasource.py b/tests/test_datasource.py index 55f2c0e29..d83054f7e 100644 --- a/tests/test_datasource.py +++ b/tests/test_datasource.py @@ -83,10 +83,10 @@ def test_load_raw_data(self, dataset, feature_col, target_col): ( HuggingFaceDataset( source_info={ - "data_source": "wikiann", - "subset": "fo", + "data_source": "tner/wikiann", + "subset": "ace", "feature_column": "tokens", - "target_column": "ner_tags", + "target_column": "tags", "split": "test", }, task=TaskManager("ner"),