45 changes: 45 additions & 0 deletions python-client/giskard/scanner/calibration/issues.py
@@ -3,6 +3,10 @@
from dataclasses import dataclass
from functools import lru_cache

from ...testing.tests.calibration import test_underconfidence_rate

from ...testing.tests.calibration import test_overconfidence_rate

from ..common.examples import ExampleExtractor

from ..issues import Issue
@@ -22,6 +26,7 @@ class CalibrationIssueInfo:
loss_values: pd.Series
fail_idx: pd.DataFrame
threshold: float
p_threshold: float

@property
def metric_rel_delta(self):
@@ -107,6 +112,26 @@ class OverconfidenceIssue(CalibrationIssue):
def metric(self) -> str:
return "Overconfidence rate"

def generate_tests(self, with_names=False) -> list:
abs_threshold = self.info.metric_value_reference * (1 + self.info.threshold)

tests = [
test_overconfidence_rate(
model=self.model,
dataset=self.dataset,
slicing_function=self.info.slice_fn,
threshold=abs_threshold,
p_threshold=self.info.p_threshold,
)
]

if with_names:
names = [f"Overconfidence on data slice “{self.info.slice_fn}”"]

return list(zip(tests, names))

return tests


class UnderconfidenceIssue(CalibrationIssue):
group = "Underconfidence"
@@ -120,3 +145,23 @@ def metric(self) -> str:
@property
def deviation(self):
return f"{self.info.metric_rel_delta * 100:.2f}% than global"

def generate_tests(self, with_names=False) -> list:
abs_threshold = self.info.metric_value_reference * (1 + self.info.threshold)

tests = [
test_underconfidence_rate(
model=self.model,
dataset=self.dataset,
slicing_function=self.info.slice_fn,
threshold=abs_threshold,
p_threshold=self.info.p_threshold,
)
]

if with_names:
names = [f"Underconfidence on data slice “{self.info.slice_fn}”"]

return list(zip(tests, names))

return tests
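
The two `generate_tests` methods above export each scanner finding as a Giskard test, deriving the absolute threshold from the global rate and the tolerated relative deviation. Below is a minimal sketch of that derivation (editor's illustration, not part of the diff), using the same numbers as the unit tests further down: a 15% global rate and a 10% tolerated deviation.

metric_value_reference = 0.15  # overconfidence rate measured on the full dataset
relative_threshold = 0.10      # accepted relative deviation for a data slice
abs_threshold = metric_value_reference * (1 + relative_threshold)  # ~0.165, i.e. the slice may reach at most 16.5%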
@@ -1,7 +1,10 @@
from typing import Sequence
import numpy as np
import pandas as pd

from ...testing.tests.calibration import _default_overconfidence_threshold

from ...testing.tests.calibration import _calculate_overconfidence_score

from ...ml_worker.testing.registry.slicing_function import SlicingFunction
from ...models.base import BaseModel
from ...datasets import Dataset
@@ -29,18 +32,8 @@ def run(self, model: BaseModel, dataset: Dataset):
return super().run(model, dataset)

def _calculate_loss(self, model: BaseModel, dataset: Dataset) -> pd.DataFrame:
true_target = dataset.df.loc[:, dataset.target].values
pred = model.predict(dataset)
label2id = {label: n for n, label in enumerate(model.meta.classification_labels)}

# Empirical cost associated to overconfidence
p_max = pred.probabilities
p_true_label = np.array([pred.raw[n, label2id[label]] for n, label in enumerate(true_target)])

loss_values = p_max - p_true_label
mask = loss_values > 0

return pd.DataFrame({self.LOSS_COLUMN_NAME: loss_values[mask]}, index=dataset.df.index[mask])
loss = _calculate_overconfidence_score(model, dataset).to_frame(self.LOSS_COLUMN_NAME)
return loss[loss[self.LOSS_COLUMN_NAME] > 0]

def _find_issues(
self,
@@ -86,13 +79,9 @@ def _find_issues(
loss_values=meta[self.LOSS_COLUMN_NAME],
fail_idx=fail_idx,
threshold=self.threshold,
p_threshold=p_threshold,
),
)
)

return issues


def _default_overconfidence_threshold(model: BaseModel) -> float:
n = len(model.meta.classification_labels)
return 1 / (3e-1 * (n - 2) + 2 - 1e-3 * (n - 2) ** 2)
@@ -1,7 +1,8 @@
from typing import Sequence
import numpy as np
import pandas as pd

from ...testing.tests.calibration import _calculate_underconfidence_score

from ...ml_worker.testing.registry.slicing_function import SlicingFunction
from ...models.base import BaseModel
from ...datasets import Dataset
@@ -29,15 +30,7 @@ def run(self, model: BaseModel, dataset: Dataset):
return super().run(model, dataset)

def _calculate_loss(self, model: BaseModel, dataset: Dataset) -> pd.DataFrame:
# Empirical cost associated to underconfidence: difference between
# the two most probable classes.
ps = model.predict(dataset).raw

# Relative difference
ps_2 = -np.partition(-ps, 1, axis=-1)[:, :2]
loss_values = ps_2.min(axis=-1) / ps_2.max(axis=-1)

return pd.DataFrame({self.LOSS_COLUMN_NAME: loss_values}, index=dataset.df.index)
return _calculate_underconfidence_score(model, dataset).to_frame(self.LOSS_COLUMN_NAME)

def _find_issues(
self,
@@ -65,6 +58,7 @@ def _find_issues(
fail_idx = sliced_dataset.df[(sliced_dataset.df[self.LOSS_COLUMN_NAME] > self.p_threshold)].index

# Skip non representative slices
# @TODO: do this with a statistical test instead of filtering by count only (GSK-1279)
if len(fail_idx) < 20:
continue

@@ -89,6 +83,7 @@ def _find_issues(
loss_values=meta[self.LOSS_COLUMN_NAME],
fail_idx=fail_idx,
threshold=self.threshold,
p_threshold=self.p_threshold,
),
)
)
147 changes: 147 additions & 0 deletions python-client/giskard/testing/tests/calibration.py
@@ -0,0 +1,147 @@
import numpy as np
import pandas as pd
from typing import Optional

from ...ml_worker.testing.registry.decorators import test

from ...ml_worker.testing.test_result import TestResult

from ...ml_worker.testing.registry.slicing_function import SlicingFunction
from ...datasets.base import Dataset
from ...models.base import BaseModel


def _calculate_overconfidence_score(model: BaseModel, dataset: Dataset) -> pd.Series:
true_target = dataset.df.loc[:, dataset.target].values
pred = model.predict(dataset)
label2id = {label: n for n, label in enumerate(model.meta.classification_labels)}

# Empirical cost associated to overconfidence, i.e. the difference between
# the probability assigned to the predicted label and the correct label.
p_max = pred.probabilities
p_true_label = np.array([pred.raw[n, label2id[label]] for n, label in enumerate(true_target)])

overconfidence_score = p_max - p_true_label
return pd.Series(overconfidence_score, index=dataset.df.index)
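
# Editor's illustration, not part of this file: a self-contained example of the
# overconfidence score above for a hypothetical 3-class model. The score is the gap
# between the probability of the predicted label (the row maximum, which is what
# `pred.probabilities` provides here) and the probability of the true label; it is
# zero whenever the prediction is correct.
import numpy as np

raw = np.array([[0.7, 0.2, 0.1],   # true label 0 -> correct prediction, score 0.0
                [0.6, 0.3, 0.1],   # true label 1 -> wrong, score 0.6 - 0.3 = 0.3
                [0.5, 0.1, 0.4]])  # true label 2 -> wrong, score 0.5 - 0.4 = 0.1
true_idx = np.array([0, 1, 2])
p_max = raw.max(axis=1)
p_true = raw[np.arange(len(raw)), true_idx]
score = p_max - p_true  # approximately [0.0, 0.3, 0.1]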


def _default_overconfidence_threshold(model: BaseModel) -> float:
n = len(model.meta.classification_labels)
return 1 / (3e-1 * (n - 2) + 2 - 1e-3 * (n - 2) ** 2)
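
# Editor's illustration, not part of this file: the default p_threshold above for a
# few class counts, computed with the same formula so its magnitude is easier to read.
# The more classes the model has, the smaller the probability gap that already counts
# as overconfident.
for n in (2, 3, 5, 10):
    print(n, round(1 / (3e-1 * (n - 2) + 2 - 1e-3 * (n - 2) ** 2), 3))
# 2 0.5, 3 0.435, 5 0.346, 10 0.231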


@test(name="Overconfidence Rate", tags=["classification"])
def test_overconfidence_rate(
model: BaseModel,
dataset: Dataset,
slicing_function: Optional[SlicingFunction] = None,
threshold: Optional[float] = 0.10,
p_threshold: Optional[float] = None,
):
"""Tests that the rate of overconfident predictions is below a threshold.

Overconfident predictions are defined as predictions where the model
assigned a large probability to the wrong label. We quantify this as the
difference between the largest probability assigned to a label and the
probability assigned to the correct label (this will be 0 if the model
made the correct prediction). If this is larger than a threshold
(`p_threshold`, typically determined automatically depending on the number
of classes), then the prediction is considered overconfident.
We then calculate the rate of overconfident predictions as the number of
overconfident samples divided by the total number of wrongly predicted
samples, and check that it is below a user-specified threshold.

Arguments:
model(BaseModel): The model to test.
dataset(Dataset): The dataset to test the model on.
slicing_function(SlicingFunction, optional): An optional slicing
function used to slice the dataset before testing. If not provided,
the whole dataset will be considered in calculating the
overconfidence rate.
threshold(float, optional): The threshold for the overconfidence rate,
i.e. the maximum accepted ratio of overconfident samples to wrongly
predicted samples. Default is 0.10 (10%).
p_threshold(float, optional): The threshold on the difference between
the probability assigned to the predicted (wrong) label and the
probability of the correct label, above which a prediction is
considered overconfident. If not provided, it is determined
automatically based on the number of classes.
"""
if not model.is_classification:
raise ValueError("This test is only applicable to classification models.")

if slicing_function is not None:
dataset = dataset.slice(slicing_function)

overconfidence_score = _calculate_overconfidence_score(model, dataset)

if p_threshold is None:
p_threshold = _default_overconfidence_threshold(model)

rate = (overconfidence_score[overconfidence_score > 0].dropna() > p_threshold).mean()
passed = rate < threshold

return TestResult(passed=passed, metric=rate)
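
# Editor's illustration, not part of this file: the rate computation above on a
# hypothetical score series. Correct predictions score 0 and are filtered out, so the
# rate is the share of wrong predictions whose score exceeds p_threshold.
import pandas as pd

scores = pd.Series([0.0, 0.0, 0.3, 0.6, 0.1])      # three wrong predictions: 0.3, 0.6, 0.1
rate = (scores[scores > 0].dropna() > 0.5).mean()  # 1 out of 3 -> ~0.33
passed = rate < 0.10                               # would fail against the default 10% threshold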


def _calculate_underconfidence_score(model: BaseModel, dataset: Dataset) -> pd.Series:
# Empirical cost associated to underconfidence: difference between the two
# most probable classes.
ps = model.predict(dataset).raw

# Relative difference
ps_2 = -np.partition(-ps, 1, axis=-1)[:, :2]
score_values = ps_2.min(axis=-1) / ps_2.max(axis=-1)

return pd.Series(score_values, index=dataset.df.index)
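
# Editor's illustration, not part of this file: the partition trick above on two
# hypothetical probability rows. Each row keeps its two largest probabilities and the
# sample is scored by their ratio (second largest / largest), so values close to 1
# mean the model nearly hesitated between two labels.
import numpy as np

ps = np.array([[0.50, 0.30, 0.20],
               [0.48, 0.47, 0.05]])
ps_2 = -np.partition(-ps, 1, axis=-1)[:, :2]   # two largest per row: [0.50, 0.30] and [0.48, 0.47]
score = ps_2.min(axis=-1) / ps_2.max(axis=-1)  # approximately [0.60, 0.98]; the second row is underconfident at p_threshold=0.90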


@test(name="Underconfidence Rate", tags=["classification"])
def test_underconfidence_rate(
model: BaseModel,
dataset: Dataset,
slicing_function: Optional[SlicingFunction] = None,
threshold: Optional[float] = 0.10,
p_threshold: float = 0.90,
):
"""Tests that the rate of underconfident predictions is below a threshold.

Underconfident predictions are defined as predictions where the two most
probable labels have very similar probabilities. In this case, slight
changes can make the model flip its predicted label. By default, we mark a
prediction as underconfident when the probability of the second most
probable label is within 10% of the probability of the predicted label
(`p_threshold=0.90`).
We then calculate the rate of underconfident predictions as the number of
underconfident samples divided by the total number of samples, and check
that it is below the user-specified threshold.


Arguments:
model(BaseModel): The model to test.
dataset(Dataset): The dataset to test the model on.
slicing_function(SlicingFunction, optional): An optional slicing
function used to slice the dataset before testing. If not provided,
the whole dataset will be considered in calculating the
underconfidence rate.
threshold(float, optional): The threshold for underconfident prediction
rate. Default is 0.10 (10%).
p_threshold(float, optional): The threshold on the ratio between the
probability of the second most probable prediction and the highest
probability. If the ratio is greater than this value, the prediction
is considered underconfident. Default is 0.90, i.e. when the second
most probable prediction reaches 90% or more of the highest
probability, the sample is considered underconfident.
"""
if not model.is_classification:
raise ValueError("This test is only applicable to classification models.")

if slicing_function is not None:
dataset = dataset.slice(slicing_function)

underconfidence_score = _calculate_underconfidence_score(model, dataset)

rate = (underconfidence_score.dropna() > p_threshold).mean()
passed = rate < threshold

return TestResult(passed=passed, metric=rate)
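
Unlike the overconfidence test, the underconfidence rate above is computed over all samples rather than only the misclassified ones. A minimal sketch of the pass/fail arithmetic (editor's illustration, not part of the diff), with four hypothetical underconfidence scores:

import pandas as pd

scores = pd.Series([0.95, 0.40, 0.92, 0.20])  # two samples above p_threshold = 0.90
rate = (scores.dropna() > 0.90).mean()        # -> 0.5
passed = rate < 0.10                          # 50% underconfident predictions, so the test fails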
77 changes: 77 additions & 0 deletions python-client/tests/scan/test_calibration_issues.py
@@ -0,0 +1,77 @@
from unittest.mock import MagicMock
import pandas as pd
from pytest import approx
from giskard import Model, Dataset, slicing_function
from giskard.scanner.calibration.issues import CalibrationIssueInfo, OverconfidenceIssue, UnderconfidenceIssue


def test_underconfidence_issue_exports_test():
data = MagicMock(Dataset)
model = MagicMock(Model)

@slicing_function(row_level=False)
def my_slice(df):
return df.head(10)

info = CalibrationIssueInfo(
my_slice,
10,
metric_value_slice=0.3,
metric_value_reference=0.1,
loss_values=pd.Series(),
fail_idx=[],
threshold=0.8,
p_threshold=0.81,
)
issue = UnderconfidenceIssue(model, data, "major", info)

tests = issue.generate_tests()

assert len(tests) == 1

the_test = tests[0]

assert the_test.meta.name == "test_underconfidence_rate"
assert the_test.params["model"] == model
assert the_test.params["dataset"] == data
assert the_test.params["p_threshold"] == approx(0.81)
assert the_test.params["slicing_function"] == my_slice

# Global rate is 10% (`metric_value_reference`) and we accept an 80% deviation, thus up to 18%:
assert the_test.params["threshold"] == approx(0.18)


def test_overconfidence_issue_exports_test():
data = MagicMock(Dataset)
model = MagicMock(Model)

@slicing_function(row_level=False)
def my_slice(df):
return df.head(10)

info = CalibrationIssueInfo(
my_slice,
10,
metric_value_slice=0.3,
metric_value_reference=0.15,
loss_values=pd.Series(),
fail_idx=[],
threshold=0.10,
p_threshold=0.5,
)
issue = OverconfidenceIssue(model, data, "major", info)

tests = issue.generate_tests()

assert len(tests) == 1

the_test = tests[0]

assert the_test.meta.name == "test_overconfidence_rate"
assert the_test.params["model"] == model
assert the_test.params["dataset"] == data
assert the_test.params["p_threshold"] == approx(0.5)
assert the_test.params["slicing_function"] == my_slice

# Global rate is 15% (`metric_value_reference`) and we accept a 10% deviation, thus up to 16.5%:
assert the_test.params["threshold"] == approx(0.165)