45 changes: 45 additions & 0 deletions python-client/giskard/scanner/calibration/issues.py
@@ -3,6 +3,10 @@
from dataclasses import dataclass
from functools import lru_cache

from ...testing.tests.calibration import test_underconfidence_rate

from ...testing.tests.calibration import test_overconfidence_rate

from ..common.examples import ExampleExtractor

from ..issues import Issue
@@ -22,6 +26,7 @@ class CalibrationIssueInfo:
loss_values: pd.Series
fail_idx: pd.DataFrame
threshold: float
p_threshold: float

@property
def metric_rel_delta(self):
@@ -107,6 +112,26 @@ class OverconfidenceIssue(CalibrationIssue):
def metric(self) -> str:
return "Overconfidence rate"

def generate_tests(self, with_names=False) -> list:
abs_threshold = self.info.metric_value_reference * (1 + self.info.threshold)

tests = [
test_overconfidence_rate(
model=self.model,
dataset=self.dataset,
slicing_function=self.info.slice_fn,
threshold=abs_threshold,
p_threshold=self.info.p_threshold,
)
]

if with_names:
names = [f"Overconfidence on data slice “{self.info.slice_fn}”"]

return list(zip(tests, names))

return tests


class UnderconfidenceIssue(CalibrationIssue):
group = "Underconfidence"
@@ -120,3 +145,23 @@ def metric(self) -> str:
@property
def deviation(self):
return f"{self.info.metric_rel_delta * 100:.2f}% than global"

def generate_tests(self, with_names=False) -> list:
abs_threshold = self.info.metric_value_reference * (1 + self.info.threshold)

tests = [
test_underconfidence_rate(
model=self.model,
dataset=self.dataset,
slicing_function=self.info.slice_fn,
threshold=abs_threshold,
p_threshold=self.info.p_threshold,
)
]

if with_names:
names = [f"Underconfidence on data slice “{self.info.slice_fn}”"]

return list(zip(tests, names))

return tests
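
The two `generate_tests` methods above export each scanner finding as a Giskard test, deriving the absolute threshold from the global rate and the tolerated relative deviation. Below is a minimal sketch of that derivation (editor's illustration, not part of the diff), using the same numbers as the unit tests further down: a 15% global rate and a 10% tolerated deviation.

metric_value_reference = 0.15  # overconfidence rate measured on the full dataset
relative_threshold = 0.10      # accepted relative deviation for a data slice
abs_threshold = metric_value_reference * (1 + relative_threshold)  # ~0.165, i.e. the slice may reach at most 16.5%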
@@ -1,7 +1,10 @@
from typing import Sequence
import numpy as np
import pandas as pd

from ...testing.tests.calibration import _default_overconfidence_threshold

from ...testing.tests.calibration import _calculate_overconfidence_score

from ...ml_worker.testing.registry.slicing_function import SlicingFunction
from ...models.base import BaseModel
from ...datasets import Dataset
@@ -29,18 +32,8 @@ def run(self, model: BaseModel, dataset: Dataset):
return super().run(model, dataset)

def _calculate_loss(self, model: BaseModel, dataset: Dataset) -> pd.DataFrame:
true_target = dataset.df.loc[:, dataset.target].values
pred = model.predict(dataset)
label2id = {label: n for n, label in enumerate(model.meta.classification_labels)}

# Empirical cost associated to overconfidence
p_max = pred.probabilities
p_true_label = np.array([pred.raw[n, label2id[label]] for n, label in enumerate(true_target)])

loss_values = p_max - p_true_label
mask = loss_values > 0

return pd.DataFrame({self.LOSS_COLUMN_NAME: loss_values[mask]}, index=dataset.df.index[mask])
loss = _calculate_overconfidence_score(model, dataset).to_frame(self.LOSS_COLUMN_NAME)
return loss[loss[self.LOSS_COLUMN_NAME] > 0]

def _find_issues(
self,
@@ -86,13 +79,9 @@ def _find_issues(
loss_values=meta[self.LOSS_COLUMN_NAME],
fail_idx=fail_idx,
threshold=self.threshold,
p_threshold=p_threshold,
),
)
)

return issues


def _default_overconfidence_threshold(model: BaseModel) -> float:
n = len(model.meta.classification_labels)
return 1 / (3e-1 * (n - 2) + 2 - 1e-3 * (n - 2) ** 2)
@@ -1,7 +1,8 @@
from typing import Sequence
import numpy as np
import pandas as pd

from ...testing.tests.calibration import _calculate_underconfidence_score

from ...ml_worker.testing.registry.slicing_function import SlicingFunction
from ...models.base import BaseModel
from ...datasets import Dataset
@@ -29,15 +30,7 @@ def run(self, model: BaseModel, dataset: Dataset):
return super().run(model, dataset)

def _calculate_loss(self, model: BaseModel, dataset: Dataset) -> pd.DataFrame:
# Empirical cost associated to underconfidence: difference between
# the two most probable classes.
ps = model.predict(dataset).raw

# Relative difference
ps_2 = -np.partition(-ps, 1, axis=-1)[:, :2]
loss_values = ps_2.min(axis=-1) / ps_2.max(axis=-1)

return pd.DataFrame({self.LOSS_COLUMN_NAME: loss_values}, index=dataset.df.index)
return _calculate_underconfidence_score(model, dataset).to_frame(self.LOSS_COLUMN_NAME)

def _find_issues(
self,
@@ -65,6 +58,7 @@ def _find_issues(
fail_idx = sliced_dataset.df[(sliced_dataset.df[self.LOSS_COLUMN_NAME] > self.p_threshold)].index

# Skip non representative slices
# @TODO: do this with a statistical test instead of filtering by count only (GSK-1279)
if len(fail_idx) < 20:
continue

@@ -89,6 +83,7 @@ def _find_issues(
loss_values=meta[self.LOSS_COLUMN_NAME],
fail_idx=fail_idx,
threshold=self.threshold,
p_threshold=self.p_threshold,
),
)
)
147 changes: 147 additions & 0 deletions python-client/giskard/testing/tests/calibration.py
@@ -0,0 +1,147 @@
import numpy as np
import pandas as pd
from typing import Optional

from ...ml_worker.testing.registry.decorators import test

from ...ml_worker.testing.test_result import TestResult

from ...ml_worker.testing.registry.slicing_function import SlicingFunction
from ...datasets.base import Dataset
from ...models.base import BaseModel


def _calculate_overconfidence_score(model: BaseModel, dataset: Dataset) -> pd.Series:
true_target = dataset.df.loc[:, dataset.target].values
pred = model.predict(dataset)
label2id = {label: n for n, label in enumerate(model.meta.classification_labels)}

# Empirical cost associated to overconfidence, i.e. the difference between
# the probability assigned to the predicted label and the correct label.
p_max = pred.probabilities
p_true_label = np.array([pred.raw[n, label2id[label]] for n, label in enumerate(true_target)])

overconfidence_score = p_max - p_true_label
return pd.Series(overconfidence_score, index=dataset.df.index)
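
# Editor's illustration, not part of this file: a self-contained example of the
# overconfidence score above for a hypothetical 3-class model. The score is the gap
# between the probability of the predicted label (the row maximum, which is what
# `pred.probabilities` provides here) and the probability of the true label; it is
# zero whenever the prediction is correct.
import numpy as np

raw = np.array([[0.7, 0.2, 0.1],   # true label 0 -> correct prediction, score 0.0
                [0.6, 0.3, 0.1],   # true label 1 -> wrong, score 0.6 - 0.3 = 0.3
                [0.5, 0.1, 0.4]])  # true label 2 -> wrong, score 0.5 - 0.4 = 0.1
true_idx = np.array([0, 1, 2])
p_max = raw.max(axis=1)
p_true = raw[np.arange(len(raw)), true_idx]
score = p_max - p_true  # approximately [0.0, 0.3, 0.1]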


def _default_overconfidence_threshold(model: BaseModel) -> float:
n = len(model.meta.classification_labels)
return 1 / (3e-1 * (n - 2) + 2 - 1e-3 * (n - 2) ** 2)
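
# Editor's illustration, not part of this file: the default p_threshold above for a
# few class counts, computed with the same formula so its magnitude is easier to read.
# The more classes the model has, the smaller the probability gap that already counts
# as overconfident.
for n in (2, 3, 5, 10):
    print(n, round(1 / (3e-1 * (n - 2) + 2 - 1e-3 * (n - 2) ** 2), 3))
# 2 0.5, 3 0.435, 5 0.346, 10 0.231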


@test(name="Overconfidence Rate", tags=["classification"])
def test_overconfidence_rate(
model: BaseModel,
dataset: Dataset,
slicing_function: Optional[SlicingFunction] = None,
threshold: Optional[float] = 0.10,
p_threshold: Optional[float] = None,
):
"""Tests that the rate of overconfident predictions is below a threshold.

Overconfident predictions are defined as predictions where the model
assigned a large probability to the wrong label. We quantify this as the
difference between the largest probability assigned to a label and the
probability assigned to the correct label (this will be 0 if the model
made the correct prediction). If this is larger than a threshold
(`p_threshold`, typically determined automatically depending on the number
of classes), then the prediction is considered overconfident.
We then calculate the rate of overconfident predictions as the number of
overconfident samples divided by the total number of wrongly predicted
samples, and check that it is below a user-specified threshold.

Arguments:
model(BaseModel): The model to test.
dataset(Dataset): The dataset to test the model on.
slicing_function(SlicingFunction, optional): An optional slicing
function used to slice the dataset before testing. If not provided,
the whole dataset will be considered in calculating the
overconfidence rate.
threshold(float, optional): The threshold for the overconfidence rate,
i.e. the maximum accepted ratio of overconfident samples to wrongly
predicted samples. Default is 0.10 (10%).
p_threshold(float, optional): The threshold on the difference between
the probability assigned to the predicted (wrong) label and the
probability of the correct label, above which a prediction is
considered overconfident. If not provided, it is determined
automatically based on the number of classes.
"""
if not model.is_classification:
raise ValueError("This test is only applicable to classification models.")

if slicing_function is not None:
dataset = dataset.slice(slicing_function)

overconfidence_score = _calculate_overconfidence_score(model, dataset)

if p_threshold is None:
p_threshold = _default_overconfidence_threshold(model)

rate = (overconfidence_score[overconfidence_score > 0].dropna() > p_threshold).mean()
passed = rate < threshold

return TestResult(passed=passed, metric=rate)
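
# Editor's illustration, not part of this file: the rate computation above on a
# hypothetical score series. Correct predictions score 0 and are filtered out, so the
# rate is the share of wrong predictions whose score exceeds p_threshold.
import pandas as pd

scores = pd.Series([0.0, 0.0, 0.3, 0.6, 0.1])      # three wrong predictions: 0.3, 0.6, 0.1
rate = (scores[scores > 0].dropna() > 0.5).mean()  # 1 out of 3 -> ~0.33
passed = rate < 0.10                               # would fail against the default 10% threshold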


def _calculate_underconfidence_score(model: BaseModel, dataset: Dataset) -> pd.Series:
# Empirical cost associated to underconfidence: difference between the two
# most probable classes.
ps = model.predict(dataset).raw

# Relative difference
ps_2 = -np.partition(-ps, 1, axis=-1)[:, :2]
score_values = ps_2.min(axis=-1) / ps_2.max(axis=-1)

return pd.Series(score_values, index=dataset.df.index)
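
# Editor's illustration, not part of this file: the partition trick above on two
# hypothetical probability rows. Each row keeps its two largest probabilities and the
# sample is scored by their ratio (second largest / largest), so values close to 1
# mean the model nearly hesitated between two labels.
import numpy as np

ps = np.array([[0.50, 0.30, 0.20],
               [0.48, 0.47, 0.05]])
ps_2 = -np.partition(-ps, 1, axis=-1)[:, :2]   # two largest per row: [0.50, 0.30] and [0.48, 0.47]
score = ps_2.min(axis=-1) / ps_2.max(axis=-1)  # approximately [0.60, 0.98]; the second row is underconfident at p_threshold=0.90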


@test(name="Underconfidence Rate", tags=["classification"])
def test_underconfidence_rate(
model: BaseModel,
dataset: Dataset,
slicing_function: Optional[SlicingFunction] = None,
threshold: Optional[float] = 0.10,
p_threshold: float = 0.90,
):
"""Tests that the rate of underconfident predictions is below a threshold.

Underconfident predictions are defined as predictions where the two most
probable labels have very similar probabilities. In this case, slight
changes can make the model flip its predicted label. By default, we mark a
prediction as underconfident when the probability of the second most
probable label is within 10% of the probability of the predicted label
(`p_threshold=0.90`).
We then calculate the rate of underconfident predictions as the number of
underconfident samples divided by the total number of samples, and check
that it is below the user-specified threshold.


Arguments:
model(BaseModel): The model to test.
dataset(Dataset): The dataset to test the model on.
slicing_function(SlicingFunction, optional): An optional slicing
function used to slice the dataset before testing. If not provided,
the whole dataset will be considered in calculating the
underconfidence rate.
threshold(float, optional): The threshold for underconfident prediction
rate. Default is 0.10 (10%).
p_threshold(float, optional): The threshold on the ratio between the
probability of the second most probable prediction and the highest
probability. If the ratio is greater than this value, the prediction
is considered underconfident. Default is 0.90, i.e. when the second
most probable prediction reaches 90% or more of the highest
probability, the sample is considered underconfident.
"""
if not model.is_classification:
raise ValueError("This test is only applicable to classification models.")

if slicing_function is not None:
dataset = dataset.slice(slicing_function)

underconfidence_score = _calculate_underconfidence_score(model, dataset)

rate = (underconfidence_score.dropna() > p_threshold).mean()
passed = rate < threshold

return TestResult(passed=passed, metric=rate)
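
Unlike the overconfidence test, the underconfidence rate above is computed over all samples rather than only the misclassified ones. A minimal sketch of the pass/fail arithmetic (editor's illustration, not part of the diff), with four hypothetical underconfidence scores:

import pandas as pd

scores = pd.Series([0.95, 0.40, 0.92, 0.20])  # two samples above p_threshold = 0.90
rate = (scores.dropna() > 0.90).mean()        # -> 0.5
passed = rate < 0.10                          # 50% underconfident predictions, so the test fails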
77 changes: 77 additions & 0 deletions python-client/tests/scan/test_calibration_issues.py
@@ -0,0 +1,77 @@
from unittest.mock import MagicMock
import pandas as pd
from pytest import approx
from giskard import Model, Dataset, slicing_function
from giskard.scanner.calibration.issues import CalibrationIssueInfo, OverconfidenceIssue, UnderconfidenceIssue


def test_underconfidence_issue_exports_test():
data = MagicMock(Dataset)
model = MagicMock(Model)

@slicing_function(row_level=False)
def my_slice(df):
return df.head(10)

info = CalibrationIssueInfo(
my_slice,
10,
metric_value_slice=0.3,
metric_value_reference=0.1,
loss_values=pd.Series(),
fail_idx=[],
threshold=0.8,
p_threshold=0.81,
)
issue = UnderconfidenceIssue(model, data, "major", info)

tests = issue.generate_tests()

assert len(tests) == 1

the_test = tests[0]

assert the_test.meta.name == "test_underconfidence_rate"
assert the_test.params["model"] == model
assert the_test.params["dataset"] == data
assert the_test.params["p_threshold"] == approx(0.81)
assert the_test.params["slicing_function"] == my_slice

# Global rate is 10% (`metric_value_reference`) and we accept an 80% deviation, thus up to 18%:
assert the_test.params["threshold"] == approx(0.18)


def test_overconfidence_issue_exports_test():
data = MagicMock(Dataset)
model = MagicMock(Model)

@slicing_function(row_level=False)
def my_slice(df):
return df.head(10)

info = CalibrationIssueInfo(
my_slice,
10,
metric_value_slice=0.3,
metric_value_reference=0.15,
loss_values=pd.Series(),
fail_idx=[],
threshold=0.10,
p_threshold=0.5,
)
issue = OverconfidenceIssue(model, data, "major", info)

tests = issue.generate_tests()

assert len(tests) == 1

the_test = tests[0]

assert the_test.meta.name == "test_overconfidence_rate"
assert the_test.params["model"] == model
assert the_test.params["dataset"] == data
assert the_test.params["p_threshold"] == approx(0.5)
assert the_test.params["slicing_function"] == my_slice

# Global rate is 15% (`metric_value_reference`) and we accept a 10% deviation, thus up to 16.5%:
assert the_test.params["threshold"] == approx(0.165)