17 changes: 16 additions & 1 deletion giskard/scanner/performance/metrics.py
@@ -15,6 +15,7 @@ class MetricResult:
value: float
affected_samples: int
raw_values: Optional[np.ndarray] = None
binary_counts: Optional[list[int]] = None

@property
def name(self):
@@ -27,6 +28,7 @@ def __str__(self):
class PerformanceMetric(ABC):
name: str
greater_is_better = True
has_binary_counts = False

@abstractmethod
def __call__(self, model: BaseModel, dataset: Dataset) -> MetricResult:
@@ -43,7 +45,9 @@ def __call__(self, model: BaseModel, dataset: Dataset) -> MetricResult:

value = self._calculate_metric(y_true, y_pred, model)
num_affected = self._calculate_affected_samples(y_true, y_pred, model)
return MetricResult(self, value, num_affected)
binary_counts = self._calculate_binary_counts(value, num_affected) if self.has_binary_counts else None

return MetricResult(self, value, num_affected, binary_counts=binary_counts)

@abstractmethod
def _calculate_metric(self, y_true: np.ndarray, y_pred: np.ndarray, model: BaseModel) -> MetricResult:
@@ -52,10 +56,16 @@ def _calculate_metric(self, y_true: np.ndarray, y_pred: np.ndarray, model: BaseM
def _calculate_affected_samples(self, y_true: np.ndarray, y_pred: np.ndarray, model: BaseModel) -> int:
return len(y_true)

def _calculate_binary_counts(self, value, num_affected) -> list[int]:
x = round(value * num_affected)
y = num_affected - x
return [x, y]


class Accuracy(ClassificationPerformanceMetric):
name = "Accuracy"
greater_is_better = True
has_binary_counts = True

def _calculate_metric(self, y_true: np.ndarray, y_pred: np.ndarray, model: BaseModel):
return sklearn.metrics.accuracy_score(y_true, y_pred)
@@ -64,6 +74,7 @@ def _calculate_metric(self, y_true: np.ndarray, y_pred: np.ndarray, model: BaseM
class BalancedAccuracy(ClassificationPerformanceMetric):
name = "Balanced Accuracy"
greater_is_better = True
has_binary_counts = False
Member: not needed


def _calculate_metric(self, y_true: np.ndarray, y_pred: np.ndarray, model: BaseModel):
return sklearn.metrics.balanced_accuracy_score(y_true, y_pred)
@@ -89,6 +100,7 @@ def _calculate_metric(self, y_true: np.ndarray, y_pred: np.ndarray, model: BaseM
class F1Score(SklearnClassificationScoreMixin, ClassificationPerformanceMetric):
name = "F1 Score"
greater_is_better = True
has_binary_counts = False
Member: not needed

_sklearn_metric = "f1_score"

def _calculate_affected_samples(self, y_true: np.ndarray, y_pred: np.ndarray, model: BaseModel) -> int:
@@ -104,6 +116,7 @@ def _calculate_affected_samples(self, y_true: np.ndarray, y_pred: np.ndarray, mo
class Precision(SklearnClassificationScoreMixin, ClassificationPerformanceMetric):
name = "Precision"
greater_is_better = True
has_binary_counts = True
_sklearn_metric = "precision_score"

def _calculate_affected_samples(self, y_true: np.ndarray, y_pred: np.ndarray, model: BaseModel) -> int:
@@ -116,6 +129,7 @@ def _calculate_affected_samples(self, y_true: np.ndarray, y_pred: np.ndarray, mo
class Recall(SklearnClassificationScoreMixin, ClassificationPerformanceMetric):
name = "Recall"
greater_is_better = True
has_binary_counts = True
_sklearn_metric = "recall_score"

def _calculate_affected_samples(self, y_true: np.ndarray, y_pred: np.ndarray, model: BaseModel) -> int:
@@ -128,6 +142,7 @@ def _calculate_affected_samples(self, y_true: np.ndarray, y_pred: np.ndarray, mo
class AUC(PerformanceMetric):
name = "ROC AUC"
greater_is_better = True
has_binary_counts = False
Member: not needed


def __call__(self, model: BaseModel, dataset: Dataset) -> MetricResult:
y_true = dataset.df[dataset.target]
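For context, a minimal standalone sketch (not part of this diff) of what the new binary_counts field encodes for a score-like metric such as Accuracy: the affected samples are split into estimated success/failure counts by rounding value * affected_samples, mirroring _calculate_binary_counts above. The accuracy and n_samples numbers below are invented for illustration.

# Illustrative only: how binary counts are derived from a metric value.
accuracy = 0.85      # metric value on a slice (made-up number)
n_samples = 200      # affected samples in that slice (made-up number)

successes = round(accuracy * n_samples)  # 170 samples counted as correct
failures = n_samples - successes         # 30 samples counted as incorrect
binary_counts = [successes, failures]    # [170, 30]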
82 changes: 62 additions & 20 deletions giskard/scanner/performance/performance_bias_detector.py
@@ -179,7 +179,7 @@ def _detect_for_metric(
p_values = []
compute_pvalue = self.alpha is not None
for slice_fn in slices:
sliced_dataset, slice_metric, p_value = _calculate_slice_metrics(
slice_dataset, slice_metric, p_value = _calculate_slice_metrics(
model, dataset, metric, slice_fn, with_pvalue=compute_pvalue
)

@@ -233,7 +233,7 @@ def _detect_for_metric(
"deviation_perc": round(relative_delta * 100, 2),
"abs_deviation_perc": round(abs(relative_delta) * 100, 2),
"comparison_op": "lower" if metric.greater_is_better else "greater",
"slice_size": len(sliced_dataset),
"slice_size": len(slice_dataset),
"threshold": self.threshold,
"p_value": p_value,
},
@@ -303,36 +303,78 @@ def _is_unbalanced_target(classes: pd.Series):
return (classes.value_counts() / classes.count()).std() > 0.2


def _calculate_slice_metrics(model, dataset, metric, slice_fn, with_pvalue=False):
sliced_dataset = dataset.slice(slice_fn)
slice_metric = metric(model, sliced_dataset)
def _calculate_pvalue_from_contingency_table(slice_metric, comp_metric, max_size_fisher=30):
ctable = [slice_metric.binary_counts, comp_metric.binary_counts]

# if any cell of the contingency table is small, use Fisher's exact test; otherwise use a G-test
if min(min(row) for row in ctable) <= max_size_fisher:
logger.debug("PerformanceBiasDetector: Fisher's exact test")
return scipy.stats.fisher_exact(ctable, alternative="two-sided")[1]
logger.debug("PerformanceBiasDetector: G-test")
return scipy.stats.chi2_contingency(ctable, correction=False, lambda_="log-likelihood")[1]


def _calculate_pvalue_from_permutation_test(
slice_dataset, comp_dataset, dataset, model, metric, perm_test_resamples=1000
):
logger.debug("PerformanceBiasDetector: permutation test")

def statistic(slice_ids, comp_ids):
perm_slice_dataset = Dataset(
dataset.df.loc[slice_ids],
target=dataset.target,
)
perm_comp_dataset = Dataset(
dataset.df.loc[comp_ids],
target=dataset.target,
)
return metric(model, perm_slice_dataset).value - metric(model, perm_comp_dataset).value

slice_ids = slice_dataset.df.index.values
comp_ids = comp_dataset.df.index.values
perm_test_result = scipy.stats.permutation_test(
(slice_ids, comp_ids),
statistic=statistic,
permutation_type="independent",
n_resamples=perm_test_resamples,
alternative="two-sided",
)
return perm_test_result.pvalue


def _calculate_slice_metrics(
model, dataset, metric, slice_fn, with_pvalue=False, max_size_fisher=30, perm_test_resamples=1000
):
slice_dataset = dataset.slice(slice_fn)
slice_metric = metric(model, slice_dataset)

if not with_pvalue:
return sliced_dataset, slice_metric, None
return slice_dataset, slice_metric, None

# Perform statistical tests
complementary_dataset = dataset.slice(lambda df: df[~df.index.isin(sliced_dataset.df.index)], row_level=False)
comp_metric = metric(model, complementary_dataset)
comp_dataset = dataset.slice(lambda df: df[~df.index.isin(slice_dataset.df.index)], row_level=False)
comp_metric = metric(model, comp_dataset)

try:
# If we have raw values for the metric, we perform a standard t-test
logger.debug(f"PerformanceBiasDetector: metric name = {slice_metric.name}")
if slice_metric.raw_values is not None:
logger.debug("PerformanceBiasDetector: t-test")
alternative = "less" if metric.greater_is_better else "greater"
_, pvalue = scipy.stats.ttest_ind(
slice_metric.raw_values, comp_metric.raw_values, equal_var=False, alternative=alternative
)
elif metric.has_binary_counts:
# otherwise, if the metric provides binary counts (classification scores), use a contingency-table test
pvalue = _calculate_pvalue_from_contingency_table(slice_metric, comp_metric, max_size_fisher)
else:
# otherwise, this must be classification scores, so we perform a G-test
slice_x_cnt = round(slice_metric.value * slice_metric.affected_samples)
slice_y_cnt = slice_metric.affected_samples - slice_x_cnt

comp_x_cnt = round(comp_metric.value * comp_metric.affected_samples)
comp_y_cnt = comp_metric.affected_samples - comp_x_cnt

ctable = [[slice_x_cnt, slice_y_cnt], [comp_x_cnt, comp_y_cnt]]

pvalue = scipy.stats.chi2_contingency(ctable, lambda_="log-likelihood")[1]
except ValueError:
# if the contingency table cannot be calculated, fall back to a permutation test
pvalue = _calculate_pvalue_from_permutation_test(
slice_dataset, comp_dataset, dataset, model, metric, perm_test_resamples
)
except ValueError as err:
pvalue = np.nan
logger.debug(f"PerformanceBiasDetector: p-value could not be calculated: {err}")

return sliced_dataset, slice_metric, pvalue
logger.debug(f"PerformanceBiasDetector: p-value = {pvalue}")
return slice_dataset, slice_metric, pvalue
93 changes: 64 additions & 29 deletions tests/scan/test_performance_bias_detector.py
@@ -92,67 +92,102 @@ def test_selects_issues_with_benjamini_hochberg(titanic_model, titanic_dataset):
issues = detector.run(titanic_model, titanic_dataset, features=["Name", "Sex", "Pclass"])
assert len(issues) == 3

detector = PerformanceBiasDetector(alpha=1e-10)
detector = PerformanceBiasDetector(alpha=1e-15)

issues = detector.run(titanic_model, titanic_dataset, features=["Name", "Sex", "Pclass"])
assert len(issues) == 2


def test_calculate_slice_metrics():
SLICE_SIZE = 500
rng = np.random.RandomState(42)

# Create a mock model and dataset
model = mock.MagicMock()
dataset = Dataset(pd.DataFrame({"x": [1, 2, 3], "y": [4, 5, 6]}), target="y")
dataset = Dataset(pd.DataFrame({"x": np.arange(5001), "y": rng.randint(1, 6, 5001)}), target="y")

def metric(model, dataset):
if len(dataset) == 2: # slice
return mock.MagicMock(value=0.4, affected_samples=11, raw_values=None)

return mock.MagicMock(value=0.46, affected_samples=32, raw_values=None)

metric.greater_is_better = True
# fraction of samples with y % 5 == 0, i.e. about 20% of this dataset
value = (dataset.df.y % 5 == 0).sum() / len(dataset)
affected_samples = len(dataset)
raw_values = None
x = round(value * affected_samples)
y = affected_samples - x
binary_counts = [x, y]
return mock.MagicMock(
value=value, affected_samples=affected_samples, raw_values=raw_values, binary_counts=binary_counts
)

# Without p-value
sliced_dataset, slice_metric, pvalue = _calculate_slice_metrics(model, dataset, metric, lambda df: df["x"] > 1)
metric.greater_is_better = True
metric.has_binary_counts = True
sliced_dataset, slice_metric, pvalue = _calculate_slice_metrics(model, dataset, metric, lambda df: df["x"] <= 9)

assert sliced_dataset.df.y.tolist() == [5, 6]
assert slice_metric.value == 0.4
assert slice_metric.affected_samples == 11
assert sliced_dataset.df.y.count() == 10
assert slice_metric.value == pytest.approx(0.40, abs=0.02)
assert slice_metric.affected_samples == 10
assert pvalue is None

# With p-value
# With p-value - G-test
sliced_dataset, slice_metric, pvalue = _calculate_slice_metrics(
model, dataset, metric, lambda df: df["x"] > 1, with_pvalue=True
model,
dataset,
metric,
lambda df: df["x"] <= SLICE_SIZE,
with_pvalue=True,
max_size_fisher=1,
perm_test_resamples=1000,
)

assert sliced_dataset.df.y.tolist() == [5, 6]
assert slice_metric.value == 0.4
assert slice_metric.affected_samples == 11
assert pvalue == pytest.approx(0.80, abs=0.01)
assert sliced_dataset.df.y.count() == SLICE_SIZE + 1
assert slice_metric.value == pytest.approx(0.20, abs=0.02)
assert slice_metric.affected_samples == SLICE_SIZE + 1
assert pvalue == pytest.approx(0.28, abs=0.05)

def metric(model, dataset):
if len(dataset) == 2: # slice
return mock.MagicMock(value=0.4, affected_samples=0, raw_values=None)
# With p-value - Fisher's exact test
sliced_dataset, slice_metric, pvalue = _calculate_slice_metrics(
model,
dataset,
metric,
lambda df: df["x"] <= SLICE_SIZE,
with_pvalue=True,
max_size_fisher=1000,
perm_test_resamples=1000,
)

return mock.MagicMock(value=0.46, affected_samples=32, raw_values=None)
assert sliced_dataset.df.y.count() == SLICE_SIZE + 1
assert slice_metric.value == pytest.approx(0.20, abs=0.02)
assert slice_metric.affected_samples == SLICE_SIZE + 1
assert pvalue == pytest.approx(0.28, abs=0.05) # should be about the same as G-test

metric.greater_is_better = True
# With p-value - Permutation test
metric.has_binary_counts = False
sliced_dataset, slice_metric, pvalue = _calculate_slice_metrics(
model,
dataset,
metric,
lambda df: df["x"] <= SLICE_SIZE,
with_pvalue=True,
max_size_fisher=1,
perm_test_resamples=1000,
)

# If the contingency table contains zeros, it will give p-value = NaN
_, _, pvalue = _calculate_slice_metrics(model, dataset, metric, lambda df: df["x"] > 1, True)
assert np.isnan(pvalue)
assert slice_metric.value == pytest.approx(0.20, abs=0.02)
assert slice_metric.affected_samples == SLICE_SIZE + 1
assert pvalue == pytest.approx(0.28, abs=0.05) # should be about the same as G-test and Fisher test

# For regression
def metric(model, dataset):
if len(dataset) == 2: # slice
if len(dataset) == 10: # slice
return mock.MagicMock(value=0.4, affected_samples=5, raw_values=[1, 2, 3, 1, 2])

return mock.MagicMock(value=0.46, affected_samples=7, raw_values=[2, 2, 2, 1, 2, 2, 2])

metric.greater_is_better = True
sliced_dataset, slice_metric, pvalue = _calculate_slice_metrics(
model, dataset, metric, lambda df: df["x"] > 1, with_pvalue=True
model, dataset, metric, lambda df: df["x"] <= 9, with_pvalue=True
)

assert sliced_dataset.df.y.tolist() == [5, 6]
assert sliced_dataset.df.y.count() == 10
assert slice_metric.value == 0.4
assert pvalue == pytest.approx(0.44, abs=0.01)
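
Finally, a small synthetic sketch of the permutation-test fallback exercised above: when a metric provides neither raw values nor binary counts, the detector compares the slice against its complement by permuting sample assignments, as in _calculate_pvalue_from_permutation_test. The scores below are randomly generated for illustration only.

import numpy as np
import scipy.stats

rng = np.random.default_rng(0)
slice_scores = rng.normal(loc=0.70, scale=0.1, size=50)   # per-sample scores in the slice (synthetic)
comp_scores = rng.normal(loc=0.75, scale=0.1, size=500)   # per-sample scores in the complement (synthetic)

def statistic(a, b):
    # Difference of mean scores between the two groups.
    return a.mean() - b.mean()

res = scipy.stats.permutation_test(
    (slice_scores, comp_scores),
    statistic=statistic,
    permutation_type="independent",
    n_resamples=1000,
    alternative="two-sided",
)
print(res.pvalue)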