2 changes: 2 additions & 0 deletions python-client/giskard/scanner/performance/issues.py
@@ -1,5 +1,6 @@
from dataclasses import dataclass
from functools import lru_cache
from typing import Optional

from .metrics import PerformanceMetric
from ..common.examples import ExampleExtractor
@@ -19,6 +20,7 @@ class PerformanceIssueInfo:
slice_fn: SlicingFunction
slice_size: int
threshold: float
p_value: Optional[float] = None

@property
def metric_rel_delta(self):
75 changes: 56 additions & 19 deletions python-client/giskard/scanner/performance/metrics.py
@@ -1,54 +1,70 @@
from dataclasses import dataclass
import numpy as np
import sklearn.metrics
from typing import Optional
from abc import ABC, ABCMeta, abstractmethod

from ...models.base import BaseModel
from ...datasets.base import Dataset


@dataclass
class MetricResult:
metric: "PerformanceMetric"
value: float
affected_samples: int
raw_values: Optional[np.ndarray] = None


class PerformanceMetric(ABC):
name: str
greater_is_better = True

@abstractmethod
def __call__(self, model: BaseModel, dataset: Dataset) -> float:
def __call__(self, model: BaseModel, dataset: Dataset) -> MetricResult:
...


class ClassificationPerformanceMetric(PerformanceMetric, metaclass=ABCMeta):
def __call__(self, model: BaseModel, dataset: Dataset) -> float:
def __call__(self, model: BaseModel, dataset: Dataset) -> MetricResult:
if not model.is_classification:
raise ValueError(f"Metric '{self.name}' is only defined for classification models.")

y_true = dataset.df[dataset.target]
y_pred = model.predict(dataset).prediction
y_true = np.asarray(dataset.df[dataset.target])
y_pred = np.asarray(model.predict(dataset).prediction)

return self._calculate_metric(y_true, y_pred, model)
value = self._calculate_metric(y_true, y_pred, model)
num_affected = self._calculate_affected_samples(y_true, y_pred, model)
return MetricResult(self, value, num_affected)

@abstractmethod
def _calculate_metric(self, y_true, y_pred, model: BaseModel) -> float:
def _calculate_metric(self, y_true: np.ndarray, y_pred: np.ndarray, model: BaseModel) -> float:
...

def _calculate_affected_samples(self, y_true: np.ndarray, y_pred: np.ndarray, model: BaseModel) -> int:
return len(y_true)


class Accuracy(ClassificationPerformanceMetric):
name = "Accuracy"
greater_is_better = True

def _calculate_metric(self, y_true, y_pred, model: BaseModel):
def _calculate_metric(self, y_true: np.ndarray, y_pred: np.ndarray, model: BaseModel):
return sklearn.metrics.accuracy_score(y_true, y_pred)


class BalancedAccuracy(ClassificationPerformanceMetric):
name = "Balanced Accuracy"
greater_is_better = True

def _calculate_metric(self, y_true, y_pred, model: BaseModel):
def _calculate_metric(self, y_true: np.ndarray, y_pred: np.ndarray, model: BaseModel):
return sklearn.metrics.balanced_accuracy_score(y_true, y_pred)


class SklearnClassificationScoreMixin:
_sklearn_metric: str

def _calculate_metric(self, y_true, y_pred, model: BaseModel):
def _calculate_metric(self, y_true: np.ndarray, y_pred: np.ndarray, model: BaseModel):
metric_fn = getattr(sklearn.metrics, self._sklearn_metric)
if model.is_binary_classification:
return metric_fn(
@@ -67,57 +83,78 @@ class F1Score(SklearnClassificationScoreMixin, ClassificationPerformanceMetric):
greater_is_better = True
_sklearn_metric = "f1_score"

def _calculate_affected_samples(self, y_true: np.ndarray, y_pred: np.ndarray, model: BaseModel) -> int:
if model.is_binary_classification:
# F1 score will not be affected by true negatives
neg = model.meta.classification_labels[0]
Contributor:
Why do we only do it for binary classification and not for all cases?

Member (Author):
Because F1 is computed differently for multiclass: in that case it uses the total counts of true positives, false positives, and false negatives in a one-vs-rest fashion for each class, so in the end every sample contributes to the score.
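
As a quick standalone illustration of that point (plain numpy and scikit-learn, independent of the scanner code): adding true negatives leaves the binary F1 score unchanged, which is why they are excluded from the affected-sample count computed below.

import numpy as np
from sklearn.metrics import f1_score

# Binary case, with 1 as the positive label (the second classification label).
y_true = np.array([1, 1, 0, 1])
y_pred = np.array([1, 0, 0, 1])
print(f1_score(y_true, y_pred))  # 0.8

# Appending true negatives does not change the score.
y_true_tn = np.concatenate([y_true, [0, 0, 0]])
y_pred_tn = np.concatenate([y_pred, [0, 0, 0]])
print(f1_score(y_true_tn, y_pred_tn))  # still 0.8

# Affected samples as counted in the diff: everything except true negatives.
tn = ((y_true_tn == 0) & (y_pred_tn == 0)).sum()
print(len(y_true_tn) - tn)  # 3 (the TP, FP, and FN samples)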

tn = ((y_true == neg) & (y_pred == neg)).sum()
return len(y_true) - tn

return len(y_true)


class Precision(SklearnClassificationScoreMixin, ClassificationPerformanceMetric):
name = "Precision"
greater_is_better = True
_sklearn_metric = "precision_score"

def _calculate_affected_samples(self, y_true: np.ndarray, y_pred: np.ndarray, model: BaseModel) -> int:
if model.is_binary_classification:
return (y_pred == model.meta.classification_labels[1]).sum()

return len(y_true)


class Recall(SklearnClassificationScoreMixin, ClassificationPerformanceMetric):
name = "Recall"
greater_is_better = True
_sklearn_metric = "recall_score"

def _calculate_affected_samples(self, y_true: np.ndarray, y_pred: np.ndarray, model: BaseModel) -> int:
if model.is_binary_classification:
return (y_true == model.meta.classification_labels[1]).sum()

return len(y_true)


class AUC(PerformanceMetric):
name = "ROC AUC"
greater_is_better = True

def __call__(self, model: BaseModel, dataset: Dataset) -> float:
def __call__(self, model: BaseModel, dataset: Dataset) -> MetricResult:
y_true = dataset.df[dataset.target]
if model.is_binary_classification:
y_score = model.predict(dataset).raw[:, 1]
else:
y_score = model.predict(dataset).all_predictions

return sklearn.metrics.roc_auc_score(
y_true,
y_score,
multi_class="ovo",
labels=model.meta.classification_labels,
value = sklearn.metrics.roc_auc_score(
y_true, y_score, multi_class="ovo", labels=model.meta.classification_labels
)

return MetricResult(self, value, len(y_true))


class RegressionPerformanceMetric(PerformanceMetric):
def __call__(self, model: BaseModel, dataset: Dataset) -> float:
def __call__(self, model: BaseModel, dataset: Dataset) -> MetricResult:
if not model.is_regression:
raise ValueError(f"Metric '{self.name}' is only defined for regression models.")

y_true = dataset.df[dataset.target]
y_pred = model.predict(dataset).prediction

return self._calculate_metric(y_true, y_pred, model)
value = self._calculate_metric(y_true, y_pred, model)
return MetricResult(self, value, len(y_true))

@abstractmethod
def _calculate_metric(self, y_true, y_pred, model: BaseModel) -> float:
def _calculate_metric(self, y_true: np.ndarray, y_pred: np.ndarray, model: BaseModel) -> float:
...


class SklearnRegressionScoreMixin:
_sklearn_metric: str

def _calculate_metric(self, y_true, y_pred, model: BaseModel):
def _calculate_metric(self, y_true: np.ndarray, y_pred: np.ndarray, model: BaseModel):
metric_fn = getattr(sklearn.metrics, self._sklearn_metric)
return metric_fn(y_true, y_pred)
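
For reference, a minimal standalone sketch (plain numpy, assuming the positive class is the second entry of classification_labels, as in the Precision and Recall changes above) of why their affected-sample counts reduce to counting predicted positives and actual positives:

import numpy as np

y_true = np.array([1, 0, 1, 1, 0])
y_pred = np.array([1, 1, 0, 1, 0])

# Precision is computed only over samples predicted positive (TP + FP).
precision_affected = int((y_pred == 1).sum())  # 3

# Recall is computed only over samples that are actually positive (TP + FN).
recall_affected = int((y_true == 1).sum())  # 3

print(precision_affected, recall_affected)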
