11from typing import Optional , Sequence
22
3- from abc import abstractmethod
3+ from abc import ABC , abstractmethod
44
55import numpy as np
66import pandas as pd
77
88from ...datasets .base import Dataset
99from ...llm import LLMImportError
1010from ...models .base import BaseModel
11+ from ...models .base .model_prediction import ModelPredictionResults
1112from ..issues import Issue , IssueLevel , Robustness
1213from ..logger import logger
1314from ..registry import Detector
15+ from .base_perturbation_function import PerturbationFunction
16+ from .numerical_transformations import NumericalTransformation
1417from .text_transformations import TextTransformation
1518
1619
17- class BaseTextPerturbationDetector (Detector ):
18- """Base class for metamorphic detectors based on text transformations."""
20+ def _relative_delta (actual : np .ndarray , reference : np .ndarray ) -> np .ndarray :
21+ """
22+ Computes elementwise relative delta. If reference[i] == 0, we replace it with epsilon
23+ to avoid division by zero.
24+ """
25+ epsilon = 1e-9
26+ safe_ref = np .where (reference == 0 , epsilon , reference )
27+ return (actual - reference ) / safe_ref
28+
29+
30+ def _get_default_num_samples (model ) -> int :
31+ if model .is_text_generation :
32+ return 10
33+ return 1_000
34+
35+
36+ def _get_default_output_sensitivity (model ) -> float :
37+ if model .is_text_generation :
38+ return 0.15
39+ return 0.05
40+
41+
42+ def _get_default_threshold (model ) -> float :
43+ if model .is_text_generation :
44+ return 0.10
45+ return 0.05
46+
47+
def _generate_robustness_tests(issue: Issue):
    """Build the metamorphic invariance test associated with a robustness issue.

    Parameters
    ----------
    issue : Issue
        Issue whose ``transformation_fn`` and ``meta`` entries configure the test.

    Returns
    -------
    dict
        Single-entry mapping from a human-readable test name to the object
        returned by ``test_metamorphic_invariance``.
    """
    # Imported inside the function rather than at module import time.
    from ...testing.tests.metamorphic import test_metamorphic_invariance

    perturbation = issue.transformation_fn
    test_name = f"Invariance to “{perturbation}”"

    # Only one metamorphic test is generated per issue.
    invariance_test = test_metamorphic_invariance(
        transformation_function=perturbation,
        slicing_function=None,
        # The test is given the complement of the threshold stored on the issue.
        threshold=1 - issue.meta["threshold"],
        # NOTE(review): the key is spelled "output_sentitivity"; presumably it
        # matches the key written when the issue is created - confirm before renaming.
        output_sensitivity=issue.meta.get("output_sentitivity", None),
    )
    return {test_name: invariance_test}
60+
61+
62+ class BasePerturbationDetector (Detector , ABC ):
63+ """
64+ Common parent class for metamorphic perturbation detectors (both text and numerical).
65+ """
1966
2067 _issue_group = Robustness
2168 _taxonomy = ["avid-effect:performance:P0201" ]
2269
2370 def __init__ (
2471 self ,
25- transformations : Optional [Sequence [TextTransformation ]] = None ,
72+ transformations : Optional [Sequence [PerturbationFunction ]] = None ,
2673 threshold : Optional [float ] = None ,
27- output_sensitivity = None ,
74+ output_sensitivity : Optional [ float ] = None ,
2875 num_samples : Optional [int ] = None ,
2976 ):
30- """Creates a new instance of the detector.
77+ """
78+ Creates a new instance of the detector.
3179
3280 Parameters
3381 ----------
34- transformations: Optional[Sequence[TextTransformation ]]
35- The text transformations used in the metamorphic testing. See :ref:`transformation_functions` for details
82+ transformations: Optional[Sequence[PerturbationFunction ]]
83+ The transformations used in the metamorphic testing. See :ref:`transformation_functions` for details
3684 about the available transformations. If not provided, a default set of transformations will be used.
3785 threshold: Optional[float]
3886 The threshold for the fail rate, which is defined as the proportion of samples for which the model
@@ -52,53 +100,103 @@ def __init__(
52100 self .num_samples = num_samples
53101 self .output_sensitivity = output_sensitivity
54102
55- def run (self , model : BaseModel , dataset : Dataset , features : Sequence [str ]) -> Sequence [Issue ]:
56- transformations = self .transformations or self ._get_default_transformations (model , dataset )
@abstractmethod
def _select_features(self, dataset: Dataset, features: Sequence[str]) -> Sequence[str]:
    """Return the subset of ``features`` that this detector should perturb.

    Subclasses must override this hook; the base implementation always raises
    ``NotImplementedError``.
    """
    raise NotImplementedError
57106
58- # Only analyze text features
59- text_features = [
60- f
61- for f in features
62- if dataset .column_types [f ] == "text" and pd .api .types .is_string_dtype (dataset .df [f ].dtype )
63- ]
@abstractmethod
def _get_default_transformations(self) -> Sequence[PerturbationFunction]:
    """Return the transformations to use when none were passed to the constructor.

    Subclasses must override this hook; the base implementation always raises
    ``NotImplementedError``.
    """
    raise NotImplementedError
64110
65- logger .info (
66- f"{ self .__class__ .__name__ } : Running with transformations={ [t .name for t in transformations ]} "
67- f"threshold={ self .threshold } output_sensitivity={ self .output_sensitivity } num_samples={ self .num_samples } "
68- )
@abstractmethod
def _supports_text_generation(self) -> bool:
    """Tell whether this detector can evaluate text generation models.

    Subclasses must override this hook; the base implementation always raises
    ``NotImplementedError``.
    """
    raise NotImplementedError
69114
70- issues = []
71- for transformation in transformations :
72- issues .extend (self ._detect_issues (model , dataset , transformation , text_features ))
def _compute_passed(
    self,
    model: BaseModel,
    original_pred: ModelPredictionResults,
    perturbed_pred: ModelPredictionResults,
    output_sensitivity: float,
) -> np.ndarray:
    """Decide, per sample, whether the prediction survived the perturbation.

    Parameters
    ----------
    model : BaseModel
        Model under test; its type flags select the comparison strategy.
    original_pred : ModelPredictionResults
        Predictions on the unperturbed samples.
    perturbed_pred : ModelPredictionResults
        Predictions on the perturbed samples.
    output_sensitivity : float
        Tolerance used for regression and text generation models.

    Returns
    -------
    np.ndarray
        Boolean mask, true where the sample is considered invariant.

    Raises
    ------
    NotImplementedError
        If the model type is unsupported, or if the model is a text generation
        model and this detector does not support text generation.
    LLMImportError
        If the ``evaluate`` package cannot be imported.
    """
    # Classification: the raw predictions must be identical.
    if model.is_classification:
        return original_pred.raw_prediction == perturbed_pred.raw_prediction

    # Regression: the relative change must stay below the tolerance.
    if model.is_regression:
        relative_change = _relative_delta(perturbed_pred.raw_prediction, original_pred.raw_prediction)
        return np.abs(relative_change) < output_sensitivity

    if not model.is_text_generation:
        raise NotImplementedError("Only classification, regression, or text generation models are supported.")

    if not self._supports_text_generation():
        raise NotImplementedError("Text generation is not supported by this detector.")

    # Optional dependency: only needed on the text generation path.
    try:
        import evaluate
    except ImportError as err:
        raise LLMImportError() from err

    # Text generation: compare outputs with BERTScore and threshold the F1 values.
    bertscore = evaluate.load("bertscore").compute(
        predictions=perturbed_pred.prediction,
        references=original_pred.prediction,
        model_type="distilbert-base-multilingual-cased",
        idf=True,
    )
    return np.array(bertscore["f1"]) > 1 - output_sensitivity
75148
76- @abstractmethod
77- def _get_default_transformations (self , model : BaseModel , dataset : Dataset ) -> Sequence [TextTransformation ]:
78- ...
def _create_examples(
    self,
    original_data: Dataset,
    original_pred: ModelPredictionResults,
    perturbed_data: Dataset,
    perturbed_pred: ModelPredictionResults,
    feature: str,
    passed: np.ndarray,
    model: BaseModel,
    transformation_fn,
) -> pd.DataFrame:
    """Assemble a table of the samples whose prediction changed.

    Parameters
    ----------
    original_data : Dataset
        Unperturbed samples.
    original_pred : ModelPredictionResults
        Predictions on ``original_data``.
    perturbed_data : Dataset
        Samples after the transformation was applied.
    perturbed_pred : ModelPredictionResults
        Predictions on ``perturbed_data``.
    feature : str
        Name of the perturbed column.
    passed : np.ndarray
        Boolean mask of samples that passed; its negation selects the examples.
    model : BaseModel
        Model under test; classification models get probabilities appended.
    transformation_fn
        Applied transformation; its ``name`` labels the perturbed column.

    Returns
    -------
    pd.DataFrame
        One row per failing sample with the original value, the perturbed
        value and both predictions.
    """
    failed = ~passed
    before_col = "Original prediction"
    after_col = "Prediction after perturbation"

    examples = original_data.df.loc[failed, [feature]].copy()
    examples[f"{transformation_fn.name}({feature})"] = perturbed_data.df.loc[failed, feature]
    examples[before_col] = original_pred.prediction[failed]
    examples[after_col] = perturbed_pred.prediction[failed]

    if not model.is_classification:
        return examples

    # For classifiers, render labels as text first...
    for column in (before_col, after_col):
        examples[column] = examples[column].astype(str)

    # ...then append the probability reported for each failing sample.
    for column, prediction in ((before_col, original_pred), (after_col, perturbed_pred)):
        probabilities = pd.Series(prediction.probabilities[failed], index=examples.index)
        examples[column] += probabilities.apply(lambda p: f" (p={p:.2f})")

    return examples
79177
80178 def _detect_issues (
81179 self ,
82180 model : BaseModel ,
83181 dataset : Dataset ,
84- transformation : TextTransformation ,
182+ transformation ,
85183 features : Sequence [str ],
86184 ) -> Sequence [Issue ]:
185+ # Fall back to defaults if not explicitly set
87186 num_samples = self .num_samples if self .num_samples is not None else _get_default_num_samples (model )
187+ threshold = self .threshold if self .threshold is not None else _get_default_threshold (model )
88188 output_sensitivity = (
89189 self .output_sensitivity if self .output_sensitivity is not None else _get_default_output_sensitivity (model )
90190 )
91- threshold = self .threshold if self .threshold is not None else _get_default_threshold (model )
92191
93192 issues = []
94- # @TODO: integrate this with Giskard metamorphic tests already present
95193 for feature in features :
194+ # Build transformation function for this feature
96195 transformation_fn = transformation (column = feature )
97196 transformed = dataset .transform (transformation_fn )
98197
99198 # Select only the records which were changed
100199 changed_idx = dataset .df .index [transformed .df [feature ] != dataset .df [feature ]]
101-
102200 if changed_idx .empty :
103201 continue
104202
@@ -107,6 +205,7 @@ def _detect_issues(
107205 rng = np .random .default_rng (747 )
108206 changed_idx = changed_idx [rng .choice (len (changed_idx ), num_samples , replace = False )]
109207
208+ # Build original vs. perturbed datasets
110209 original_data = Dataset (
111210 dataset .df .loc [changed_idx ],
112211 target = dataset .target ,
@@ -124,27 +223,12 @@ def _detect_issues(
124223 original_pred = model .predict (original_data )
125224 perturbed_pred = model .predict (perturbed_data )
126225
127- if model .is_classification :
128- passed = original_pred .raw_prediction == perturbed_pred .raw_prediction
129- elif model .is_regression :
130- rel_delta = _relative_delta (perturbed_pred .raw_prediction , original_pred .raw_prediction )
131- passed = np .abs (rel_delta ) < output_sensitivity
132- elif model .is_text_generation :
133- try :
134- import evaluate
135- except ImportError as err :
136- raise LLMImportError () from err
137-
138- scorer = evaluate .load ("bertscore" )
139- score = scorer .compute (
140- predictions = perturbed_pred .prediction ,
141- references = original_pred .prediction ,
142- model_type = "distilbert-base-multilingual-cased" ,
143- idf = True ,
144- )
145- passed = np .array (score ["f1" ]) > 1 - output_sensitivity
146- else :
147- raise NotImplementedError ("Only classification, regression, or text generation models are supported." )
226+ passed = self ._compute_passed (
227+ model = model ,
228+ original_pred = original_pred ,
229+ perturbed_pred = perturbed_pred ,
230+ output_sensitivity = output_sensitivity ,
231+ )
148232
149233 pass_rate = passed .mean ()
150234 fail_rate = 1 - pass_rate
@@ -196,61 +280,88 @@ def _detect_issues(
196280 )
197281
198282 # Add examples
199- examples = original_data .df .loc [~ passed , (feature ,)].copy ()
200- examples [f"{ transformation_fn .name } ({ feature } )" ] = perturbed_data .df .loc [~ passed , feature ]
201-
202- examples ["Original prediction" ] = original_pred .prediction [~ passed ]
203- examples ["Prediction after perturbation" ] = perturbed_pred .prediction [~ passed ]
204-
205- if model .is_classification :
206- examples ["Original prediction" ] = examples ["Original prediction" ].astype (str )
207- examples ["Prediction after perturbation" ] = examples ["Prediction after perturbation" ].astype (str )
208- ps_before = pd .Series (original_pred .probabilities [~ passed ], index = examples .index )
209- ps_after = pd .Series (perturbed_pred .probabilities [~ passed ], index = examples .index )
210- examples ["Original prediction" ] += ps_before .apply (lambda p : f" (p = { p :.2f} )" )
211- examples ["Prediction after perturbation" ] += ps_after .apply (lambda p : f" (p = { p :.2f} )" )
212-
283+ examples = self ._create_examples (
284+ original_data ,
285+ original_pred ,
286+ perturbed_data ,
287+ perturbed_pred ,
288+ feature ,
289+ passed ,
290+ model ,
291+ transformation_fn ,
292+ )
213293 issue .add_examples (examples )
214294
215295 issues .append (issue )
216296
217297 return issues
218298
def run(self, model: BaseModel, dataset: Dataset, features: Sequence[str]) -> Sequence[Issue]:
    """
    Runs the perturbation detector on the given model and dataset.

    Parameters
    ----------
    model: BaseModel
        The model to test.
    dataset: Dataset
        The dataset to use for testing.
    features: Sequence[str]
        The candidate features (columns); the detector narrows them down
        through ``_select_features`` before testing.

    Returns
    -------
    Sequence[Issue]
        The issues found, with ``None`` entries dropped.
    """
    # Fall back to the detector's defaults when no transformations were configured.
    transformations = self.transformations or self._get_default_transformations()
    selected_features = self._select_features(dataset, features)

    transformation_names = [t.name for t in transformations]
    logger.info(
        f"{self.__class__.__name__}: Running with transformations={transformation_names} "
        f"threshold={self.threshold} output_sensitivity={self.output_sensitivity} num_samples={self.num_samples}"
    )

    # One detection pass per transformation, flattened into a single list.
    return [
        issue
        for transformation in transformations
        for issue in self._detect_issues(model, dataset, transformation, selected_features)
        if issue is not None
    ]
236330
237331
238- def _get_default_num_samples (model ) -> int :
239- if model .is_text_generation :
240- return 10
class BaseTextPerturbationDetector(BasePerturbationDetector):
    """Base class for metamorphic detectors based on text transformations."""

    def _select_features(self, dataset: Dataset, features: Sequence[str]) -> Sequence[str]:
        """Keep only the features declared as text and stored with a string dtype."""

        def is_text_column(name: str) -> bool:
            # The declared column type is checked first; the dtype is only
            # inspected for columns declared as "text".
            if dataset.column_types[name] != "text":
                return False
            return pd.api.types.is_string_dtype(dataset.df[name].dtype)

        return [name for name in features if is_text_column(name)]

    @abstractmethod
    def _get_default_transformations(self) -> Sequence[TextTransformation]:
        """Return the text transformations used when none were configured."""
        raise NotImplementedError

    def _supports_text_generation(self) -> bool:
        """Text perturbation detectors can evaluate text generation models."""
        return True
248351
249- return 0.05
250352
class BaseNumericalPerturbationDetector(BasePerturbationDetector):
    """Base class for metamorphic detectors based on numerical feature perturbations."""

    def _select_features(self, dataset: Dataset, features: Sequence[str]) -> Sequence[str]:
        """Keep only the features whose declared column type is "numeric"."""
        numeric_features = []
        for name in features:
            if dataset.column_types[name] == "numeric":
                numeric_features.append(name)
        return numeric_features

    @abstractmethod
    def _get_default_transformations(self) -> Sequence[NumericalTransformation]:
        """Return the numerical transformations used when none were configured."""
        raise NotImplementedError

    def _supports_text_generation(self) -> bool:
        """Numerical perturbation detectors do not evaluate text generation models."""
        return False
0 commit comments