diff --git a/tests/attacks/sklearn_dataset.py b/tests/attacks/sklearn_dataset.py
index cab9ede9..27c24391 100644
--- a/tests/attacks/sklearn_dataset.py
+++ b/tests/attacks/sklearn_dataset.py
@@ -2,10 +2,11 @@
 
 from __future__ import annotations
 
+import types
 from collections.abc import Sequence
 
 import numpy as np
-from sklearn.datasets import fetch_openml
+from sklearn.datasets import make_classification
 from sklearn.model_selection import train_test_split
 from sklearn.preprocessing import LabelEncoder, OneHotEncoder
 
@@ -14,13 +15,67 @@
 random_state = 1
 
 
+def _generate_nursery_data(n_samples=2000, random_state=1):
+    """Generate synthetic categorical data mimicking the nursery dataset.
+
+    Uses make_classification to create a learnable classification problem,
+    then discretises continuous features into categories matching the
+    OpenML nursery dataset (data_id=26) structure so that one-hot encoding
+    yields the same column layout.
+    """
+    feature_specs = [
+        ("parents", ["great_pret", "pretentious", "usual"]),
+        (
+            "has_nurs",
+            ["critical", "less_proper", "proper", "slightly_prob", "very_crit"],
+        ),
+        ("form", ["complete", "foster", "other", "others"]),
+        ("children", ["1", "2", "3", "more"]),
+        ("housing", ["convenient", "less_proper", "slightly_prob"]),
+        ("finance", ["convenient", "inconv"]),
+        ("social", ["non_prob", "slightly_prob", "very_recom"]),
+        ("health", ["not_recom", "priority", "recommended"]),
+    ]
+    target_classes = ["not_recom", "priority", "spec_prior", "very_recom"]
+
+    n_features = len(feature_specs)
+    n_classes = len(target_classes)
+
+    x_cont, y_int = make_classification(
+        n_samples=n_samples,
+        n_features=n_features,
+        n_informative=n_features,
+        n_redundant=0,
+        n_classes=n_classes,
+        n_clusters_per_class=1,
+        class_sep=2.0,
+        random_state=random_state,
+    )
+
+    # Discretise each continuous feature into categories via percentile binning
+    feature_names = []
+    columns = []
+    for i, (name, categories) in enumerate(feature_specs):
+        feature_names.append(name)
+        n_cats = len(categories)
+        percentiles = np.linspace(0, 100, n_cats + 1)[1:-1]
+        bins = np.percentile(x_cont[:, i], percentiles)
+        bin_indices = np.digitize(x_cont[:, i], bins)
+        columns.append(np.array([categories[idx] for idx in bin_indices]))
+
+    data = np.column_stack(columns)
+    target = np.array([target_classes[idx] for idx in y_int])
+
+    return types.SimpleNamespace(data=data, target=target, feature_names=feature_names)
+
+
 class Nursery(SklearnDataHandler):
     """Nursery dataset handler."""
 
     def __init__(self) -> None:
         """Fetch and process the nursery dataset."""
         # Get original dataset
-        nursery_data = fetch_openml(data_id=26, as_frame=True)
+        nursery_data = _generate_nursery_data()
         self.X_orig = np.asarray(nursery_data.data, dtype=str)
         self.y_orig = np.asarray(nursery_data.target, dtype=str)
 
diff --git a/tests/attacks/test_structural_attack.py b/tests/attacks/test_structural_attack.py
index b52aa01b..e61b1e9f 100644
--- a/tests/attacks/test_structural_attack.py
+++ b/tests/attacks/test_structural_attack.py
@@ -241,7 +241,7 @@ def test_dt_disclosive():
 def test_adaboost_nondisclosive():
     """Test for nondisclosive adaboost classifier."""
     param_dict_adasafe = {
-        "n_estimators": 2,
+        "n_estimators": 5,
         "estimator": DecisionTreeClassifier(**kwargs_dtsafe),
     }
     target = get_target("adaboost", **param_dict_adasafe)
diff --git a/tests/conftest.py b/tests/conftest.py
index 120034e6..2607e9aa 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -7,12 +7,13 @@
 import contextlib
 import os
 import shutil
+import types
 from datetime import date
 
 import numpy as np
 import pytest
 import sklearn
-from sklearn.datasets import fetch_openml, make_classification
+from sklearn.datasets import make_classification
 from sklearn.ensemble import RandomForestClassifier
 from sklearn.model_selection import train_test_split
 from sklearn.preprocessing import LabelEncoder, OneHotEncoder
@@ -107,15 +108,70 @@ def _cleanup():
             os.remove(file)
 
 
+def _generate_nursery_data(n_samples=2000, random_state=1):
+    """Generate synthetic categorical data mimicking the nursery dataset.
+
+    Uses make_classification to create a learnable classification problem,
+    then discretises continuous features into categories matching the
+    OpenML nursery dataset (data_id=26) structure so that one-hot encoding
+    yields the same column layout.
+    """
+    feature_specs = [
+        ("parents", ["great_pret", "pretentious", "usual"]),
+        (
+            "has_nurs",
+            ["critical", "less_proper", "proper", "slightly_prob", "very_crit"],
+        ),
+        ("form", ["complete", "foster", "other", "others"]),
+        ("children", ["1", "2", "3", "more"]),
+        ("housing", ["convenient", "less_proper", "slightly_prob"]),
+        ("finance", ["convenient", "inconv"]),
+        ("social", ["non_prob", "slightly_prob", "very_recom"]),
+        ("health", ["not_recom", "priority", "recommended"]),
+    ]
+    target_classes = ["not_recom", "priority", "spec_prior", "very_recom"]
+
+    n_features = len(feature_specs)
+    n_classes = len(target_classes)
+
+    x_cont, y_int = make_classification(
+        n_samples=n_samples,
+        n_features=n_features,
+        n_informative=n_features,
+        n_redundant=0,
+        n_classes=n_classes,
+        n_clusters_per_class=1,
+        class_sep=2.0,
+        random_state=random_state,
+    )
+
+    # Discretise each continuous feature into categories via percentile binning
+    feature_names = []
+    columns = []
+    for i, (name, categories) in enumerate(feature_specs):
+        feature_names.append(name)
+        n_cats = len(categories)
+        percentiles = np.linspace(0, 100, n_cats + 1)[1:-1]
+        bins = np.percentile(x_cont[:, i], percentiles)
+        bin_indices = np.digitize(x_cont[:, i], bins)
+        columns.append(np.array([categories[idx] for idx in bin_indices]))
+
+    data = np.column_stack(columns)
+    target = np.array([target_classes[idx] for idx in y_int])
+
+    return types.SimpleNamespace(data=data, target=target, feature_names=feature_names)
+
+
 @pytest.fixture
 def get_target(request) -> Target:
     """Return a target object with test data and fitted model.
 
-    Uses a randomly sampled 10+10% of the nursery data set.
+    Uses synthetic categorical data generated with make_classification,
+    discretised to match the nursery dataset structure.
     """
 
     model: sklearn.BaseEstimator = request.param
-    nursery_data = fetch_openml(data_id=26, as_frame=True)
+    nursery_data = _generate_nursery_data()
     x = np.asarray(nursery_data.data, dtype=str)
     y = np.asarray(nursery_data.target, dtype=str)
     # change labels from recommend to priority for the two odd cases