|
| 1 | +""" |
| 2 | +============================================ |
| 3 | +Tutorial: Conditional CP for classification |
| 4 | +============================================ |
| 5 | +
|
| 6 | +We will use a synthetic toy dataset for the tutorial of the CCP method, and |
| 7 | +its comparison with the other methods available in MAPIE. The CCP method |
| 8 | +implements the method described in the Gibbs et al. (2023) paper [1]. |
| 9 | +
|
| 10 | +In this tutorial, the classifier will be |
| 11 | +:class:`~sklearn.linear_model.LogisticRegression`. |
| 12 | +
|
| 13 | +We will compare the CCP method (using |
| 14 | +:class:`~mapie.futur.split.SplitCPRegressor`, |
| 15 | +:class:`~mapie.calibrators.ccp.CustomCCP` and |
| 16 | +:class:`~mapie.calibrators.ccp.GaussianCCP`), with the |
| 17 | +standard method, using for both, the LAC conformity score |
| 18 | +(:class:`~mapie.conformity_scores.LACConformityScore`). |
| 19 | +
|
| 20 | +Recall that the ``LAC`` method consists on applying a threshold on the |
| 21 | +predicted softmax, to keep all the classes above the threshold |
| 22 | +(``alpha`` is ``1 - target coverage``). |
| 23 | +
|
| 24 | +[1] Isaac Gibbs, John J. Cherian, and Emmanuel J. Candès, |
| 25 | +"Conformal Prediction With Conditional Guarantees", |
| 26 | +`arXiv <https://arxiv.org/abs/2305.12616>`_, 2023. |
| 27 | +""" |
| 28 | + |
| 29 | +import warnings |
| 30 | + |
| 31 | +import matplotlib.pyplot as plt |
| 32 | +import numpy as np |
| 33 | +from matplotlib.patches import Patch |
| 34 | +from sklearn.model_selection import ShuffleSplit |
| 35 | +from sklearn.linear_model import LogisticRegression |
| 36 | + |
| 37 | +from mapie.calibrators import CustomCCP, GaussianCCP |
| 38 | +from mapie.classification import MapieClassifier |
| 39 | +from mapie.conformity_scores import LACConformityScore |
| 40 | +from mapie.futur.split.classification import SplitCPClassifier |
| 41 | + |
| 42 | +warnings.filterwarnings("ignore") |
| 43 | + |
| 44 | +random_state = 1 |
| 45 | +np.random.seed(random_state) |
| 46 | + |
| 47 | +ALPHA = 0.2 |
| 48 | +N_CLASSES = 5 |
| 49 | + |
| 50 | +############################################################################## |
| 51 | +# 1. Data generation |
| 52 | +# -------------------------------------------------------------------------- |
| 53 | +# Let's start by creating some synthetic data with 5 gaussian distributions |
| 54 | +# |
| 55 | +# We are going to use 5000 samples for training, 3000 for calibration and |
| 56 | +# 10000 for testing (to have a good conditional coverage evaluation). |
| 57 | + |
| 58 | + |
| 59 | +def create_toy_dataset(n_samples=1000): |
| 60 | + centers = [(0, 3.5), (-3, 0), (0, -2), (4, -1), (3, 1)] |
| 61 | + covs = [ |
| 62 | + np.diag([1, 1]), np.diag([2, 2]), np.diag([3, 2]), |
| 63 | + np.diag([3, 3]), np.diag([2, 2]), |
| 64 | + ] |
| 65 | + n_per_class = ( |
| 66 | + np.linspace(0, n_samples, N_CLASSES + 1)[1:] |
| 67 | + - np.linspace(0, n_samples, N_CLASSES + 1)[: -1].astype(int) |
| 68 | + ).astype(int) |
| 69 | + X = np.vstack([ |
| 70 | + np.random.multivariate_normal(center, cov, n) |
| 71 | + for center, cov, n in zip(centers, covs, n_per_class) |
| 72 | + |
| 73 | + ]) |
| 74 | + y = np.hstack([np.full(n_per_class[i], i) for i in range(N_CLASSES)]) |
| 75 | + |
| 76 | + return X, y |
| 77 | + |
| 78 | + |
| 79 | +def generate_data(seed=1, n_train=2000, n_calib=2000, n_test=2000, ): |
| 80 | + np.random.seed(seed) |
| 81 | + x_train, y_train = create_toy_dataset(n_train) |
| 82 | + x_calib, y_calib = create_toy_dataset(n_calib) |
| 83 | + x_test, y_test = create_toy_dataset(n_test) |
| 84 | + |
| 85 | + return x_train, y_train, x_calib, y_calib, x_test, y_test |
| 86 | + |
| 87 | +############################################################################## |
| 88 | +# Let's visualize the data and its distribution |
| 89 | + |
| 90 | + |
| 91 | +x_train, y_train, *_ = generate_data(seed=None, n_train=2000) |
| 92 | + |
| 93 | +for c in range(N_CLASSES): |
| 94 | + plt.scatter(x_train[y_train == c, 0], x_train[y_train == c, 1], |
| 95 | + c=f"C{c}", s=1.5, label=f'Class {c}') |
| 96 | +plt.legend() |
| 97 | +plt.show() |
| 98 | + |
| 99 | + |
| 100 | +############################################################################## |
| 101 | +# 2. Plotting and adaptativity comparison functions |
| 102 | +# -------------------------------------------------------------------------- |
| 103 | + |
| 104 | + |
| 105 | +def run_exp( |
| 106 | + mapies, names, alpha, n_train=2000, n_calib=2000, |
| 107 | + n_test=2000, grid_step=100, plot=True, seed=1, max_display=2000 |
| 108 | +): |
| 109 | + ( |
| 110 | + x_train, y_train, x_calib, y_calib, x_test, y_test |
| 111 | + ) = generate_data( |
| 112 | + seed=seed, n_train=n_train, n_calib=n_calib, n_test=n_test |
| 113 | + ) |
| 114 | + |
| 115 | + if max_display: |
| 116 | + display_ind = np.random.choice(np.arange(0, len(x_test)), max_display) |
| 117 | + else: |
| 118 | + display_ind = np.arange(0, len(x_test)) |
| 119 | + |
| 120 | + color_map = plt.cm.get_cmap("Purples", N_CLASSES + 1) |
| 121 | + |
| 122 | + if plot: |
| 123 | + fig = plt.figure() |
| 124 | + fig.set_size_inches(6 * (len(mapies) + 1), 7) |
| 125 | + grid = plt.GridSpec(1, len(mapies) + 1) |
| 126 | + |
| 127 | + x_min = np.min(x_train) |
| 128 | + x_max = np.max(x_train) |
| 129 | + step = (x_max - x_min) / grid_step |
| 130 | + |
| 131 | + xx, yy = np.meshgrid( |
| 132 | + np.arange(x_min, x_max, step), np.arange(x_min, x_max, step) |
| 133 | + ) |
| 134 | + X_test_mesh = np.stack([xx.ravel(), yy.ravel()], axis=1) |
| 135 | + |
| 136 | + scores = np.zeros((len(mapies), N_CLASSES+1)) |
| 137 | + for i, (mapie, name) in enumerate(zip(mapies, names)): |
| 138 | + if isinstance(mapie, MapieClassifier): |
| 139 | + mapie.fit( |
| 140 | + np.vstack([x_train, x_calib]), np.hstack([y_train, y_calib]) |
| 141 | + ) |
| 142 | + _, y_ps_test = mapie.predict(x_test, alpha=alpha) |
| 143 | + if plot: |
| 144 | + y_pred_mesh, y_ps_mesh = mapie.predict( |
| 145 | + X_test_mesh, alpha=alpha |
| 146 | + ) |
| 147 | + elif isinstance(mapie, SplitCPClassifier): |
| 148 | + mapie.fit( |
| 149 | + np.vstack([x_train, x_calib]), np.hstack([y_train, y_calib]) |
| 150 | + ) |
| 151 | + _, y_ps_test = mapie.predict(x_test) |
| 152 | + if plot: |
| 153 | + y_pred_mesh, y_ps_mesh = mapie.predict(X_test_mesh) |
| 154 | + else: |
| 155 | + raise |
| 156 | + |
| 157 | + if plot: |
| 158 | + if i == 0: |
| 159 | + ax1 = fig.add_subplot(grid[0, 0]) |
| 160 | + |
| 161 | + ax1.scatter( |
| 162 | + X_test_mesh[:, 0], X_test_mesh[:, 1], |
| 163 | + c=[f"C{x}" for x in y_pred_mesh], alpha=1, marker="s", |
| 164 | + edgecolor="none", s=220 * step |
| 165 | + ) |
| 166 | + ax1.fill_between( |
| 167 | + x=[min(X_test_mesh[:, 0]) - step] + list(X_test_mesh[:, 0]) |
| 168 | + + [max(X_test_mesh[:, 0]) + step], |
| 169 | + y1=min(X_test_mesh[:, 1]) - step, |
| 170 | + y2=max(X_test_mesh[:, 1]) + step, |
| 171 | + color="white", alpha=0.6 |
| 172 | + ) |
| 173 | + ax1.scatter( |
| 174 | + x_test[display_ind, 0], x_test[display_ind, 1], |
| 175 | + c=[f"C{x}" for x in y_test[display_ind]], |
| 176 | + alpha=1, marker=".", edgecolor="black", s=80 |
| 177 | + ) |
| 178 | + |
| 179 | + ax1.set_title("Predictions", fontsize=22, pad=12) |
| 180 | + ax1.set_xlim([-6, 8]) |
| 181 | + ax1.set_ylim([-6, 8]) |
| 182 | + legend_labels = [f"Class {i}" for i in range(N_CLASSES)] |
| 183 | + handles = [ |
| 184 | + plt.Line2D([0], [0], marker='.', color='w', |
| 185 | + markerfacecolor=f"C{i}", markersize=10) |
| 186 | + for i in range(N_CLASSES) |
| 187 | + ] |
| 188 | + ax1.legend(handles, legend_labels, title="Classes", |
| 189 | + fontsize=18, title_fontsize=20) |
| 190 | + |
| 191 | + y_ps_sums = y_ps_mesh[:, :, 0].sum(axis=1) |
| 192 | + |
| 193 | + ax = fig.add_subplot(grid[0, i + 1]) |
| 194 | + |
| 195 | + scatter = ax.scatter( |
| 196 | + X_test_mesh[:, 0], |
| 197 | + X_test_mesh[:, 1], |
| 198 | + c=y_ps_sums, |
| 199 | + marker='s', |
| 200 | + edgecolor="none", |
| 201 | + s=220 * step, |
| 202 | + alpha=1, |
| 203 | + cmap=color_map, |
| 204 | + vmin=0, |
| 205 | + vmax=N_CLASSES, |
| 206 | + ) |
| 207 | + ax.scatter(x_test[display_ind, 0], x_test[display_ind, 1], |
| 208 | + c=[f"C{x}" for x in y_test[display_ind]], |
| 209 | + alpha=0.6, marker=".", edgecolor="gray", s=50) |
| 210 | + |
| 211 | + colorbar = plt.colorbar(scatter, ax=ax) |
| 212 | + colorbar.ax.set_ylabel("Set size", fontsize=20) |
| 213 | + colorbar.ax.tick_params(labelsize=18) |
| 214 | + ax.set_title(name, fontsize=22, pad=12) |
| 215 | + ax.set_xlim([-6, 8]) |
| 216 | + ax.set_ylim([-6, 8]) |
| 217 | + |
| 218 | + if isinstance(mapie, SplitCPClassifier): |
| 219 | + centers = [] |
| 220 | + for f in mapie.calibrator_.functions_ + [mapie.calibrator_]: |
| 221 | + if hasattr(f, "points_"): |
| 222 | + centers += list(f.points_) |
| 223 | + if len(centers) > 0: |
| 224 | + centers = np.stack(centers) |
| 225 | + else: |
| 226 | + centers = None |
| 227 | + |
| 228 | + if centers is not None: |
| 229 | + ax.scatter(centers[:, 0], centers[:, 1], c="gold", |
| 230 | + alpha=1, edgecolors="black", s=50) |
| 231 | + |
| 232 | + scores[i, 1:] = [ |
| 233 | + y_ps_test[(y_test == c), c, 0].astype(int).sum(axis=0) |
| 234 | + / len(y_ps_test[(y_test == c), :, 0]) |
| 235 | + for c in range(N_CLASSES) |
| 236 | + ] |
| 237 | + scores[i, 0] = np.mean(scores[i, 1:]) |
| 238 | + |
| 239 | + if plot: |
| 240 | + fig.tight_layout() |
| 241 | + plt.show() |
| 242 | + else: |
| 243 | + return scores |
| 244 | + |
| 245 | + |
| 246 | +def plot_cond_coverage(scores, names): |
| 247 | + labels = [f"Class {i}" for i in range(N_CLASSES)] |
| 248 | + labels.insert(0, "marginal") |
| 249 | + x = np.arange(len(labels)) |
| 250 | + width = 0.2 |
| 251 | + |
| 252 | + fig, ax = plt.subplots(figsize=(10, 6)) |
| 253 | + for i in range(len(mapies)): |
| 254 | + ax.boxplot( |
| 255 | + scores[:, i, :], positions=x + width * (i-1), widths=width, |
| 256 | + patch_artist=True, boxprops=dict(facecolor=f"C{i}"), |
| 257 | + medianprops=dict(color="black"), labels=labels |
| 258 | + ) |
| 259 | + ax.axhline(y=1-ALPHA, color='red', linestyle='--', label=f'alpha={ALPHA}') |
| 260 | + ax.axvline(x=0.5, color='black', linestyle='--') |
| 261 | + |
| 262 | + ax.set_ylabel('Coverage') |
| 263 | + ax.set_title('Coverage on each class') |
| 264 | + ax.set_xticks(x) |
| 265 | + ax.set_xticklabels(labels) |
| 266 | + ax.set_ylim([0.6, 1]) |
| 267 | + |
| 268 | + custom_handles = [Patch(facecolor=f"C{i}", edgecolor='black', |
| 269 | + label=names[i]) for i in range(len(mapies))] |
| 270 | + handles, labels = ax.get_legend_handles_labels() |
| 271 | + |
| 272 | + # Update the legend with the combined handles and labels |
| 273 | + ax.legend(handles + custom_handles, labels + names, loc="lower left") |
| 274 | + |
| 275 | + plt.show() |
| 276 | + |
| 277 | + |
| 278 | +############################################################################## |
| 279 | +# 3. Creation of Mapie instances |
| 280 | +# -------------------------------------------------------------------------- |
| 281 | +# We are going to compare the standard ``LAC`` method with: |
| 282 | +# |
| 283 | +# - The ``CCP`` method using the predicted classes as groups (to have a |
| 284 | +# homogenous coverage on each class). |
| 285 | +# - The ``CCP`` method with gaussian kernels, to have adaptative prediction |
| 286 | +# sets, without prior knowledge or information |
| 287 | +# (:class:`~mapie.calibrators.ccp.GaussianCCP`). |
| 288 | + |
| 289 | + |
| 290 | +n_train = 5000 |
| 291 | +n_calib = 3000 |
| 292 | +n_test = 10000 |
| 293 | + |
| 294 | +cv = ShuffleSplit(n_splits=1, test_size=n_calib/(n_train + n_calib), |
| 295 | + random_state=random_state) |
| 296 | + |
| 297 | +# =========================== Standard LAC =========================== |
| 298 | +mapie_lac = MapieClassifier(LogisticRegression(), method="lac", cv=cv) |
| 299 | + |
| 300 | + |
| 301 | +# ============= CCP indicator groups on predicted classes ============= |
| 302 | +mapie_ccp_y_pred = SplitCPClassifier( |
| 303 | + LogisticRegression(), |
| 304 | + calibrator=CustomCCP(lambda y_pred: y_pred), |
| 305 | + alpha=ALPHA, cv=cv, conformity_score=LACConformityScore() |
| 306 | +) |
| 307 | + |
| 308 | +# ======================== CCP Gaussian kernels ======================== |
| 309 | +mapie_ccp_gauss = SplitCPClassifier( |
| 310 | + LogisticRegression(), |
| 311 | + calibrator=GaussianCCP(40, 1, bias=True), |
| 312 | + alpha=ALPHA, cv=cv, conformity_score=LACConformityScore() |
| 313 | +) |
| 314 | + |
| 315 | +mapies = [mapie_lac, mapie_ccp_y_pred, mapie_ccp_gauss] |
| 316 | +names = ["Standard LAC", "CCP predicted class groups", "CCP Gaussian kernel"] |
| 317 | + |
| 318 | + |
| 319 | +############################################################################## |
| 320 | +# 4. Generate the prediction sets |
| 321 | +# -------------------------------------------------------------------------- |
| 322 | + |
| 323 | +run_exp(mapies, names, ALPHA, n_train=n_train, n_calib=n_calib, n_test=n_test) |
| 324 | + |
| 325 | +############################################################################## |
| 326 | +# We can see that the ``CCP`` method seems to create better |
| 327 | +# prediction sets than the standard method. Indeed, where the |
| 328 | +# classes distributions overlap (especially for class 3 and 4), |
| 329 | +# the size of the sets should increase, to correctly represente the model |
| 330 | +# uncertainty on those samples. |
| 331 | +# |
| 332 | +# The middle of all the classes distributions, where points could |
| 333 | +# belong to any class, should have the biggest prediction sets (with almost |
| 334 | +# all the clases in the sets, as we are very uncertain). The calibrator |
| 335 | +# with gaussian kernels perfectly represented this uncertainty, with big sets |
| 336 | +# for the middle points (the dark purple being sets with 4 classes). |
| 337 | +# |
| 338 | +# Thus, between the two ``CCP`` methods, the one using gaussian kernels |
| 339 | +# (:class:`~mapie.calibrators.ccp.GaussianCCP`) seems the most adaptative. |
| 340 | +# |
| 341 | +# This modelisation of uncertainty is not visible at all in the standard |
| 342 | +# method, where we have, in the opposite, empty sets where the distributions |
| 343 | +# overlap. |
| 344 | + |
| 345 | + |
| 346 | +############################################################################## |
| 347 | +# 5. Evaluate the adaptativity |
| 348 | +# -------------------------------------------------------------------------- |
| 349 | +# If we can, at first, assess the adaptativity of the methods just looking at |
| 350 | +# the prediction sets, the most accurate way is to look if the coverage is |
| 351 | +# homogenous on sub parts of the data (on each class for instance). |
| 352 | + |
| 353 | + |
| 354 | +N_TRIALS = 6 |
| 355 | +scores = np.zeros((N_TRIALS, len(mapies), N_CLASSES+1)) |
| 356 | +for i in range(N_TRIALS): |
| 357 | + scores[i, :, :] = run_exp( |
| 358 | + mapies, names, ALPHA, n_train=n_train, n_calib=n_calib, n_test=n_test, |
| 359 | + plot=False, seed=i |
| 360 | + ) |
| 361 | + |
| 362 | +plot_cond_coverage(scores, names) |
| 363 | + |
| 364 | +############################################################################## |
| 365 | +# A pefectly adaptative method whould result in a homogenous coverage |
| 366 | +# for all classes. We can see that the ``CCP`` method, with the predicted |
| 367 | +# classes as groups, is more adaptative than the standard method. The |
| 368 | +# over-coverage of the standard method on class 1 was corrected in the ``CCP`` |
| 369 | +# method, and the under-coverage on class 4 was also slightly corrected. |
| 370 | +# |
| 371 | +# However, the ``CCP`` with a gaussian calibrator |
| 372 | +# (:class:`~mapie.calibrators.ccp.GaussianCCP`), is clearly the |
| 373 | +# most adaptative method, with no under-coverage neither for the class 2 and 4. |
| 374 | +# |
| 375 | +# To conclude, the ``CCP`` method offer adaptative perdiction sets. |
| 376 | +# We can inject prior knowledge or groups on which we want to avois bias |
| 377 | +# (We tried to do this with the classes, but it was not perfect because we only |
| 378 | +# had access to the predictions, not the true classes). |
| 379 | +# Using gaussian kernels, with a correct sigma parameter |
| 380 | +# (which can be optimized using cross-validation if needed), can be the easiest |
| 381 | +# and best solution to have very adaptative prdiction sets. |
0 commit comments