From 67278783f2d670f15888d3360d64af187f3d3151 Mon Sep 17 00:00:00 2001 From: Katharina Eggensperger Date: Fri, 13 May 2022 08:52:49 +0200 Subject: [PATCH 01/19] FIX minor --- CONTRIBUTING.md | 23 ++++++++++++++++------- 1 file changed, 16 insertions(+), 7 deletions(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 19aaca371e..73ce781618 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -61,7 +61,7 @@ Following that we'll tell you about how you can test your changes locally and th # If you missed the --recurse-submodules arg during clone or need to install the # submodule manually, then execute the following line: # - # git submodule udate --init --recursive + # git submodule update --init --recursive ``` The reason to create a new branch is two fold: @@ -207,16 +207,16 @@ Sometimes, the new functionality isn't so clear from a simple parameter descript Lastly, if the feature really is a game changer or you're very proud of it, consider making an `example_*.py` that will be run and rendered in the online docs! ## Testing -* Let's assume you've made some changes, now we have to make sure they work. +* Let's assume you've made some changes, now we have to make sure they work. Begin by simply running all the tests. If there's any errors, they'll pop up once it's complete. ```bash pytest ``` - * Note that these may take a while so check out `pytest --help` to see how you can run tests so that only previous failures run or only certain tests are run. + * Note that these may take a while so check out `pytest --help` to see how you can run tests so that only previous failures run or only certain tests are run. This can help you try changes and get results faster. Do however run one last full `pytest` once you are finished and happy! - * Here are some we find particularly useful + * Here are some we find particularly useful ``` # Run tests in specific file like 'test_estimators.py' pytest "test/test_automl/test_estimators.py" @@ -236,9 +236,18 @@ Lastly, if the feature really is a game changer or you're very proud of it, cons # Exit on the first test failure pytest -x ``` - * More advanced editors like PyCharm may have built in integrations which could be good to check out! + * More advanced editors like PyCharm may have built in integrations which could be good to check out! + * Running all unittests will take a while, here's how you can run them in parallel + ``` + export OPENBLAS_NUM_THREADS=1 + export MKL_NUM_THREADS=1 + export OMP_NUM_THREADS=1 + + pytest -n 4 + ``` + -* Now we are going to use [sphinx](https://www.sphinx-doc.org/en/master/) to generate all the documentation and make sure there are no issues. +* Now we are going to use [sphinx](https://www.sphinx-doc.org/en/master/) to generate all the documentation and make sure there are no issues. ```bash make doc ``` @@ -261,7 +270,7 @@ Lastly, if the feature really is a game changer or you're very proud of it, cons xdg-open ./doc/build/html/index.html ``` -* Once you've made all your changes and all the tests pass successfully, we need to make sure that the code fits a certain format and that the [typing](https://docs.python.org/3/library/typing.html) is correct. +* Once you've made all your changes and all the tests pass successfully, we need to make sure that the code fits a certain format and that the [typing](https://docs.python.org/3/library/typing.html) is correct. * Formatting and import sorting can helps keep things uniform across all coding styles. 
We use [`black`](https://black.readthedocs.io/en/stable/) and [`isort`](https://isort.readthedocs.io/en/latest/) to do this for us. To automatically run these formatters across the code base, just run the following command: ```bash make format From 2a5c13c6e8e25f19078f7dedeccf54097faead47 Mon Sep 17 00:00:00 2001 From: Katharina Eggensperger Date: Fri, 13 May 2022 15:47:08 +0200 Subject: [PATCH 02/19] update submodule --- autosklearn/automl_common | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/autosklearn/automl_common b/autosklearn/automl_common index 4c8ab915e0..932d830360 160000 --- a/autosklearn/automl_common +++ b/autosklearn/automl_common @@ -1 +1 @@ -Subproject commit 4c8ab915e007745611b9b7266137497839aba701 +Subproject commit 932d830360ae6c057e9430432702e86d82a630af From f6ac3f478b1090a7ac1092384508b7980508f0e1 Mon Sep 17 00:00:00 2001 From: Katharina Eggensperger Date: Fri, 13 May 2022 15:57:22 +0200 Subject: [PATCH 03/19] update submodule --- autosklearn/automl_common | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/autosklearn/automl_common b/autosklearn/automl_common index 932d830360..9f554ee855 160000 --- a/autosklearn/automl_common +++ b/autosklearn/automl_common @@ -1 +1 @@ -Subproject commit 932d830360ae6c057e9430432702e86d82a630af +Subproject commit 9f554ee85579d7b3b7d0b002323d7d13afc0c014 From eea9ab3d53a72d6ca0065a109c53fef551a83c7b Mon Sep 17 00:00:00 2001 From: Katharina Eggensperger Date: Fri, 13 May 2022 17:46:20 +0200 Subject: [PATCH 04/19] ADD pass xdata to metric --- autosklearn/evaluation/abstract_evaluator.py | 13 +- autosklearn/evaluation/train_evaluator.py | 38 +++++- autosklearn/metrics/__init__.py | 127 ++++++++++++++----- 3 files changed, 141 insertions(+), 37 deletions(-) diff --git a/autosklearn/evaluation/abstract_evaluator.py b/autosklearn/evaluation/abstract_evaluator.py index ab9e961128..d6f2212856 100644 --- a/autosklearn/evaluation/abstract_evaluator.py +++ b/autosklearn/evaluation/abstract_evaluator.py @@ -25,6 +25,7 @@ MULTIOUTPUT_REGRESSION, REGRESSION_TASKS, ) +from autosklearn.data.validation import SUPPORTED_TARGET_TYPES from autosklearn.metrics import Scorer, calculate_losses from autosklearn.pipeline.components.base import ThirdPartyComponents, _addons from autosklearn.pipeline.implementations.util import ( @@ -273,7 +274,8 @@ def __init__( port=self.port, ) - self.Y_optimization: Optional[Union[List, np.ndarray]] = None + self.X_optimization: Optional[SUPPORTED_TARGET_TYPES] = None + self.Y_optimization: Optional[SUPPORTED_TARGET_TYPES] = None self.Y_actual_train = None self.budget = budget @@ -328,6 +330,7 @@ def _loss( self, y_true: np.ndarray, y_hat: np.ndarray, + x_data: Optional[np.ndarray] = None, ) -> Dict[str, float]: """Auto-sklearn follows a minimization goal. 
The calculate_loss internally translate a score function to @@ -354,6 +357,7 @@ def _loss( y_hat, self.task_type, self.metrics, + x_data=x_data, scoring_functions=self.scoring_functions, ) @@ -522,7 +526,12 @@ def file_output( # This file can be written independently of the others down bellow if "y_optimization" not in self.disable_file_output: if self.output_y_hat_optimization: - self.backend.save_targets_ensemble(self.Y_optimization) + self.backend.save_additional_data( + self.Y_optimization, what="targets_ensemble" + ) + self.backend.save_additional_data( + self.X_optimization, what="input_ensemble" + ) models: Optional[BaseEstimator] = None if hasattr(self, "models"): diff --git a/autosklearn/evaluation/train_evaluator.py b/autosklearn/evaluation/train_evaluator.py index f6317ca94e..dbb1674db8 100644 --- a/autosklearn/evaluation/train_evaluator.py +++ b/autosklearn/evaluation/train_evaluator.py @@ -235,7 +235,7 @@ def __init__( ) self.X_train = self.datamanager.data["X_train"] self.Y_train = self.datamanager.data["Y_train"] - self.Y_optimization: Optional[SUPPORTED_TARGET_TYPES] = None + self.X_targets = [None] * self.num_cv_folds self.Y_targets = [None] * self.num_cv_folds self.Y_train_targets = np.ones(self.Y_train.shape) * np.NaN self.models = [None] * self.num_cv_folds @@ -265,6 +265,7 @@ def fit_predict_and_loss(self, iterative: bool = False) -> None: self.Y_train, groups=self.resampling_strategy_args.get("groups"), ): + self.X_optimization = self.X_train[test_split] self.Y_optimization = self.Y_train[test_split] self.Y_actual_train = self.Y_train[train_split] self._partial_fit_and_predict_iterative( @@ -359,6 +360,7 @@ def fit_predict_and_loss(self, iterative: bool = False) -> None: if hasattr(self.Y_train, "iloc") else self.Y_train[train_indices] ) + self.X_targets[i] = self.X_train[test_indices] self.Y_targets[i] = self.Y_train[test_indices] Xt, fit_params = model.fit_transformer( @@ -400,6 +402,8 @@ def fit_predict_and_loss(self, iterative: bool = False) -> None: if hasattr(self.Y_train, "iloc") else self.Y_train[train_indices], train_pred, + # TODO: Check whether this is the correct xdata + x_data=Xt_array[i], ) train_losses[i] = train_loss # Number of training data points for this fold. @@ -408,8 +412,7 @@ def fit_predict_and_loss(self, iterative: bool = False) -> None: # Compute validation loss of this fold and store it. optimization_loss = self._loss( - self.Y_targets[i], - opt_pred, + self.Y_targets[i], opt_pred, x_data=self.X_targets[i] ) opt_losses[i] = optimization_loss # number of optimization data points for this fold. 
@@ -455,6 +458,7 @@ def fit_predict_and_loss(self, iterative: bool = False) -> None: weights=opt_fold_weights_percentage, ) + X_targets = self.X_targets Y_targets = self.Y_targets Y_train_targets = self.Y_train_targets @@ -465,6 +469,13 @@ def fit_predict_and_loss(self, iterative: bool = False) -> None: if Y_optimization_pred[i] is not None ] ) + X_targets = np.concatenate( + [ + X_targets[i] + for i in range(self.num_cv_folds) + if X_targets[i] is not None + ] + ) Y_targets = np.concatenate( [ Y_targets[i] @@ -501,6 +512,7 @@ def fit_predict_and_loss(self, iterative: bool = False) -> None: else: Y_test_preds = None + self.X_optimization = X_targets self.Y_optimization = Y_targets self.Y_actual_train = Y_train_targets @@ -602,6 +614,7 @@ def fit_predict_and_loss(self, iterative: bool = False) -> None: train_loss = self._loss( self.Y_train_targets[train_split], train_pred, + x_data=self.X_train[train_split], ) train_losses.append(train_loss) # number of training data points for this fold. Used for weighting @@ -610,8 +623,7 @@ def fit_predict_and_loss(self, iterative: bool = False) -> None: # Compute validation loss of this fold and store it. optimization_loss = self._loss( - self.Y_targets[i], - opt_pred, + self.Y_targets[i], opt_pred, x_data=self.X_targets[i] ) opt_losses.append(optimization_loss) # number of optimization data points for this fold. Used for weighting @@ -644,6 +656,7 @@ def fit_predict_and_loss(self, iterative: bool = False) -> None: weights=opt_fold_weights, ) + X_targets = self.X_targets Y_targets = self.Y_targets Y_train_targets = self.Y_train_targets @@ -654,6 +667,13 @@ def fit_predict_and_loss(self, iterative: bool = False) -> None: if Y_optimization_pred[i] is not None ] ) + X_targets = np.concatenate( + [ + X_targets[i] + for i in range(self.num_cv_folds) + if X_targets[i] is not None + ] + ) Y_targets = np.concatenate( [ Y_targets[i] @@ -686,6 +706,7 @@ def fit_predict_and_loss(self, iterative: bool = False) -> None: if len(np.shape(Y_test_pred)) == 3: Y_test_pred = np.nanmean(Y_test_pred, axis=0) + self.X_optimization = X_targets self.Y_optimization = Y_targets self.Y_actual_train = Y_train_targets @@ -754,6 +775,7 @@ def partial_fit_predict_and_loss(self, fold: int, iterative: bool = False) -> No break if self.num_cv_folds > 1: + self.X_optimization = self.X_train[test_split] self.Y_optimization = self.Y_train[test_split] self.Y_actual_train = self.Y_train[train_split] @@ -981,6 +1003,11 @@ def _partial_fit_and_predict_standard( else: self.models[fold] = model + self.X_targets[fold] = ( + self.X_train.iloc[test_indices] + if hasattr(self.X_train, "iloc") + else self.X_train[test_indices] + ) self.Y_targets[fold] = ( self.Y_train.iloc[test_indices] if hasattr(self.Y_train, "iloc") @@ -1026,6 +1053,7 @@ def _partial_fit_and_predict_budget( model = self._get_model() self.indices[fold] = (train_indices, test_indices) + self.X_targets[fold] = self.X_train[test_indices] self.Y_targets[fold] = self.Y_train[test_indices] self.Y_train_targets[train_indices] = ( self.Y_train.iloc[train_indices] diff --git a/autosklearn/metrics/__init__.py b/autosklearn/metrics/__init__.py index 3104716da3..9e167694c8 100644 --- a/autosklearn/metrics/__init__.py +++ b/autosklearn/metrics/__init__.py @@ -1,5 +1,5 @@ from abc import ABCMeta, abstractmethod -from typing import Any, Callable, Dict, List, Optional, Sequence +from typing import Any, Callable, Dict, List, Optional, Sequence, Union import collections from functools import partial @@ -32,11 +32,13 @@ def __init__( 
worst_possible_result: float, sign: float, kwargs: Any, + needs_x: bool = False, ) -> None: self.name = name self._kwargs = kwargs self._score_func = score_func self._optimum = optimum + self._needs_x = needs_x self._worst_possible_result = worst_possible_result self._sign = sign @@ -45,6 +47,7 @@ def __call__( self, y_true: np.ndarray, y_pred: np.ndarray, + x_data: Optional[np.ndarray] = None, sample_weight: Optional[List[float]] = None, ) -> float: pass @@ -58,6 +61,7 @@ def __call__( self, y_true: np.ndarray, y_pred: np.ndarray, + x_data: Optional[np.ndarray] = None, sample_weight: Optional[List[float]] = None, ) -> float: """Evaluate predicted target values for X relative to y_true. @@ -104,12 +108,13 @@ def __call__( else: raise ValueError(type_true) + sc_args = {} # type: Dict[str, Union[List[float], np.ndarray]] if sample_weight is not None: - return self._sign * self._score_func( - y_true, y_pred, sample_weight=sample_weight, **self._kwargs - ) - else: - return self._sign * self._score_func(y_true, y_pred, **self._kwargs) + sc_args["sample_weight"] = sample_weight + if self._needs_x is True: + sc_args["x_data"] = x_data + + return self._sign * self._score_func(y_true, y_pred, **sc_args, **self._kwargs) class _ProbaScorer(Scorer): @@ -117,6 +122,7 @@ def __call__( self, y_true: np.ndarray, y_pred: np.ndarray, + x_data: Optional[np.ndarray] = None, sample_weight: Optional[List[float]] = None, ) -> float: """Evaluate predicted probabilities for X relative to y_true. @@ -156,12 +162,13 @@ def __call__( y_true, y_pred, labels=labels, **self._kwargs ) + sc_args = {} # type: Dict[str, Union[List[float], np.ndarray]] if sample_weight is not None: - return self._sign * self._score_func( - y_true, y_pred, sample_weight=sample_weight, **self._kwargs - ) - else: - return self._sign * self._score_func(y_true, y_pred, **self._kwargs) + sc_args["sample_weight"] = sample_weight + if self._needs_x is True: + sc_args["x_data"] = x_data + + return self._sign * self._score_func(y_true, y_pred, **sc_args, **self._kwargs) class _ThresholdScorer(Scorer): @@ -169,6 +176,7 @@ def __call__( self, y_true: np.ndarray, y_pred: np.ndarray, + x_data: Optional[np.ndarray] = None, sample_weight: Optional[List[float]] = None, ) -> float: """Evaluate decision function output for X relative to y_true. @@ -199,22 +207,25 @@ def __call__( elif isinstance(y_pred, list): y_pred = np.vstack([p[:, -1] for p in y_pred]).T + sc_args = {} # type: Dict[str, Union[List[float], np.ndarray]] if sample_weight is not None: - return self._sign * self._score_func( - y_true, y_pred, sample_weight=sample_weight, **self._kwargs - ) - else: - return self._sign * self._score_func(y_true, y_pred, **self._kwargs) + sc_args["sample_weight"] = sample_weight + if self._needs_x is True: + sc_args["x_data"] = x_data + + return self._sign * self._score_func(y_true, y_pred, **sc_args, **self._kwargs) def make_scorer( name: str, score_func: Callable, + *, optimum: float = 1.0, worst_possible_result: float = 0.0, greater_is_better: bool = True, needs_proba: bool = False, needs_threshold: bool = False, + needs_x: bool = False, **kwargs: Any, ) -> Scorer: """Make a scorer from a performance metric or loss function. @@ -224,6 +235,9 @@ def make_scorer( Parameters ---------- + name: str + Descriptive name of the metric + score_func : callable Score function (or loss function) with signature ``score_func(y, y_pred, **kwargs)``. @@ -232,6 +246,10 @@ def make_scorer( The best score achievable by the score function, i.e. 
maximum in case of scorer function and minimum in case of loss function. + worst_possible_result : int of float, default=0 + The worst score achievable by the score function, i.e. minimum in case of + scorer function and maximum in case of loss function. + greater_is_better : boolean, default=True Whether score_func is a score function (default), meaning high is good, or a loss function, meaning low is good. In the latter case, the @@ -245,27 +263,34 @@ def make_scorer( Whether score_func takes a continuous decision certainty. This only works for binary classification. + needs_x : boolean, default=False + Whether score_func requires X in __call__ to compute a metric. + **kwargs : additional arguments Additional parameters to be passed to score_func. Returns ------- scorer : callable - Callable object that returns a scalar score; greater is better. + Callable object that returns a scalar score; greater is better or set + greater_is_better to False. """ sign = 1 if greater_is_better else -1 - if needs_proba: - return _ProbaScorer( - name, score_func, optimum, worst_possible_result, sign, kwargs + if needs_proba and needs_threshold: + raise ValueError( + "Set either needs_proba or needs_threshold to True, but not both." ) + + cls = None # type: Any + if needs_proba: + cls = _ProbaScorer elif needs_threshold: - return _ThresholdScorer( - name, score_func, optimum, worst_possible_result, sign, kwargs - ) + cls = _ThresholdScorer else: - return _PredictScorer( - name, score_func, optimum, worst_possible_result, sign, kwargs - ) + cls = _PredictScorer + return cls( + name, score_func, optimum, worst_possible_result, sign, kwargs, needs_x=needs_x + ) # Standard regression scores @@ -420,6 +445,8 @@ def calculate_scores( prediction: np.ndarray, task_type: int, metrics: Sequence[Scorer], + *, + x_data: Optional[np.ndarray] = None, scoring_functions: Optional[List[Scorer]] = None, ) -> Dict[str, float]: """ @@ -460,7 +487,11 @@ def calculate_scores( try: score_dict[metric_.name] = _compute_single_scorer( - metric_, prediction, solution, task_type + metric_, + prediction, + solution, + task_type, + x_data, ) except ValueError as e: print(e, e.args[0]) @@ -480,7 +511,11 @@ def calculate_scores( try: score_dict[metric_.name] = _compute_single_scorer( - metric_, prediction, solution, task_type + metric_, + prediction, + solution, + task_type, + x_data, ) except ValueError as e: if e.args[0] == "multiclass format is not supported": @@ -507,6 +542,8 @@ def calculate_losses( prediction: np.ndarray, task_type: int, metrics: Sequence[Scorer], + *, + x_data: Optional[np.ndarray] = None, scoring_functions: Optional[List[Scorer]] = None, ) -> Dict[str, float]: """ @@ -526,6 +563,8 @@ def calculate_losses( metrics: Sequence[Scorer] A list of objects that hosts a function to calculate how good the prediction is according to the solution. 
+ x_data: Optional[np.ndarray] + X data necessary for some metrics scoring_functions: List[Scorer] A list of metrics to calculate multiple losses @@ -537,6 +576,7 @@ def calculate_losses( score = calculate_scores( solution=solution, prediction=prediction, + x_data=x_data, task_type=task_type, metrics=metrics, scoring_functions=scoring_functions, @@ -556,7 +596,11 @@ def calculate_losses( def compute_single_metric( - metric: Scorer, prediction: np.ndarray, solution: np.ndarray, task_type: int + metric: Scorer, + prediction: np.ndarray, + solution: np.ndarray, + task_type: int, + x_data: Optional[np.ndarray] = None, ) -> float: """ Returns a metric for the given Auto-Sklearn Scorer object. @@ -583,13 +627,18 @@ def compute_single_metric( solution=solution, prediction=prediction, metric=metric, + x_data=x_data, task_type=task_type, ) return metric._sign * score def _compute_single_scorer( - metric: Scorer, prediction: np.ndarray, solution: np.ndarray, task_type: int + metric: Scorer, + prediction: np.ndarray, + solution: np.ndarray, + task_type: int, + x_data: Optional[np.ndarray] = None, ) -> float: """ Returns a score (a magnitude that allows casting the @@ -612,6 +661,24 @@ def _compute_single_scorer( ------- float """ + if metric._needs_x: + if x_data is None: + raise ValueError( + f"Metric {metric.name} needs x_data, but x_data is {x_data}" + ) + elif x_data.shape[0] != solution.shape[0]: + raise ValueError( + f"x_data has wrong length. " + f"Should be {solution.shape[0]}, but is {x_data.shape[0]}" + ) + if task_type in REGRESSION_TASKS: + # TODO put this into the regression metric itself + cprediction = sanitize_array(prediction) + score = metric(solution, cprediction, x_data=x_data) + else: + score = metric(solution, prediction, x_data=x_data) + return score + if task_type in REGRESSION_TASKS: # TODO put this into the regression metric itself cprediction = sanitize_array(prediction) From 0bbe2ab2f21e119b094a3c493d8a3c6df970fb42 Mon Sep 17 00:00:00 2001 From: Katharina Eggensperger Date: Mon, 16 May 2022 18:43:44 +0200 Subject: [PATCH 05/19] update submodule --- autosklearn/automl_common | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/autosklearn/automl_common b/autosklearn/automl_common index 9f554ee855..ea3ad3811c 160000 --- a/autosklearn/automl_common +++ b/autosklearn/automl_common @@ -1 +1 @@ -Subproject commit 9f554ee85579d7b3b7d0b002323d7d13afc0c014 +Subproject commit ea3ad3811c5a18a6a0a2d143399e4ade25890e51 From 2e882a783bb46a6cd6607b521ba0080aa41d3f93 Mon Sep 17 00:00:00 2001 From: Katharina Eggensperger Date: Mon, 16 May 2022 19:51:13 +0200 Subject: [PATCH 06/19] Fix tests --- autosklearn/evaluation/train_evaluator.py | 90 ++++++++++++++++---- test/fixtures/ensemble_building.py | 4 +- test/test_evaluation/test_evaluation.py | 8 +- test/test_evaluation/test_train_evaluator.py | 5 +- test/test_metric/test_metrics.py | 4 +- 5 files changed, 85 insertions(+), 26 deletions(-) diff --git a/autosklearn/evaluation/train_evaluator.py b/autosklearn/evaluation/train_evaluator.py index dbb1674db8..1a0933ea68 100644 --- a/autosklearn/evaluation/train_evaluator.py +++ b/autosklearn/evaluation/train_evaluator.py @@ -7,6 +7,9 @@ import warnings import numpy as np +import pandas +import pandas as pd +import scipy.sparse from ConfigSpace import Configuration from sklearn.base import BaseEstimator from sklearn.model_selection import ( @@ -265,7 +268,11 @@ def fit_predict_and_loss(self, iterative: bool = False) -> None: self.Y_train, 
groups=self.resampling_strategy_args.get("groups"), ): - self.X_optimization = self.X_train[test_split] + self.X_optimization = ( + self.X_train.iloc[test_split] + if hasattr(self.X_train, "iloc") + else self.X_train[test_split] + ) self.Y_optimization = self.Y_train[test_split] self.Y_actual_train = self.Y_train[train_split] self._partial_fit_and_predict_iterative( @@ -360,7 +367,12 @@ def fit_predict_and_loss(self, iterative: bool = False) -> None: if hasattr(self.Y_train, "iloc") else self.Y_train[train_indices] ) - self.X_targets[i] = self.X_train[test_indices] + self.X_targets[i] = ( + self.X_train.iloc[test_indices] + if hasattr(self.X_train, "iloc") + else self.X_train[train_indices] + ) + self.Y_targets[i] = self.Y_train[test_indices] Xt, fit_params = model.fit_transformer( @@ -469,13 +481,35 @@ def fit_predict_and_loss(self, iterative: bool = False) -> None: if Y_optimization_pred[i] is not None ] ) - X_targets = np.concatenate( - [ - X_targets[i] - for i in range(self.num_cv_folds) - if X_targets[i] is not None - ] - ) + + if isinstance(X_targets[0], np.ndarray): + X_targets = np.concatenate( + [ + X_targets[i] + for i in range(self.num_cv_folds) + if X_targets[i] is not None + ] + ) + elif isinstance(X_targets[0], scipy.sparse.spmatrix): + X_targets = scipy.sparse.vstack( + [ + X_targets[i] + for i in range(self.num_cv_folds) + if X_targets[i] is not None + ] + ) + elif isinstance(X_targets[0], pd.DataFrame): + X_targets = pd.concat( + [ + X_targets[i] + for i in range(self.num_cv_folds) + if X_targets[i] is not None + ], + axis=0, + ) + else: + raise ValueError(f"Unknown datatype {type(X_targets[0])}") + Y_targets = np.concatenate( [ Y_targets[i] @@ -614,7 +648,9 @@ def fit_predict_and_loss(self, iterative: bool = False) -> None: train_loss = self._loss( self.Y_train_targets[train_split], train_pred, - x_data=self.X_train[train_split], + x_data=self.X_train.iloc[train_split] + if hasattr(self.X_train, "iloc") + else self.X_train[train_split], ) train_losses.append(train_loss) # number of training data points for this fold. 
Used for weighting @@ -667,13 +703,33 @@ def fit_predict_and_loss(self, iterative: bool = False) -> None: if Y_optimization_pred[i] is not None ] ) - X_targets = np.concatenate( - [ - X_targets[i] - for i in range(self.num_cv_folds) - if X_targets[i] is not None - ] - ) + if isinstance(X_targets[0], np.ndarray): + X_targets = np.concatenate( + [ + X_targets[i] + for i in range(self.num_cv_folds) + if X_targets[i] is not None + ] + ) + elif isinstance(X_targets[0], scipy.sparse.spmatrix): + X_targets = scipy.sparse.vstack( + [ + X_targets[i] + for i in range(self.num_cv_folds) + if X_targets[i] is not None + ] + ) + elif isinstance(X_targets[0], pd.DataFrame): + X_targets = pd.concat( + [ + X_targets[i] + for i in range(self.num_cv_folds) + if X_targets[i] is not None + ], + axis=0, + ) + else: + raise ValueError(f"Unknown datatype {type(X_targets[0])}") Y_targets = np.concatenate( [ Y_targets[i] diff --git a/test/fixtures/ensemble_building.py b/test/fixtures/ensemble_building.py index 42dd7fbb9a..1a6c18073a 100644 --- a/test/fixtures/ensemble_building.py +++ b/test/fixtures/ensemble_building.py @@ -145,7 +145,9 @@ def _make( # Hence, we take the y_train of the datamanager and use that as the # the targets if "Y_train" in datamanager.data: - backend.save_targets_ensemble(datamanager.data["Y_train"]) + backend.save_additional_data( + datamanager.data["Y_train"], what="targets_ensemble" + ) builder = EnsembleBuilder( backend=backend, diff --git a/test/test_evaluation/test_evaluation.py b/test/test_evaluation/test_evaluation.py index f5292060a6..32ba71e925 100644 --- a/test/test_evaluation/test_evaluation.py +++ b/test/test_evaluation/test_evaluation.py @@ -495,7 +495,7 @@ def test_silent_exception_in_target_function(self): config = unittest.mock.Mock() config.config_id = 198 - delattr(self.backend, "save_targets_ensemble") + delattr(self.backend, "save_additional_data") ta = ExecuteTaFuncWithQueue( backend=self.backend, port=self.logger_port, @@ -531,10 +531,10 @@ def test_silent_exception_in_target_function(self): info[1].additional_info["error"], ( """AttributeError("'BackendMock' object has no attribute """ - """'save_targets_ensemble'",)""", + """'save_additional_data'",)""", """AttributeError("'BackendMock' object has no attribute """ - """'save_targets_ensemble'")""", - """AttributeError('save_targets_ensemble')""", + """'save_additional_data'")""", + """AttributeError('save_additional_data')""", ), ) self.assertNotIn("exitcode", info[1].additional_info) diff --git a/test/test_evaluation/test_train_evaluator.py b/test/test_evaluation/test_train_evaluator.py index 23607b8e4d..1c5842b457 100644 --- a/test/test_evaluation/test_train_evaluator.py +++ b/test/test_evaluation/test_train_evaluator.py @@ -816,7 +816,7 @@ def test_file_output(self, loss_mock, model_mock): ) self.assertEqual(rval, (None, {})) - self.assertEqual(self.backend_mock.save_targets_ensemble.call_count, 1) + self.assertEqual(self.backend_mock.save_additional_data.call_count, 2) self.assertEqual(self.backend_mock.save_numrun_to_dir.call_count, 1) self.assertEqual( self.backend_mock.save_numrun_to_dir.call_args_list[-1][1].keys(), @@ -845,7 +845,7 @@ def test_file_output(self, loss_mock, model_mock): D.data["Y_test"], ) self.assertEqual(rval, (None, {})) - self.assertEqual(self.backend_mock.save_targets_ensemble.call_count, 2) + self.assertEqual(self.backend_mock.save_additional_data.call_count, 4) self.assertEqual(self.backend_mock.save_numrun_to_dir.call_count, 2) self.assertEqual( 
self.backend_mock.save_numrun_to_dir.call_args_list[-1][1].keys(), @@ -1109,6 +1109,7 @@ def test_fit_predict_and_loss_standard_additional_run_info( evaluator.file_output.return_value = (None, {}) evaluator.model = unittest.mock.Mock() evaluator.model.estimator_supports_iterative_fit.return_value = False + evaluator.X_targets[0] = np.array([1, 0] * 23) evaluator.Y_targets[0] = np.array([1] * 23) evaluator.Y_train_targets = np.array([1] * 69) rval = evaluator.fit_predict_and_loss(iterative=False) diff --git a/test/test_metric/test_metrics.py b/test/test_metric/test_metrics.py index 2cb7dc2158..36fa4f72ae 100644 --- a/test/test_metric/test_metrics.py +++ b/test/test_metric/test_metrics.py @@ -570,7 +570,7 @@ def test_classification_scoring_functions(self): y_pred, BINARY_CLASSIFICATION, [autosklearn.metrics.accuracy], - scoring_functions, + scoring_functions=scoring_functions, ) self.assertIsInstance(score_dict, dict) @@ -600,7 +600,7 @@ def test_regression_scoring_functions(self): y_pred, REGRESSION, [autosklearn.metrics.root_mean_squared_error], - scoring_functions, + scoring_functions=scoring_functions, ) self.assertIsInstance(score_dict, dict) From 915386d8862c6af8874c12fc42f36713da1dbab4 Mon Sep 17 00:00:00 2001 From: Katharina Eggensperger Date: Mon, 16 May 2022 20:21:24 +0200 Subject: [PATCH 07/19] update submodule --- autosklearn/automl_common | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/autosklearn/automl_common b/autosklearn/automl_common index ea3ad3811c..63877b2531 160000 --- a/autosklearn/automl_common +++ b/autosklearn/automl_common @@ -1 +1 @@ -Subproject commit ea3ad3811c5a18a6a0a2d143399e4ade25890e51 +Subproject commit 63877b253124ca7ac805aefaf02b69ebd7837010 From 5df1d5a44c7697c6a34972f092a5b0d019c5be81 Mon Sep 17 00:00:00 2001 From: Katharina Eggensperger Date: Mon, 16 May 2022 21:17:25 +0200 Subject: [PATCH 08/19] ADD example --- examples/40_advanced/example_metrics.py | 41 +++++++++++++++++++++++++ 1 file changed, 41 insertions(+) diff --git a/examples/40_advanced/example_metrics.py b/examples/40_advanced/example_metrics.py index 7784491746..6c534acc3a 100644 --- a/examples/40_advanced/example_metrics.py +++ b/examples/40_advanced/example_metrics.py @@ -46,6 +46,12 @@ def error_wk(solution, prediction, extra_argument): return np.mean(solution != prediction) +def metric_which_needs_x(solution, prediction, x_data, extra_argument): + # custom function defining accuracy + assert x_data is not None + return np.mean(solution[extra_argument, :] == prediction[extra_argument, :]) + + ############################################################################ # Data Loading # ============ @@ -186,3 +192,38 @@ def error_wk(solution, prediction, extra_argument): predictions = cls.predict(X_test) score = error_rate(y_test, predictions) print(f"Error score {score:.3f} using {error_rate.name:s}") + + +############################################################################# +# Sixth example: Use a metric with additional argument which also needs xdata +# =============================================================== +""" +Finally, *Auto-sklearn* also support metric that require the train data (aka x_data) to +compute a value. This can be useful if one only cares about the score on a subset of the +data. 
+""" + +accuracy_scorer = autosklearn.metrics.make_scorer( + name="accu_X", + score_func=metric_which_needs_x, + optimum=1, + greater_is_better=True, + needs_proba=False, + needs_x=True, + needs_threshold=False, + extra_argument=[1, 2, 3, 4, 5], +) +cls = autosklearn.classification.AutoSklearnClassifier( + time_left_for_this_task=60, + per_run_time_limit=30, + seed=1, + metric=accuracy_scorer, + ensemble_size=0, +) +cls.fit(X_train, y_train) + +predictions = cls.predict(X_test) +score = metric_which_needs_x( + y_test, predictions, x_data=X_test, extra_argument=X_test[:, 1] > 20 +) +print(f"Error score {score:.3f} using {error_rate.name:s}") From e47ccf5aaa27a6c5657ede09eae49a2849b3a2bd Mon Sep 17 00:00:00 2001 From: Katharina Eggensperger Date: Tue, 17 May 2022 13:01:51 +0200 Subject: [PATCH 09/19] UPDATE example --- examples/40_advanced/example_metrics.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/examples/40_advanced/example_metrics.py b/examples/40_advanced/example_metrics.py index 6c534acc3a..f6808c8ebe 100644 --- a/examples/40_advanced/example_metrics.py +++ b/examples/40_advanced/example_metrics.py @@ -46,10 +46,11 @@ def error_wk(solution, prediction, extra_argument): return np.mean(solution != prediction) -def metric_which_needs_x(solution, prediction, x_data, extra_argument): +def metric_which_needs_x(solution, prediction, x_data, consider_col, threshold): # custom function defining accuracy assert x_data is not None - return np.mean(solution[extra_argument, :] == prediction[extra_argument, :]) + rel_idx = x_data[:, consider_col] > threshold + return np.mean(solution[rel_idx] == prediction[rel_idx]) ############################################################################ @@ -211,7 +212,8 @@ def metric_which_needs_x(solution, prediction, x_data, extra_argument): needs_proba=False, needs_x=True, needs_threshold=False, - extra_argument=[1, 2, 3, 4, 5], + consider_col=1, + threshold=20, ) cls = autosklearn.classification.AutoSklearnClassifier( time_left_for_this_task=60, @@ -224,6 +226,10 @@ def metric_which_needs_x(solution, prediction, x_data, extra_argument): predictions = cls.predict(X_test) score = metric_which_needs_x( - y_test, predictions, x_data=X_test, extra_argument=X_test[:, 1] > 20 + y_test, + predictions, + x_data=X_test, + consider_col=1, + threshold=18.8, ) print(f"Error score {score:.3f} using {error_rate.name:s}") From f3c2c66be6fdce2ca5e60398676a872849078ad3 Mon Sep 17 00:00:00 2001 From: Katharina Eggensperger Date: Tue, 17 May 2022 13:16:28 +0200 Subject: [PATCH 10/19] ADD extra method to concat data --- autosklearn/evaluation/train_evaluator.py | 112 ++++++---------------- 1 file changed, 29 insertions(+), 83 deletions(-) diff --git a/autosklearn/evaluation/train_evaluator.py b/autosklearn/evaluation/train_evaluator.py index 1a0933ea68..d0a119b5c1 100644 --- a/autosklearn/evaluation/train_evaluator.py +++ b/autosklearn/evaluation/train_evaluator.py @@ -180,6 +180,26 @@ def _fit_with_budget( raise ValueError(budget_type) +def concat_data( + data: List(Any), num_cv_folds: int +) -> Union(np.ndarray, pd.DataFrame, scipy.sparse.csr_matrix): + if isinstance(data[0], np.ndarray): + return np.concatenate( + [data[i] for i in range(num_cv_folds) if data[i] is not None] + ) + elif isinstance(data[0], scipy.sparse.spmatrix): + return scipy.sparse.vstack( + [data[i] for i in range(num_cv_folds) if data[i] is not None] + ) + elif isinstance(data[0], pd.DataFrame): + return pd.concat( + [data[i] for i in range(num_cv_folds) if 
data[i] is not None], + axis=0, + ) + else: + raise ValueError(f"Unknown datatype {type(data[0])}") + + class TrainEvaluator(AbstractEvaluator): def __init__( self, @@ -474,49 +494,11 @@ def fit_predict_and_loss(self, iterative: bool = False) -> None: Y_targets = self.Y_targets Y_train_targets = self.Y_train_targets - Y_optimization_preds = np.concatenate( - [ - Y_optimization_pred[i] - for i in range(self.num_cv_folds) - if Y_optimization_pred[i] is not None - ] - ) - - if isinstance(X_targets[0], np.ndarray): - X_targets = np.concatenate( - [ - X_targets[i] - for i in range(self.num_cv_folds) - if X_targets[i] is not None - ] - ) - elif isinstance(X_targets[0], scipy.sparse.spmatrix): - X_targets = scipy.sparse.vstack( - [ - X_targets[i] - for i in range(self.num_cv_folds) - if X_targets[i] is not None - ] - ) - elif isinstance(X_targets[0], pd.DataFrame): - X_targets = pd.concat( - [ - X_targets[i] - for i in range(self.num_cv_folds) - if X_targets[i] is not None - ], - axis=0, - ) - else: - raise ValueError(f"Unknown datatype {type(X_targets[0])}") - - Y_targets = np.concatenate( - [ - Y_targets[i] - for i in range(self.num_cv_folds) - if Y_targets[i] is not None - ] + Y_optimization_pred = concat_data( + Y_optimization_pred, num_cv_folds=self.num_cv_folds ) + X_targets = concat_data(X_targets, num_cv_folds=self.num_cv_folds) + Y_targets = concat_data(Y_targets, num_cv_folds=self.num_cv_folds) if self.X_valid is not None: Y_valid_preds = np.array( @@ -562,7 +544,7 @@ def fit_predict_and_loss(self, iterative: bool = False) -> None: self.finish_up( loss=opt_loss, train_loss=train_loss, - opt_pred=Y_optimization_preds, + opt_pred=Y_optimization_pred, valid_pred=Y_valid_preds, test_pred=Y_test_preds, additional_run_info=additional_run_info, @@ -696,47 +678,11 @@ def fit_predict_and_loss(self, iterative: bool = False) -> None: Y_targets = self.Y_targets Y_train_targets = self.Y_train_targets - Y_optimization_pred = np.concatenate( - [ - Y_optimization_pred[i] - for i in range(self.num_cv_folds) - if Y_optimization_pred[i] is not None - ] - ) - if isinstance(X_targets[0], np.ndarray): - X_targets = np.concatenate( - [ - X_targets[i] - for i in range(self.num_cv_folds) - if X_targets[i] is not None - ] - ) - elif isinstance(X_targets[0], scipy.sparse.spmatrix): - X_targets = scipy.sparse.vstack( - [ - X_targets[i] - for i in range(self.num_cv_folds) - if X_targets[i] is not None - ] - ) - elif isinstance(X_targets[0], pd.DataFrame): - X_targets = pd.concat( - [ - X_targets[i] - for i in range(self.num_cv_folds) - if X_targets[i] is not None - ], - axis=0, - ) - else: - raise ValueError(f"Unknown datatype {type(X_targets[0])}") - Y_targets = np.concatenate( - [ - Y_targets[i] - for i in range(self.num_cv_folds) - if Y_targets[i] is not None - ] + Y_optimization_pred = concat_data( + Y_optimization_pred, num_cv_folds=self.num_cv_folds ) + X_targets = concat_data(X_targets, num_cv_folds=self.num_cv_folds) + Y_targets = concat_data(Y_targets, num_cv_folds=self.num_cv_folds) if self.X_valid is not None: Y_valid_pred = np.array( From e6becba97d209d6289a7fa1b152b61a9efbfefa5 Mon Sep 17 00:00:00 2001 From: Katharina Eggensperger Date: Tue, 17 May 2022 13:19:09 +0200 Subject: [PATCH 11/19] RM note --- autosklearn/evaluation/train_evaluator.py | 1 - 1 file changed, 1 deletion(-) diff --git a/autosklearn/evaluation/train_evaluator.py b/autosklearn/evaluation/train_evaluator.py index d0a119b5c1..a1995052b4 100644 --- a/autosklearn/evaluation/train_evaluator.py +++ 
b/autosklearn/evaluation/train_evaluator.py @@ -434,7 +434,6 @@ def fit_predict_and_loss(self, iterative: bool = False) -> None: if hasattr(self.Y_train, "iloc") else self.Y_train[train_indices], train_pred, - # TODO: Check whether this is the correct xdata x_data=Xt_array[i], ) train_losses[i] = train_loss From fbe492372f6ebfc212ae0e3312a684706d573d5e Mon Sep 17 00:00:00 2001 From: Katharina Eggensperger Date: Tue, 17 May 2022 22:39:38 +0200 Subject: [PATCH 12/19] Fix minor --- autosklearn/evaluation/train_evaluator.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/autosklearn/evaluation/train_evaluator.py b/autosklearn/evaluation/train_evaluator.py index a1995052b4..95acea87ba 100644 --- a/autosklearn/evaluation/train_evaluator.py +++ b/autosklearn/evaluation/train_evaluator.py @@ -181,8 +181,8 @@ def _fit_with_budget( def concat_data( - data: List(Any), num_cv_folds: int -) -> Union(np.ndarray, pd.DataFrame, scipy.sparse.csr_matrix): + data: List[Any], num_cv_folds: int +) -> Union[np.ndarray, pd.DataFrame, scipy.sparse.csr_matrix]: if isinstance(data[0], np.ndarray): return np.concatenate( [data[i] for i in range(num_cv_folds) if data[i] is not None] From c89ab1aba800bbf05cf6f6c8f49562b81baf678c Mon Sep 17 00:00:00 2001 From: Katharina Eggensperger Date: Wed, 18 May 2022 09:42:54 +0200 Subject: [PATCH 13/19] ADD more types for xdata --- autosklearn/data/target_validator.py | 1 + autosklearn/evaluation/abstract_evaluator.py | 9 ++++++--- autosklearn/metrics/__init__.py | 7 ++++--- 3 files changed, 11 insertions(+), 6 deletions(-) diff --git a/autosklearn/data/target_validator.py b/autosklearn/data/target_validator.py index 030a40b9b0..6f1ee1335c 100644 --- a/autosklearn/data/target_validator.py +++ b/autosklearn/data/target_validator.py @@ -17,6 +17,7 @@ from autosklearn.util.logging_ import PickableLoggerAdapter SUPPORTED_TARGET_TYPES = Union[List, pd.Series, pd.DataFrame, np.ndarray, spmatrix] +SUPPORTED_XDATA_TYPES = Union[pd.Series, pd.DataFrame, np.ndarray, spmatrix] class TargetValidator(BaseEstimator): diff --git a/autosklearn/evaluation/abstract_evaluator.py b/autosklearn/evaluation/abstract_evaluator.py index d6f2212856..851c50da40 100644 --- a/autosklearn/evaluation/abstract_evaluator.py +++ b/autosklearn/evaluation/abstract_evaluator.py @@ -25,7 +25,10 @@ MULTIOUTPUT_REGRESSION, REGRESSION_TASKS, ) -from autosklearn.data.validation import SUPPORTED_TARGET_TYPES +from autosklearn.data.target_validator import ( + SUPPORTED_TARGET_TYPES, + SUPPORTED_XDATA_TYPES, +) from autosklearn.metrics import Scorer, calculate_losses from autosklearn.pipeline.components.base import ThirdPartyComponents, _addons from autosklearn.pipeline.implementations.util import ( @@ -274,7 +277,7 @@ def __init__( port=self.port, ) - self.X_optimization: Optional[SUPPORTED_TARGET_TYPES] = None + self.X_optimization: Optional[SUPPORTED_XDATA_TYPES] = None self.Y_optimization: Optional[SUPPORTED_TARGET_TYPES] = None self.Y_actual_train = None @@ -330,7 +333,7 @@ def _loss( self, y_true: np.ndarray, y_hat: np.ndarray, - x_data: Optional[np.ndarray] = None, + x_data: Optional[SUPPORTED_XDATA_TYPES] = None, ) -> Dict[str, float]: """Auto-sklearn follows a minimization goal. 
The calculate_loss internally translate a score function to diff --git a/autosklearn/metrics/__init__.py b/autosklearn/metrics/__init__.py index 9e167694c8..ec3a91b2d0 100644 --- a/autosklearn/metrics/__init__.py +++ b/autosklearn/metrics/__init__.py @@ -19,6 +19,7 @@ REGRESSION_TASKS, TASK_TYPES, ) +from autosklearn.data.target_validator import SUPPORTED_XDATA_TYPES from .util import sanitize_array @@ -446,7 +447,7 @@ def calculate_scores( task_type: int, metrics: Sequence[Scorer], *, - x_data: Optional[np.ndarray] = None, + x_data: Optional[SUPPORTED_XDATA_TYPES] = None, scoring_functions: Optional[List[Scorer]] = None, ) -> Dict[str, float]: """ @@ -543,7 +544,7 @@ def calculate_losses( task_type: int, metrics: Sequence[Scorer], *, - x_data: Optional[np.ndarray] = None, + x_data: Optional[SUPPORTED_XDATA_TYPES] = None, scoring_functions: Optional[List[Scorer]] = None, ) -> Dict[str, float]: """ @@ -638,7 +639,7 @@ def _compute_single_scorer( prediction: np.ndarray, solution: np.ndarray, task_type: int, - x_data: Optional[np.ndarray] = None, + x_data: Optional[SUPPORTED_XDATA_TYPES] = None, ) -> float: """ Returns a score (a magnitude that allows casting the From e2e37a6225145e14791dbedb9e1e6d1353ece06d Mon Sep 17 00:00:00 2001 From: Katharina Eggensperger Date: Fri, 20 May 2022 14:11:38 +0200 Subject: [PATCH 14/19] FIX unittests --- test/test_evaluation/test_train_evaluator.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/test/test_evaluation/test_train_evaluator.py b/test/test_evaluation/test_train_evaluator.py index 1c5842b457..9413af5509 100644 --- a/test/test_evaluation/test_train_evaluator.py +++ b/test/test_evaluation/test_train_evaluator.py @@ -1270,6 +1270,7 @@ def test_fit_predict_and_loss_iterative_noniterativemodel_additional_run_info( self.assertEqual(finish_up_mock.call_count, 1) self.assertEqual(finish_up_mock.call_args[1]["additional_run_info"], 14678) + @unittest.mock.patch("autosklearn.evaluation.train_evaluator.concat_data") @unittest.mock.patch.object(TrainEvaluator, "_loss") @unittest.mock.patch.object(TrainEvaluator, "finish_up") @unittest.mock.patch("autosklearn.automl_common.common.utils.backend.Backend") @@ -1282,6 +1283,7 @@ def test_fit_predict_and_loss_budget_additional_run_info( backend_mock, finish_up_mock, loss_mock, + _, ): class Counter: counter = 0 @@ -1331,6 +1333,7 @@ def __call__(self): finish_up_mock.call_args[1]["additional_run_info"], {"val": 14678} ) + @unittest.mock.patch("autosklearn.evaluation.train_evaluator.concat_data") @unittest.mock.patch.object(TrainEvaluator, "_loss") @unittest.mock.patch.object(TrainEvaluator, "finish_up") @unittest.mock.patch("autosklearn.automl_common.common.utils.backend.Backend") @@ -1338,11 +1341,7 @@ def __call__(self): "autosklearn.pipeline.classification.SimpleClassificationPipeline" ) def test_fit_predict_and_loss_budget_2_additional_run_info( - self, - mock, - backend_mock, - finish_up_mock, - loss_mock, + self, mock, backend_mock, finish_up_mock, loss_mock, _ ): mock.estimator_supports_iterative_fit.return_value = False mock.fit_transformer.return_value = ("Xt", {}) From 9c6c633cfef78b9a145ff8ce69f3533612de4767 Mon Sep 17 00:00:00 2001 From: Matthias Feurer Date: Fri, 20 May 2022 16:14:09 +0200 Subject: [PATCH 15/19] FIX variable naming bug --- autosklearn/evaluation/train_evaluator.py | 4 ++-- test/test_estimators/test_estimators.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/autosklearn/evaluation/train_evaluator.py 
b/autosklearn/evaluation/train_evaluator.py index 95acea87ba..df81a47b57 100644 --- a/autosklearn/evaluation/train_evaluator.py +++ b/autosklearn/evaluation/train_evaluator.py @@ -493,7 +493,7 @@ def fit_predict_and_loss(self, iterative: bool = False) -> None: Y_targets = self.Y_targets Y_train_targets = self.Y_train_targets - Y_optimization_pred = concat_data( + Y_optimization_pred_concat = concat_data( Y_optimization_pred, num_cv_folds=self.num_cv_folds ) X_targets = concat_data(X_targets, num_cv_folds=self.num_cv_folds) @@ -543,7 +543,7 @@ def fit_predict_and_loss(self, iterative: bool = False) -> None: self.finish_up( loss=opt_loss, train_loss=train_loss, - opt_pred=Y_optimization_pred, + opt_pred=Y_optimization_pred_concat, valid_pred=Y_valid_preds, test_pred=Y_test_preds, additional_run_info=additional_run_info, diff --git a/test/test_estimators/test_estimators.py b/test/test_estimators/test_estimators.py index 8e3134e3a3..dc5c359c8a 100644 --- a/test/test_estimators/test_estimators.py +++ b/test/test_estimators/test_estimators.py @@ -1281,7 +1281,7 @@ def test_autosklearn2_classification_methods_returns_self(dask_client): predictions = automl_fitted.predict(X_test) assert ( - sklearn.metrics.accuracy_score(y_test, predictions) >= 2 / 3 + sklearn.metrics.accuracy_score(y_test, predictions) >= 4 / 5 ), print_debug_information(automl) pickle.dumps(automl_fitted) @@ -1310,7 +1310,7 @@ def test_autosklearn2_classification_methods_returns_self_sparse(dask_client): predictions = automl_fitted.predict(X_test) assert ( - sklearn.metrics.accuracy_score(y_test, predictions) >= 2 / 3 + sklearn.metrics.accuracy_score(y_test, predictions) >= 4 / 5 ), print_debug_information(automl) assert "boosting" not in str(automl.get_configuration_space(X=X_train, y=y_train)) From a50fabead829af450827b55f72a5c39007295150 Mon Sep 17 00:00:00 2001 From: Katharina Eggensperger Date: Mon, 23 May 2022 16:27:38 +0200 Subject: [PATCH 16/19] change varible name; fix docstring --- autosklearn/metrics/__init__.py | 68 +++++++++++++++++-------- examples/40_advanced/example_metrics.py | 8 +-- 2 files changed, 50 insertions(+), 26 deletions(-) diff --git a/autosklearn/metrics/__init__.py b/autosklearn/metrics/__init__.py index ec3a91b2d0..2a3242fe61 100644 --- a/autosklearn/metrics/__init__.py +++ b/autosklearn/metrics/__init__.py @@ -33,13 +33,13 @@ def __init__( worst_possible_result: float, sign: float, kwargs: Any, - needs_x: bool = False, + needs_X: bool = False, ) -> None: self.name = name self._kwargs = kwargs self._score_func = score_func self._optimum = optimum - self._needs_x = needs_x + self._needs_X = needs_X self._worst_possible_result = worst_possible_result self._sign = sign @@ -75,6 +75,10 @@ def __call__( y_pred : array-like, [n_samples x n_classes] Model predictions + x_data : array-like [n_samples x n_features] + X data used to obtain the predictions: each row x_j corresponds to the input + used to obtain predictions y_j + sample_weight : array-like, optional (default=None) Sample weights. 
@@ -109,13 +113,15 @@ def __call__( else: raise ValueError(type_true) - sc_args = {} # type: Dict[str, Union[List[float], np.ndarray]] + scorer_kwargs = {} # type: Dict[str, Union[List[float], np.ndarray]] if sample_weight is not None: - sc_args["sample_weight"] = sample_weight - if self._needs_x is True: - sc_args["x_data"] = x_data + scorer_kwargs["sample_weight"] = sample_weight + if self._needs_X is True: + scorer_kwargs["x_data"] = x_data - return self._sign * self._score_func(y_true, y_pred, **sc_args, **self._kwargs) + return self._sign * self._score_func( + y_true, y_pred, **scorer_kwargs, **self._kwargs + ) class _ProbaScorer(Scorer): @@ -136,6 +142,10 @@ def __call__( y_pred : array-like, [n_samples x n_classes] Model predictions + x_data : array-like [n_samples x n_features] + X data used to obtain the predictions: each row x_j corresponds to the input + used to obtain predictions y_j + sample_weight : array-like, optional (default=None) Sample weights. @@ -163,13 +173,15 @@ def __call__( y_true, y_pred, labels=labels, **self._kwargs ) - sc_args = {} # type: Dict[str, Union[List[float], np.ndarray]] + scorer_kwargs = {} # type: Dict[str, Union[List[float], np.ndarray]] if sample_weight is not None: - sc_args["sample_weight"] = sample_weight - if self._needs_x is True: - sc_args["x_data"] = x_data + scorer_kwargs["sample_weight"] = sample_weight + if self._needs_X is True: + scorer_kwargs["x_data"] = x_data - return self._sign * self._score_func(y_true, y_pred, **sc_args, **self._kwargs) + return self._sign * self._score_func( + y_true, y_pred, **scorer_kwargs, **self._kwargs + ) class _ThresholdScorer(Scorer): @@ -190,6 +202,10 @@ def __call__( y_pred : array-like, [n_samples x n_classes] Model predictions + x_data : array-like [n_samples x n_features] + X data used to obtain the predictions: each row x_j corresponds to the input + used to obtain predictions y_j + sample_weight : array-like, optional (default=None) Sample weights. @@ -208,13 +224,15 @@ def __call__( elif isinstance(y_pred, list): y_pred = np.vstack([p[:, -1] for p in y_pred]).T - sc_args = {} # type: Dict[str, Union[List[float], np.ndarray]] + scorer_kwargs = {} # type: Dict[str, Union[List[float], np.ndarray]] if sample_weight is not None: - sc_args["sample_weight"] = sample_weight - if self._needs_x is True: - sc_args["x_data"] = x_data + scorer_kwargs["sample_weight"] = sample_weight + if self._needs_X is True: + scorer_kwargs["x_data"] = x_data - return self._sign * self._score_func(y_true, y_pred, **sc_args, **self._kwargs) + return self._sign * self._score_func( + y_true, y_pred, **scorer_kwargs, **self._kwargs + ) def make_scorer( @@ -226,7 +244,7 @@ def make_scorer( greater_is_better: bool = True, needs_proba: bool = False, needs_threshold: bool = False, - needs_x: bool = False, + needs_X: bool = False, **kwargs: Any, ) -> Scorer: """Make a scorer from a performance metric or loss function. @@ -264,7 +282,7 @@ def make_scorer( Whether score_func takes a continuous decision certainty. This only works for binary classification. - needs_x : boolean, default=False + needs_X : boolean, default=False Whether score_func requires X in __call__ to compute a metric. 
**kwargs : additional arguments @@ -290,7 +308,7 @@ def make_scorer( else: cls = _PredictScorer return cls( - name, score_func, optimum, worst_possible_result, sign, kwargs, needs_x=needs_x + name, score_func, optimum, worst_possible_result, sign, kwargs, needs_X=needs_X ) @@ -467,6 +485,8 @@ def calculate_scores( metrics: Sequence[Scorer] A list of objects that hosts a function to calculate how good the prediction is according to the solution. + x_data : array-like [n_samples x n_features] + X data used to obtain the predictions scoring_functions: List[Scorer] A list of metrics to calculate multiple losses Returns @@ -565,7 +585,7 @@ def calculate_losses( A list of objects that hosts a function to calculate how good the prediction is according to the solution. x_data: Optional[np.ndarray] - X data necessary for some metrics + X data used to obtain the predictions scoring_functions: List[Scorer] A list of metrics to calculate multiple losses @@ -619,6 +639,8 @@ def compute_single_metric( metric: Scorer Object that host a function to calculate how good the prediction is according to the solution. + x_data : array-like [n_samples x n_features] + X data used to obtain the predictions Returns ------- @@ -658,11 +680,13 @@ def _compute_single_scorer( metric: Scorer Object that host a function to calculate how good the prediction is according to the solution. + x_data : array-like [n_samples x n_features] + X data used to obtain the predictions Returns ------- float """ - if metric._needs_x: + if metric._needs_X: if x_data is None: raise ValueError( f"Metric {metric.name} needs x_data, but x_data is {x_data}" diff --git a/examples/40_advanced/example_metrics.py b/examples/40_advanced/example_metrics.py index f6808c8ebe..8d7ea0b4a5 100644 --- a/examples/40_advanced/example_metrics.py +++ b/examples/40_advanced/example_metrics.py @@ -46,10 +46,10 @@ def error_wk(solution, prediction, extra_argument): return np.mean(solution != prediction) -def metric_which_needs_x(solution, prediction, x_data, consider_col, threshold): +def metric_which_needs_x(solution, prediction, x_data, consider_col, val_threshold): # custom function defining accuracy assert x_data is not None - rel_idx = x_data[:, consider_col] > threshold + rel_idx = x_data[:, consider_col] > val_threshold return np.mean(solution[rel_idx] == prediction[rel_idx]) @@ -210,7 +210,7 @@ def metric_which_needs_x(solution, prediction, x_data, consider_col, threshold): optimum=1, greater_is_better=True, needs_proba=False, - needs_x=True, + needs_X=True, needs_threshold=False, consider_col=1, threshold=20, @@ -230,6 +230,6 @@ def metric_which_needs_x(solution, prediction, x_data, consider_col, threshold): predictions, x_data=X_test, consider_col=1, - threshold=18.8, + val_threshold=18.8, ) print(f"Error score {score:.3f} using {error_rate.name:s}") From 644918910aa4412ed6226033ffab3d8c5603307a Mon Sep 17 00:00:00 2001 From: Katharina Eggensperger Date: Mon, 23 May 2022 18:38:52 +0200 Subject: [PATCH 17/19] Rename variable --- autosklearn/evaluation/abstract_evaluator.py | 4 +- autosklearn/evaluation/train_evaluator.py | 8 +-- autosklearn/metrics/__init__.py | 58 ++++++++++---------- examples/40_advanced/example_metrics.py | 10 ++-- test/test_metric/test_metrics.py | 28 ++++++++++ 5 files changed, 68 insertions(+), 40 deletions(-) diff --git a/autosklearn/evaluation/abstract_evaluator.py b/autosklearn/evaluation/abstract_evaluator.py index 851c50da40..9f45acea41 100644 --- a/autosklearn/evaluation/abstract_evaluator.py +++ 
b/autosklearn/evaluation/abstract_evaluator.py @@ -333,7 +333,7 @@ def _loss( self, y_true: np.ndarray, y_hat: np.ndarray, - x_data: Optional[SUPPORTED_XDATA_TYPES] = None, + X_data: Optional[SUPPORTED_XDATA_TYPES] = None, ) -> Dict[str, float]: """Auto-sklearn follows a minimization goal. The calculate_loss internally translate a score function to @@ -360,7 +360,7 @@ def _loss( y_hat, self.task_type, self.metrics, - x_data=x_data, + X_data=X_data, scoring_functions=self.scoring_functions, ) diff --git a/autosklearn/evaluation/train_evaluator.py b/autosklearn/evaluation/train_evaluator.py index df81a47b57..a8433c2136 100644 --- a/autosklearn/evaluation/train_evaluator.py +++ b/autosklearn/evaluation/train_evaluator.py @@ -434,7 +434,7 @@ def fit_predict_and_loss(self, iterative: bool = False) -> None: if hasattr(self.Y_train, "iloc") else self.Y_train[train_indices], train_pred, - x_data=Xt_array[i], + X_data=Xt_array[i], ) train_losses[i] = train_loss # Number of training data points for this fold. @@ -443,7 +443,7 @@ def fit_predict_and_loss(self, iterative: bool = False) -> None: # Compute validation loss of this fold and store it. optimization_loss = self._loss( - self.Y_targets[i], opt_pred, x_data=self.X_targets[i] + self.Y_targets[i], opt_pred, X_data=self.X_targets[i] ) opt_losses[i] = optimization_loss # number of optimization data points for this fold. @@ -629,7 +629,7 @@ def fit_predict_and_loss(self, iterative: bool = False) -> None: train_loss = self._loss( self.Y_train_targets[train_split], train_pred, - x_data=self.X_train.iloc[train_split] + X_data=self.X_train.iloc[train_split] if hasattr(self.X_train, "iloc") else self.X_train[train_split], ) @@ -640,7 +640,7 @@ def fit_predict_and_loss(self, iterative: bool = False) -> None: # Compute validation loss of this fold and store it. optimization_loss = self._loss( - self.Y_targets[i], opt_pred, x_data=self.X_targets[i] + self.Y_targets[i], opt_pred, X_data=self.X_targets[i] ) opt_losses.append(optimization_loss) # number of optimization data points for this fold. Used for weighting diff --git a/autosklearn/metrics/__init__.py b/autosklearn/metrics/__init__.py index 2a3242fe61..1fa124532a 100644 --- a/autosklearn/metrics/__init__.py +++ b/autosklearn/metrics/__init__.py @@ -48,7 +48,7 @@ def __call__( self, y_true: np.ndarray, y_pred: np.ndarray, - x_data: Optional[np.ndarray] = None, + X_data: Optional[np.ndarray] = None, sample_weight: Optional[List[float]] = None, ) -> float: pass @@ -62,7 +62,7 @@ def __call__( self, y_true: np.ndarray, y_pred: np.ndarray, - x_data: Optional[np.ndarray] = None, + X_data: Optional[np.ndarray] = None, sample_weight: Optional[List[float]] = None, ) -> float: """Evaluate predicted target values for X relative to y_true. 
@@ -75,7 +75,7 @@ def __call__( y_pred : array-like, [n_samples x n_classes] Model predictions - x_data : array-like [n_samples x n_features] + X_data : array-like [n_samples x n_features] X data used to obtain the predictions: each row x_j corresponds to the input used to obtain predictions y_j @@ -117,7 +117,7 @@ def __call__( if sample_weight is not None: scorer_kwargs["sample_weight"] = sample_weight if self._needs_X is True: - scorer_kwargs["x_data"] = x_data + scorer_kwargs["X_data"] = X_data return self._sign * self._score_func( y_true, y_pred, **scorer_kwargs, **self._kwargs @@ -129,7 +129,7 @@ def __call__( self, y_true: np.ndarray, y_pred: np.ndarray, - x_data: Optional[np.ndarray] = None, + X_data: Optional[np.ndarray] = None, sample_weight: Optional[List[float]] = None, ) -> float: """Evaluate predicted probabilities for X relative to y_true. @@ -142,7 +142,7 @@ def __call__( y_pred : array-like, [n_samples x n_classes] Model predictions - x_data : array-like [n_samples x n_features] + X_data : array-like [n_samples x n_features] X data used to obtain the predictions: each row x_j corresponds to the input used to obtain predictions y_j @@ -177,7 +177,7 @@ def __call__( if sample_weight is not None: scorer_kwargs["sample_weight"] = sample_weight if self._needs_X is True: - scorer_kwargs["x_data"] = x_data + scorer_kwargs["X_data"] = X_data return self._sign * self._score_func( y_true, y_pred, **scorer_kwargs, **self._kwargs @@ -189,7 +189,7 @@ def __call__( self, y_true: np.ndarray, y_pred: np.ndarray, - x_data: Optional[np.ndarray] = None, + X_data: Optional[np.ndarray] = None, sample_weight: Optional[List[float]] = None, ) -> float: """Evaluate decision function output for X relative to y_true. @@ -202,7 +202,7 @@ def __call__( y_pred : array-like, [n_samples x n_classes] Model predictions - x_data : array-like [n_samples x n_features] + X_data : array-like [n_samples x n_features] X data used to obtain the predictions: each row x_j corresponds to the input used to obtain predictions y_j @@ -228,7 +228,7 @@ def __call__( if sample_weight is not None: scorer_kwargs["sample_weight"] = sample_weight if self._needs_X is True: - scorer_kwargs["x_data"] = x_data + scorer_kwargs["X_data"] = X_data return self._sign * self._score_func( y_true, y_pred, **scorer_kwargs, **self._kwargs @@ -465,7 +465,7 @@ def calculate_scores( task_type: int, metrics: Sequence[Scorer], *, - x_data: Optional[SUPPORTED_XDATA_TYPES] = None, + X_data: Optional[SUPPORTED_XDATA_TYPES] = None, scoring_functions: Optional[List[Scorer]] = None, ) -> Dict[str, float]: """ @@ -485,7 +485,7 @@ def calculate_scores( metrics: Sequence[Scorer] A list of objects that hosts a function to calculate how good the prediction is according to the solution. 
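For readers following the rename through these hunks, the behaviour they implement can be summarised with a small standalone sketch (not part of any patch in this series): a scorer created with `needs_X=True` forwards the `X_data` keyword argument to its score function at call time, while a scorer created without it leaves `X_data` as `None`. The score function below, its masking rule, and the sample arrays are illustrative assumptions, not code from the repository; only the `make_scorer` keyword names mirror the diffs.

```python
import numpy as np
import autosklearn.metrics


def subset_accuracy(y_true, y_pred, X_data=None):
    # Hypothetical metric: accuracy restricted to rows whose first feature is
    # positive. X_data is only populated when the scorer was created with
    # needs_X=True; otherwise the scorer does not forward it.
    assert X_data is not None, "X_data is forwarded only for needs_X=True scorers"
    mask = X_data[:, 0] > 0
    return float(np.mean(y_true[mask] == y_pred[mask]))


scorer = autosklearn.metrics.make_scorer(
    name="subset_accuracy",
    score_func=subset_accuracy,
    optimum=1,
    greater_is_better=True,
    needs_X=True,
)

y_true = np.array([0, 1, 1, 0])
y_pred = np.array([0, 1, 0, 0])
X = np.array([[1.0], [-1.0], [2.0], [0.5]])

# X_data reaches subset_accuracy via scorer_kwargs["X_data"]; with needs_X=False
# the same call would leave X_data as None inside the score function.
print(scorer(y_true, y_pred, X_data=X))
```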
- x_data : array-like [n_samples x n_features] + X_data : array-like [n_samples x n_features] X data used to obtain the predictions scoring_functions: List[Scorer] A list of metrics to calculate multiple losses @@ -512,7 +512,7 @@ def calculate_scores( prediction, solution, task_type, - x_data, + X_data, ) except ValueError as e: print(e, e.args[0]) @@ -536,7 +536,7 @@ def calculate_scores( prediction, solution, task_type, - x_data, + X_data, ) except ValueError as e: if e.args[0] == "multiclass format is not supported": @@ -564,7 +564,7 @@ def calculate_losses( task_type: int, metrics: Sequence[Scorer], *, - x_data: Optional[SUPPORTED_XDATA_TYPES] = None, + X_data: Optional[SUPPORTED_XDATA_TYPES] = None, scoring_functions: Optional[List[Scorer]] = None, ) -> Dict[str, float]: """ @@ -584,7 +584,7 @@ def calculate_losses( metrics: Sequence[Scorer] A list of objects that hosts a function to calculate how good the prediction is according to the solution. - x_data: Optional[np.ndarray] + X_data: Optional[np.ndarray] X data used to obtain the predictions scoring_functions: List[Scorer] A list of metrics to calculate multiple losses @@ -597,7 +597,7 @@ def calculate_losses( score = calculate_scores( solution=solution, prediction=prediction, - x_data=x_data, + X_data=X_data, task_type=task_type, metrics=metrics, scoring_functions=scoring_functions, @@ -621,7 +621,7 @@ def compute_single_metric( prediction: np.ndarray, solution: np.ndarray, task_type: int, - x_data: Optional[np.ndarray] = None, + X_data: Optional[np.ndarray] = None, ) -> float: """ Returns a metric for the given Auto-Sklearn Scorer object. @@ -639,7 +639,7 @@ def compute_single_metric( metric: Scorer Object that host a function to calculate how good the prediction is according to the solution. - x_data : array-like [n_samples x n_features] + X_data : array-like [n_samples x n_features] X data used to obtain the predictions Returns @@ -650,7 +650,7 @@ def compute_single_metric( solution=solution, prediction=prediction, metric=metric, - x_data=x_data, + X_data=X_data, task_type=task_type, ) return metric._sign * score @@ -661,7 +661,7 @@ def _compute_single_scorer( prediction: np.ndarray, solution: np.ndarray, task_type: int, - x_data: Optional[SUPPORTED_XDATA_TYPES] = None, + X_data: Optional[SUPPORTED_XDATA_TYPES] = None, ) -> float: """ Returns a score (a magnitude that allows casting the @@ -680,28 +680,28 @@ def _compute_single_scorer( metric: Scorer Object that host a function to calculate how good the prediction is according to the solution. - x_data : array-like [n_samples x n_features] + X_data : array-like [n_samples x n_features] X data used to obtain the predictions Returns ------- float """ if metric._needs_X: - if x_data is None: + if X_data is None: raise ValueError( - f"Metric {metric.name} needs x_data, but x_data is {x_data}" + f"Metric {metric.name} needs X_data, but X_data is {X_data}" ) - elif x_data.shape[0] != solution.shape[0]: + elif X_data.shape[0] != solution.shape[0]: raise ValueError( - f"x_data has wrong length. " - f"Should be {solution.shape[0]}, but is {x_data.shape[0]}" + f"X_data has wrong length. 
" + f"Should be {solution.shape[0]}, but is {X_data.shape[0]}" ) if task_type in REGRESSION_TASKS: # TODO put this into the regression metric itself cprediction = sanitize_array(prediction) - score = metric(solution, cprediction, x_data=x_data) + score = metric(solution, cprediction, X_data=X_data) else: - score = metric(solution, prediction, x_data=x_data) + score = metric(solution, prediction, X_data=X_data) return score if task_type in REGRESSION_TASKS: diff --git a/examples/40_advanced/example_metrics.py b/examples/40_advanced/example_metrics.py index 8d7ea0b4a5..0fda05b9d1 100644 --- a/examples/40_advanced/example_metrics.py +++ b/examples/40_advanced/example_metrics.py @@ -46,10 +46,10 @@ def error_wk(solution, prediction, extra_argument): return np.mean(solution != prediction) -def metric_which_needs_x(solution, prediction, x_data, consider_col, val_threshold): +def metric_which_needs_x(solution, prediction, X_data, consider_col, val_threshold): # custom function defining accuracy - assert x_data is not None - rel_idx = x_data[:, consider_col] > val_threshold + assert X_data is not None + rel_idx = X_data[:, consider_col] > val_threshold return np.mean(solution[rel_idx] == prediction[rel_idx]) @@ -199,7 +199,7 @@ def metric_which_needs_x(solution, prediction, x_data, consider_col, val_thresho # Sixth example: Use a metric with additional argument which also needs xdata # =============================================================== """ -Finally, *Auto-sklearn* also support metric that require the train data (aka x_data) to +Finally, *Auto-sklearn* also support metric that require the train data (aka X_data) to compute a value. This can be useful if one only cares about the score on a subset of the data. """ @@ -228,7 +228,7 @@ def metric_which_needs_x(solution, prediction, x_data, consider_col, val_thresho score = metric_which_needs_x( y_test, predictions, - x_data=X_test, + X_data=X_test, consider_col=1, val_threshold=18.8, ) diff --git a/test/test_metric/test_metrics.py b/test/test_metric/test_metrics.py index 36fa4f72ae..4443024c4b 100644 --- a/test/test_metric/test_metrics.py +++ b/test/test_metric/test_metrics.py @@ -17,6 +17,34 @@ class TestScorer(unittest.TestCase): + def test_needs_X(self): + y_true = np.array([0, 0, 1, 1]) + y_pred = np.array([[1.0, 0.0], [1.0, 0.0], [0.0, 1.0], [0.0, 1.0]]) + + def dummy_metric(y_true, y_pred, X_data=None, **kwargs): + if not np.array_equal(np.array([45]), X_data): + raise ValueError(f"is {X_data}") + return 1 + + scorer = autosklearn.metrics._PredictScorer( + "accuracy", dummy_metric, 1, 0, 1, {}, needs_X=True + ) + scorer(y_true, y_pred, X_data=np.array([45])) + + scorer_nox = autosklearn.metrics._PredictScorer( + "accuracy", dummy_metric, 1, 0, 1, {}, needs_X=False + ) + with self.assertRaises(ValueError) as cm: + scorer_nox(y_true, y_pred, X_data=np.array([32])) + the_exception = cm.exception + # X_data is not forwarded + self.assertEqual(the_exception.args[0], "is None") + + scorer_nox = autosklearn.metrics._PredictScorer( + "accuracy", sklearn.metrics.accuracy_score, 1, 0, 1, {}, needs_X=False + ) + scorer_nox(y_true, y_pred, X_data=np.array([32])) + def test_predict_scorer_binary(self): y_true = np.array([0, 0, 1, 1]) y_pred = np.array([[1.0, 0.0], [1.0, 0.0], [0.0, 1.0], [0.0, 1.0]]) From 9f490b5fd60936899ced97a40b6240e18786cefe Mon Sep 17 00:00:00 2001 From: Katharina Eggensperger Date: Mon, 23 May 2022 18:40:57 +0200 Subject: [PATCH 18/19] FIX example --- examples/40_advanced/example_metrics.py | 2 +- 1 file changed, 1 
insertion(+), 1 deletion(-) diff --git a/examples/40_advanced/example_metrics.py b/examples/40_advanced/example_metrics.py index 0fda05b9d1..f57592d44f 100644 --- a/examples/40_advanced/example_metrics.py +++ b/examples/40_advanced/example_metrics.py @@ -213,7 +213,7 @@ def metric_which_needs_x(solution, prediction, X_data, consider_col, val_thresho needs_X=True, needs_threshold=False, consider_col=1, - threshold=20, + val_threshold=18.8, ) cls = autosklearn.classification.AutoSklearnClassifier( time_left_for_this_task=60, From 26ff4e59d27ab65aff3933d2ceea271031f874dd Mon Sep 17 00:00:00 2001 From: Katharina Eggensperger Date: Tue, 24 May 2022 10:59:18 +0200 Subject: [PATCH 19/19] Update examples/40_advanced/example_metrics.py Co-authored-by: Matthias Feurer --- examples/40_advanced/example_metrics.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/40_advanced/example_metrics.py b/examples/40_advanced/example_metrics.py index f57592d44f..4da40f3cbb 100644 --- a/examples/40_advanced/example_metrics.py +++ b/examples/40_advanced/example_metrics.py @@ -197,7 +197,7 @@ def metric_which_needs_x(solution, prediction, X_data, consider_col, val_thresho ############################################################################# # Sixth example: Use a metric with additional argument which also needs xdata -# =============================================================== +# =========================================================================== """ Finally, *Auto-sklearn* also support metric that require the train data (aka X_data) to compute a value. This can be useful if one only cares about the score on a subset of the
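To see how the renamed pieces fit together end to end, here is a hedged sketch of the sixth example's usage pattern. The custom metric and the `make_scorer` arguments (`consider_col=1`, `val_threshold=18.8`, `needs_X=True`) come from the diffs above; the dataset, the scorer name, and passing the scorer through `AutoSklearnClassifier`'s `metric` argument are assumptions made for illustration, since that part of `example_metrics.py` is not shown in these patches.

```python
import numpy as np
import sklearn.datasets
import sklearn.model_selection
import autosklearn.classification
import autosklearn.metrics


def metric_which_needs_x(solution, prediction, X_data, consider_col, val_threshold):
    # Custom metric from the example: accuracy computed only on rows whose
    # feature `consider_col` exceeds `val_threshold`.
    assert X_data is not None
    rel_idx = X_data[:, consider_col] > val_threshold
    return np.mean(solution[rel_idx] == prediction[rel_idx])


# Assumption: any tabular binary classification data works; breast_cancer is
# used here purely for illustration.
X, y = sklearn.datasets.load_breast_cancer(return_X_y=True)
X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(
    X, y, random_state=1
)

# "accuracy_on_subset" is a placeholder name; the keyword arguments after
# needs_threshold are bound to the score function at call time.
accuracy_scorer = autosklearn.metrics.make_scorer(
    name="accuracy_on_subset",
    score_func=metric_which_needs_x,
    optimum=1,
    greater_is_better=True,
    needs_proba=False,
    needs_X=True,  # ask auto-sklearn to forward X_data to the score function
    needs_threshold=False,
    consider_col=1,
    val_threshold=18.8,
)

# Assumption: the scorer is handed over via the usual `metric` argument, as in
# the other examples of example_metrics.py.
cls = autosklearn.classification.AutoSklearnClassifier(
    time_left_for_this_task=60,
    seed=1,
    metric=accuracy_scorer,
)
cls.fit(X_train, y_train)

predictions = cls.predict(X_test)
score = metric_which_needs_x(
    y_test,
    predictions,
    X_data=X_test,
    consider_col=1,
    val_threshold=18.8,
)
print(f"Score {score:.3f} using {accuracy_scorer.name:s}")
```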