diff --git a/autosklearn/automl.py b/autosklearn/automl.py index 50e4591346..8c0f4717dc 100644 --- a/autosklearn/automl.py +++ b/autosklearn/automl.py @@ -1,9 +1,10 @@ from __future__ import annotations -from typing import Any, Callable, Iterable, Mapping, Optional, Tuple +from typing import Any, Callable, Iterable, Mapping, Optional, Sequence, Tuple import copy import io +import itertools import json import logging.handlers import multiprocessing @@ -66,7 +67,12 @@ from autosklearn.evaluation import ExecuteTaFuncWithQueue, get_cost_of_crash from autosklearn.evaluation.abstract_evaluator import _fit_and_suppress_warnings from autosklearn.evaluation.train_evaluator import TrainEvaluator, _fit_with_budget -from autosklearn.metrics import Scorer, calculate_metric, default_metric_for_task +from autosklearn.metrics import ( + Scorer, + _validate_metrics, + compute_single_metric, + default_metric_for_task, +) from autosklearn.pipeline.base import BasePipeline from autosklearn.pipeline.components.classification import ClassifierChoice from autosklearn.pipeline.components.data_preprocessing.categorical_encoding import ( @@ -210,7 +216,7 @@ def __init__( get_smac_object_callback: Optional[Callable] = None, smac_scenario_args: Optional[Mapping] = None, logging_config: Optional[Mapping] = None, - metric: Optional[Scorer] = None, + metrics: Sequence[Scorer] | None = None, scoring_functions: Optional[list[Scorer]] = None, get_trials_callback: Optional[IncorporateRunResultCallback] = None, dataset_compression: bool | Mapping[str, Any] = True, @@ -244,7 +250,7 @@ def __init__( self._delete_tmp_folder_after_terminate = delete_tmp_folder_after_terminate self._time_for_task = time_left_for_this_task self._per_run_time_limit = per_run_time_limit - self._metric = metric + self._metrics = metrics self._ensemble_size = ensemble_size self._ensemble_nbest = ensemble_nbest self._max_models_on_disc = max_models_on_disc @@ -265,7 +271,7 @@ def __init__( initial_configurations_via_metalearning ) - self._scoring_functions = scoring_functions or {} + self._scoring_functions = scoring_functions or [] self._resampling_strategy_arguments = resampling_strategy_arguments or {} # Single core, local runs should use fork to prevent the __main__ requirements @@ -422,8 +428,8 @@ def _do_dummy_prediction(self) -> None: if self._resampling_strategy in ["partial-cv", "partial-cv-iterative-fit"]: return - if self._metric is None: - raise ValueError("Metric was not set") + if self._metrics is None: + raise ValueError("Metric/Metrics was/were not set") # Dummy prediction always have num_run set to 1 dummy_run_num = 1 @@ -447,11 +453,11 @@ def _do_dummy_prediction(self) -> None: resampling_strategy=self._resampling_strategy, initial_num_run=dummy_run_num, stats=stats, - metric=self._metric, + metrics=self._metrics, memory_limit=memory_limit, disable_file_output=self._disable_evaluator_output, abort_on_first_run_crash=False, - cost_for_crash=get_cost_of_crash(self._metric), + cost_for_crash=get_cost_of_crash(self._metrics), port=self._logger_port, pynisher_context=self._multiprocessing_context, **self._resampling_strategy_arguments, @@ -611,8 +617,9 @@ def fit( self._task = task # Assign a metric if it doesnt exist - if self._metric is None: - self._metric = default_metric_for_task[self._task] + if self._metrics is None: + self._metrics = [default_metric_for_task[self._task]] + _validate_metrics(self._metrics, self._scoring_functions) if dataset_name is None: dataset_name = str(uuid.uuid1(clock_seq=os.getpid())) @@ -690,11 +697,16 @@ 
def fit( # The metric must exist as of this point # It can be provided in the constructor, or automatically # defined in the estimator fit call - if self._metric is None: - raise ValueError("No metric given.") - if not isinstance(self._metric, Scorer): + if isinstance(self._metrics, Sequence): + for entry in self._metrics: + if not isinstance(entry, Scorer): + raise ValueError( + "Metric {entry} must be instance of autosklearn.metrics.Scorer." + ) + else: raise ValueError( - "Metric must be instance of " "autosklearn.metrics.Scorer." + "Metric must be a sequence of instances of " + "autosklearn.metrics.Scorer." ) # If no dask client was provided, we create one, so that we can @@ -790,7 +802,7 @@ def fit( backend=copy.deepcopy(self._backend), dataset_name=dataset_name, task=self._task, - metric=self._metric, + metric=self._metrics[0], ensemble_size=self._ensemble_size, ensemble_nbest=self._ensemble_nbest, max_models_on_disc=self._max_models_on_disc, @@ -862,7 +874,7 @@ def fit( config_file=configspace_path, seed=self._seed, metadata_directory=self._metadata_directory, - metric=self._metric, + metrics=self._metrics, resampling_strategy=self._resampling_strategy, resampling_strategy_args=self._resampling_strategy_arguments, include=self._include, @@ -1001,7 +1013,10 @@ def _log_fit_setup(self) -> None: ) self._logger.debug(" smac_scenario_args: %s", str(self._smac_scenario_args)) self._logger.debug(" logging_config: %s", str(self.logging_config)) - self._logger.debug(" metric: %s", str(self._metric)) + if len(self._metrics) == 1: + self._logger.debug(" metric: %s", str(self._metrics[0])) + else: + self._logger.debug(" metrics: %s", str(self._metrics)) self._logger.debug("Done printing arguments to auto-sklearn") self._logger.debug("Starting to print available components") for choice in ( @@ -1254,8 +1269,8 @@ def fit_pipeline( self._task = task # Assign a metric if it doesnt exist - if self._metric is None: - self._metric = default_metric_for_task[self._task] + if self._metrics is None: + self._metrics = [default_metric_for_task[self._task]] # Get the configuration space # This also ensures that the Backend has processed the @@ -1288,8 +1303,16 @@ def fit_pipeline( kwargs["memory_limit"] = self._memory_limit if "resampling_strategy" not in kwargs: kwargs["resampling_strategy"] = self._resampling_strategy - if "metric" not in kwargs: - kwargs["metric"] = self._metric + if "metrics" not in kwargs: + if "metric" in kwargs: + kwargs["metrics"] = kwargs["metric"] + del kwargs["metric"] + else: + kwargs["metrics"] = self._metrics + if not isinstance(kwargs["metrics"], Sequence): + kwargs["metrics"] = [kwargs["metrics"]] + if "scoring_functions" not in kwargs: + kwargs["scoring_functions"] = self._scoring_functions if "disable_file_output" not in kwargs: kwargs["disable_file_output"] = self._disable_evaluator_output if "pynisher_context" not in kwargs: @@ -1300,6 +1323,8 @@ def fit_pipeline( kwargs["stats"] = Stats(scenario_mock) kwargs["stats"].start_timing() + _validate_metrics(kwargs["metrics"], kwargs["scoring_functions"]) + # Fit a pipeline, which will be stored on disk # which we can later load via the backend ta = ExecuteTaFuncWithQueue( @@ -1307,7 +1332,7 @@ def fit_pipeline( autosklearn_seed=self._seed, abort_on_first_run_crash=False, multi_objectives=["cost"], - cost_for_crash=get_cost_of_crash(kwargs["metric"]), + cost_for_crash=get_cost_of_crash(kwargs["metrics"]), port=self._logger_port, **kwargs, **self._resampling_strategy_arguments, @@ -1492,7 +1517,7 @@ def fit_ensemble( 
backend=copy.deepcopy(self._backend), dataset_name=dataset_name if dataset_name else self._dataset_name, task=task if task else self._task, - metric=self._metric, + metric=self._metrics[0], ensemble_size=ensemble_size if ensemble_size else self._ensemble_size, ensemble_nbest=ensemble_nbest if ensemble_nbest else self._ensemble_nbest, max_models_on_disc=self._max_models_on_disc, @@ -1590,7 +1615,7 @@ def _load_best_individual_model(self): # SingleBest contains the best model found by AutoML ensemble = SingleBest( - metric=self._metric, + metric=self._metrics[0], seed=self._seed, run_history=self.runhistory_, backend=self._backend, @@ -1624,15 +1649,15 @@ def score(self, X, y): # same representation domain prediction = self.InputValidator.target_validator.transform(prediction) - return calculate_metric( + return compute_single_metric( solution=y, prediction=prediction, task_type=self._task, - metric=self._metric, + metric=self._metrics[0], ) def _get_runhistory_models_performance(self): - metric = self._metric + metric = self._metrics[0] data = self.runhistory_.data performance_list = [] for run_key, run_value in data.items(): @@ -1644,7 +1669,10 @@ def _get_runhistory_models_performance(self): endtime = pd.Timestamp( time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(run_value.endtime)) ) - val_score = metric._optimum - (metric._sign * run_value.cost) + cost = run_value.cost + if len(self._metrics) > 1: + cost = cost[0] + val_score = metric._optimum - (metric._sign * cost) train_score = metric._optimum - ( metric._sign * run_value.additional_info["train_loss"] ) @@ -1656,9 +1684,10 @@ def _get_runhistory_models_performance(self): # Append test-scores, if data for test_loss are available. # This is the case, if X_test and y_test where provided. if "test_loss" in run_value.additional_info: - test_score = metric._optimum - ( - metric._sign * run_value.additional_info["test_loss"] - ) + test_loss = run_value.additional_info["test_loss"] + if len(self._metrics) > 1: + test_loss = test_loss[0] + test_score = metric._optimum - (metric._sign * test_loss) scores["single_best_test_score"] = test_score performance_list.append(scores) @@ -1747,14 +1776,11 @@ def cv_results_(self): metric_mask = dict() metric_dict = dict() - metric_name = [] - for metric in self._scoring_functions: - metric_name.append(metric.name) + for metric in itertools.chain(self._metrics, self._scoring_functions): metric_dict[metric.name] = [] metric_mask[metric.name] = [] - mean_test_score = [] mean_fit_time = [] params = [] status = [] @@ -1787,9 +1813,7 @@ def cv_results_(self): param_dict = config.get_dictionary() params.append(param_dict) - mean_test_score.append( - self._metric._optimum - (self._metric._sign * run_value.cost) - ) + mean_fit_time.append(run_value.time) budgets.append(run_key.budget) @@ -1804,7 +1828,18 @@ def cv_results_(self): parameter_dictionaries[hp_name].append(hp_value) masks[hp_name].append(mask_value) + cost = [run_value.cost] if len(self._metrics) == 1 else run_value.cost + for metric_idx, metric in enumerate(self._metrics): + metric_cost = cost[metric_idx] + metric_value = metric._optimum - (metric._sign * metric_cost) + mask_value = False + metric_dict[metric.name].append(metric_value) + metric_mask[metric.name].append(mask_value) + + optimization_metric_names = set(m.name for m in self._metrics) for metric in self._scoring_functions: + if metric.name in optimization_metric_names: + continue if metric.name in run_value.additional_info.keys(): metric_cost = run_value.additional_info[metric.name] 
metric_value = metric._optimum - (metric._sign * metric_cost) @@ -1815,15 +1850,26 @@ def cv_results_(self): metric_dict[metric.name].append(metric_value) metric_mask[metric.name].append(mask_value) - results["mean_test_score"] = np.array(mean_test_score) - for name in metric_name: - masked_array = ma.MaskedArray(metric_dict[name], metric_mask[name]) - results["metric_%s" % name] = masked_array + if len(self._metrics) == 1: + results["mean_test_score"] = np.array(metric_dict[self._metrics[0].name]) + rank_order = -1 * self._metrics[0]._sign * results["mean_test_score"] + results["rank_test_scores"] = scipy.stats.rankdata(rank_order, method="min") + else: + for metric in self._metrics: + key = f"mean_test_{metric.name}" + results[key] = np.array(metric_dict[metric.name]) + rank_order = -1 * metric._sign * results[key] + results[f"rank_test_{metric.name}"] = scipy.stats.rankdata( + rank_order, method="min" + ) + for metric in self._scoring_functions: + masked_array = ma.MaskedArray( + metric_dict[metric.name], metric_mask[metric.name] + ) + results[f"metric_{metric.name}"] = masked_array results["mean_fit_time"] = np.array(mean_fit_time) results["params"] = params - rank_order = -1 * self._metric._sign * results["mean_test_score"] - results["rank_test_scores"] = scipy.stats.rankdata(rank_order, method="min") results["status"] = status results["budgets"] = budgets @@ -1841,7 +1887,10 @@ def sprint_statistics(self) -> str: sio = io.StringIO() sio.write("auto-sklearn results:\n") sio.write(" Dataset name: %s\n" % self._dataset_name) - sio.write(" Metric: %s\n" % self._metric) + if len(self._metrics) == 1: + sio.write(" Metric: %s\n" % self._metrics[0]) + else: + sio.write(" Metrics: %s\n" % self._metrics) idx_success = np.where( np.array( [ @@ -1852,7 +1901,7 @@ def sprint_statistics(self) -> str: ) )[0] if len(idx_success) > 0: - if not self._metric._optimum: + if not self._metrics[0]._optimum: idx_best_run = np.argmin(cv_results["mean_test_score"][idx_success]) else: idx_best_run = np.argmax(cv_results["mean_test_score"][idx_success]) @@ -1912,7 +1961,6 @@ def show_models(self) -> dict[int, Any]: .. 
code-block:: python import sklearn.datasets - import sklearn.metrics import autosklearn.regression X, y = sklearn.datasets.load_diabetes(return_X_y=True) diff --git a/autosklearn/ensemble_builder.py b/autosklearn/ensemble_builder.py index 3707ce84c9..3dec9828ef 100644 --- a/autosklearn/ensemble_builder.py +++ b/autosklearn/ensemble_builder.py @@ -30,7 +30,7 @@ from autosklearn.automl_common.common.utils.backend import Backend from autosklearn.constants import BINARY_CLASSIFICATION from autosklearn.ensembles.ensemble_selection import EnsembleSelection -from autosklearn.metrics import Scorer, calculate_loss, calculate_score +from autosklearn.metrics import Scorer, calculate_losses, calculate_scores from autosklearn.util.logging_ import get_named_client_logger from autosklearn.util.parallel import preload_modules @@ -999,13 +999,13 @@ def compute_loss_per_model(self): # actually read the predictions and compute their respective loss try: y_ensemble = self._read_np_fn(y_ens_fn) - loss = calculate_loss( + loss = calculate_losses( solution=self.y_true_ensemble, prediction=y_ensemble, task_type=self.task_type, - metric=self.metric, + metrics=[self.metric], scoring_functions=None, - ) + )[self.metric.name] if np.isfinite(self.read_losses[y_ens_fn]["ens_loss"]): self.logger.debug( @@ -1511,34 +1511,34 @@ def _add_ensemble_trajectory(self, train_pred, valid_pred, test_pred): performance_stamp = { "Timestamp": pd.Timestamp.now(), - "ensemble_optimization_score": calculate_score( + "ensemble_optimization_score": calculate_scores( solution=self.y_true_ensemble, prediction=train_pred, task_type=self.task_type, - metric=self.metric, + metrics=[self.metric], scoring_functions=None, - ), + )[self.metric.name], } if valid_pred is not None: # TODO: valid_pred are a legacy from competition manager # and this if never happens. 
Re-evaluate Y_valid support - performance_stamp["ensemble_val_score"] = calculate_score( + performance_stamp["ensemble_val_score"] = calculate_scores( solution=self.y_valid, prediction=valid_pred, task_type=self.task_type, - metric=self.metric, + metrics=[self.metric], scoring_functions=None, - ) + )[self.metric.name] # In case test_pred was provided if test_pred is not None: - performance_stamp["ensemble_test_score"] = calculate_score( + performance_stamp["ensemble_test_score"] = calculate_scores( solution=self.y_test, prediction=test_pred, task_type=self.task_type, - metric=self.metric, + metrics=[self.metric], scoring_functions=None, - ) + )[self.metric.name] self.ensemble_history.append(performance_stamp) diff --git a/autosklearn/ensembles/ensemble_selection.py b/autosklearn/ensembles/ensemble_selection.py index 3ae216da01..0c99db64c1 100644 --- a/autosklearn/ensembles/ensemble_selection.py +++ b/autosklearn/ensembles/ensemble_selection.py @@ -1,4 +1,4 @@ -from typing import Any, Dict, List, Optional, Tuple, Union, cast +from typing import Any, Dict, List, Optional, Tuple, Union import random from collections import Counter @@ -8,7 +8,7 @@ from autosklearn.constants import TASK_TYPES from autosklearn.ensembles.abstract_ensemble import AbstractEnsemble -from autosklearn.metrics import Scorer, calculate_loss +from autosklearn.metrics import Scorer, calculate_losses from autosklearn.pipeline.base import BasePipeline @@ -164,18 +164,13 @@ def _fast( out=fant_ensemble_prediction, ) - # calculate_loss is versatile and can return a dict of losses - # when scoring_functions=None, we know it will be a float - losses[j] = cast( - float, - calculate_loss( - solution=labels, - prediction=fant_ensemble_prediction, - task_type=self.task_type, - metric=self.metric, - scoring_functions=None, - ), - ) + losses[j] = calculate_losses( + solution=labels, + prediction=fant_ensemble_prediction, + task_type=self.task_type, + metrics=[self.metric], + scoring_functions=None, + )[self.metric.name] all_best = np.argwhere(losses == np.nanmin(losses)).flatten() @@ -211,18 +206,13 @@ def _slow(self, predictions: List[np.ndarray], labels: np.ndarray) -> None: for j, pred in enumerate(predictions): ensemble.append(pred) ensemble_prediction = np.mean(np.array(ensemble), axis=0) - # calculate_loss is versatile and can return a dict of losses - # when scoring_functions=None, we know it will be a float - losses[j] = cast( - float, - calculate_loss( - solution=labels, - prediction=ensemble_prediction, - task_type=self.task_type, - metric=self.metric, - scoring_functions=None, - ), - ) + losses[j] = calculate_losses( + solution=labels, + prediction=ensemble_prediction, + task_type=self.task_type, + metrics=[self.metric], + scoring_functions=None, + )[self.metric.name] ensemble.pop() best = np.nanargmin(losses) ensemble.append(predictions[best]) diff --git a/autosklearn/estimators.py b/autosklearn/estimators.py index db931a338a..1c283e06e6 100644 --- a/autosklearn/estimators.py +++ b/autosklearn/estimators.py @@ -1,5 +1,7 @@ # -*- encoding: utf-8 -*- -from typing import Any, Dict, Iterable, List, Mapping, Optional, Tuple, Union +from __future__ import annotations + +from typing import Any, Dict, Iterable, List, Mapping, Optional, Sequence, Tuple, Union import dask.distributed import joblib @@ -46,7 +48,7 @@ def __init__( smac_scenario_args=None, logging_config=None, metadata_directory=None, - metric=None, + metric: Scorer | Sequence[Scorer] | None = None, scoring_functions: Optional[List[Scorer]] = None, load_models: bool = 
True, get_trials_callback=None, @@ -417,7 +419,7 @@ def build_automl(self): smac_scenario_args=self.smac_scenario_args, logging_config=self.logging_config, metadata_directory=self.metadata_directory, - metric=self.metric, + metrics=[self.metric] if isinstance(self.metric, Scorer) else self.metric, scoring_functions=self.scoring_functions, get_trials_callback=self.get_trials_callback, dataset_compression=self.dataset_compression, @@ -792,6 +794,9 @@ def leaderboard( What column to sort by. If that column is not present, the sorting defaults to the ``"model_id"`` index column. + Defaults to the metric optimized. Sort by the first objective + in case of a multi-objective optimization problem + sort_order: "auto" or "ascending" or "descending" = "auto" Which sort order to apply to the ``sort_by`` column. If left as ``"auto"``, it will sort by a sensible default where "better" is @@ -816,7 +821,16 @@ def leaderboard( # TODO validate that `self` is fitted. This is required for # self.ensemble_ to get the identifiers of models it will generate # weights for. - column_types = AutoSklearnEstimator._leaderboard_columns() + num_metrics = ( + 1 + if self.metric is None or isinstance(self.metric, Scorer) + else len(self.metric) + ) + column_types = AutoSklearnEstimator._leaderboard_columns(num_metrics) + if num_metrics == 1: + multi_objective_cost_names = [] + else: + multi_objective_cost_names = [f"cost_{i}" for i in range(num_metrics)] # Validation of top_k if ( @@ -857,11 +871,26 @@ def leaderboard( columns = column_types["simple"] # Validation of sorting - if sort_by not in column_types["all"]: - raise ValueError( - f"sort_by='{sort_by}' must be one of included " - f"columns {set(column_types['all'])}" - ) + if sort_by == "cost": + sort_by_cost = True + if num_metrics == 1: + sort_by = ["cost", "model_id"] + else: + sort_by = multi_objective_cost_names + ["model_id"] + else: + sort_by_cost = False + if isinstance(sort_by, str): + if sort_by not in column_types["all"]: + raise ValueError( + f"sort_by='{sort_by}' must be one of included " + f"columns {set(column_types['all'])}" + ) + elif len(set(sort_by) - set(column_types["all"])) > 0: + too_much = set(sort_by) - set(column_types["all"]) + raise ValueError( + f"sort_by='{too_much}' must be in the included columns " + f"{set(column_types['all'])}" + ) valid_sort_orders = ["auto", "ascending", "descending"] if not (isinstance(sort_order, str) and sort_order in valid_sort_orders): @@ -871,30 +900,37 @@ def leaderboard( # To get all the models that were optmized, we collect what we can from # runhistory first. 
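A minimal standalone sketch, assuming the cost_{i} column naming introduced in this diff, of how the leaderboard hunk above expands sort_by="cost" into one sort key per optimization objective (the helper name here is hypothetical, not part of the patch):

def expand_cost_sort_keys(num_metrics: int) -> list:
    # Mirrors the leaderboard logic above: a single objective keeps the plain
    # "cost" column, while multi-objective runs get one "cost_<i>" column per
    # metric, with "model_id" appended as a tie-breaker.
    if num_metrics == 1:
        return ["cost", "model_id"]
    return [f"cost_{i}" for i in range(num_metrics)] + ["model_id"]


assert expand_cost_sort_keys(1) == ["cost", "model_id"]
assert expand_cost_sort_keys(2) == ["cost_0", "cost_1", "model_id"]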
- def has_key(rv, key): + def additional_info_has_key(rv, key): return rv.additional_info and key in rv.additional_info - model_runs = { - rval.additional_info["num_run"]: { - "model_id": rval.additional_info["num_run"], - "seed": rkey.seed, - "budget": rkey.budget, - "duration": rval.time, - "config_id": rkey.config_id, - "start_time": rval.starttime, - "end_time": rval.endtime, - "status": str(rval.status), - "cost": rval.cost, - "train_loss": rval.additional_info["train_loss"] - if has_key(rval, "train_loss") - else None, - "config_origin": rval.additional_info["configuration_origin"] - if has_key(rval, "configuration_origin") - else None, - } - for rkey, rval in self.automl_.runhistory_.data.items() - if has_key(rval, "num_run") - } + model_runs = {} + for rkey, rval in self.automl_.runhistory_.data.items(): + if not additional_info_has_key(rval, "num_run"): + continue + else: + model_key = rval.additional_info["num_run"] + model_run = { + "model_id": rval.additional_info["num_run"], + "seed": rkey.seed, + "budget": rkey.budget, + "duration": rval.time, + "config_id": rkey.config_id, + "start_time": rval.starttime, + "end_time": rval.endtime, + "status": str(rval.status), + "train_loss": rval.additional_info["train_loss"] + if additional_info_has_key(rval, "train_loss") + else None, + "config_origin": rval.additional_info["configuration_origin"] + if additional_info_has_key(rval, "configuration_origin") + else None, + } + if num_metrics == 1: + model_run["cost"] = rval.cost + else: + for cost_idx, cost in enumerate(rval.cost): + model_run[f"cost_{cost_idx}"] = cost + model_runs[model_key] = model_run # Next we get some info about the model itself model_class_strings = { @@ -942,7 +978,7 @@ def has_key(rv, key): # collected. I have no clue why this is but to prevent failures, we fill # the values with NaN if model_id not in model_runs: - model_runs[model_id] = { + model_run = { "model_id": model_id, "seed": pd.NA, "budget": pd.NA, @@ -951,10 +987,16 @@ def has_key(rv, key): "start_time": pd.NA, "end_time": pd.NA, "status": pd.NA, - "cost": pd.NA, "train_loss": pd.NA, "config_origin": pd.NA, + "type": pd.NA, } + if num_metrics == 1: + model_run["cost"] = pd.NA + else: + for cost_idx in range(num_metrics): + model_run[f"cost_{cost_idx}"] = pd.NA + model_runs[model_id] = model_run model_runs[model_id]["ensemble_weight"] = weight @@ -973,8 +1015,13 @@ def has_key(rv, key): # `rank` relies on `cost` so we include `cost` # We drop it later if it's not requested - if "rank" in columns and "cost" not in columns: - columns = [*columns, "cost"] + if "rank" in columns: + if num_metrics == 1 and "cost" not in columns: + columns = [*columns, "cost"] + elif num_metrics > 1 and any( + cost_name not in columns for cost_name in multi_objective_cost_names + ): + columns = columns + list(multi_objective_cost_names) # Finally, convert into a tabular format by converting the dict into # column wise orientation. 
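Illustrative only, assuming the `_leaderboard_columns(num_metrics)` static method added further down in this diff: with more than one metric the leaderboard reports one cost column per objective instead of the single "cost" column.

from autosklearn.estimators import AutoSklearnEstimator

# Expected "simple" column layout for a two-objective run on this branch.
columns = AutoSklearnEstimator._leaderboard_columns(num_metrics=2)
print(columns["simple"])
# ['model_id', 'rank', 'ensemble_weight', 'type', 'cost_0', 'cost_1', 'duration']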
@@ -989,43 +1036,65 @@ def has_key(rv, key): # Give it an index, even if not in the `include` dataframe.set_index("model_id", inplace=True) - # Add the `rank` column if needed, dropping `cost` if it's not + # Add the `rank` column if needed # requested by the user if "rank" in columns: - dataframe.sort_values(by="cost", ascending=True, inplace=True) + if num_metrics == 1: + dataframe.sort_values(by="cost", ascending=True, inplace=True) + else: + dataframe.sort_values(by="cost_0", ascending=True, inplace=True) dataframe.insert( column="rank", value=range(1, len(dataframe) + 1), loc=list(columns).index("rank") - 1, ) # account for `model_id` - if "cost" not in columns: - dataframe.drop("cost", inplace=True) - # Decide on the sort order depending on what it gets sorted by descending_columns = ["ensemble_weight", "duration"] if sort_order == "auto": - ascending_param = False if sort_by in descending_columns else True + ascending_param = [ + False if sby in descending_columns else True for sby in sort_by + ] else: ascending_param = False if sort_order == "descending" else True # Sort by the given column name, defaulting to 'model_id' if not present - if sort_by not in dataframe.columns: + if ( + (not sort_by_cost and len(set(sort_by) - set(dataframe.columns)) > 0) + or (sort_by_cost and "cost" not in dataframe.columns) + or ( + sort_by_cost + and any( + cost_name not in dataframe.columns + for cost_name in multi_objective_cost_names + ) + ) + ): self.automl_._logger.warning( f"sort_by = '{sort_by}' was not present" ", defaulting to sort on the index " "'model_id'" ) sort_by = "model_id" + sort_by_cost = False + ascending_param = True - # Cost can be the same but leave rank all over the place - if "rank" in columns and sort_by == "cost": + # Single objective + if sort_by_cost: dataframe.sort_values( - by=[sort_by, "rank"], ascending=[ascending_param, True], inplace=True + by=sort_by, ascending=[True] * len(sort_by), inplace=True ) else: dataframe.sort_values(by=sort_by, ascending=ascending_param, inplace=True) + if num_metrics == 1: + if "cost" not in columns and "cost" in dataframe.columns: + dataframe.drop("cost", inplace=True) + else: + for cost_name in multi_objective_cost_names: + if cost_name not in columns and cost_name in dataframe.columns: + dataframe.drop(cost_name, inplace=True) + # Lastly, just grab the top_k if top_k == "all" or top_k >= len(dataframe): top_k = len(dataframe) @@ -1035,27 +1104,39 @@ def has_key(rv, key): return dataframe @staticmethod - def _leaderboard_columns() -> Dict[Literal["all", "simple", "detailed"], List[str]]: - all = [ - "model_id", - "rank", - "ensemble_weight", - "type", - "cost", - "duration", - "config_id", - "train_loss", - "seed", - "start_time", - "end_time", - "budget", - "status", - "data_preprocessors", - "feature_preprocessors", - "balancing_strategy", - "config_origin", - ] - simple = ["model_id", "rank", "ensemble_weight", "type", "cost", "duration"] + def _leaderboard_columns( + num_metrics: int, + ) -> Dict[Literal["all", "simple", "detailed"], List[str]]: + if num_metrics == 1: + cost_list = ["cost"] + else: + cost_list = [f"cost_{i}" for i in range(num_metrics)] + all = ( + [ + "model_id", + "rank", + "ensemble_weight", + "type", + ] + + cost_list + + [ + "duration", + "config_id", + "train_loss", + "seed", + "start_time", + "end_time", + "budget", + "status", + "data_preprocessors", + "feature_preprocessors", + "balancing_strategy", + "config_origin", + ] + ) + simple = ( + ["model_id", "rank", "ensemble_weight", "type"] + cost_list + 
["duration"] + ) detailed = all return {"all": all, "detailed": detailed, "simple": simple} diff --git a/autosklearn/evaluation/__init__.py b/autosklearn/evaluation/__init__.py index 89c61d144d..52794fec03 100644 --- a/autosklearn/evaluation/__init__.py +++ b/autosklearn/evaluation/__init__.py @@ -1,5 +1,18 @@ # -*- encoding: utf-8 -*- -from typing import Any, Callable, Dict, List, Optional, Tuple, Type, Union, cast +from __future__ import annotations + +from typing import ( + Any, + Callable, + Dict, + List, + Optional, + Sequence, + Tuple, + Type, + Union, + cast, +) import functools import json @@ -85,23 +98,30 @@ def fit_predict_try_except_decorator( queue.close() -def get_cost_of_crash(metric: Scorer) -> float: - - # The metric must always be defined to extract optimum/worst - if not isinstance(metric, Scorer): - raise ValueError("The metric must be stricly be an instance of Scorer") - - # Autosklearn optimizes the err. This function translates - # worst_possible_result to be a minimization problem. - # For metrics like accuracy that are bounded to [0,1] - # metric.optimum==1 is the worst cost. - # A simple guide is to use greater_is_better embedded as sign - if metric._sign < 0: - worst_possible_result = metric._worst_possible_result - else: - worst_possible_result = metric._optimum - metric._worst_possible_result +def get_cost_of_crash(metrics: Sequence[Scorer]) -> List[float] | float: + """Return the cost of crash. + + Return value can be either a list (multi-objective optimization) or a + raw float (single objective) because SMAC assumes different types in the + two different cases. + """ + costs = [] + for metric in metrics: + if not isinstance(metric, Scorer): + raise ValueError("The metric {metric} must be an instance of Scorer") + + # Autosklearn optimizes the err. This function translates + # worst_possible_result to be a minimization problem. + # For metrics like accuracy that are bounded to [0,1] + # metric.optimum==1 is the worst cost. 
+ # A simple guide is to use greater_is_better embedded as sign + if metric._sign < 0: + worst_possible_result = metric._worst_possible_result + else: + worst_possible_result = metric._optimum - metric._worst_possible_result + costs.append(worst_possible_result) - return worst_possible_result + return costs if len(costs) > 1 else costs[0] def _encode_exit_status( @@ -126,7 +146,7 @@ def __init__( resampling_strategy: Union[ str, BaseCrossValidator, _RepeatedSplits, BaseShuffleSplit ], - metric: Scorer, + metrics: Sequence[Scorer], cost_for_crash: float, abort_on_first_run_crash: bool, port: int, @@ -144,7 +164,7 @@ def __init__( disable_file_output: bool = False, init_params: Optional[Dict[str, Any]] = None, budget_type: Optional[str] = None, - ta: Optional[Callable] = None, + ta: Optional[Callable] = None, # Required by SMAC's parent class **resampling_strategy_args: Any, ): if resampling_strategy == "holdout": @@ -186,13 +206,14 @@ def __init__( par_factor=par_factor, cost_for_crash=self.worst_possible_result, abort_on_first_run_crash=abort_on_first_run_crash, + multi_objectives=multi_objectives, ) self.backend = backend self.autosklearn_seed = autosklearn_seed self.resampling_strategy = resampling_strategy self.initial_num_run = initial_num_run - self.metric = metric + self.metrics = metrics self.resampling_strategy = resampling_strategy self.resampling_strategy_args = resampling_strategy_args self.scoring_functions = scoring_functions @@ -356,7 +377,7 @@ def run( config=config, backend=self.backend, port=self.port, - metric=self.metric, + metrics=self.metrics, seed=self.autosklearn_seed, num_run=num_run, scoring_functions=self.scoring_functions, @@ -550,4 +571,33 @@ def run( autosklearn.evaluation.util.empty_queue(queue) self.logger.info("Finished evaluating configuration %d" % config_id) + + # Do some sanity checking (for multi objective) + if len(self.multi_objectives) > 1: + error = ( + f"Returned costs {cost} does not match the number of objectives" + f" {len(self.multi_objectives)}." 
+ ) + + # If dict convert to array + # Make sure the ordering is correct + if isinstance(cost, dict): + ordered_cost = [] + for name in self.multi_objectives: + if name not in cost: + raise RuntimeError( + f"Objective {name} was not found " + f"in the returned costs ({cost})" + ) + + ordered_cost.append(cost[name]) + cost = ordered_cost + + if isinstance(cost, list): + if len(cost) != len(self.multi_objectives): + raise RuntimeError(error) + + if isinstance(cost, float): + raise RuntimeError(error) + return status, cost, runtime, additional_run_info diff --git a/autosklearn/evaluation/abstract_evaluator.py b/autosklearn/evaluation/abstract_evaluator.py index 7843de6a8a..ab9e961128 100644 --- a/autosklearn/evaluation/abstract_evaluator.py +++ b/autosklearn/evaluation/abstract_evaluator.py @@ -1,4 +1,6 @@ -from typing import Any, Dict, List, Optional, TextIO, Tuple, Type, Union, cast +from __future__ import annotations + +from typing import Any, Dict, List, Optional, Sequence, TextIO, Tuple, Type, Union, cast import logging import multiprocessing @@ -23,7 +25,7 @@ MULTIOUTPUT_REGRESSION, REGRESSION_TASKS, ) -from autosklearn.metrics import Scorer, calculate_loss +from autosklearn.metrics import Scorer, calculate_losses from autosklearn.pipeline.components.base import ThirdPartyComponents, _addons from autosklearn.pipeline.implementations.util import ( convert_multioutput_multiclass_to_multilabel, @@ -184,7 +186,7 @@ def __init__( self, backend: Backend, queue: multiprocessing.Queue, - metric: Scorer, + metrics: Sequence[Scorer], additional_components: Dict[str, ThirdPartyComponents], port: Optional[int], configuration: Optional[Union[int, Configuration]] = None, @@ -219,12 +221,12 @@ def __init__( self.X_test = self.datamanager.data.get("X_test") self.y_test = self.datamanager.data.get("Y_test") - self.metric = metric + self.metrics = metrics self.task_type = self.datamanager.info["task"] self.seed = seed self.output_y_hat_optimization = output_y_hat_optimization - self.scoring_functions = scoring_functions + self.scoring_functions = scoring_functions if scoring_functions else [] if isinstance(disable_file_output, (bool, list)): self.disable_file_output: Union[bool, List[str]] = disable_file_output @@ -326,8 +328,7 @@ def _loss( self, y_true: np.ndarray, y_hat: np.ndarray, - scoring_functions: Optional[List[Scorer]] = None, - ) -> Union[float, Dict[str, float]]: + ) -> Dict[str, float]: """Auto-sklearn follows a minimization goal. The calculate_loss internally translate a score function to a minimization problem. 
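A minimal sketch, assuming this branch's `calculate_losses` and the stock accuracy/balanced_accuracy scorers, of the dict-of-losses convention that `_loss()` now follows: every entry is `metric._optimum - score`, so lower is always better.

import numpy as np

from autosklearn.constants import BINARY_CLASSIFICATION
from autosklearn.metrics import accuracy, balanced_accuracy, calculate_losses

y_true = np.array([0, 1, 1, 0])
y_pred = np.array([0, 1, 0, 0])

losses = calculate_losses(
    solution=y_true,
    prediction=y_pred,
    task_type=BINARY_CLASSIFICATION,
    metrics=[accuracy],
    scoring_functions=[balanced_accuracy],
)
# One loss per requested metric, keyed by name,
# e.g. {'accuracy': 0.25, 'balanced_accuracy': 0.25}
print(losses)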
@@ -338,27 +339,28 @@ def _loss( ---------- y_true """ - scoring_functions = ( - self.scoring_functions if scoring_functions is None else scoring_functions - ) if not isinstance(self.configuration, Configuration): - if scoring_functions: - return {self.metric.name: self.metric._worst_possible_result} - else: - return self.metric._worst_possible_result - - return calculate_loss( - y_true, - y_hat, - self.task_type, - self.metric, - scoring_functions=scoring_functions, - ) + # Dummy prediction + rval = {} + for metric in self.scoring_functions if self.scoring_functions else []: + rval[metric.name] = metric._worst_possible_result + for metric in self.metrics: + rval[metric.name] = metric._worst_possible_result + return rval + + else: + return calculate_losses( + y_true, + y_hat, + self.task_type, + self.metrics, + scoring_functions=self.scoring_functions, + ) def finish_up( self, loss: Union[Dict[str, float], float], - train_loss: Optional[Union[float, Dict[str, float]]], + train_loss: Optional[Dict[str, float]], opt_pred: np.ndarray, valid_pred: np.ndarray, test_pred: np.ndarray, @@ -400,19 +402,25 @@ def finish_up( if file_out_loss is not None: return self.duration, file_out_loss, self.seed, additional_run_info_ - if isinstance(loss, dict): - loss_ = loss - loss = loss_[self.metric.name] + loss_ = loss + if len(self.metrics) == 1: + loss = loss_[self.metrics[0].name] else: - loss_ = {} + loss = {metric.name: loss_[metric.name] for metric in self.metrics} additional_run_info = {} if additional_run_info is None else additional_run_info - for metric_name, value in loss_.items(): - additional_run_info[metric_name] = value + for metric in self.scoring_functions: + if metric.name in loss_: + additional_run_info[metric.name] = loss_[metric.name] additional_run_info["duration"] = self.duration additional_run_info["num_run"] = self.num_run if train_loss is not None: - additional_run_info["train_loss"] = train_loss + if len(self.metrics) == 1: + additional_run_info["train_loss"] = train_loss[self.metrics[0].name] + else: + additional_run_info["train_loss"] = [ + train_loss[metric.name] for metric in self.metrics + ] if validation_loss is not None: additional_run_info["validation_loss"] = validation_loss if test_loss is not None: @@ -433,14 +441,14 @@ def calculate_auxiliary_losses( self, Y_valid_pred: np.ndarray, Y_test_pred: np.ndarray, - ) -> Tuple[Optional[float], Optional[float]]: + ) -> Tuple[Optional[float | Sequence[float]], Optional[float | Sequence[float]]]: if Y_valid_pred is not None: if self.y_valid is not None: validation_loss: Optional[Union[float, Dict[str, float]]] = self._loss( self.y_valid, Y_valid_pred ) - if isinstance(validation_loss, dict): - validation_loss = validation_loss[self.metric.name] + if len(self.metrics) == 1: + validation_loss = validation_loss[self.metrics[0].name] else: validation_loss = None else: @@ -451,8 +459,8 @@ def calculate_auxiliary_losses( test_loss: Optional[Union[float, Dict[str, float]]] = self._loss( self.y_test, Y_test_pred ) - if isinstance(test_loss, dict): - test_loss = test_loss[self.metric.name] + if len(self.metrics) == 1: + test_loss = test_loss[self.metrics[0].name] else: test_loss = None else: diff --git a/autosklearn/evaluation/test_evaluator.py b/autosklearn/evaluation/test_evaluator.py index 4b6cf8452c..e76186aa06 100644 --- a/autosklearn/evaluation/test_evaluator.py +++ b/autosklearn/evaluation/test_evaluator.py @@ -1,5 +1,5 @@ # -*- encoding: utf-8 -*- -from typing import Any, Dict, List, Optional, Tuple, Union +from typing import Any, 
Dict, List, Optional, Sequence, Tuple, Union import multiprocessing @@ -12,7 +12,7 @@ AbstractEvaluator, _fit_and_suppress_warnings, ) -from autosklearn.metrics import Scorer, calculate_loss +from autosklearn.metrics import Scorer, calculate_losses from autosklearn.pipeline.components.base import ThirdPartyComponents __all__ = ["eval_t", "TestEvaluator"] @@ -23,7 +23,7 @@ def __init__( self, backend: Backend, queue: multiprocessing.Queue, - metric: Scorer, + metrics: Sequence[Scorer], additional_components: Dict[str, ThirdPartyComponents], port: Optional[int], configuration: Optional[Union[int, Configuration]] = None, @@ -39,7 +39,7 @@ def __init__( queue=queue, port=port, configuration=configuration, - metric=metric, + metrics=metrics, additional_components=additional_components, scoring_functions=scoring_functions, seed=seed, @@ -83,22 +83,22 @@ def predict_and_loss( Y_pred = self.predict_function( self.X_train, self.model, self.task_type, self.Y_train ) - err = calculate_loss( + err = calculate_losses( solution=self.Y_train, prediction=Y_pred, task_type=self.task_type, - metric=self.metric, + metrics=self.metrics, scoring_functions=self.scoring_functions, ) else: Y_pred = self.predict_function( self.X_test, self.model, self.task_type, self.Y_train ) - err = calculate_loss( + err = calculate_losses( solution=self.Y_test, prediction=Y_pred, task_type=self.task_type, - metric=self.metric, + metrics=self.metrics, scoring_functions=self.scoring_functions, ) @@ -111,7 +111,7 @@ def eval_t( queue: multiprocessing.Queue, config: Union[int, Configuration], backend: Backend, - metric: Scorer, + metrics: Sequence[Scorer], seed: int, num_run: int, instance: Dict[str, Any], @@ -129,7 +129,7 @@ def eval_t( evaluator = TestEvaluator( configuration=config, backend=backend, - metric=metric, + metrics=metrics, seed=seed, port=port, queue=queue, diff --git a/autosklearn/evaluation/train_evaluator.py b/autosklearn/evaluation/train_evaluator.py index 7a047d3e10..f6317ca94e 100644 --- a/autosklearn/evaluation/train_evaluator.py +++ b/autosklearn/evaluation/train_evaluator.py @@ -1,4 +1,4 @@ -from typing import Any, Dict, List, Optional, Tuple, Union, cast +from typing import Any, Dict, List, Optional, Sequence, Tuple, Union, cast import copy import json @@ -182,7 +182,7 @@ def __init__( self, backend: Backend, queue: multiprocessing.Queue, - metric: Scorer, + metrics: Sequence[Scorer], additional_components: Dict[str, ThirdPartyComponents], port: Optional[int], configuration: Optional[Union[int, Configuration]] = None, @@ -210,7 +210,7 @@ def __init__( queue=queue, port=port, configuration=configuration, - metric=metric, + metrics=metrics, additional_components=additional_components, scoring_functions=scoring_functions, seed=seed, @@ -328,8 +328,8 @@ def fit_predict_and_loss(self, iterative: bool = False) -> None: y = _get_y_array(self.Y_train, self.task_type) - # stores train loss of each fold. - train_losses = [np.NaN] * self.num_cv_folds + # stores train loss(es) of each fold. + train_losses = [dict()] * self.num_cv_folds # used as weights when averaging train losses. train_fold_weights = [np.NaN] * self.num_cv_folds # stores opt (validation) loss of each fold. @@ -395,8 +395,6 @@ def fit_predict_and_loss(self, iterative: bool = False) -> None: Y_test_pred[i] = test_pred train_splits[i] = train_indices - # Compute train loss of this fold and store it. train_loss could - # either be a scalar or a dict of scalars with metrics as keys. 
train_loss = self._loss( self.Y_train.iloc[train_indices] if hasattr(self.Y_train, "iloc") @@ -437,37 +435,24 @@ def fit_predict_and_loss(self, iterative: bool = False) -> None: w / sum(opt_fold_weights) for w in opt_fold_weights ] - # train_losses is a list of either scalars or dicts. If it contains - # dicts, then train_loss is computed using the target metric - # (self.metric). - if all(isinstance(elem, dict) for elem in train_losses): - train_loss = np.average( + train_loss = { + metric.name: np.average( [ - train_losses[i][str(self.metric)] + train_losses[i][str(metric)] for i in range(self.num_cv_folds) ], weights=train_fold_weights_percentage, ) - else: - train_loss = np.average( - train_losses, weights=train_fold_weights_percentage - ) + for metric in self.metrics + } # if all_scoring_function is true, return a dict of opt_loss. # Otherwise, return a scalar. - if self.scoring_functions: - opt_loss = {} - for metric in opt_losses[0].keys(): - opt_loss[metric] = np.average( - [ - opt_losses[i][metric] - for i in range(self.num_cv_folds) - ], - weights=opt_fold_weights_percentage, - ) - else: - opt_loss = np.average( - opt_losses, weights=opt_fold_weights_percentage + opt_loss = {} + for metric in opt_losses[0].keys(): + opt_loss[metric] = np.average( + [opt_losses[i][metric] for i in range(self.num_cv_folds)], + weights=opt_fold_weights_percentage, ) Y_targets = self.Y_targets @@ -614,8 +599,6 @@ def fit_predict_and_loss(self, iterative: bool = False) -> None: Y_test_pred[i] = test_pred train_splits[i] = train_split - # Compute train loss of this fold and store it. train_loss could - # either be a scalar or a dict of scalars with metrics as keys. train_loss = self._loss( self.Y_train_targets[train_split], train_pred, @@ -642,30 +625,24 @@ def fit_predict_and_loss(self, iterative: bool = False) -> None: ] opt_fold_weights = [w / sum(opt_fold_weights) for w in opt_fold_weights] - # train_losses is a list of either scalars or dicts. If it contains dicts, - # then train_loss is computed using the target metric (self.metric). - if all(isinstance(elem, dict) for elem in train_losses): - train_loss = np.average( - [ - train_losses[i][str(self.metric)] - for i in range(self.num_cv_folds) - ], + train_loss = { + metric.name: np.average( + [train_losses[i][str(metric)] for i in range(self.num_cv_folds)], weights=train_fold_weights, ) - else: - train_loss = np.average(train_losses, weights=train_fold_weights) + for metric in self.metrics + } # if all_scoring_function is true, return a dict of opt_loss. Otherwise, # return a scalar. 
- if self.scoring_functions: - opt_loss = {} - for metric in opt_losses[0].keys(): - opt_loss[metric] = np.average( - [opt_losses[i][metric] for i in range(self.num_cv_folds)], - weights=opt_fold_weights, - ) - else: - opt_loss = np.average(opt_losses, weights=opt_fold_weights) + opt_loss = {} + for metric_name in list(opt_losses[0].keys()) + [ + metric.name for metric in self.metrics + ]: + opt_loss[metric_name] = np.average( + [opt_losses[i][metric_name] for i in range(self.num_cv_folds)], + weights=opt_fold_weights, + ) Y_targets = self.Y_targets Y_train_targets = self.Y_train_targets @@ -1316,7 +1293,7 @@ def eval_holdout( str, BaseCrossValidator, _RepeatedSplits, BaseShuffleSplit ], resampling_strategy_args: Dict[str, Optional[Union[float, int, str]]], - metric: Scorer, + metrics: Sequence[Scorer], seed: int, num_run: int, instance: str, @@ -1338,7 +1315,7 @@ def eval_holdout( queue=queue, resampling_strategy=resampling_strategy, resampling_strategy_args=resampling_strategy_args, - metric=metric, + metrics=metrics, configuration=config, seed=seed, num_run=num_run, @@ -1363,7 +1340,7 @@ def eval_iterative_holdout( str, BaseCrossValidator, _RepeatedSplits, BaseShuffleSplit ], resampling_strategy_args: Dict[str, Optional[Union[float, int, str]]], - metric: Scorer, + metrics: Sequence[Scorer], seed: int, num_run: int, instance: str, @@ -1383,7 +1360,7 @@ def eval_iterative_holdout( port=port, config=config, backend=backend, - metric=metric, + metrics=metrics, resampling_strategy=resampling_strategy, resampling_strategy_args=resampling_strategy_args, seed=seed, @@ -1410,7 +1387,7 @@ def eval_partial_cv( str, BaseCrossValidator, _RepeatedSplits, BaseShuffleSplit ], resampling_strategy_args: Dict[str, Optional[Union[float, int, str]]], - metric: Scorer, + metrics: Sequence[Scorer], seed: int, num_run: int, instance: str, @@ -1435,7 +1412,7 @@ def eval_partial_cv( backend=backend, port=port, queue=queue, - metric=metric, + metrics=metrics, configuration=config, resampling_strategy=resampling_strategy, resampling_strategy_args=resampling_strategy_args, @@ -1463,7 +1440,7 @@ def eval_partial_cv_iterative( str, BaseCrossValidator, _RepeatedSplits, BaseShuffleSplit ], resampling_strategy_args: Dict[str, Optional[Union[float, int, str]]], - metric: Scorer, + metrics: Sequence[Scorer], seed: int, num_run: int, instance: str, @@ -1484,7 +1461,7 @@ def eval_partial_cv_iterative( queue=queue, config=config, backend=backend, - metric=metric, + metrics=metrics, resampling_strategy=resampling_strategy, resampling_strategy_args=resampling_strategy_args, seed=seed, @@ -1511,7 +1488,7 @@ def eval_cv( str, BaseCrossValidator, _RepeatedSplits, BaseShuffleSplit ], resampling_strategy_args: Dict[str, Optional[Union[float, int, str]]], - metric: Scorer, + metrics: Sequence[Scorer], seed: int, num_run: int, instance: str, @@ -1531,7 +1508,7 @@ def eval_cv( backend=backend, port=port, queue=queue, - metric=metric, + metrics=metrics, configuration=config, seed=seed, num_run=num_run, @@ -1559,7 +1536,7 @@ def eval_iterative_cv( str, BaseCrossValidator, _RepeatedSplits, BaseShuffleSplit ], resampling_strategy_args: Dict[str, Optional[Union[float, int, str]]], - metric: Scorer, + metrics: Sequence[Scorer], seed: int, num_run: int, instance: str, @@ -1578,7 +1555,7 @@ def eval_iterative_cv( eval_cv( backend=backend, queue=queue, - metric=metric, + metrics=metrics, config=config, seed=seed, num_run=num_run, diff --git a/autosklearn/experimental/askl2.py b/autosklearn/experimental/askl2.py index 65ef9b2def..8e9112e48f 
100644 --- a/autosklearn/experimental/askl2.py +++ b/autosklearn/experimental/askl2.py @@ -93,6 +93,8 @@ def __call__( metalearning_configurations, n_jobs, dask_client, + multi_objective_algorithm, + multi_objective_kwargs, ): from smac.facade.smac_ac_facade import SMAC4AC from smac.intensification.simple_intensifier import SimpleIntensifier @@ -122,6 +124,8 @@ def __call__( run_id=seed, n_jobs=n_jobs, dask_client=dask_client, + multi_objective_algorithm=multi_objective_algorithm, + multi_objective_kwargs=multi_objective_kwargs, ) @@ -141,6 +145,8 @@ def __call__( metalearning_configurations, n_jobs, dask_client, + multi_objective_algorithm, + multi_objective_kwargs, ): from smac.facade.smac_ac_facade import SMAC4AC from smac.intensification.successive_halving import SuccessiveHalving @@ -178,6 +184,8 @@ def __call__( }, dask_client=dask_client, n_jobs=n_jobs, + multi_objective_algorithm=multi_objective_algorithm, + multi_objective_kwargs=multi_objective_kwargs, ) smac4ac.solver.epm_chooser.min_samples_model = int( len(scenario.cs.get_hyperparameters()) / 2 diff --git a/autosklearn/metrics/__init__.py b/autosklearn/metrics/__init__.py index 3234329658..3104716da3 100644 --- a/autosklearn/metrics/__init__.py +++ b/autosklearn/metrics/__init__.py @@ -1,6 +1,7 @@ from abc import ABCMeta, abstractmethod -from typing import Any, Callable, Dict, List, Optional, Union, cast +from typing import Any, Callable, Dict, List, Optional, Sequence +import collections from functools import partial from itertools import product @@ -384,17 +385,47 @@ def make_scorer( CLASSIFICATION_METRICS[name] = scorer -def calculate_score( +def _validate_metrics( + metrics: Sequence[Scorer], + scoring_functions: Optional[List[Scorer]] = None, +) -> None: + """ + Validate metrics given to Auto-sklearn. Raises an Exception in case of a problem. + + metrics: Sequence[Scorer] + A list of objects that hosts a function to calculate how good the + prediction is according to the solution. + scoring_functions: Optional[List[Scorer]] + A list of metrics to calculate multiple losses + """ + + to_score = list(metrics) + if scoring_functions: + to_score.extend(scoring_functions) + + if len(metrics) == 0: + raise ValueError("Number of metrics to compute must be greater than zero.") + + metric_counter = collections.Counter(to_score) + metric_names_counter = collections.Counter(metric.name for metric in to_score) + if len(metric_counter) != len(metric_names_counter): + raise ValueError( + "Error in metrics passed to Auto-sklearn. A metric name was used " + "multiple times for different metrics!" + ) + + +def calculate_scores( solution: np.ndarray, prediction: np.ndarray, task_type: int, - metric: Scorer, + metrics: Sequence[Scorer], scoring_functions: Optional[List[Scorer]] = None, -) -> Union[float, Dict[str, float]]: +) -> Dict[str, float]: """ - Returns a score (a magnitude that allows casting the + Returns the scores (a magnitude that allows casting the optimization problem as a maximization one) for the - given Auto-Sklearn Scorer object + given Auto-Sklearn Scorer objects. Parameters ---------- @@ -405,82 +436,83 @@ def calculate_score( task_type: int To understand if the problem task is classification or regression - metric: Scorer - Object that host a function to calculate how good the + metrics: Sequence[Scorer] + A list of objects that hosts a function to calculate how good the prediction is according to the solution. 
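A short sketch, exercising only behaviour visible in the `_validate_metrics` hunk above, of the two failure modes it guards against: an empty metric list and two different Scorer objects sharing one name.

import sklearn.metrics

from autosklearn.metrics import _validate_metrics, accuracy, make_scorer

_validate_metrics([accuracy])  # a single, uniquely named metric passes

# Different metric object reusing the name "accuracy"
clash = make_scorer("accuracy", sklearn.metrics.balanced_accuracy_score)
try:
    _validate_metrics([accuracy, clash])
except ValueError as err:
    print(err)  # a metric name was used multiple times for different metrics

try:
    _validate_metrics([])
except ValueError as err:
    print(err)  # at least one optimization metric is required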
scoring_functions: List[Scorer] A list of metrics to calculate multiple losses Returns ------- - float or Dict[str, float] + Dict[str, float] """ if task_type not in TASK_TYPES: raise NotImplementedError(task_type) - if scoring_functions: - score_dict = dict() - if task_type in REGRESSION_TASKS: - for metric_ in scoring_functions + [metric]: - - try: - score_dict[metric_.name] = _compute_scorer( - metric_, prediction, solution, task_type - ) - except ValueError as e: - print(e, e.args[0]) - if ( - e.args[0] - == "Mean Squared Logarithmic Error cannot be used when " - "targets contain negative values." - ): - continue - else: - raise e - - else: - for metric_ in scoring_functions + [metric]: + _validate_metrics(metrics=metrics, scoring_functions=scoring_functions) - # TODO maybe annotate metrics to define which cases they can - # handle? + to_score = list(metrics) + if scoring_functions: + to_score.extend(scoring_functions) - try: - score_dict[metric_.name] = _compute_scorer( - metric_, prediction, solution, task_type - ) - except ValueError as e: - if e.args[0] == "multiclass format is not supported": - continue - elif ( - e.args[0] == "Samplewise metrics are not available " - "outside of multilabel classification." - ): - continue - elif ( - e.args[0] == "Target is multiclass but " - "average='binary'. Please choose another average " - "setting, one of [None, 'micro', 'macro', 'weighted']." - ): - continue - else: - raise e - - return score_dict + score_dict = dict() + if task_type in REGRESSION_TASKS: + for metric_ in to_score: + + try: + score_dict[metric_.name] = _compute_single_scorer( + metric_, prediction, solution, task_type + ) + except ValueError as e: + print(e, e.args[0]) + if ( + e.args[0] == "Mean Squared Logarithmic Error cannot be used when " + "targets contain negative values." + ): + continue + else: + raise e else: - return _compute_scorer(metric, prediction, solution, task_type) + for metric_ in to_score: + + # TODO maybe annotate metrics to define which cases they can + # handle? + + try: + score_dict[metric_.name] = _compute_single_scorer( + metric_, prediction, solution, task_type + ) + except ValueError as e: + if e.args[0] == "multiclass format is not supported": + continue + elif ( + e.args[0] == "Samplewise metrics are not available " + "outside of multilabel classification." + ): + continue + elif ( + e.args[0] == "Target is multiclass but " + "average='binary'. Please choose another average " + "setting, one of [None, 'micro', 'macro', 'weighted']." + ): + continue + else: + raise e + + return score_dict -def calculate_loss( +def calculate_losses( solution: np.ndarray, prediction: np.ndarray, task_type: int, - metric: Scorer, + metrics: Sequence[Scorer], scoring_functions: Optional[List[Scorer]] = None, -) -> Union[float, Dict[str, float]]: +) -> Dict[str, float]: """ - Returns a loss (a magnitude that allows casting the + Returns the losses (a magnitude that allows casting the optimization problem as a minimization one) for the - given Auto-Sklearn Scorer object + given Auto-Sklearn Scorer objects. Parameters ---------- @@ -491,45 +523,39 @@ def calculate_loss( task_type: int To understand if the problem task is classification or regression - metric: Scorer - Object that host a function to calculate how good the + metrics: Sequence[Scorer] + A list of objects that hosts a function to calculate how good the prediction is according to the solution. 
scoring_functions: List[Scorer] A list of metrics to calculate multiple losses Returns ------- - float or Dict[str, float] + Dict[str, float] A loss function for each of the provided scorer objects """ - score = calculate_score( + score = calculate_scores( solution=solution, prediction=prediction, task_type=task_type, - metric=metric, + metrics=metrics, scoring_functions=scoring_functions, ) + scoring_functions = scoring_functions if scoring_functions else [] - if scoring_functions: - score = cast(Dict, score) - # we expect a dict() object for which we should calculate the loss - loss_dict = dict() - for metric_ in scoring_functions + [metric]: - # TODO: When metrics are annotated with type_of_target support - # we can remove this check - if metric_.name not in score: - continue - # maybe metric argument is not in scoring_functions - # so append it to the list. Rather than check if such - # is the case, redefining loss_dict[metric] is less expensive - loss_dict[metric_.name] = metric_._optimum - score[metric_.name] - return loss_dict - else: - rval = metric._optimum - cast(float, score) - return rval + # we expect a dict() object for which we should calculate the loss + loss_dict = dict() + for metric_ in scoring_functions + list(metrics): + # maybe metric argument is not in scoring_functions + # TODO: When metrics are annotated with type_of_target support + # we can remove this check + if metric_.name not in score: + continue + loss_dict[metric_.name] = metric_._optimum - score[metric_.name] + return loss_dict -def calculate_metric( +def compute_single_metric( metric: Scorer, prediction: np.ndarray, solution: np.ndarray, task_type: int ) -> float: """ @@ -553,7 +579,7 @@ def calculate_metric( ------- float """ - score = _compute_scorer( + score = _compute_single_scorer( solution=solution, prediction=prediction, metric=metric, @@ -562,7 +588,7 @@ def calculate_metric( return metric._sign * score -def _compute_scorer( +def _compute_single_scorer( metric: Scorer, prediction: np.ndarray, solution: np.ndarray, task_type: int ) -> float: """ diff --git a/autosklearn/smbo.py b/autosklearn/smbo.py index 608c58921d..cd83e94e1e 100644 --- a/autosklearn/smbo.py +++ b/autosklearn/smbo.py @@ -1,5 +1,5 @@ import typing -from typing import Dict, List, Optional +from typing import Dict, List, Optional, Sequence import copy import json @@ -16,6 +16,7 @@ from smac.facade.smac_ac_facade import SMAC4AC from smac.intensification.intensification import Intensifier from smac.intensification.simple_intensifier import SimpleIntensifier +from smac.optimizer.multi_objective.parego import ParEGO from smac.runhistory.runhistory2epm import RunHistory2EPM4LogCost from smac.scenario.scenario import Scenario from smac.tae.dask_runner import DaskParallelRunner @@ -40,6 +41,7 @@ ) from autosklearn.metalearning.metalearning.meta_base import MetaBase from autosklearn.metalearning.mismbo import suggest_via_metalearning +from autosklearn.metrics import Scorer from autosklearn.util.logging_ import get_named_client_logger from autosklearn.util.parallel import preload_modules from autosklearn.util.stopwatch import StopWatch @@ -218,6 +220,8 @@ def get_smac_object( metalearning_configurations, n_jobs, dask_client, + multi_objective_algorithm, + multi_objective_kwargs, ): if len(scenario_dict["instances"]) > 1: intensifier = Intensifier @@ -242,6 +246,8 @@ def get_smac_object( intensifier=intensifier, dask_client=dask_client, n_jobs=n_jobs, + multi_objective_algorithm=multi_objective_algorithm, + 
multi_objective_kwargs=multi_objective_kwargs, ) @@ -254,7 +260,7 @@ def __init__( total_walltime_limit, func_eval_time_limit, memory_limit, - metric, + metrics: Sequence[Scorer], stopwatch: StopWatch, n_jobs, dask_client: dask.distributed.Client, @@ -281,7 +287,7 @@ def __init__( # data related self.dataset_name = dataset_name self.datamanager = None - self.metric = metric + self.metrics = metrics self.task = None self.backend = backend self.port = port @@ -300,7 +306,7 @@ def __init__( self.resampling_strategy_args = resampling_strategy_args # and a bunch of useful limits - self.worst_possible_result = get_cost_of_crash(self.metric) + self.worst_possible_result = get_cost_of_crash(self.metrics) self.total_walltime_limit = int(total_walltime_limit) self.func_eval_time_limit = int(func_eval_time_limit) self.memory_limit = memory_limit @@ -355,7 +361,7 @@ def collect_metalearning_suggestions(self, meta_base): metalearning_configurations = _get_metalearning_configurations( meta_base=meta_base, basename=self.dataset_name, - metric=self.metric, + metric=self.metrics[0], configuration_space=self.config_space, task=self.task, is_sparse=self.datamanager.info["is_sparse"], @@ -469,7 +475,7 @@ def run_smbo(self): initial_num_run=num_run, include=self.include, exclude=self.exclude, - metric=self.metric, + metrics=self.metrics, memory_limit=self.memory_limit, disable_file_output=self.disable_file_output, scoring_functions=self.scoring_functions, @@ -535,6 +541,13 @@ def run_smbo(self): "n_jobs": self.n_jobs, "dask_client": self.dask_client, } + if len(self.metrics) > 1: + smac_args["multi_objective_algorithm"] = ParEGO + smac_args["multi_objective_kwargs"] = {"rho": 0.05} + scenario_dict["multi_objectives"] = [metric.name for metric in self.metrics] + else: + smac_args["multi_objective_algorithm"] = None + smac_args["multi_objective_kwargs"] = {} if self.get_smac_object_callback is not None: smac = self.get_smac_object_callback(**smac_args) else: @@ -580,7 +593,7 @@ def get_metalearning_suggestions(self): "files", "%s_%s_%s" % ( - self.metric, + self.metrics[0], TASK_TYPES_TO_STRING[meta_task], "sparse" if self.datamanager.info["is_sparse"] else "dense", ), @@ -607,7 +620,7 @@ def get_metalearning_suggestions(self): self.metadata_directory, "%s_%s_%s" % ( - self.metric, + self.metrics[0], TASK_TYPES_TO_STRING[meta_task], "sparse" if self.datamanager.info["is_sparse"] else "dense", ), diff --git a/examples/40_advanced/example_metrics.py b/examples/40_advanced/example_metrics.py index 33d0f678fd..7784491746 100644 --- a/examples/40_advanced/example_metrics.py +++ b/examples/40_advanced/example_metrics.py @@ -71,18 +71,18 @@ def error_wk(solution, prediction, extra_argument): print("#" * 80) print("Use predefined accuracy metric") +scorer = autosklearn.metrics.accuracy cls = autosklearn.classification.AutoSklearnClassifier( time_left_for_this_task=60, per_run_time_limit=30, seed=1, - metric=autosklearn.metrics.accuracy, + metric=scorer, ) cls.fit(X_train, y_train) predictions = cls.predict(X_test) -score = sklearn.metrics.accuracy_score(y_test, predictions) -metric_name = cls.automl_._metric.name -print(f"Accuracy score {score:.3f} using {metric_name}") +score = scorer(y_test, predictions) +print(f"Accuracy score {score:.3f} using {scorer.name}") ############################################################################ # Second example: Use own accuracy metric @@ -108,8 +108,7 @@ def error_wk(solution, prediction, extra_argument): predictions = cls.predict(X_test) score = accuracy_scorer(y_test, 
predictions) -metric_name = cls.automl_._metric.name -print(f"Accuracy score {score:.3f} using {metric_name:s}") +print(f"Accuracy score {score:.3f} using {accuracy_scorer.name:s}") ############################################################################ # Third example: Use own error metric @@ -135,8 +134,7 @@ def error_wk(solution, prediction, extra_argument): cls.predictions = cls.predict(X_test) score = error_rate(y_test, predictions) -metric_name = cls.automl_._metric.name -print(f"Error score {score:.3f} using {metric_name:s}") +print(f"Error score {score:.3f} using {error_rate.name:s}") ############################################################################ # Fourth example: Use own accuracy metric with additional argument @@ -160,8 +158,7 @@ def error_wk(solution, prediction, extra_argument): predictions = cls.predict(X_test) score = accuracy_scorer(y_test, predictions) -metric_name = cls.automl_._metric.name -print(f"Accuracy score {score:.3f} using {metric_name:s}") +print(f"Accuracy score {score:.3f} using {accuracy_scorer.name:s}") ############################################################################ # Fifth example: Use own accuracy metric with additional argument @@ -188,5 +185,4 @@ def error_wk(solution, prediction, extra_argument): predictions = cls.predict(X_test) score = error_rate(y_test, predictions) -metric_name = cls.automl_._metric.name -print(f"Error score {score:.3f} using {metric_name:s}") +print(f"Error score {score:.3f} using {error_rate.name:s}") diff --git a/examples/40_advanced/example_multi_objective.py b/examples/40_advanced/example_multi_objective.py new file mode 100644 index 0000000000..f81f0d4709 --- /dev/null +++ b/examples/40_advanced/example_multi_objective.py @@ -0,0 +1,65 @@ +# -*- encoding: utf-8 -*- +""" +============================ +Multi-objective Optimization +============================ + +The following example shows how to fit *auto-sklearn* to optimize for two +competing metrics: `precision` and `recall` (read more on this tradeoff +in the `scikit-learn docs `_). + +Auto-sklearn uses `SMAC3's implementation of ParEGO `_. +Multi-objective ensembling and proper access to the full Pareto front will be added in the near +future.
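The docstring above leaves the ParEGO details to SMAC3. Purely for intuition, here is the standard augmented Tchebycheff scalarization that ParEGO applies to the per-metric losses, written as a small standalone sketch; this is not SMAC3's code, the weight vector is something the optimizer redraws internally, and the `rho=0.05` default only mirrors the value this change passes to SMAC in `smbo.py`.

from typing import Sequence


def parego_scalarization(
    losses: Sequence[float], weights: Sequence[float], rho: float = 0.05
) -> float:
    """Augmented Tchebycheff scalarization (illustrative sketch, not SMAC3 code)."""
    weighted = [w * loss for w, loss in zip(weights, losses)]
    return max(weighted) + rho * sum(weighted)


# E.g. a precision loss of 0.05 and a recall loss of 0.10 under equal weights:
print(parego_scalarization([0.05, 0.10], weights=[0.5, 0.5]))  # 0.05375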
+""" +from pprint import pprint + +import sklearn.datasets +import sklearn.metrics + +import autosklearn.classification +import autosklearn.metrics + + +############################################################################ +# Data Loading +# ============ + +X, y = sklearn.datasets.load_breast_cancer(return_X_y=True) +X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split( + X, y, random_state=1 +) + +############################################################################ +# Build and fit a classifier +# ========================== + +automl = autosklearn.classification.AutoSklearnClassifier( + time_left_for_this_task=30, + tmp_folder="/tmp/autosklearn_multi_objective_example_tmp", + metric=[autosklearn.metrics.precision, autosklearn.metrics.recall], +) +automl.fit(X_train, y_train, dataset_name="breast_cancer") + +############################################################################ +# Compute the two competing metrics +# ================================= + +predictions = automl.predict(X_test) +print("Precision", sklearn.metrics.precision_score(y_test, predictions)) +print("Recall", sklearn.metrics.recall_score(y_test, predictions)) + +############################################################################ +# View the models found by auto-sklearn +# ===================================== +# They are by default sorted by the first metric given to *auto-sklearn*. + +print(automl.leaderboard()) + +############################################################################ +# ``cv_results`` also contains both metrics +# ========================================= +# Similarly to the leaderboard, they are sorted by the first metric given +# to *auto-sklearn*. + +pprint(automl.cv_results_) diff --git a/examples/60_search/example_random_search.py b/examples/60_search/example_random_search.py index 520c8c18b0..908fe44ffe 100644 --- a/examples/60_search/example_random_search.py +++ b/examples/60_search/example_random_search.py @@ -45,6 +45,8 @@ def get_roar_object_callback( metalearning_configurations, n_jobs, dask_client, + multi_objective_algorithm, # This argument will be ignored as ROAR does not yet support multi-objective optimization + multi_objective_kwargs, ): """Random online adaptive racing.""" @@ -91,7 +93,15 @@ def get_roar_object_callback( # Fit a classifier using Random Search # ==================================== def get_random_search_object_callback( - scenario_dict, seed, ta, ta_kwargs, metalearning_configurations, n_jobs, dask_client + scenario_dict, + seed, + ta, + ta_kwargs, + metalearning_configurations, + n_jobs, + dask_client, + multi_objective_algorithm, # This argument will be ignored as ROAR does not yet support multi-objective optimization + multi_objective_kwargs, ): """Random search""" diff --git a/examples/60_search/example_successive_halving.py b/examples/60_search/example_successive_halving.py index e57be7f157..71749f5668 100644 --- a/examples/60_search/example_successive_halving.py +++ b/examples/60_search/example_successive_halving.py @@ -37,6 +37,8 @@ def get_smac_object( metalearning_configurations, n_jobs, dask_client, + multi_objective_algorithm, # This argument will be ignored as SH does not yet support multi-objective optimization + multi_objective_kwargs, ): from smac.facade.smac_ac_facade import SMAC4AC from smac.intensification.successive_halving import SuccessiveHalving diff --git a/test/test_automl/test_dummy_predictions.py b/test/test_automl/test_dummy_predictions.py index 9a268d1a2c..3b9350ce8b 100644 --- 
a/test/test_automl/test_dummy_predictions.py +++ b/test/test_automl/test_dummy_predictions.py @@ -66,7 +66,7 @@ def test_produces_correct_output( * It should produce predictions "predictions_ensemble_1337_1_0.0.npy" """ seed = 1337 - automl = make_automl(metric=metric, seed=seed) + automl = make_automl(metrics=[metric], seed=seed) automl._logger = mock_logger datamanager = make_sklearn_dataset( @@ -115,7 +115,7 @@ def test_runs_with_correct_args( dataset = "iris" task = MULTICLASS_CLASSIFICATION - automl = make_automl(metric=accuracy) + automl = make_automl(metrics=[accuracy]) automl._logger = mock_logger datamanager = make_sklearn_dataset( @@ -159,7 +159,7 @@ def test_crash_due_to_memory_exception( dataset = "iris" task = MULTICLASS_CLASSIFICATION - automl = make_automl(metric=accuracy) + automl = make_automl(metrics=[accuracy]) automl._logger = mock_logger datamanager = make_sklearn_dataset( @@ -181,5 +181,5 @@ def test_crash_due_to_memory_exception( def test_raises_if_no_metric_set(make_automl: Callable[..., AutoML]) -> None: automl = make_automl() - with pytest.raises(ValueError, match="Metric was not set"): + with pytest.raises(ValueError, match="Metric/Metrics was/were not set"): automl._do_dummy_prediction() diff --git a/test/test_estimators/test_estimators.py b/test/test_estimators/test_estimators.py index cd4b0922de..4dd13d4c17 100644 --- a/test/test_estimators/test_estimators.py +++ b/test/test_estimators/test_estimators.py @@ -1,4 +1,4 @@ -from typing import Any, Dict, Type, Union, cast +from typing import Any, Dict, Sequence, Type, Union, cast import copy import glob @@ -42,6 +42,7 @@ import unittest import unittest.mock +import test.conftest from test.test_automl.automl_utils import ( count_succeses, include_single_scores, @@ -378,6 +379,74 @@ def test_cv_results(tmp_dir): assert hasattr(cls, "classes_") +def test_cv_results_multi_objective(tmp_dir): + # TODO restructure and actually use real SMAC output from a long run + # to do this unittest! 
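The new tests below pin down what a user sees after fitting with two metrics: `cv_results_` gains per-metric `mean_test_<name>`/`rank_test_<name>` columns (and loses the single `mean_test_score`/`rank_test_scores`), and the leaderboard reports `cost_0`, `cost_1`, ... instead of one `cost` column. A hedged end-to-end sketch of that usage follows; the dataset, time budgets, and column selection are illustrative only and are not part of this change.

import pandas as pd
import sklearn.datasets
import sklearn.model_selection

import autosklearn.classification
import autosklearn.metrics

X, y = sklearn.datasets.load_breast_cancer(return_X_y=True)
X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(
    X, y, random_state=1
)

clf = autosklearn.classification.AutoSklearnClassifier(
    time_left_for_this_task=60,  # illustrative budget only
    per_run_time_limit=10,
    metric=[autosklearn.metrics.precision_macro, autosklearn.metrics.roc_auc],
    scoring_functions=[autosklearn.metrics.accuracy],
)
clf.fit(X_train, y_train)

# Per-metric columns replace the single mean_test_score / rank_test_scores.
results = pd.DataFrame(clf.cv_results_)
print(results[["mean_test_precision_macro", "mean_test_roc_auc", "mean_fit_time"]].head())

# The leaderboard exposes one cost column per optimization metric.
print(clf.leaderboard()[["cost_0", "cost_1", "ensemble_weight"]].head())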
+ X_train, Y_train, X_test, Y_test = putil.get_dataset("iris") + + cls = AutoSklearnClassifier( + time_left_for_this_task=30, + per_run_time_limit=5, + tmp_folder=os.path.join(tmp_dir, "backend"), + seed=1, + initial_configurations_via_metalearning=0, + metric=[autosklearn.metrics.precision_macro, autosklearn.metrics.roc_auc], + scoring_functions=[autosklearn.metrics.accuracy, autosklearn.metrics.roc_auc], + ) + + params = cls.get_params() + original_params = copy.deepcopy(params) + + cls.fit(X_train, Y_train) + + cv_results = cls.cv_results_ + assert isinstance(cv_results, dict), type(cv_results) + assert "mean_test_score" not in cv_results + assert "rank_test_scores" not in cv_results + for expected_column in ( + "mean_test_precision_macro", + "mean_test_roc_auc", + "mean_fit_time", + "rank_test_precision_macro", + "rank_test_roc_auc", + "metric_roc_auc", + "metric_accuracy", + ): + assert isinstance(cv_results[expected_column], np.ndarray), type( + cv_results[expected_column] + ) + + assert isinstance(cv_results["params"], list), type(cv_results["params"]) + cv_result_items = [ + isinstance(val, npma.MaskedArray) + for key, val in cv_results.items() + if key.startswith("param_") + ] + assert all(cv_result_items), cv_results.items() + + # Compare the state of the model parameters with the original parameters + new_params = clone(cls).get_params() + for param_name, original_value in original_params.items(): + new_value = new_params[param_name] + + # Taken from Sklearn code: + # We should never change or mutate the internal state of input + # parameters by default. To check this we use the joblib.hash function + # that introspects recursively any subobjects to compute a checksum. + # The only exception to this rule of immutable constructor parameters + # is possible RandomState instance but in this check we explicitly + # fixed the random_state params recursively to be integer seeds. + assert joblib.hash(new_value) == joblib.hash(original_value), ( + "Estimator %s should not change or mutate " + " the parameter %s from %s to %s during fit." + % (cls, param_name, original_value, new_value) + ) + + # Comply with https://scikit-learn.org/dev/glossary.html#term-classes + is_classifier(cls) + assert hasattr(cls, "classes_") + + @pytest.mark.parametrize( "estimator_type,dataset_name", [(AutoSklearnClassifier, "iris"), (AutoSklearnRegressor, "boston")], @@ -388,7 +457,7 @@ def test_leaderboard( # Comprehensive test tasks a substantial amount of time, manually set if # required. MAX_COMBO_SIZE_FOR_INCLUDE_PARAM = 3 # [0, len(valid_columns) + 1] - column_types = AutoSklearnEstimator._leaderboard_columns() + column_types = AutoSklearnEstimator._leaderboard_columns(num_metrics=1) # Create a dict of all possible param values for each param # with some invalid one's of the incorrect type @@ -496,6 +565,160 @@ def exclude(lst, s): assert all(leaderboard["ensemble_weight"] > 0) +@pytest.mark.parametrize( + "estimator_type,dataset_name,metrics", + [ + ( + AutoSklearnClassifier, + "iris", + (autosklearn.metrics.accuracy, autosklearn.metrics.balanced_accuracy), + ), + ( + AutoSklearnRegressor, + "boston", + (autosklearn.metrics.r2, autosklearn.metrics.root_mean_squared_error), + ), + ], +) +def test_leaderboard_multi_objective( + tmp_dir: str, + estimator_type: Type[AutoSklearnEstimator], + dataset_name: str, + metrics: Sequence[autosklearn.metrics.Scorer], +): + # Comprehensive test tasks a substantial amount of time, manually set if + # required. 
+ MAX_COMBO_SIZE_FOR_INCLUDE_PARAM = 3 # [0, len(valid_columns) + 1] + column_types = AutoSklearnEstimator._leaderboard_columns(num_metrics=2) + + # Create a dict of all possible param values for each param + # with some invalid one's of the incorrect type + include_combinations = itertools.chain( + itertools.combinations(column_types["all"], item_count) + for item_count in range(1, MAX_COMBO_SIZE_FOR_INCLUDE_PARAM) + ) + valid_params = { + "detailed": [True, False], + "ensemble_only": [True, False], + "top_k": [-10, 0, 1, 10, "all"], + "sort_by": [ + "cost", + "cost_0", + "cost_1", + ["cost_1", "cost_0"], + *column_types["all"], + "invalid", + ], + "sort_order": ["ascending", "descending", "auto", "invalid", None], + "include": itertools.chain([None, "invalid", "type"], include_combinations), + } + + # Create a generator of all possible combinations of valid_params + params_generator = iter( + dict(zip(valid_params.keys(), param_values)) + for param_values in itertools.product(*valid_params.values()) + ) + + X_train, Y_train, _, _ = putil.get_dataset(dataset_name) + model = estimator_type( + time_left_for_this_task=30, + per_run_time_limit=5, + tmp_folder=os.path.join(tmp_dir, "backend"), + seed=test.conftest.DEFAULT_SEED, + metric=metrics, + ) + + model.fit(X_train, Y_train) + + for params in params_generator: + # Convert from iterator to solid list + if params["include"] is not None and not isinstance(params["include"], str): + params["include"] = list(params["include"]) + + # Invalid top_k should raise an error, is a positive int or 'all' + if not (params["top_k"] == "all" or params["top_k"] > 0): + with pytest.raises(ValueError): + model.leaderboard(**params) + + # Invalid sort_by column + elif ( + params["sort_by"] not in column_types["all"] + and params["sort_by"] != "cost" + and params["sort_by"] != ["cost_1", "cost_0"] + and params["sort_by"] not in ["cost_0", "cost_1"] + ): + with pytest.raises(ValueError): + model.leaderboard(**params) + + # Shouldn't accept an invalid sort order + elif params["sort_order"] not in ["ascending", "descending", "auto"]: + with pytest.raises(ValueError): + model.leaderboard(**params) + + # include is single str but not valid + elif ( + isinstance(params["include"], str) + and params["include"] not in column_types["all"] + ): + with pytest.raises(ValueError): + model.leaderboard(**params) + + # Crash if include is list but contains invalid column + elif ( + isinstance(params["include"], list) + and len(set(params["include"]) - set(column_types["all"])) != 0 + ): + with pytest.raises(ValueError): + model.leaderboard(**params) + + # Can't have just model_id, in both single str and list case + elif params["include"] == "model_id" or params["include"] == ["model_id"]: + with pytest.raises(ValueError): + model.leaderboard(**params) + + # Else all valid combinations should be validated + else: + leaderboard = model.leaderboard(**params) + assert "cost" not in leaderboard.columns + + if params["include"] is None: + assert "cost_0" in leaderboard.columns + assert "cost_1" in leaderboard.columns + else: + for cost_name in ["cost_0", "cost_1"]: + if cost_name in params["include"]: + assert cost_name in leaderboard.columns + + # top_k should never be less than the rows given back + # It can however be larger + if isinstance(params["top_k"], int): + assert params["top_k"] >= len(leaderboard) + + # Check the right columns are present and in the right order + # The model_id is set as the index, not included in pandas columns + columns = list(leaderboard.columns) + 
+ def exclude(lst, s): + return [x for x in lst if x != s] + + if params["include"] is not None: + # Include with only single str should be the only column + if isinstance(params["include"], str): + assert params["include"] in columns and len(columns) == 1 + # Include as a list should have all the columns without model_id + else: + assert columns == exclude(params["include"], "model_id") + elif params["detailed"]: + assert columns == exclude(column_types["detailed"], "model_id") + else: + assert columns == exclude(column_types["simple"], "model_id") + + # Ensure that if it's ensemble only + # Can only check if 'ensemble_weight' is present + if params["ensemble_only"] and "ensemble_weight" in columns: + assert all(leaderboard["ensemble_weight"] > 0) + + @pytest.mark.parametrize("estimator", [AutoSklearnRegressor]) @pytest.mark.parametrize("resampling_strategy", ["holdout"]) @pytest.mark.parametrize( diff --git a/test/test_evaluation/test_abstract_evaluator.py b/test/test_evaluation/test_abstract_evaluator.py index 7f88383bcd..7bd52c0f76 100644 --- a/test/test_evaluation/test_abstract_evaluator.py +++ b/test/test_evaluation/test_abstract_evaluator.py @@ -65,7 +65,7 @@ def test_finish_up_model_predicts_NaN(self): port=self.port, output_y_hat_optimization=False, queue=queue_mock, - metric=accuracy, + metrics=[accuracy], additional_components=dict(), ) ae.Y_optimization = rs.rand(33, 3) @@ -143,7 +143,7 @@ def test_disable_file_output(self): backend=self.backend_mock, queue=queue_mock, disable_file_output=True, - metric=accuracy, + metrics=[accuracy], port=self.port, additional_components=dict(), ) @@ -169,7 +169,7 @@ def test_disable_file_output(self): output_y_hat_optimization=False, queue=queue_mock, disable_file_output=[disable], - metric=accuracy, + metrics=[accuracy], port=self.port, additional_components=dict(), ) @@ -226,7 +226,7 @@ def test_disable_file_output(self): backend=self.backend_mock, output_y_hat_optimization=False, queue=queue_mock, - metric=accuracy, + metrics=[accuracy], disable_file_output=["y_optimization"], port=self.port, additional_components=dict(), @@ -286,7 +286,7 @@ def test_file_output(self): backend=backend, output_y_hat_optimization=False, queue=queue_mock, - metric=accuracy, + metrics=[accuracy], port=self.port, additional_components=dict(), ) @@ -357,7 +357,7 @@ def test_add_additional_components(self): backend=backend, output_y_hat_optimization=False, queue=queue_mock, - metric=accuracy, + metrics=[accuracy], port=self.port, additional_components=additional_components, ) diff --git a/test/test_evaluation/test_evaluation.py b/test/test_evaluation/test_evaluation.py index 1723b208f2..f5292060a6 100644 --- a/test/test_evaluation/test_evaluation.py +++ b/test/test_evaluation/test_evaluation.py @@ -95,8 +95,8 @@ def test_eval_with_limits_holdout(self, pynisher_mock): multi_objectives=["cost"], stats=self.stats, memory_limit=3072, - metric=accuracy, - cost_for_crash=get_cost_of_crash(accuracy), + metrics=[accuracy], + cost_for_crash=get_cost_of_crash([accuracy]), abort_on_first_run_crash=False, pynisher_context="fork", ) @@ -126,8 +126,8 @@ def test_zero_or_negative_cutoff(self, pynisher_mock): multi_objectives=["cost"], resampling_strategy="holdout", stats=self.stats, - metric=accuracy, - cost_for_crash=get_cost_of_crash(accuracy), + metrics=[accuracy], + cost_for_crash=get_cost_of_crash([accuracy]), abort_on_first_run_crash=False, pynisher_context="forkserver", ) @@ -156,8 +156,8 @@ def test_cutoff_lower_than_remaining_time(self, pynisher_mock): 
multi_objectives=["cost"], resampling_strategy="holdout", stats=self.stats, - metric=accuracy, - cost_for_crash=get_cost_of_crash(accuracy), + metrics=[accuracy], + cost_for_crash=get_cost_of_crash([accuracy]), abort_on_first_run_crash=False, pynisher_context="forkserver", ) @@ -189,8 +189,8 @@ def test_eval_with_limits_holdout_fail_silent(self, pynisher_mock): resampling_strategy="holdout", stats=self.stats, memory_limit=3072, - metric=accuracy, - cost_for_crash=get_cost_of_crash(accuracy), + metrics=[accuracy], + cost_for_crash=get_cost_of_crash([accuracy]), abort_on_first_run_crash=False, pynisher_context="fork", ) @@ -260,8 +260,8 @@ def test_eval_with_limits_holdout_fail_memory_error(self, pynisher_mock): resampling_strategy="holdout", stats=self.stats, memory_limit=3072, - metric=log_loss, - cost_for_crash=get_cost_of_crash(log_loss), + metrics=[log_loss], + cost_for_crash=get_cost_of_crash([log_loss]), abort_on_first_run_crash=False, pynisher_context="fork", ) @@ -302,8 +302,8 @@ def test_eval_with_limits_holdout_fail_timeout(self, pynisher_mock): resampling_strategy="holdout", stats=self.stats, memory_limit=3072, - metric=accuracy, - cost_for_crash=get_cost_of_crash(accuracy), + metrics=[accuracy], + cost_for_crash=get_cost_of_crash([accuracy]), abort_on_first_run_crash=False, pynisher_context="forkserver", ) @@ -352,8 +352,8 @@ def side_effect(**kwargs): resampling_strategy="holdout", stats=self.stats, memory_limit=3072, - metric=accuracy, - cost_for_crash=get_cost_of_crash(accuracy), + metrics=[accuracy], + cost_for_crash=get_cost_of_crash([accuracy]), abort_on_first_run_crash=False, pynisher_context="forkserver", ) @@ -388,8 +388,8 @@ def side_effect(**kwargs): resampling_strategy="holdout", stats=self.stats, memory_limit=3072, - metric=accuracy, - cost_for_crash=get_cost_of_crash(accuracy), + metrics=[accuracy], + cost_for_crash=get_cost_of_crash([accuracy]), abort_on_first_run_crash=False, pynisher_context="forkserver", ) @@ -432,8 +432,8 @@ def side_effect(*args, **kwargs): resampling_strategy="holdout", stats=self.stats, memory_limit=3072, - metric=accuracy, - cost_for_crash=get_cost_of_crash(accuracy), + metrics=[accuracy], + cost_for_crash=get_cost_of_crash([accuracy]), abort_on_first_run_crash=False, pynisher_context="fork", ) @@ -468,8 +468,8 @@ def test_exception_in_target_function(self, eval_holdout_mock): resampling_strategy="holdout", stats=self.stats, memory_limit=3072, - metric=accuracy, - cost_for_crash=get_cost_of_crash(accuracy), + metrics=[accuracy], + cost_for_crash=get_cost_of_crash([accuracy]), abort_on_first_run_crash=False, pynisher_context="fork", ) @@ -504,8 +504,8 @@ def test_silent_exception_in_target_function(self): multi_objectives=["cost"], stats=self.stats, memory_limit=3072, - metric=accuracy, - cost_for_crash=get_cost_of_crash(accuracy), + metrics=[accuracy], + cost_for_crash=get_cost_of_crash([accuracy]), abort_on_first_run_crash=False, iterative=False, pynisher_context="fork", diff --git a/test/test_evaluation/test_test_evaluator.py b/test/test_evaluation/test_test_evaluator.py index 8615682ce7..457661df03 100644 --- a/test/test_evaluation/test_test_evaluator.py +++ b/test/test_evaluation/test_test_evaluator.py @@ -20,12 +20,14 @@ ) from autosklearn.evaluation.test_evaluator import TestEvaluator, eval_t from autosklearn.evaluation.util import read_queue -from autosklearn.metrics import accuracy, f1_macro, r2 +from autosklearn.metrics import accuracy, balanced_accuracy, f1_macro, r2 from autosklearn.util.pipeline import get_configuration_space 
import unittest import unittest.mock +import test.conftest + this_directory = os.path.dirname(__file__) sys.path.append(this_directory) from evaluation_util import ( # noqa (E402: module level import not at top of file) @@ -72,7 +74,7 @@ def test_datasets(self): evaluator = TestEvaluator( backend_mock, queue_, - metric=metric_lookup[D.info["task"]], + metrics=[metric_lookup[D.info["task"]]], port=logging.handlers.DEFAULT_TCP_LOGGING_PORT, additional_components=dict(), ) @@ -110,8 +112,8 @@ def test_eval_test(self): queue=self.queue, backend=self.backend, config=self.configuration, - metric=accuracy, - seed=1, + metrics=[accuracy], + seed=test.conftest.DEFAULT_SEED, num_run=1, scoring_functions=None, output_y_hat_optimization=False, @@ -124,7 +126,35 @@ def test_eval_test(self): ) rval = read_queue(self.queue) self.assertEqual(len(rval), 1) - self.assertAlmostEqual(rval[0]["loss"], 0.040000000000000036) + self.assertAlmostEqual(rval[0]["loss"], 0.07999999999999996) + self.assertEqual(rval[0]["status"], StatusType.SUCCESS) + self.assertNotIn("bac_metric", rval[0]["additional_run_info"]) + + def test_eval_test_multi_objective(self): + metrics = { + accuracy: 0.07999999999999996, + balanced_accuracy: 0.05555555555555547, + } + eval_t( + queue=self.queue, + backend=self.backend, + config=self.configuration, + metrics=list(metrics.keys()), + seed=test.conftest.DEFAULT_SEED, + num_run=1, + scoring_functions=None, + output_y_hat_optimization=False, + include=None, + exclude=None, + disable_file_output=False, + instance=self.dataset_name, + port=self.port, + additional_components=dict(), + ) + rval = read_queue(self.queue) + self.assertEqual(len(rval), 1) + for metric, loss in metrics.items(): + self.assertAlmostEqual(rval[0]["loss"][metric.name], loss) self.assertEqual(rval[0]["status"], StatusType.SUCCESS) self.assertNotIn("bac_metric", rval[0]["additional_run_info"]) @@ -133,7 +163,7 @@ def test_eval_test_all_loss_functions(self): queue=self.queue, backend=self.backend, config=self.configuration, - metric=accuracy, + metrics=[accuracy], seed=1, num_run=1, scoring_functions=SCORER_LIST, diff --git a/test/test_evaluation/test_train_evaluator.py b/test/test_evaluation/test_train_evaluator.py index afed8b5ce1..23607b8e4d 100644 --- a/test/test_evaluation/test_train_evaluator.py +++ b/test/test_evaluation/test_train_evaluator.py @@ -48,12 +48,14 @@ subsample_indices, ) from autosklearn.evaluation.util import read_queue -from autosklearn.metrics import accuracy, f1_macro, r2 +from autosklearn.metrics import accuracy, balanced_accuracy, f1_macro, r2 from autosklearn.util.pipeline import get_configuration_space import unittest import unittest.mock +import test.conftest + this_directory = os.path.dirname(__file__) sys.path.append(this_directory) from evaluation_util import ( # noqa (E402: module level import not at top of file) @@ -139,7 +141,7 @@ def test_holdout(self, pipeline_mock): resampling_strategy_args={"train_size": 0.66}, scoring_functions=None, output_y_hat_optimization=True, - metric=accuracy, + metrics=[accuracy], port=self.port, additional_components=dict(), ) @@ -229,7 +231,7 @@ def configuration_fully_fitted(self): resampling_strategy="holdout", scoring_functions=None, output_y_hat_optimization=True, - metric=accuracy, + metrics=[accuracy], budget=0.0, additional_components=dict(), ) @@ -239,42 +241,45 @@ def configuration_fully_fitted(self): class LossSideEffect(object): def __init__(self): self.losses = [ - 1.0, - 1.0, - 1.0, - 1.0, - 0.9, - 0.9, - 0.9, - 0.9, - 0.8, - 0.8, - 0.8, - 
0.8, - 0.7, - 0.7, - 0.7, - 0.7, - 0.6, - 0.6, - 0.6, - 0.6, - 0.5, - 0.5, - 0.5, - 0.5, - 0.4, - 0.4, - 0.4, - 0.4, - 0.3, - 0.3, - 0.3, - 0.3, - 0.2, - 0.2, - 0.2, - 0.2, + {"accuracy": value} + for value in [ + 1.0, + 1.0, + 1.0, + 1.0, + 0.9, + 0.9, + 0.9, + 0.9, + 0.8, + 0.8, + 0.8, + 0.8, + 0.7, + 0.7, + 0.7, + 0.7, + 0.6, + 0.6, + 0.6, + 0.6, + 0.5, + 0.5, + 0.5, + 0.5, + 0.4, + 0.4, + 0.4, + 0.4, + 0.3, + 0.3, + 0.3, + 0.3, + 0.2, + 0.2, + 0.2, + 0.2, + ] ] self.iteration = 0 @@ -381,7 +386,7 @@ def configuration_fully_fitted(self): resampling_strategy="holdout-iterative-fit", scoring_functions=None, output_y_hat_optimization=True, - metric=accuracy, + metrics=[accuracy], budget=0.0, additional_components=dict(), ) @@ -391,26 +396,29 @@ def configuration_fully_fitted(self): class LossSideEffect(object): def __init__(self): self.losses = [ - 0.8, - 0.8, - 0.8, - 0.8, - 0.6, - 0.6, - 0.6, - 0.6, - 0.4, - 0.4, - 0.4, - 0.4, - 0.2, - 0.2, - 0.2, - 0.2, - 0.0, - 0.0, - 0.0, - 0.0, + {"accuracy": value} + for value in [ + 0.8, + 0.8, + 0.8, + 0.8, + 0.6, + 0.6, + 0.6, + 0.6, + 0.4, + 0.4, + 0.4, + 0.4, + 0.2, + 0.2, + 0.2, + 0.2, + 0.0, + 0.0, + 0.0, + 0.0, + ] ] self.iteration = 0 @@ -482,7 +490,7 @@ def test_iterative_holdout_not_iterative(self, pipeline_mock): resampling_strategy="holdout-iterative-fit", scoring_functions=None, output_y_hat_optimization=True, - metric=accuracy, + metrics=[accuracy], additional_components=dict(), ) evaluator.file_output = unittest.mock.Mock(spec=evaluator.file_output) @@ -538,7 +546,7 @@ def test_cv(self, pipeline_mock): resampling_strategy_args={"folds": 5}, scoring_functions=None, output_y_hat_optimization=True, - metric=accuracy, + metrics=[accuracy], additional_components=dict(), ) evaluator.file_output = unittest.mock.Mock(spec=evaluator.file_output) @@ -606,7 +614,7 @@ def test_partial_cv(self, pipeline_mock): resampling_strategy_args={"folds": 5}, scoring_functions=None, output_y_hat_optimization=True, - metric=accuracy, + metrics=[accuracy], additional_components=dict(), ) @@ -688,7 +696,7 @@ def configuration_fully_fitted(self): resampling_strategy_args={"folds": 5}, scoring_functions=None, output_y_hat_optimization=True, - metric=accuracy, + metrics=[accuracy], budget=0.0, additional_components=dict(), ) @@ -698,42 +706,45 @@ def configuration_fully_fitted(self): class LossSideEffect(object): def __init__(self): self.losses = [ - 1.0, - 1.0, - 1.0, - 1.0, - 0.9, - 0.9, - 0.9, - 0.9, - 0.8, - 0.8, - 0.8, - 0.8, - 0.7, - 0.7, - 0.7, - 0.7, - 0.6, - 0.6, - 0.6, - 0.6, - 0.5, - 0.5, - 0.5, - 0.5, - 0.4, - 0.4, - 0.4, - 0.4, - 0.3, - 0.3, - 0.3, - 0.3, - 0.2, - 0.2, - 0.2, - 0.2, + {"accuracy": value} + for value in [ + 1.0, + 1.0, + 1.0, + 1.0, + 0.9, + 0.9, + 0.9, + 0.9, + 0.8, + 0.8, + 0.8, + 0.8, + 0.7, + 0.7, + 0.7, + 0.7, + 0.6, + 0.6, + 0.6, + 0.6, + 0.5, + 0.5, + 0.5, + 0.5, + 0.4, + 0.4, + 0.4, + 0.4, + 0.3, + 0.3, + 0.3, + 0.3, + 0.2, + 0.2, + 0.2, + 0.2, + ] ] self.iteration = 0 @@ -791,7 +802,7 @@ def test_file_output(self, loss_mock, model_mock): resampling_strategy_args={"folds": 5}, scoring_functions=SCORER_LIST, output_y_hat_optimization=True, - metric=accuracy, + metrics=[accuracy], additional_components=dict(), ) @@ -903,7 +914,7 @@ def test_subsample_indices_classification(self, mock, backend_mock): configuration=configuration, resampling_strategy="cv", resampling_strategy_args={"folds": 10}, - metric=accuracy, + metrics=[accuracy], additional_components=dict(), ) train_indices = np.arange(69, dtype=int) @@ -974,7 +985,7 @@ def 
test_subsample_indices_regression(self, mock, backend_mock): configuration=configuration, resampling_strategy="cv", resampling_strategy_args={"folds": 10}, - metric=accuracy, + metrics=[accuracy], additional_components=dict(), ) train_indices = np.arange(69, dtype=int) @@ -1043,7 +1054,7 @@ def test_predict_proba_binary_classification(self, mock): resampling_strategy="cv", resampling_strategy_args={"folds": 10}, output_y_hat_optimization=False, - metric=accuracy, + metrics=[accuracy], additional_components=dict(), ) @@ -1091,7 +1102,7 @@ def test_fit_predict_and_loss_standard_additional_run_info( configuration=configuration, resampling_strategy="holdout", output_y_hat_optimization=False, - metric=accuracy, + metrics=[accuracy], additional_components=dict(), ) evaluator.file_output = unittest.mock.Mock(spec=evaluator.file_output) @@ -1139,7 +1150,7 @@ def __call__(self, *args, **kwargs): resampling_strategy="cv", resampling_strategy_args={"folds": 2}, output_y_hat_optimization=False, - metric=accuracy, + metrics=[accuracy], additional_components=dict(), ) evaluator.file_output = unittest.mock.Mock(spec=evaluator.file_output) @@ -1182,7 +1193,7 @@ def __call__(self): mock.get_current_iter.side_effect = Counter() mock.get_max_iter.return_value = 1 mock.get_additional_run_info.return_value = 14678 - loss_mock.return_value = 0.5 + loss_mock.return_value = {"accuracy": 0.5} D = get_binary_classification_datamanager() backend_mock.load_datamanager.return_value = D @@ -1199,7 +1210,7 @@ def __call__(self): configuration=configuration, resampling_strategy="holdout", output_y_hat_optimization=False, - metric=accuracy, + metrics=[accuracy], budget=0.0, additional_components=dict(), ) @@ -1228,7 +1239,7 @@ def test_fit_predict_and_loss_iterative_noniterativemodel_additional_run_info( mock.estimator_supports_iterative_fit.return_value = False mock.fit_transformer.return_value = ("Xt", {}) mock.get_additional_run_info.return_value = 14678 - loss_mock.return_value = 0.5 + loss_mock.return_value = {"accuracy": 0.5} D = get_binary_classification_datamanager() backend_mock.load_datamanager.return_value = D @@ -1245,7 +1256,7 @@ def test_fit_predict_and_loss_iterative_noniterativemodel_additional_run_info( configuration=configuration, resampling_strategy="holdout", output_y_hat_optimization=False, - metric=accuracy, + metrics=[accuracy], additional_components=dict(), ) evaluator.file_output = unittest.mock.Mock(spec=evaluator.file_output) @@ -1285,7 +1296,7 @@ def __call__(self): mock.fit_transformer.return_value = ("Xt", {}) mock.get_additional_run_info.return_value = {"val": 14678} mock.get_max_iter.return_value = 512 - loss_mock.return_value = 0.5 + loss_mock.return_value = {"accuracy": 0.5} D = get_binary_classification_datamanager() backend_mock.load_datamanager.return_value = D @@ -1302,7 +1313,7 @@ def __call__(self): configuration=configuration, resampling_strategy="holdout", output_y_hat_optimization=False, - metric=accuracy, + metrics=[accuracy], budget_type="iterations", budget=50, additional_components=dict(), @@ -1335,7 +1346,7 @@ def test_fit_predict_and_loss_budget_2_additional_run_info( mock.estimator_supports_iterative_fit.return_value = False mock.fit_transformer.return_value = ("Xt", {}) mock.get_additional_run_info.return_value = {"val": 14678} - loss_mock.return_value = 0.5 + loss_mock.return_value = {"accuracy": 0.5} D = get_binary_classification_datamanager() backend_mock.load_datamanager.return_value = D @@ -1352,7 +1363,7 @@ def test_fit_predict_and_loss_budget_2_additional_run_info( 
configuration=configuration, resampling_strategy="holdout", output_y_hat_optimization=False, - metric=accuracy, + metrics=[accuracy], budget_type="subsample", budget=50, additional_components=dict(), @@ -1406,7 +1417,7 @@ def test_datasets(self): resampling_strategy="cv", resampling_strategy_args={"folds": 2}, output_y_hat_optimization=False, - metric=metric_lookup[D.info["task"]], + metrics=[metric_lookup[D.info["task"]]], additional_components=dict(), ) @@ -2984,7 +2995,7 @@ def test_eval_holdout(self): exclude=None, disable_file_output=False, instance=self.dataset_name, - metric=accuracy, + metrics=[accuracy], additional_components=dict(), ) info = read_queue(self.queue) @@ -2993,6 +3004,36 @@ def test_eval_holdout(self): self.assertEqual(info[0]["status"], StatusType.SUCCESS) self.assertNotIn("bac_metric", info[0]["additional_run_info"]) + def test_eval_holdout_multi_objective(self): + metrics = { + accuracy: 0.030303030303030276, + balanced_accuracy: 0.033333333333333326, + } + eval_holdout( + queue=self.queue, + port=self.port, + config=self.configuration, + backend=self.backend, + resampling_strategy="holdout", + resampling_strategy_args=None, + seed=test.conftest.DEFAULT_SEED, + num_run=1, + scoring_functions=None, + output_y_hat_optimization=True, + include=None, + exclude=None, + disable_file_output=False, + instance=self.dataset_name, + metrics=list(metrics.keys()), + additional_components=dict(), + ) + info = read_queue(self.queue) + self.assertEqual(len(info), 1) + for metric, loss in metrics.items(): + self.assertAlmostEqual(info[0]["loss"][metric.name], loss) + self.assertEqual(info[0]["status"], StatusType.SUCCESS) + self.assertNotIn("bac_metric", info[0]["additional_run_info"]) + def test_eval_holdout_all_loss_functions(self): eval_holdout( queue=self.queue, @@ -3009,7 +3050,7 @@ def test_eval_holdout_all_loss_functions(self): exclude=None, disable_file_output=False, instance=self.dataset_name, - metric=accuracy, + metrics=[accuracy], additional_components=dict(), ) rval = read_queue(self.queue) @@ -3063,7 +3104,7 @@ def test_eval_holdout_iterative_fit_no_timeout(self): exclude=None, disable_file_output=False, instance=self.dataset_name, - metric=accuracy, + metrics=[accuracy], additional_components=dict(), ) rval = read_queue(self.queue) @@ -3072,6 +3113,36 @@ def test_eval_holdout_iterative_fit_no_timeout(self): self.assertEqual(rval[0]["status"], StatusType.DONOTADVANCE) self.assertEqual(rval[-1]["status"], StatusType.SUCCESS) + def test_eval_holdout_iterative_fit_no_timeout_multi_objective(self): + metrics = { + accuracy: 0.030303030303030276, + balanced_accuracy: 0.033333333333333326, + } + eval_iterative_holdout( + queue=self.queue, + port=self.port, + config=self.configuration, + backend=self.backend, + resampling_strategy="holdout", + resampling_strategy_args=None, + seed=1, + num_run=1, + scoring_functions=None, + output_y_hat_optimization=True, + include=None, + exclude=None, + disable_file_output=False, + instance=self.dataset_name, + metrics=list(metrics.keys()), + additional_components=dict(), + ) + rval = read_queue(self.queue) + self.assertEqual(len(rval), 9) + for metric, loss in metrics.items(): + self.assertAlmostEqual(rval[-1]["loss"][metric.name], loss) + self.assertEqual(rval[0]["status"], StatusType.DONOTADVANCE) + self.assertEqual(rval[-1]["status"], StatusType.SUCCESS) + def test_eval_holdout_budget_iterations(self): eval_holdout( queue=self.queue, @@ -3088,7 +3159,7 @@ def test_eval_holdout_budget_iterations(self): exclude=None, 
disable_file_output=False, instance=self.dataset_name, - metric=accuracy, + metrics=[accuracy], budget=1, budget_type="iterations", additional_components=dict(), @@ -3099,7 +3170,39 @@ def test_eval_holdout_budget_iterations(self): self.assertEqual(info[0]["status"], StatusType.SUCCESS) self.assertNotIn("bac_metric", info[0]["additional_run_info"]) - def test_eval_holdout_budget_iterations_converged(self): + def test_eval_holdout_budget_iterations_multi_objective(self): + metrics = { + accuracy: 0.06060606060606055, + balanced_accuracy: 0.06666666666666676, + } + eval_holdout( + queue=self.queue, + port=self.port, + config=self.configuration, + backend=self.backend, + resampling_strategy="holdout", + resampling_strategy_args=None, + seed=1, + num_run=1, + scoring_functions=None, + output_y_hat_optimization=True, + include=None, + exclude=None, + disable_file_output=False, + instance=self.dataset_name, + metrics=list(metrics.keys()), + budget=1, # Not iterative, but only for 1% of the budget + budget_type="iterations", + additional_components=dict(), + ) + info = read_queue(self.queue) + self.assertEqual(len(info), 1) + for metric, loss in metrics.items(): + self.assertAlmostEqual(info[0]["loss"][metric.name], loss) + self.assertEqual(info[0]["status"], StatusType.SUCCESS) + self.assertNotIn("bac_metric", info[0]["additional_run_info"]) + + def test_eval_holdout_budget_iterations_converged_multi_objective(self): configuration = get_configuration_space( exclude={"classifier": ["random_forest", "liblinear_svc"]}, info={"task": MULTICLASS_CLASSIFICATION, "is_sparse": False}, @@ -3119,7 +3222,7 @@ def test_eval_holdout_budget_iterations_converged(self): exclude={"classifier": ["random_forest", "liblinear_svc"]}, disable_file_output=False, instance=self.dataset_name, - metric=accuracy, + metrics=[accuracy], budget=80, budget_type="iterations", additional_components=dict(), @@ -3130,6 +3233,42 @@ def test_eval_holdout_budget_iterations_converged(self): self.assertEqual(info[0]["status"], StatusType.DONOTADVANCE) self.assertNotIn("bac_metric", info[0]["additional_run_info"]) + def test_eval_holdout_budget_iterations_converged(self): + metrics = { + accuracy: 0.18181818181818177, + balanced_accuracy: 0.18787878787878787, + } + configuration = get_configuration_space( + exclude={"classifier": ["random_forest", "liblinear_svc"]}, + info={"task": MULTICLASS_CLASSIFICATION, "is_sparse": False}, + ).get_default_configuration() + eval_holdout( + queue=self.queue, + port=self.port, + config=configuration, + backend=self.backend, + resampling_strategy="holdout", + resampling_strategy_args=None, + seed=1, + num_run=1, + scoring_functions=None, + output_y_hat_optimization=True, + include=None, + exclude={"classifier": ["random_forest", "liblinear_svc"]}, + disable_file_output=False, + instance=self.dataset_name, + metrics=list(metrics.keys()), + budget=80, + budget_type="iterations", + additional_components=dict(), + ) + info = read_queue(self.queue) + self.assertEqual(len(info), 1) + for metric, loss in metrics.items(): + self.assertAlmostEqual(info[0]["loss"][metric.name], loss) + self.assertEqual(info[0]["status"], StatusType.DONOTADVANCE) + self.assertNotIn("bac_metric", info[0]["additional_run_info"]) + def test_eval_holdout_budget_subsample(self): eval_holdout( queue=self.queue, @@ -3146,7 +3285,7 @@ def test_eval_holdout_budget_subsample(self): exclude=None, disable_file_output=False, instance=self.dataset_name, - metric=accuracy, + metrics=[accuracy], budget=30, budget_type="subsample", 
additional_components=dict(), @@ -3157,6 +3296,38 @@ def test_eval_holdout_budget_subsample(self): self.assertEqual(info[0]["status"], StatusType.SUCCESS) self.assertNotIn("bac_metric", info[0]["additional_run_info"]) + def test_eval_holdout_budget_subsample_multi_objective(self): + metrics = { + accuracy: 0.0, + f1_macro: 0.0, + } + eval_holdout( + queue=self.queue, + port=self.port, + config=self.configuration, + backend=self.backend, + resampling_strategy="holdout", + resampling_strategy_args=None, + seed=1, + num_run=1, + scoring_functions=None, + output_y_hat_optimization=True, + include=None, + exclude=None, + disable_file_output=False, + instance=self.dataset_name, + metrics=list(metrics.keys()), + budget=30, + budget_type="subsample", + additional_components=dict(), + ) + info = read_queue(self.queue) + self.assertEqual(len(info), 1) + for metric, loss in metrics.items(): + self.assertAlmostEqual(info[0]["loss"][metric.name], loss) + self.assertEqual(info[0]["status"], StatusType.SUCCESS) + self.assertNotIn("bac_metric", info[0]["additional_run_info"]) + def test_eval_holdout_budget_mixed_iterations(self): print(self.configuration) eval_holdout( @@ -3174,7 +3345,7 @@ def test_eval_holdout_budget_mixed_iterations(self): exclude=None, disable_file_output=False, instance=self.dataset_name, - metric=accuracy, + metrics=[accuracy], budget=1, budget_type="mixed", additional_components=dict(), @@ -3204,7 +3375,7 @@ def test_eval_holdout_budget_mixed_subsample(self): exclude={"classifier": ["random_forest"]}, disable_file_output=False, instance=self.dataset_name, - metric=accuracy, + metrics=[accuracy], budget=40, budget_type="mixed", additional_components=dict(), @@ -3231,7 +3402,7 @@ def test_eval_cv(self): exclude=None, disable_file_output=False, instance=self.dataset_name, - metric=accuracy, + metrics=[accuracy], additional_components=dict(), ) rval = read_queue(self.queue) @@ -3256,7 +3427,7 @@ def test_eval_cv_all_loss_functions(self): exclude=None, disable_file_output=False, instance=self.dataset_name, - metric=accuracy, + metrics=[accuracy], additional_components=dict(), ) rval = read_queue(self.queue) @@ -3330,10 +3501,54 @@ def test_eval_partial_cv(self): include=None, exclude=None, disable_file_output=False, - metric=accuracy, + metrics=[accuracy], additional_components=dict(), ) rval = read_queue(self.queue) self.assertEqual(len(rval), 1) self.assertAlmostEqual(rval[0]["loss"], results[fold]) self.assertEqual(rval[0]["status"], StatusType.SUCCESS) + + def test_eval_partial_cv_multi_objective(self): + metrics = { + accuracy: [ + 0.050000000000000044, + 0.0, + 0.09999999999999998, + 0.09999999999999998, + 0.050000000000000044, + ], + balanced_accuracy: [ + 0.04761904761904756, + 0.0, + 0.10317460317460314, + 0.11111111111111116, + 0.05555555555555547, + ], + } + + for fold in range(5): + instance = json.dumps({"task_id": "data", "fold": fold}) + eval_partial_cv( + port=self.port, + queue=self.queue, + config=self.configuration, + backend=self.backend, + seed=1, + num_run=1, + instance=instance, + resampling_strategy="partial-cv", + resampling_strategy_args={"folds": 5}, + scoring_functions=None, + output_y_hat_optimization=True, + include=None, + exclude=None, + disable_file_output=False, + metrics=list(metrics.keys()), + additional_components=dict(), + ) + rval = read_queue(self.queue) + self.assertEqual(len(rval), 1) + for metric, loss in metrics.items(): + self.assertAlmostEqual(rval[0]["loss"][metric.name], loss[fold]) + self.assertEqual(rval[0]["status"], StatusType.SUCCESS) 
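Before the metric unit tests below, one note on the convention they exercise: `calculate_losses` maps each score to `metric._optimum - score` per metric, so for 0-1 scorers the loss is simply `1 - score`. A short sketch using the same toy arrays as `test_calculate_losses` below; everything else is illustrative.

import numpy as np

import autosklearn.metrics
from autosklearn.constants import BINARY_CLASSIFICATION
from autosklearn.metrics import calculate_losses, calculate_scores

# Same toy arrays as test_calculate_losses below.
y_pred = np.array([0, 1, 0, 1, 1, 1, 0, 0, 0, 0])
y_true = np.array([0, 1, 0, 1, 1, 0, 0, 0, 0, 0])
metrics = [autosklearn.metrics.accuracy, autosklearn.metrics.balanced_accuracy]

scores = calculate_scores(
    solution=y_true, prediction=y_pred, task_type=BINARY_CLASSIFICATION, metrics=metrics
)
losses = calculate_losses(
    solution=y_true, prediction=y_pred, task_type=BINARY_CLASSIFICATION, metrics=metrics
)

# Both scorers have an optimum of 1.0, so each loss equals 1.0 - score:
# scores ~ {"accuracy": 0.9, "balanced_accuracy": 0.9286}
# losses ~ {"accuracy": 0.1, "balanced_accuracy": 0.0714}
print(scores, losses)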
diff --git a/test/test_metric/test_metrics.py b/test/test_metric/test_metrics.py index 541b2d6783..2cb7dc2158 100644 --- a/test/test_metric/test_metrics.py +++ b/test/test_metric/test_metrics.py @@ -6,7 +6,11 @@ import autosklearn.metrics from autosklearn.constants import BINARY_CLASSIFICATION, REGRESSION -from autosklearn.metrics import calculate_loss, calculate_metric, calculate_score +from autosklearn.metrics import ( + calculate_losses, + calculate_scores, + compute_single_metric, +) import pytest import unittest @@ -543,7 +547,7 @@ def test_unsupported_task_type(self): raised = False try: - calculate_score(y_true, y_pred, 6, scorer) + calculate_scores(y_true, y_pred, 6, scorer) except NotImplementedError: raised = True self.assertTrue(raised) @@ -561,11 +565,11 @@ def test_classification_scoring_functions(self): y_pred = np.array( [[0.0, 1.0], [0.0, 1.0], [0.0, 1.0], [1.0, 0.0], [1.0, 0.0], [1.0, 0.0]] ) - score_dict = calculate_score( + score_dict = calculate_scores( y_true, y_pred, BINARY_CLASSIFICATION, - autosklearn.metrics.accuracy, + [autosklearn.metrics.accuracy], scoring_functions, ) @@ -591,11 +595,11 @@ def test_regression_scoring_functions(self): y_true = np.array([1, 2, 3, -4]) y_pred = y_true.copy() - score_dict = calculate_score( + score_dict = calculate_scores( y_true, y_pred, REGRESSION, - autosklearn.metrics.root_mean_squared_error, + [autosklearn.metrics.root_mean_squared_error], scoring_functions, ) @@ -615,7 +619,9 @@ def test_classification_only_metric(self): ) scorer = autosklearn.metrics.accuracy - score = calculate_score(y_true, y_pred, BINARY_CLASSIFICATION, scorer) + score = calculate_scores(y_true, y_pred, BINARY_CLASSIFICATION, [scorer])[ + "accuracy" + ] previous_score = scorer._optimum self.assertAlmostEqual(score, previous_score) @@ -625,36 +631,126 @@ def test_regression_only_metric(self): y_pred = y_true.copy() scorer = autosklearn.metrics.root_mean_squared_error - score = calculate_score(y_true, y_pred, REGRESSION, scorer) + score = calculate_scores(y_true, y_pred, REGRESSION, [scorer])[ + "root_mean_squared_error" + ] previous_score = scorer._optimum self.assertAlmostEqual(score, previous_score) -def test_calculate_loss(): +def test_calculate_losses(): # In a 0-1 ranged scorer, make sure that the loss # has an expected positive value y_pred = np.array([0, 1, 0, 1, 1, 1, 0, 0, 0, 0]) y_true = np.array([0, 1, 0, 1, 1, 0, 0, 0, 0, 0]) score = sklearn.metrics.accuracy_score(y_true, y_pred) - assert pytest.approx(score) == calculate_score( + assert {"accuracy": pytest.approx(score)} == calculate_scores( solution=y_true, prediction=y_pred, task_type=BINARY_CLASSIFICATION, - metric=autosklearn.metrics.accuracy, + metrics=[autosklearn.metrics.accuracy], ) - assert pytest.approx(1.0 - score) == calculate_loss( + assert {"accuracy": pytest.approx(1.0 - score)} == calculate_losses( solution=y_true, prediction=y_pred, task_type=BINARY_CLASSIFICATION, - metric=autosklearn.metrics.accuracy, + metrics=[autosklearn.metrics.accuracy], ) - # Test the dictionary case - score_dict = calculate_score( + # Test two metrics + score_dict = calculate_scores( solution=y_true, prediction=y_pred, task_type=BINARY_CLASSIFICATION, - metric=autosklearn.metrics.accuracy, + metrics=[ + autosklearn.metrics.accuracy, + autosklearn.metrics.balanced_accuracy, + ], + ) + expected_score_dict = { + "accuracy": 0.9, + "balanced_accuracy": 0.9285714285714286, + } + loss_dict = calculate_losses( + solution=y_true, + prediction=y_pred, + task_type=BINARY_CLASSIFICATION, + metrics=[ + 
autosklearn.metrics.accuracy, + autosklearn.metrics.balanced_accuracy, + ], + ) + for expected_metric, expected_score in expected_score_dict.items(): + assert pytest.approx(expected_score) == score_dict[expected_metric] + assert pytest.approx(1 - expected_score) == loss_dict[expected_metric] + + # Test no metric + with pytest.raises( + ValueError, match="Number of metrics to compute must be greater than zero." + ): + calculate_scores( + solution=y_true, + prediction=y_pred, + task_type=BINARY_CLASSIFICATION, + metrics=[], + ) + + with pytest.raises( + ValueError, match="Number of metrics to compute must be greater than zero." + ): + calculate_scores( + solution=y_true, + prediction=y_pred, + task_type=BINARY_CLASSIFICATION, + metrics=[], + scoring_functions=[ + autosklearn.metrics.accuracy, + autosklearn.metrics.balanced_accuracy, + ], + ) + + # Test the same metric twice + accuracy_fixture = {"accuracy": pytest.approx(0.9)} + assert accuracy_fixture == calculate_scores( + solution=y_true, + prediction=y_pred, + task_type=BINARY_CLASSIFICATION, + metrics=[autosklearn.metrics.accuracy, autosklearn.metrics.accuracy], + ) + assert accuracy_fixture == calculate_scores( + solution=y_true, + prediction=y_pred, + task_type=BINARY_CLASSIFICATION, + metrics=[autosklearn.metrics.accuracy], + scoring_functions=[autosklearn.metrics.accuracy], + ) + assert accuracy_fixture == calculate_scores( + solution=y_true, + prediction=y_pred, + task_type=BINARY_CLASSIFICATION, + metrics=[autosklearn.metrics.accuracy], + scoring_functions=[autosklearn.metrics.accuracy, autosklearn.metrics.accuracy], + ) + + # Test the same name for multiple metrics! + bogus_accuracy = autosklearn.metrics.make_scorer( + "accuracy", + score_func=sklearn.metrics.roc_auc_score, + ) + with pytest.raises(ValueError, match="used multiple times"): + calculate_scores( + solution=y_true, + prediction=y_pred, + task_type=BINARY_CLASSIFICATION, + metrics=[autosklearn.metrics.accuracy, bogus_accuracy], + ) + + # Test additional scoring functions + score_dict = calculate_scores( + solution=y_true, + prediction=y_pred, + task_type=BINARY_CLASSIFICATION, + metrics=[autosklearn.metrics.accuracy], scoring_functions=[ autosklearn.metrics.accuracy, autosklearn.metrics.balanced_accuracy, @@ -664,11 +760,11 @@ def test_calculate_loss(): "accuracy": 0.9, "balanced_accuracy": 0.9285714285714286, } - loss_dict = calculate_loss( + loss_dict = calculate_losses( solution=y_true, prediction=y_pred, task_type=BINARY_CLASSIFICATION, - metric=autosklearn.metrics.accuracy, + metrics=[autosklearn.metrics.accuracy], scoring_functions=[ autosklearn.metrics.accuracy, autosklearn.metrics.balanced_accuracy, @@ -683,17 +779,17 @@ def test_calculate_loss(): y_true = np.array([0.1, 0.2, 0.3, 0.4, 0.5, 0.6]) y_pred = np.array([0.11, 0.22, 0.33, 0.44, 0.55, 0.66]) score = sklearn.metrics.mean_squared_error(y_true, y_pred) - assert pytest.approx(0 - score) == calculate_score( + assert {"mean_squared_error": pytest.approx(0 - score)} == calculate_scores( solution=y_true, prediction=y_pred, task_type=REGRESSION, - metric=autosklearn.metrics.mean_squared_error, + metrics=[autosklearn.metrics.mean_squared_error], ) - assert pytest.approx(score) == calculate_loss( + assert {"mean_squared_error": pytest.approx(score)} == calculate_losses( solution=y_true, prediction=y_pred, task_type=REGRESSION, - metric=autosklearn.metrics.mean_squared_error, + metrics=[autosklearn.metrics.mean_squared_error], ) @@ -702,7 +798,7 @@ def test_calculate_metric(): y_pred = np.array([0, 1, 0, 1, 1, 
1, 0, 0, 0, 0]) y_true = np.array([0, 1, 0, 1, 1, 0, 0, 0, 0, 0]) score = sklearn.metrics.accuracy_score(y_true, y_pred) - assert pytest.approx(score) == calculate_metric( + assert pytest.approx(score) == compute_single_metric( solution=y_true, prediction=y_pred, task_type=BINARY_CLASSIFICATION, @@ -713,7 +809,7 @@ def test_calculate_metric(): y_true = np.array([0.1, 0.2, 0.3, 0.4, 0.5, 0.6]) y_pred = np.array([0.11, 0.22, 0.33, 0.44, 0.55, 0.66]) score = sklearn.metrics.mean_squared_error(y_true, y_pred) - assert pytest.approx(score) == calculate_metric( + assert pytest.approx(score) == compute_single_metric( solution=y_true, prediction=y_pred, task_type=REGRESSION, diff --git a/test/test_optimizer/test_smbo.py b/test/test_optimizer/test_smbo.py index 0b14f4a722..8462c67baf 100644 --- a/test/test_optimizer/test_smbo.py +++ b/test/test_optimizer/test_smbo.py @@ -13,13 +13,13 @@ import pytest -@pytest.mark.parametrize("context", ["fork", "forkserver"]) +@pytest.mark.parametrize("context", ["fork", "forkserver", "spawn"]) def test_smbo_metalearning_configurations(backend, context, dask_client) -> None: # Get the inputs to the optimizer X_train, Y_train, X_test, Y_test = putil.get_dataset("iris") config_space = AutoML( delete_tmp_folder_after_terminate=False, - metric=autosklearn.metrics.accuracy, + metrics=[autosklearn.metrics.accuracy], time_left_for_this_task=20, per_run_time_limit=5, ).fit( @@ -38,7 +38,7 @@ def test_smbo_metalearning_configurations(backend, context, dask_client) -> None total_walltime_limit=10, func_eval_time_limit=5, memory_limit=4096, - metric=autosklearn.metrics.accuracy, + metrics=[autosklearn.metrics.accuracy], stopwatch=stopwatch, n_jobs=1, dask_client=dask_client,
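A closing note on backwards compatibility for user-supplied SMAC callbacks: as the example updates above show, `get_smac_object_callback` implementations must now accept the two new multi-objective parameters, even if they ignore them. A pared-down sketch mirroring `autosklearn.smbo.get_smac_object` follows; the run-history transformer and intensifier wiring of the real function are omitted here, so treat it as a template rather than a drop-in replacement.

from smac.facade.smac_ac_facade import SMAC4AC
from smac.scenario.scenario import Scenario


def get_smac_object_callback(
    scenario_dict,
    seed,
    ta,
    ta_kwargs,
    metalearning_configurations,
    n_jobs,
    dask_client,
    multi_objective_algorithm,  # new: e.g. ParEGO, or None for single-metric runs
    multi_objective_kwargs,  # new: e.g. {"rho": 0.05}, or {} for single-metric runs
):
    """Minimal callback sketch; real callbacks may also set rh2epm, intensifiers, etc."""
    return SMAC4AC(
        scenario=Scenario(scenario_dict),
        rng=seed,
        tae_runner=ta,
        tae_runner_kwargs=ta_kwargs,
        initial_configurations=metalearning_configurations,
        run_id=seed,
        dask_client=dask_client,
        n_jobs=n_jobs,
        multi_objective_algorithm=multi_objective_algorithm,
        multi_objective_kwargs=multi_objective_kwargs,
    )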